Source code for gneiss.cluster._pba

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
import pandas as pd
from gneiss.sort import mean_niche_estimator
from gneiss.util import match, rename_internal_nodes
from gneiss.composition._variance import variation_matrix

from skbio import TreeNode, DistanceMatrix
from scipy.spatial.distance import euclidean
from scipy.cluster.hierarchy import linkage


def correlation_linkage(X, method='ward'):
    r"""
    Hierarchical clustering based on proportionality.

    The hierarchy is built from the proportionality between every pair
    of features.  Specifically, the proportionality between two features
    :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly proportional (i.e. strongly correlated).
    A hierarchical clustering is then performed using :math:`p(x, y)`
    as the distance metric.

    This can be useful for constructing principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method, forwarded to
        :func:`scipy.cluster.hierarchy.linkage`.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.  The tips are named
        after the columns of `X`.

    References
    ----------

    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Feature-by-feature distances; presumably the log-ratio variances
    # described above (variation_matrix is defined elsewhere).
    dm = variation_matrix(X)
    # scipy's linkage expects the condensed (upper-triangular) form.
    lm = linkage(dm.condensed_form(), method=method)
    t = TreeNode.from_linkage_matrix(lm, X.columns)
    # Give the internal nodes names (y0, y1, ... in the example above).
    t = rename_internal_nodes(t)
    return t


def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    The hierarchy is built based on the rank values of the features given
    an input vector `r` of ranks. The distance between two features :math:`x`
    and :math:`y` is defined as

    .. math::
       d(x, y) = | r(x) - r(y) |

    Where :math:`r(x)` is the rank of feature :math:`x`.  Hierarchical
    clustering is then performed using :math:`d(x, y)` as the distance
    metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
        Its index supplies the tip names of the resulting tree.
    method : str
        Clustering method, forwarded to
        :func:`scipy.cluster.hierarchy.linkage`.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # The Euclidean distance between two scalar ranks is their absolute
    # difference, which is the metric documented above.
    dm = DistanceMatrix.from_iterable(r, euclidean)
    # scipy's linkage expects the condensed (upper-triangular) form.
    lm = linkage(dm.condensed_form(), method=method)
    t = TreeNode.from_linkage_matrix(lm, r.index)
    # Give the internal nodes names (y0, y1, ... in the example above).
    t = rename_internal_nodes(t)
    return t


def gradient_linkage(X, y, method='average'):
    r"""
    Hierarchical clustering on a known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    value at which :math:`x` was observed is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` is defined as

    .. math::
        d(x, y) = | f(g, x) - f(g, y) |

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method, forwarded to `rank_linkage`.
        (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    See Also
    --------
    mean_niche_estimator
    rank_linkage

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Reconcile the table with the gradient before estimating; presumably
    # this keeps only the samples present in both -- confirm against
    # gneiss.util.match.
    _X, _y = match(X, y)
    # One mean gradient position per feature.
    mean_X = mean_niche_estimator(_X, gradient=_y)
    # Forward the caller's clustering method; it was previously dropped,
    # so 'average' was always used regardless of the argument.
    t = rank_linkage(mean_X, method=method)
    return t


def random_linkage(n):
    """ Generates a tree with random topology.

    Each of the `n` tips is assigned a uniformly random value and the
    tree is obtained by clustering those values with `rank_linkage`.

    Parameters
    ----------
    n : int
        Number of tips (leaves) in the tree.

    Returns
    -------
    skbio.TreeNode
        Random tree for constructing principal balances.

    Examples
    --------
    >>> from gneiss.cluster import random_linkage
    >>> tree = random_linkage(10)

    Notes
    -----
    The tips will be labeled with the strings '0' to 'n-1'.  The topology
    depends on the global NumPy random state; seed it with
    ``np.random.seed`` for reproducible trees.
    """
    # Use the builtin ``str``: the ``np.str`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    index = np.arange(n).astype(str)
    x = pd.Series(np.random.rand(n), index=index)
    t = rank_linkage(x)
    return t