Source code for gneiss.cluster._pba

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from gneiss.sort import mean_niche_estimator
from gneiss.util import match
from gneiss.stats.composition import variation_matrix

from skbio import TreeNode, DistanceMatrix
from scipy.spatial.distance import euclidean
from scipy.cluster.hierarchy import linkage


def proportional_linkage(X, method='ward'):
    r""" Principal Balance Analysis using Hierarchical Clustering
    based on proportionality.

    The hierarchy is built based on the proportionality between any two
    pairs of features.  Specifically, the proportionality between two
    features :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = \mathrm{var}\left(\ln \frac{x}{y}\right)

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y` are said
    to be highly proportional.  A hierarchical clustering is then performed
    using this proportionality as a distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method.  (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import proportional_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = proportional_linkage(table+0.1)
    """
    dm = variation_matrix(X)
    lm = linkage(dm.condensed_form(), method=method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
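

# Illustrative sketch, not part of the original gneiss module: it unrolls the
# steps inside proportional_linkage (log-ratio variation matrix -> condensed
# distances -> linkage -> tree) on the doctest table above, assuming gneiss,
# scipy and scikit-bio are installed.  The helper name is hypothetical, and
# the 0.1 pseudocount mirrors the doctest to avoid taking logs of zero counts.
def _proportional_linkage_sketch():
    import pandas as pd
    table = pd.DataFrame([[1, 1, 0, 0, 0],
                          [0, 1, 1, 0, 0],
                          [0, 0, 1, 1, 0],
                          [0, 0, 0, 1, 1]],
                         columns=['s1', 's2', 's3', 's4', 's5'],
                         index=['o1', 'o2', 'o3', 'o4']).T
    dm = variation_matrix(table + 0.1)            # pairwise var(ln(x/y))
    lm = linkage(dm.condensed_form(), method='ward')
    tree = TreeNode.from_linkage_matrix(lm, table.columns)
    return tree.ascii_art()                       # text rendering of the hierarchy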


def gradient_linkage(X, y, method='average'):
    r""" Principal Balance Analysis using Hierarchical Clustering
    on a known gradient.

    The hierarchy is built based on the values of the samples located along
    a gradient.  Given a feature :math:`x`, the mean gradient value at which
    :math:`x` was observed is calculated by

    .. math::
        f(g, x) = \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    where :math:`N` is the number of samples, :math:`x_i` is the proportion
    of feature :math:`x` in sample :math:`i`, and :math:`g_i` is the gradient
    value at sample :math:`i`.

    The distance between two features :math:`x` and :math:`y` is defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y` are
    expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    """
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    return TreeNode.from_linkage_matrix(lm, X.columns)
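

# Illustrative sketch, not part of the original module: it reproduces the
# gradient_linkage pipeline step by step on the doctest data -- match samples
# between the table and the gradient, estimate each feature's mean gradient
# position, build a euclidean distance matrix over those positions, and
# cluster with average linkage.  The helper name is hypothetical.
def _gradient_linkage_sketch():
    import pandas as pd
    table = pd.DataFrame([[1, 1, 0, 0, 0],
                          [0, 1, 1, 0, 0],
                          [0, 0, 1, 1, 0],
                          [0, 0, 0, 1, 1]],
                         columns=['s1', 's2', 's3', 's4', 's5'],
                         index=['o1', 'o2', 'o3', 'o4']).T
    gradient = pd.Series([1, 2, 3, 4, 5],
                         index=['s1', 's2', 's3', 's4', 's5'])
    _X, _y = match(table, gradient)                   # align samples between inputs
    mean_X = mean_niche_estimator(_X, gradient=_y)    # mean gradient value per feature
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method='average')
    return TreeNode.from_linkage_matrix(lm, _X.columns)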