# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from gneiss.sort import mean_niche_estimator
from gneiss.util import match
from gneiss.stats.composition import variation_matrix
from skbio import TreeNode, DistanceMatrix
from scipy.spatial.distance import euclidean
from scipy.cluster.hierarchy import linkage
def proportional_linkage(X, method='ward'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering
    based on proportionality.

    The hierarchy is built based on the proportionality between
    any pair of features. Specifically the proportionality between
    two features :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly proportional. A hierarchical clustering is
    then performed using this proportionality as a distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method, forwarded to
        :func:`scipy.cluster.hierarchy.linkage`. (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis. The tips are
        labelled with the columns of `X`.

    Notes
    -----
    The log-ratio in :math:`p(x, y)` is undefined when a feature contains
    zeros, which is why the example below adds a pseudocount to the table
    before clustering.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import proportional_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = proportional_linkage(table+0.1)
    """
    # Pairwise feature distances, i.e. the p(x, y) described above.
    dm = variation_matrix(X)
    # linkage() consumes the condensed (flattened upper-triangle) distances.
    lm = linkage(dm.condensed_form(), method=method)
    # Name the tips of the resulting tree after the feature columns.
    return TreeNode.from_linkage_matrix(lm, X.columns)
def gradient_linkage(X, y, method='average'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering
    on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient. Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample :math:`i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = | f(g, x) - f(g, y) |

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
        `X` and `y` are passed through :func:`gneiss.util.match` before
        the mean gradient values are estimated.
    method : str
        Clustering method, forwarded to
        :func:`scipy.cluster.hierarchy.linkage`. (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis. The tips are
        labelled with the columns of `X`.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    """
    _X, _y = match(X, y)
    # One mean gradient value per feature, i.e. f(g, x) above.
    mean_X = mean_niche_estimator(_X, gradient=_y)

    def _gradient_distance(a, b):
        # Wrap each value in a length-1 sequence: SciPy deprecated 0-d
        # inputs to euclidean() in 1.7 and rejects them from 1.9 on. For
        # scalars the result is still |a - b|.
        # NOTE(review): assumes mean_niche_estimator yields one scalar per
        # feature, as the formula in the docstring describes -- confirm.
        return euclidean([a], [b])

    dm = DistanceMatrix.from_iterable(mean_X, _gradient_distance)
    # linkage() consumes the condensed (flattened upper-triangle) distances.
    lm = linkage(dm.condensed_form(), method=method)
    # Name the tips of the resulting tree after the feature columns.
    return TreeNode.from_linkage_matrix(lm, X.columns)