# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
import pandas as pd
from gneiss.sort import mean_niche_estimator
from gneiss.util import match, rename_internal_nodes
from gneiss.composition._variance import variation_matrix
from skbio import TreeNode, DistanceMatrix
from scipy.spatial.distance import euclidean
from scipy.cluster.hierarchy import linkage
def correlation_linkage(X, method='ward'):
    r"""
    Hierarchical Clustering based on proportionality.

    The hierarchy is built based on the correlation between
    any two pairs of features. Specifically the correlation between
    two features :math:`x` and :math:`y` is measured by

    .. math::
        p(x, y) = var (\ln \frac{x}{y})

    If :math:`p(x, y)` is very small, then :math:`x` and :math:`y`
    are said to be highly correlated. A hierarchical clustering is
    then performed using this correlation as a distance metric.

    This can be useful for constructing principal balances [1]_.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    method : str
        Clustering method. (default='ward')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    References
    ----------
    .. [1] Pawlowsky-Glahn V, Egozcue JJ, and Tolosana-Delgado R.
       Principal Balances (2011).

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import correlation_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> tree = correlation_linkage(table+0.1)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Pairwise log-ratio variances serve as the distance matrix; small
    # variance means the two features track each other closely.
    dm = variation_matrix(X)
    lm = linkage(dm.condensed_form(), method=method)
    t = TreeNode.from_linkage_matrix(lm, X.columns)
    # Give internal nodes deterministic names (y0, y1, ...) for balances.
    t = rename_internal_nodes(t)
    return t
def rank_linkage(r, method='average'):
    r""" Hierarchical Clustering on feature ranks.

    The hierarchy is built based on the rank values of the features given
    an input vector `r` of ranks. The distance between two features :math:`x`
    and :math:`y` can be defined as

    .. math::
        d(x, y) = (r(x) - r(y))^2

    Where :math:`r(x)` is the rank of the features. Hierarchical clustering is
    then performed using :math:`d(x, y)` as the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
    method : str
        Clustering method. (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Euclidean distance on a 1-D rank vector reduces to |r(x) - r(y)|,
    # which induces the same hierarchy as the squared distance above.
    dm = DistanceMatrix.from_iterable(r, euclidean)
    lm = linkage(dm.condensed_form(), method)
    t = TreeNode.from_linkage_matrix(lm, r.index)
    t = rename_internal_nodes(t)
    return t
def gradient_linkage(X, y, method='average'):
    r"""
    Hierarchical Clustering on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient. Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is then performed using :math:`d(x, y)` as
    the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector representing some ordering of the samples in X.
    method : str
        Clustering method. (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    See Also
    --------
    mean_niche_estimator

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import gradient_linkage
    >>> table = pd.DataFrame([[1, 1, 0, 0, 0],
    ...                       [0, 1, 1, 0, 0],
    ...                       [0, 0, 1, 1, 0],
    ...                       [0, 0, 0, 1, 1]],
    ...                      columns=['s1', 's2', 's3', 's4', 's5'],
    ...                      index=['o1', 'o2', 'o3', 'o4']).T
    >>> gradient = pd.Series([1, 2, 3, 4, 5],
    ...                      index=['s1', 's2', 's3', 's4', 's5'])
    >>> tree = gradient_linkage(table, gradient)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # Align samples between the table and the gradient before estimating.
    _X, _y = match(X, y)
    # Each feature is reduced to its abundance-weighted mean gradient
    # position; clustering on those scalars is delegated to rank_linkage.
    mean_X = mean_niche_estimator(_X, gradient=_y)
    t = rank_linkage(mean_X)
    return t
def random_linkage(n):
    """ Generates a tree with random topology.

    Parameters
    ----------
    n : int
        Number of nodes in the tree

    Returns
    -------
    skbio.TreeNode
        Random tree for constructing principal balances.

    Examples
    --------
    >>> from gneiss.cluster import random_linkage
    >>> tree = random_linkage(10)

    Notes
    -----
    The nodes will be labeled from 0 to n-1.
    """
    # np.str was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # str produces the identical string dtype here.
    index = np.arange(n).astype(str)
    # Random ranks induce a random topology via rank-based clustering.
    x = pd.Series(np.random.rand(n), index=index)
    t = rank_linkage(x)
    return t