Source code for gneiss.sort
"""
Sort functions (:mod:`gneiss.sort`)
===================================
.. currentmodule:: gneiss.sort
This module contains sorting functions that sort contingency tables
in addition to trees.
Functions
---------
.. autosummary::
:toctree: generated/
mean_niche_estimator
niche_sort
ladderize
gradient_sort
"""
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
import pandas as pd
from functools import partial
from gneiss.util import match
[docs]def mean_niche_estimator(abundances, gradient):
r""" Estimates the mean niche along a gradient of an organism.
Calculates the mean niche of an organism along a gradient.
This is done by calculating the mean gradient values that
an organism is observed in.
Specifically, this module calculates the following
.. math::
f(g , x) =
\sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}
Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
species :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
at sample `i`.
Parameters
----------
abundances : pd.DataFrame or pd.Series, np.float
Vector of fraction abundances of an organism over a list of samples.
gradient : pd.Series, np.float
Vector of numerical gradient values.
Returns
-------
pd.Series or np.float :
The mean gradient that the feature is observed in.
If `abundances` is a `pd.DataFrame` containing the mean gradient
values for each feature. Otherwise a float is returned.
Raises
------
ValueError:
If the length of `abundances` is not the same length as `gradient`.
ValueError:
If the length of `gradient` contains nans.
"""
len_abundances = len(abundances)
len_gradient = len(gradient)
if len_abundances != len_gradient:
raise ValueError("Length of `abundances` (%d) doesn't match the length"
" of the `gradient` (%d)" % (len_abundances,
len_gradient))
if np.any(pd.isnull(gradient)):
raise ValueError("`gradient` cannot have any nans.")
# normalizes the proportions of the organism across all of the
# samples to add to 1.
v = abundances / abundances.sum()
m = np.dot(gradient, v)
if isinstance(abundances, pd.DataFrame):
m = pd.Series(m, index=abundances.columns)
return m
[docs]def niche_sort(table, gradient, niche_estimator=mean_niche_estimator):
""" Sort the table according to estimated niches.
Sorts the table by samples along the gradient
and otus by their estimated niche along the gradient.
Parameters
----------
table : pd.DataFrame
Contingency table where samples are rows and features (i.e. OTUs)
are columns.
gradient : pd.Series
Vector of numerical gradient values.
niche_estimator : function, optional
A function that takes in two pandas series and returns an ordered
object. The ability for the object to be ordered is critical, since
this will allow the table to be sorted according to this ordering.
By default, `mean_niche_estimator` will be used.
Returns
-------
pd.DataFrame :
Sorted table according to the gradient of the samples, and the niches
of the organisms along that gradient.
Raises
------
ValueError :
Raised if `niche_estimator` is not a function.
"""
if not callable(niche_estimator):
raise ValueError("`niche_estimator` is not a function.")
table, gradient = match(table, gradient)
niche_estimator = partial(niche_estimator, gradient=gradient)
# normalizes feature abundances to sum to 1, for each sample.
# (i.e. scales values in each row to sum to 1).
normtable = table.apply(lambda x: x/x.sum(), axis=1)
# calculates estimated niche for each feature
est_niche = normtable.apply(niche_estimator, axis=0)
gradient = gradient.sort_values()
est_niche = est_niche.sort_values()
table = table.reindex(index=gradient.index,
columns=est_niche.index)
return table
def _cache_ntips(tree):
for n in tree.postorder(include_self=True):
if n.is_tip():
n._n_tips = 1
else:
n._n_tips = sum(c._n_tips for c in n.children)
return tree
[docs]def ladderize(tree, ascending=True):
"""
Sorts tree according to the size of the subtrees.
Parameters
----------
tree : skbio.TreeNode
Input tree where leafs correspond to features.
Returns
-------
skbio.TreeNode
A tree whose tips are sorted according to subtree size.
Examples
--------
>>> from skbio import TreeNode
>>> tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;'])
>>> print(tree.ascii_art())
/-a
/c-------|
| \-b
-r-------|
| /-g
| /e-------|
\d-------| \-h
|
\-f
>>> sorted_tree = ladderize(tree)
>>> print(sorted_tree.ascii_art())
/-a
/c-------|
| \-b
-r-------|
| /-f
\d-------|
| /-g
\e-------|
\-h
"""
sorted_tree = tree.copy()
sorted_tree = _cache_ntips(tree)
for n in sorted_tree.postorder(include_self=True):
sizes = [k._n_tips for k in n.children]
idx = np.argsort(sizes)
if not ascending:
idx = idx[::-1]
n.children = [n.children[i] for i in idx]
return sorted_tree
[docs]def gradient_sort(tree, gradient, ascending=True):
"""
Sorts tree according to ordering in gradient.
Parameters
----------
tree : skbio.TreeNode
Input tree where leafs correspond to features
contained in the index in `gradient`.
gradient : pd.Series, numeric
Gradient where the index correspond to feature names.
The index in the gradient must be consistent with
names of the tips in the `tree`.
Returns
-------
skbio.TreeNode
A tree whose tips are sorted along the gradient.
Examples
--------
>>> from skbio import TreeNode
>>> import pandas as pd
>>> tree = TreeNode.read([u'((a,b)c, ((g,h)e,f)d)r;'])
>>> x = pd.Series({'f':3, 'g':1, 'h':2, 'a':4, 'b':5})
>>> print(tree.ascii_art())
/-a
/c-------|
| \-b
-r-------|
| /-g
| /e-------|
\d-------| \-h
|
\-f
>>> res = gradient_sort(tree, x)
>>> print(res.ascii_art())
/-g
/e-------|
/d-------| \-h
| |
-r-------| \-f
|
| /-a
\c-------|
\-b
"""
sorted_tree = tree.copy()
if not np.issubdtype(gradient, np.number):
raise ValueError('`gradient` needs to be numeric, not %s' %
gradient.dtype)
# Note that this operation is not optimal
# See https://github.com/biocore/gneiss/issues/58
for n in sorted_tree.postorder(include_self=True):
means = [gradient.loc[list(k.subset())].mean() for k in n.children]
idx = np.argsort(means)
if not ascending:
idx = idx[::-1]
n.children = [n.children[i] for i in idx]
return sorted_tree