Source code for gneiss.regression._summary

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the GPLv3 License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import pandas as pd
from skbio.stats.composition import ilr_inv, clr_inv
import pickle
from skbio import TreeNode


[docs]class RegressionResults(): """ Summary object for storing regression results. A `RegressionResults` object stores information about the individual balances used in the regression, the coefficients, residuals. This object can be used to perform predictions. In addition, summary statistics such as the coefficient of determination for the overall fit can be calculated. Parameters ---------- stat_results : list, sm.RegressionResults List of RegressionResults objects. feature_names : array_like, str, optional List of original names for features that are found in `tree`. basis : np.array, optional Orthonormal basis in the Aitchison simplex. If this is not specified, then `project` cannot be enabled in `coefficients` or `predict`. balances : np.array, optional A table of balances where samples are rows and balances are columns. These balances were calculated using `tree`. tree : skbio.TreeNode Bifurcating tree that defines `basis` Methods ------- """
[docs] def __init__(self, stat_results, feature_names=None, basis=None, balances=None, tree=None): self.feature_names = feature_names # basis is now handled differently # https://github.com/biocore/scikit-bio/pull/1396 if basis is not None: self.basis = clr_inv(basis) else: self.basis = basis self.results = stat_results self._tree = str(tree) self.balances = balances # obtain pvalues self.pvalues = pd.DataFrame() for r in self.results: p = r.pvalues p.name = r.model.endog_names self.pvalues = self.pvalues.append(p)
@property def r2(self): """ Coefficient of determination for overall fit""" # Reason why we wanted to move this out was because not # all types of statsmodels regressions have this property. # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. # sum of squares regression. Also referred to as # explained sum of squares. ssr = sum([r.ess for r in self.results]) # sum of squares error. Also referred to as sum of squares residuals sse = sum([r.ssr for r in self.results]) # calculate the overall coefficient of determination (i.e. R2) sst = sse + ssr return 1 - sse / sst def _check_projection(self, project): """ Checks to make sure that the `ilr_inv` can be performed. Parameters ---------- project : bool Specifies if a projection into the Aitchison simplex can be performed. Raises ------ ValueError: Cannot perform projection into Aitchison simplex if `basis` is not specified. ValueError: Cannot perform projection into Aitchison simplex if `feature_names` is not specified. """ if self.basis is None and project: raise ValueError("Cannot perform projection into Aitchison simplex" " if `basis` is not specified.") if self.feature_names is None and project: raise ValueError("Cannot perform projection into Aitchison simplex" " if `feature_names` is not specified.") def coefficients(self, project=False): """ Returns coefficients from fit. Parameters ---------- project : bool, optional Specifies if coefficients should be projected back into the Aitchison simplex [1]_. If false, the coefficients will be represented as balances (default: False). Returns ------- pd.DataFrame A table of values where columns are coefficients, and the index is either balances or proportions, depending on the value of `project`. Raises ------ ValueError: Cannot perform projection into Aitchison simplex if `basis` is not specified. ValueError: Cannot perform projection into Aitchison simplex if `feature_names` is not specified. References ---------- .. [1] Aitchison, J. "A concise guide to compositional data analysis, CDA work." Girona 24 (2003): 73-81. """ self._check_projection(project) coef = pd.DataFrame() for r in self.results: c = r.params c.name = r.model.endog_names coef = coef.append(c) if project: # `check=False`, due to a problem with error handling # addressed here https://github.com/biocore/scikit-bio/pull/1396 # This will need to be fixed here: # https://github.com/biocore/gneiss/issues/34 c = ilr_inv(coef.values.T, basis=self.basis, check=False).T return pd.DataFrame(c, index=self.feature_names, columns=coef.columns) else: return coef def residuals(self, project=False): """ Returns calculated residuals. Parameters ---------- X : pd.DataFrame, optional Input table of covariates. If not specified, then the fitted values calculated from training the model will be returned. project : bool, optional Specifies if coefficients should be projected back into the Aitchison simplex [1]_. If false, the coefficients will be represented as balances (default: False). Returns ------- pd.DataFrame A table of values where rows are samples, and the columns are either balances or proportions, depending on the value of `project`. References ---------- .. [1] Aitchison, J. "A concise guide to compositional data analysis, CDA work." Girona 24 (2003): 73-81. """ self._check_projection(project) resid = pd.DataFrame() for r in self.results: err = r.resid err.name = r.model.endog_names resid = resid.append(err) if project: # `check=False`, due to a problem with error handling # addressed here https://github.com/biocore/scikit-bio/pull/1396 # This will need to be fixed here: # https://github.com/biocore/gneiss/issues/34 proj_resid = ilr_inv(resid.values.T, basis=self.basis, check=False).T return pd.DataFrame(proj_resid, index=self.feature_names, columns=resid.columns).T else: return resid.T def predict(self, X=None, project=False, **kwargs): """ Performs a prediction based on model. Parameters ---------- X : pd.DataFrame, optional Input table of covariates, where columns are covariates, and rows are samples. If not specified, then the fitted values calculated from training the model will be returned. project : bool, optional Specifies if coefficients should be projected back into the Aitchison simplex [1]_. If false, the coefficients will be represented as balances (default: False). **kwargs : dict Other arguments to be passed into the model prediction. Returns ------- pd.DataFrame A table of values where rows are coefficients, and the columns are either balances or proportions, depending on the value of `project`. References ---------- .. [1] Aitchison, J. "A concise guide to compositional data analysis, CDA work." Girona 24 (2003): 73-81. """ self._check_projection(project) prediction = pd.DataFrame() for m in self.results: # check if X is none. p = pd.Series(m.predict(X, **kwargs)) p.name = m.model.endog_names if X is not None: p.index = X.index else: p.index = m.fittedvalues.index prediction = prediction.append(p) if project: # `check=False`, due to a problem with error handling # addressed here https://github.com/biocore/scikit-bio/pull/1396 # This will need to be fixed here: # https://github.com/biocore/gneiss/issues/34 proj_prediction = ilr_inv(prediction.values.T, basis=self.basis, check=False) return pd.DataFrame(proj_prediction, columns=self.feature_names, index=prediction.columns) return prediction.T @property def tree(self): """ Bifurcating tree used to calculate ilr transform.""" return TreeNode.read([self._tree]) @classmethod def read_pickle(self, filename): """ Reads RegressionResults object from pickle file. Parameters ---------- filename : str Input file to unpickle. Returns ------- RegressionResults Notes ----- Warning: Loading pickled data received from untrusted sources can be unsafe. See: https://wiki.python.org/moin/UsingPickle """ with open(filename, 'rb') as fh: res = pickle.load(fh) return res def write_pickle(self, filename): """ Writes RegressionResults object to pickle file. Parameters ---------- filename : str Output file to store pickled object. """ with open(filename, 'wb') as fh: pickle.dump(self, fh)