Source code for gneiss.regression._ols

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from collections import OrderedDict
import numpy as np
import pandas as pd
from gneiss.regression._model import RegressionModel
from gneiss.util import _type_cast_to_float
from gneiss.balances import balance_basis
from skbio.stats.composition import ilr_inv

from statsmodels.iolib.summary2 import Summary
from statsmodels.sandbox.tools.cross_val import LeaveOneOut
from patsy import dmatrix
from scipy import stats


def ols(formula, table, metadata):
    """ Ordinary Least Squares applied to balances.

    An ordinary least squares (OLS) regression is a method for estimating
    parameters in a linear regression model.  OLS is a common statistical
    technique for fitting and testing the effects of covariates on a
    response.  This implementation is focused on performing a multivariate
    response regression where the response is a matrix of balances (`table`)
    and the covariates (`metadata`) are made up of external variables.

    Global statistical tests indicating goodness of fit and contributions
    from covariates can be accessed from a coefficient of determination
    (`r2`), leave-one-variable-out cross validation (`lovo`), leave-one-out
    cross validation (`loo`) and k-fold cross validation (`kfold`).
    In addition, residuals (`residuals`) can be accessed for diagnostic
    purposes.  T-statistics (`tvalues`) and p-values (`pvalues`) can be
    obtained to evaluate the statistical significance of a covariate for a
    given balance.  Predictions on the resulting model can be made using
    `predict`, and these results can be interpreted as either balances or
    proportions.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R and
        statsmodels.  Note that the dependent variable in this string should
        not be specified, since this method will be run on each of the
        individual balances.  See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        balances correspond to columns.
    metadata : pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object.  Samples correspond to rows and covariates
        correspond to columns.

    Returns
    -------
    OLSModel
        Container object that holds information about the overall fit.
        This includes information about coefficients, pvalues, residuals
        and coefficient of determination from the resulting regression.

    Example
    -------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skbio import TreeNode
    >>> from gneiss.regression import ols

    Here, we will define a table of balances as follows

    >>> np.random.seed(0)
    >>> n = 100
    >>> g1 = np.linspace(0, 15, n)
    >>> y1 = g1 + 5
    >>> y2 = -g1 - 2
    >>> Y = pd.DataFrame({'y1': y1, 'y2': y2})

    Once we have the balances defined, we will add some errors

    >>> e = np.random.normal(loc=1, scale=0.1, size=(n, 2))
    >>> Y = Y + e

    Now we will define the environment variables that we want to
    regress against the balances.

    >>> X = pd.DataFrame({'g1': g1})

    Once these variables are defined, a regression can be performed, where
    the covariate `g1` is regressed against each of the balances in `Y`
    within a single model.

    >>> res = ols('g1', Y, X)
    >>> res.fit()

    From the summary results of the `ols` function, we can view the
    pvalues according to how well each individual balance fitted in the
    regression model.

    >>> res.pvalues
                          y1             y2
    Intercept  8.826379e-148   7.842085e-71
    g1         1.923597e-163  1.277152e-163

    We can also view the balance coefficients estimated in the regression
    model.  These coefficients can also be viewed as proportions by passing
    `project=True` as an argument in `res.coefficients()`.

    >>> res.coefficients()
                     y1        y2
    Intercept  6.016459 -0.983476
    g1         0.997793 -1.000299

    The overall model fit can be obtained as follows

    >>> res.r2
    0.99945903186495066
    """
    # one-time creation of exogenous data matrix allows for faster run-time
    metadata = _type_cast_to_float(metadata.copy())
    x = dmatrix(formula, metadata, return_type='dataframe')
    ilr_table, x = table.align(x, join='inner', axis=0)
    return OLSModel(Y=ilr_table, Xs=x)
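
# The helper below is a minimal, self-contained sketch of how `ols` might be
# driven end to end; it mirrors the docstring example above.  The name
# `_example_ols_fit` and the toy data are illustrative assumptions only, not
# part of the gneiss API.
def _example_ols_fit():
    # build two noisy balances that respond linearly to a single covariate
    np.random.seed(0)
    n = 100
    g1 = np.linspace(0, 15, n)
    Y = pd.DataFrame({'y1': g1 + 5, 'y2': -g1 - 2})
    Y = Y + np.random.normal(loc=1, scale=0.1, size=(n, 2))
    X = pd.DataFrame({'g1': g1})

    # regress `g1` against each balance and estimate the coefficients
    res = ols('g1', Y, X)
    res.fit()
    return res.pvalues, res.r2
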
class OLSModel(RegressionModel):
    """ Summary object for storing ordinary least squares results.

    An `OLSModel` object stores information about the individual balances
    used in the regression, the coefficients and the residuals.  This object
    can be used to perform predictions.  In addition, summary statistics such
    as the coefficient of determination for the overall fit can be
    calculated.

    Attributes
    ----------
    submodels : list of statsmodels objects
        List of statsmodels result objects.
    balances : pd.DataFrame
        A table of balances where samples are rows and
        balances are columns.  These balances were calculated
        using `tree`.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def fit(self, **kwargs):
        """ Fit the ordinary least squares model.

        Here, the coefficients of the model are estimated.  In addition,
        summary statistics such as residuals, t-statistics, pvalues and the
        coefficient of determination are calculated.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments used to tune the parameter estimation.
        """
        Y = self.response_matrix
        X = self.design_matrices
        n, p = X.shape
        # solve the least squares problem via the pseudoinverse
        inv = np.linalg.pinv(np.dot(X.T, X))
        cross = np.dot(inv, X.T)
        beta = np.dot(cross, Y)
        # fitted values and residuals
        pX = np.dot(X, beta)
        resid = (Y - pX)
        sst = (Y - Y.mean(axis=0))
        sse = (resid**2).sum(axis=0)

        # per-balance sums of squares
        sst_balance = ((Y - Y.mean(axis=0))**2).sum(axis=0)
        sse_balance = (resid**2).sum(axis=0)
        ssr_balance = (sst_balance - sse_balance)

        df_resid = n - p + 1
        mse = sse / df_resid
        self._mse = mse

        # t tests
        # TODO: Look into more robust ways to calculate cov
        cov = np.linalg.pinv(np.dot(X.T, X))
        bse = np.sqrt(np.outer(np.diag(cov), mse))
        tvalues = np.divide(beta, bse)
        pvals = stats.t.sf(np.abs(tvalues), df_resid) * 2

        self._tvalues = pd.DataFrame(tvalues, index=X.columns,
                                     columns=Y.columns)
        self._pvalues = pd.DataFrame(pvals, index=X.columns,
                                     columns=Y.columns)
        self._beta = pd.DataFrame(beta, index=X.columns,
                                  columns=Y.columns)
        self._resid = pd.DataFrame(resid, index=Y.index,
                                   columns=Y.columns)
        self._fitted = True
        self._ess = ssr_balance
        self._r2 = 1 - ((resid**2).values.sum() / (sst**2).values.sum())

    def predict(self, X=None, tree=None, **kwargs):
        """ Performs a prediction based on the model.

        Parameters
        ----------
        X : pd.DataFrame, optional
            Input table of covariates, where columns are covariates, and
            rows are samples.  If not specified, then the fitted values
            calculated from training the model will be returned.
        tree : skbio.TreeNode, optional
            The tree used to perform the ilr transformation.  If this is
            specified, then the prediction will be represented as
            proportions.  Otherwise, if this is not specified, the
            prediction will be represented as balances.  (default: None).
        **kwargs : dict
            Other arguments to be passed into the model prediction.

        Returns
        -------
        pd.DataFrame
            A table of predicted values, where rows are samples and columns
            are balances.  If `tree` is specified, then the columns are
            proportions.
        """
        if not self._fitted:
            raise ValueError('Model not fitted - coefficients not '
                             'calculated. See `fit()`')
        if X is None:
            X = self.design_matrices
        prediction = X.dot(self._beta)
        if tree is not None:
            basis, _ = balance_basis(tree)
            proj_prediction = ilr_inv(prediction.values, basis=basis)
            ids = [n.name for n in tree.tips()]
            return pd.DataFrame(proj_prediction,
                                columns=ids,
                                index=prediction.index)
        else:
            return prediction

    @property
    def pvalues(self):
        """ Return pvalues from each of the coefficients in the fit. """
        return self._pvalues

    @property
    def tvalues(self):
        """ Return t-statistics from each of the coefficients in the fit. """
        return self._tvalues

    @property
    def r2(self):
        """ Coefficient of determination for the overall fit. """
        return self._r2

    @property
    def mse(self):
        """ Mean sum of squares error. """
        return self._mse

    @property
    def ess(self):
        """ Explained sum of squares. """
        return self._ess

    def summary(self, kfolds, lovo):
        """ Summarize the Ordinary Least Squares Regression Results.

        Parameters
        ----------
        kfolds : pd.DataFrame
            Results from k-fold cross-validation.
        lovo : pd.DataFrame
            Results from leave-one-variable-out cross-validation.

        Returns
        -------
        str :
            This holds the summary of regression coefficients and fit
            information.
""" _r2 = self.r2 self.params = self._beta # number of observations self.nobs = self.response_matrix.shape[0] self.model = None # Start filling in summary information smry = Summary() # Top results info = OrderedDict() info["No. Observations"] = self.nobs info["Model:"] = "OLS" info["Rsquared: "] = _r2 # TODO: Investigate how to properly resize the tables smry.add_dict(info, ncols=1) smry.add_title("Simplicial Least Squares Results") smry.add_df(lovo, align='l') smry.add_df(kfolds, align='l') return smry def kfold(self, num_folds=10, **kwargs): """ K-fold cross-validation. Performs k-fold cross-validation by spliting the data into k partitions, building a model on k-1 partitions and evaluating the predictions on the remaining partition. This process is performed k times. Parameters ---------- num_folds: int, optional The number of partitions used for the cross validation. **kwargs : dict Keyword arguments used to tune the parameter estimation. Returns ------- pd.DataFrame model_mse : np.array, float The within model mean sum of squares error for each iteration of the cross validation. Rsquared : np.array, float The Rsquared of the model fitted on training data. pred_mse : np.array, float Prediction mean sum of squares error for each iteration of the cross validation. """ # number of observations (i.e. samples) nobs = self.response_matrix.shape[0] s = nobs // num_folds folds = [np.arange(i*s, ((i*s)+s) % nobs) for i in range(num_folds)] results = pd.DataFrame(index=['fold_%d' % i for i in range(num_folds)], columns=['model_mse', 'Rsquared', 'pred_mse'], dtype=np.float64) for k in range(num_folds): test = folds[k] train = np.hstack(folds[:k] + folds[k+1:]) res_i = OLSModel(self.response_matrix.iloc[train], self.design_matrix.iloc[train]) res_i.fit(**kwargs) # model error p = res_i.predict(X=self.design_matrix.iloc[train]).values r = self.response_matrix.iloc[train].values model_resid = ((p - r)**2) model_mse = np.mean(model_resid.sum(axis=0)) results.loc['fold_%d' % k, 'model_mse'] = model_mse results.loc['fold_%d' % k, 'Rsquared'] = res_i.r2 # prediction error p = res_i.predict(X=self.design_matrix.iloc[test]).values r = self.response_matrix.iloc[test].values pred_resid = ((p - r)**2) pred_mse = np.mean(pred_resid.sum(axis=0)) results.loc['fold_%d' % k, 'pred_mse'] = pred_mse return results def loo(self, **kwargs): """ Leave one out cross-validation. Calculates summary statistics for each iteraction of leave one out cross-validation, specially `mse` on entire model and `pred_err` to measure prediction error. Parameters ---------- **kwargs : dict Keyword arguments used to tune the parameter estimation. Returns ------- pd.DataFrame model_mse : np.array, float Mean sum of squares error for each iteration of the cross validation. pred_mse : np.array, float Prediction mean sum of squares error for each iteration of the cross validation. See Also -------- fit statsmodels.regression.linear_model. """ # number of observations (i.e. 
        nobs = self.response_matrix.shape[0]
        cv_iter = LeaveOneOut(nobs)
        results = pd.DataFrame(index=self.response_matrix.index,
                               columns=['model_mse', 'pred_mse'],
                               dtype=np.float64)

        for i, (train, test) in enumerate(cv_iter):
            sample_id = self.response_matrix.index[i]

            res_i = OLSModel(self.response_matrix.iloc[train],
                             self.design_matrix.iloc[train])
            res_i.fit(**kwargs)

            # model error
            predicted = res_i.predict(X=self.design_matrix.iloc[train])
            r = self.response_matrix.iloc[train].values
            p = predicted.values
            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[sample_id, 'model_mse'] = model_mse

            # prediction error
            predicted = res_i.predict(X=self.design_matrix.iloc[test])
            r = self.response_matrix.iloc[test].values
            p = predicted.values
            pred_resid = ((r - p)**2)
            pred_mse = np.mean(pred_resid.sum(axis=0))
            results.loc[sample_id, 'pred_mse'] = pred_mse

        return results

    def lovo(self, **kwargs):
        """ Leave-one-variable-out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-variable-out cross-validation, specifically `r2` and
        `mse` on the entire model.  This technique is particularly useful
        for feature selection.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
            mse : np.array, float
                Mean sum of squares error for each iteration of the
                cross validation.
            Rsquared : np.array, float
                Coefficient of determination for each variable left out.
            R2diff : np.array, float
                Decrease in Rsquared for each variable left out.
        """
        cv_iter = LeaveOneOut(len(self.design_matrix.columns))
        results = pd.DataFrame(index=self.design_matrix.columns,
                               columns=['mse', 'Rsquared', 'R2diff'],
                               dtype=np.float64)
        for i, (inidx, outidx) in enumerate(cv_iter):
            feature_id = self.design_matrix.columns[i]

            res_i = OLSModel(Y=self.response_matrix,
                             Xs=self.design_matrix.iloc[:, inidx])
            res_i.fit(**kwargs)
            predicted = res_i.predict()
            r = self.response_matrix.values
            p = predicted.values

            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[feature_id, 'mse'] = model_mse
            results.loc[feature_id, 'Rsquared'] = res_i.r2
            results.loc[feature_id, 'R2diff'] = self.r2 - res_i.r2
        return results
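
# A minimal sketch of the cross-validation diagnostics defined above, guarded
# so it only runs when this module is executed directly.  The toy data and
# variable names are illustrative assumptions, not part of the library.
if __name__ == "__main__":
    np.random.seed(0)
    n = 50
    g1 = np.linspace(0, 15, n)
    Y = pd.DataFrame({'y1': g1 + 5, 'y2': -g1 - 2})
    Y = Y + np.random.normal(loc=1, scale=0.1, size=(n, 2))
    X = pd.DataFrame({'g1': g1})

    res = ols('g1', Y, X)
    res.fit()

    # cross-validation diagnostics: k-fold, leave-one-out and
    # leave-one-variable-out
    kfolds = res.kfold(num_folds=5)
    lovo = res.lovo()
    print(res.loo().head())

    # the summary combines the k-fold and leave-one-variable-out tables
    print(res.summary(kfolds, lovo))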