From efcc69cc260f1e0dacfaeefb55a5419eb64c18f5 Mon Sep 17 00:00:00 2001
From: HereticSK
Date: Wed, 14 Mar 2018 17:52:00 +0800
Subject: [PATCH] API: run Alphalens with returns instead of prices
 (utils.get_clean_factor)

Issue #270

* refactored compute_forward_returns and get_clean_factor_and_forward_returns
* added get_clean_factor
* fixed a bug in compute_forward_returns: call infer_trading_calendar before
  the factor index is changed
---
 alphalens/tests/test_utils.py |   3 +-
 alphalens/utils.py            | 332 +++++++++++++++++++++++++---------
 2 files changed, 246 insertions(+), 89 deletions(-)

diff --git a/alphalens/tests/test_utils.py b/alphalens/tests/test_utils.py
index 6fba4a57..b4a58b6c 100644
--- a/alphalens/tests/test_utils.py
+++ b/alphalens/tests/test_utils.py
@@ -54,8 +54,9 @@ def test_compute_forward_returns(self):
         dr = date_range(start='2015-1-1', end='2015-1-3')
         prices = DataFrame(index=dr, columns=['A', 'B'],
                            data=[[1, 1], [1, 2], [2, 1]])
+        factor = prices.stack()
 
-        fp = compute_forward_returns(prices.index, prices, periods=[1, 2])
+        fp = compute_forward_returns(factor, prices, periods=[1, 2])
 
         ix = MultiIndex.from_product([dr, ['A', 'B']],
                                      names=['date', 'asset'])
diff --git a/alphalens/utils.py b/alphalens/utils.py
index a22b5119..eb069f89 100644
--- a/alphalens/utils.py
+++ b/alphalens/utils.py
@@ -98,7 +98,9 @@ def quantize_factor(factor_data,
         containing the values for a single alpha factor, forward returns for
         each period, the factor quantile/bin that factor value belongs to, and
         (optionally) the group the asset belongs to.
+
         - See full explanation in utils.get_clean_factor_and_forward_returns
+
     quantiles : int or sequence[float]
         Number of equal-sized quantile buckets to use in factor bucketing.
         Alternately sequence of quantiles, allowing non-equal-sized buckets
@@ -177,7 +179,7 @@ def infer_trading_calendar(factor_idx, prices_idx):
     return CustomBusinessDay(weekmask=days_to_keep)
 
 
-def compute_forward_returns(factor_idx,
+def compute_forward_returns(factor,
                             prices,
                             periods=(1, 5, 10),
                             filter_zscore=None):
@@ -187,8 +189,12 @@ def compute_forward_returns(factor_idx,
 
     Parameters
     ----------
-    factor_idx : pd.DatetimeIndex
-        The factor datetimes for which we are computing the forward returns
+    factor : pd.Series - MultiIndex
+        A MultiIndex Series indexed by timestamp (level 0) and asset
+        (level 1), containing the values for a single alpha factor.
+
+        - See full explanation in utils.get_clean_factor_and_forward_returns
+
     prices : pd.DataFrame
         Pricing data to use in forward price calculation.
         Assets as columns, dates as index. Pricing data must
@@ -205,29 +211,45 @@ def compute_forward_returns(factor_idx,
     Returns
     -------
     forward_returns : pd.DataFrame - MultiIndex
-        Forward returns in indexed by date and asset.
-        Separate column for each forward return window.
+        A MultiIndex DataFrame indexed by timestamp (level 0) and asset
+        (level 1), containing the forward returns for assets.
+        Forward returns column names follow the format accepted by
+        pd.Timedelta (e.g. '1D', '30m', '3h15m', '1D1h', etc).
+        'date' index freq property (forward_returns.index.levels[0].freq)
+        will be set to a trading calendar (pandas DateOffset) inferred
+        from the input data (see infer_trading_calendar for more details).
     """
 
-    factor_idx = factor_idx.intersection(prices.index)
+    factor_dateindex = factor.index.levels[0]
+    if factor_dateindex.tz != prices.index.tz:
+        raise NonMatchingTimezoneError("The timezone of 'factor' is not the "
+                                       "same as the timezone of 'prices'. See "
+                                       "the pandas methods tz_localize and "
+                                       "tz_convert.")
+
+    freq = infer_trading_calendar(factor_dateindex, prices.index)
 
-    if len(factor_idx) == 0:
+    factor_dateindex = factor_dateindex.intersection(prices.index)
+
+    if len(factor_dateindex) == 0:
         raise ValueError("Factor and prices indices don't match: make sure "
                          "they have the same convention in terms of datetimes "
                          "and symbol-names")
 
     forward_returns = pd.DataFrame(index=pd.MultiIndex.from_product(
-        [factor_idx, prices.columns], names=['date', 'asset']))
+        [factor_dateindex, prices.columns], names=['date', 'asset']))
 
-    freq = infer_trading_calendar(factor_idx, prices.index)
     forward_returns.index.levels[0].freq = freq
 
-    for period in periods:
-
+    for period in sorted(periods):
         #
         # build forward returns
         #
-        fwdret = prices.pct_change(period).shift(-period).reindex(factor_idx)
+        fwdret = (prices
+                  .pct_change(period)
+                  .shift(-period)
+                  .reindex(factor_dateindex)
+                  )
 
         if filter_zscore is not None:
             mask = abs(fwdret - fwdret.mean()) > (filter_zscore * fwdret.std())
@@ -277,7 +299,7 @@ def demean_forward_returns(factor_data, grouper=None):
     Parameters
     ----------
     factor_data : pd.DataFrame - MultiIndex
-        Forward returns in indexed by date and asset.
+        Forward returns indexed by date and asset.
         Separate column for each forward return window.
     grouper : list
         If True, demean according to group.
@@ -335,6 +357,207 @@ def print_table(table, name=None, fmt=None):
         pd.set_option('display.float_format', prev_option)
 
 
+def get_clean_factor(factor,
+                     forward_returns,
+                     groupby=None,
+                     binning_by_group=False,
+                     quantiles=5,
+                     bins=None,
+                     groupby_labels=None,
+                     max_loss=0.35):
+    """
+    Formats the factor data, forward return data, and group mappings into a
+    DataFrame that contains aligned MultiIndex indices of timestamp and asset.
+    The returned data will be formatted to be suitable for Alphalens functions.
+
+    It is safe to skip a call to this function and still make use of Alphalens
+    functionalities as long as the factor data conforms to the format returned
+    from get_clean_factor_and_forward_returns and documented here.
+
+    Parameters
+    ----------
+    factor : pd.Series - MultiIndex
+        A MultiIndex Series indexed by timestamp (level 0) and asset
+        (level 1), containing the values for a single alpha factor.
+        ::
+            -----------------------------------
+                date    |    asset   |
+            -----------------------------------
+                        |   AAPL     |   0.5
+                        -----------------------
+                        |   BA       |  -1.1
+                        -----------------------
+            2014-01-01  |   CMG      |   1.7
+                        -----------------------
+                        |   DAL      |  -0.1
+                        -----------------------
+                        |   LULU     |   2.7
+                        -----------------------
+
+    forward_returns : pd.DataFrame - MultiIndex
+        A MultiIndex DataFrame indexed by timestamp (level 0) and asset
+        (level 1), containing the forward returns for assets.
+        Forward returns column names must follow the format accepted by
+        pd.Timedelta (e.g. '1D', '30m', '3h15m', '1D1h', etc).
+        'date' index freq property must be set to a trading calendar
+        (pandas DateOffset), see infer_trading_calendar for more details.
+        This information is currently used only in cumulative returns
+        computation
+        ::
+            ---------------------------------------
+                       |       | 1D  | 5D  | 10D
+            ---------------------------------------
+                date   | asset |     |     |
+            ---------------------------------------
+                       | AAPL  | 0.09|-0.01|-0.079
+                       ----------------------------
+                       | BA    | 0.02| 0.06| 0.020
+                       ----------------------------
+            2014-01-01 | CMG   | 0.03| 0.09| 0.036
+                       ----------------------------
+                       | DAL   |-0.02|-0.06|-0.029
+                       ----------------------------
+                       | LULU  |-0.03| 0.05|-0.009
+                       ----------------------------
+
+    groupby : pd.Series - MultiIndex or dict
+        Either a MultiIndex Series indexed by date and asset,
+        containing the period wise group codes for each asset, or
+        a dict of asset to group mappings. If a dict is passed,
+        it is assumed that group mappings are unchanged for the
+        entire time period of the passed factor data.
+    binning_by_group : bool
+        If True, compute quantile buckets separately for each group.
+        This is useful when the factor values range vary considerably
+        across groups so that it is wise to make the binning group relative.
+        You should probably enable this if the factor is intended
+        to be analyzed for a group neutral portfolio.
+    quantiles : int or sequence[float]
+        Number of equal-sized quantile buckets to use in factor bucketing.
+        Alternately sequence of quantiles, allowing non-equal-sized buckets
+        e.g. [0, .10, .5, .90, 1.] or [.05, .5, .95]
+        Only one of 'quantiles' or 'bins' can be not-None
+    bins : int or sequence[float]
+        Number of equal-width (valuewise) bins to use in factor bucketing.
+        Alternately sequence of bin edges allowing for non-uniform bin width
+        e.g. [-4, -2, -0.5, 0, 10]
+        Chooses the buckets to be evenly spaced according to the values
+        themselves. Useful when the factor contains discrete values.
+        Only one of 'quantiles' or 'bins' can be not-None
+    groupby_labels : dict
+        A dictionary keyed by group code with values corresponding
+        to the display name for each group.
+    max_loss : float, optional
+        Maximum percentage (0.00 to 1.00) of factor data dropping allowed,
+        computed comparing the number of items in the input factor index and
+        the number of items in the output DataFrame index.
+        Factor data can be partially dropped due to being flawed itself
+        (e.g. NaNs), not having provided enough price data to compute
+        forward returns for all factor values, or because it is not possible
+        to perform binning.
+        Set max_loss=0 to avoid Exceptions suppression.
+
+    Returns
+    -------
+    merged_data : pd.DataFrame - MultiIndex
+        A MultiIndex DataFrame indexed by date (level 0) and asset (level 1),
+        containing the values for a single alpha factor, forward returns for
+        each period, the factor quantile/bin that factor value belongs to, and
+        (optionally) the group the asset belongs to.
+
+        - forward returns column names follow the format accepted by
+          pd.Timedelta (e.g. '1D', '30m', '3h15m', '1D1h', etc)
+
+        - 'date' index freq property (merged_data.index.levels[0].freq) is the
+          same as that of the input forward returns data. This is currently
+          used only in cumulative returns computation
+        ::
+            -------------------------------------------------------------------
+                       |       | 1D  | 5D  | 10D  |factor|group|factor_quantile
+            -------------------------------------------------------------------
+                date   | asset |     |     |      |      |     |
+            -------------------------------------------------------------------
+                       | AAPL  | 0.09|-0.01|-0.079|  0.5 |  G1 |      3
+                       --------------------------------------------------------
+                       | BA    | 0.02| 0.06| 0.020| -1.1 |  G2 |      5
+                       --------------------------------------------------------
+            2014-01-01 | CMG   | 0.03| 0.09| 0.036|  1.7 |  G2 |      1
+                       --------------------------------------------------------
+                       | DAL   |-0.02|-0.06|-0.029| -0.1 |  G3 |      5
+                       --------------------------------------------------------
+                       | LULU  |-0.03| 0.05|-0.009|  2.7 |  G1 |      2
+                       --------------------------------------------------------
+    """
+
+    initial_amount = float(len(factor.index))
+
+    factor = factor.copy()
+    factor.index = factor.index.rename(['date', 'asset'])
+
+    merged_data = forward_returns.copy()
+    merged_data['factor'] = factor
+
+    if groupby is not None:
+        if isinstance(groupby, dict):
+            diff = set(factor.index.get_level_values(
+                'asset')) - set(groupby.keys())
+            if len(diff) > 0:
+                raise KeyError(
+                    "Assets {} not in group mapping".format(
+                        list(diff)))
+
+            ss = pd.Series(groupby)
+            groupby = pd.Series(index=factor.index,
+                                data=ss[factor.index.get_level_values(
+                                    'asset')].values)
+
+        if groupby_labels is not None:
+            diff = set(groupby.values) - set(groupby_labels.keys())
+            if len(diff) > 0:
+                raise KeyError(
+                    "groups {} not in passed group names".format(
+                        list(diff)))
+
+            sn = pd.Series(groupby_labels)
+            groupby = pd.Series(index=factor.index,
+                                data=sn[groupby.values].values)
+
+        merged_data['group'] = groupby.astype('category')
+
+    merged_data = merged_data.dropna()
+
+    fwdret_amount = float(len(merged_data.index))
+
+    no_raise = False if max_loss == 0 else True
+    merged_data['factor_quantile'] = quantize_factor(merged_data,
+                                                     quantiles,
+                                                     bins,
+                                                     binning_by_group,
+                                                     no_raise)
+
+    merged_data = merged_data.dropna()
+
+    binning_amount = float(len(merged_data.index))
+
+    tot_loss = (initial_amount - binning_amount) / initial_amount
+    fwdret_loss = (initial_amount - fwdret_amount) / initial_amount
+    bin_loss = tot_loss - fwdret_loss
+
+    print("Dropped %.1f%% entries from factor data: %.1f%% in forward "
+          "returns computation and %.1f%% in binning phase "
+          "(set max_loss=0 to see potentially suppressed Exceptions)." %
+          (tot_loss * 100, fwdret_loss * 100, bin_loss * 100))
+
+    if tot_loss > max_loss:
+        message = ("max_loss (%.1f%%) exceeded %.1f%%, consider increasing it."
+                   % (max_loss * 100, tot_loss * 100))
+        raise MaxLossExceededError(message)
+    else:
+        print("max_loss is %.1f%%, not exceeded: OK!" % (max_loss * 100))
+
+    return merged_data
+
+
 def get_clean_factor_and_forward_returns_api_change_warning(func):
     """
     Decorator used to help API transition: maintain the function backward
@@ -522,83 +745,16 @@ def get_clean_factor_and_forward_returns(factor,
         --------------------------------------------------------
     """
 
-    if factor.index.levels[0].tz != prices.index.tz:
-        raise NonMatchingTimezoneError("The timezone of 'factor' is not the "
-                                       "same as the timezone of 'prices'. See "
See " - "the pandas methods tz_localize and " - "tz_convert.") - - periods = sorted(periods) - - initial_amount = float(len(factor.index)) - - factor = factor.copy() - factor.index = factor.index.rename(['date', 'asset']) - factor_dateindex = factor.index.get_level_values('date').unique() - - merged_data = compute_forward_returns(factor_dateindex, prices, periods, - filter_zscore) - merged_data['factor'] = factor - - if groupby is not None: - if isinstance(groupby, dict): - diff = set(factor.index.get_level_values( - 'asset')) - set(groupby.keys()) - if len(diff) > 0: - raise KeyError( - "Assets {} not in group mapping".format( - list(diff))) - - ss = pd.Series(groupby) - groupby = pd.Series(index=factor.index, - data=ss[factor.index.get_level_values( - 'asset')].values) - - if groupby_labels is not None: - diff = set(groupby.values) - set(groupby_labels.keys()) - if len(diff) > 0: - raise KeyError( - "groups {} not in passed group names".format( - list(diff))) - - sn = pd.Series(groupby_labels) - groupby = pd.Series(index=factor.index, - data=sn[groupby.values].values) - - merged_data['group'] = groupby.astype('category') + forward_returns = compute_forward_returns(factor, prices, periods, + filter_zscore) - merged_data = merged_data.dropna() + factor_data = get_clean_factor(factor, forward_returns, groupby=groupby, + groupby_labels=groupby_labels, + quantiles=quantiles, bins=bins, + binning_by_group=binning_by_group, + max_loss=max_loss) - fwdret_amount = float(len(merged_data.index)) - - no_raise = False if max_loss == 0 else True - merged_data['factor_quantile'] = quantize_factor(merged_data, - quantiles, - bins, - binning_by_group, - no_raise) - - merged_data = merged_data.dropna() - - binning_amount = float(len(merged_data.index)) - - tot_loss = (initial_amount - binning_amount) / initial_amount - fwdret_loss = (initial_amount - fwdret_amount) / initial_amount - bin_loss = tot_loss - fwdret_loss - - print("Dropped %.1f%% entries from factor data: %.1f%% in forward " - "returns computation and %.1f%% in binning phase " - "(set max_loss=0 to see potentially suppressed Exceptions)." % - (tot_loss * 100, fwdret_loss * 100, bin_loss * 100)) - - if tot_loss > max_loss: - message = ("max_loss (%.1f%%) exceeded %.1f%%, consider increasing it." - % (max_loss * 100, tot_loss * 100)) - raise MaxLossExceededError(message) - else: - print("max_loss is %.1f%%, not exceeded: OK!" % (max_loss * 100)) - - return merged_data + return factor_data def rate_of_return(period_ret, base_period):