0.9.59 新增 rolling layers

waditu · Sep 13, 2024 · 1c35040 · 1c35040
1 parent d20d48a
commit 1c35040
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 0 deletions.
diff --git a/czsc/__init__.py b/czsc/__init__.py
@@ -155,6 +155,7 @@
     show_strategies_recent,
     show_factor_value,
     show_code_editor,
+    show_classify,
 )
 
 from czsc.utils.bi_info import (
@@ -205,6 +206,7 @@
     judge_factor_direction,
     monotonicity,
     min_max_limit,
+    rolling_layers,
 )
 
 

diff --git a/czsc/eda.py b/czsc/eda.py
@@ -171,3 +171,54 @@ def min_max_limit(x, min_val, max_val, digits=4):
     :return: float
     """
     return round(max(min_val, min(max_val, x)), digits)
+
+
+def rolling_layers(df, factor, n=5, **kwargs):
+    """Label every row with its quantile layer inside a trailing rolling window.
+
+    The labels are written to df in place as the column f"{factor}分层" and the same
+    df is returned. Layers are named "第01层" upward; "第00层" marks rows whose window
+    holds fewer than min_periods observations.
+
+    :param df: factor data; must contain a `dt` column (no duplicates) and `factor`
+    :param factor: name of the factor column; NaN, inf and -inf are rejected
+    :param n: number of layers, default 5
+    :param kwargs:
+
+        - window: rolling window size, default 600
+        - min_periods: minimum number of observations in a window, default 300
+        - mode: str, {'loose', 'strict'}, default 'loose';
+            loose: rolling + rank, fast; documented by the author as carrying a
+            small amount of future information;
+            strict: rolling + qcut on every window; documented as leak-free but slow.
+
+    :return: df, with the f"{factor}分层" column added
+    :raises AssertionError: too few distinct values, NaN, duplicated dt or unknown mode
+    :raises ValueError: the factor column contains inf / -inf
+    """
+    assert df[factor].nunique() > n * 2, "因子值的取值数量必须大于分层数量"
+    assert df[factor].isna().sum() == 0, "因子有缺失值，缺失数量为：{}".format(df[factor].isna().sum())
+    assert df['dt'].duplicated().sum() == 0, f"dt 列不能有重复值，存在重复值数量：{df['dt'].duplicated().sum()}"
+
+    # inf and -inf are not allowed in the factor column
+    if df[factor].isin([float("inf"), float("-inf")]).any():
+        raise ValueError(f"存在 {factor} 为 inf / -inf 的数据")
+
+    mode = kwargs.get("mode", "loose")
+    assert mode in ("loose", "strict")
+    window = kwargs.get("window", 600)
+    min_periods = kwargs.get("min_periods", 300)
+    roller = df[factor].rolling(window=window, min_periods=min_periods)
+
+    # The layer index lives in a local Series. Staging it in scratch columns of df
+    # (pct_rank, pct_rank_cut, {factor}_qcut) would overwrite and then drop any
+    # caller column that happened to carry one of those names.
+    if mode == "loose":
+        pct_rank = roller.rank(pct=True, ascending=True)
+        layer = pd.cut(pct_rank, bins=[i / n for i in range(n + 1)], labels=False)
+    else:
+        layer = roller.apply(lambda x: pd.qcut(x, q=n, labels=False, duplicates="drop").values[-1], raw=False)
+
+    # NaN (window still shorter than min_periods) maps to -1, i.e. layer "00"
+    df[f"{factor}分层"] = layer.fillna(-1).apply(lambda x: f"第{str(int(x + 1)).zfill(2)}层")
+    return df
diff --git a/czsc/utils/st_components.py b/czsc/utils/st_components.py
@@ -1688,3 +1688,58 @@ def __editor():
         with st.expander(expander_title, expanded=True):
             code = __editor()
     return code
+
+
+def show_classify(df, col1, col2, n=10, method="cut", **kwargs):
+    """Show how well bins built on col1 separate the values of col2.
+
+    col1 is binned, col2 is summarised per bin with describe(), and the summary is
+    rendered as a markdown headline, an optional bar chart and a styled table.
+
+    :param df: data, pd.DataFrame containing col1 and col2
+    :param col1: column the bins are built on
+    :param col2: column whose statistics are reported per bin
+    :param n: number of bins
+    :param method: binning method, "cut" (pd.cut) or "qcut" (pd.qcut)
+    :param kwargs:
+
+        - show_bar: bool, whether to draw a bar chart of the bin means, default False
+
+    :raises ValueError: if method is neither "cut" nor "qcut"
+    """
+    layer_col = f"{col1}_分层"
+    data = df[[col1, col2]].copy()
+    if method == "cut":
+        data[layer_col] = pd.cut(data[col1], bins=n, duplicates="drop")
+    elif method == "qcut":
+        data[layer_col] = pd.qcut(data[col1], q=n, duplicates="drop")
+    else:
+        raise ValueError("method must be 'cut' or 'qcut'")
+
+    stats = data.groupby(layer_col, observed=True)[col2].describe().reset_index()
+    means = stats["mean"]
+    st.markdown(
+        f"{col1} 分层对应 {col2} 的均值单调性：:red[{czsc.monotonicity(means):.2%}]； "
+        f"最后一层的均值：:red[{means.iloc[-1]:.4f}]；"
+        f"第一层的均值：:red[{means.iloc[0]:.4f}]"
+    )
+
+    if kwargs.get("show_bar", False):
+        # the chart works on a separate frame so its helper columns never reach the table
+        chart = stats.assign(**{"标记": stats[layer_col].astype(str), "text": means.map("{:.4f}".format)})
+        fig = px.bar(chart, x="标记", y="mean", text="text", color="mean", color_continuous_scale="RdYlGn_r")
+        fig.update_xaxes(title=None)
+        fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
+        st.plotly_chart(fig, use_container_width=True)
+
+    # count is coloured on its own gradient, separately from the value statistics
+    float_cols = ["mean", "std", "min", "25%", "50%", "75%", "max"]
+    # std is shown as a percentage, every other value statistic with four decimals
+    formats = {name: "{:.4f}" for name in float_cols}
+    formats.update({"count": "{:.0f}", "std": "{:.2%}"})
+    styled = (
+        stats.style.background_gradient(cmap="RdYlGn_r", axis=None, subset=["count"])
+        .background_gradient(cmap="RdYlGn_r", axis=None, subset=float_cols)
+        .format(formats)
+    )
+    st.dataframe(styled, use_container_width=True)