Skip to content

Commit

Permalink
0.9.59 新增 rolling layers
Browse files Browse the repository at this point in the history
  • Loading branch information
zengbin93 committed Sep 13, 2024
1 parent d20d48a commit 1c35040
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 0 deletions.
2 changes: 2 additions & 0 deletions czsc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@
show_strategies_recent,
show_factor_value,
show_code_editor,
show_classify,
)

from czsc.utils.bi_info import (
Expand Down Expand Up @@ -205,6 +206,7 @@
judge_factor_direction,
monotonicity,
min_max_limit,
rolling_layers,
)


Expand Down
51 changes: 51 additions & 0 deletions czsc/eda.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,54 @@ def min_max_limit(x, min_val, max_val, digits=4):
:return: float
"""
return round(max(min_val, min(max_val, x)), digits)


def rolling_layers(df, factor, n=5, **kwargs):
    """Assign every row of a time series to a rolling layer (bucket) of a factor.

    The result is written to a new column named ``f"{factor}分层"`` holding
    labels ``"第01层"`` .. ``"第0n层"``; rows for which no layer could be computed
    (the rolling result is NaN, e.g. fewer than ``min_periods`` observations)
    are labelled ``"第00层"``.

    ``df`` is modified in place (the layer column is added) and also returned.
    No other column of ``df`` is created, overwritten or dropped.

    :param df: factor data; must contain a ``dt`` column and the ``factor`` column
    :param factor: name of the factor column
    :param n: number of layers, default 5
    :param kwargs:
        - window: rolling window size, default 600
        - min_periods: minimum number of observations in a window, default 300
        - mode: str, {'loose', 'strict'}, layering mode, default 'loose';
            loose uses rolling + rank; the original author notes this mode
            may carry a small amount of look-ahead information;
            strict uses rolling + qcut; described by the original author as
            free of look-ahead information but slower.
    :return: df, with the ``f"{factor}分层"`` column added
    :raises AssertionError: if the factor has too few distinct values, contains
        missing values, ``dt`` has duplicates, or ``mode`` is not supported
    :raises ValueError: if the factor contains inf / -inf
    """
    assert df[factor].nunique() > n * 2, "因子值的取值数量必须大于分层数量"
    assert df[factor].isna().sum() == 0, "因子有缺失值,缺失数量为:{}".format(df[factor].isna().sum())
    assert df['dt'].duplicated().sum() == 0, f"dt 列不能有重复值,存在重复值数量:{df['dt'].duplicated().sum()}"

    window = kwargs.get("window", 600)
    min_periods = kwargs.get("min_periods", 300)
    mode = kwargs.get("mode", "loose")

    # inf / -inf are not allowed: they would distort ranks and quantile edges
    if df[factor].isin([float("inf"), float("-inf")]).any():
        raise ValueError(f"存在 {factor} 为 inf / -inf 的数据")

    # Intermediate results are kept in local Series rather than in scratch
    # columns of ``df``, so caller columns that happen to be named
    # ``pct_rank``, ``pct_rank_cut`` or ``f"{factor}_qcut"`` are never clobbered.
    if mode == "loose":
        pct_rank = df[factor].rolling(window=window, min_periods=min_periods).rank(pct=True, ascending=True)
        bins = [i / n for i in range(n + 1)]
        layer_index = pd.cut(pct_rank, bins=bins, labels=False)
    else:
        assert mode == "strict"
        # Bucket each window with qcut and keep the bucket of its last (current) row.
        # NOTE(review): with duplicates="drop" a window with repeated quantile
        # edges presumably yields fewer than n buckets - confirm this is intended.
        layer_index = df[factor].rolling(window=window, min_periods=min_periods).apply(
            lambda x: pd.qcut(x, q=n, labels=False, duplicates="drop", retbins=False).values[-1],
            raw=False,
        )

    # -1 marks "no layer"; after the +1 shift below it becomes layer 00
    layer_index = layer_index.fillna(-1)
    df[f"{factor}分层"] = layer_index.apply(lambda x: f"第{str(int(x + 1)).zfill(2)}层")
    return df
55 changes: 55 additions & 0 deletions czsc/utils/st_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,3 +1688,58 @@ def __editor():
with st.expander(expander_title, expanded=True):
code = __editor()
return code


def show_classify(df, col1, col2, n=10, method="cut", **kwargs):
    """Show how well layers of ``col1`` separate the values of ``col2``.

    ``col1`` is bucketed into layers, ``col2`` is summarised per layer with
    ``describe()``, and the result is rendered through Streamlit: a markdown
    summary line, an optional bar chart of the per-layer mean, and a styled table.

    :param df: data, pd.DataFrame
    :param col1: column used to build the layers
    :param col2: column whose statistics are reported per layer
    :param n: number of layers
    :param method: layering method, 'cut' or 'qcut'
    :param kwargs:
        - show_bar: bool, whether to draw the bar chart, default False
    :raises ValueError: if ``method`` is neither 'cut' nor 'qcut'
    """
    layer_col = f"{col1}_分层"
    data = df[[col1, col2]].copy()

    if method not in ("cut", "qcut"):
        raise ValueError("method must be 'cut' or 'qcut'")
    if method == "cut":
        data[layer_col] = pd.cut(data[col1], bins=n, duplicates="drop")
    else:
        data[layer_col] = pd.qcut(data[col1], q=n, duplicates="drop")

    # One row per layer with count / mean / std / min / quartiles / max of col2
    stats = data.groupby(layer_col, observed=True)[col2].describe().reset_index()
    layer_means = stats["mean"]

    # Summary line: monotonicity of the layer means plus the two extreme layers
    pieces = [
        f"{col1} 分层对应 {col2} 的均值单调性::red[{czsc.monotonicity(layer_means):.2%}]; ",
        f"最后一层的均值::red[{layer_means.iloc[-1]:.4f}];",
        f"第一层的均值::red[{layer_means.iloc[0]:.4f}]",
    ]
    st.markdown("".join(pieces))

    if kwargs.get("show_bar", False):
        # Work on a copy so the helper columns do not leak into the table below
        bar_data = stats.copy()
        bar_data["标记"] = bar_data[layer_col].astype(str)
        bar_data["text"] = bar_data["mean"].map("{:.4f}".format)
        fig = px.bar(
            bar_data,
            x="标记",
            y="mean",
            text="text",
            color="mean",
            color_continuous_scale="RdYlGn_r",
        )
        fig.update_xaxes(title=None)
        fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
        st.plotly_chart(fig, use_container_width=True)

    value_cols = ["mean", "std", "min", "25%", "50%", "75%", "max"]
    formats = {"count": "{:.0f}"}
    formats.update({name: "{:.4f}" for name in value_cols})
    formats["std"] = "{:.2%}"

    # count is shaded on its own scale; the value columns share a second scale
    styled = (
        stats.style.background_gradient(cmap="RdYlGn_r", axis=None, subset=["count"])
        .background_gradient(cmap="RdYlGn_r", axis=None, subset=value_cols)
        .format(formats)
    )
    st.dataframe(styled, use_container_width=True)

0 comments on commit 1c35040

Please sign in to comment.