waditu · zengbin93 · Sep 19, 2024 · Sep 1, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -5,7 +5,7 @@ name: Python package
 
 on:
   push:
-    branches: [ master, V0.9.58 ]
+    branches: [ master, V0.9.59 ]
   pull_request:
     branches: [ master ]
 

diff --git a/czsc/__init__.py b/czsc/__init__.py
@@ -47,6 +47,7 @@
 from czsc.utils import (
     mac_address,
     overlap,
+    to_arrow,
 
     format_standard_kline,
 
@@ -65,6 +66,7 @@
     save_json,
     get_sub_elements,
     get_py_namespace,
+    code_namespace,
     freqs_sorted,
     x_round,
     import_by_name,
@@ -155,6 +157,7 @@
     show_strategies_recent,
     show_factor_value,
     show_code_editor,
+    show_classify,
 )
 
 from czsc.utils.bi_info import (
@@ -204,13 +207,15 @@
     cross_sectional_strategy,
     judge_factor_direction,
     monotonicity,
+    min_max_limit,
+    rolling_layers,
 )
 
 
-__version__ = "0.9.58"
+__version__ = "0.9.59"
 __author__ = "zengbin93"
 __email__ = "[email protected]"
-__date__ = "20240808"
+__date__ = "20240901"
 
 
 def welcome():

diff --git a/czsc/eda.py b/czsc/eda.py
@@ -159,3 +159,66 @@ def monotonicity(sequence):
     """
     from scipy.stats import spearmanr
     return spearmanr(sequence, range(len(sequence)))[0]
+
+
+def min_max_limit(x, min_val, max_val, digits=4):
+    """Clamp x into the range [min_val, max_val] and round the result.
+
+    :param x: float, input value; min_val / max_val: float, lower / upper bound
+    :param digits: int, number of decimal places passed to round
+    :return: float
+    """
+    capped = min(max_val, x)
+    clamped = max(min_val, capped)
+    return round(clamped, digits)
+
+
+def rolling_layers(df, factor, n=5, **kwargs):
+    """Assign each row of a time series to a layer by rolling rank / quantile of a factor.
+
+    NOTE: ``df`` is modified in place (the layer column is added) and also returned.
+
+    :param df: factor data; must contain the columns ``dt`` (unique timestamps) and ``factor``
+    :param factor: name of the factor column
+    :param n: number of layers, default 5
+    :param kwargs:
+
+        - window: rolling window size, default 600
+        - min_periods: minimum number of observations in the window, default 300
+        - mode: str, {'loose', 'strict'}, default 'loose';
+            loose: rolling rank(pct=True) + pd.cut; fast, but noted by the author as leaking a little future information;
+            strict: rolling apply of pd.qcut on every window; no future information, but slower.
+
+    :return: df with an added "{factor}分层" column; "第00层" marks rows without enough history
+    :raises AssertionError: too few unique values, missing values, duplicated dt, unknown mode
+    :raises ValueError: the factor column contains inf / -inf
+    """
+    mode = kwargs.get("mode", "loose")
+    window = kwargs.get("window", 600)
+    min_periods = kwargs.get("min_periods", 300)
+    layer_col = f"{factor}分层"
+
+    assert mode in ("loose", "strict"), f"mode 必须是 loose 或 strict，当前值：{mode}"
+    assert df[factor].nunique() > n * 2, "因子值的取值数量必须大于分层数量的2倍"
+    assert df[factor].isna().sum() == 0, "因子有缺失值，缺失数量为：{}".format(df[factor].isna().sum())
+    assert df['dt'].duplicated().sum() == 0, f"dt 列不能有重复值，存在重复值数量：{df['dt'].duplicated().sum()}"
+
+    # inf / -inf would distort the rank and quantile computations, so reject them up front
+    if df[factor].isin([float("inf"), float("-inf")]).any():
+        raise ValueError(f"存在 {factor} 为 inf / -inf 的数据")
+
+    rolled = df[factor].rolling(window=window, min_periods=min_periods)
+    if mode == "loose":
+        # percentile rank of the current value inside its window, cut into n equal-width bins
+        bins = [i / n for i in range(n + 1)]
+        layer_idx = pd.cut(rolled.rank(pct=True, ascending=True), bins=bins, labels=False)
+    else:
+        # re-run qcut on every window and keep only the bin index of the window's last value
+        layer_idx = rolled.apply(
+            lambda x: pd.qcut(x, q=n, labels=False, duplicates="drop", retbins=False).values[-1], raw=False
+        )
+
+    # rows without enough history get index -1, which is rendered as layer "第00层"
+    layer_idx = layer_idx.fillna(-1).astype(int) + 1
+    df[layer_col] = layer_idx.map(lambda x: f"第{str(x).zfill(2)}层")
+    return df
diff --git a/czsc/traders/rwc.py b/czsc/traders/rwc.py
@@ -176,6 +176,7 @@ def publish_dataframe(self, df, overwrite=False, batch_size=10000):
         :param df: pandas.DataFrame, 必需包含['symbol', 'dt', 'weight']列,
                 可选['price', 'ref']列, 如没有price则写0, dtype同publish方法
         :param overwrite: boolean, 是否覆盖已有记录
+        :param batch_size: int, 每次发布的最大数量
         :return: 成功发布信号的条数
         """
         df = df.copy()
@@ -392,7 +393,7 @@ def get_hist_weights(self, symbol, sdt, edt) -> pd.DataFrame:
             price = price if price is None else float(price)
             try:
                 ref = json.loads(ref)
-            except Exception:
+            except Exception as e:
                 ref = ref
             weights.append((self.strategy_name, symbol, dt, weight, price, ref))
 

diff --git a/czsc/traders/weight_backtest.py b/czsc/traders/weight_backtest.py
@@ -277,6 +277,7 @@ def __init__(self, dfw, digits=2, **kwargs) -> None:
         """
         self.kwargs = kwargs
         self.dfw = dfw.copy()
+        self.dfw["dt"] = pd.to_datetime(self.dfw["dt"])
         if self.dfw.isnull().sum().sum() > 0:
             raise ValueError("dfw 中存在空值, 请先处理")
         self.digits = digits
@@ -553,9 +554,10 @@ def backtest(self, n_jobs=1):
         dret = pd.concat([v["daily"] for k, v in res.items() if k in symbols], ignore_index=True)
         dret = pd.pivot_table(dret, index="date", columns="symbol", values="return").fillna(0)
         dret["total"] = dret[list(res.keys())].mean(axis=1)
+        dret = dret.round(4).reset_index()
         res["品种等权日收益"] = dret
 
-        stats = {"开始日期": dret.index.min().strftime("%Y%m%d"), "结束日期": dret.index.max().strftime("%Y%m%d")}
+        stats = {"开始日期": dret["date"].min().strftime("%Y%m%d"), "结束日期": dret["date"].max().strftime("%Y%m%d")}
         stats.update(daily_performance(dret["total"]))
         dfp = pd.concat([v["pairs"] for k, v in res.items() if k in symbols], ignore_index=True)
         pairs_stats = evaluate_pairs(dfp)

diff --git a/czsc/utils/__init__.py b/czsc/utils/__init__.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 import os
+import pandas as pd
 from typing import List, Union
 
 from . import qywx
@@ -95,6 +96,20 @@ def get_py_namespace(file_py: str, keys: list = []) -> dict:
     return namespace
 
 
+def code_namespace(code: str, keys: list = []) -> dict:
+    """Execute python source code and return the namespace it populated.
+
+    :param code: python source text; it is run with exec, so it must come from a trusted source
+    :param keys: names to keep in the result; when empty the whole namespace is returned
+    :return: dict of names to objects; the namespace is pre-seeded with the source under "code"
+    """
+    scope = {"code": code}
+    exec(code, scope)
+    if not keys:
+        return scope
+    return {name: obj for name, obj in scope.items() if name in keys}
+
+
 def import_by_name(name):
     """通过字符串导入模块、类、函数
 
@@ -199,3 +214,15 @@ def mac_address():
     x = uuid.UUID(int=uuid.getnode()).hex[-12:].upper()
     x = "-".join([x[i : i + 2] for i in range(0, 11, 2)])
     return x
+
+
+def to_arrow(df: pd.DataFrame):
+    """Serialize a pandas.DataFrame to bytes written by a pyarrow IPC file writer (via a pyarrow.Table); returns bytes."""
+    import io
+    import pyarrow as pa
+
+    table = pa.Table.from_pandas(df)
+    with io.BytesIO() as sink:
+        with pa.ipc.new_file(sink, table.schema) as writer:
+            writer.write_table(table)
+        return sink.getvalue()  # read after the writer is closed, while sink is still open
diff --git a/czsc/utils/bar_generator.py b/czsc/utils/bar_generator.py
@@ -31,6 +31,7 @@ def is_trading_time(dt: datetime = datetime.now(), market="A股"):
 def get_intraday_times(freq="1分钟", market="A股"):
     """获取指定市场的交易时间段
 
+    :param freq: K线周期，如 1分钟、5分钟、15分钟、30分钟、60分钟
     :param market: 市场名称，可选值：A股、期货、默认
     :return: 交易时间段列表
     """