From 995f59f26818630f140e02c65689535f5e5005cc Mon Sep 17 00:00:00 2001 From: "leizhang.real@gmail.com" Date: Wed, 16 Oct 2024 13:03:07 +0000 Subject: [PATCH 1/2] update the preview_csv_data function --- mle/function/data.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/mle/function/data.py b/mle/function/data.py index b99e857..f7f8b85 100644 --- a/mle/function/data.py +++ b/mle/function/data.py @@ -1,24 +1,38 @@ import textwrap import pandas as pd +from pandas.api.types import is_numeric_dtype -def preview_csv_data(path: str, limit_rows: int = 1) -> str: +def preview_csv_data(path: str, limit_rows: int = 5) -> str: """ Preview the sample dataset from the project data path and include metadata. + Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py :param path: the path to a local CSV file. :param limit_rows: the number of rows to preview. :return: the sample dataset with metadata as a string. """ try: df = pd.read_csv(path) - num_rows = len(df) - columns = ', '.join(df.columns) - df_limited = df.head(limit_rows) - data_dict_list = df_limited.to_dict(orient='records') - data_dict_str = "\n".join([str(record) for record in data_dict_list]) - - return textwrap.dedent(f""" - Data file: {path}\nNumber of all rows: {num_rows}\nAll columns: {columns}\nData example:\n{data_dict_str} - """).strip() + num_rows, num_cols = df.shape + summary = [f"-> {path} has {num_rows} rows and {num_cols} columns."] + summary.append("Here is some information about the columns:") + for col in sorted(df.columns): + dtype = df[col].dtype + name = f"{col} ({dtype})" + nan_count = df[col].isnull().sum() + if dtype == "bool": + true_percentage = df[col].mean() * 100 + summary.append(f"{name} is {true_percentage:.2f}% True, {100 - true_percentage:.2f}% False") + elif df[col].nunique() < 10: + unique_values = df[col].unique().tolist() + summary.append(f"{name} has {df[col].nunique()} unique values: {unique_values}") + elif is_numeric_dtype(df[col]): + min_val, max_val = df[col].min(), df[col].max() + summary.append(f"{name} has range: {min_val:.2f} - {max_val:.2f}, {nan_count} NaN values") + elif dtype == "object": + unique_count = df[col].nunique() + example_values = df[col].value_counts().head(limit_rows).index.tolist() + summary.append(f"{name} has {unique_count} unique values. Some example values: {example_values}") + return textwrap.dedent("\n".join(summary)).strip() except Exception as e: return f"cannot read csv data: {e}" From 0a8b80c421ffed8ad0bcf32a11c9a761f31c5a28 Mon Sep 17 00:00:00 2001 From: "leizhang.real@gmail.com" Date: Wed, 16 Oct 2024 13:05:13 +0000 Subject: [PATCH 2/2] update summary --- mle/function/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mle/function/data.py b/mle/function/data.py index f7f8b85..a406ff0 100644 --- a/mle/function/data.py +++ b/mle/function/data.py @@ -14,7 +14,7 @@ def preview_csv_data(path: str, limit_rows: int = 5) -> str: try: df = pd.read_csv(path) num_rows, num_cols = df.shape - summary = [f"-> {path} has {num_rows} rows and {num_cols} columns."] + summary = [f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns."] summary.append("Here is some information about the columns:") for col in sorted(df.columns): dtype = df[col].dtype