import textwrap

import pandas as pd
from pandas.api.types import is_numeric_dtype


def _describe_column(series: pd.Series, label: str, limit_rows: int) -> str:
    """Return a one-line, human-readable summary of a single column.

    :param series: the column data.
    :param label: the display label, formatted as ``"<name> (<dtype>)"``.
    :param limit_rows: the maximum number of example values to list for
        high-cardinality, non-numeric columns.
    :return: the summary line for this column.
    """
    nan_count = int(series.isnull().sum())

    if series.dtype == "bool":
        true_percentage = series.mean() * 100
        return (
            f"{label} is {true_percentage:.2f}% True, "
            f"{100 - true_percentage:.2f}% False"
        )

    # nunique() ignores NaN, so compute it once and reuse it below.
    unique_count = series.nunique()
    if unique_count < 10:
        # Drop NaN before listing so the list agrees with the reported count;
        # missing values are reported separately instead of as a "value".
        unique_values = series.dropna().unique().tolist()
        line = f"{label} has {unique_count} unique values: {unique_values}"
        if nan_count:
            line += f", {nan_count} NaN values"
        return line

    if is_numeric_dtype(series):
        min_val, max_val = series.min(), series.max()
        return (
            f"{label} has range: {min_val:.2f} - {max_val:.2f}, "
            f"{nan_count} NaN values"
        )

    # Catch-all for every remaining dtype (object, dedicated string dtypes,
    # ...) so that no column is silently left out of the summary.
    example_values = series.value_counts().head(limit_rows).index.tolist()
    return (
        f"{label} has {unique_count} unique values. "
        f"Some example values: {example_values}"
    )


def preview_csv_data(path: str, limit_rows: int = 5) -> str:
    """
    Summarize a CSV dataset: its shape plus one descriptive line per column.
    Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py

    Columns are listed in sorted name order. Boolean columns report the
    True/False split, columns with fewer than 10 distinct values list those
    values, other numeric columns report their range and NaN count, and all
    remaining columns list their most frequent values.

    :param path: the path to a local CSV file.
    :param limit_rows: the maximum number of example values shown for
        high-cardinality, non-numeric columns.
    :return: the dataset summary as a string, or a message starting with
        "cannot read csv data:" if the file cannot be read or summarized.
    """
    try:
        df = pd.read_csv(path)
        num_rows, num_cols = df.shape
        summary = [
            f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns.",
            "Here is some information about the columns:",
        ]
        for col in sorted(df.columns):
            series = df[col]
            summary.append(
                _describe_column(series, f"{col} ({series.dtype})", limit_rows)
            )
        return "\n".join(summary).strip()
    except Exception as e:
        # Deliberately best-effort: callers expect a string, never an exception.
        return f"cannot read csv data: {e}"