From 995f59f26818630f140e02c65689535f5e5005cc Mon Sep 17 00:00:00 2001
From: "leizhang.real@gmail.com" <leizhang.real@gmail.com>
Date: Wed, 16 Oct 2024 13:03:07 +0000
Subject: [PATCH 1/2] preview_csv_data: summarize each column instead of dumping sample rows

---
 mle/function/data.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/mle/function/data.py b/mle/function/data.py
index b99e857..f7f8b85 100644
--- a/mle/function/data.py
+++ b/mle/function/data.py
@@ -1,24 +1,38 @@
 import textwrap
 import pandas as pd
+from pandas.api.types import is_numeric_dtype
 
 
-def preview_csv_data(path: str, limit_rows: int = 1) -> str:
+def preview_csv_data(path: str, limit_rows: int = 5) -> str:
     """
     Preview the sample dataset from the project data path and include metadata.
+    Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py
     :param path: the path to a local CSV file.
     :param limit_rows: the number of rows to preview.
     :return: the sample dataset with metadata as a string.
     """
     try:
         df = pd.read_csv(path)
-        num_rows = len(df)
-        columns = ', '.join(df.columns)
-        df_limited = df.head(limit_rows)
-        data_dict_list = df_limited.to_dict(orient='records')
-        data_dict_str = "\n".join([str(record) for record in data_dict_list])
-
-        return textwrap.dedent(f"""
-        Data file: {path}\nNumber of all rows: {num_rows}\nAll columns: {columns}\nData example:\n{data_dict_str}
-        """).strip()
+        num_rows, num_cols = df.shape
+        summary = [f"-> {path} has {num_rows} rows and {num_cols} columns."]
+        summary.append("Here is some information about the columns:")
+        for col in sorted(df.columns):
+            dtype = df[col].dtype
+            name = f"{col} ({dtype})"
+            nan_count = df[col].isnull().sum()
+            unique_count = df[col].nunique()  # computed once per column; NaN is not counted
+            if dtype == "bool":
+                true_percentage = df[col].mean() * 100
+                summary.append(f"{name} is {true_percentage:.2f}% True, {100 - true_percentage:.2f}% False")
+            elif unique_count < 10:
+                unique_values = df[col].unique().tolist()
+                summary.append(f"{name} has {unique_count} unique values: {unique_values}")
+            elif is_numeric_dtype(df[col]):
+                min_val, max_val = df[col].min(), df[col].max()
+                summary.append(f"{name} has range: {min_val:.2f} - {max_val:.2f}, {nan_count} NaN values")
+            else:  # catch-all (object, string, category, ...) so no column is silently skipped
+                example_values = df[col].value_counts().head(limit_rows).index.tolist()
+                summary.append(f"{name} has {unique_count} unique values. Some example values: {example_values}")
+        return textwrap.dedent("\n".join(summary)).strip()
     except Exception as e:
         return f"cannot read csv data: {e}"

From 0a8b80c421ffed8ad0bcf32a11c9a761f31c5a28 Mon Sep 17 00:00:00 2001
From: "leizhang.real@gmail.com" <leizhang.real@gmail.com>
Date: Wed, 16 Oct 2024 13:05:13 +0000
Subject: [PATCH 2/2] preview_csv_data: reword the summary header line

---
 mle/function/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mle/function/data.py b/mle/function/data.py
index f7f8b85..a406ff0 100644
--- a/mle/function/data.py
+++ b/mle/function/data.py
@@ -14,7 +14,7 @@ def preview_csv_data(path: str, limit_rows: int = 5) -> str:
     try:
         df = pd.read_csv(path)
         num_rows, num_cols = df.shape
-        summary = [f"-> {path} has {num_rows} rows and {num_cols} columns."]
+        summary = [f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns."]
         summary.append("Here is some information about the columns:")
         for col in sorted(df.columns):
             dtype = df[col].dtype