Skip to content

Commit

Permalink
update the preview_csv_data function
Browse files: browse the repository at this point in the history
  • Loading branch information
leeeizhang committed Oct 16, 2024
1 parent 699f858 commit 995f59f
Showing 1 changed file with 24 additions and 10 deletions.
34 changes: 24 additions & 10 deletions mle/function/data.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,38 @@
import textwrap
import pandas as pd
from pandas.api.types import is_numeric_dtype


def preview_csv_data(path: str, limit_rows: int = 5) -> str:
    """
    Summarize a CSV dataset as text: overall shape plus one line per column.
    Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py
    :param path: the path to a local CSV file.
    :param limit_rows: the maximum number of example values listed for
        high-cardinality non-numeric columns.
    :return: the dataset summary as a string; if the file cannot be read or
        summarized, a string starting with "cannot read csv data:".
    """
    try:
        df = pd.read_csv(path)
        num_rows, num_cols = df.shape
        summary = [
            f"-> {path} has {num_rows} rows and {num_cols} columns.",
            "Here is some information about the columns:",
        ]
        # Columns are reported in sorted order so the output is stable
        # regardless of the column order in the file.
        for col in sorted(df.columns):
            series = df[col]
            dtype = series.dtype
            name = f"{col} ({dtype})"
            unique_count = series.nunique()
            if dtype == "bool":
                true_percentage = series.mean() * 100
                summary.append(
                    f"{name} is {true_percentage:.2f}% True, "
                    f"{100 - true_percentage:.2f}% False"
                )
            elif unique_count < 10:
                # Low-cardinality columns (of any dtype) list every value.
                unique_values = series.unique().tolist()
                summary.append(
                    f"{name} has {unique_count} unique values: {unique_values}"
                )
            elif is_numeric_dtype(series):
                min_val, max_val = series.min(), series.max()
                nan_count = series.isnull().sum()
                summary.append(
                    f"{name} has range: {min_val:.2f} - {max_val:.2f}, "
                    f"{nan_count} NaN values"
                )
            else:
                # Catch-all for every remaining dtype (object, str, category,
                # datetime, ...). Testing only for "object" would silently
                # drop such columns from the summary.
                example_values = (
                    series.value_counts().head(limit_rows).index.tolist()
                )
                summary.append(
                    f"{name} has {unique_count} unique values. "
                    f"Some example values: {example_values}"
                )
        return "\n".join(summary).strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers receive a message string
        # instead of an exception.
        return f"cannot read csv data: {e}"

0 comments on commit 995f59f

Please sign in to comment.