Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] update the preview_csv_data function #249

Merged
merged 2 commits into from
Oct 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions mle/function/data.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,38 @@
import textwrap
import pandas as pd
from pandas.api.types import is_numeric_dtype


def preview_csv_data(path: str, limit_rows: int = 5) -> str:
    """
    Summarize a local CSV file: its shape plus one descriptive line per column.
    Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py

    Columns are reported in sorted name order. Each column is described by the
    first rule that matches:
      1. bool dtype            -> percentage of True / False values
      2. fewer than 10 uniques -> the full list of unique values
      3. numeric dtype         -> min / max range and the NaN count
      4. anything else         -> unique count and the most frequent values

    :param path: the path to a local CSV file.
    :param limit_rows: the maximum number of example values listed for a
        column that falls under rule 4.
    :return: the summary as a string; if the file cannot be read or
        summarized, a string of the form "cannot read csv data: <error>".
    """
    try:
        df = pd.read_csv(path)
        num_rows, num_cols = df.shape
        summary = [
            f"CSV file in `{path}` has {num_rows} rows and {num_cols} columns.",
            "Here is some information about the columns:",
        ]
        for col in sorted(df.columns):
            series = df[col]
            dtype = series.dtype
            name = f"{col} ({dtype})"
            # nunique() ignores NaN; computed once and reused by the branches.
            unique_count = series.nunique()
            if dtype == "bool":
                true_percentage = series.mean() * 100
                summary.append(
                    f"{name} is {true_percentage:.2f}% True, "
                    f"{100 - true_percentage:.2f}% False"
                )
            elif unique_count < 10:
                unique_values = series.unique().tolist()
                summary.append(
                    f"{name} has {unique_count} unique values: {unique_values}"
                )
            elif is_numeric_dtype(series):
                min_val, max_val = series.min(), series.max()
                nan_count = series.isnull().sum()
                summary.append(
                    f"{name} has range: {min_val:.2f} - {max_val:.2f}, "
                    f"{nan_count} NaN values"
                )
            else:
                # Catch-all instead of `dtype == "object"` only, so that a
                # column with any other non-numeric dtype is still described
                # rather than silently left out of the summary.
                example_values = (
                    series.value_counts().head(limit_rows).index.tolist()
                )
                summary.append(
                    f"{name} has {unique_count} unique values. "
                    f"Some example values: {example_values}"
                )
        return "\n".join(summary).strip()
    except Exception as e:
        # Deliberate best-effort: callers receive a readable message instead
        # of an exception when the CSV is missing or malformed.
        return f"cannot read csv data: {e}"
Loading