-
Notifications
You must be signed in to change notification settings - Fork 44
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update the preview_csv_data function
- Loading branch information
1 parent 699f858, commit 995f59f
Showing 1 changed file with 24 additions and 10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,38 @@ | ||
import textwrap | ||
import pandas as pd | ||
from pandas.api.types import is_numeric_dtype | ||
|
||
|
||
def preview_csv_data(path: str, limit_rows: int = 1) -> str: | ||
def preview_csv_data(path: str, limit_rows: int = 5) -> str: | ||
""" | ||
Preview the sample dataset from the project data path and include metadata. | ||
Refer to: https://github.com/WecoAI/aideml/blob/main/aide/utils/data_preview.py | ||
:param path: the path to a local CSV file. | ||
:param limit_rows: the number of rows to preview. | ||
:return: the sample dataset with metadata as a string. | ||
""" | ||
try: | ||
df = pd.read_csv(path) | ||
num_rows = len(df) | ||
columns = ', '.join(df.columns) | ||
df_limited = df.head(limit_rows) | ||
data_dict_list = df_limited.to_dict(orient='records') | ||
data_dict_str = "\n".join([str(record) for record in data_dict_list]) | ||
|
||
return textwrap.dedent(f""" | ||
Data file: {path}\nNumber of all rows: {num_rows}\nAll columns: {columns}\nData example:\n{data_dict_str} | ||
""").strip() | ||
num_rows, num_cols = df.shape | ||
summary = [f"-> {path} has {num_rows} rows and {num_cols} columns."] | ||
summary.append("Here is some information about the columns:") | ||
for col in sorted(df.columns): | ||
dtype = df[col].dtype | ||
name = f"{col} ({dtype})" | ||
nan_count = df[col].isnull().sum() | ||
if dtype == "bool": | ||
true_percentage = df[col].mean() * 100 | ||
summary.append(f"{name} is {true_percentage:.2f}% True, {100 - true_percentage:.2f}% False") | ||
elif df[col].nunique() < 10: | ||
unique_values = df[col].unique().tolist() | ||
summary.append(f"{name} has {df[col].nunique()} unique values: {unique_values}") | ||
elif is_numeric_dtype(df[col]): | ||
min_val, max_val = df[col].min(), df[col].max() | ||
summary.append(f"{name} has range: {min_val:.2f} - {max_val:.2f}, {nan_count} NaN values") | ||
elif dtype == "object": | ||
unique_count = df[col].nunique() | ||
example_values = df[col].value_counts().head(limit_rows).index.tolist() | ||
summary.append(f"{name} has {unique_count} unique values. Some example values: {example_values}") | ||
return textwrap.dedent("\n".join(summary)).strip() | ||
except Exception as e: | ||
return f"cannot read csv data: {e}" |