Split python script (#57)

* split-python-script
* split-python-script
kevmo314 · Jan 22, 2024 · ebc3633
1 parent 800b5b3
commit ebc3633
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 5 deletions.
diff --git a/.github/workflows/example-client.yml b/.github/workflows/example-client.yml
@@ -29,7 +29,7 @@ jobs:
           # Fetch the data in workspace
           cd examples/workspace
           python3 -m pip install -r requirements.txt
-          python3 fetch_data.py
+          python3 fetch_jsonl.py
           cd -
 
           # Build the index

diff --git a/examples/README.md b/examples/README.md
@@ -18,7 +18,8 @@ cd workspace
 # green tripdata
 python3 -m pip install -r requirements.txt
 
-python3 fetch_data.py
+# fetch data with .jsonl format
+python3 fetch_jsonl.py
 ```
 
 Then run the indexing process:

diff --git a/examples/workspace/fetch_csv.py b/examples/workspace/fetch_csv.py
@@ -0,0 +1,11 @@
+# Data taken from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
+
+import io
+
+import pandas as pd
+import requests
+
+response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet')
+
+df = pd.read_parquet(io.BytesIO(response.content))
+df.to_csv('green_tripdata_2023-01.csv', index=False)
diff --git a/examples/workspace/fetch_data.py b/examples/workspace/fetch_jsonl.py
@@ -8,6 +8,3 @@
 response = requests.get('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet')
 
 pd.read_parquet(io.BytesIO(response.content)).to_json('green_tripdata_2023-01.jsonl', orient='records', lines=True)
-
-df = pd.read_parquet(io.BytesIO(response.content))
-df.to_csv('green_tripdata_2023-01.csv', index=False)