Skip to content

Commit

Permalink
try fix
Browse files Browse the repository at this point in the history
  • Loading branch information
gitfrosh committed Apr 27, 2024
1 parent 1345913 commit 07f03a7
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion db/convert-csv-to-json.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import os
import pandas as pd
import json
import regex as re

def transform_objectid(text):
    """Rewrite MongoDB ``ObjectId(...)`` references as ``{"$oid": "..."}``.

    Every ``ObjectId(<value>)`` occurrence in *text* is replaced, in a single
    pass, by ``{"$oid": "<value>"}``. Whitespace and quote characters that
    surround the value inside the parentheses are dropped, so the unquoted
    form ``ObjectId(abc)`` and the quoted forms ``ObjectId("abc")`` /
    ``ObjectId('abc')`` all yield the same, valid JSON fragment.

    Args:
        text: The string to scan for ``ObjectId(...)`` references.

    Returns:
        A copy of *text* with each reference replaced; *text* unchanged when
        it contains none.
    """
    def _to_oid(match):
        # Strip padding and any quotes around the id so the emitted fragment
        # never contains doubled quotes such as {"$oid": ""abc""}.
        value = match.group(1).strip().strip('"\'')
        return f'{{"$oid": "{value}"}}'

    # A callable replacement is inserted verbatim, so characters in the id
    # are never interpreted as escapes or group references.
    return re.sub(r'ObjectId\(([^)]+)\)', _to_oid, text)

def main():
os.makedirs('db/json', exist_ok=True) # Ensure the directory for JSON files exists
Expand All @@ -16,6 +21,9 @@ def main():
# Transform all string columns that may contain ObjectId references
for column in df.select_dtypes(include=['object']):
df[column] = df[column].apply(lambda x: transform_objectid(str(x)) if pd.notna(x) else x)
# Convert transformed string JSON to actual JSON objects
for column in df.select_dtypes(include=['object']):
df[column] = df[column].apply(lambda x: json.loads(x) if pd.notna(x) and x.startswith('{') else x)
# Save each dataframe as a JSON file with all objects in a single array
json_path = f'db/json/{file.replace(".csv", ".json")}'
df.to_json(json_path, orient='records', indent=4)
Expand Down

0 comments on commit 07f03a7

Please sign in to comment.