-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize harvesting flow and fix the item title miss-match issue
- Loading branch information
Showing
3 changed files
with
493 additions
and
416 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
|
||
import boto3 | ||
import logging | ||
from botocore.exceptions import ClientError | ||
import pandas as pd | ||
import io | ||
import os | ||
|
||
file_name = "records.parquet" | ||
bucket_name = "webpresence-geocore-geojson-to-parquet-dev" | ||
|
||
|
||
# Change directory | ||
# Get the current working directory | ||
current_directory = os.getcwd() | ||
print(f"Current Directory: {current_directory}") | ||
|
||
# Change the working directory | ||
new_directory = " " # Replace with the directory you want to switch to | ||
os.chdir(new_directory) | ||
current_directory = os.getcwd() | ||
print(f"New Directory: {current_directory}") | ||
|
||
|
||
# Function to open a S3 file from bucket and filename and return the parquet as pandas dataframe | ||
def open_S3_file_as_df(bucket_name, file_name): | ||
"""Open a S3 parquet file from bucket and filename and return the parquet as pandas dataframe | ||
:param bucket_name: Bucket name | ||
:param file_name: Specific file name to open | ||
:return: body of the file as a string | ||
""" | ||
try: | ||
s3 = boto3.resource('s3') | ||
object = s3.Object(bucket_name, file_name) | ||
body = object.get()['Body'].read() | ||
df = pd.read_parquet(io.BytesIO(body)) | ||
print(f'Loading {file_name} from {bucket_name} to pandas dataframe') | ||
return df | ||
except ClientError as e: | ||
logging.error(e) | ||
return e | ||
|
||
df = open_S3_file_as_df(bucket_name, file_name) | ||
print(f'The shape of the raw metadata parquet dataset is {df.shape}') | ||
|
||
""" | ||
# Add a new column to log the process, and loop through the pandas rows to assign values | ||
df['process_log'] = '' | ||
## Loop through the DataFrame and update the new column based on processing condition 'Fail' or 'Success' | ||
for index, row in df.iterrows(): | ||
if Transfromed == True: | ||
df.at[index, 'process_log'] = 'Success' # or 1 | ||
else: | ||
df.at[index, 'process_log'] = 'Fail' # or 0 | ||
""" | ||
# Save all the records as a CSV to local path | ||
save_path = os.path.join(os.getcwd(), 'records.csv') | ||
df.to_csv(save_path) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.