Skip to content

Commit

Permalink
Prevent data loss from backfill calls
Browse files: browse the repository at this point in the history
  • Loading branch information
kelvinou01 committed Aug 21, 2023
1 parent 6dc2d5c commit 7c6cdae
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
build:
name: "Build Docker Images"
runs-on: ubuntu-latest
environment: ${GITHUB_REF##*/}
environment: ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

defaults:
run:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/terraform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
terraform:
name: 'Terraform'
runs-on: ubuntu-latest
environment: ${GITHUB_REF##*/}
environment: ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

defaults:
run:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pre-commit:
pre-commit run --all-files

test: pre-commit
cd tests && export REDDIT_CLIENT_ID="" REDDIT_CLIENT_SECRET="" SUBREDDITS="" HUGGINGFACE_TOKEN="" GCS_RAW_BUCKET_NAME="" GCS_TRANSFORMED_BUCKET_NAME="" && python3 -m pytest -v
cd tests && export REDDIT_CLIENT_ID="" REDDIT_CLIENT_SECRET="" SUBREDDITS="" HUGGINGFACE_TOKEN="" GCS_RAW_BUCKET_NAME="" GCS_TRANSFORMED_BUCKET_NAME="" BIGQUERY_DATASET_ID="" BIGQUERY_TABLE_ID="" && python3 -m pytest -v

first-time-setup:
gcloud artifacts repositories create etl-images --location=asia-southeast1 --repository-format=docker
Expand Down
4 changes: 2 additions & 2 deletions src/common/reddit_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ def _remove_submissions_not_on_date(
) -> list[dict]:
"""Removes submissions not made on date"""
return [
submission for submission in submissions
submission
for submission in submissions
if datetime.utcfromtimestamp(submission["created_utc"]).date() == date
]


def fetch_submissions_made_on_date(self, subreddit: str, date: Date) -> list[dict]:
posts_made_on_date = []
last_post_id = None
Expand Down
6 changes: 3 additions & 3 deletions src/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,10 @@ def parse_and_check_date(input_date: str) -> Date:
date = datetime.strptime(input_date, "%d/%m/%Y").date()

today = datetime.now().date()
more_than_a_month_ago = (today - date).days > 30
if more_than_a_month_ago:
more_than_ten_days_ago = (today - date).days > 10
if more_than_ten_days_ago:
raise ValueError(
"Use extract_backfill.py to extract posts made more than a month ago.",
"Cannot extract for dates made more than 10 days ago. Data loss may occur.",
)

return date
Expand Down

0 comments on commit 7c6cdae

Please sign in to comment.