From 7c6cdae52b006593524109647866b0efd409c9cc Mon Sep 17 00:00:00 2001 From: Kelvin Ou Date: Thu, 17 Aug 2023 03:14:21 +0800 Subject: [PATCH] Prevent data loss from backfill calls --- .github/workflows/build_images.yml | 2 +- .github/workflows/terraform.yml | 2 +- Makefile | 2 +- src/common/reddit_client.py | 4 ++-- src/extract.py | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_images.yml b/.github/workflows/build_images.yml index c92673c..0dab789 100644 --- a/.github/workflows/build_images.yml +++ b/.github/workflows/build_images.yml @@ -11,7 +11,7 @@ jobs: build: name: "Build Docker Images" runs-on: ubuntu-latest - environment: ${GITHUB_REF##*/} + environment: ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}} defaults: run: diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 51d5a0e..c8d0a26 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -16,7 +16,7 @@ jobs: terraform: name: 'Terraform' runs-on: ubuntu-latest - environment: ${GITHUB_REF##*/} + environment: ${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}} defaults: run: diff --git a/Makefile b/Makefile index 8452a0f..d45859d 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ pre-commit: pre-commit run --all-files test: pre-commit - cd tests && export REDDIT_CLIENT_ID="" REDDIT_CLIENT_SECRET="" SUBREDDITS="" HUGGINGFACE_TOKEN="" GCS_RAW_BUCKET_NAME="" GCS_TRANSFORMED_BUCKET_NAME="" && python3 -m pytest -v + cd tests && export REDDIT_CLIENT_ID="" REDDIT_CLIENT_SECRET="" SUBREDDITS="" HUGGINGFACE_TOKEN="" GCS_RAW_BUCKET_NAME="" GCS_TRANSFORMED_BUCKET_NAME="" BIGQUERY_DATASET_ID="" BIGQUERY_TABLE_ID="" && python3 -m pytest -v first-time-setup: gcloud artifacts repositories create etl-images --location=asia-southeast1 --repository-format=docker diff --git a/src/common/reddit_client.py b/src/common/reddit_client.py index a9e03a4..d834eb3 100644 --- a/src/common/reddit_client.py +++ b/src/common/reddit_client.py @@ -40,11 +40,11 @@ def _remove_submissions_not_on_date( ) -> list[dict]: """Removes submissions not made on date""" return [ - submission for submission in submissions + submission + for submission in submissions if datetime.utcfromtimestamp(submission["created_utc"]).date() == date ] - def fetch_submissions_made_on_date(self, subreddit: str, date: Date) -> list[dict]: posts_made_on_date = [] last_post_id = None diff --git a/src/extract.py b/src/extract.py index cab9596..e558ffa 100644 --- a/src/extract.py +++ b/src/extract.py @@ -102,10 +102,10 @@ def parse_and_check_date(input_date: str) -> Date: date = datetime.strptime(input_date, "%d/%m/%Y").date() today = datetime.now().date() - more_than_a_month_ago = (today - date).days > 30 - if more_than_a_month_ago: + more_than_ten_days_ago = (today - date).days > 10 + if more_than_ten_days_ago: raise ValueError( - "Use extract_backfill.py to extract posts made more than a month ago.", + "Cannot extract for dates made more than 10 days ago. Data loss may occur.", ) return date