Merge pull request #688 from opentargets/dev

chore: trigger release process
opentargets · Jul 12, 2024 · 1dfc55a · 1dfc55a
2 parents 54be3ce + 16f3d71
commit 1dfc55a
Show file tree

Hide file tree

Showing 114 changed files with 6,165 additions and 2,523 deletions.
diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
@@ -3,12 +3,13 @@ name: Build and Push to Artifact Registry
 "on":
   push:
     branches: ["dev"]
+    tags: ["v*"]
 
 env:
   PROJECT_ID: open-targets-genetics-dev
   REGION: europe-west1
   GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
-  IMAGE_NAME: gentropy-app
+  REPOSITORY: gentropy-app
 
 jobs:
   build-push-artifact:
@@ -33,7 +34,13 @@ jobs:
           gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
 
       - name: Build image
-        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"
+        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/gentropy:${{ github.ref_name }}"
 
       - name: Push image
-        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/gentropy:${{ github.ref_name }}"
+
+      - name: Build VEP image
+        run: docker build src/vep --tag "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/custom_ensembl_vep:${{ github.ref_name }}"
+
+      - name: Push VEP image
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.REPOSITORY }}/custom_ensembl_vep:${{ github.ref_name }}"
diff --git a/.github/workflows/pr_release_trigger.yaml b/.github/workflows/pr_release_trigger.yaml
@@ -15,7 +15,7 @@ jobs:
           source_branch: "dev"
           destination_branch: "main"
           pr_title: "chore: trigger release process"
-          pr_body: ":warning: *This PR requires a MERGE or REBASE COMMIT (Don't squash!)*"
+          pr_body: ":warning: *This PR requires a MERGE COMMIT (Don't squash!)*"
           pr_label: "auto-pr"
           pr_draft: false
           pr_allow_empty: true

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -31,7 +31,7 @@ jobs:
 
       - name: Python Semantic Release
         id: semrelease
-        uses: python-semantic-release/python-semantic-release@v9.6.0
+        uses: python-semantic-release/python-semantic-release@v9.8.3
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
 

diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@ src/airflow/logs/*
 site/
 .env
 .coverage*
+wandb/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
   skip: [poetry-lock]
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.5.1
     hooks:
       - id: ruff
         args:
@@ -21,7 +21,6 @@ repos:
       - id: end-of-file-fixer
         exclude: "CHANGELOG.md"
       - id: debug-statements
-      - id: check-merge-conflict
       - id: check-case-conflict
       - id: check-json
         exclude: (.vscode|.devcontainer)
@@ -66,7 +65,7 @@ repos:
         stages: [commit-msg]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.10.0"
+    rev: "v1.10.1"
     hooks:
       - id: mypy
         args:
@@ -99,7 +98,7 @@ repos:
       - id: beautysh
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 0.4.1
+    rev: 0.5.3
     hooks:
       - id: pydoclint
 

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -25,5 +25,8 @@
   "python.testing.pytestEnabled": true,
   "mypy-type-checker.severity": {
     "error": "Information"
-  }
+  },
+  "yaml.extension.recommendations": false,
+  "workbench.remoteIndicator.showExtensionRecommendations": false,
+  "extensions.ignoreRecommendations": true
 }
diff --git a/Dockerfile b/Dockerfile
@@ -1,10 +1,26 @@
 FROM python:3.10-bullseye
 
-
+ARG CLOUD_SDK_VERSION=452.0.0
+ENV GCLOUD_HOME=/home/google-cloud-sdk
 RUN apt-get update && \
-    apt-get install -y openjdk-11-jdk && \
+    apt-get install -y openjdk-11-jdk jq && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
+ENV PATH="$PATH:$GCLOUD_HOME"
+# required by google batch scripts
+RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \
+    && TMP_DIR="$(mktemp -d)" \
+    && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \
+    && mkdir -p "${GCLOUD_HOME}" \
+    && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \
+    && "${GCLOUD_HOME}/install.sh" \
+    --bash-completion=false \
+    --path-update=false \
+    --usage-reporting=false \
+    --quiet \
+    && rm -rf "${TMP_DIR}"
+
+ENV PATH="$PATH:$GCLOUD_HOME/bin"
 RUN java -version
 
 # Set environment variables for Java

diff --git a/Makefile b/Makefile
@@ -44,7 +44,11 @@ create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependen
 		--master-machine-type n1-standard-16 \
 		--initialization-actions=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/install_dependencies_on_cluster.sh \
 		--metadata="PACKAGE=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/gentropy-${VERSION_NO}-py3-none-any.whl,CONFIGTAR=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/config.tar.gz" \
-		--single-node \
+		--num-workers 4 \
+		--primary-worker-type n1-standard-8 \
+		--worker-machine-type n1-standard-4 \
+		--worker-boot-disk-size 500 \
+		--autoscaling_policy=f"projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/eqtl-preprocess", \
 		--optional-components=JUPYTER \
 		--enable-component-gateway
 

diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
@@ -37,13 +37,13 @@ gnomad_public_bucket: gs://gcp-public-data--gnomad/release/
 ld_matrix_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm
 ld_index_raw_template: ${datasets.gnomad_public_bucket}/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht
 liftover_ht_path: ${datasets.gnomad_public_bucket}/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht
-# variant_annotation
+# GnomAD variant set:
 gnomad_genomes_path: ${datasets.gnomad_public_bucket}4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/
 
 # Others
 chain_38_37: gs://hail-common/references/grch38_to_grch37.over.chain.gz
 chain_37_38: ${datasets.static_assets}/grch37_to_grch38.over.chain
-vep_consequences: ${datasets.static_assets}/vep_consequences.tsv
+vep_consequences: ${datasets.static_assets}/variant_consequence_to_score.tsv
 anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed
 javierre: ${datasets.static_assets}/javierre_2016_preprocessed
 jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv
@@ -55,7 +55,7 @@ finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r
 finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv
 
 # Dev output datasets
-variant_annotation: ${datasets.outputs}/variant_annotation
+gnomad_variants: ${datasets.outputs}/gnomad_variants
 study_locus: ${datasets.outputs}/study_locus
 summary_statistics: ${datasets.outputs}/summary_statistics
 study_locus_overlap: ${datasets.outputs}/study_locus_overlap
@@ -68,14 +68,16 @@ catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
 from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
 from_sumstats_pics: ${datasets.credible_set}/from_sumstats
 
+vep_output_path: gs://genetics_etl_python_playground/vep/full_variant_index_vcf
+
 # ETL output datasets:
 l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
-l2g_model: ${datasets.release_folder}/locus_to_gene_model
+l2g_model: ${datasets.release_folder}/locus_to_gene_model/classifier.skops
 l2g_predictions: ${datasets.release_folder}/locus_to_gene_predictions
 l2g_feature_matrix: ${datasets.release_folder}/locus_to_gene_feature_matrix
 colocalisation: ${datasets.release_folder}/colocalisation
 study_index: ${datasets.release_folder}/study_index
 variant_index: ${datasets.release_folder}/variant_index
 credible_set: ${datasets.release_folder}/credible_set
 gene_index: ${datasets.release_folder}/gene_index
-v2g: ${datasets.release_folder}/variant_to_gene
+variant_to_gene: ${datasets.release_folder}/variant_to_gene
diff --git a/config/step/ot_finngen_studies.yaml b/config/step/ot_finngen_studies.yaml
diff --git a/config/step/ot_finngen_sumstat_preprocess.yaml b/config/step/ot_finngen_sumstat_preprocess.yaml
diff --git a/config/step/ot_gwas_catalog_sumstat_preprocess.yaml b/config/step/ot_gwas_catalog_sumstat_preprocess.yaml
diff --git a/config/step/ot_locus_to_gene_predict.yaml b/config/step/ot_locus_to_gene_predict.yaml
@@ -2,7 +2,7 @@ defaults:
   - locus_to_gene
 
 run_mode: predict
-model_path: ${datasets.l2g_model}
+model_path: null
 predictions_path: ${datasets.l2g_predictions}
 feature_matrix_path: ${datasets.l2g_feature_matrix}
 credible_set_path: ${datasets.credible_set}

diff --git a/config/step/ot_locus_to_gene_train.yaml b/config/step/ot_locus_to_gene_train.yaml
@@ -3,7 +3,7 @@ defaults:
 
 run_mode: train
 wandb_run_name: null
-perform_cross_validation: false
+hf_hub_repo_id: opentargets/locus_to_gene
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
 credible_set_path: ${datasets.credible_set}
@@ -13,5 +13,7 @@ study_index_path: ${datasets.study_index}
 gold_standard_curation_path: ${datasets.l2g_gold_standard_curation}
 gene_interactions_path: ${datasets.gene_interactions}
 hyperparameters:
+  n_estimators: 100
   max_depth: 5
-  loss_function: binary:logistic
+  loss: log_loss
+download_from_hub: true
diff --git a/config/step/ot_pics.yaml b/config/step/ot_pics.yaml
diff --git a/config/step/ot_ukb_ppp_eur_sumstat_preprocess.yaml b/config/step/ot_ukb_ppp_eur_sumstat_preprocess.yaml
@@ -0,0 +1,13 @@
+defaults:
+  - ukb_ppp_eur_sumstat_preprocess
+
+raw_study_index_path: ???
+raw_summary_stats_path: ???
+variant_annotation_path: ???
+tmp_variant_annotation_path: ???
+study_index_output_path: ???
+summary_stats_output_path: ???
+
+session:
+  extended_spark_conf:
+    "spark.sql.shuffle.partitions": "3200"
diff --git a/config/step/ot_variant_annotation.yaml b/config/step/ot_variant_annotation.yaml
diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
@@ -1,6 +1,6 @@
 defaults:
   - variant_index
 
-variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.credible_set}
+vep_output_json_path: ${datasets.vep_output_path}
+gnomad_variant_annotations_path: ${datasets.gnomad_variants}
 variant_index_path: ${datasets.variant_index}
diff --git a/config/step/ot_variant_to_gene.yaml b/config/step/ot_variant_to_gene.yaml
@@ -2,7 +2,6 @@ defaults:
   - variant_to_gene
 
 variant_index_path: ${datasets.variant_index}
-variant_annotation_path: ${datasets.variant_annotation}
 gene_index_path: ${datasets.gene_index}
 vep_consequences_path: ${datasets.vep_consequences}
 liftover_chain_file_path: ${datasets.chain_37_38}
@@ -11,4 +10,4 @@ interval_sources:
   javierre: ${datasets.javierre}
   jung: ${datasets.jung}
   thurman: ${datasets.thurman}
-v2g_path: ${datasets.v2g}
+v2g_path: ${datasets.variant_to_gene}
diff --git a/config/step/ot_window_based_clumping.yaml b/config/step/ot_window_based_clumping.yaml
diff --git a/docs/assets/imgs/ensembl_logo.png b/docs/assets/imgs/ensembl_logo.png
diff --git a/docs/development/contributing.md b/docs/development/contributing.md
@@ -41,7 +41,7 @@ In order to run the code:
    - Note that the version must comply with [PEP440 conventions](https://peps.python.org/pep-0440/#normalization), otherwise Poetry will not allow it to be deployed.
    - Do not use underscores or hyphens in your version name. When building the WHL file, they will be automatically converted to dots, which means the file name will no longer match the version and the build will fail. Use dots instead.
 
-3. Manually edit your local `src/airflow/dags/common_airflow.py` and set `OTG_VERSION` to the same version as you did in the previous step.
+3. Manually edit your local `src/airflow/dags/common_airflow.py` and set `GENTROPY_VERSION` to the same version as you did in the previous step.
 
 4. Run `make build`.
 
@@ -66,21 +66,27 @@ For more details on each of these steps, see the sections below.
 
 - If during development you had a question which wasn't covered in the documentation, and someone explained it to you, add it to the documentation. The same applies if you encountered any instructions in the documentation which were obsolete or incorrect.
 - Documentation autogeneration expressions start with `:::`. They will automatically generate sections of the documentation based on class and method docstrings. Be sure to update them for:
-  - Dataset definitions in `docs/python_api/datasource/STEP` (example: `docs/python_api/datasource/finngen/study_index.md`)
-  - Step definition in `docs/python_api/step/STEP.md` (example: `docs/python_api/step/finngen.md`)
+  - Datasource main page, for example: `docs/python_api/datasources/finngen/_finngen.md`
+  - Dataset definitions, for example: `docs/python_api/datasources/finngen/study_index.md`
+  - Step definition, for example: `docs/python_api/steps/finngen_sumstat_preprocess.md`
 
 ### Configuration
 
-- Input and output paths in `config/datasets/gcp.yaml`
-- Step configuration in `config/step/STEP.yaml` (example: `config/step/finngen.yaml`)
+- Input and output paths in `config/datasets/ot_gcp.yaml`
+- Step configuration, for example: `config/step/ot_finngen_sumstat_preprocess.yaml`
 
 ### Classes
 
-- Dataset class in `src/gentropy/datasource/STEP` (example: `src/gentropy/datasource/finngen/study_index.py` → `FinnGenStudyIndex`)
-- Step main running class in `src/gentropy/STEP.py` (example: `src/gentropy/finngen.py`)
+- Datasource init, for example: `src/gentropy/datasource/finngen/__init__.py`
+- Dataset classes, for example: `src/gentropy/datasource/finngen/study_index.py` → `FinnGenStudyIndex`
+- Step main running class, for example: `src/gentropy/finngen_sumstat_preprocess.py`
 
 ### Tests
 
-- Test study fixture in `tests/conftest.py` (example: `mock_study_index_finngen` in that module)
-- Test sample data in `tests/data_samples` (example: `tests/gentropy/data_samples/finngen_studies_sample.json`)
-- Test definition in `tests/` (example: `tests/dataset/test_study_index.py` → `test_study_index_finngen_creation`)
+- Test study fixture in `tests/conftest.py`, for example: `mock_study_index_finngen` in that module
+- Test sample data, for example: `tests/gentropy/data_samples/finngen_studies_sample.json`
+- Test definition, for example: `tests/dataset/test_study_index.py` → `test_study_index_finngen_creation`
+
+### Orchestration
+
+- Airflow DAG, for example: `src/airflow/dags/finngen_harmonisation.py`
diff --git a/docs/python_api/datasets/variant_annotation.md b/docs/python_api/datasets/variant_annotation.md
diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
@@ -10,6 +10,7 @@ This section contains information about the data source harmonisation tools avai
 
 1. [GWAS Catalog](gwas_catalog/_gwas_catalog.md) (with or without full summary statistics)
 1. [FinnGen](finngen/_finngen.md)
+1. [UKB PPP (EUR)](ukb_ppp_eur/_ukb_ppp_eur.md)
 
 ## Molecular QTLs
 
@@ -22,7 +23,8 @@ This section contains information about the data source harmonisation tools avai
 ## Variant annotation/validation
 
 1. [GnomAD](gnomad/_gnomad.md) v4.0
-1. GWAS catalog harmonisation pipeline [more info](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
+3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
 ## Linkage disequilibrium
 

diff --git a/docs/python_api/datasources/ensembl/_ensembl.md b/docs/python_api/datasources/ensembl/_ensembl.md
@@ -0,0 +1,10 @@
+---
+title: Ensembl annotations
+---
+
+<div align="center">
+  <img width="100" height="100" src="../../../../assets/imgs/ensembl_logo.png">
+  <h1>Ensembl</h1>
+</div>
+
+[Ensembl](https://www.ensembl.org/index.html) provides a diverse set of genetic data that Gentropy takes advantage of, including gene sets and variant annotations.
diff --git a/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md b/docs/python_api/datasources/ensembl/variant_effect_predictor_parser.md
@@ -0,0 +1,5 @@
+---
+title: Variant Effect Predictor parser
+---
+
+::: gentropy.datasource.ensembl.vep_parser.VariantEffectPredictorParser
diff --git a/docs/python_api/datasources/ukb_ppp_eur/_ukb_ppp_eur.md b/docs/python_api/datasources/ukb_ppp_eur/_ukb_ppp_eur.md
@@ -0,0 +1,7 @@
+---
+title: UKB-PPP (EUR)
+---
+
+The UKB-PPP is a collaboration between the UK Biobank (UKB) and thirteen biopharmaceutical companies characterising the plasma proteomic profiles of 54,219 UKB participants.
+
+The original data is available at <https://www.synapse.org/#!Synapse:syn51364943/>. The associated paper is <https://www.nature.com/articles/s41586-023-06592-6>.