Merge branch 'dev' of https://github.com/opentargets/genetics_etl_python

into release/0.x.x
opentargets · Jan 5, 2024 · d11112f · d11112f
2 parents fd199ba + ac1064f
commit d11112f
Show file tree

Hide file tree

Showing 55 changed files with 1,729 additions and 300 deletions.
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -58,7 +58,7 @@ jobs:
 
       - name: Python Semantic Release
         id: release
-        uses: python-semantic-release/python-semantic-release@v8.3.0
+        uses: python-semantic-release/python-semantic-release@v8.7.0
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
 

diff --git a/Makefile b/Makefile
@@ -1,6 +1,6 @@
 PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
-APP_NAME ?= $$(cat pyproject.toml| grep name | cut -d" " -f3 | sed  's/"//g')
+APP_NAME ?= $$(cat pyproject.toml| grep -m 1 "name" | cut -d" " -f3 | sed  's/"//g')
 VERSION_NO ?= $$(poetry version --short)
 CLEAN_VERSION_NO := $(shell echo "$(VERSION_NO)" | tr -cd '[:alnum:]')
 BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/
@@ -35,8 +35,7 @@ build-documentation: ## Create local server with documentation
 	@echo "Building Documentation..."
 	@poetry run mkdocs serve
 
-create-dev-cluster: ## Spin up a simple dataproc cluster with all dependencies for development purposes
-	@${MAKE} build
+create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependencies for development purposes
 	@echo "Creating Dataproc Dev Cluster"
 	@gcloud config set project ${PROJECT_ID}
 	@gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_VERSION_NO}" \
@@ -49,8 +48,7 @@ create-dev-cluster: ## Spin up a simple dataproc cluster with all dependencies f
 		--optional-components=JUPYTER \
 		--enable-component-gateway
 
-make update-dev-cluster: ## Reinstalls the package on the dev-cluster
-	@${MAKE} build
+make update-dev-cluster: build ## Reinstalls the package on the dev-cluster
 	@echo "Updating Dataproc Dev Cluster"
 	@gcloud config set project ${PROJECT_ID}
 	gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_VERSION_NO}" \
@@ -61,7 +59,6 @@ make update-dev-cluster: ## Reinstalls the package on the dev-cluster
 build: clean ## Build Python package with dependencies
 	@gcloud config set project ${PROJECT_ID}
 	@echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}"
-	@rm -rf ./dist
 	@poetry build
 	@tar -czf dist/config.tar.gz config/
 	@echo "Uploading to Dataproc"

diff --git a/codecov.yml b/codecov.yml
@@ -1,3 +1,6 @@
+codecov:
+  branch: dev
+
 comment:
   layout: "reach, diff, flags, files"
   behavior: default

diff --git a/config/datasets/gcp.yaml b/config/datasets/gcp.yaml
@@ -24,7 +24,6 @@ catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-11-24a.txt
 ukbiobank_manifest: gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv
 l2g_gold_standard_curation: ${datasets.inputs}/l2g/gold_standard/curation.json
 gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data
-finngen_phenotype_table_url: https://r9.finngen.fi/api/phenos
 eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv
 
 # Output datasets

diff --git a/config/step/finngen.yaml b/config/step/finngen.yaml
diff --git a/config/step/finngen_studies.yaml b/config/step/finngen_studies.yaml
@@ -0,0 +1,2 @@
+_target_: otg.finngen_studies.FinnGenStudiesStep
+finngen_study_index_out: ${datasets.finngen_study_index}
diff --git a/config/step/finngen_sumstat_preprocess.yaml b/config/step/finngen_sumstat_preprocess.yaml
@@ -0,0 +1,3 @@
+_target_: otg.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep
+raw_sumstats_path: ???
+out_sumstats_path: ???
diff --git a/docs/python_api/datasource/_datasource.md b/docs/python_api/datasource/_datasource.md
@@ -4,4 +4,21 @@ title: Data Source
 
 # Data Source
 
-TBC
+This section contains information about the data sources used in Open Targets Genetics.
+
+We use GnomAD v4.0 as a source for variant annotation and GnomAD v2.1.1 as a source for linkage disequilibrium (LD) information (described in the **GnomAD** section).
+
+We rely on Open Targets as a source for the list of targets and the Gold Standard training set (described in the **Open Targets** section).
+
+## Study Sources
+
+1. GWAS catalog
+2. FinnGen
+
+## Molecular QTLs
+
+1. eQTL catalogue
+
+## Interaction / Interval-based Experiments
+
+We integrate a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications. For more details, see the **"Intervals"** section.
diff --git a/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md b/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md
@@ -0,0 +1,11 @@
+---
+title: eQTL Catalogue
+---
+
+The [eQTL Catalogue](https://www.ebi.ac.uk/eqtl/) aims to provide uniformly processed gene expression and splicing Quantitative Trait Loci (QTLs) from all available public studies on humans.
+
+It serves as the ultimate resource of eQTLs that we use for colocalization and target prioritization.
+
+We utilize data from the following study within the eQTL Catalogue:
+
+1. **GTEx v8**, 49 tissues
diff --git a/docs/python_api/datasource/finngen/_finngen.md b/docs/python_api/datasource/finngen/_finngen.md
@@ -12,4 +12,6 @@ title: FinnGen
   }
 </style>
 
-FinnGen is a research project in genomics and personalized medicine. It is large public-private partnership that has collected and analysed genome and health data from 500,000 Finnish biobank donors to understand the genetic basis of diseases. FinnGen is a now expanding into understanding the progression and biological mechanisms of diseases. FinnGen provides a world-class resource for further breakthroughs in disease prevention, diagnosis, and treatment and a outlook into our genetic make-up.
+[FinnGen](https://www.finngen.fi/en) is a research project in genomics and personalized medicine, representing a large public-private partnership. The project has collected and analyzed genome and health data from 500,000 Finnish biobank donors to understand the genetic basis of diseases. FinnGen is now expanding its focus to comprehend the progression and biological mechanisms of diseases. This initiative provides a world-class resource for further breakthroughs in disease prevention, diagnosis, and treatment, offering insights into our genetic makeup.
+
+For a comprehensive understanding of the dataset and methods, refer to [Kurki et al., 2023](https://www.nature.com/articles/s41586-022-05473-8).
diff --git a/docs/python_api/datasource/gnomad/_gnomad.md b/docs/python_api/datasource/gnomad/_gnomad.md
@@ -11,3 +11,9 @@ title: GnomAD
     display: none;
   }
 </style>
+
+[GnomAD](https://gnomad.broadinstitute.org/) (Genome Aggregation Database) is a comprehensive resource that provides aggregated genomic data from large-scale sequencing projects. It encompasses variants from diverse populations and is widely used for variant annotation and population genetics studies.
+
+We use **GnomAD v4.0** as a source for variant annotation, offering detailed information about the prevalence and distribution of genetic variants across different populations. This version of GnomAD provides valuable insights into the genomic landscape, aiding in the interpretation of genetic variants and their potential functional implications.
+
+Additionally, [**GnomAD v2.1.1**](https://gnomad.broadinstitute.org/news/2018-10-gnomad-v2-1/) is utilized as a source for linkage disequilibrium (LD) information.
diff --git a/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md b/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md
@@ -6,3 +6,17 @@ title: GWAS Catalog
   <img width="100" height="100" src="../../../../assets/imgs/GWAS_Catalog_circle_178x178.png">
   <h1>GWAS Catalog</h1>
 </div>
+
+The [GWAS Catalog](https://www.ebi.ac.uk/gwas/) is a comprehensive resource that aims to provide a curated collection of Genome-Wide Association Studies (GWAS) (including harmonized full GWAS summary statistics) across various traits and diseases in humans.
+
+It serves as a valuable repository of genetic associations identified in diverse populations, offering insights into the genetic basis of complex traits and diseases.
+
+We rely on the GWAS Catalog for a rich source of genetic associations, utilizing the data for analysis and interpretation.
+
+For detailed information on specific genetic associations, their significance, and associated studies, refer to the [GWAS Catalog](https://www.ebi.ac.uk/gwas/).
+
+Within our analyses, we leverage two different types of studies from the GWAS Catalog:
+
+1. **Studies with (full) GWAS summary stats**
+
+2. **Studies with top hits only - GWAS curated studies**
diff --git a/docs/python_api/datasource/intervals/_intervals.md b/docs/python_api/datasource/intervals/_intervals.md
@@ -1,7 +1,25 @@
 ---
-title: Chromatin intevals
+title: Interaction and Interval-based Studies
 ---
 
-# Chromatin intervals
+# List of Interaction and Interval-based Studies
 
-TBC
+In this section, we provide a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications.
+
+1. **Promoter Capture Hi-C (Javierre et al., 2016):**
+   _Title:_ "Lineage-Specific Genome Architecture Links Enhancers and Non-coding Disease Variants to Target Gene Promoters".
+   This study presents evidence linking genetic variation to genes through the application of Promoter Capture Hi-C across each of the 17 human primary hematopoietic cell types. The method captures interactions between promoters and distal regulatory elements, providing valuable insights into the three-dimensional chromatin architecture. DOI: 10.1016/j.cell.2016.09.037
+
+2. **Enhancer-TSS Correlation (Andersson et al., 2014):**
+   _Title:_ "An Atlas of Active Enhancers across Human Cell Types and Tissues".
+   This study explores genetic variation's impact on genes by examining the correlation between the transcriptional activity of enhancers and transcription start sites. The findings are documented in the FANTOM5 CAGE expression atlas, offering a comprehensive view of the regulatory landscape. DOI: 10.1038/nature12787
+
+3. **DHS-Promoter Correlation (Thurman et al., 2012):**
+   _Title:_ "The accessible chromatin landscape of the human genome".
+   Investigating genetic variation's connection to genes, this study employs the correlation of DNase I hypersensitive sites (DHS) and gene promoters. The analysis spans 125 cell and tissue types from the ENCODE project, providing a broad understanding of the regulatory interactions across diverse biological contexts. DOI: 10.1038/nature11232
+
+4. **Promoter Capture Hi-C (Jung et al., 2019):**
+   _Title:_ "A compendium of promoter-centered long-range chromatin interactions in the human genome".
+   This study compiles a compendium of promoter-centered long-range chromatin interactions in the human genome. By focusing on the three-dimensional organization of chromatin, the research contributes to our understanding of the spatial arrangement of genetic elements and their implications in gene regulation. DOI: 10.1038/s41588-019-0494-8
+
+For in-depth details on each study, you may refer to the respective publications.
diff --git a/docs/python_api/datasource/open_targets/_open_targets.md b/docs/python_api/datasource/open_targets/_open_targets.md
@@ -12,6 +12,12 @@ title: Open Targets
   }
 </style>
 
-The Open Targets Platform is a comprehensive resource that aims to aggregate and harmonise various types of data to facilitate the identification, prioritisation, and validation of drug targets. By integrating publicly available datasets including data generated by the Open Targets consortium, the Platform builds and scores target-disease associations to assist in drug target identification and prioritisation. It also integrates relevant annotation information about targets, diseases, phenotypes, and drugs, as well as their most relevant relationships.
+The Open Targets Platform is a comprehensive resource that aims to aggregate and harmonize various types of data to facilitate the identification, prioritization, and validation of drug targets. By integrating publicly available datasets, including data generated by the Open Targets consortium, the Platform builds and scores target-disease associations to assist in drug target identification and prioritization. It also integrates relevant annotation information about targets, diseases, phenotypes, and drugs, as well as their most relevant relationships.
 
-Genomic data from Open Targets integrates human genome-wide association studies (GWAS) and functional genomics data including gene expression, protein abundance, chromatin interaction and conformation data from a wide range of cell types and tissues to make robust connections between GWAS-associated loci, variants and likely causal genes.
+Within our analyses, we use Open Targets as the source of two datasets:
+
+1. **The list of targets:**
+   This dataset provides a compilation of targets. In the Open Targets Platform, a target is understood as any naturally-occurring molecule that can be targeted by a medicinal product. The EMBL-EBI Ensembl database serves as the source for human targets in the Platform, with the Ensembl gene ID as the primary identifier. For more details, refer to [this link](https://platform-docs.opentargets.org/target).
+
+2. **The list of Gold Standard Positives:**
+   We use this dataset for training the Locus-to-Gene model. The current list contains 496 Gold Standard Positives.
diff --git a/docs/python_api/datasource/ukbiobank/_ukbiobank.md b/docs/python_api/datasource/ukbiobank/_ukbiobank.md
diff --git a/docs/python_api/datasource/ukbiobank/study_index.md b/docs/python_api/datasource/ukbiobank/study_index.md
diff --git a/docs/python_api/method/carma.md b/docs/python_api/method/carma.md
@@ -0,0 +1,20 @@
+---
+title: CARMA
+---
+
+CARMA is a method for fine-mapping and outlier detection, originally implemented in R ([CARMA on GitHub](https://github.com/ZikunY/CARMA)).
+
+The full repository for the reimplementation of CARMA in Python can be found [here](https://github.com/hlnicholls/carmapy/tree/0.1.0).
+
+This is a simplified version of CARMA with the following features:
+
+1. It uses only Spike-slab effect size priors and Poisson model priors.
+2. The C++ code is re-implemented in Python.
+3. The configuration list is stored differently: as a string listing the indexes of the causal SNPs instead of a sparse matrix.
+4. Fixed bugs in PIP calculation.
+5. No credible models.
+6. No credible sets, only PIPs.
+7. No functional annotations.
+8. Removed unnecessary parameters.
+
+:::otg.method.carma.CARMA
diff --git a/docs/python_api/step/finngen.md b/docs/python_api/step/finngen.md
diff --git a/docs/python_api/step/finngen_studies.md b/docs/python_api/step/finngen_studies.md
@@ -0,0 +1,5 @@
+---
+title: FinnGen Studies
+---
+
+::: otg.finngen_studies.FinnGenStudiesStep
diff --git a/docs/python_api/step/finngen_sumstat_preprocess.md b/docs/python_api/step/finngen_sumstat_preprocess.md
@@ -0,0 +1,5 @@
+---
+title: FinnGen Preprocess Summary Stats
+---
+
+::: otg.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep