Merge pull request #65 from uclahs-cds/nzeltser-indel-matching
indel matching on rsid
alkaZeltser authored Sep 18, 2024
2 parents 0438dfc + cb4c4f8 commit bbe9bf5
Showing 20 changed files with 499 additions and 91 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import('vcfR');
importFrom(
'data.table',
'tstrsplit'
'tstrsplit',
'setDT'
);
import('reshape2');
import('BoutrosLab.plotting.general');
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Unreleased
* Added handling of overlapping deletion allele notation
* Added secondary PGS/VCF variant matching method using rsID after first attempt with genomic coordinates
* Added checks for rsID as an optional column in input PGS weight files

# ApplyPolygenicScore 2.0.0 (2024-07-31)

Expand Down
24 changes: 16 additions & 8 deletions R/apply-pgs.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,11 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
#'
#' \strong{Missing Genotype Handling}
#'
#' Missing genotypes are handled by three methods:
#' VCF genotype data are matched to PGS data by chromosome, position, and effect allele. If a SNP cannot be matched by genomic coordinate,
#' an attempt is made to match by rsID (if available). If a SNP from the PGS weight data is not found in the VCF data after these two matching attempts,
#' it is considered a cohort-wide missing variant.
#'
#' Missing genotypes (in individual samples) among successfully matched variants are handled by three methods:
#'
#' \code{none}: Missing genotype dosages are excluded from the PGS calculation.
#' This is equivalent to assuming that all missing genotypes are homozygous for the non-effect allele, resulting in a dosage of 0.
Expand All @@ -158,13 +162,13 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
#' where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2
#' This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium.
#' By default, the effect allele frequency is calculated from the provided VCF data.
#' For variants that are missing in all individuals, dosage is assumed to be zero (homozygous non-reference) for all individuals.
#' For variants that are missing in all individuals (cohort-wide), dosage is assumed to be zero (homozygous non-reference) for all individuals.
#' An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}.
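For readers skimming the docs above: under Hardy-Weinberg equilibrium the expected dosage simplifies to twice the effect allele frequency. A minimal standalone sketch of that arithmetic (illustrative only; the package's `convert.allele.frequency.to.dosage` helper used later in this diff presumably performs the equivalent conversion):

```r
# Sketch only: expected effect-allele dosage under Hardy-Weinberg equilibrium.
# With effect allele frequency p and ploidy P = 2:
#   E[dosage] = 2 * p^2 + 1 * 2 * p * (1 - p) + 0 * (1 - p)^2 = 2 * p
expected.dosage <- function(effect.allele.frequency) {
    return(2 * effect.allele.frequency);
    }

expected.dosage(0.25);   # 0.5 effect-allele copies expected per individual
```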
#'
#' \strong{Multiallelic Site Handling}
#'
#' VCF genotype data are matched to PGS data by chromosome, position, and effect allele. If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the
#' alleles that each individual carries. It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted.
#' If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the alleles that each individual carries.
#' It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted.
#' VCF data can be formatted to merged format using external tools for VCF file manipulation.
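To make the "merged" requirement concrete, here is a toy illustration of the two encodings (made-up values); tools such as bcftools provide a normalization step that can merge split multiallelic records, though the exact invocation depends on your toolchain:

```r
# Merged encoding (accepted): one row carries all ALT alleles for the site
merged.site <- data.frame(
    CHROM = '1',
    POS = 100L,
    REF = 'A',
    ALT = 'G,T'
    );

# Split encoding (not accepted): the same site spread across one row per ALT
split.site <- data.frame(
    CHROM = c('1', '1'),
    POS = c(100L, 100L),
    REF = c('A', 'A'),
    ALT = c('G', 'T')
    );
```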
#'
#' @examples
Expand Down Expand Up @@ -293,18 +297,22 @@ apply.polygenic.score <- function(
);

### Start Missing Genotype Handling ###

variant.id <- paste(merged.vcf.with.pgs.data$CHROM, merged.vcf.with.pgs.data$POS, merged.vcf.with.pgs.data$effect_allele, sep = ':');
if ('mean.dosage' %in% missing.genotype.method) {
# calculate dosage to replace missing genotypes
if (use.external.effect.allele.frequency) {
missing.genotype.dosage <- convert.allele.frequency.to.dosage(allele.frequency = pgs.weight.data$allelefrequency_effect);
names(missing.genotype.dosage) <- paste(pgs.weight.data$CHROM, pgs.weight.data$POS, pgs.weight.data$effect_allele, sep = ':');
missing.genotype.dosage <- data.frame(dosage = convert.allele.frequency.to.dosage(allele.frequency = merged.vcf.with.pgs.data$allelefrequency_effect));
missing.genotype.dosage$variant.id <- variant.id;
# remove duplicated rows
missing.genotype.dosage <- missing.genotype.dosage[!duplicated(missing.genotype.dosage$variant.id), ];
# convert to named vector
missing.genotype.dosage <- setNames(missing.genotype.dosage$dosage, missing.genotype.dosage$variant.id);
# identify missing genotypes
# this method includes variants that are missing in all Indivs; these are all replaced with the same mean dosage
missing.genotype.row.index <- which(is.na(merged.vcf.with.pgs.data$dosage));
} else {
# create sample by variant dosage matrix
variant.id <- paste(merged.vcf.with.pgs.data$CHROM, merged.vcf.with.pgs.data$POS, merged.vcf.with.pgs.data$effect_allele, sep = ':');

dosage.matrix <- get.variant.by.sample.matrix(
long.data = merged.vcf.with.pgs.data,
variant.id = variant.id,
Expand Down
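The external-frequency path above keys a named vector of expected dosages by a `CHROM:POS:effect_allele` variant ID. A small sketch of how such a keyed lookup fills missing dosages (illustrative only; the within-cohort path in the actual code instead builds a variant-by-sample matrix via `get.variant.by.sample.matrix`):

```r
# Sketch: fill missing dosages from a named per-variant mean-dosage vector,
# keyed by the same CHROM:POS:effect_allele IDs used in the diff above
dosage <- c(2, NA, 1, NA);
variant.id <- c('chr1:100:A', 'chr1:100:A', 'chr2:200:T', 'chr2:200:T');
mean.dosage <- c('chr1:100:A' = 0.5, 'chr2:200:T' = 1.2);

missing.index <- which(is.na(dosage));
dosage[missing.index] <- mean.dosage[variant.id[missing.index]];
dosage;   # NA entries replaced by the per-variant expected dosage
```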
103 changes: 97 additions & 6 deletions R/combine-vcf-with-pgs.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#' @title Combine VCF with PGS
#' @description Match PGS SNPs to corresponding VCF information by genomic coordinates using a merge operation.
#' @description Match PGS SNPs to corresponding VCF information by genomic coordinates or rsID using a merge operation.
#' @param vcf.data A data.frame containing VCF data. Required columns: \code{CHROM, POS}.
#' @param pgs.weight.data A data.frame containing PGS data. Required columns: \code{CHROM, POS}.
#' @return A list containing a data.frame of merged VCF and PGS data and a data.frame of PGS SNPs missing from the VCF.
#'
#' Merge is performed on chromosome and base pair coordinates.
#' A primary merge is first performed on chromosome and base pair coordinates. For SNPs that could not be matched in the first merge, a second merge is attempted by rsID, if available.
#' This accounts for short INDELs, which can have coordinate mismatches between the PGS and VCF data.
#' The merge is a left outer join: all PGS SNPs are kept as rows even if they are missing from the VCF, and all VCF SNPs that are not a component of the PGS are dropped.
#' If no PGS SNPs are present in the VCF, the function will terminate with an error.
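As a rough illustration of the two-stage matching described above, here is a standalone sketch on toy data (made-up values; the package function additionally tracks the merge strategy, handles column suffixes, and reports missing SNPs):

```r
# Toy inputs: rs2 is a short INDEL whose position disagrees between the files
pgs <- data.frame(
    CHROM = c('1', '1'),
    POS = c(100L, 200L),
    ID = c('rs1', 'rs2'),
    effect_allele = c('G', 'C')
    );
vcf <- data.frame(
    CHROM = c('1', '1'),
    POS = c(100L, 201L),
    ID = c('rs1', 'rs2'),
    REF = c('A', 'CT')
    );

# Stage 1: left outer join on genomic coordinates keeps every PGS row
stage1 <- merge(pgs, vcf, by = c('CHROM', 'POS'), suffixes = c('.pgs', '.vcf'), all.x = TRUE);

# Stage 2: rows still unmatched after stage 1 (REF is NA) are retried on rsID
still.missing <- is.na(stage1$REF);
rescued <- merge(
    data.frame(ID = stage1$ID.pgs[still.missing]),
    vcf,
    by = 'ID',
    all.x = TRUE
    );
rescued;   # rs2 is recovered via rsID despite the coordinate mismatch
```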
#' @examples
Expand Down Expand Up @@ -52,6 +53,18 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
stop('pgs.weight.data must contain columns named CHROM and POS');
}

# check for optional ID column
rsid.available <- TRUE;
if (!'ID' %in% colnames(vcf.data)) {
rsid.available <- FALSE;
warning('ID column not found in VCF data. Merging by rsID will not be possible.');
}

if (!'ID' %in% colnames(pgs.weight.data)) {
rsid.available <- FALSE;
warning('ID column not found in PGS weight data. Merging by rsID will not be possible.');
}

# match CHROM notation in pgs data to vcf data formatting
chr.prefix <- grepl('^chr', vcf.data$CHROM[1]);
numeric.sex.chr <- any(grepl('23$', vcf.data$CHROM));
Expand All @@ -69,19 +82,97 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
x = pgs.weight.data,
y = vcf.data,
by = c('CHROM', 'POS'),
suffixes = c('.pgs', '.vcf'),
all.x = TRUE
);

merged.vcf.with.pgs.data$merge.strategy <- 'genomic coordinate';

# check for pgs SNPs missing from the VCF data
missing.snp.data <- NULL;
missing.pgs.snp.index <- is.na(merged.vcf.with.pgs.data$REF);

if (all(missing.pgs.snp.index)) {
stop('All PGS SNPs are missing from the VCF, terminating merge.');
# attempt additional merge for missing SNPs, this time on rsID
if (any(missing.pgs.snp.index) & rsid.available) {
# extract merged rows corresponding to missing PGS SNPs
missing.snp.merged.data <- merged.vcf.with.pgs.data[missing.pgs.snp.index, ];
# rename ID column to match pre-merge pgs weight data
missing.snp.merged.data$ID <- missing.snp.merged.data$ID.pgs;
missing.snp.merged.data$ID.vcf <- NULL;
missing.snp.merged.data$ID.pgs <- NULL;
# extract only original pgs weight columns from merged missing SNP data (for neater second merge)
missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
rm(missing.snp.merged.data);

# Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
# the most efficient way to do this is with the data.table package
if (any(grepl(';', vcf.data$ID))) {
data.table::setDT(vcf.data);
split.rsid.vcf.data <- merge(
x = vcf.data,
# split only entries with multiple rsIDs, save in new column, and merge back with the original data
y = vcf.data[grepl(';', ID), unique(unlist(strsplit(as.character(ID), ';', fixed = TRUE))), by = .(Indiv, CHROM, POS)
][,.(new.ID = V1, Indiv, CHROM, POS)],
by = c('CHROM', 'POS', 'Indiv'),
all = TRUE
);
# replace entries with multiple rsIDs with the new, split, rsID
split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
} else {
split.rsid.vcf.data <- vcf.data;
}

# merge missing SNP data on split rsID
merged.vcf.with.missing.pgs.data <- merge(
x = missing.snp.pgs.weight.data,
y = split.rsid.vcf.data,
suffixes = c('.pgs', '.vcf'),
by = 'ID',
all.x = TRUE
);
rm(split.rsid.vcf.data);

# update missing SNP index
second.merge.missing.pgs.snp.index <- is.na(merged.vcf.with.missing.pgs.data$REF);

if (all(missing.pgs.snp.index) & (sum(missing.pgs.snp.index) == sum(second.merge.missing.pgs.snp.index))) {
stop('All PGS SNPs are missing from the VCF, terminating merge.');
} else if (any(second.merge.missing.pgs.snp.index)) {
warning(paste('PGS is missing', sum(second.merge.missing.pgs.snp.index)), ' SNPs from VCF');
missing.snp.data <- merged.vcf.with.missing.pgs.data[second.merge.missing.pgs.snp.index, ];
} else {
missing.snp.data <- NULL;
}

# keep coordinates from VCF data for matched SNPs with coordinate mismatch
merged.vcf.with.missing.pgs.data[!is.na(merged.vcf.with.missing.pgs.data$REF), 'CHROM'] <- merged.vcf.with.missing.pgs.data[!is.na(merged.vcf.with.missing.pgs.data$REF), 'CHROM.vcf'];
merged.vcf.with.missing.pgs.data[!is.na(merged.vcf.with.missing.pgs.data$REF), 'POS'] <- merged.vcf.with.missing.pgs.data[!is.na(merged.vcf.with.missing.pgs.data$REF), 'POS.vcf'];

# keep coordinates from PGS data for unmatched (missing) SNPs
merged.vcf.with.missing.pgs.data[is.na(merged.vcf.with.missing.pgs.data$REF), 'CHROM'] <- merged.vcf.with.missing.pgs.data[is.na(merged.vcf.with.missing.pgs.data$REF), 'CHROM.pgs'];
merged.vcf.with.missing.pgs.data[is.na(merged.vcf.with.missing.pgs.data$REF), 'POS'] <- merged.vcf.with.missing.pgs.data[is.na(merged.vcf.with.missing.pgs.data$REF), 'POS.pgs'];

# add columns to match original merge
merged.vcf.with.missing.pgs.data$ID.pgs <- merged.vcf.with.missing.pgs.data$ID;
merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID;
merged.vcf.with.missing.pgs.data$merge.strategy <- 'rsID';

# subset columns to match original merge
merged.vcf.with.missing.pgs.data <- subset(merged.vcf.with.missing.pgs.data, select = colnames(merged.vcf.with.pgs.data));

# combine merged data
merged.vcf.with.pgs.data <- rbind(merged.vcf.with.pgs.data[!missing.pgs.snp.index, ], merged.vcf.with.missing.pgs.data);
rm(merged.vcf.with.missing.pgs.data);

} else if (any(missing.pgs.snp.index)) {
warning(paste('PGS is missing', sum(missing.pgs.snp.index)), ' SNPs from VCF');
missing.snp.data <- merged.vcf.with.pgs.data[missing.pgs.snp.index, ];

warning(paste('PGS is missing', sum(missing.pgs.snp.index)), ' SNPs from VCF');
missing.snp.data <- merged.vcf.with.pgs.data[missing.pgs.snp.index, ];

} else {

missing.snp.data <- NULL;

}

output <- list(
Expand Down
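One step in the new rsID path above deserves a closer look: VCF ID fields can hold several semicolon-separated rsIDs, which must be expanded to one row per rsID before the second merge. A simplified data.table sketch of that expansion (toy values; the package version also groups by Indiv and preserves all original columns):

```r
library(data.table);

# Made-up VCF rows; the second record carries two rsIDs in one ID field
vcf.dt <- data.table(
    CHROM = c('1', '1'),
    POS = c(100L, 200L),
    ID = c('rs1', 'rs2;rs3')
    );

# Expand to one row per rsID by splitting the ID field on ';'
split.rsid <- vcf.dt[, .(ID = unlist(strsplit(as.character(ID), ';', fixed = TRUE))), by = .(CHROM, POS)];
split.rsid;   # three rows: rs1 at POS 100, rs2 and rs3 both at POS 200
```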
21 changes: 20 additions & 1 deletion R/handle-weight-files.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ check.pgs.weight.columns <- function(pgs.weight.colnames, harmonized = TRUE) {
stop('The following required columns are missing from the PGS weight file: ', paste(setdiff(required.columns, pgs.weight.colnames), collapse = ', '));
}

rsid.columns <- c('rsID', 'hm_rsID');
if (!any(rsid.columns %in% pgs.weight.colnames)) {
warning('rsID or hm_rsID column not found in PGS weight file. Merging by rsID will not be possible.');
}

return(TRUE);
}

Expand Down Expand Up @@ -135,14 +140,28 @@ import.pgs.weight.file <- function(pgs.weight.path, use.harmonized.data = TRUE)
if (use.harmonized.data) {

# label harmonized data columns with standardized names
# if harmonized rsID column is present and not empty, use it
if ('hm_rsID' %in% colnames(pgs.weight.data)) {
if (!all(is.na(pgs.weight.data$hm_rsID))) {
pgs.weight.data$ID <- pgs.weight.data$hm_rsID;
}
}

pgs.weight.data$CHROM <- pgs.weight.data$hm_chr;
pgs.weight.data$POS <- pgs.weight.data$hm_pos;
format.harmonized.columns <- TRUE;

} else {

# label non-harmonized data columns with standardized names
# if rsID column is present and not empty, use it
if ('rsID' %in% colnames(pgs.weight.data)) {
if (!all(is.na(pgs.weight.data$rsID) | pgs.weight.data$rsID == '')) {
pgs.weight.data$ID <- pgs.weight.data$rsID;
}
}

pgs.weight.data$CHROM <- pgs.weight.data$chr_name;
pgs.weight.data$POS <- pgs.weight.data$chr_position;

}

# extract weight format from file metadata key 'weight_type'
Expand Down
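To summarize the rsID handling added to the importer above: the harmonized hm_rsID column is preferred when use.harmonized.data is TRUE, the plain rsID column is used otherwise, and either source is skipped when it is entirely empty. A condensed standalone sketch of that decision (not the package's exact code):

```r
# Sketch: pick the source for the standardized ID column, mirroring the
# behaviour described in the diff above (illustrative, not package code)
standardize.rsid.column <- function(pgs.weight.data, use.harmonized.data = TRUE) {
    candidate <- if (use.harmonized.data) 'hm_rsID' else 'rsID';
    if (candidate %in% colnames(pgs.weight.data)) {
        values <- pgs.weight.data[[candidate]];
        if (!all(is.na(values) | values == '')) {
            pgs.weight.data$ID <- values;
            }
        }
    return(pgs.weight.data);
    }
```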
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Below is an overview of the data you will require to get started. For even more
### Input Data
You will need only two pieces of data to get started:
1. A VCF file: Genotype data of the individuals upon which you wish to apply a polygenic score.
2. A PGS weight file: Coordinates of each SNP that compose the polygenic score you with to apply, and their associated weights.
2. A PGS weight file: Coordinates of each SNP that composes the polygenic score you wish to apply, a SNP ID, and their associated weights.

#### Genotype data
- Genotype data should be provided in the form of a VCF (Variant Call Format) file.
Expand All @@ -47,7 +47,16 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol
- [The PGS Catalog](https://www.pgscatalog.org/) is a public database of PGSs and their weight files, and a great first stop for acquiring a PGS weight file.
- The functions of ApplyPolygenicScore have been designed to operate on weight files that have been formatted according to the standards established by The PGS Catalog. These are very well documented [here](https://www.pgscatalog.org/downloads/#:~:text=the%20different%20samples.-,PGS%20Scoring%20Files,-Formatted%20Files).
- You could easily create your own compatible PGS weight file, simply by formatting all required columns by Catalog standards.
- When in doubt, use our `check.pgs.weight.columns()` function to make sure any data table you import into R contains the reqiuired columns for downstream functions.
- When in doubt, use our `check.pgs.weight.columns()` function to make sure any data table you import into R contains the required columns for downstream functions.
- Required columns in text files prior to importation are: `chr_name`, `chr_position`, `effect_allele`, `effect_weight`
- An optional column that can be used in certain functions is `rsID`.
- The following PGS Catalog column names are converted upon importation to the column names used by VCF files, to facilitate PGS/VCF matching:

|input to `import.pgs.weight.file`| output of `import.pgs.weight.file`|
|---------------------------------|-----------------------------------|
|`rsID`| `ID`|
|`chr_name`|`CHROM`|
|`chr_position`|`POS`|
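
If you are building your own weight file rather than downloading one, a minimal table with the required columns (plus the optional rsID column) is enough; for example:

```r
# A minimal, made-up PGS weight table using the PGS Catalog column names
# listed above; values are illustrative only
toy.pgs <- data.frame(
    chr_name = c('1', '2'),
    chr_position = c(123456L, 234567L),
    effect_allele = c('A', 'T'),
    effect_weight = c(0.12, -0.08),
    rsID = c('rs111', 'rs222')   # optional, enables rsID-based matching
    );

write.table(toy.pgs, file = 'toy.pgs.weights.txt', sep = '\t', quote = FALSE, row.names = FALSE);
```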

### Recommended Workflow

Expand Down
Binary file modified inst/extdata/phenotype.test.data.Rda
Binary file not shown.
12 changes: 8 additions & 4 deletions man/apply.polygenic.score.Rd

5 changes: 3 additions & 2 deletions man/combine.vcf.with.pgs.Rd

Binary file not shown.
Binary file modified tests/testthat/data/merged.multiallelic.site.test.data.Rda
Binary file not shown.
Binary file modified tests/testthat/data/missing.genotype.test.data.Rda
Binary file not shown.
Binary file modified tests/testthat/data/phenotype.test.data.Rda
Binary file not shown.
Binary file modified tests/testthat/data/simple.pgs.application.test.data.Rda
Binary file not shown.
Binary file not shown.
