From 3785132ca45d49bac4db22594543b328b1bfdae9 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Mon, 14 Aug 2023 15:20:53 -0700
Subject: [PATCH 01/12] switched from click to argparse for cmdline arguments

---
 .../gnomad/variant_scoring/variant_scoring.py | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 pipeline/gnomad/variant_scoring/variant_scoring.py

diff --git a/pipeline/gnomad/variant_scoring/variant_scoring.py b/pipeline/gnomad/variant_scoring/variant_scoring.py
old mode 100644
new mode 100755
index 086b52370..3162598af
--- a/pipeline/gnomad/variant_scoring/variant_scoring.py
+++ b/pipeline/gnomad/variant_scoring/variant_scoring.py
@@ -4,7 +4,7 @@
 import itertools
 from pathlib import Path
 
-import click
+import argparse
 
 import numpy as np
 import pandas as pd
@@ -264,14 +264,21 @@ def _convert_to_name(converted):
     return df.drop(columns=[TMP_HGVS_HG38, TMP_VAR_OBJ_FIELD, TMP_GENE_SYMBOL])
 
 
-@click.command()
-@click.argument('data_dir', type=click.Path(readable=True))
-@click.argument('output_path', type=click.Path(writable=True))
-@click.option('--resource-dir', type=click.Path(readable=True), help="resource dir for lift over (same as for the main pipeline)")
-@click.option('--gene-config-path', type=click.Path(readable=True))
-def main(data_dir, output_path, resource_dir, gene_config_path):
-    data_dir = Path(data_dir)
-    cfg_df = config.load_config(gene_config_path)
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_path", help="input directory")
+    parser.add_argument("-o", "--output_path", help="output directory")
+    parser.add_argument("-g", "--gene_config_path", help="genes for analysis")
+    parser.add_argument("-r", "--resource_dir", help="resource dir for lift over (same as for the main pipeline)")
+    args = parser.parse_args()
+    return(args)
+
+
+def main():
+    args = parse_args()
+    data_dir = Path(args.input_path)
+    cfg_df = config.load_config(args.gene_config_path)
 
     df_cov2 = pd.read_parquet(data_dir / 'df_cov_v2.parquet')
     df_cov3 = pd.read_parquet(data_dir / 'df_cov_v3.parquet')
@@ -280,13 +287,13 @@ def main(data_dir, output_path, resource_dir, gene_config_path):
 
     read_depth_thresh = 30
 
-    df = extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, Path(resource_dir))
+    df = extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, Path(args.resource_dir))
 
     # add var_name columns with different normalization to join data with other sources (e.g. brca exchange output data)
     strand_dict = { int(r['chr']) : r[config.STRAND_COL] for _, r in cfg_df.iterrows() }
     df = add_normalization_cols(df, strand_dict)
 
-    df.to_parquet(output_path)
+    df.to_parquet(args.output_path)
 
 
 if __name__ == "__main__":

From 4869476158b5e6e8b6db3ce782f7bd625f5ef095 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 15 Aug 2023 12:53:06 -0700
Subject: [PATCH 02/12] Changing the seqrepo dir to use the absolute pathname
 for the latest seqrepo dir rather than the 'latest' symlink; it's not clear
 why the symlink isn't working...
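
One possible explanation: /data/seqrepo is bind-mounted into the container at
/mnt/seq_repo, so if 'latest' is an absolute symlink (e.g. pointing to
/data/seqrepo/2021-01-29), its target path does not exist inside the
container. A minimal sketch of how the dated directory could be resolved on
the host before constructing the docker -v mount, assuming the /data/seqrepo
layout used by run_pipeline.sh:

    from pathlib import Path

    # Resolve the 'latest' symlink on the host to its dated target,
    # e.g. /data/seqrepo/latest -> /data/seqrepo/2021-01-29, and pass
    # the resolved absolute path into the container environment instead.
    seq_repo_dir = Path("/data/seqrepo/latest").resolve()
    print(seq_repo_dir)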
--- pipeline/gnomad/variant_scoring/run_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/gnomad/variant_scoring/run_pipeline.sh b/pipeline/gnomad/variant_scoring/run_pipeline.sh index a1247f4ed..eac58d788 100755 --- a/pipeline/gnomad/variant_scoring/run_pipeline.sh +++ b/pipeline/gnomad/variant_scoring/run_pipeline.sh @@ -20,7 +20,7 @@ UTA_PORT=50828 UTA_RELEASE_DATE=20171026 SEQ_REPO_DIR_DOCKER=/mnt/seq_repo -COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/latest" +COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/2021-01-29" # run process_gnomad_data.py docker run ${COMMON_DOCKER_OPT} variant_scoring bash -c "python /opt/variant_scoring/gnomad/variant_scoring/process_gnomad_data.py --cores 16 --mem-per-core-mb 2400 --gene-config-path /opt/variant_scoring/workflow/gene_config_brca_only.txt /mnt/original_gnomad_data /mnt/processed_brca" From 4a46d80e7444e32af2cc9b4901412b7d4d579cfb Mon Sep 17 00:00:00 2001 From: melissacline Date: Fri, 10 May 2024 11:55:04 -0700 Subject: [PATCH 03/12] Adding the variant assessment to the pipeline workflow --- pipeline/workflow/CompileVCFFiles.py | 4 +-- pipeline/workflow/pipeline_common.py | 1 + pipeline/workflow/variant_scoring.py | 48 ++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 pipeline/workflow/variant_scoring.py diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index 4a761dd12..f3e107bf4 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -13,7 +13,7 @@ luigi.auto_namespace(scope=__name__) -from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils +from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, variant_scoring from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \ functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \ utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional @@ -754,7 +754,7 @@ def run(self): self.output().path) -@requires(bayesdel_processing.AddBayesdelScores) +@requires(variant_scoring.AddVariantScores) class FindMissingReports(DefaultPipelineTask): def output(self): return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log")) diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index 47f24c4c1..d0d0a5515 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,6 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') +analysis_method_dir = os.path.abspath('../analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 
'field_metadata_additional_fields_variant_output_file.tsv')) diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/variant_scoring.py new file mode 100644 index 000000000..662d2a89c --- /dev/null +++ b/pipeline/workflow/variant_scoring.py @@ -0,0 +1,48 @@ +import os +import subprocess +from pathlib import Path + +import luigi +from luigi.util import requires + +import workflow.pipeline_utils as pipeline_utils +from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir + + +@requires(bayesdel_processing.AddBayesdelScores) +class DownloadResources(DefaultPipelineTask): + def output(self): + wdir = Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'variant_scoring_wdir' + return{'wdir': luigi.LocalTarget(wdir), + 'covV2': "df_cov_v2.parquet", + 'covV3': "df_cov_v2.parquet", + 'flagsV2': "brca.gnomAD.2.1.1.hg19.flags.tsv", + 'flagsV3': "brca.gnomAD.3.1.1.hg38.flags.tsv" + } + + def run(self): + wdir = self.output()['wdir'].path + if not os.path.exists(wdir): + os.mkdir(wdir) + os.chdir(wdir) + variant_scoring_data_url = "https://brcaexchange.org/backend/downloads/variant_scoring/*" + pipeline_utils.download_file_and_display_progress(brca1_data_url) + + +@requires(DownloadResources) +class runPopfreqAssessment(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(os.path.join(self.artifacts_dir, 'built_with_popfreq.tsv')) + + def run(self): + os.chdir(analysis_method_dir) + args = ["python", "popfreq_assessment.py", + "--input", "bayesdel_processing.AddBayesdelScores.output()", + "--data_dir", DownloadResources.output()["wdir"], + "--output", self.output().path] + pipeline_utils.run_process(args) + pipeline_utils.check_file_for_contents(self.output().path) + + + + From edeaddeeebebf4dc5a4772ed2efe4ec8ace05fca Mon Sep 17 00:00:00 2001 From: melissacline Date: Fri, 10 May 2024 14:19:53 -0700 Subject: [PATCH 04/12] Added popfreq_assessment --- pipeline/analysis/popfreq_assessment.py | 375 ++++++++++++++++++++++++ pipeline/workflow/variant_scoring.py | 1 + 2 files changed, 376 insertions(+) create mode 100755 pipeline/analysis/popfreq_assessment.py diff --git a/pipeline/analysis/popfreq_assessment.py b/pipeline/analysis/popfreq_assessment.py new file mode 100755 index 000000000..0f6381074 --- /dev/null +++ b/pipeline/analysis/popfreq_assessment.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python +# coding: utf-8 + +import itertools +import argparse +import csv +from collections import OrderedDict +import math +import numpy as np +import pandas as pd + +#import gnomad.variant_scoring.constants as cnts +#from common import config, hgvs_utils, variant_utils, utils +#from data_merging.brca_pseudonym_generator import _normalize_genomic_fnc + +POPFREQ_CODE_ID = "Provisional_Evidence_Code_Popfreq" +POPFREQ_CODE_DESCR = "Provisional_Evidence_Description_Popfreq" + + +BA1 = "BA1 (met)" +BS1 = "BS1 (met)" +BS1_SUPPORTING = "BS1_Supporting (met)" +NO_CODE = "No code met (below threshold)" +NO_CODE_NON_SNV = "No code met for population data (indel)" +PM2_SUPPORTING = "PM2_Supporting (met)" +FAIL_NOT_ASSAYED = "Fail_Not_Assayed" +FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG = "No code met (read depth, flags)" +FAIL_NEEDS_REVIEW = "No code met (needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW = "No code met (needs software review)" + +READ_DEPTH_THRESHOLD_FREQUENT_VARIANT = 20 +READ_DEPTH_THRESHOLD_RARE_VARIANT = 25 + +BA1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, 
read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.001) for BA1 (BA1 met)."
+BS1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.0001) for BS1, and below the BA1 threshold (>0.001) (BS1 met)."
+BS1_SUPPORTING_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is within the ENIGMA BRCA1/2 VCEP threshold (>0.00002 to ≤ 0.0001) for BS1_Supporting (BS1_Supporting met)."
+PM2_SUPPORTING_MSG = "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)."
+NO_CODE_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but is below the ENIGMA BRCA1/2 VCEP threshold >0.00002 for BS1_Supporting (PM2_Supporting, BS1, and BA1 are not met)."
+NO_CODE_NON_SNV_MSG = "This [insertion/deletion/large genomic rearrangement] variant was not observed in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset), but PM2_Supporting was not applied since recall is suboptimal for this type of variant (PM2_Supporting not met)."
+FAIL_NOT_ASSAYED_MSG = "Variant not tested in this dataset"
+FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but does not meet the specified read depth threshold ≥20 OR was flagged as suspect by gnomAD (PM2_Supporting, BS1, and BA1 are not met)."
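+# (Thresholds quoted in the messages above, per the ENIGMA BRCA1/2 VCEP rules
+# as stated there: BA1 > 0.001, BS1 > 0.0001, BS1_Supporting > 0.00002, all at
+# read depth >= 20; PM2_Supporting requires absence from both datasets at read
+# depth >= 25.)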
+FAIL_NEEDS_REVIEW_MSG = "No code is met (variant needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW_MSG = "No code is met (variant needs software review)" + +MESSAGES_PER_CODE = { + BA1: BA1_MSG, + BS1: BS1_MSG, + BS1_SUPPORTING: BS1_SUPPORTING_MSG, + NO_CODE: NO_CODE_MSG, + NO_CODE_NON_SNV: NO_CODE_NON_SNV_MSG, + PM2_SUPPORTING: PM2_SUPPORTING_MSG, + FAIL_NOT_ASSAYED: FAIL_NOT_ASSAYED_MSG, + FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG: FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG, + FAIL_NEEDS_REVIEW: FAIL_NEEDS_REVIEW_MSG, + FAIL_NEEDS_SOFTWARE_REVIEW: FAIL_NEEDS_SOFTWARE_REVIEW_MSG + } + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", default="built_final.tsv", + help="Input file with variant data") + parser.add_argument("-o", "--output", default="built_with_popfreq.tsv", + help="Output file with new columns") + parser.add_argument("-d", "--data_dir", default="./processed_brca", + help="Directory with the processed files") + args = parser.parse_args() + return(args) + + +def read_flags(flag_data): + flags = {} + for row in flag_data: + flags[row["ID"]] = row + return(flags) + + +def estimate_coverage(start, end, chrom, df_cov): + positions = list(range(start, end+1)) + coverage_this_chrom = df_cov.loc[df_cov["chrom"] == int(chrom)] + positions_this_variant = coverage_this_chrom[coverage_this_chrom["pos"].isin(positions)] + meanval = positions_this_variant["mean"].mean() + medianval = positions_this_variant["median"].median() + return(meanval, medianval) + + + + +def initialize_output_file(input_file, output_filename): + """ + Create an empty output file with the new columns + """ + new_columns = [POPFREQ_CODE_ID, POPFREQ_CODE_DESCR] + input_header_row = input_file.fieldnames + if input_header_row.index("change_type"): + idx = input_header_row.index("change_type") + output_header_row = input_header_row[:idx] + new_columns \ + + input_header_row[idx:] + else: + output_header_row = input_header_row + new_columns + output_file = csv.DictWriter(open(output_filename,"w"), + fieldnames=output_header_row, + delimiter = '\t') + output_file.writeheader() + return(output_file) + + +def field_defined(field): + """ + Return a binary value indicating whether or not this variant has the popmax FAF defined + """ + return(field != "-") + + +def is_variant_observable(start, end, chrom, coverage): + # + # Read the coverage at the variant position. If the coverage metrics returned + # are not a number, that means that the variant could not be tested in the + # indicated dataset. + (mean_read_depth, median_read_depth) = estimate_coverage(int(start),int(end), + int(chrom),coverage) + if pd.isna(mean_read_depth) and pd.isna(median_read_depth): + return(False) + else: + return(True) + + +def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv, + read_depth, vcf_filter_flag, debug=True): + # + # Get the coverage data. Rule out error conditions: low coverage, VCF filter flag. 
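+    # (Overview of the logic below: the variant is first classified as rare
+    # vs. frequent from its faf95 popmax value; rare variants must meet the
+    # stricter read-depth threshold (25), frequent variants the looser one
+    # (20), and flagged variants are rejected, before any frequency-based
+    # evidence code is assigned.)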
+    rare_variant = False
+    if field_defined(faf95_popmax_str):
+        faf = float(faf95_popmax_str)
+        if pd.isna(faf):
+            rare_variant = True
+        elif faf <= 0.00002:
+            rare_variant = True
+    else:
+        faf = None
+        rare_variant = True
+    if rare_variant and read_depth < READ_DEPTH_THRESHOLD_RARE_VARIANT:
+        return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG)
+    if (not rare_variant) and read_depth < READ_DEPTH_THRESHOLD_FREQUENT_VARIANT:
+        return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG)
+    if vcf_filter_flag:
+        return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG)
+    #
+    # Address the cases where FAF is defined, and the variant is a candidate for
+    # an evidence code for high population frequency (BA1, BS1, BS1_SUPPORTING)
+    if not rare_variant:
+        if faf > 0.001:
+            return(BA1)
+        elif faf > 0.0001:
+            return(BS1)
+        elif faf > 0.00002:
+            return(BS1_SUPPORTING)
+    if rare_variant:
+        if not field_defined(allele_count):
+            return(NO_CODE)
+        elif int(allele_count) > 0:
+            return(NO_CODE)
+        else:
+            if is_snv:
+                return(PM2_SUPPORTING)
+            else:
+                return(NO_CODE_NON_SNV)
+    return(FAIL_NEEDS_REVIEW)
+
+
+def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2,
+                            code_v3, faf_v3, faf_popmax_v3, in_v3,
+                            debug=True):
+    """
+    Given the per-dataset evidence codes, generate an overall evidence code
+    """
+    benign_codes = [BA1, BS1, BS1_SUPPORTING]
+    pathogenic_codes = [PM2_SUPPORTING]
+    intermediate_codes = [ NO_CODE, NO_CODE_NON_SNV]
+    ordered_success_codes = benign_codes + intermediate_codes + pathogenic_codes
+    success_codes = set(ordered_success_codes)
+    failure_codes = set([FAIL_NOT_ASSAYED, FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG])
+    ordered_codes = ordered_success_codes + list(failure_codes)
+
+    #
+    # First, rule out the case of outright contradictions
+    if code_v2 in benign_codes and code_v3 in pathogenic_codes \
+       or code_v2 in pathogenic_codes and code_v3 in benign_codes:
+        return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_REVIEW_MSG)
+    #
+    # Next, rule out the case where neither dataset has reliable data.
+    # Arbitrarily use the message for v3, as the newer and presumably more robust.
+    if code_v2 in failure_codes and code_v3 in failure_codes:
+        return(code_v3, MESSAGES_PER_CODE[code_v3])
+
+    #
+    # At this point, we can assume that at least one dataset has
+    # reliable data.
+    #
+    # Next, if both datasets point to a pathogenic effect, or
+    # if one points to a pathogenic effect and the other an error,
+    # then return the pathogenic effect.
+    if code_v2 in pathogenic_codes and code_v3 in pathogenic_codes:
+        return(code_v2, MESSAGES_PER_CODE[code_v2])
+    elif code_v2 in pathogenic_codes and code_v3 in failure_codes:
+        return(code_v2, MESSAGES_PER_CODE[code_v2])
+    elif code_v3 in pathogenic_codes and code_v2 in failure_codes:
+        return(code_v3, MESSAGES_PER_CODE[code_v3])
+
+    #
+    # Next, if either dataset has an intermediate effect and the other
+    # is not stronger (i.e. is also intermediate, or pathogenic, or failure),
+    # return the intermediate effect code.
+    if code_v2 in intermediate_codes and ordered_codes.index(code_v2) <= ordered_codes.index(code_v3):
+        return(code_v2, MESSAGES_PER_CODE[code_v2])
+    elif code_v3 in intermediate_codes and ordered_codes.index(code_v3) <= ordered_codes.index(code_v2):
+        return(code_v3, MESSAGES_PER_CODE[code_v3])
+
+    #
+    # Now, at least one dataset must have a success code.  We can also assume that
+    # neither is pathogenic (i.e. both are benign, intermediate or failure).
+    # In this case, identify and return the stronger code.
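+    # (In the ladder below, gnomAD v3 is tested before gnomAD v2 at each
+    # strength level, so when both datasets meet the same code the returned
+    # message reports the v3 frequency and population.)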
+ if debug: + print("prior to assertions, codes are", code_v2, code_v3) + assert(code_v2 in benign_codes or code_v2 in intermediate_codes or code_v3 in benign_codes or code_v3 in intermediate_codes) + if code_v3 == BA1: + return(BA1, BA1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BA1: + return(BA1, BA1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1: + return(BS1, BS1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1: + return(BS1, BS1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v2, faf_popmax_v2)) + elif code_v2 == NO_CODE: + return(code_v2, NO_CODE_MSG) + elif code_v3 == NO_CODE: + return(code_v3, NO_CODE_MSG) + # + # If we get here, there is a hole in the logic + return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_SOFTWARE_REVIEW_MSG) + + +def variant_is_flagged(variant_id, flags): + assert(variant_id in flags) + variant_flagged = False + if flags[variant_id]["Filters"] != "PASS": + variant_flagged = True + return(variant_flagged) + + +def analyze_variant(variant, coverage_v2, coverage_v3, flags_v2, flags_v3, + debug=True): + """ + Analyze a single variant, adding the output columns + """ + # Initialize the output columns. First, check the read depth. If the + # read depth is defined at the variant position, this means that the + # variant could have been observed in principle; set the default to + # PM2_SUPPORTING (indicating that by default, the variant could have been + # observed, but wasn't). If no read depth is defined, this means that + # the variant could not have been observed in the dataset in question + # (for example, a deep intronic variant in an exome dataset). + # In such a case, set the default value to NOT_ASSAYED. + variant_v2_code_id = FAIL_NOT_ASSAYED + variant_v3_code_id = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_ID] = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_DESCR] = FAIL_NOT_ASSAYED_MSG + observable_in_v2 = False + observable_in_v3 = False + if is_variant_observable(int(variant["pyhgvs_Hg37_Start"]),int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2): + variant_v2_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v2 = True + if is_variant_observable(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), coverage_v3): + variant_v3_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v3 = True + is_snv = (variant["Hg38_Start"] == variant["Hg38_End"]) + if debug: + print("variant is snv:", is_snv) + # + # the gnomAD v2 variant ID is set when the variant is in the genome + # OR exome portion of gnomAD. Focus on variants that are in the exome + # data by making sure that the allele count is defined. 
The allele + # count is the total number of observations of the variant in the gnomAD + # dataset + variant_in_v2 = False + if (field_defined(variant["Variant_id_GnomAD"]) + and field_defined(variant["Allele_count_exome_GnomAD"]) + and observable_in_v2): + variant_in_v2 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["pyhgvs_Hg37_Start"]), + int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2) + read_depth_v2 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v2): + read_depth_v2 = 0 + variant_v2_code_id = analyze_one_dataset(variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant["Allele_count_exome_GnomAD"], + is_snv, read_depth_v2, + variant_is_flagged(variant["Variant_id_GnomAD"], + flags_v2)) + if debug: + print("gnomAD V2 variant", variant["Variant_id_GnomAD"], + "popmax", variant["faf95_popmax_exome_GnomAD"], + "allele count", variant["Allele_count_exome_GnomAD"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V2 code", variant_v2_code_id) + variant_in_v3 = False + if (field_defined(variant["Variant_id_GnomADv3"]) and observable_in_v3): + variant_in_v3 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), + coverage_v3) + read_depth_v3 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v3): + read_depth_v3 = 0 + variant_v3_code_id = analyze_one_dataset(variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant["Allele_count_genome_GnomADv3"], + is_snv, read_depth_v3, + variant_is_flagged(variant["Variant_id_GnomADv3"], + flags_v3)) + if debug: + print("gnomAD V3 variant", variant["Variant_id_GnomADv3"], + "popmax", variant["faf95_popmax_genome_GnomADv3"], + "allele count", variant["Allele_count_genome_GnomADv3"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V3 code", variant_v3_code_id) + (variant[POPFREQ_CODE_ID], + variant[POPFREQ_CODE_DESCR]) = analyze_across_datasets(variant_v2_code_id,variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant_in_v2, variant_v3_code_id, + variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant_in_v3) + if debug: + print("consensus code:", variant[POPFREQ_CODE_ID], "msg", + variant[POPFREQ_CODE_DESCR]) + return() + + + +def main(): + args = parse_args() + df_cov2 = pd.read_parquet(args.data_dir + '/df_cov_v2.parquet') + df_cov3 = pd.read_parquet(args.data_dir + '/df_cov_v3.parquet') + flags_v2 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.2.1.1.hg19.flags.tsv"), + delimiter = "\t")) + flags_v3 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.3.1.1.hg38.flags.tsv"), + delimiter = "\t")) + input_file = csv.DictReader(open(args.input), delimiter = "\t") + output_file = initialize_output_file(input_file, args.output) + for variant in input_file: + analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3) + output_file.writerow(variant) + + +if __name__ == "__main__": + main() + diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/variant_scoring.py index 662d2a89c..212f0c780 100644 --- a/pipeline/workflow/variant_scoring.py +++ b/pipeline/workflow/variant_scoring.py @@ -9,6 +9,7 @@ from workflow.pipeline_common import DefaultPipelineTask, 
data_merging_method_dir, analysis_method_dir
 
 
+
 @requires(bayesdel_processing.AddBayesdelScores)
 class DownloadResources(DefaultPipelineTask):
     def output(self):

From 8b0a6837a714eeaf0e8b93698b5bdab1f6d98a89 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Mon, 13 May 2024 18:27:00 -0700
Subject: [PATCH 05/12] Renamed variant_scoring to analysis

---
 pipeline/workflow/CompileVCFFiles.py                  | 4 ++--
 pipeline/workflow/{variant_scoring.py => analysis.py} | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename pipeline/workflow/{variant_scoring.py => analysis.py} (97%)

diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py
index 1d6b8b224..e31a5c821 100644
--- a/pipeline/workflow/CompileVCFFiles.py
+++ b/pipeline/workflow/CompileVCFFiles.py
@@ -13,7 +13,7 @@
 luigi.auto_namespace(scope=__name__)
 
-from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, variant_scoring
+from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, analysis
 from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \
     functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \
     utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional
@@ -773,7 +773,7 @@ def run(self):
             self.output().path)
 
 
-@requires(variant_scoring.AddVariantScores)
+@requires(analysis.runPopfreqAssessment)
 class FindMissingReports(DefaultPipelineTask):
     def output(self):
         return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log"))
diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/analysis.py
similarity index 97%
rename from pipeline/workflow/variant_scoring.py
rename to pipeline/workflow/analysis.py
index 212f0c780..c566ae64a 100644
--- a/pipeline/workflow/variant_scoring.py
+++ b/pipeline/workflow/analysis.py
@@ -7,7 +7,7 @@
 import workflow.pipeline_utils as pipeline_utils
 from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir
 
-
+from workflow import bayesdel_processing
 
 @requires(bayesdel_processing.AddBayesdelScores)

From 35eec874aea40fcd32588404d6f57880cfcd560a Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 14 May 2024 18:54:10 -0700
Subject: [PATCH 06/12] Moved needle on automation of latest pipeline task

---
 pipeline/workflow/analysis.py        | 35 ++++++++++++++--------------
 pipeline/workflow/pipeline_common.py |  2 +-
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/pipeline/workflow/analysis.py b/pipeline/workflow/analysis.py
index c566ae64a..ce33f27bc 100644
--- a/pipeline/workflow/analysis.py
+++ b/pipeline/workflow/analysis.py
@@ -10,36 +10,37 @@ from workflow import bayesdel_processing
 
 
-@requires(bayesdel_processing.AddBayesdelScores)
+#@requires(bayesdel_processing.AddBayesdelScores)
 class DownloadResources(DefaultPipelineTask):
     def output(self):
-        wdir = Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'variant_scoring_wdir'
-        return{'wdir': luigi.LocalTarget(wdir),
-               'covV2': "df_cov_v2.parquet",
-               'covV3': "df_cov_v2.parquet",
-               'flagsV2': "brca.gnomAD.2.1.1.hg19.flags.tsv",
-               'flagsV3': "brca.gnomAD.3.1.1.hg38.flags.tsv"
-               }
+        return(luigi.LocalTarget(Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'analysis'))
+
 
     def run(self):
-        wdir = self.output()['wdir'].path
-        if not os.path.exists(wdir):
-            os.mkdir(wdir)
-        os.chdir(wdir)
-        variant_scoring_data_url = 
"https://brcaexchange.org/backend/downloads/variant_scoring/*" - pipeline_utils.download_file_and_display_progress(brca1_data_url) + download_url = "https://brcaexchange.org/backend/downloads/variant_scoring/" + self.inputfiles = [ "df_cov_v2.parquet", "df_cov_v3.parquet", + "brca.gnomAD.2.1.1.hg19.flags.tsv", + "brca.gnomAD.3.1.1.hg38.flags.tsv" + ] + self.wdir = self.output().path + if not os.path.exists(self.wdir): + os.mkdir(self.wdir) + os.chdir(self.wdir) + for this_file in self.inputfiles: + pathname = download_url + "/" + this_file + pipeline_utils.download_file_and_display_progress(pathname) @requires(DownloadResources) class runPopfreqAssessment(DefaultPipelineTask): def output(self): - return luigi.LocalTarget(os.path.join(self.artifacts_dir, 'built_with_popfreq.tsv')) + return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_popfreq.tsv') def run(self): os.chdir(analysis_method_dir) args = ["python", "popfreq_assessment.py", - "--input", "bayesdel_processing.AddBayesdelScores.output()", - "--data_dir", DownloadResources.output()["wdir"], + "--input", bayesdel_processing.AddBayesdelScores().output().path, + "--data_dir", DownloadResources().output().path, "--output", self.output().path] pipeline_utils.run_process(args) pipeline_utils.check_file_for_contents(self.output().path) diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index d0d0a5515..d70957248 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,7 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') -analysis_method_dir = os.path.abspath('../analysis') +analysis_method_dir = os.path.abspath('../pipeline/analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 'field_metadata_additional_fields_variant_output_file.tsv')) From d0794a3ffe49212a2ec4245aa7a2921a8f4a18b3 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 13:32:41 -0700 Subject: [PATCH 07/12] First cut of popfreq_assessment --- pipeline/analysis/popfreq_assessment.py | 14 +++++++------- pipeline/requirements.txt | 1 + pipeline/workflow/pipeline_common.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pipeline/analysis/popfreq_assessment.py b/pipeline/analysis/popfreq_assessment.py index 0f6381074..d020c4e29 100755 --- a/pipeline/analysis/popfreq_assessment.py +++ b/pipeline/analysis/popfreq_assessment.py @@ -9,9 +9,6 @@ import numpy as np import pandas as pd -#import gnomad.variant_scoring.constants as cnts -#from common import config, hgvs_utils, variant_utils, utils -#from data_merging.brca_pseudonym_generator import _normalize_genomic_fnc POPFREQ_CODE_ID = "Provisional_Evidence_Code_Popfreq" POPFREQ_CODE_DESCR = "Provisional_Evidence_Description_Popfreq" @@ -63,6 +60,8 @@ def parse_args(): help="Output file with new columns") parser.add_argument("-d", "--data_dir", default="./processed_brca", help="Directory with the processed files") + parser.add_argument("--debug", action="store_true", default=False, + help="Print debugging info") args = parser.parse_args() return(args) @@ -91,7 +90,7 @@ def initialize_output_file(input_file, output_filename): """ new_columns = [POPFREQ_CODE_ID, POPFREQ_CODE_DESCR] input_header_row = input_file.fieldnames - if input_header_row.index("change_type"): + if "change_type" 
in input_header_row:
         idx = input_header_row.index("change_type")
         output_header_row = input_header_row[:idx] + new_columns \
             + input_header_row[idx:]
@@ -169,7 +168,7 @@ def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv
 
 def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2,
                             code_v3, faf_v3, faf_popmax_v3, in_v3,
-                            debug=True):
+                            debug=False):
     """
     Given the per-dataset evidence codes, generate an overall evidence code
     """
@@ -252,7 +251,7 @@ def variant_is_flagged(variant_id, flags):
 
 
 def analyze_variant(variant, coverage_v2, coverage_v3, flags_v2, flags_v3,
-                    debug=True):
+                    debug=False):
     """
     Analyze a single variant, adding the output columns
     """
@@ -366,7 +365,8 @@ def main():
     input_file = csv.DictReader(open(args.input), delimiter = "\t")
     output_file = initialize_output_file(input_file, args.output)
     for variant in input_file:
-        analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3)
+        analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3,
+                        debug=args.debug)
         output_file.writerow(variant)
 
 
diff --git a/pipeline/requirements.txt b/pipeline/requirements.txt
index 8828be0e9..7b92a0f34 100644
--- a/pipeline/requirements.txt
+++ b/pipeline/requirements.txt
@@ -22,6 +22,7 @@ decorator==4.4.2
 distro==1.4.0
 docutils==0.16
 fake-useragent==0.1.11
+fastparquet==2024.5.0
 ga4gh.common==0.0.7
 ga4gh.vrs==0.8.4
 hgvs==1.5.4
diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py
index d70957248..d0d0a5515 100644
--- a/pipeline/workflow/pipeline_common.py
+++ b/pipeline/workflow/pipeline_common.py
@@ -24,7 +24,7 @@
 utilities_method_dir = os.path.abspath('../utilities')
 vr_method_dir = os.path.abspath('../vr')
 splice_ai_method_dir = os.path.abspath('../splice_ai')
-analysis_method_dir = os.path.abspath('../pipeline/analysis')
+analysis_method_dir = os.path.abspath('../analysis')
 field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv'))
 field_metadata_path_additional = os.path.abspath(os.path.join('..',
                                                               'field_metadata_additional_fields_variant_output_file.tsv'))

From 4dc22b3cad25f4e10a38b2552d291162bc856856 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 4 Jun 2024 16:12:07 -0700
Subject: [PATCH 08/12] Added a stub for bioinformatic prediction

---
 pipeline/analysis/add_bioinfo_pred.py | 204 ++++++++++++++++++++++++
 pipeline/workflow/CompileVCFFiles.py  |   4 +-
 pipeline/workflow/analysis.py         |  15 ++-
 3 files changed, 220 insertions(+), 3 deletions(-)
 create mode 100755 pipeline/analysis/add_bioinfo_pred.py

diff --git a/pipeline/analysis/add_bioinfo_pred.py b/pipeline/analysis/add_bioinfo_pred.py
new file mode 100755
index 000000000..a6922dcb1
--- /dev/null
+++ b/pipeline/analysis/add_bioinfo_pred.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import re
+import sys
+
+BIOINFO_CODE_ID = "Provisional_Evidence_Code_Bioinfo"
+BIOINFO_CODE_DESCR = "Provisional_Evidence_Description_Bioinfo"
+
+
+NO_CODE = "NO_CODE"
+PP3 = "PP3"
+BP4_BP7 = "BP4,BP7"
+BP4 = "BP4"
+BP1_STRONG = "BP1_STRONG"
+PVS1_CODE = "PVS1"
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", default="built_final.tsv",
+                        help="built_final")
+    parser.add_argument("-o", "--output", default="built_with_bioinfo.tsv",
+                        help="version of input file with new columns added")
+    parser.add_argument("-d", "--debug", action="store_true", default=False,
+                        help="Print debugging info")
+    args = parser.parse_args()
+    return(args)
+
+
+def initialize_output_file(input_file, 
output_filename):
+    """
+    Create an empty output file with the new columns
+    """
+    new_columns = [BIOINFO_CODE_ID, BIOINFO_CODE_DESCR]
+    input_header_row = input_file.fieldnames
+    if "change_type" in input_header_row:
+        idx = input_header_row.index("change_type")
+        output_header_row = input_header_row[:idx] + new_columns \
+            + input_header_row[idx:]
+    else:
+        output_header_row = input_header_row + new_columns
+    output_file = csv.DictWriter(open(output_filename,"w"),
+                                 fieldnames=output_header_row,
+                                 delimiter = '\t')
+    output_file.writeheader()
+    return(output_file)
+
+
+def extract_protein_coordinate(variant):
+    coordinate = None
+    hit = re.search("[0-9]+", variant["Protein_Change"])
+    if hit:
+        token = variant["Protein_Change"][hit.start():hit.end()]
+        pos = int(token)
+        print("from", variant["Protein_Change"], "derived", pos)
+        return(pos)
+
+def inside_functional_domain(variant):
+    inside_domain = False
+    pos = extract_protein_coordinate(variant)
+    if pos:
+        if variant["Gene_Symbol"] == "BRCA1":
+            if pos >= 2 and pos <= 99:
+                inside_domain = True
+            elif pos >= 503 and pos <= 508:
+                inside_domain = True
+            elif pos >= 607 and pos <= 614:
+                inside_domain = True
+            elif pos >= 651 and pos <= 656:
+                inside_domain = True
+            elif pos >= 1391 and pos <= 1424:
+                inside_domain = True
+            elif pos >= 1650 and pos <= 1863:
+                inside_domain = True
+        elif variant["Gene_Symbol"] == "BRCA2":
+            if pos >= 10 and pos <= 40:
+                inside_domain = True
+            elif pos >= 1002 and pos <= 1036:
+                inside_domain = True
+            elif pos >= 1212 and pos <= 1246:
+                inside_domain = True
+            elif pos >= 1422 and pos <= 1453:
+                inside_domain = True
+            elif pos >= 1518 and pos <= 1549:
+                inside_domain = True
+            elif pos >= 1665 and pos <= 1696:
+                inside_domain = True
+            elif pos >= 1837 and pos <= 1871:
+                inside_domain = True
+            elif pos >= 1971 and pos <= 2005:
+                inside_domain = True
+            elif pos >= 2051 and pos <= 2085:
+                inside_domain = True
+            elif pos >= 2481 and pos <= 3186:
+                inside_domain = True
+            elif pos >= 3263 and pos <= 3269:
+                inside_domain = True
+            elif pos >= 3265 and pos <= 3330:
+                inside_domain = True
+            elif pos >= 3381 and pos <= 3385:
+                inside_domain = True
+    return(inside_domain)
+
+
+
+def estimate_bioinfo_code(variant):
+    effect = "unknown"
+    bioinfo_code = NO_CODE
+    if re.search("=\)$", variant["pyhgvs_Protein"]):
+        effect = "synonymous_variant"
+    elif re.search("[A-Z]+[0-9]+[A-Z]+", variant["Protein_Change"]):
+        effect = "missense_variant"
+    elif re.search("c\.[0-9]+[+]", variant["pyhgvs_cDNA"]):
+        effect = "intron_variant"
+    elif re.search("c\.[0-9]+[-]", variant["pyhgvs_cDNA"]):
+        effect = "intron_variant"
+    print("variant", variant["pyhgvs_cDNA"], "protein change", variant["Protein_Change"], variant["pyhgvs_Protein"], "effect", effect)
+    if variant["result_spliceai"] == "-":
+        splicing_effect = False
+        no_splicing_effect = True
+    else:
+        splicing_effect = (float(variant["result_spliceai"]) > 0.2)
+        no_splicing_effect = (float(variant["result_spliceai"]) < 0.1)
+    if variant["Gene_Symbol"] == "BRCA1":
+        if variant["BayesDel_nsfp33a_noAF"] == "-":
+            protein_effect = False
+            no_protein_effect = True
+        elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.28:
+            protein_effect = True
+            no_protein_effect = False
+        elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.15:
+            protein_effect = False
+            no_protein_effect = True
+        else:
+            protein_effect = False
+            no_protein_effect = False
+    if variant["Gene_Symbol"] == "BRCA2":
+        if variant["BayesDel_nsfp33a_noAF"] == "-":
+            protein_effect = False
+            no_protein_effect = True
+        elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.30:
+            protein_effect = True
+            no_protein_effect = False
+        elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.18:
+            protein_effect = False
+            no_protein_effect = True
+        else:
+            protein_effect = False
+            no_protein_effect = False
+    inside_domain = inside_functional_domain(variant)
+    print("effect", effect, "splicing effect", splicing_effect, "inside domain", inside_domain)
+    if effect == "synonymous_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        elif inside_domain:
+            bioinfo_code = BP4_BP7
+        else:
+            bioinfo_code = BP1_STRONG
+    elif effect == "intron_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        else:
+            bioinfo_code = BP4
+    elif effect == "missense_variant":
+        if splicing_effect:
+            bioinfo_code = PP3
+        elif no_splicing_effect:
+            if not inside_domain:
+                bioinfo_code = BP1_STRONG
+            elif protein_effect:
+                bioinfo_code = PP3
+            elif no_protein_effect:
+                bioinfo_code = BP4
+        else:
+            if inside_domain and protein_effect:
+                bioinfo_code = PP3
+    return(bioinfo_code)
+
+
+def apply_pvs1_code(variant):
+    pvs1_code = NO_CODE
+    protein_hgvs = variant["pyhgvs_Protein"]
+    stop_added = re.search("Ter", protein_hgvs)
+    if stop_added:
+        pvs1_code = PVS1_CODE
+    return(pvs1_code)
+
+
+def main():
+    csv.field_size_limit(sys.maxsize)
+    args = parse_args()
+    with open(args.input, 'r') as input_fp:
+        input_reader = csv.DictReader(input_fp, delimiter = "\t")
+        writer = initialize_output_file(input_reader, args.output)
+        for variant in input_reader:
+            #variant[BIOINFO_CODE_ID] = estimate_bioinfo_code(variant, debug=args.debug)
+            #pvs1_code = apply_pvs1_code(variant)
+            variant[BIOINFO_CODE_ID] = ""
+            variant[BIOINFO_CODE_DESCR] = ""
+            writer.writerow(variant)
+
+if __name__ == "__main__":
+    main()
diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py
index e31a5c821..48d2ed3d7 100644
--- a/pipeline/workflow/CompileVCFFiles.py
+++ b/pipeline/workflow/CompileVCFFiles.py
@@ -13,7 +13,7 @@
 luigi.auto_namespace(scope=__name__)
 
-from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, analysis
+from workflow import analysis, esp_processing, gnomad_processing, pipeline_common, pipeline_utils
 from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \
     functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \
     utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional
@@ -790,7 +790,7 @@ def run(self):
         pipeline_utils.check_file_for_contents(self.output().path)
 
 
-@requires(bayesdel_processing.AddBayesdelScores)
+@requires(analysis.runBioinfoPred)
 class RunDiffAndAppendChangeTypesToOutput(DefaultPipelineTask):
     def _extract_release_date(self, version_json):
         with open(version_json, 'r') as f:
diff --git a/pipeline/workflow/analysis.py b/pipeline/workflow/analysis.py
index ce33f27bc..a8d412701 100644
--- a/pipeline/workflow/analysis.py
+++ b/pipeline/workflow/analysis.py
@@ -10,7 +10,7 @@ from workflow import bayesdel_processing
 
 
-#@requires(bayesdel_processing.AddBayesdelScores)
+@requires(bayesdel_processing.AddBayesdelScores)
 class DownloadResources(DefaultPipelineTask):
     def output(self):
         return(luigi.LocalTarget(Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'analysis'))
@@ -45,6 +45,19 @@ def run(self):
         pipeline_utils.run_process(args)
         pipeline_utils.check_file_for_contents(self.output().path)
 
+@requires(runPopfreqAssessment)
+class 
runBioinfoPred(DefaultPipelineTask):
+    def output(self):
+        return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_bioinfo_pred.tsv')
+
+    def run(self):
+        os.chdir(analysis_method_dir)
+        args = ["python", "add_bioinfo_pred.py",
+                "--input", runPopfreqAssessment().output().path,
+                "--output", self.output().path]
+        pipeline_utils.run_process(args)
+        pipeline_utils.check_file_for_contents(self.output().path)
+

From ae1f2f108285b8a01978562f1e6a3ffc932de805 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 4 Jun 2024 16:31:23 -0700
Subject: [PATCH 09/12] Added two more files to the tarball discard list

---
 pipeline/workflow/tarball_files_discard_list.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt
index 4860ac24e..104b74d3f 100644
--- a/pipeline/workflow/tarball_files_discard_list.txt
+++ b/pipeline/workflow/tarball_files_discard_list.txt
@@ -141,3 +141,5 @@
 ./release/artifacts/victor_wdir/slurm.annotate.run_1.start
 ./release/artifacts/victor_wdir/slurm.annotate.run_1.stop
 ./release/artifacts/victor_wdir/slurm.annotate.run_1.version
+./release/artifacts/built_with_popfreq.tsv
+./release/artifacts/built_with_bioinfo_pred.tsv

From fc8b079e9625dc17bb54f8dba9b2c59cbb316397 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 4 Jun 2024 17:16:22 -0700
Subject: [PATCH 10/12] Bumped the docker-compose version from 1.12.0 to 1.29.2

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7bba53737..7ccb090fb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -95,7 +95,7 @@ jobs:
           apk add --no-cache \
             py-pip=9.0.0-r1
           pip install \
-            docker-compose==1.12.0
+            docker-compose==1.29.2
      - restore_cache:
         keys:
           - v1-{{ .Branch }}

From 018d2385b04c1c3fe5b848e21c2964b4d2bfac79 Mon Sep 17 00:00:00 2001
From: melissacline
Date: Tue, 4 Jun 2024 17:55:32 -0700
Subject: [PATCH 11/12] Added new fields to the field metadata

---
 pipeline/field_metadata.tsv | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pipeline/field_metadata.tsv b/pipeline/field_metadata.tsv
index a64b8604e..f4715623b 100644
--- a/pipeline/field_metadata.tsv
+++ b/pipeline/field_metadata.tsv
@@ -465,3 +465,7 @@ DP_DL_spliceAI Evidence #COPY "The location of donor loss relative to variant po
 result_spliceai Evidence #COPY "The maximum of four delta scores (DS_AG, DS_AL, DS_DG, DS_DL) is used as basis for spliceogenicity predictions" 0.0
 BayesDel_nsfp33a_noAF Evidence bayesdel_noaf "BayesDel score, computed without allele frequency" 0.55
 change_type Internal #SKIP "type of change with respect to last release" "changed_information"
+Provisional_Evidence_Code_Popfreq Evidence #COPY "The provisional ACMG population frequency evidence code, by the VCEP rules" "PM2_Supporting (met)"
+Provisional_Evidence_Description_Popfreq Evidence #COPY "Details on how the provisional ACMG popfreq evidence code was determined" "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)"
+Provisional_Evidence_Code_Bioinfo Evidence #SKIP "The provisional ACMG bioinformatic evidence code, by the VCEP rules" N/A
+Provisional_Evidence_Description_Bioinfo Evidence #SKIP "Details on how the provisional ACMG bioinformatic evidence code was determined" NA
\ No newline at end of file

From 0e208d7cb2a0d788633af203e7e4ddaa10095471 Mon Sep 17 00:00:00 
2001 From: melissacline Date: Tue, 4 Jun 2024 18:24:12 -0700 Subject: [PATCH 12/12] Added one more file to the tarball discard list --- pipeline/workflow/tarball_files_discard_list.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt index 104b74d3f..8bf9605df 100644 --- a/pipeline/workflow/tarball_files_discard_list.txt +++ b/pipeline/workflow/tarball_files_discard_list.txt @@ -30,6 +30,7 @@ ./release/artifacts/GnomADv3.vcf ./release/artifacts/LOVDready.vcf ./release/artifacts/LOVD.vcf +./release/artifacts/ready_for_priors.tsv ./release/artifacts/releaseDiff.log ./release/artifacts/right1000_Genomes ./release/artifacts/rightBIC