bugfix: int string comparison for nextclade (#96)

phac-nml · Jul 15, 2024 · 7ee8caa · 7ee8caa
1 parent 1510450
commit 7ee8caa
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,13 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v2.0.1 - [2024-07-15]
+Version 2.0.1 fixes a minor bug in which all-numeric sample names (e.g. 231301) were not being compared to the nextclade output for frameshift reporting and adjustments. It also updates the default nextclade dataset tag to 'latest'.
+
+### `Changed`:
+- Bugfix in `qc.py` to allow all-numeric sample names to be properly compared to the nextclade output for frameshift reporting
+- Default nextclade dataset set to 'latest'
+
 ## v2.0.0 - [2024-04-25]
 Overall, version 2.0.0 has all the same outputs as version 1.1.0 but with some adjustments to the output locations and the input parameter names. Unfortunately, this makes this release incompatible with previous automation, but it ultimately results in a more robust pipeline that is easier to run and develop.
 

diff --git a/README.md b/README.md
@@ -24,6 +24,13 @@ This Nextflow pipeline automates the ARTIC network [nCoV-2019 novel coronavirus
 ### Release Notes
 For full changes visit the [CHANGELOG](CHANGELOG.md)
 
+#### *v2.0.1*
+Version 2.0.1 fixes a minor bug in which all-numeric sample names (e.g. 231301) were not being compared to the nextclade output for frameshift reporting and adjustments. It also updates the default nextclade dataset tag to 'latest'.
+
+`Changed`:
+- Bugfix in `qc.py` to allow all-numeric sample names to be properly compared to the nextclade output for frameshift reporting
+- Default nextclade dataset set to 'latest'
+
 #### *v2.0.0*
 Overall, version 2.0.0 has all the same outputs as version 1.1.0 but with some adjustments to the output locations and the input parameter names. Unfortunately, this makes this release incompatible with previous automation, but it ultimately results in a more robust pipeline that is easier to run and develop.
 

diff --git a/bin/qc.py b/bin/qc.py
@@ -343,29 +343,29 @@ def parse_ncov_tsv(file_in, sample, negative=False):
 
     return negative_df
 
-def compare_nextclade_fs_to_ncovtools_fs(sample: str, next_df: pd.DataFrame, ncov_df: pd.DataFrame) -> None:
+def compare_nextclade_fs_to_ncovtools_fs(sample: str, nextclade_df: pd.DataFrame, ncov_df: pd.DataFrame) -> None:
     '''
     Parse the nextclade dataframe for the presence of frameshift indels and update the qc_pass flag
     in the ncov summary df if they do not match
     INPUTS:
-        sample  --> `str` sample name from input
-        next_df --> `df` from nextclade 
-        ncov_df --> `df` Parsed ncov-tools summary df
+        sample       --> `str` sample name from input
+        nextclade_df --> `df` from nextclade 
+        ncov_df      --> `df` Parsed ncov-tools summary df
     '''
     # Adding in a column for tracking if a correction occurred
     ncov_df.reset_index(inplace=True, drop=True)
     ncov_df['qc_adjustment_from_nextclade'] = 'No adjustment'
 
     # Filter down nextclade df to just the wanted sample
     #  It should only be 1 sample but just in case
-    next_df = next_df.loc[next_df['seqName'] == sample]
-    if next_df.empty:
+    nextclade_df = nextclade_df.loc[nextclade_df['seqName'] == sample]
+    if nextclade_df.empty:
         return
 
     # Determine if there are any non-ignored frameshifts
     #  Both df are 1 line now so can just pull the first value
-    total_fs = next_df['qc.frameShifts.totalFrameShifts'].values[0]
-    ignored_fs = next_df['qc.frameShifts.totalFrameShiftsIgnored'].values[0]
+    total_fs = nextclade_df['qc.frameShifts.totalFrameShifts'].values[0]
+    ignored_fs = nextclade_df['qc.frameShifts.totalFrameShiftsIgnored'].values[0]
     ncov_qc_value_list = ncov_df['qc_pass'].values[0].split(';')
 
     # If it's not in the list we don't worry
@@ -495,8 +495,10 @@ def go(args):
     negative_df = parse_ncov_tsv(args.ncov_negative, args.sample, negative=True)
 
     # Nextclade double check of fs mutations
-    next_df = pd.read_csv(args.nextclade_tsv, sep='\t')
-    compare_nextclade_fs_to_ncovtools_fs(args.sample, next_df, summary_df)
+    nextclade_df = pd.read_csv(args.nextclade_tsv, sep='\t')
+    # Convert the seqName column type to string in case of all integer sample names
+    nextclade_df = nextclade_df.astype({'seqName': 'str'})
+    compare_nextclade_fs_to_ncovtools_fs(args.sample, nextclade_df, summary_df)
 
     # If we have a samplesheet, use its values to create final output
     if args.sample_sheet:

diff --git a/nextflow.config b/nextflow.config
@@ -39,7 +39,7 @@ params {
 
     //- Nextclade
     nextclade_dataset = "sars-cov-2"
-    nextclade_tag = "2024-04-15--15-08-22Z"
+    nextclade_tag = "latest"
 
     //- Metadata and IRIDA Uploads
     //-- Metadata is supplied with `--irida metadata.tsv` and requires specific columns. See README