fix merge conflict samplist

LadnerLab · May 16, 2022 · 9692a14 · 9692a14
2 parents 9bd1c5c + 2fdeead
commit 9692a14
Show file tree

Hide file tree

Showing 18 changed files with 662 additions and 418 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 .DS_Store
 !.gitignore
+.vscode
 # backup files
+/build
 
 *~
 \#*#
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.5.0]
+- #35, added new feature to demux. If samplenames or index name sets have duplicates in samplelist file, then those duplicates will be output to the terminal.
+- #57, demux now has an additional option for providing a tab-delimited file with 5 ordered columns: 1) index name, which should correspond to a header name in the sample sheet, 2) read name, which should be either "r1" or "r2" to specify whether the index is in "--input_r1" or "--input_r2", 3) index start location (0-based, inclusive), 4) index length and 5) number of mismatches to allow. Note: the last three columns correspond to the info currently provided on the command line with "--f_index" and "--r_index" (or "--index1" and "--index2", with recent changes). With this feature, the demux module can now analyze an arbitrary number of indexes to be found in r1 or r2 input sequences.
+- #57, demux output diagnostics may now provide more index matches for flexibility with demux changes in #57.
+- #138, demux now automatically removes reference duplicates when running in a reference dependent mode.
+- #105, a check is added that verifies the bins provided to the Z score module. It is no longer possible to run the Z score module with the wrong set of bins.
+- #156, solved memory race condition in demux created during development of this release.
+- #163, solved memory race condition in demux that created incorrect counts.
+
 ## [1.4.0] - 2021-07-09
 - #117, CMakelists has been updated to include a new flag for the CXX flags: '-Xpreprocessor'. This flag is used to make compilation in different environments for cpp easier. This issue arose when pepsirf was attempted to be compiled in 'Big Sur' and failed to compile due to an error with '-fopenmp'.
 - #116, link module had occuring error when protein sequences were not found in the metadata map. This has been changed so the situation is handled and an error is thrown stating a sequence was not found in the metadata file.

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -99,6 +99,7 @@ add_library(modules_demux STATIC src/modules/demux/options_demux.cpp src/modules
 			src/modules/parsers/fasta_parser.cpp src/modules/parsers/fastq_parser.cpp
             src/modules/core/sequence.cpp
 			src/modules/demux/samplelist_parser.cpp
+			src/modules/demux/fif_parser.cpp
 			src/modules/demux/translation_map.cpp
 			src/modules/core/sequence_indexer.cpp
 			src/modules/core/fastq_score.cpp

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 ## GPL-3.0-or-later
 
-### Current Version: v1.4.0
+### Current Version: v1.5.0
 
 Visit our [GitHub Pages website](https://ladnerlab.github.io/PepSIRF/)
 

diff --git a/docs/1-index.md b/docs/1-index.md
@@ -3,8 +3,10 @@ layout: default
 title: Home
 permalink: /
 ---
+
 <img src="./assets/images/PepSIRF_logo_BW.png" alt="" width="1024">
 
+
 ### Current Version: v1.4.0
 
 ### Please cite:

diff --git a/include/modules/core/pepsirf_version.h b/include/modules/core/pepsirf_version.h
@@ -1,6 +1,6 @@
 #ifndef PEPSIRF_VERSION_HH_INCLUDED
 #define PEPSIRF_VERSION_HH_INCLUDED
 
-#define PEPSIRF_VERSION "1.4.0"
+#define PEPSIRF_VERSION "1.5.0"
 
 #endif
diff --git a/include/modules/demux/et_search.h b/include/modules/demux/et_search.h
@@ -114,13 +114,14 @@ class et_seq_search
             query_matches;
         sequence *best_match = nullptr;
         sequence seq_temp;
+        std::string empty_string = "";
 
         std::uint32_t num_matches; 
 
         std::string substr = probe.seq.substr( start, len );
 
         // Note: hash( sequence& seq ) = hash( seq.seq )
-        auto temp = counts.find( sequence( "", substr ) );
+        auto temp = counts.find( sequence( empty_string, substr ) );
 
         // first check for an exact match in the expected location
         if( temp != counts.end() )
@@ -131,14 +132,14 @@ class et_seq_search
         if( start > 0 )
             {
                 substr = probe.seq.substr( start - 1, len );
-                temp = counts.find( sequence( "", substr ) );
+                temp = counts.find( sequence( empty_string, substr ) );
 
                 if( temp == counts.end()
                     && start > 1
                   )
                     {
                         substr = probe.seq.substr( start - 2, len );
-                        temp = counts.find( sequence( "", substr ) );
+                        temp = counts.find( sequence( empty_string, substr ) );
                     }
 
                 if( temp != counts.end() )
@@ -152,15 +153,16 @@ class et_seq_search
             if( start + 1 + len <= probe.seq.length() )
                 {
                     substr = probe.seq.substr( start + 1, len );
-                    temp = counts.find( sequence( "", substr ) );
+                    temp = counts.find( sequence( empty_string, substr ) );
                 }
 
             // look for a match at the expected coordinates within
             // the number of mismatches that are tolerated
             if( hamming_tol > 0 && temp == counts.end() )
                 {
                     substr = probe.seq.substr( start, len );
-                    seq_temp = sequence( "", substr );
+                    seq_temp.set_name( empty_string );
+                    seq_temp.set_seq( substr );
                     num_matches = idx.query( query_matches,
                                              seq_temp,
                                              hamming_tol

diff --git a/include/modules/demux/fif_parser.h b/include/modules/demux/fif_parser.h
@@ -0,0 +1,54 @@
+#ifndef FIF_PARSER_HH_INCLUDED
+#define FIF_PARSER_HH_INCLUDED
+#include <string>
+#include <vector>
+#include <unordered_set>
+#include <stdexcept>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <boost/algorithm/string.hpp>
+
+class flex_idx // describes one index: its name, the read it is found in, and where/how to match it
+    {
+        public:
+            std::string idx_name; // name corresponding to header name in samplesheet
+            std::string read_name; // name should either be R1 or R2
+            std::size_t idx_start; // index start location (0-based, inclusive)
+            std::size_t idx_len; // index length
+            std::size_t num_mismatch; // number of mismatches allowed
+            std::unordered_set<std::string> barcode_ids; // not assigned by either constructor, so it starts empty
+
+            flex_idx( std::string col1, std::string col2, std::string col3, std::string col4, std::string col5 ) // all-string form: col3..col5 are converted with std::stoi
+                {
+                    idx_name = col1;
+                    read_name = col2;
+                    idx_start = std::stoi( col3 ); // NOTE(review): std::stoi yields int; a negative value would wrap when stored in std::size_t
+                    idx_len = std::stoi( col4 ); // std::stoi throws if the text is not a number or does not fit in int
+                    num_mismatch = std::stoi( col5 );
+                }
+
+            flex_idx( std::string col1, std::string col2, std::size_t col3, std::size_t col4, std::size_t col5 ) // numeric form: col3..col5 are stored as given
+                {
+                    idx_name = col1;
+                    read_name = col2;
+                    idx_start = col3 ;
+                    idx_len = col4;
+                    num_mismatch = col5;
+                }
+    };
+class fif_parser
+    {
+        public:
+            /**
+             * Parse flexible index file containing index information.
+             * See help message for (--fif,-f) for more info.
+             * @param fif_fname name of the input file from which the flexible
+             *               index data is generated.
+             * @return returns vector with each element as data for a single index.
+            */
+            std::vector<flex_idx> parse( const std::string fif_fname );
+
+    };
+
+#endif // FIF_PARSER_HH_INCLUDED
diff --git a/include/modules/demux/module_demux.h b/include/modules/demux/module_demux.h
@@ -5,6 +5,7 @@
 #include <fstream>
 #include <limits>
 #include <algorithm>
+#include <unordered_map>
 #include <boost/algorithm/string.hpp>
 #include "omp_opt.h"
 
@@ -19,7 +20,7 @@
 #include "sequence_indexer.h"
 #include "sample.h"
 #include <iomanip>
-
+#include "fif_parser.h"
 
 /**
  * Class for running the demultiplex module. Given a file of reads and a file containing
@@ -66,8 +67,12 @@ class module_demux : public module
      * Writes output to diagnostic_fname. Output as tab-delimited file, with three columns, samplename, index pair matches, variable region matches.
      * Output is optional, defaulted as unused.
     **/
-    void write_diagnostic_output( options_demux* d_opts, std::map<std::pair<std::string,std::string>,
-                                  std::pair<std::string,std::vector<std::size_t>>>& diagnostic_map );
+    void write_diagnostic_output( options_demux* d_opts,
+                                  phmap::parallel_flat_hash_map<sample, std::vector<std::size_t>>& diagnostic_map );
+
+    void create_diagnostic_map( bool reference_dependent,
+                                phmap::parallel_flat_hash_map<sample,std::vector<std::size_t>>& diagnostic_map,
+                                std::vector<sample> samplelist );
 
     /**
      * Writes output to the outfile_name.
@@ -83,9 +88,12 @@ class module_demux : public module
      *        vector of each seq_score. Note that samples[ i ].id must equal j[ i ] for each
      *        j = 1, 2, ... j.size(), i.e. The id of a sample must correspond with its entry in
      *        the count vector.
+     * @param duplicate_map map of dna tags. contains the number of each dna tag that
+     *        appears in a run. used to determine the samples included in the output.
      **/
-    void write_outputs( std::string outfile_name,
+    void write_outputs( options_demux* d_opts,
                         parallel_map<sequence, std::vector<std::size_t>*>& seq_scores,
+                        std::map<std::string, std::size_t> duplicate_map,
                         std::vector<sample>& samples
                       );
 
@@ -122,22 +130,22 @@ class module_demux : public module
      * @returns Iterator to the match if found, map.end() otherwise
      **/
     template<class M>
-        typename M::iterator _find_with_shifted_mismatch( M &map,
-                                                          sequence probe_seq,
-                                                          sequence_indexer& idx, std::size_t num_mism,
-                                                          std::size_t f_start, std::size_t f_len
-                                                         )
+    typename M::iterator _find_with_shifted_mismatch( M& map,
+                                                      sequence probe_seq,
+                                                      sequence_indexer& idx, std::size_t num_mism,
+                                                      std::size_t f_start, std::size_t f_len
+                                                    )
         {
             std::vector<std::pair<sequence *, int>> query_matches;
             sequence *best_match = nullptr;
             sequence seq_temp;
-
+            std::string empty_string = "";
             unsigned int num_matches = 0;
 
             std::string substr = probe_seq.seq.substr( f_start, f_len );
 
             // Note: hash( sequence& seq ) = hash( seq.seq )
-            auto temp = map.find( sequence( "", substr ) );
+            auto temp = map.find( sequence( empty_string, substr ) );
 
             // first check for an exact match in the expected location
             if( temp != map.end() )
@@ -148,12 +156,12 @@ class module_demux : public module
             if( f_start > 0 ) // check that we are not shifting left from the beginning
                 {
                     substr = probe_seq.seq.substr( f_start - 1, f_len );
-                    temp = map.find( sequence( "", substr ) );
+                    temp = map.find( sequence( empty_string, substr ) );
 
                     if( temp == map.end() )
                         {
                             substr = probe_seq.seq.substr( f_start - 2, f_len );
-                            temp = map.find( sequence( "", substr ) );
+                            temp = map.find( sequence( empty_string, substr ) );
                         }
 
                     if( temp != map.end() )
@@ -167,15 +175,16 @@ class module_demux : public module
             if( f_start + 1 + f_len <= probe_seq.seq.length() )
                 {
                     substr = probe_seq.seq.substr( f_start + 1, f_len );
-                    temp = map.find( sequence( "", substr ) );
+                    temp = map.find( sequence( empty_string, substr ) );
                 }
 
             // look for a match at the expected coordinates within
             // the number of mismatches that are tolerated
             if( num_mism > 0 && temp == map.end() )
                 {
                     substr = probe_seq.seq.substr( f_start, f_len );
-                    seq_temp = sequence( "", substr );
+                    seq_temp.set_name( empty_string );
+                    seq_temp.set_seq( substr );
                     num_matches = idx.query( query_matches,
                                              seq_temp,
                                              num_mism
@@ -202,9 +211,22 @@ class module_demux : public module
      **/
     sequence *_get_min_dist( std::vector<std::pair<sequence *, int>>& matches );
 
+    /**
+     * @note Creates two unordered multimaps based on DNA tag sequences referenced by the samplelist index columns.
+     * The 'seq_lookup' map will contain the associated sequence to the ids in the barcodes/DNA tags file (--index-column).
+     * Note only ids referenced in the samplelist file will be included.
+     * 'map' contains concatenated DNA tag sequences, order specified by the index sample column name (--fif or --sIndex).
+     * Example. The concatenation would follow as: Index1 sequence + Index2 sequence + Index3 sequence and so on. The accumulation of the sequences
+     * is added to 'map' from the first index to the last index id for each sample. So three elements: sequence Index1, sequence Index1 + Index2,
+     * and sequence Index1 + Index2 + Index3 will all be added.
+     * @param map unordered multimap with unsorted sequence and samples. Samples organized into buckets where key sequence is identical.
+     * This allows access to individual elements directly by the sequence object.
+     * @param seq_lookup identical in concept to the map.
+     **/
     void create_index_map( sequential_map<sequence, sample>& map,
-                           std::vector<sequence>& index_seqs,
-                           std::vector<sample>& samplelist
+                           std::vector<sequence>& dna_tags,
+                           std::vector<sample>& samplelist,
+                           sequential_map<sequence, sample>& seq_lookup
                          );
 
     /**

diff --git a/include/modules/demux/options_demux.h b/include/modules/demux/options_demux.h
@@ -19,10 +19,11 @@ class options_demux: public options
     std::string library_fname; //!< Filename containing a FASTA file containing a librry of amino acid peptide sequences.
     std::string output_fname; //!< Filename of file to write output to.
     std::string aggregate_fname; //!< Filename to write aggregate counts to.
+    std::string flexible_idx_fname;
     std::string samplelist_fname; //!< Name of tab-delimited file containing a list of samples.
     std::string samplename;
-    std::string sample_idx1;
-    std::string sample_idx2;
+    std::string indexes;
+    std::vector<std::string> sample_indexes;
     std::string diagnostic_fname;
     std::tuple<std::size_t, std::size_t, std::size_t> index1_data; //!< 0 = start, 1 = len, 2 = num_mismatches
     std::tuple<std::size_t, std::size_t, std::size_t> index2_data; //!< 0 = start, 1 = len, 2 = num_mismatches
@@ -31,6 +32,7 @@ class options_demux: public options
     std::string concatemer; //!< Concatenated primer sequences, we look for this in our reads to determine whether a peptide exists
     int phred_base;
     int min_phred_score;
+    int num_indexes;
 
     bool translation_aggregation;