From 11756d42e6003fb9b835a7e78cb30393cecaf3d7 Mon Sep 17 00:00:00 2001
From: FranBonath <franziska.bonath@scilifelab.se>
Date: Mon, 12 Aug 2019 09:50:46 +0200
Subject: [PATCH 1/4] First (still unfinished) version of a Python script to
 check the completeness of library submission sheets. Checks for empty entries
 and missing values, and performs basic index checks: duplicate indexes
 within a pool, color balance and index length.

---
 library_information_validator.py | 410 +++++++++++++++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100755 library_information_validator.py

diff --git a/library_information_validator.py b/library_information_validator.py
new file mode 100755
index 0000000..b44c5e8
--- /dev/null
+++ b/library_information_validator.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python
+
+# load libraries
+from __future__ import print_function
+import argparse
+from openpyxl import load_workbook
+import coloredlogs
+import logging
+import re
+import couchdb
+import numbers
+import decimal
+import yaml
+from numpy import setdiff1d
+from collections import Counter
+
+#global variable
+WARNINGS = 0
+
+
+# Set up a logger with colored output
+logger = logging.getLogger(__name__)
+logger.propagate = False  # Otherwise the messages appeared twice
+coloredlogs.install(level='INFO', logger=logger,
+                    fmt='%(asctime)s %(levelname)s %(message)s')
+
+
+class LibrarySheet:
+#Class Attributes
+    SHEET_NAME = 'Sample information'
+    INDEX_SHEET_NAME = 'Index'
+    FIRST_LINE = 20  # First line where user submitted data is located
+    SAMPLE_NAME_COL = 'P'
+    POOL_NAME_COL = 'V'
+    POOL_NAME_SAMPLE_COL = "O"
+    LENGTH_COL = 'Y'  # average fragment length
+    MOLARITY_COL = 'AA'  # molarity of the pool
+    SINDEX_COL = 'S' # sample index (automated)
+    CINDEX_COL = 'T' # custom index
+    SAMPLE_TYPE = 'P8' # from drop down menue ("Finished library" or "Amplicon with adapters (low diversity)")
+    PLATE_ID = 'N6'
+    PROJECT_NAME_USER_SHEET = 'N3'
+
+    # Initializer / Instance attributes
+    def __init__(self, library_info_sheet):
+        self.library_info_sheet = library_info_sheet
+        self.library_sheet = None
+        self.sample_rec = None
+
+    # instance methods
+    def getAccessUserSheet(self):
+        if self.library_sheet == None:
+            wb = load_workbook(self.library_info_sheet, read_only=True, data_only=True)
+            ws = wb[LibrarySheet.SHEET_NAME]
+            self.library_sheet = ws
+
+    def projectID(self):
+        """retrieves the project and plate ID from the excel sheet and checks the
+        correctness of the plate ID format."""
+        self.getAccessUserSheet()
+        plate_id = self.library_sheet[LibrarySheet.PLATE_ID].value
+        if(len(re.findall('P\d+P\d+', plate_id))>0):
+            project_id_user = re.findall('P\d+', plate_id)[0]
+        else:
+            logger.error(
+                'Given plate ID ({}) in cell {} has the wrong format. It should be in the format'
+                ' PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'\
+                .format(plate_id, LibrarySheet.PLATE_ID)
+                )
+            quit()
+        return([project_id_user, plate_id])
+
+    def getRows(self, column):
+        """ identifies the all rows containing a sample name, discards rows without entry.
+        Rows containing whitespace only trigger a warning and are discarded for subsequent
+        tests """
+        cellID_withSample = list()
+        cellID_noSample = list()
+        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
+            cell_id = "{col}{row_iter}".format(col=column,row_iter=i)
+            cell_value = str(self.library_sheet[cell_id].value)
+            if(cell_value.isspace()):
+                logger.warning(
+                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
+                    )
+                global WARNINGS
+                WARNINGS += 1
+            elif(self.library_sheet[cell_id].value != None):
+                cellID_withSample.append(i)
+            else:
+                cellID_noSample.append(cell_id)
+        return(cellID_withSample)
+
+    def getSamples(self):
+        """ identifies the all rows containing a sample name, discards rows without entry.
+        Rows containing whitespace only trigger a warning and are discarded for subsequent
+        tests """
+        cellID_withSample = list()
+        cellID_noSample = list()
+        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
+            cell_id = "{col}{row_iter}".format(col=LibrarySheet.SAMPLE_NAME_COL,row_iter=i)
+            cell_value = str(self.library_sheet[cell_id].value)
+            if(cell_value.isspace()):
+                logger.warning(
+                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
+                   )
+                global WARNINGS
+                WARNINGS += 1
+            elif(self.library_sheet[cell_id].value != None):
+                cellID_withSample.append(i)
+            else:
+                cellID_noSample.append(cell_id)
+        return(cellID_withSample)
+
+    def getPool(self):
+
+        cellID_withSample = list()
+        cellID_noSample = list()
+        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
+            cell_id = "{col}{row_iter}".format(col=LibrarySheet.POOL_NAME_COL,row_iter=i)
+            cell_value = str(self.library_sheet[cell_id].value)
+            if(cell_value.isspace()):
+                logger.warning(
+                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
+                   )
+                global WARNINGS
+                WARNINGS += 1
+            elif(self.library_sheet[cell_id].value != None):
+                cellID_withSample.append(i)
+            else:
+                cellID_noSample.append(cell_id)# TODO check here that these rows do really not contain information
+        return(cellID_withSample)
+
+    def ProjectInfo(self, config):
+        """
+        Retrieves the project information from couchDB, checks that the project exists in
+        couchDB and is unique. Returns the information and the full project plateID.
+        """
+        with open(config) as settings_file:
+            server_settings = yaml.load(settings_file, Loader=yaml.FullLoader)
+        couch = couchdb.Server(server_settings.get("couch_server", None))
+        db = couch["projects"]
+        # check the existence of the project number in couchDB
+        project_plate_ID = self.projectID()
+        project_id_found = db.view("project/project_id", key=project_plate_ID[0])
+        prow = project_id_found.rows
+        # Project not found
+        if len(prow) == 0:
+            logger.error(
+                'Project not found, please check your entry for the PlateID, it should have the format'
+                'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
+                )
+            quit()
+        # more than one project found
+        elif len(prow) > 1:
+            logger.error(
+                'Project ID not unique, please check your entry for the PlateID, it should have the format'
+                'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
+                )
+            quit()
+        else:
+            # puts the Document of the identified project in a new variable "pdoc"
+            pdoc = db.get(prow[0].id)
+            return pdoc, project_plate_ID[1]
+
+    def validate_project_Name(self, info, project_plate_ID):
+        """
+        Prints the identified project name based on the user supplied Plate/Project ID for
+        control purposes by the project coordinator. Further checks that the
+        plate number is not already in couchDB.
+        """
+        project_name_DB = info['project_name']
+        samples = info['samples'].keys()
+        plate ='P{}_{}'.format(project_plate_ID.split("P")[1],project_plate_ID.split("P")[2])
+        found_plate = [s for s in samples if plate in s]
+        if(len(found_plate)>0):
+            new_plate_no = int(project_plate_ID.split("P")[2])
+            new_plate_no += 1
+            new_plate_ID = 'P{}P{}'.format(project_plate_ID.split("P")[1], new_plate_no)
+            logger.warning(
+                'Plate number {} is already used. Please increase the plate number to {}.'.format(project_plate_ID, new_plate_ID))
+            global WARNINGS
+            WARNINGS += 1
+        logger.info('identified project name: {}'.format(project_name_DB))
+
+    def validate(self):
+        """Validates all rows with a sample ID
+        Given the column letter and which rows to validate:
+        - Initiates the given validators for cell content (numeric), concentration,
+        volume and RIN (RNA samples only) with the optional attributes.
+        - Loops through all the given cells and validates them individually.
+        - prints summaries of the warnings and of the Excel file check.
+        """
+        #        print(self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL))
+
+        for row_nr in self.getPool():
+            cell_id_mol = "{col}{row_nr}".format(col=LibrarySheet.MOLARITY_COL, row_nr=row_nr)
+            validator = Validator(self.library_sheet,cell_id_mol)
+            result_numeric = validator.validate_numeric()
+
+        current_pool_rows = []
+        pool_values = []
+        for row_nr in self.getRows(LibrarySheet.SAMPLE_NAME_COL):
+            current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=row_nr)
+            current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
+            pool_values.append(current_cell_value_pool)
+
+        poolIDs = list(dict.fromkeys(pool_values))
+        cell_rowid_sample = self.getRows(LibrarySheet.SAMPLE_NAME_COL)
+        cell_rowid_pool = self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL)
+
+        if(len(cell_rowid_sample) > len(cell_rowid_pool)):
+            missing_pool_rowid_list = setdiff1d(cell_rowid_sample, cell_rowid_pool)
+            for missing_pool_rowid in missing_pool_rowid_list:
+                logger.error(
+                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid))
+            quit()
+        for pool in poolIDs:
+            for nrow_nr in cell_rowid_sample:
+                current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=nrow_nr)
+                current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
+                if(current_cell_value_pool == pool):
+                    current_pool_rows.append(nrow_nr)
+            validator = Validator(self.library_sheet, None)
+            result_index, sindex = validator.select_index(current_pool_rows, pool)
+            validator.validate_index(result_index, pool, sindex)
+            current_pool_rows =[]
+
+class Validator(object):
+    # Initializer / Instance attributes
+    def __init__(self, access_sample_info_sheet, molarityID):
+        self.access_sample_info_sheet = access_sample_info_sheet
+        self.molarityID = molarityID
+
+
+    # instance methods
+    def validate_numeric(self):
+        """Checks whether value is numeric or not."""
+        for checkNumbers in [self.molarityID]:
+            if not isinstance(self.access_sample_info_sheet[checkNumbers].value, numbers.Number):
+                try:
+                    float(self.access_sample_info_sheet[checkNumbers].value.replace(",", "."))
+                    logger.error(
+                        'Cell {} with value \"{}\" is not numeric due to decimal point/comma clash.'\
+                        .format(self.access_sample_info_sheet[checkNumbers].coordinate, self.access_sample_info_sheet[checkNumbers].value)
+                        )
+                    return False
+                except ValueError:
+                    logger.error(
+                    'Cell {} with value \"'.format(self.access_sample_info_sheet[checkNumbers].coordinate)+ self.access_sample_info_sheet[checkNumbers].value + '\" is not numeric'
+                    )
+                except TypeError:
+                    if self.access_sample_info_sheet[checkNumbers].value is None:
+                        logger.error(
+                        'Cell {} is numeric but empty'.format(self.access_sample_info_sheet[checkNumbers].coordinate)
+                        )
+                        return False
+                    else:
+                        raise
+            return False
+
+    def validate_mol(self, min_mol, max_mol):
+        if(self.access_sample_info_sheet[self.concentrationID].value < min_mol) \
+        or (self.access_sample_info_sheet[self.concentrationID].value > max_mol):
+            global WARNINGS
+            WARNINGS += 1
+
+            logger.warning('Sample molarity ({}ng/ul) in cell {} is out of specifications: {}-{}ng/ul'\
+            .format(self.access_sample_info_sheet[self.molarityID].value,self.molarityID, min_mol, max_mol))
+        return True
+
+    def select_index(self, pool_rows,pool):
+        sindex_list = []
+        sindex_absent = []
+        cell_id_standard_index = []
+        cell_id_custom_index = []
+        sindex_chosen = True
+        for row_nr in pool_rows:
+           cell_id_standard_index.append("{col}{row_nr}".format(col  =LibrarySheet.SINDEX_COL, row_nr = row_nr))
+           cell_id_custom_index.append("{col}{row_nr}".format(col = LibrarySheet.CINDEX_COL, row_nr = row_nr))
+
+        for sindex in cell_id_standard_index:
+            if (self.access_sample_info_sheet[sindex].value is None):
+                sindex_absent.append(sindex)
+            else:
+                index_only = re.split('\(|\)', self.access_sample_info_sheet[sindex].value)[1]
+                index_only = re.sub('-', '', index_only)
+                sindex_list.append(index_only)
+                #print(sindex_list)
+                if (self.access_sample_info_sheet["{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=sindex.split(LibrarySheet.SINDEX_COL)[1])].value is not None):
+                    logger.warning('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
+                    .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1]))
+
+        cindex_list = []
+        cindex_absent = []
+        for cindex in cell_id_custom_index:
+            if (self.access_sample_info_sheet[cindex].value is None):
+                cindex_absent.append(cindex)
+            else:
+                index_only = re.sub('-','', self.access_sample_info_sheet[cindex].value)
+                cindex_list.append(index_only)
+
+        if(len(cindex_absent) == len(pool_rows)):
+            sel_index = sindex_list
+            if(len(sindex_list) != len(pool_rows)):
+                for absent_index in sindex_absent:
+                    logger.error("missing standard index in {}".format(absent_index))
+
+        elif(len(sindex_absent) == len(pool_rows)):
+            sel_index = cindex_list
+            sindex_chosen = False
+            if(len(cindex_list) != len(pool_rows)):
+                for absent_index in cindex_absent:
+                    logger.error("missing custom index in {}".format(absent_index))
+        else:
+            sindex_chosen = False
+
+            sel_index = sindex_list + cindex_list
+            if(len(sel_index) != len(pool_rows)):
+
+                rowID_cindex_absent = [re.sub("\D","",x) for x in cindex_absent]
+                rowID_sindex_absent = [re.sub("\D","",x) for x in sindex_absent]
+
+                missing_indexes = list(set(rowID_cindex_absent).intersection(rowID_sindex_absent))
+                for mIndex in missing_indexes:
+                    logger.error('missing index in row {}'.format(mIndex))
+
+            logger.warning('mix between custom and standard indexes in pool {}.'\
+            .format(pool))
+        return(sel_index, sindex_chosen)
+
+    def validate_index(self, index_seq, pool_name, sindex):
+        c = Counter(index_seq)
+        for index, index_count in c.most_common():
+            if(index_count>1):
+                logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
+                .format(index, pool_name))
+            else:
+                break
+
+        for index in index_seq: ###TODO currently, standard indexes contain other characters!
+            charRE = re.compile(r'[^ATCGatcg\-.]')
+            index_search = charRE.search(index)
+            if(bool(index_search)):
+                logger.warning('The index sequence \"{}\" in pool {} contains invalid characters.'\
+                .format(index, pool_name))
+
+        if(not sindex):
+            index_length = []
+            for index in index_seq:
+                index_length.append(len(index))
+            count_length = Counter(index_length)
+
+            if(len(count_length) > 1):
+                logger.warning('There are {} different index lengths in pool {}, please double check the sequences.'\
+                .format(len(count_length),pool_name))
+
+            max_length = sorted(count_length.keys())[-1]
+            min_length = sorted(count_length.keys())[0]
+            index_list_colour = []
+            for index in index_seq:
+                index_colour = index.replace('T','G').replace('A','R').replace('C', 'R')
+                index_list_colour.append(list(index_colour))
+
+            for row_nr in range(0,max_length):
+                column = []
+                for row in index_list_colour:
+                    try:
+                        column.append(row[row_nr])
+                    except IndexError:
+                        pass
+                count_colour = Counter(column)
+            #    print(count_colour.values()[1])
+                if(len(count_colour)<2 and sum(count_colour.values()) > 1):
+                    logger.warning('Indexes in pool {} unbalanced at position {}'\
+                    .format(pool_name, row_nr+1))
+    #        a = get_row(index_list_colour,1)
+    #        print(a)
+        # check that index length is the same in all getSamples
+
+#def get_row(dic, row_nr):
+#    column = []
+#    for row in dic:
+#        column.append(row[row_nr])
+#    return(column)
+
+
+def main(input_sheet, config_statusDB):
+    # Instantiate the LibrarySheet object
+    sheetOI = LibrarySheet(input_sheet)
+    # get Project Information from couchDB
+    Project_Information, project_plate_ID = sheetOI.ProjectInfo(config_statusDB)
+    # validate the project name to ensure correct identification in couchDB
+    sheetOI.validate_project_Name(Project_Information, project_plate_ID)
+    # get info about prep type
+#    prep_recommendations = sheetOI.prep_standards(Project_Information, recom_path)
+    # validate all entries
+    sheetOI.validate()#Project_Information, config_statusDB)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('libraryInfoSheet',
+                        help="Completed sample info sent to NGI by the user.")
+    parser.add_argument('config_statusDB',
+                        help="settings file in yaml format to access statusDB \
+                        in the format \"couch_server: http://<username>:<password>@tools.scilifelab.se:5984\"")
+    args = parser.parse_args()
+
+    main(args.libraryInfoSheet,  args.config_statusDB)

From 60108eff444263d4cc068776899fb6b0450cb2f0 Mon Sep 17 00:00:00 2001
From: FranBonath <franziska.bonath@scilifelab.se>
Date: Mon, 28 Oct 2019 21:16:45 +0100
Subject: [PATCH 2/4] Added further index checks and the changes requested by
 M. Ormestad. Cleaned up the code and added comments.

---
 library_information_validator.py | 373 +++++++++++++++++++------------
 1 file changed, 229 insertions(+), 144 deletions(-)

diff --git a/library_information_validator.py b/library_information_validator.py
index b44c5e8..c9a88b0 100755
--- a/library_information_validator.py
+++ b/library_information_validator.py
@@ -13,10 +13,7 @@
 import yaml
 from numpy import setdiff1d
 from collections import Counter
-
-#global variable
-WARNINGS = 0
-
+import Levenshtein as lev
 
 # Set up a logger with colored output
 logger = logging.getLogger(__name__)
@@ -28,18 +25,16 @@
 class LibrarySheet:
 #Class Attributes
     SHEET_NAME = 'Sample information'
-    INDEX_SHEET_NAME = 'Index'
     FIRST_LINE = 20  # First line where user submitted data is located
-    SAMPLE_NAME_COL = 'P'
-    POOL_NAME_COL = 'V'
-    POOL_NAME_SAMPLE_COL = "O"
-    LENGTH_COL = 'Y'  # average fragment length
-    MOLARITY_COL = 'AA'  # molarity of the pool
-    SINDEX_COL = 'S' # sample index (automated)
-    CINDEX_COL = 'T' # custom index
-    SAMPLE_TYPE = 'P8' # from drop down menue ("Finished library" or "Amplicon with adapters (low diversity)")
-    PLATE_ID = 'N6'
-    PROJECT_NAME_USER_SHEET = 'N3'
+    SAMPLE_NAME_COL = 'P' # user defined sample name
+    POOL_NAME_COL = 'V' # column of pool names in pool summary
+    POOL_NAME_SAMPLE_COL = "O" # column of pool names in index definition
+    LENGTH_COL = 'Y'  # average fragment length of pools
+    MOLARITY_COL = 'AA'  # molarity of the pool, currently not used
+    SINDEX_COL = 'S' # NGI standard index sequences (automated)
+    CINDEX_COL = 'T' # custom index sequences
+    PLATE_ID = 'N6' # plate ID as specified in user sheet
+    MAX_DISTANCE = 2 # required differences between Indexes in the same pool
 
     # Initializer / Instance attributes
     def __init__(self, library_info_sheet):
@@ -63,17 +58,18 @@ def projectID(self):
             project_id_user = re.findall('P\d+', plate_id)[0]
         else:
             logger.error(
-                'Given plate ID ({}) in cell {} has the wrong format. It should be in the format'
-                ' PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'\
+                'The given plate ID ({}) in cell {} has the wrong format. It should be in the format'
+                ' PxxxxxPx, where x are numbers. If you think your Plate ID is correct, contact your project coordinator.'\
                 .format(plate_id, LibrarySheet.PLATE_ID)
                 )
             quit()
         return([project_id_user, plate_id])
 
     def getRows(self, column):
-        """ identifies the all rows containing a sample name, discards rows without entry.
-        Rows containing whitespace only trigger a warning and are discarded for subsequent
+        """ identifies all rows containing a sample name, discards rows without entry.
+        Rows containing whitespace only trigger a warning and are disregarded in subsequent
         tests """
+        warning_empty_row = 0
         cellID_withSample = list()
         cellID_noSample = list()
         for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
@@ -83,59 +79,19 @@ def getRows(self, column):
                 logger.warning(
                     'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
                     )
-                global WARNINGS
-                WARNINGS += 1
+                warning_empty_row += 1
             elif(self.library_sheet[cell_id].value != None):
                 cellID_withSample.append(i)
             else:
                 cellID_noSample.append(cell_id)
-        return(cellID_withSample)
-
-    def getSamples(self):
-        """ identifies the all rows containing a sample name, discards rows without entry.
-        Rows containing whitespace only trigger a warning and are discarded for subsequent
-        tests """
-        cellID_withSample = list()
-        cellID_noSample = list()
-        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
-            cell_id = "{col}{row_iter}".format(col=LibrarySheet.SAMPLE_NAME_COL,row_iter=i)
-            cell_value = str(self.library_sheet[cell_id].value)
-            if(cell_value.isspace()):
-                logger.warning(
-                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
-                   )
-                global WARNINGS
-                WARNINGS += 1
-            elif(self.library_sheet[cell_id].value != None):
-                cellID_withSample.append(i)
-            else:
-                cellID_noSample.append(cell_id)
-        return(cellID_withSample)
-
-    def getPool(self):
-
-        cellID_withSample = list()
-        cellID_noSample = list()
-        for i in range(LibrarySheet.FIRST_LINE, LibrarySheet.FIRST_LINE+96):
-            cell_id = "{col}{row_iter}".format(col=LibrarySheet.POOL_NAME_COL,row_iter=i)
-            cell_value = str(self.library_sheet[cell_id].value)
-            if(cell_value.isspace()):
-                logger.warning(
-                    'Cell {} contains empty spaces only. Remove content.'.format(cell_id)
-                   )
-                global WARNINGS
-                WARNINGS += 1
-            elif(self.library_sheet[cell_id].value != None):
-                cellID_withSample.append(i)
-            else:
-                cellID_noSample.append(cell_id)# TODO check here that these rows do really not contain information
-        return(cellID_withSample)
+        return(cellID_withSample, warning_empty_row)
 
     def ProjectInfo(self, config):
         """
         Retrieves the project information from couchDB, checks that the project exists in
         couchDB and is unique. Returns the information and the full project plateID.
         """
+        # access to the project in couchDB using information from the given config file
         with open(config) as settings_file:
             server_settings = yaml.load(settings_file, Loader=yaml.FullLoader)
         couch = couchdb.Server(server_settings.get("couch_server", None))
@@ -173,48 +129,82 @@ def validate_project_Name(self, info, project_plate_ID):
         samples = info['samples'].keys()
         plate ='P{}_{}'.format(project_plate_ID.split("P")[1],project_plate_ID.split("P")[2])
         found_plate = [s for s in samples if plate in s]
+        warning_project_name = 0
         if(len(found_plate)>0):
             new_plate_no = int(project_plate_ID.split("P")[2])
             new_plate_no += 1
             new_plate_ID = 'P{}P{}'.format(project_plate_ID.split("P")[1], new_plate_no)
             logger.warning(
                 'Plate number {} is already used. Please increase the plate number to {}.'.format(project_plate_ID, new_plate_ID))
-            global WARNINGS
-            WARNINGS += 1
-        logger.info('identified project name: {}'.format(project_name_DB))
-
-    def validate(self):
-        """Validates all rows with a sample ID
-        Given the column letter and which rows to validate:
-        - Initiates the given validators for cell content (numeric), concentration,
-        volume and RIN (RNA samples only) with the optional attributes.
-        - Loops through all the given cells and validates them individually.
-        - prints summaries of the warnings and of the Excel file check.
-        """
-        #        print(self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL))
+            warning_project_name = 1
+        return(warning_project_name)
 
-        for row_nr in self.getPool():
-            cell_id_mol = "{col}{row_nr}".format(col=LibrarySheet.MOLARITY_COL, row_nr=row_nr)
-            validator = Validator(self.library_sheet,cell_id_mol)
-            result_numeric = validator.validate_numeric()
+    def validate_sequencing_setup(self, info, cell_id_length):
+        """
+        checks that the ordered cycle number fits the given the insert length
+        OBS currently there is a 15% difference allowed before a warning is given,
+        this is hard coded atm.
+        """
+        warning_cycles = 0
+        cycle_no_string = info['details']['sequencing_setup']
+        cycle_no_array = cycle_no_string.split("x")
+        cycle_no = int(cycle_no_array[0])*int(cycle_no_array[1])
+        insert_length = self.library_sheet[cell_id_length].value
+
+        if((int(cycle_no)*1.15) < int(insert_length)):
+            logger.warning('The cycle number ({} = {}) is lower than the insert length ({}bp) in cell {}.'\
+            .format(cycle_no_string,cycle_no,insert_length, cell_id_length))
+            warning_cycles += 1
+        return(warning_cycles)
+
+    def validate(self, project_info):
+        """
+        - identifies the samples in a pool
+        - detects missing entry in pool column
+        - initiates index check which is done for each pool independently
+        """
+        pool_warnings = []
 
-        current_pool_rows = []
-        pool_values = []
-        for row_nr in self.getRows(LibrarySheet.SAMPLE_NAME_COL):
-            current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=row_nr)
-            current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
-            pool_values.append(current_cell_value_pool)
-
-        poolIDs = list(dict.fromkeys(pool_values))
-        cell_rowid_sample = self.getRows(LibrarySheet.SAMPLE_NAME_COL)
-        cell_rowid_pool = self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL)
+        cell_rowid_sample, warning_eR_sample = self.getRows(LibrarySheet.SAMPLE_NAME_COL)
+        cell_rowid_pool, warning_eR_pool_sample = self.getRows(LibrarySheet.POOL_NAME_SAMPLE_COL)
+        cell_rowid_pool_info, warning_eR_pool = self.getRows(LibrarySheet.POOL_NAME_COL)
 
+        #check for missing  for pool ID entries
         if(len(cell_rowid_sample) > len(cell_rowid_pool)):
             missing_pool_rowid_list = setdiff1d(cell_rowid_sample, cell_rowid_pool)
             for missing_pool_rowid in missing_pool_rowid_list:
                 logger.error(
                     'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid))
             quit()
+
+        #initiate check for sequencing setup and discrepancies between ordered numbers
+        #of cycles and average read length
+        warnings_cycle = []
+        for row_nr in cell_rowid_pool_info:
+            cell_id_mol = "{col}{row_nr}".format(col=LibrarySheet.MOLARITY_COL, row_nr=row_nr) # to work on later
+            cell_id_length = "{col}{row_nr}".format(col=LibrarySheet.LENGTH_COL, row_nr=row_nr)
+            cell_id_pool = "{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_COL, row_nr=row_nr)
+            validator = Validator(self.library_sheet,cell_id_mol) # molarity is currently not checked
+            result_numeric, warnings_numeric = validator.validate_numeric()
+            warnings_c = self.validate_sequencing_setup(project_info, cell_id_length)
+            warnings_cycle.append(warnings_c)
+
+        #retrieve all pool IDs defined, in order to later analyse by pool
+        pool_values = []
+        for row_nr in cell_rowid_sample:
+            current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=row_nr)
+            current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
+            pool_values.append(current_cell_value_pool)
+
+        # initiate check by pool for index issues
+        #    - differences in index lengths
+        #    - low diversity between Indexes
+        #    - duplicated indixes
+        #    - differences in length within dual Indexes
+        #    - unbalanced pools
+        poolIDs = list(dict.fromkeys(pool_values))
+        i = 0
+        current_pool_rows = []
         for pool in poolIDs:
             for nrow_nr in cell_rowid_sample:
                 current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=nrow_nr)
@@ -222,9 +212,27 @@ def validate(self):
                 if(current_cell_value_pool == pool):
                     current_pool_rows.append(nrow_nr)
             validator = Validator(self.library_sheet, None)
-            result_index, sindex = validator.select_index(current_pool_rows, pool)
-            validator.validate_index(result_index, pool, sindex)
+
+            result_index, sindex, warning_index_mix, \
+            warning_length_comp = validator.select_index(current_pool_rows, pool)
+
+            warning_low_div, warning_index_length,\
+            warning_index_balance = validator.validate_index(result_index, pool, sindex)
+
+            pool_warning = pool, [warning_low_div, warning_index_length, \
+            warning_index_balance, warnings_cycle[i], warning_index_mix, warning_length_comp]
+
+            pool_warnings.append(pool_warning)
             current_pool_rows =[]
+            i += 1
+
+        # summarise warnings and return
+        pools_with_warnings = []
+        for warning in pool_warnings:
+            sums_warnings = sum(warning[1])
+            if(sums_warnings > 0):
+                pools_with_warnings.append(warning[0])
+        return(len(pools_with_warnings), len(poolIDs))
 
 class Validator(object):
     # Initializer / Instance attributes
@@ -232,10 +240,10 @@ def __init__(self, access_sample_info_sheet, molarityID):
         self.access_sample_info_sheet = access_sample_info_sheet
         self.molarityID = molarityID
 
-
     # instance methods
     def validate_numeric(self):
         """Checks whether value is numeric or not."""
+        warnings_numeric = 0
         for checkNumbers in [self.molarityID]:
             if not isinstance(self.access_sample_info_sheet[checkNumbers].value, numbers.Number):
                 try:
@@ -244,80 +252,125 @@ def validate_numeric(self):
                         'Cell {} with value \"{}\" is not numeric due to decimal point/comma clash.'\
                         .format(self.access_sample_info_sheet[checkNumbers].coordinate, self.access_sample_info_sheet[checkNumbers].value)
                         )
+                    warnings_numeric += 1
                     return False
                 except ValueError:
                     logger.error(
-                    'Cell {} with value \"'.format(self.access_sample_info_sheet[checkNumbers].coordinate)+ self.access_sample_info_sheet[checkNumbers].value + '\" is not numeric'
+                    'Cell {} with value {} is \"'.format(self.access_sample_info_sheet[checkNumbers].coordinate)+ self.access_sample_info_sheet[checkNumbers].value + '\" is not numeric'
                     )
+                    warnings_numeric += 1
                 except TypeError:
                     if self.access_sample_info_sheet[checkNumbers].value is None:
                         logger.error(
                         'Cell {} is numeric but empty'.format(self.access_sample_info_sheet[checkNumbers].coordinate)
                         )
+                        warnings_numeric += 1
                         return False
                     else:
                         raise
-            return False
-
-    def validate_mol(self, min_mol, max_mol):
-        if(self.access_sample_info_sheet[self.concentrationID].value < min_mol) \
-        or (self.access_sample_info_sheet[self.concentrationID].value > max_mol):
-            global WARNINGS
-            WARNINGS += 1
-
-            logger.warning('Sample molarity ({}ng/ul) in cell {} is out of specifications: {}-{}ng/ul'\
-            .format(self.access_sample_info_sheet[self.molarityID].value,self.molarityID, min_mol, max_mol))
-        return True
-
-    def select_index(self, pool_rows,pool):
-        sindex_list = []
-        sindex_absent = []
+            return(False, warnings_numeric)
+
+    # currently not used
+    #def validate_mol(self, min_mol, max_mol):
+    #    warning_val_mol = 0
+    #    if(self.access_sample_info_sheet[self.concentrationID].value < min_mol) \
+    #    or (self.access_sample_info_sheet[self.concentrationID].value > max_mol):
+    #        logger.warning('Sample molarity ({}ng/ul) in cell {} is out of specifications: {}-{}ng/ul'\
+    #        .format(self.access_sample_info_sheet[self.molarityID].value,self.molarityID, min_mol, max_mol))
+    #        warning_val_mol += 1
+    #    return(True, warning_val_mol)
+
+    def select_index(self, pool_rows, pool):
+        '''
+        - identifies whether pools contain standard indexes, custom indexes or both
+        - will detect non-nucleotide letters
+        - checks for double indexes in pools
+        - detects missing Indexes
+        - checks for minimal distance between Indexes
+        - checks for similar index length (warning)
+        - checks pool balance
+        '''
+        warning_mixed_indexes = 0
+        warning_component_length = 0
+
+        # retrieve the Cell IDs for NGI standard indexes or custom indexes
         cell_id_standard_index = []
         cell_id_custom_index = []
-        sindex_chosen = True
         for row_nr in pool_rows:
            cell_id_standard_index.append("{col}{row_nr}".format(col  =LibrarySheet.SINDEX_COL, row_nr = row_nr))
            cell_id_custom_index.append("{col}{row_nr}".format(col = LibrarySheet.CINDEX_COL, row_nr = row_nr))
 
+        # retrieves index sequences for NGI standard indexes
+        sindex_list = []
+        sindex_absent = []
         for sindex in cell_id_standard_index:
             if (self.access_sample_info_sheet[sindex].value is None):
                 sindex_absent.append(sindex)
+            elif(self.access_sample_info_sheet[sindex].value == "noIndex"):
+                sindex_list.append(self.access_sample_info_sheet[sindex].value)
             else:
                 index_only = re.split('\(|\)', self.access_sample_info_sheet[sindex].value)[1]
-                index_only = re.sub('-', '', index_only)
+
+                # checks that dual indexes have the same length in both components
+                split_index = re.split('[-_]',index_only)
+                if(len(split_index) > 1):
+                    if(len(split_index[0]) != len(split_index[1])):
+                        logger.warning("Length of the two components in dual index {} and {} is different.".format(split_index[0], split_index[1]))
+                        warning_component_length += 1
+
+                # merges dual indexes to one sequence
+                index_only = re.sub('[-_]', '', index_only)
                 sindex_list.append(index_only)
-                #print(sindex_list)
+
+                # generates error if both custom and NGI standard index are selected for the same sample
                 if (self.access_sample_info_sheet["{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=sindex.split(LibrarySheet.SINDEX_COL)[1])].value is not None):
-                    logger.warning('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
+                    logger.error('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
                     .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1]))
+                    quit()
 
+        # retrieves index sequences for custom indexes
         cindex_list = []
         cindex_absent = []
         for cindex in cell_id_custom_index:
-            if (self.access_sample_info_sheet[cindex].value is None):
+            cindex_value = self.access_sample_info_sheet[cindex].value
+            if (cindex_value is None):
                 cindex_absent.append(cindex)
             else:
-                index_only = re.sub('-','', self.access_sample_info_sheet[cindex].value)
+                # checks that dual indexes have the same length in both components
+                split_cindex = re.split('[-_]',cindex_value)
+                if(len(split_cindex) > 1):
+                    if(len(split_cindex[0]) != len(split_cindex[1])):
+                        logger.warning("Length of the two components in dual index {} and {} is different.".format(split_cindex[0], split_cindex[1]))
+                        warning_component_length += 1
+
+                # merges dual indexes to one sequence
+                index_only = re.sub('[-]','', cindex_value)
                 cindex_list.append(index_only)
 
+        # identifies missing index specification for individual samples
+        # and detects whether NGI standard indexes or custom indexes are chosen for a given pool
+        error_missing_index = 0
+        warning_mixed_indexes = 0
+        sindex_chosen = True
         if(len(cindex_absent) == len(pool_rows)):
             sel_index = sindex_list
             if(len(sindex_list) != len(pool_rows)):
                 for absent_index in sindex_absent:
-                    logger.error("missing standard index in {}".format(absent_index))
-
+                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
+                    quit()
         elif(len(sindex_absent) == len(pool_rows)):
             sel_index = cindex_list
             sindex_chosen = False
             if(len(cindex_list) != len(pool_rows)):
                 for absent_index in cindex_absent:
-                    logger.error("missing custom index in {}".format(absent_index))
+                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
+                    quit()
         else:
             sindex_chosen = False
 
+            # warning if a pool consists of a mix of NGI standard indexes and custom indexes
             sel_index = sindex_list + cindex_list
             if(len(sel_index) != len(pool_rows)):
-
                 rowID_cindex_absent = [re.sub("\D","",x) for x in cindex_absent]
                 rowID_sindex_absent = [re.sub("\D","",x) for x in sindex_absent]
 
@@ -327,24 +380,59 @@ def select_index(self, pool_rows,pool):
 
             logger.warning('mix between custom and standard indexes in pool {}.'\
             .format(pool))
-        return(sel_index, sindex_chosen)
+            warning_mixed_indexes += 1
+
+        # returns warnings for check summary
+        return(sel_index, sindex_chosen, warning_mixed_indexes, warning_component_length)
 
     def validate_index(self, index_seq, pool_name, sindex):
-        c = Counter(index_seq)
-        for index, index_count in c.most_common():
-            if(index_count>1):
-                logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
-                .format(index, pool_name))
-            else:
-                break
+        '''
+        does all the fancy index checks
+        '''
 
-        for index in index_seq: ###TODO currently, standard indexes contain other characters!
-            charRE = re.compile(r'[^ATCGatcg\-.]')
+        # allows for entry "noIndex" if only one sample is defined in the pool
+        c = Counter(index_seq)
+        if(c['noIndex'] > 0 and len(index_seq) != 1):
+            logger.error('Pool {} contains undefined index(es) (\"noIndex\")'\
+            .format(pool_name))
+            quit()
+        elif(c['noIndex'] == 1 and len(index_seq) == 1):
+            logger.info('Pool {} containing one sample is not indexed.'\
+            .format(pool_name))
+        elif(c['noIndex'] == 0):
+            # checks that all indexes in a pool are unique
+            for index, index_count in c.most_common():
+                if(index_count>1):
+                    logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
+                    .format(index, pool_name))
+                    quit()
+                else:
+                    break
+
+        warning_low_div = 0
+        index_count = 1
+        for index in index_seq:
+            # checks that indexes only contain valid letters
+            charRE = re.compile(r'[^ATCGNatcgn\-.]')
             index_search = charRE.search(index)
-            if(bool(index_search)):
-                logger.warning('The index sequence \"{}\" in pool {} contains invalid characters.'\
+            if(bool(index_search) and index != "noIndex"):
+                logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
+                ' Allowed characters: A/T/C/G/N/a/t/c/g/n/-'
                 .format(index, pool_name))
-
+                quit()
+
+            # check that indexes within a pool have minimum diversity
+            for i in range(index_count,len(index_seq)):
+                if lev.distance(index.lower(), index_seq[i].lower()) < LibrarySheet.MAX_DISTANCE:
+                    logger.warning('The index sequences {} and {} in pool {}'\
+                    ' display low diversity (only {} nt difference).'\
+                    .format(index,index_seq[i], pool_name, lev.distance(index.lower(), index_seq[i].lower())))
+                    warning_low_div += 1
+            index_count += 1
+
+        # checks index length
+        warning_index_length = 0
+        warning_index_balance = 0
         if(not sindex):
             index_length = []
             for index in index_seq:
@@ -354,7 +442,9 @@ def validate_index(self, index_seq, pool_name, sindex):
             if(len(count_length) > 1):
                 logger.warning('There are {} different index lengths in pool {}, please double check the sequences.'\
                 .format(len(count_length),pool_name))
+                warning_index_length += 1
 
+            # checks color balance in the pool
             max_length = sorted(count_length.keys())[-1]
             min_length = sorted(count_length.keys())[0]
             index_list_colour = []
@@ -370,19 +460,13 @@ def validate_index(self, index_seq, pool_name, sindex):
                     except IndexError:
                         pass
                 count_colour = Counter(column)
-            #    print(count_colour.values()[1])
                 if(len(count_colour)<2 and sum(count_colour.values()) > 1):
                     logger.warning('Indexes in pool {} unbalanced at position {}'\
                     .format(pool_name, row_nr+1))
-    #        a = get_row(index_list_colour,1)
-    #        print(a)
-        # check that index length is the same in all getSamples
+                    warning_index_balance += 1
 
-#def get_row(dic, row_nr):
-#    column = []
-#    for row in dic:
-#        column.append(row[row_nr])
-#    return(column)
+        # returns warnings for check summary
+        return(warning_low_div, warning_index_length, warning_index_balance)
 
 
 def main(input_sheet, config_statusDB):
@@ -392,11 +476,12 @@ def main(input_sheet, config_statusDB):
     Project_Information, project_plate_ID = sheetOI.ProjectInfo(config_statusDB)
     # validate the project name to ensure correct identification in couchDB
     sheetOI.validate_project_Name(Project_Information, project_plate_ID)
-    # get info about prep type
-#    prep_recommendations = sheetOI.prep_standards(Project_Information, recom_path)
     # validate all entries
-    sheetOI.validate()#Project_Information, config_statusDB)
-
+    pool_fail, poolIDs = sheetOI.validate(Project_Information)
+    # final check summary
+    logger.info(
+        'Library submission check complete. {}/{} pool(s) pass without warnings.'\
+        .format((poolIDs-pool_fail), poolIDs))
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description=__doc__)

From ea2d4e736ef87ef00ea7550b9c84df21a977727f Mon Sep 17 00:00:00 2001
From: FranBonath <franziska.bonath@scilifelab.se>
Date: Tue, 31 Mar 2020 16:36:06 +0200
Subject: [PATCH 3/4] responses to Phil's/Joel's comments, cleanup of script

---
 library_information_validator.py | 202 ++++++++++++++++---------------
 1 file changed, 102 insertions(+), 100 deletions(-)

diff --git a/library_information_validator.py b/library_information_validator.py
index c9a88b0..73738e5 100755
--- a/library_information_validator.py
+++ b/library_information_validator.py
@@ -14,6 +14,7 @@
 from numpy import setdiff1d
 from collections import Counter
 import Levenshtein as lev
+import sys
 
 # Set up a logger with colored output
 logger = logging.getLogger(__name__)
@@ -41,6 +42,7 @@ def __init__(self, library_info_sheet):
         self.library_info_sheet = library_info_sheet
         self.library_sheet = None
         self.sample_rec = None
+        self.Project_Information = None
 
     # instance methods
     def getAccessUserSheet(self):
@@ -57,12 +59,11 @@ def projectID(self):
         if(len(re.findall('P\d+P\d+', plate_id))>0):
             project_id_user = re.findall('P\d+', plate_id)[0]
         else:
-            logger.error(
+            sys.exit(logger.error(
                 'The given plate ID ({}) in cell {} has the wrong format. It should be in the format'
                 ' PxxxxxPx, where x are numbers. If you think your Plate ID is correct, contact your project coordinator.'\
                 .format(plate_id, LibrarySheet.PLATE_ID)
-                )
-            quit()
+                ))
         return([project_id_user, plate_id])
 
     def getRows(self, column):
@@ -102,40 +103,39 @@ def ProjectInfo(self, config):
         prow = project_id_found.rows
         # Project not found
         if len(prow) == 0:
-            logger.error(
+            sys.exit(logger.error(
                 'Project not found, please check your entry for the PlateID, it should have the format'
                 'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
-                )
-            quit()
+                ))
         # more than one project found
         elif len(prow) > 1:
-            logger.error(
+            sys.exit(logger.error(
                 'Project ID not unique, please check your entry for the PlateID, it should have the format'
                 'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
-                )
-            quit()
+                ))
         else:
             # puts the Document of the identified project in a new variable "pdoc"
-            pdoc = db.get(prow[0].id)
-            return pdoc, project_plate_ID[1]
+            self.Project_Information = db.get(prow[0].id)
+            self.Project_Plate_ID = project_plate_ID[1]
 
-    def validate_project_Name(self, info, project_plate_ID):
+    def validate_project_Name(self):
         """
         Prints the identified project name based on the user supplied Plate/Project ID for
         control purposes by the project coordinator. Further checks that the
         plate number is not already in couchDB.
         """
-        project_name_DB = info['project_name']
-        samples = info['samples'].keys()
-        plate ='P{}_{}'.format(project_plate_ID.split("P")[1],project_plate_ID.split("P")[2])
+    #    print(self.pdoc)
+        project_name_DB = self.Project_Information['project_name']
+        samples = self.Project_Information['samples'].keys()
+        plate ='P{}_{}'.format(self.Project_Plate_ID.split("P")[1],self.Project_Plate_ID.split("P")[2])
         found_plate = [s for s in samples if plate in s]
         warning_project_name = 0
         if(len(found_plate)>0):
-            new_plate_no = int(project_plate_ID.split("P")[2])
+            new_plate_no = int(self.Project_Plate_ID.split("P")[2])
             new_plate_no += 1
-            new_plate_ID = 'P{}P{}'.format(project_plate_ID.split("P")[1], new_plate_no)
+            new_plate_ID = 'P{}P{}'.format(self.Project_Plate_ID.split("P")[1], new_plate_no)
             logger.warning(
-                'Plate number {} is already used. Please increase the plate number to {}.'.format(project_plate_ID, new_plate_ID))
+                'Plate number {} is already used. Please increase the plate number to {}.'.format(self.Project_Plate_ID, new_plate_ID))
             warning_project_name = 1
         return(warning_project_name)
 
@@ -157,7 +157,7 @@ def validate_sequencing_setup(self, info, cell_id_length):
             warning_cycles += 1
         return(warning_cycles)
 
-    def validate(self, project_info):
+    def validate(self):
         """
         - identifies the samples in a pool
         - detects missing entry in pool column
@@ -173,9 +173,8 @@ def validate(self, project_info):
         if(len(cell_rowid_sample) > len(cell_rowid_pool)):
             missing_pool_rowid_list = setdiff1d(cell_rowid_sample, cell_rowid_pool)
             for missing_pool_rowid in missing_pool_rowid_list:
-                logger.error(
-                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid))
-            quit()
+                sys.exit(logger.error(
+                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid)))
 
         #initiate check for sequencing setup and discrepancies between ordered numbers
         #of cycles and average read length
@@ -186,7 +185,7 @@ def validate(self, project_info):
             cell_id_pool = "{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_COL, row_nr=row_nr)
             validator = Validator(self.library_sheet,cell_id_mol) # molarity is currently not checked
             result_numeric, warnings_numeric = validator.validate_numeric()
-            warnings_c = self.validate_sequencing_setup(project_info, cell_id_length)
+            warnings_c = self.validate_sequencing_setup(self.Project_Information, cell_id_length)
             warnings_cycle.append(warnings_c)
 
         #retrieve all pool IDs defined, in order to later analyse by pool
@@ -205,38 +204,42 @@ def validate(self, project_info):
         poolIDs = list(dict.fromkeys(pool_values))
         i = 0
         current_pool_rows = []
+        pools_with_warnings = []
         for pool in poolIDs:
             for nrow_nr in cell_rowid_sample:
                 current_cell_id_pool ="{col}{row_nr}".format(col=LibrarySheet.POOL_NAME_SAMPLE_COL, row_nr=nrow_nr)
                 current_cell_value_pool = self.library_sheet[current_cell_id_pool].value
                 if(current_cell_value_pool == pool):
                     current_pool_rows.append(nrow_nr)
-            validator = Validator(self.library_sheet, None)
-
-            result_index, sindex, warning_index_mix, \
-            warning_length_comp = validator.select_index(current_pool_rows, pool)
-
-            warning_low_div, warning_index_length,\
-            warning_index_balance = validator.validate_index(result_index, pool, sindex)
-
-            pool_warning = pool, [warning_low_div, warning_index_length, \
-            warning_index_balance, warnings_cycle[i], warning_index_mix, warning_length_comp]
-
-            pool_warnings.append(pool_warning)
+            validator = Validator(self.library_sheet)
+
+            dictionary_sel_index = validator.select_index(current_pool_rows, pool)
+            dictionary_val_index = \
+            validator.validate_index(dictionary_sel_index["result_index"],
+                                    pool,
+                                    dictionary_sel_index["sindex"]
+
+                                    )
+            # summary of warnings
+            all_warnings = [
+                            dictionary_val_index["warning_low_div"],
+                            dictionary_val_index["warning_index_length"],
+                            dictionary_val_index["warning_index_balance"],
+                            warnings_cycle[i],
+                            dictionary_sel_index["warning_index_mix"],
+                            dictionary_sel_index["warning_length_comp"]
+                            ]
+            sums_warnings = sum(all_warnings)
+            if(sums_warnings > 0):
+                pools_with_warnings.append(pool)
             current_pool_rows =[]
             i += 1
 
-        # summarise warnings and return
-        pools_with_warnings = []
-        for warning in pool_warnings:
-            sums_warnings = sum(warning[1])
-            if(sums_warnings > 0):
-                pools_with_warnings.append(warning[0])
         return(len(pools_with_warnings), len(poolIDs))
 
 class Validator(object):
     # Initializer / Instance attributes
-    def __init__(self, access_sample_info_sheet, molarityID):
+    def __init__(self, access_sample_info_sheet, molarityID = None):
         self.access_sample_info_sheet = access_sample_info_sheet
         self.molarityID = molarityID
 
@@ -324,9 +327,8 @@ def select_index(self, pool_rows, pool):
 
                 # generates error if both custom and NGI standard index are selected for the same sample
                 if (self.access_sample_info_sheet["{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=sindex.split(LibrarySheet.SINDEX_COL)[1])].value is not None):
-                    logger.error('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
-                    .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1]))
-                    quit()
+                    sys.exit(logger.error('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
+                    .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1])))
 
         # retrieves index sequences for custom indexes
         cindex_list = []
@@ -356,15 +358,13 @@ def select_index(self, pool_rows, pool):
             sel_index = sindex_list
             if(len(sindex_list) != len(pool_rows)):
                 for absent_index in sindex_absent:
-                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
-                    quit()
+                    sys.exit(logger.error("missing index in row {}".format(re.sub("\D","",absent_index))))
         elif(len(sindex_absent) == len(pool_rows)):
             sel_index = cindex_list
             sindex_chosen = False
             if(len(cindex_list) != len(pool_rows)):
                 for absent_index in cindex_absent:
-                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
-                    quit()
+                    sys.exit(logger.error("missing index in row {}".format(re.sub("\D","",absent_index))))
         else:
             sindex_chosen = False
 
@@ -382,102 +382,104 @@ def select_index(self, pool_rows, pool):
             .format(pool))
             warning_mixed_indexes += 1
 
+        dictionary_index = {
+        "result_index" : sel_index,
+        "sindex" : sindex_chosen,
+        "warning_index_mix" : warning_mixed_indexes,
+        "warning_length_comp" : warning_component_length
+        }
+
         # returns warnings for check summary
-        return(sel_index, sindex_chosen, warning_mixed_indexes, warning_component_length)
+        return(dictionary_index)
 
     def validate_index(self, index_seq, pool_name, sindex):
         '''
         does all the fancy index checks
         '''
-
         # allows for entry "noIndex" if only one sample is defined in the pool
         c = Counter(index_seq)
         if(c['noIndex'] > 0 and len(index_seq) != 1):
-            logger.error('Pool {} contains undefined index(es) (\"noIndex\")'\
-            .format(pool_name))
-            quit()
+            sys.exit(logger.error('Pool {} contains undefined index(es) (\"noIndex\")'.format(pool_name)))
         elif(c['noIndex'] == 1 and len(index_seq) == 1):
-            logger.info('Pool {} containing one sample is not indexed.'\
-            .format(pool_name))
+            logger.info('Pool {} containing one sample is not indexed.'.format(pool_name))
         elif(c['noIndex'] == 0):
             # checks that all indexes in a pool are unique
             for index, index_count in c.most_common():
                 if(index_count>1):
-                    logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
-                    .format(index, pool_name))
-                    quit()
+                    sys.exit(logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
+                    .format(index, pool_name)))
                 else:
                     break
 
         warning_low_div = 0
         index_count = 1
+        charRE = re.compile(r'[^ATCGNatcgn\-\.]')
         for index in index_seq:
             # checks that indexes only contain valid letters
-            charRE = re.compile(r'[^ATCGNatcgn\-.]')
             index_search = charRE.search(index)
             if(bool(index_search) and index != "noIndex"):
-                logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
+                sys.exit(logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
                 ' Allowed characters: A/T/C/G/N/a/t/c/g/n/-'
-                .format(index, pool_name))
-                quit()
+                .format(index, pool_name)))
 
             # check that indexes within a pool have minimum diversity
             for i in range(index_count,len(index_seq)):
-                if lev.distance(index.lower(), index_seq[i].lower()) < LibrarySheet.MAX_DISTANCE:
+                levenshtein_distance = lev.distance(index.lower(), index_seq[i].lower())
+                if levenshtein_distance < LibrarySheet.MAX_DISTANCE:
                     logger.warning('The index sequences {} and {} in pool {}'\
                     ' display low diversity (only {} nt difference).'\
-                    .format(index,index_seq[i], pool_name, lev.distance(index.lower(), index_seq[i].lower())))
+                    .format(index,index_seq[i], pool_name, levenshtein_distance))
                     warning_low_div += 1
             index_count += 1
 
         # checks index length
         warning_index_length = 0
         warning_index_balance = 0
-        if(not sindex):
-            index_length = []
-            for index in index_seq:
-                index_length.append(len(index))
-            count_length = Counter(index_length)
-
-            if(len(count_length) > 1):
-                logger.warning('There are {} different index lengths in pool {}, please double check the sequences.'\
-                .format(len(count_length),pool_name))
-                warning_index_length += 1
-
-            # checks color balance in the pool
-            max_length = sorted(count_length.keys())[-1]
-            min_length = sorted(count_length.keys())[0]
-            index_list_colour = []
-            for index in index_seq:
-                index_colour = index.replace('T','G').replace('A','R').replace('C', 'R')
-                index_list_colour.append(list(index_colour))
-
-            for row_nr in range(0,max_length):
-                column = []
-                for row in index_list_colour:
-                    try:
-                        column.append(row[row_nr])
-                    except IndexError:
-                        pass
-                count_colour = Counter(column)
-                if(len(count_colour)<2 and sum(count_colour.values()) > 1):
-                    logger.warning('Indexes in pool {} unbalanced at position {}'\
-                    .format(pool_name, row_nr+1))
-                    warning_index_balance += 1
+        index_length = []
+        for index in index_seq:
+            index_length.append(len(index))
+        count_length = Counter(index_length)
 
+        # checks color balance in the pool
+        max_length = sorted(count_length.keys())[-1]
+        min_length = sorted(count_length.keys())[0]
+        index_list_colour = []
+        for index in index_seq:
+            index_colour = index.replace('T','G').replace('A','B').replace('C', 'R')
+            index_list_colour.append(list(index_colour))
+        for row_nr in range(0,max_length):
+            column = []
+            for row in index_list_colour:
+                try:
+                    column.append(row[row_nr])
+                except IndexError:                      # exception for pools with different index length of samples
+                    logger.warning('There are {} different index lengths in pool {}, please double check the sequences.'\
+                    .format(len(count_length),pool_name))
+                    warning_index_length += 1
+            count_colour = Counter(column)
+            if(len(count_colour)<2 and sum(count_colour.values()) > 1):
+                logger.warning('Indexes in pool {} unbalanced at position {}'\
+                .format(pool_name, row_nr+1))
+                warning_index_balance += 1
+
+        dictionary_val_index = {
+        "warning_low_div" : warning_low_div,
+        "warning_index_length" : warning_index_length,
+        "warning_index_balance" : warning_index_balance
+        }
         # returns warnings for check summary
-        return(warning_low_div, warning_index_length, warning_index_balance)
+        return(dictionary_val_index)
 
 
 def main(input_sheet, config_statusDB):
     # Instantiate the LibrarySheet object
     sheetOI = LibrarySheet(input_sheet)
     # get Project Information from couchDB
-    Project_Information, project_plate_ID = sheetOI.ProjectInfo(config_statusDB)
+    sheetOI.ProjectInfo(config_statusDB)
     # validate the project name to ensure correct identification in couchDB
-    sheetOI.validate_project_Name(Project_Information, project_plate_ID)
+    sheetOI.validate_project_Name()
     # validate all entries
-    pool_fail, poolIDs = sheetOI.validate(Project_Information)
+    pool_fail, poolIDs = sheetOI.validate()
     # final check summary
     logger.info(
         'Library submission check complete. {}/{} pool(s) pass without warnings.'\

From fdb78966a1fbfc0cde5279d7b64f03aea53ce895 Mon Sep 17 00:00:00 2001
From: FranBonath <franziska.bonath@scilifelab.se>
Date: Thu, 2 Apr 2020 10:05:02 +0200
Subject: [PATCH 4/4] changed sys.exit() code from logger.error to 1

---
 library_information_validator.py | 44 ++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/library_information_validator.py b/library_information_validator.py
index 73738e5..fa6260a 100755
--- a/library_information_validator.py
+++ b/library_information_validator.py
@@ -59,11 +59,12 @@ def projectID(self):
         if(len(re.findall('P\d+P\d+', plate_id))>0):
             project_id_user = re.findall('P\d+', plate_id)[0]
         else:
-            sys.exit(logger.error(
+            logger.error(
                 'The given plate ID ({}) in cell {} has the wrong format. It should be in the format'
                 ' PxxxxxPx, where x are numbers. If you think your Plate ID is correct, contact your project coordinator.'\
                 .format(plate_id, LibrarySheet.PLATE_ID)
-                ))
+                )
+            sys.exit(1)
         return([project_id_user, plate_id])
 
     def getRows(self, column):
@@ -103,16 +104,18 @@ def ProjectInfo(self, config):
         prow = project_id_found.rows
         # Project not found
         if len(prow) == 0:
-            sys.exit(logger.error(
+            logger.error(
                 'Project not found, please check your entry for the PlateID, it should have the format'
                 'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
-                ))
+                )
+            sys.exit(1)
         # more than one project found
         elif len(prow) > 1:
-            sys.exit(logger.error(
+            logger.error(
                 'Project ID not unique, please check your entry for the PlateID, it should have the format'
                 'PxxxxxPx, where x are numbers. If your Plate ID is correct, contact your project coordinator.'
-                ))
+                )
+            sys.exit(1)
         else:
             # puts the Document of the identified project in a new variable "pdoc"
             self.Project_Information = db.get(prow[0].id)
@@ -173,8 +176,9 @@ def validate(self):
         if(len(cell_rowid_sample) > len(cell_rowid_pool)):
             missing_pool_rowid_list = setdiff1d(cell_rowid_sample, cell_rowid_pool)
             for missing_pool_rowid in missing_pool_rowid_list:
-                sys.exit(logger.error(
-                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid)))
+                logger.error(
+                    'Missing pool definition in {}{}'.format(LibrarySheet.POOL_NAME_SAMPLE_COL, missing_pool_rowid))
+                sys.exit(1)
 
         #initiate check for sequencing setup and discrepancies between ordered numbers
         #of cycles and average read length
@@ -327,8 +331,9 @@ def select_index(self, pool_rows, pool):
 
                 # generates error if both custom and NGI standard index are selected for the same sample
                 if (self.access_sample_info_sheet["{col}{row_nr}".format(col=LibrarySheet.CINDEX_COL, row_nr=sindex.split(LibrarySheet.SINDEX_COL)[1])].value is not None):
-                    sys.exit(logger.error('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
-                    .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1])))
+                    logger.error('Custom and Standard Index selected for the sample in fields {}{} and {}{}. Please clarify which of the two indexes was used.'\
+                    .format(LibrarySheet.SINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1], LibrarySheet.CINDEX_COL, sindex.split(LibrarySheet.SINDEX_COL)[1]))
+                    sys.exit(1)
 
         # retrieves index sequences for custom indexes
         cindex_list = []
@@ -358,13 +363,15 @@ def select_index(self, pool_rows, pool):
             sel_index = sindex_list
             if(len(sindex_list) != len(pool_rows)):
                 for absent_index in sindex_absent:
-                    sys.exit(logger.error("missing index in row {}".format(re.sub("\D","",absent_index))))
+                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
+                    sys.exit(1)
         elif(len(sindex_absent) == len(pool_rows)):
             sel_index = cindex_list
             sindex_chosen = False
             if(len(cindex_list) != len(pool_rows)):
                 for absent_index in cindex_absent:
-                    sys.exit(logger.error("missing index in row {}".format(re.sub("\D","",absent_index))))
+                    logger.error("missing index in row {}".format(re.sub("\D","",absent_index)))
+                    sys.exit(1)
         else:
             sindex_chosen = False
 
@@ -399,15 +406,17 @@ def validate_index(self, index_seq, pool_name, sindex):
         # allows for entry "noIndex" if only one sample is defined in the pool
         c = Counter(index_seq)
         if(c['noIndex'] > 0 and len(index_seq) != 1):
-            sys.exit(logger.error('Pool {} contains undefined index(es) (\"noIndex\")'.format(pool_name)))
+            logger.error('Pool {} contains undefined index(es) (\"noIndex\")'.format(pool_name))
+            sys.exit(1)
         elif(c['noIndex'] == 1 and len(index_seq) == 1):
             logger.info('Pool {} containing one sample is not indexed.'.format(pool_name))
         elif(c['noIndex'] == 0):
             # checks that all indexes in a pool are unique
             for index, index_count in c.most_common():
                 if(index_count>1):
-                    sys.exit(logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
-                    .format(index, pool_name)))
+                    logger.error('The index sequence \"{}\" in pool {} is not unique for this pool.'\
+                    .format(index, pool_name))
+                    sys.exit(1)
                 else:
                     break
 
@@ -418,9 +427,10 @@ def validate_index(self, index_seq, pool_name, sindex):
             # checks that indexes only contain valid letters
             index_search = charRE.search(index)
             if(bool(index_search) and index != "noIndex"):
-                sys.exit(logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
+                logger.error('The index sequence \"{}\" in pool {} contains invalid characters.'\
                 ' Allowed characters: A/T/C/G/N/a/t/c/g/n/-'
-                .format(index, pool_name)))
+                .format(index, pool_name))
+                sys.exit(1)
 
             # check that indexes within a pool have minimum diversity
             for i in range(index_count,len(index_seq)):