diff --git a/DeepSequence/helper.py b/DeepSequence/helper.py index 8ffdbf5..27165ae 100644 --- a/DeepSequence/helper.py +++ b/DeepSequence/helper.py @@ -1,4 +1,8 @@ from __future__ import print_function +from collections import defaultdict +import cPickle +import os + import numpy as np import theano import theano.tensor as T @@ -9,14 +13,18 @@ class DataHelper: def __init__(self, - dataset="", - alignment_file="", - focus_seq_name="", - calc_weights=True, - working_dir=".", - theta=0.2, - load_all_sequences=True, - alphabet_type="protein"): + dataset="", + alignment_file="", + focus_seq_name="", + calc_weights=True, + working_dir=".", + theta=None, + load_all_sequences=True, + alphabet_type="protein", + weights_dir="", + save_weights=False, + alignments_dir=None, + ): """ Class to load and organize alignment data. @@ -42,6 +50,8 @@ def __init__(self, load_all_sequences: alphabet_type: Alphabet type of associated dataset. Options are DNA, RNA, protein, allelic + weights_dir: location of the weights, assumed to be in form: _.npy, + where etc could be theta_0.x Returns ------------ @@ -50,19 +60,24 @@ def __init__(self, np.random.seed(42) self.dataset = dataset + self.dataset = self.dataset.split(".a2m")[0] # Remove prefix (if there's no prefix, this will still be ok) self.alignment_file = alignment_file self.focus_seq_name = focus_seq_name self.working_dir = working_dir self.calc_weights = calc_weights self.alphabet_type = alphabet_type + self.weights_dir = weights_dir + self.save_weights = save_weights + self.alignments_dir = alignments_dir # Initalize the elbo of the wt to None # will be useful if eventually doing mutation effect prediction self.wt_elbo = None # Alignment processing parameters - self.theta = theta - + # Note: Script will fail if calc_weights is True and theta is not set + if theta is not None: + self.theta = theta # If I am running tests with the model, I don't need all the # sequences loaded self.load_all_sequences = load_all_sequences @@ -75,9 +90,15 @@ def __init__(self, if self.alphabet_type == "protein": self.alphabet = "ACDEFGHIKLMNPQRSTVWY" self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW" + elif self.alphabet_type == "protein_withgap": + self.alphabet = "ACDEFGHIKLMNPQRSTVWY-" + self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW-" elif self.alphabet_type == "RNA": self.alphabet = "ACGU" self.reorder_alphabet = "ACGU" + elif self.alphabet_type == "RNA_withgap": + self.alphabet = "ACGU-" + self.reorder_alphabet = "ACGU-" elif self.alphabet_type == "DNA": self.alphabet = "ACGT" self.reorder_alphabet = "ACGT" @@ -85,7 +106,7 @@ def __init__(self, self.alphabet = "012" self.reorder_alphabet = "012" - #then generate the experimental data + # then generate the experimental data self.gen_basic_alignment() if self.load_all_sequences: @@ -94,44 +115,178 @@ def __init__(self, def configure_datasets(self): if self.dataset == "BLAT_ECOLX": - self.alignment_file = self.working_dir+"/datasets/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m" + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "PTEN_HUMAN_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/PTEN_HUMAN_1_b0.3.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 
+ + elif self.dataset == "HIS7_YEAST_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS7_YEAST_1_b0.5.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "P53_HUMAN_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "SNORNA_YEAST_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/CL00100_cmRF00012_m70_f50.a2m" + self.alphabet_type = "RNA_withgap" + self.theta = 0.2 + + elif self.dataset == 'naive_repertoire_fullseqs_withgaps': + self.alignment_file = self.working_dir + '/datasets/alignments/naive_repertoire_annotated_aligned_fullseqs.fa' + self.alphabet_type = "protein_withgap" self.theta = 0.2 elif self.dataset == "PABP_YEAST": - self.alignment_file = self.working_dir+"/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m" + self.alignment_file = self.working_dir + "/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m" self.theta = 0.2 elif self.dataset == "DLG4_RAT": - self.alignment_file = self.working_dir+"/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m" + self.alignment_file = self.working_dir + "/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m" self.theta = 0.2 elif self.dataset == "trna": - self.alignment_file = self.working_dir+"/datasets/RF00005_CCU.fasta" + self.alignment_file = self.working_dir + "/datasets/RF00005_CCU.fasta" self.alphabet_type = "RNA" self.theta = 0.2 - else: - self.alignment_file = self.working_dir+'/datasets/alignments/'+self.dataset+'.a2m' + elif self.dataset == "PA_FLU": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m" + self.theta = 0.2 + + elif self.dataset == "PA_FLU_orig": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m" + self.theta = 0.01 + + elif self.dataset == "PA_FLU_jonny": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_jonny_1_b0.5.a2m" + self.theta = 0.2 + self.calc_weights = False + + elif self.dataset == "DPO1_KLULA": + self.alignment_file = self.working_dir + "/datasets/alignments/DPO1_KLULA_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "DRTS_PLAFK": + self.alignment_file = self.working_dir + "/datasets/alignments/DRTS_PLAFK_r1-280_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_YEAST": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_YEAST_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_THEMA_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "TRP_YEAST": + self.alignment_file = self.working_dir + "/datasets/alignments/TRP_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPB1_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPB1_THEMA_b0.4.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPB2_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPB2_THEMA_b0.4.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPF_YEAST": + 
self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPF_YEAST_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_1_seqid0.3": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5_seqid0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_1": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m" + self.theta = 0.2 + + elif self.dataset == "AMIE_PSEAE_1_seqid0.3": + self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3_seqid0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "AMIE_PSEAE_1": + self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "P53_HUMAN": + self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m" + self.theta = 0.2 + + elif self.dataset == 'BF520_env': + self.alignment_file = self.working_dir + '/datasets/alignments/BF520_env_1_b0.5.a2m' + self.theta = 0.01 + + elif self.dataset == 'BG505_env': + self.alignment_file = self.working_dir + '/datasets/alignments/BG505_env_1_b0.5.a2m' + self.theta = 0.01 + + elif self.dataset == 'deltaNTRP_YEAST': + self.alignment_file = self.working_dir + '/datasets/alignments/deltaNTRP_YEAST_b0.4.a2m' + self.theta = 0.2 + + elif self.dataset == 'PfDHFR': + self.alignment_file = self.working_dir + '/datasets/alignments/PfDHFR_b0.1.a2m' + self.theta = 0.2 + + elif self.dataset == 'TP-DNAP1': + self.alignment_file = self.working_dir + '/datasets/alignments/TP-DNAP1_b0.1.a2m' + self.theta = 0.2 + + elif self.dataset == 'BRCA1_HUMAN': + self.alignment_file = self.working_dir + '/datasets/alignments/BRCA1_HUMAN_1_b0.5.a2m' self.theta = 0.2 + else: + if self.alignments_dir is not None: + self.alignment_file = os.path.join(self.alignments_dir, self.dataset + ".a2m") + else: + self.alignment_file = self.working_dir + '/datasets/alignments/' + self.dataset + '.a2m' + assert os.path.isfile(self.alignment_file), "Alignment file not found: " + self.alignment_file + def one_hot_3D(self, s): """ Transform sequence string into one-hot aa vector""" # One-hot encode as row vector x = np.zeros((len(s), len(self.alphabet))) for i, letter in enumerate(s): if letter in self.aa_dict: - x[i , self.aa_dict[letter]] = 1 + x[i, self.aa_dict[letter]] = 1 return x def gen_basic_alignment(self): """ Read training alignment and store basics in class instance """ # Make a dictionary that goes from aa to a number for one-hot self.aa_dict = {} - for i,aa in enumerate(self.alphabet): + for i, aa in enumerate(self.alphabet): self.aa_dict[aa] = i # Do the inverse as well - self.num_to_aa = {i:aa for aa,i in self.aa_dict.items()} + self.num_to_aa = {i: aa for aa, i in self.aa_dict.items()} ix = np.array([self.alphabet.find(s) for s in self.reorder_alphabet]) @@ -165,8 +320,12 @@ def gen_basic_alignment(self): # We also expect the focus sequence to be formatted as: # >[NAME]/[start]-[end] - focus_loc = self.focus_seq_name.split("/")[-1] - start,stop = focus_loc.split("-") + focus_loc = self.focus_seq_name + # Can include extra information (e.g. 
a custom weight) after the fasta header + if ':' in focus_loc: + focus_loc = focus_loc[:focus_loc.rfind(':')] + focus_loc = focus_loc.split("/")[-1] + start, stop = focus_loc.split("-") self.focus_start_loc = int(start) self.focus_stop_loc = int(stop) self.uniprot_focus_cols_list \ @@ -176,21 +335,20 @@ def gen_basic_alignment(self): self.uniprot_focus_col_to_focus_idx \ = {idx_col+int(start):idx_col for idx_col in self.focus_cols} - def gen_full_alignment(self): # Get only the focus columns - for seq_name,sequence in self.seq_name_to_sequence.items(): + for seq_name, sequence in self.seq_name_to_sequence.items(): # Replace periods with dashes (the uppercase equivalent) - sequence = sequence.replace(".","-") + sequence = sequence.replace(".", "-") - #then get only the focus columns + # then get only the focus columns self.seq_name_to_sequence[seq_name] = [sequence[ix].upper() for ix in self.focus_cols] # Remove sequences that have bad characters alphabet_set = set(list(self.alphabet)) seq_names_to_remove = [] - for seq_name,sequence in self.seq_name_to_sequence.items(): + for seq_name, sequence in self.seq_name_to_sequence.items(): for letter in sequence: if letter not in alphabet_set and letter != "-": seq_names_to_remove.append(seq_name) @@ -200,17 +358,16 @@ def gen_full_alignment(self): del self.seq_name_to_sequence[seq_name] # Encode the sequences - print ("Encoding sequences") - self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()),len(self.focus_cols),len(self.alphabet))) + print("Encoding sequences") + self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()), len(self.focus_cols), len(self.alphabet))) self.x_train_name_list = [] - for i,seq_name in enumerate(self.seq_name_to_sequence.keys()): + for i, seq_name in enumerate(self.seq_name_to_sequence.keys()): sequence = self.seq_name_to_sequence[seq_name] self.x_train_name_list.append(seq_name) - for j,letter in enumerate(sequence): + for j, letter in enumerate(sequence): if letter in self.aa_dict: k = self.aa_dict[letter] - self.x_train[i,j,k] = 1.0 - + self.x_train[i, j, k] = 1.0 # Fast sequence weights with Theano if self.calc_weights: @@ -229,16 +386,58 @@ def gen_full_alignment(self): # self.weights = weightfun(self.x_train, self.theta)[0] + if self.save_weights: + print("Saving sequence weights, dataset={} in dir {}".format(self.dataset, self.weights_dir)) + if os.path.isdir(self.weights_dir): + weights_dir_found = self.weights_dir + else: + weights_dir_found = os.path.join(self.working_dir, self.weights_dir) + assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {} using working_dir".format(self.weights_dir, weights_dir_found) + # e.g. 
BLAT_ECOLX_theta_0.2.npy + filename_out = os.path.join(weights_dir_found, "{}_theta_{}.npy".format(self.dataset, self.theta)) + print("Saving weights to {}".format(filename_out)) + np.save(filename_out, self.weights) + else: - # If not using weights, use an isotropic weight matrix - self.weights = np.ones(self.x_train.shape[0]) + if ':' in self.focus_seq_name: + print("Loading detected sequence weights") + self.weights = np.zeros(self.x_train.shape[0]) + for i, seq_name in enumerate(self.x_train_name_list): + self.weights[i] = float(seq_name.split(':')[-1]) + elif self.weights_dir != "": + print("Loading sequence weights from file, looking for {} in {}".format(self.dataset, self.weights_dir)) + + # Get UniProt ID (the prefix before the second underscore) + # TODO Note: This fails for some of the MSA,weight pairs in the original DeepSeq dataset, + # since they don't have unique UniProt ids + dataset_prefix = "_".join(self.dataset.split("_")[:2]) + # Set path + if os.path.isdir(self.weights_dir): + weights_dir_found = self.weights_dir + else: + weights_dir_found = os.path.join(self.working_dir, self.weights_dir) + assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {}".format( + self.weights_dir, weights_dir_found) + + # Find weights file using dataset prefix + found = [file for file in os.listdir(weights_dir_found) if file.startswith(dataset_prefix) and file.endswith(".npy")] + assert len(found) == 1, \ + "Could not find unique weights file for dataset {} with prefix {}, in {}, found {} files"\ + .format(self.dataset, dataset_prefix, weights_dir_found, found) + weights_location = os.path.join(weights_dir_found, found[0]) + + self.weights = np.load(file=weights_location) + print("Weights loaded from {}".format(weights_location)) + else: + # If not using weights, use an isotropic weight matrix + print("Not using weights, using isotropic weight matrix") + self.weights = np.ones(self.x_train.shape[0]) self.Neff = np.sum(self.weights) print ("Neff =",str(self.Neff)) print ("Data Shape =",self.x_train.shape) - def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): for pos,wt_aa,mut_aa in mutant_tuple_list: @@ -251,7 +450,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): for pos,wt_aa,mut_aa in mutant_tuple_list: mut_seq[self.uniprot_focus_col_to_focus_idx[pos]] = mut_aa - if self.wt_elbo == None: mutant_sequences = [self.focus_seq_trimmed, mut_seq] else: @@ -269,7 +467,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): prediction_matrix = np.zeros((mutant_sequences_one_hot.shape[0],N_pred_iterations)) idx_batch = np.arange(mutant_sequences_one_hot.shape[0]) for i in range(N_pred_iterations): - batch_preds, _, _ = model.all_likelihood_components(mutant_sequences_one_hot) prediction_matrix[:,i] = batch_preds @@ -463,7 +660,171 @@ def custom_mutant_matrix(self, input_filename, model, N_pred_iterations=10, \ OUTPUT.write(descriptor+";"+str(self.delta_elbos[i])+"\n") OUTPUT.close() - + + def custom_mutant_matrix_df(self, input_filename, model, mutant_col, effect_col, N_pred_iterations=2000, \ + minibatch_size=2000, output_filename_prefix="", silent_allowed=False, + lowercase_allowed=False, random_seed=None): + + """ Predict the delta elbo for a custom mutation filename, using pandas dataframes. 
+ """ + if silent_allowed: + print("Silent mutations allowed") + if lowercase_allowed: + print("Lowercase allowed") + import pandas as pd # Need pandas for this function + + # run through the input file + if not os.path.isfile(input_filename): + input_filename = os.path.join(self.working_dir, input_filename) + assert os.path.isfile(input_filename), "File not found: " + input_filename + + print("Going through input file:", input_filename) + df_input = pd.read_csv(input_filename) + print(df_input.head(30)) + print(len(df_input), "rows") + assert mutant_col in df_input.columns, "Mutant column {} not found in input file, columns={}".format(mutant_col, str(df_input.columns)) + assert effect_col in df_input.columns, "Effect column {} not found in input file, columns={}".format(effect_col, str(df_input.columns)) + print("cleaning up dms") + df, _ = DMS_file_cleanup(DMS_data=df_input, target_seq=self.focus_seq, alphabet=self.alphabet, + DMS_mutant_column=mutant_col, DMS_phenotype_name=effect_col) + print(df.head(30)) + print(len(df), "rows after cleanup") + + # Get the start and end index from the sequence name + start_idx, end_idx = self.focus_seq_name.split("/")[-1].split("-") + start_idx = int(start_idx) + + # wt_pos_focus_idx_tuple_list = [] + focus_seq_index = 0 + # focus_seq_list = [] + mutant_to_letter_pos_idx_focus_list = {} + + # find all possible valid mutations that can be run with this alignment + print("Generating valid mutations") + print("focus_seq", self.focus_seq_name, ":", self.focus_seq) + print("focus_seq_trimmed ", self.focus_seq_trimmed) + + assert model.seq_len == len(self.focus_cols) + + for i, letter in enumerate(self.focus_seq): + if i % 10 == 0: + print(i) + + # seq=aBcdE, focus_seq_trimmed=BE, + # then A1B will be invalid (lowercase); + # B2M is valid and the mutated focus_seq is ME (mutating index 0) + if letter == letter.upper(): # or lowercase_allowed: + for mut in self.alphabet: + pos = start_idx + i + if letter != mut: + mutant = letter + str(pos) + mut + mutant_to_letter_pos_idx_focus_list[mutant] = [letter, pos, focus_seq_index] + focus_seq_index += 1 + # else: # for debugging + # print("lowercase letter: ", focus_seq_index, letter) + + print("All possible (single?) 
mutations:", mutant_to_letter_pos_idx_focus_list) + + def get_mutated_seq(mutants): + mutant_list = mutants.split(":") + focus_seq_copy = list(self.focus_seq_trimmed)[:] + + for mutant in mutant_list: + wt_aa, pos, idx_focus = mutant_to_letter_pos_idx_focus_list[mutant] + mut_aa = mutant[-1] + focus_seq_copy[idx_focus] = mut_aa + + return focus_seq_copy + + print("Getting mutated sequences") + # First sequence is the wt sequence + mutant_sequences = [list(self.focus_seq_trimmed)[:]] + valid_mutants = ['wt'] + # df_input.apply(lambda row: get_mutated_seq(row['mutant']), axis=1).to_list() + for mutants in df['mutant'].to_list(): + assert isinstance(mutants, str), "mutants is not a string: " + str(mutants) + mutant_list = mutants.split(":") + + valid_mutant = True + # TODO handle silent mutations, don't raise errors for them + # if any of the mutants in this list aren"t in the focus sequence, + # I cannot make a prediction + for mutant in mutant_list: + if mutant not in mutant_to_letter_pos_idx_focus_list: + valid_mutant = False + print("Invalid mutant:", mutant) + print("mutant_list", mutant_list) + break + + # If it is a valid mutant, add it to my list to make predictions + if valid_mutant: + focus_seq_copy = list(self.focus_seq_trimmed)[:] + + for mutant in mutant_list: + wt_aa, pos, idx_focus = mutant_to_letter_pos_idx_focus_list[mutant] + mut_aa = mutant[-1] + focus_seq_copy[idx_focus] = mut_aa + + mutant_sequences.append("".join(focus_seq_copy)) + valid_mutants.append(mutants) + + print("Number of valid mutant sequences:", len(mutant_sequences)) + if len(mutant_sequences) == 0: + raise ValueError("No valid mutant sequences found") + + print("Making one-hot matrix") + # Then make the one hot sequence + mutant_sequences_one_hot = np.zeros((len(mutant_sequences), len(self.focus_cols), len(self.alphabet))) + for i, sequence in enumerate(mutant_sequences): + if i % 10 == 0: + print(i) + for j, letter in enumerate(sequence): + k = self.aa_dict[letter] + mutant_sequences_one_hot[i, j, k] = 1.0 + + prediction_matrix = np.zeros((mutant_sequences_one_hot.shape[0], N_pred_iterations)) + batch_order = np.arange(mutant_sequences_one_hot.shape[0]) + + print("Getting ELBOs over ", N_pred_iterations, " iterations and ", minibatch_size, " size minibatches") + print("Prediction matrix size:", prediction_matrix.shape) + + # Why not batch along the iterations direction? if we have mutants < minibatch_size this doesn't help. + # Although if iterations is too big, it may not fit in a minibatch. + # And if mutants is too big, it may also not fit. 
+ for i in range(N_pred_iterations): + if i % 10 == 0: + print(i) + np.random.shuffle(batch_order) + for j in range(0, mutant_sequences_one_hot.shape[0], minibatch_size): + batch_index = batch_order[j:j + minibatch_size] + batch_preds, _, _ = model.all_likelihood_components(mutant_sequences_one_hot[batch_index]) + + for k, idx_batch in enumerate(batch_index.tolist()): + prediction_matrix[idx_batch][i] = batch_preds[k] + + # Then take the mean of all my elbo samples + mean_elbos = np.mean(prediction_matrix, axis=1).flatten().tolist() + + # Remove the wild type sequence + wt_elbo = mean_elbos.pop(0) + valid_mutants.pop(0) + + delta_elbos = np.asarray(mean_elbos) - wt_elbo + assert len(delta_elbos) == len(valid_mutants), "delta_elbos and valid_mutants should be the same length" + str( + len(delta_elbos)) + " " + str(len(valid_mutants)) + elbos = pd.DataFrame({'DeepSequence': delta_elbos, 'mutant': valid_mutants}) + print("elbos:\n", len(elbos), elbos.head(30)) + df = df.merge(elbos, how='inner', on='mutant') + print("merged:\n", len(df), df.head()) + + print("Saving to file") + output_filename = output_filename_prefix + "_samples-" + str(N_pred_iterations) + "_elbo_predictions.csv" + if random_seed is not None: + output_filename = output_filename_prefix + "_samples-" + str(N_pred_iterations) + "_seed-" + str(random_seed) + "_elbo_predictions.csv" + df.to_csv(output_filename, index=False) + + print("Written to file", output_filename) + def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ minibatch_size=2000, filename_prefix="", offset=0): @@ -483,15 +844,18 @@ def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ self.mutant_sequences_descriptor = ["wt"] # run through the input file - INPUT = open(self.working_dir+"/"+input_filename, "r") + if not os.path.isfile(input_filename): + input_filename = os.path.join(self.working_dir, input_filename) + assert os.path.isfile(input_filename), "File not found: "+input_filename + + INPUT = open(input_filename, "r") #INPUT = open(input_filename, "r") header = '' new_line = '' alphabet = set(self.alphabet) - print(self.focus_seq) - print(self.focus_seq) - print(self.focus_seq_trimmed) + print("focus_seq", self.focus_seq) + print("focus_seq_trimmed", self.focus_seq_trimmed) for i,line in enumerate(INPUT): line = line.rstrip() @@ -513,12 +877,7 @@ def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ if len(new_line) == len(self.focus_seq_trimmed): self.mutant_sequences.append(new_line) self.mutant_sequences_descriptor.append(header) - - print(self.mutant_sequences_descriptor[0]) - print(self.mutant_sequences[0]) - print(self.mutant_sequences_descriptor[1]) - print(self.mutant_sequences[1]) - + INPUT.close() print(self.alphabet) # Then make the one hot sequence @@ -590,7 +949,6 @@ def get_pattern_activations(self, model, update_num, filename_prefix="", OUTPUT.close() - def get_embeddings(self, model, update_num, filename_prefix="", verbose=False, minibatch_size=2000): """ Save the latent variables from all the sequences in the alignment """ @@ -609,7 +967,6 @@ def get_embeddings(self, model, update_num, filename_prefix="", header_list = mu_header_list + log_sigma_header_list OUTPUT.write("update_num,name,"+",".join(header_list)+"\n") - batch_order = np.arange(len(self.x_train_name_list)) for i in range(0,len(self.x_train_name_list),minibatch_size): @@ -645,6 +1002,7 @@ def get_elbo_samples(self, model, N_pred_iterations=100, minibatch_size=2000): for k,idx_batch in enumerate(batch_index.tolist()): 
self.prediction_matrix[idx_batch][i]= batch_preds[k] + def gen_job_string(data_params, model_params): """ Generates a unique job string given data and model parameters. @@ -679,6 +1037,7 @@ def gen_job_string(data_params, model_params): encoder_architecture_str = "-".join([str(size) for size in encoder_architecture]) decoder_architecture_str = "-".join([str(size) for size in decoder_architecture]) + # Note: If job_str is too long it will cause an error when saving job_str = "vae_output_encoder-"+encoder_architecture_str+"_Nlatent-"+str(n_latent)\ +"_decoder-"+decoder_architecture_str @@ -687,6 +1046,9 @@ def gen_job_string(data_params, model_params): if data_id not in written_out_vals: if str(type(data_val)) == "": job_id_list.append(data_id+"-"+"-".join([str(val) for val in data_val])) + # LvN: Skipped '/' character because it causes errors when using job_id as filename + elif isinstance(data_val, str) and "/" in data_val: + pass else: job_id_list.append(data_id+"-"+str(data_val)) @@ -700,3 +1062,66 @@ def gen_job_string(data_params, model_params): return job_str+"_"+"_".join(job_id_list) + + +# Copied from Javier's /n/groups/marks/users/javier/ESM-1b/protein_transformer/utils/mutation_scoring.py +def DMS_file_cleanup(DMS_data, target_seq, alphabet, start_idx=1, end_idx=None, DMS_mutant_column='mutant', + DMS_phenotype_name='score', DMS_directionality=1, keep_singles_only=False): + import pandas as pd + + end_idx = start_idx + len(target_seq) - 1 if end_idx is None else end_idx + + DMS_data['mutant'] = DMS_data[DMS_mutant_column] + num_starting_mutants = len(DMS_data) + DMS_data = DMS_data[DMS_data['mutant'].notnull()].copy() # This should be filtering NaNs + # Make sure we're filtering nans + DMS_data.dropna(subset=['mutant'], inplace=True) + # Different way of filtering nans + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: isinstance(x, str))] + + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([len(y) >= 3 for y in x.split(":")]))].copy() # filter first set of degenerate mutants + + def is_int(s): + try: + int(s) + return True + except ValueError: + return False + + DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([(y[0] in alphabet) and is_int(y[1:-1]) and (y[-1] in alphabet) for y in x.split(":")]))].copy() + num_valid_format_mutants = len(DMS_data) + print("num_valid_format_mutants", num_valid_format_mutants) + + # TODO note the removal of these lines + # DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([int(y[1:-1]) - start_idx >= 0 and int(y[1:-1]) <= end_idx for y in x.split(":")]))].copy() + # DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([y[0] == target_seq[int(y[1:-1]) - start_idx] for y in x.split(":")]))].copy() # This is done in the method + num_fully_valid_mutants = len(DMS_data) + print("num_fully_valid_mutants", num_fully_valid_mutants) + if num_fully_valid_mutants == 0: + raise ValueError("No fully valid mutants found") + + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: any([y[0] != y[-1] for y in x.split(":")]))].copy() # At least of the mutations in mutant is not silent + num_fully_valid_non_silent_mutants = len(DMS_data) + + if keep_singles_only: + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: len(x.split(":")) == 1)].copy() + num_fully_valid_single_mutants = len(DMS_data) + + DMS_data[DMS_phenotype_name] = pd.to_numeric(DMS_data[DMS_phenotype_name], errors='coerce') + DMS_data.dropna(subset=[DMS_phenotype_name], inplace=True) + + DMS_data['DMS_score'] = DMS_data[DMS_phenotype_name] * 
DMS_directionality + DMS_data = DMS_data[['mutant', 'DMS_score']] + + num_mutants_with_non_missing_DMS_score = len(DMS_data) + DMS_data = DMS_data.groupby('mutant').mean().reset_index() + num_distinct_mutants_with_non_missing_DMS_score = len(DMS_data) + if keep_singles_only: + quality_checks = [num_starting_mutants, num_valid_format_mutants, num_fully_valid_mutants, + num_fully_valid_non_silent_mutants, num_fully_valid_single_mutants, + num_mutants_with_non_missing_DMS_score, num_distinct_mutants_with_non_missing_DMS_score] + else: + quality_checks = [num_starting_mutants, num_valid_format_mutants, num_fully_valid_mutants, + num_fully_valid_non_silent_mutants, num_mutants_with_non_missing_DMS_score, + num_distinct_mutants_with_non_missing_DMS_score] + return DMS_data, quality_checks diff --git a/calc_weights.py b/calc_weights.py new file mode 100644 index 0000000..b43a498 --- /dev/null +++ b/calc_weights.py @@ -0,0 +1,45 @@ +# Basically a copy of the first part of run_svi.py, but not loading the model after calculating the weights +import time +import sys + +sys.path.insert(0, "../DeepSequence/") +import numpy as np + +import helper +import argparse + +parser = argparse.ArgumentParser(description="Calculating the weights and storing in weights_dir.") +parser.add_argument("--dataset", type=str, default="BLAT_ECOLX", + help="Dataset name for fitting model.") +parser.add_argument("--theta-override", type=float, default=None, + help="Override the model theta.") +# Keeping this different from weights_dir just so that we don't make mistakes and overwrite weights +parser.add_argument("--alignments_dir", type=str, help="Overrides the default ./datasets/alignments/") +parser.add_argument("--weights_dir_out", type=str, default="", help="Location to store weights.") +args = parser.parse_args() + +# DataHelper expects the dataset name without extension +args.dataset = args.dataset.split(".a2m")[0] +assert not args.dataset.endswith(".a2m") + +data_params = { + "dataset": args.dataset, + "weights_dir": args.weights_dir_out, +} + +if __name__ == "__main__": + start_time = time.time() + + data_helper = helper.DataHelper(dataset=data_params["dataset"], + working_dir='.', + theta=args.theta_override, + weights_dir=data_params["weights_dir"], + calc_weights=True, + alignments_dir=args.alignments_dir, + save_weights=True, + ) + # write out what theta was used + data_params['theta'] = data_helper.theta + + end_time = time.time() + print("Done in " + str(time.time() - start_time) + " seconds") diff --git a/examples/run_muteff_pred_batch.py b/examples/run_muteff_pred_batch.py new file mode 100644 index 0000000..dd53938 --- /dev/null +++ b/examples/run_muteff_pred_batch.py @@ -0,0 +1,181 @@ +# Similar to run_muteff_pred, but using a CSV to specify the MSA and DMS inputs. 
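+# Example invocation (illustrative only; the mapping file, directories and checkpoint
+# path below are placeholders, not defaults shipped with this script):
+#   python run_muteff_pred_batch.py --dms_mapping mapping.csv --dms_index 0 \
+#       --dms_input_dir ./dms_inputs/ --dms_output_dir ./dms_outputs/ \
+#       --msa_path ./datasets/alignments/ --model_checkpoint ./params/ --samples 2000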
+ +import argparse +import os +import time +import sys + +import pandas as pd + +sys.path.insert(0, "../DeepSequence/") +import model +import helper +import train + +model_params = { + "bs" : 100, + "encode_dim_zero" : 1500, + "encode_dim_one" : 1500, + "decode_dim_zero" : 100, + "decode_dim_one" : 500, + "n_latent" : 30, + "logit_p" : 0.001, + "sparsity" : "logit", + "final_decode_nonlin": "sigmoid", + "final_pwm_scale" : True, + "n_pat" : 4, + "r_seed" : 12345, + "conv_pat" : True, + "d_c_size" : 40 + } + + +def create_parser(): + parser = argparse.ArgumentParser(description="Calculate DeepSequence mutation effect predictions.") + parser.add_argument("--samples", type=int, default=2000, + help="Number of prediction iterations") + parser.add_argument("--alphabet_type", type=str, default="protein", help="Specify alphabet type") + + # New arguments + parser.add_argument("--dms_input_dir", type=str, required=True) + parser.add_argument("--dms_output_dir", type=str, required=True) + parser.add_argument("--msa_path", type=str, required=True) + parser.add_argument("--dms_index", type=int, required=True) + parser.add_argument("--model_checkpoint", type=str, required=True) + parser.add_argument("--dms_mapping", type=str, required=True) + parser.add_argument("--seed", type=int) + parser.add_argument("--msa_use_uniprot", action="store_true") + + # Mirror batch_size default of 2000 in helper.py + parser.add_argument("--batch_size", type=int, default=2000, help="Batch size for ELBO estimation") + return parser + + +def main(args): + # Load the deep mutational scan + assert os.path.isdir(args.model_checkpoint), "Model checkpoint directory does not exist:" + args.model_checkpoint + assert args.dms_index is not None, "Must specify a dms index" + assert os.path.isfile(args.dms_mapping), "Mapping file does not exist:" + args.dms_mapping + assert os.path.isdir(args.dms_input_dir), "DMS input directory does not exist:" + args.dms_input_dir + assert os.path.isdir(args.dms_output_dir), "DMS output directory does not exist:" + args.dms_output_dir + + if args.seed is not None: + print("Using seed:", args.seed) + model_params['r_seed'] = args.seed + + DMS_phenotype_name, dms_input, dms_output, msa_path, mutant_col, sequence = get_dms_mapping(args) + + # Don't need weights, just using the DataHelper to get the focus_seq_trimmed + data_helper = helper.DataHelper(alignment_file=msa_path, + working_dir='..', + calc_weights=False, + alphabet_type=args.alphabet_type, + ) + assert sequence != data_helper.focus_seq_trimmed, "Sequence in DMS file does not match sequence in MSA file" + + vae_model = model.VariationalAutoencoder(data_helper, + batch_size = model_params["bs"], + encoder_architecture = [model_params["encode_dim_zero"], + model_params["encode_dim_one"]], + decoder_architecture = [model_params["decode_dim_zero"], + model_params["decode_dim_one"]], + n_latent = model_params["n_latent"], + logit_p = model_params["logit_p"], + sparsity = model_params["sparsity"], + encode_nonlinearity_type = "relu", + decode_nonlinearity_type = "relu", + final_decode_nonlinearity = model_params["final_decode_nonlin"], + final_pwm_scale = model_params["final_pwm_scale"], + conv_decoder_size = model_params["d_c_size"], + convolve_patterns = model_params["conv_pat"], + n_patterns = model_params["n_pat"], + random_seed = model_params["r_seed"], + ) + print("Model skeleton built") + + # Could also use Uniprot ID here + print("Using MSA path as model prefix: " + msa_path) + dataset_name = msa_path.split('.a2m')[0].split('/')[-1] + 
print("Searching for dataset {} in checkpoint dir: {}".format(dataset_name, args.model_checkpoint)) + # TODO not using model_params["r_seed"] here + vae_model.load_parameters(file_prefix="dataset-" + str(dataset_name), seed=args.seed, + override_params_dir=args.model_checkpoint) + + print("Parameters loaded\n\n") + + # Note(Lood): This should probably return the df, and then we can save it outside of the function + data_helper.custom_mutant_matrix_df( + input_filename=dms_input, + model=vae_model, + mutant_col=mutant_col, + effect_col=DMS_phenotype_name, + N_pred_iterations=args.samples, + minibatch_size=args.batch_size, + output_filename_prefix=dms_output, + silent_allowed=True, + random_seed=args.seed, + ) + + +def get_dms_mapping(args): + mapping_protein_seq_DMS = pd.read_csv(args.dms_mapping) + DMS_id = mapping_protein_seq_DMS["DMS_id"][args.dms_index] + print("Compute scores for DMS: " + str(DMS_id)) + sequence = mapping_protein_seq_DMS["target_seq"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0].upper() + dms_input = os.path.join(args.dms_input_dir, mapping_protein_seq_DMS["DMS_filename"][ + mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0]) + assert os.path.isfile(dms_input), "DMS input file not found" + dms_input + print("DMS input file: " + dms_input) + if "DMS_mutant_column" in mapping_protein_seq_DMS.columns: + mutant_col = mapping_protein_seq_DMS["DMS_mutant_column"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + else: + print("DMS_mutant_column not found in mapping file, using mutant") + mutant_col = "mutant" + DMS_phenotype_name = \ + mapping_protein_seq_DMS["DMS_phenotype_name"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + print("DMS mutant column: " + mutant_col) + print("DMS phenotype name: " + DMS_phenotype_name) + dms_output = os.path.join(args.dms_output_dir, DMS_id) # Only the prefix, as the file will have _samples etc + msa_path = os.path.join(args.msa_path, + mapping_protein_seq_DMS["MSA_filename"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[ + 0]) # msa_path is expected to be the path to the directory where MSAs are located. 
+ + # Rather use UniProt ID as MSA prefix + if args.msa_use_uniprot: + if not os.path.isfile(msa_path): + print("MSA file not found: " + msa_path) + print("Looking for close match using UniProt ID") + uniprot_id = mapping_protein_seq_DMS["UniProt_ID"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + print("UniProt ID: " + uniprot_id) + found = [file for file in os.listdir(args.msa_path) if + file.startswith(uniprot_id) and file.endswith(".a2m")] + assert len(found) == 1, "Could not find unique MSA file for Uniprot ID {} in {}, found {} files".format( + uniprot_id, args.msa_path, found) + msa_path = os.path.join(args.msa_path, found[0]) + print("New MSA file: " + msa_path) + + print("MSA file: " + msa_path) + assert os.path.isfile(msa_path), "MSA file not found: " + msa_path + target_seq_start_index = mapping_protein_seq_DMS["start_idx"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] \ + if "start_idx" in mapping_protein_seq_DMS.columns else 1 + target_seq_end_index = target_seq_start_index + len(sequence) + # msa_start_index = mapping_protein_seq_DMS["MSA_start"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0] if "MSA_start" in mapping_protein_seq_DMS.columns else 1 + # msa_end_index = mapping_protein_seq_DMS["MSA_end"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0] if "MSA_end" in mapping_protein_seq_DMS.columns else len(args.sequence) + # if (target_seq_start_index!=msa_start_index) or (target_seq_end_index!=msa_end_index): + # args.sequence = args.sequence[msa_start_index-1:msa_end_index] + # target_seq_start_index = msa_start_index + # target_seq_end_index = msa_end_index + # df = pd.read_csv(args.dms_input) + # df,_ = DMS_file_cleanup(df, target_seq=args.sequence, start_idx=target_seq_start_index, end_idx=target_seq_end_index, DMS_mutant_column=mutant_col, DMS_phenotype_name=DMS_phenotype_name) + # else: + # df = pd.read_csv(args.dms_input) + return DMS_phenotype_name, dms_input, dms_output, msa_path, mutant_col, sequence + + +if __name__ == "__main__": + parser = create_parser() + args = parser.parse_args() + print("args:", args) + start_time = time.time() + main(args) + print("Done in " + str(time.time() - start_time) + " seconds") \ No newline at end of file diff --git a/examples/scripts/calc_weights_job_array.sh b/examples/scripts/calc_weights_job_array.sh new file mode 100644 index 0000000..f3cdec4 --- /dev/null +++ b/examples/scripts/calc_weights_job_array.sh @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH -c 2 # Request one core +#SBATCH -N 1 # Request one node (if you request more than one core with -c, also using + # -N 1 means all cores will be on the same node) +#SBATCH -t 0-5:59 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=10G # Memory total in MB (for all cores) + +# To get email notifications, set both of these options below +##SBATCH --mail-type=TIME_LIMIT_80,TIME_LIMIT,FAIL,ARRAY_TASKS +##SBATCH --mail-user="@hms.harvard.edu" + +#SBATCH --job-name="deepseq_calcweights_date" +# Job array-specific +#SBATCH --output=slurm-%A_%a-%x.out # File to which STDOUT + STDERR will be written, %A: jobID, %a: array task ID, %x: jobname +#SBATCH --array=0-41%10 # Job arrays (e.g. 1-100 with a maximum of 5 jobs at once) + +hostname +pwd +module load gcc/6.2.0 cuda/9.0 +export THEANO_FLAGS='floatX=float32,device=cuda,force_device=True' # Otherwise will only raise a warning and carry on with CPU + +# To generate this file from a directory, just do e.g. 
'ls -1 ALIGNMENTS_DIR/*.a2m > msas.txt'
+lines=( $(cat "msas.txt") )
+dataset_name=${lines[$SLURM_ARRAY_TASK_ID]}
+echo $dataset_name
+
+## Monitor GPU usage (store outputs in ./gpu_logs/)
+#/home/lov701/job_gpu_monitor.sh gpu_logs &
+
+srun stdbuf -oL -eL /n/groups/marks/users/aaron/deep_seqs/deep_seqs_env/bin/python \
+  /n/groups/marks/users/lood/DeepSequence_runs/examples/calc_weights.py \
+  --dataset $dataset_name \
+  --weights_dir_out /n/groups/marks/users/lood/DeepSequence_runs/weights_2021_11_16/ \
+  --alignments_dir datasets/alignments/
+# --theta-override 0.9
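+
+# Example submission (illustrative; assumes msas.txt lists one alignment filename per line
+# and that the --array range matches its length):
+#   ls -1 /path/to/alignments/*.a2m | xargs -n1 basename > msas.txt
+#   sbatch --array=0-$(( $(wc -l < msas.txt) - 1 ))%10 calc_weights_job_array.sh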