diff --git a/DeepSequence/helper.py b/DeepSequence/helper.py index 8ffdbf5..27165ae 100644 --- a/DeepSequence/helper.py +++ b/DeepSequence/helper.py @@ -1,4 +1,8 @@ from __future__ import print_function +from collections import defaultdict +import cPickle +import os + import numpy as np import theano import theano.tensor as T @@ -9,14 +13,18 @@ class DataHelper: def __init__(self, - dataset="", - alignment_file="", - focus_seq_name="", - calc_weights=True, - working_dir=".", - theta=0.2, - load_all_sequences=True, - alphabet_type="protein"): + dataset="", + alignment_file="", + focus_seq_name="", + calc_weights=True, + working_dir=".", + theta=None, + load_all_sequences=True, + alphabet_type="protein", + weights_dir="", + save_weights=False, + alignments_dir=None, + ): """ Class to load and organize alignment data. @@ -42,6 +50,8 @@ def __init__(self, load_all_sequences: alphabet_type: Alphabet type of associated dataset. Options are DNA, RNA, protein, allelic + weights_dir: location of the weights, assumed to be in form: _.npy, + where etc could be theta_0.x Returns ------------ @@ -50,19 +60,24 @@ def __init__(self, np.random.seed(42) self.dataset = dataset + self.dataset = self.dataset.split(".a2m")[0] # Remove prefix (if there's no prefix, this will still be ok) self.alignment_file = alignment_file self.focus_seq_name = focus_seq_name self.working_dir = working_dir self.calc_weights = calc_weights self.alphabet_type = alphabet_type + self.weights_dir = weights_dir + self.save_weights = save_weights + self.alignments_dir = alignments_dir # Initalize the elbo of the wt to None # will be useful if eventually doing mutation effect prediction self.wt_elbo = None # Alignment processing parameters - self.theta = theta - + # Note: Script will fail if calc_weights is True and theta is not set + if theta is not None: + self.theta = theta # If I am running tests with the model, I don't need all the # sequences loaded self.load_all_sequences = load_all_sequences @@ -75,9 +90,15 @@ def __init__(self, if self.alphabet_type == "protein": self.alphabet = "ACDEFGHIKLMNPQRSTVWY" self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW" + elif self.alphabet_type == "protein_withgap": + self.alphabet = "ACDEFGHIKLMNPQRSTVWY-" + self.reorder_alphabet = "DEKRHNQSTPGAVILMCFYW-" elif self.alphabet_type == "RNA": self.alphabet = "ACGU" self.reorder_alphabet = "ACGU" + elif self.alphabet_type == "RNA_withgap": + self.alphabet = "ACGU-" + self.reorder_alphabet = "ACGU-" elif self.alphabet_type == "DNA": self.alphabet = "ACGT" self.reorder_alphabet = "ACGT" @@ -85,7 +106,7 @@ def __init__(self, self.alphabet = "012" self.reorder_alphabet = "012" - #then generate the experimental data + # then generate the experimental data self.gen_basic_alignment() if self.load_all_sequences: @@ -94,44 +115,178 @@ def __init__(self, def configure_datasets(self): if self.dataset == "BLAT_ECOLX": - self.alignment_file = self.working_dir+"/datasets/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m" + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_hmmerbit_plmc_n5_m30_f50_t0.2_r24-286_id100_b105.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "PTEN_HUMAN_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/PTEN_HUMAN_1_b0.3.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 
+ + elif self.dataset == "HIS7_YEAST_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS7_YEAST_1_b0.5.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "P53_HUMAN_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m" + self.alphabet_type = "protein_withgap" + self.theta = 0.2 + + elif self.dataset == "SNORNA_YEAST_withgaps": + self.alignment_file = self.working_dir + "/datasets/alignments/CL00100_cmRF00012_m70_f50.a2m" + self.alphabet_type = "RNA_withgap" + self.theta = 0.2 + + elif self.dataset == 'naive_repertoire_fullseqs_withgaps': + self.alignment_file = self.working_dir + '/datasets/alignments/naive_repertoire_annotated_aligned_fullseqs.fa' + self.alphabet_type = "protein_withgap" self.theta = 0.2 elif self.dataset == "PABP_YEAST": - self.alignment_file = self.working_dir+"/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m" + self.alignment_file = self.working_dir + "/datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m" self.theta = 0.2 elif self.dataset == "DLG4_RAT": - self.alignment_file = self.working_dir+"/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m" + self.alignment_file = self.working_dir + "/datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m" self.theta = 0.2 elif self.dataset == "trna": - self.alignment_file = self.working_dir+"/datasets/RF00005_CCU.fasta" + self.alignment_file = self.working_dir + "/datasets/RF00005_CCU.fasta" self.alphabet_type = "RNA" self.theta = 0.2 - else: - self.alignment_file = self.working_dir+'/datasets/alignments/'+self.dataset+'.a2m' + elif self.dataset == "PA_FLU": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m" + self.theta = 0.2 + + elif self.dataset == "PA_FLU_orig": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_1_b0.5.a2m" + self.theta = 0.01 + + elif self.dataset == "PA_FLU_jonny": + self.alignment_file = self.working_dir + "/datasets/alignments/PA_FLU_jonny_1_b0.5.a2m" + self.theta = 0.2 + self.calc_weights = False + + elif self.dataset == "DPO1_KLULA": + self.alignment_file = self.working_dir + "/datasets/alignments/DPO1_KLULA_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "DRTS_PLAFK": + self.alignment_file = self.working_dir + "/datasets/alignments/DRTS_PLAFK_r1-280_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_YEAST": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_YEAST_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "HIS4_THEMA_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/HIS4_THEMA_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "TRP_YEAST": + self.alignment_file = self.working_dir + "/datasets/alignments/TRP_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPB1_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPB1_THEMA_b0.4.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPB2_THEMA": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPB2_THEMA_b0.4.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPF_YEAST": + 
self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.2.a2m" + self.theta = 0.2 + + elif self.dataset == "TRPF_YEAST_b0.1": + self.alignment_file = self.working_dir + "/datasets/alignments/TRPF_YEAST_b0.1.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_1_seqid0.3": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5_seqid0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "BLAT_ECOLX_1": + self.alignment_file = self.working_dir + "/datasets/alignments/BLAT_ECOLX_1_b0.5.a2m" + self.theta = 0.2 + + elif self.dataset == "AMIE_PSEAE_1_seqid0.3": + self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3_seqid0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "AMIE_PSEAE_1": + self.alignment_file = self.working_dir + "/datasets/alignments/AMIE_PSEAE_1_b0.3.a2m" + self.theta = 0.2 + + elif self.dataset == "P53_HUMAN": + self.alignment_file = self.working_dir + "/datasets/alignments/P53_HUMAN_r90-300_uniref100_Nov17_b0.06.a2m" + self.theta = 0.2 + + elif self.dataset == 'BF520_env': + self.alignment_file = self.working_dir + '/datasets/alignments/BF520_env_1_b0.5.a2m' + self.theta = 0.01 + + elif self.dataset == 'BG505_env': + self.alignment_file = self.working_dir + '/datasets/alignments/BG505_env_1_b0.5.a2m' + self.theta = 0.01 + + elif self.dataset == 'deltaNTRP_YEAST': + self.alignment_file = self.working_dir + '/datasets/alignments/deltaNTRP_YEAST_b0.4.a2m' + self.theta = 0.2 + + elif self.dataset == 'PfDHFR': + self.alignment_file = self.working_dir + '/datasets/alignments/PfDHFR_b0.1.a2m' + self.theta = 0.2 + + elif self.dataset == 'TP-DNAP1': + self.alignment_file = self.working_dir + '/datasets/alignments/TP-DNAP1_b0.1.a2m' + self.theta = 0.2 + + elif self.dataset == 'BRCA1_HUMAN': + self.alignment_file = self.working_dir + '/datasets/alignments/BRCA1_HUMAN_1_b0.5.a2m' self.theta = 0.2 + else: + if self.alignments_dir is not None: + self.alignment_file = os.path.join(self.alignments_dir, self.dataset + ".a2m") + else: + self.alignment_file = self.working_dir + '/datasets/alignments/' + self.dataset + '.a2m' + assert os.path.isfile(self.alignment_file), "Alignment file not found: " + self.alignment_file + def one_hot_3D(self, s): """ Transform sequence string into one-hot aa vector""" # One-hot encode as row vector x = np.zeros((len(s), len(self.alphabet))) for i, letter in enumerate(s): if letter in self.aa_dict: - x[i , self.aa_dict[letter]] = 1 + x[i, self.aa_dict[letter]] = 1 return x def gen_basic_alignment(self): """ Read training alignment and store basics in class instance """ # Make a dictionary that goes from aa to a number for one-hot self.aa_dict = {} - for i,aa in enumerate(self.alphabet): + for i, aa in enumerate(self.alphabet): self.aa_dict[aa] = i # Do the inverse as well - self.num_to_aa = {i:aa for aa,i in self.aa_dict.items()} + self.num_to_aa = {i: aa for aa, i in self.aa_dict.items()} ix = np.array([self.alphabet.find(s) for s in self.reorder_alphabet]) @@ -165,8 +320,12 @@ def gen_basic_alignment(self): # We also expect the focus sequence to be formatted as: # >[NAME]/[start]-[end] - focus_loc = self.focus_seq_name.split("/")[-1] - start,stop = focus_loc.split("-") + focus_loc = self.focus_seq_name + # Can include extra information (e.g. 
a custom weight) after the fasta header + if ':' in focus_loc: + focus_loc = focus_loc[:focus_loc.rfind(':')] + focus_loc = focus_loc.split("/")[-1] + start, stop = focus_loc.split("-") self.focus_start_loc = int(start) self.focus_stop_loc = int(stop) self.uniprot_focus_cols_list \ @@ -176,21 +335,20 @@ def gen_basic_alignment(self): self.uniprot_focus_col_to_focus_idx \ = {idx_col+int(start):idx_col for idx_col in self.focus_cols} - def gen_full_alignment(self): # Get only the focus columns - for seq_name,sequence in self.seq_name_to_sequence.items(): + for seq_name, sequence in self.seq_name_to_sequence.items(): # Replace periods with dashes (the uppercase equivalent) - sequence = sequence.replace(".","-") + sequence = sequence.replace(".", "-") - #then get only the focus columns + # then get only the focus columns self.seq_name_to_sequence[seq_name] = [sequence[ix].upper() for ix in self.focus_cols] # Remove sequences that have bad characters alphabet_set = set(list(self.alphabet)) seq_names_to_remove = [] - for seq_name,sequence in self.seq_name_to_sequence.items(): + for seq_name, sequence in self.seq_name_to_sequence.items(): for letter in sequence: if letter not in alphabet_set and letter != "-": seq_names_to_remove.append(seq_name) @@ -200,17 +358,16 @@ def gen_full_alignment(self): del self.seq_name_to_sequence[seq_name] # Encode the sequences - print ("Encoding sequences") - self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()),len(self.focus_cols),len(self.alphabet))) + print("Encoding sequences") + self.x_train = np.zeros((len(self.seq_name_to_sequence.keys()), len(self.focus_cols), len(self.alphabet))) self.x_train_name_list = [] - for i,seq_name in enumerate(self.seq_name_to_sequence.keys()): + for i, seq_name in enumerate(self.seq_name_to_sequence.keys()): sequence = self.seq_name_to_sequence[seq_name] self.x_train_name_list.append(seq_name) - for j,letter in enumerate(sequence): + for j, letter in enumerate(sequence): if letter in self.aa_dict: k = self.aa_dict[letter] - self.x_train[i,j,k] = 1.0 - + self.x_train[i, j, k] = 1.0 # Fast sequence weights with Theano if self.calc_weights: @@ -229,16 +386,58 @@ def gen_full_alignment(self): # self.weights = weightfun(self.x_train, self.theta)[0] + if self.save_weights: + print("Saving sequence weights, dataset={} in dir {}".format(self.dataset, self.weights_dir)) + if os.path.isdir(self.weights_dir): + weights_dir_found = self.weights_dir + else: + weights_dir_found = os.path.join(self.working_dir, self.weights_dir) + assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {} using working_dir".format(self.weights_dir, weights_dir_found) + # e.g. 
BLAT_ECOLX_theta_0.2.npy + filename_out = os.path.join(weights_dir_found, "{}_theta_{}.npy".format(self.dataset, self.theta)) + print("Saving weights to {}".format(filename_out)) + np.save(filename_out, self.weights) + else: - # If not using weights, use an isotropic weight matrix - self.weights = np.ones(self.x_train.shape[0]) + if ':' in self.focus_seq_name: + print("Loading detected sequence weights") + self.weights = np.zeros(self.x_train.shape[0]) + for i, seq_name in enumerate(self.x_train_name_list): + self.weights[i] = float(seq_name.split(':')[-1]) + elif self.weights_dir != "": + print("Loading sequence weights from file, looking for {} in {}".format(self.dataset, self.weights_dir)) + + # Get UniProt ID (the prefix before the second underscore) + # TODO Note: This fails for some of the MSA,weight pairs in the original DeepSeq dataset, + # since they don't have unique UniProt ids + dataset_prefix = "_".join(self.dataset.split("_")[:2]) + # Set path + if os.path.isdir(self.weights_dir): + weights_dir_found = self.weights_dir + else: + weights_dir_found = os.path.join(self.working_dir, self.weights_dir) + assert os.path.isdir(weights_dir_found), "Could not find weights directory: {} given, expanded to {}".format( + self.weights_dir, weights_dir_found) + + # Find weights file using dataset prefix + found = [file for file in os.listdir(weights_dir_found) if file.startswith(dataset_prefix) and file.endswith(".npy")] + assert len(found) == 1, \ + "Could not find unique weights file for dataset {} with prefix {}, in {}, found {} files"\ + .format(self.dataset, dataset_prefix, weights_dir_found, found) + weights_location = os.path.join(weights_dir_found, found[0]) + + self.weights = np.load(file=weights_location) + print("Weights loaded from {}".format(weights_location)) + else: + # If not using weights, use an isotropic weight matrix + print("Not using weights, using isotropic weight matrix") + self.weights = np.ones(self.x_train.shape[0]) self.Neff = np.sum(self.weights) print ("Neff =",str(self.Neff)) print ("Data Shape =",self.x_train.shape) - def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): for pos,wt_aa,mut_aa in mutant_tuple_list: @@ -251,7 +450,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): for pos,wt_aa,mut_aa in mutant_tuple_list: mut_seq[self.uniprot_focus_col_to_focus_idx[pos]] = mut_aa - if self.wt_elbo == None: mutant_sequences = [self.focus_seq_trimmed, mut_seq] else: @@ -269,7 +467,6 @@ def delta_elbo(self, model, mutant_tuple_list, N_pred_iterations=10): prediction_matrix = np.zeros((mutant_sequences_one_hot.shape[0],N_pred_iterations)) idx_batch = np.arange(mutant_sequences_one_hot.shape[0]) for i in range(N_pred_iterations): - batch_preds, _, _ = model.all_likelihood_components(mutant_sequences_one_hot) prediction_matrix[:,i] = batch_preds @@ -463,7 +660,171 @@ def custom_mutant_matrix(self, input_filename, model, N_pred_iterations=10, \ OUTPUT.write(descriptor+";"+str(self.delta_elbos[i])+"\n") OUTPUT.close() - + + def custom_mutant_matrix_df(self, input_filename, model, mutant_col, effect_col, N_pred_iterations=2000, \ + minibatch_size=2000, output_filename_prefix="", silent_allowed=False, + lowercase_allowed=False, random_seed=None): + + """ Predict the delta elbo for a custom mutation filename, using pandas dataframes. 
+ """ + if silent_allowed: + print("Silent mutations allowed") + if lowercase_allowed: + print("Lowercase allowed") + import pandas as pd # Need pandas for this function + + # run through the input file + if not os.path.isfile(input_filename): + input_filename = os.path.join(self.working_dir, input_filename) + assert os.path.isfile(input_filename), "File not found: " + input_filename + + print("Going through input file:", input_filename) + df_input = pd.read_csv(input_filename) + print(df_input.head(30)) + print(len(df_input), "rows") + assert mutant_col in df_input.columns, "Mutant column {} not found in input file, columns={}".format(mutant_col, str(df_input.columns)) + assert effect_col in df_input.columns, "Effect column {} not found in input file, columns={}".format(effect_col, str(df_input.columns)) + print("cleaning up dms") + df, _ = DMS_file_cleanup(DMS_data=df_input, target_seq=self.focus_seq, alphabet=self.alphabet, + DMS_mutant_column=mutant_col, DMS_phenotype_name=effect_col) + print(df.head(30)) + print(len(df), "rows after cleanup") + + # Get the start and end index from the sequence name + start_idx, end_idx = self.focus_seq_name.split("/")[-1].split("-") + start_idx = int(start_idx) + + # wt_pos_focus_idx_tuple_list = [] + focus_seq_index = 0 + # focus_seq_list = [] + mutant_to_letter_pos_idx_focus_list = {} + + # find all possible valid mutations that can be run with this alignment + print("Generating valid mutations") + print("focus_seq", self.focus_seq_name, ":", self.focus_seq) + print("focus_seq_trimmed ", self.focus_seq_trimmed) + + assert model.seq_len == len(self.focus_cols) + + for i, letter in enumerate(self.focus_seq): + if i % 10 == 0: + print(i) + + # seq=aBcdE, focus_seq_trimmed=BE, + # then A1B will be invalid (lowercase); + # B2M is valid and the mutated focus_seq is ME (mutating index 0) + if letter == letter.upper(): # or lowercase_allowed: + for mut in self.alphabet: + pos = start_idx + i + if letter != mut: + mutant = letter + str(pos) + mut + mutant_to_letter_pos_idx_focus_list[mutant] = [letter, pos, focus_seq_index] + focus_seq_index += 1 + # else: # for debugging + # print("lowercase letter: ", focus_seq_index, letter) + + print("All possible (single?) 
mutations:", mutant_to_letter_pos_idx_focus_list) + + def get_mutated_seq(mutants): + mutant_list = mutants.split(":") + focus_seq_copy = list(self.focus_seq_trimmed)[:] + + for mutant in mutant_list: + wt_aa, pos, idx_focus = mutant_to_letter_pos_idx_focus_list[mutant] + mut_aa = mutant[-1] + focus_seq_copy[idx_focus] = mut_aa + + return focus_seq_copy + + print("Getting mutated sequences") + # First sequence is the wt sequence + mutant_sequences = [list(self.focus_seq_trimmed)[:]] + valid_mutants = ['wt'] + # df_input.apply(lambda row: get_mutated_seq(row['mutant']), axis=1).to_list() + for mutants in df['mutant'].to_list(): + assert isinstance(mutants, str), "mutants is not a string: " + str(mutants) + mutant_list = mutants.split(":") + + valid_mutant = True + # TODO handle silent mutations, don't raise errors for them + # if any of the mutants in this list aren"t in the focus sequence, + # I cannot make a prediction + for mutant in mutant_list: + if mutant not in mutant_to_letter_pos_idx_focus_list: + valid_mutant = False + print("Invalid mutant:", mutant) + print("mutant_list", mutant_list) + break + + # If it is a valid mutant, add it to my list to make predictions + if valid_mutant: + focus_seq_copy = list(self.focus_seq_trimmed)[:] + + for mutant in mutant_list: + wt_aa, pos, idx_focus = mutant_to_letter_pos_idx_focus_list[mutant] + mut_aa = mutant[-1] + focus_seq_copy[idx_focus] = mut_aa + + mutant_sequences.append("".join(focus_seq_copy)) + valid_mutants.append(mutants) + + print("Number of valid mutant sequences:", len(mutant_sequences)) + if len(mutant_sequences) == 0: + raise ValueError("No valid mutant sequences found") + + print("Making one-hot matrix") + # Then make the one hot sequence + mutant_sequences_one_hot = np.zeros((len(mutant_sequences), len(self.focus_cols), len(self.alphabet))) + for i, sequence in enumerate(mutant_sequences): + if i % 10 == 0: + print(i) + for j, letter in enumerate(sequence): + k = self.aa_dict[letter] + mutant_sequences_one_hot[i, j, k] = 1.0 + + prediction_matrix = np.zeros((mutant_sequences_one_hot.shape[0], N_pred_iterations)) + batch_order = np.arange(mutant_sequences_one_hot.shape[0]) + + print("Getting ELBOs over ", N_pred_iterations, " iterations and ", minibatch_size, " size minibatches") + print("Prediction matrix size:", prediction_matrix.shape) + + # Why not batch along the iterations direction? if we have mutants < minibatch_size this doesn't help. + # Although if iterations is too big, it may not fit in a minibatch. + # And if mutants is too big, it may also not fit. 
+ for i in range(N_pred_iterations): + if i % 10 == 0: + print(i) + np.random.shuffle(batch_order) + for j in range(0, mutant_sequences_one_hot.shape[0], minibatch_size): + batch_index = batch_order[j:j + minibatch_size] + batch_preds, _, _ = model.all_likelihood_components(mutant_sequences_one_hot[batch_index]) + + for k, idx_batch in enumerate(batch_index.tolist()): + prediction_matrix[idx_batch][i] = batch_preds[k] + + # Then take the mean of all my elbo samples + mean_elbos = np.mean(prediction_matrix, axis=1).flatten().tolist() + + # Remove the wild type sequence + wt_elbo = mean_elbos.pop(0) + valid_mutants.pop(0) + + delta_elbos = np.asarray(mean_elbos) - wt_elbo + assert len(delta_elbos) == len(valid_mutants), "delta_elbos and valid_mutants should be the same length" + str( + len(delta_elbos)) + " " + str(len(valid_mutants)) + elbos = pd.DataFrame({'DeepSequence': delta_elbos, 'mutant': valid_mutants}) + print("elbos:\n", len(elbos), elbos.head(30)) + df = df.merge(elbos, how='inner', on='mutant') + print("merged:\n", len(df), df.head()) + + print("Saving to file") + output_filename = output_filename_prefix + "_samples-" + str(N_pred_iterations) + "_elbo_predictions.csv" + if random_seed is not None: + output_filename = output_filename_prefix + "_samples-" + str(N_pred_iterations) + "_seed-" + str(random_seed) + "_elbo_predictions.csv" + df.to_csv(output_filename, index=False) + + print("Written to file", output_filename) + def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ minibatch_size=2000, filename_prefix="", offset=0): @@ -483,15 +844,18 @@ def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ self.mutant_sequences_descriptor = ["wt"] # run through the input file - INPUT = open(self.working_dir+"/"+input_filename, "r") + if not os.path.isfile(input_filename): + input_filename = os.path.join(self.working_dir, input_filename) + assert os.path.isfile(input_filename), "File not found: "+input_filename + + INPUT = open(input_filename, "r") #INPUT = open(input_filename, "r") header = '' new_line = '' alphabet = set(self.alphabet) - print(self.focus_seq) - print(self.focus_seq) - print(self.focus_seq_trimmed) + print("focus_seq", self.focus_seq) + print("focus_seq_trimmed", self.focus_seq_trimmed) for i,line in enumerate(INPUT): line = line.rstrip() @@ -513,12 +877,7 @@ def custom_sequences(self, input_filename, model, N_pred_iterations=10, \ if len(new_line) == len(self.focus_seq_trimmed): self.mutant_sequences.append(new_line) self.mutant_sequences_descriptor.append(header) - - print(self.mutant_sequences_descriptor[0]) - print(self.mutant_sequences[0]) - print(self.mutant_sequences_descriptor[1]) - print(self.mutant_sequences[1]) - + INPUT.close() print(self.alphabet) # Then make the one hot sequence @@ -590,7 +949,6 @@ def get_pattern_activations(self, model, update_num, filename_prefix="", OUTPUT.close() - def get_embeddings(self, model, update_num, filename_prefix="", verbose=False, minibatch_size=2000): """ Save the latent variables from all the sequences in the alignment """ @@ -609,7 +967,6 @@ def get_embeddings(self, model, update_num, filename_prefix="", header_list = mu_header_list + log_sigma_header_list OUTPUT.write("update_num,name,"+",".join(header_list)+"\n") - batch_order = np.arange(len(self.x_train_name_list)) for i in range(0,len(self.x_train_name_list),minibatch_size): @@ -645,6 +1002,7 @@ def get_elbo_samples(self, model, N_pred_iterations=100, minibatch_size=2000): for k,idx_batch in enumerate(batch_index.tolist()): 
self.prediction_matrix[idx_batch][i]= batch_preds[k] + def gen_job_string(data_params, model_params): """ Generates a unique job string given data and model parameters. @@ -679,6 +1037,7 @@ def gen_job_string(data_params, model_params): encoder_architecture_str = "-".join([str(size) for size in encoder_architecture]) decoder_architecture_str = "-".join([str(size) for size in decoder_architecture]) + # Note: If job_str is too long it will cause an error when saving job_str = "vae_output_encoder-"+encoder_architecture_str+"_Nlatent-"+str(n_latent)\ +"_decoder-"+decoder_architecture_str @@ -687,6 +1046,9 @@ def gen_job_string(data_params, model_params): if data_id not in written_out_vals: if str(type(data_val)) == "": job_id_list.append(data_id+"-"+"-".join([str(val) for val in data_val])) + # LvN: Skipped '/' character because it causes errors when using job_id as filename + elif isinstance(data_val, str) and "/" in data_val: + pass else: job_id_list.append(data_id+"-"+str(data_val)) @@ -700,3 +1062,66 @@ def gen_job_string(data_params, model_params): return job_str+"_"+"_".join(job_id_list) + + +# Copied from Javier's /n/groups/marks/users/javier/ESM-1b/protein_transformer/utils/mutation_scoring.py +def DMS_file_cleanup(DMS_data, target_seq, alphabet, start_idx=1, end_idx=None, DMS_mutant_column='mutant', + DMS_phenotype_name='score', DMS_directionality=1, keep_singles_only=False): + import pandas as pd + + end_idx = start_idx + len(target_seq) - 1 if end_idx is None else end_idx + + DMS_data['mutant'] = DMS_data[DMS_mutant_column] + num_starting_mutants = len(DMS_data) + DMS_data = DMS_data[DMS_data['mutant'].notnull()].copy() # This should be filtering NaNs + # Make sure we're filtering nans + DMS_data.dropna(subset=['mutant'], inplace=True) + # Different way of filtering nans + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: isinstance(x, str))] + + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([len(y) >= 3 for y in x.split(":")]))].copy() # filter first set of degenerate mutants + + def is_int(s): + try: + int(s) + return True + except ValueError: + return False + + DMS_data=DMS_data[DMS_data['mutant'].apply(lambda x: all([(y[0] in alphabet) and is_int(y[1:-1]) and (y[-1] in alphabet) for y in x.split(":")]))].copy() + num_valid_format_mutants = len(DMS_data) + print("num_valid_format_mutants", num_valid_format_mutants) + + # TODO note the removal of these lines + # DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([int(y[1:-1]) - start_idx >= 0 and int(y[1:-1]) <= end_idx for y in x.split(":")]))].copy() + # DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: all([y[0] == target_seq[int(y[1:-1]) - start_idx] for y in x.split(":")]))].copy() # This is done in the method + num_fully_valid_mutants = len(DMS_data) + print("num_fully_valid_mutants", num_fully_valid_mutants) + if num_fully_valid_mutants == 0: + raise ValueError("No fully valid mutants found") + + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: any([y[0] != y[-1] for y in x.split(":")]))].copy() # At least of the mutations in mutant is not silent + num_fully_valid_non_silent_mutants = len(DMS_data) + + if keep_singles_only: + DMS_data = DMS_data[DMS_data['mutant'].apply(lambda x: len(x.split(":")) == 1)].copy() + num_fully_valid_single_mutants = len(DMS_data) + + DMS_data[DMS_phenotype_name] = pd.to_numeric(DMS_data[DMS_phenotype_name], errors='coerce') + DMS_data.dropna(subset=[DMS_phenotype_name], inplace=True) + + DMS_data['DMS_score'] = DMS_data[DMS_phenotype_name] * 
DMS_directionality + DMS_data = DMS_data[['mutant', 'DMS_score']] + + num_mutants_with_non_missing_DMS_score = len(DMS_data) + DMS_data = DMS_data.groupby('mutant').mean().reset_index() + num_distinct_mutants_with_non_missing_DMS_score = len(DMS_data) + if keep_singles_only: + quality_checks = [num_starting_mutants, num_valid_format_mutants, num_fully_valid_mutants, + num_fully_valid_non_silent_mutants, num_fully_valid_single_mutants, + num_mutants_with_non_missing_DMS_score, num_distinct_mutants_with_non_missing_DMS_score] + else: + quality_checks = [num_starting_mutants, num_valid_format_mutants, num_fully_valid_mutants, + num_fully_valid_non_silent_mutants, num_mutants_with_non_missing_DMS_score, + num_distinct_mutants_with_non_missing_DMS_score] + return DMS_data, quality_checks diff --git a/calc_weights.py b/calc_weights.py new file mode 100644 index 0000000..b43a498 --- /dev/null +++ b/calc_weights.py @@ -0,0 +1,45 @@ +# Basically a copy of the first part of run_svi.py, but not loading the model after calculating the weights +import time +import sys + +sys.path.insert(0, "../DeepSequence/") +import numpy as np + +import helper +import argparse + +parser = argparse.ArgumentParser(description="Calculating the weights and storing in weights_dir.") +parser.add_argument("--dataset", type=str, default="BLAT_ECOLX", + help="Dataset name for fitting model.") +parser.add_argument("--theta-override", type=float, default=None, + help="Override the model theta.") +# Keeping this different from weights_dir just so that we don't make mistakes and overwrite weights +parser.add_argument("--alignments_dir", type=str, help="Overrides the default ./datasets/alignments/") +parser.add_argument("--weights_dir_out", type=str, default="", help="Location to store weights.") +args = parser.parse_args() + +# DataHelper expects the dataset name without extension +args.dataset = args.dataset.split(".a2m")[0] +assert not args.dataset.endswith(".a2m") + +data_params = { + "dataset": args.dataset, + "weights_dir": args.weights_dir_out, +} + +if __name__ == "__main__": + start_time = time.time() + + data_helper = helper.DataHelper(dataset=data_params["dataset"], + working_dir='.', + theta=args.theta_override, + weights_dir=data_params["weights_dir"], + calc_weights=True, + alignments_dir=args.alignments_dir, + save_weights=True, + ) + # write out what theta was used + data_params['theta'] = data_helper.theta + + end_time = time.time() + print("Done in " + str(time.time() - start_time) + " seconds") diff --git a/examples/run_muteff_pred_batch.py b/examples/run_muteff_pred_batch.py new file mode 100644 index 0000000..dd53938 --- /dev/null +++ b/examples/run_muteff_pred_batch.py @@ -0,0 +1,181 @@ +# Similar to run_muteff_pred, but using a CSV to specify the MSA and DMS inputs. 
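+# Example invocation (illustrative only; the mapping file, directories and checkpoint
+# path below are placeholders, not defaults shipped with this script):
+#   python run_muteff_pred_batch.py --dms_mapping mapping.csv --dms_index 0 \
+#       --dms_input_dir ./dms_inputs/ --dms_output_dir ./dms_outputs/ \
+#       --msa_path ./datasets/alignments/ --model_checkpoint ./params/ --samples 2000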
+ +import argparse +import os +import time +import sys + +import pandas as pd + +sys.path.insert(0, "../DeepSequence/") +import model +import helper +import train + +model_params = { + "bs" : 100, + "encode_dim_zero" : 1500, + "encode_dim_one" : 1500, + "decode_dim_zero" : 100, + "decode_dim_one" : 500, + "n_latent" : 30, + "logit_p" : 0.001, + "sparsity" : "logit", + "final_decode_nonlin": "sigmoid", + "final_pwm_scale" : True, + "n_pat" : 4, + "r_seed" : 12345, + "conv_pat" : True, + "d_c_size" : 40 + } + + +def create_parser(): + parser = argparse.ArgumentParser(description="Calculate DeepSequence mutation effect predictions.") + parser.add_argument("--samples", type=int, default=2000, + help="Number of prediction iterations") + parser.add_argument("--alphabet_type", type=str, default="protein", help="Specify alphabet type") + + # New arguments + parser.add_argument("--dms_input_dir", type=str, required=True) + parser.add_argument("--dms_output_dir", type=str, required=True) + parser.add_argument("--msa_path", type=str, required=True) + parser.add_argument("--dms_index", type=int, required=True) + parser.add_argument("--model_checkpoint", type=str, required=True) + parser.add_argument("--dms_mapping", type=str, required=True) + parser.add_argument("--seed", type=int) + parser.add_argument("--msa_use_uniprot", action="store_true") + + # Mirror batch_size default of 2000 in helper.py + parser.add_argument("--batch_size", type=int, default=2000, help="Batch size for ELBO estimation") + return parser + + +def main(args): + # Load the deep mutational scan + assert os.path.isdir(args.model_checkpoint), "Model checkpoint directory does not exist:" + args.model_checkpoint + assert args.dms_index is not None, "Must specify a dms index" + assert os.path.isfile(args.dms_mapping), "Mapping file does not exist:" + args.dms_mapping + assert os.path.isdir(args.dms_input_dir), "DMS input directory does not exist:" + args.dms_input_dir + assert os.path.isdir(args.dms_output_dir), "DMS output directory does not exist:" + args.dms_output_dir + + if args.seed is not None: + print("Using seed:", args.seed) + model_params['r_seed'] = args.seed + + DMS_phenotype_name, dms_input, dms_output, msa_path, mutant_col, sequence = get_dms_mapping(args) + + # Don't need weights, just using the DataHelper to get the focus_seq_trimmed + data_helper = helper.DataHelper(alignment_file=msa_path, + working_dir='..', + calc_weights=False, + alphabet_type=args.alphabet_type, + ) + assert sequence != data_helper.focus_seq_trimmed, "Sequence in DMS file does not match sequence in MSA file" + + vae_model = model.VariationalAutoencoder(data_helper, + batch_size = model_params["bs"], + encoder_architecture = [model_params["encode_dim_zero"], + model_params["encode_dim_one"]], + decoder_architecture = [model_params["decode_dim_zero"], + model_params["decode_dim_one"]], + n_latent = model_params["n_latent"], + logit_p = model_params["logit_p"], + sparsity = model_params["sparsity"], + encode_nonlinearity_type = "relu", + decode_nonlinearity_type = "relu", + final_decode_nonlinearity = model_params["final_decode_nonlin"], + final_pwm_scale = model_params["final_pwm_scale"], + conv_decoder_size = model_params["d_c_size"], + convolve_patterns = model_params["conv_pat"], + n_patterns = model_params["n_pat"], + random_seed = model_params["r_seed"], + ) + print("Model skeleton built") + + # Could also use Uniprot ID here + print("Using MSA path as model prefix: " + msa_path) + dataset_name = msa_path.split('.a2m')[0].split('/')[-1] + 
print("Searching for dataset {} in checkpoint dir: {}".format(dataset_name, args.model_checkpoint)) + # TODO not using model_params["r_seed"] here + vae_model.load_parameters(file_prefix="dataset-" + str(dataset_name), seed=args.seed, + override_params_dir=args.model_checkpoint) + + print("Parameters loaded\n\n") + + # Note(Lood): This should probably return the df, and then we can save it outside of the function + data_helper.custom_mutant_matrix_df( + input_filename=dms_input, + model=vae_model, + mutant_col=mutant_col, + effect_col=DMS_phenotype_name, + N_pred_iterations=args.samples, + minibatch_size=args.batch_size, + output_filename_prefix=dms_output, + silent_allowed=True, + random_seed=args.seed, + ) + + +def get_dms_mapping(args): + mapping_protein_seq_DMS = pd.read_csv(args.dms_mapping) + DMS_id = mapping_protein_seq_DMS["DMS_id"][args.dms_index] + print("Compute scores for DMS: " + str(DMS_id)) + sequence = mapping_protein_seq_DMS["target_seq"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0].upper() + dms_input = os.path.join(args.dms_input_dir, mapping_protein_seq_DMS["DMS_filename"][ + mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0]) + assert os.path.isfile(dms_input), "DMS input file not found" + dms_input + print("DMS input file: " + dms_input) + if "DMS_mutant_column" in mapping_protein_seq_DMS.columns: + mutant_col = mapping_protein_seq_DMS["DMS_mutant_column"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + else: + print("DMS_mutant_column not found in mapping file, using mutant") + mutant_col = "mutant" + DMS_phenotype_name = \ + mapping_protein_seq_DMS["DMS_phenotype_name"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + print("DMS mutant column: " + mutant_col) + print("DMS phenotype name: " + DMS_phenotype_name) + dms_output = os.path.join(args.dms_output_dir, DMS_id) # Only the prefix, as the file will have _samples etc + msa_path = os.path.join(args.msa_path, + mapping_protein_seq_DMS["MSA_filename"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[ + 0]) # msa_path is expected to be the path to the directory where MSAs are located. 
+ + # Rather use UniProt ID as MSA prefix + if args.msa_use_uniprot: + if not os.path.isfile(msa_path): + print("MSA file not found: " + msa_path) + print("Looking for close match using UniProt ID") + uniprot_id = mapping_protein_seq_DMS["UniProt_ID"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] + print("UniProt ID: " + uniprot_id) + found = [file for file in os.listdir(args.msa_path) if + file.startswith(uniprot_id) and file.endswith(".a2m")] + assert len(found) == 1, "Could not find unique MSA file for Uniprot ID {} in {}, found {} files".format( + uniprot_id, args.msa_path, found) + msa_path = os.path.join(args.msa_path, found[0]) + print("New MSA file: " + msa_path) + + print("MSA file: " + msa_path) + assert os.path.isfile(msa_path), "MSA file not found: " + msa_path + target_seq_start_index = mapping_protein_seq_DMS["start_idx"][mapping_protein_seq_DMS["DMS_id"] == DMS_id].values[0] \ + if "start_idx" in mapping_protein_seq_DMS.columns else 1 + target_seq_end_index = target_seq_start_index + len(sequence) + # msa_start_index = mapping_protein_seq_DMS["MSA_start"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0] if "MSA_start" in mapping_protein_seq_DMS.columns else 1 + # msa_end_index = mapping_protein_seq_DMS["MSA_end"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0] if "MSA_end" in mapping_protein_seq_DMS.columns else len(args.sequence) + # if (target_seq_start_index!=msa_start_index) or (target_seq_end_index!=msa_end_index): + # args.sequence = args.sequence[msa_start_index-1:msa_end_index] + # target_seq_start_index = msa_start_index + # target_seq_end_index = msa_end_index + # df = pd.read_csv(args.dms_input) + # df,_ = DMS_file_cleanup(df, target_seq=args.sequence, start_idx=target_seq_start_index, end_idx=target_seq_end_index, DMS_mutant_column=mutant_col, DMS_phenotype_name=DMS_phenotype_name) + # else: + # df = pd.read_csv(args.dms_input) + return DMS_phenotype_name, dms_input, dms_output, msa_path, mutant_col, sequence + + +if __name__ == "__main__": + parser = create_parser() + args = parser.parse_args() + print("args:", args) + start_time = time.time() + main(args) + print("Done in " + str(time.time() - start_time) + " seconds") \ No newline at end of file diff --git a/examples/scripts/calc_weights_job_array.sh b/examples/scripts/calc_weights_job_array.sh new file mode 100644 index 0000000..f3cdec4 --- /dev/null +++ b/examples/scripts/calc_weights_job_array.sh @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH -c 2 # Request one core +#SBATCH -N 1 # Request one node (if you request more than one core with -c, also using + # -N 1 means all cores will be on the same node) +#SBATCH -t 0-5:59 # Runtime in D-HH:MM format +#SBATCH -p short # Partition to run in +#SBATCH --mem=10G # Memory total in MB (for all cores) + +# To get email notifications, set both of these options below +##SBATCH --mail-type=TIME_LIMIT_80,TIME_LIMIT,FAIL,ARRAY_TASKS +##SBATCH --mail-user="@hms.harvard.edu" + +#SBATCH --job-name="deepseq_calcweights_date" +# Job array-specific +#SBATCH --output=slurm-%A_%a-%x.out # File to which STDOUT + STDERR will be written, %A: jobID, %a: array task ID, %x: jobname +#SBATCH --array=0-41%10 # Job arrays (e.g. 1-100 with a maximum of 5 jobs at once) + +hostname +pwd +module load gcc/6.2.0 cuda/9.0 +export THEANO_FLAGS='floatX=float32,device=cuda,force_device=True' # Otherwise will only raise a warning and carry on with CPU + +# To generate this file from a directory, just do e.g. 
'ls -1 ALIGNMENTS_DIR/*.a2m > msas.txt'
+lines=( $(cat "msas.txt") )
+dataset_name=${lines[$SLURM_ARRAY_TASK_ID]}
+echo $dataset_name
+
+## Monitor GPU usage (store outputs in ./gpu_logs/)
+#/home/lov701/job_gpu_monitor.sh gpu_logs &
+
+srun stdbuf -oL -eL /n/groups/marks/users/aaron/deep_seqs/deep_seqs_env/bin/python \
+  /n/groups/marks/users/lood/DeepSequence_runs/examples/calc_weights.py \
+  --dataset $dataset_name \
+  --weights_dir_out /n/groups/marks/users/lood/DeepSequence_runs/weights_2021_11_16/ \
+  --alignments_dir datasets/alignments/
+# --theta-override 0.9
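+
+# Example submission (illustrative; assumes msas.txt lists one alignment filename per line
+# and that the --array range matches its length):
+#   ls -1 /path/to/alignments/*.a2m | xargs -n1 basename > msas.txt
+#   sbatch --array=0-$(( $(wc -l < msas.txt) - 1 ))%10 calc_weights_job_array.sh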