Skip to content

Commit

Permalink
update generic assay importer (python script)
Browse files Browse the repository at this point in the history
  • Loading branch information
dippindots committed Apr 28, 2020
1 parent f8a2f20 commit 86812c4
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -237,13 +237,16 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera
currentLine = buf.readLine();
}

// show import message
if (updateInfo) {
ProgressMonitor.setCurrentMessage(updatedEntities.size() + " generic entities existing in the database that were overridden during import.");
} else {
ProgressMonitor.setCurrentMessage(notUpdatedEntities.size() + " generic entities existing in the database that were not overridden during import.");
// show import result message
if (updatedEntities.size() > 0) {
ProgressMonitor.setCurrentMessage("--> Entities updated: " + updatedEntities.size() + " generic entities existing in the database that were overridden during import.");
}
if (notUpdatedEntities.size() > 0) {
ProgressMonitor.setCurrentMessage("--> Entities not updated: " + notUpdatedEntities.size() + " generic entities existing in the database that were not overridden during import.");
}
if (newEntities.size() > 0) {
ProgressMonitor.setCurrentMessage("--> New Entities: " + newEntities.size() + " generic entities have been imported into database during import.");
}
ProgressMonitor.setCurrentMessage(newEntities.size() + " generic entities have been imported into database during import.");

reader.close();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,14 @@ public void run() {
// Parse arguments
// using a real options parser, helps avoid bugs
String description = "Import 'profile' files that contain data matrices indexed by gene, case";
OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true);
OptionSet options = ConsoleUtil.parseStandardDataAndMetaUpdateOptions(args, description, true);
File dataFile = new File((String) options.valueOf("data"));
File descriptorFile = new File((String) options.valueOf( "meta" ) );
// Check options, set default as true
boolean updateInfo = true;
if (options.has("update-info") && (((String) options.valueOf("update-info")).equalsIgnoreCase("false") || options.valueOf("update-info").equals("0"))) {
updateInfo = false;
}
SpringUtil.initDataSource();
ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getAbsolutePath());
// Load genetic profile and gene panel
Expand Down Expand Up @@ -100,7 +105,7 @@ public void run() {
importer.importData();
} else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY) {
// add all missing `genetic_entities` for this assay to the database
ImportGenericAssayEntity.importData(dataFile, geneticProfile.getGeneticAlterationType(), geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), true);
ImportGenericAssayEntity.importData(dataFile, geneticProfile.getGeneticAlterationType(), geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), updateInfo);

ImportTabDelimData genericAssayProfileImporter = new ImportTabDelimData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"));
genericAssayProfileImporter.importData(numLines);
Expand Down
68 changes: 68 additions & 0 deletions core/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,74 @@ public static OptionSet parseStandardDataAndStudyOptions(String[] args, String d
"Error: 'study' argument required.");
}

return options;
}

/**
 * Parses the command line of an importer tool that takes 'data' and 'meta' options plus
 * an 'update-info' option whose argument is optional.
 *
 * <p>Both 'data' and 'meta' must be present on the command line. When {@code hasLoadMode}
 * is true a 'loadMode' option is also registered and must be supplied; its value
 * (compared case-insensitively) switches {@code MySQLbulkLoader} bulk loading off
 * ("directLoad") or on ("bulkLoad").
 *
 * @param args the same args given to main() method of the tool
 * @param description short description of the tool (to display in the usage line if necessary)
 * @param hasLoadMode set to true to register the 'loadMode' option and validate that it was given
 *
 * @return the parsed options
 * @throws UsageException if parsing fails, 'help' is requested, 'data' or 'meta' is missing,
 *         or (with {@code hasLoadMode}) 'loadMode' is missing or has an unknown value
 */
public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, String description, boolean hasLoadMode) {
    final String programName = "importScript";

    // Declare every option this tool understands.
    final OptionParser parser = new OptionParser();
    parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete");
    final OptionSpec<Void> helpFlag = parser.accepts("help", "print this help info");
    parser.accepts("data", "profile data file")
        .withRequiredArg()
        .describedAs("data_file.txt")
        .ofType(String.class);
    parser.accepts("update-info", "Update information for existing entities in the database")
        .withOptionalArg()
        .ofType(String.class);
    parser.accepts("meta", "meta (description) file")
        .withRequiredArg()
        .describedAs("meta_file.txt")
        .ofType(String.class);
    if (hasLoadMode) {
        parser.accepts("loadMode", "direct (per record) or bulk load of data")
            .withRequiredArg()
            .describedAs("[directLoad|bulkLoad (default)]")
            .ofType(String.class);
    }

    // Any parser complaint is reported back to the user as a usage error.
    final OptionSet parsed;
    try {
        parsed = parser.parse(args);
    } catch (OptionException parseFailure) {
        throw new UsageException(programName, description, parser, parseFailure.getMessage());
    }

    if (parsed.has(helpFlag)) {
        throw new UsageException(programName, description, parser);
    }

    // withRequiredArg only makes the option's argument mandatory, not the option itself,
    // so the presence of the mandatory options is verified explicitly.
    if (!parsed.has("data")) {
        throw new UsageException(programName, description, parser, "Error: 'data' argument required.");
    }
    if (!parsed.has("meta")) {
        throw new UsageException(programName, description, parser, "Error: 'meta' argument required.");
    }

    if (!hasLoadMode) {
        return parsed;
    }

    if (!parsed.has("loadMode")) {
        throw new UsageException(programName, description, parser, "Error: 'loadMode' argument required.");
    }

    // Apply the requested load mode to the bulk loader.
    final String requestedMode = (String) parsed.valueOf("loadMode");
    if (requestedMode.equalsIgnoreCase("bulkLoad")) {
        MySQLbulkLoader.bulkLoadOn();
    } else if (requestedMode.equalsIgnoreCase("directLoad")) {
        MySQLbulkLoader.bulkLoadOff();
    } else {
        throw new UsageException(programName, description, parser, "Error: unknown loadMode action: " + requestedMode);
    }
    return parsed;
}
}
43 changes: 28 additions & 15 deletions core/src/main/scripts/importer/cbioportalImporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def remove_study_id(jvm_args, study_id):
run_java(*args)


def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictionary = None):
def import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_file_dictionary = None):
args = jvm_args.split(' ')

# In case the meta file is already parsed in a previous function, it is not
Expand All @@ -114,6 +114,11 @@ def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictiona
# Retrieve meta file type
meta_file_type = meta_file_dictionary['meta_file_type']

# Update entities by default
shouldUpdateGenericAssayEntities = True
if update_generic_assay_entity != None and update_generic_assay_entity.casefold() == "False".casefold():
shouldUpdateGenericAssayEntities = False

# invalid file, skip
if meta_file_type is None:
print(("Unrecognized meta file type '%s', skipping file"
Expand All @@ -133,6 +138,12 @@ def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictiona
args.append(meta_filename)
args.append("--loadMode")
args.append("bulkload")
if importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and shouldUpdateGenericAssayEntities:
args.append("--update-info")
args.append("True")
elif importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and not shouldUpdateGenericAssayEntities:
args.append("--update-info")
args.append("False")
if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", "org.mskcc.cbio.portal.scripts.ImportGisticData"):
args.append("--data")
args.append(data_filename)
Expand Down Expand Up @@ -186,7 +197,7 @@ def process_case_lists(jvm_args, case_list_dir):
if not (case_list.startswith('.') or case_list.endswith('~')):
import_case_list(jvm_args, os.path.join(case_list_dir, case_list))

def process_command(jvm_args, command, meta_filename, data_filename, study_ids):
def process_command(jvm_args, command, meta_filename, data_filename, study_ids, update_generic_assay_entity = None):
if command == IMPORT_CANCER_TYPE:
import_cancer_type(jvm_args, data_filename)
elif command == IMPORT_STUDY:
Expand All @@ -201,11 +212,11 @@ def process_command(jvm_args, command, meta_filename, data_filename, study_ids):
else:
raise RuntimeError('Your command uses both -id and -meta. Please, use only one of the two parameters.')
elif command == IMPORT_STUDY_DATA:
import_study_data(jvm_args, meta_filename, data_filename)
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity)
elif command == IMPORT_CASE_LIST:
import_case_list(jvm_args, meta_filename)

def process_directory(jvm_args, study_directory):
def process_directory(jvm_args, study_directory, update_generic_assay_entity = None):
"""
Import an entire study directory based on meta files found.
Expand Down Expand Up @@ -338,47 +349,47 @@ def process_directory(jvm_args, study_directory):
raise RuntimeError('No sample attribute file found')
else:
meta_filename, data_filename = sample_attr_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Next, we need to import resource definitions for resource data
if resource_definition_filepair is not None:
meta_filename, data_filename = resource_definition_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Next, we need to import sample definitions for resource data
if sample_resource_filepair is not None:
meta_filename, data_filename = sample_resource_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Next, import everything else except gene panel, fusion data, GSVA and
# z-score expression. If in the future more types refer to each other, (like
# in a tree structure) this could be programmed in a recursive fashion.
for meta_filename, data_filename in regular_filepairs:
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Import fusion data (after mutation)
if fusion_filepair is not None:
meta_filename, data_filename = fusion_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Import expression z-score (after expression)
for meta_filename, data_filename in zscore_filepairs:
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Import GSVA genetic profiles (after expression and z-scores)
if gsva_score_filepair is not None:

# First import the GSVA score data
meta_filename, data_filename = gsva_score_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Second import the GSVA p-value data
meta_filename, data_filename = gsva_pvalue_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

if gene_panel_matrix_filepair is not None:
meta_filename, data_filename = gene_panel_matrix_filepair
import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])
import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])

# Import the case lists
case_list_dirname = os.path.join(study_directory, 'case_lists')
Expand Down Expand Up @@ -456,6 +467,8 @@ def interface():
parser.add_argument('-id', '--study_ids', type=str, required=False,
help='Cancer Study IDs for `remove-study` command, comma separated')

parser.add_argument('-update', '--update_generic_assay_entity', type=str, required=False,
help='Set as True to update the existing generic assay entities, set as False to keep the existing generic assay entities for generic assay')
# TODO - add same argument to metaimporter
# TODO - harmonize on - and _

Expand Down Expand Up @@ -517,11 +530,11 @@ def main(args):

if study_directory != None:
check_dir(study_directory)
process_directory(jvm_args, study_directory)
process_directory(jvm_args, study_directory, args.update_generic_assay_entity)
else:
check_args(args.command)
check_files(args.meta_filename, args.data_filename)
process_command(jvm_args, args.command, args.meta_filename, args.data_filename, args.study_ids)
process_command(jvm_args, args.command, args.meta_filename, args.data_filename, args.study_ids, args.update_generic_assay_entity)

# ------------------------------------------------------------------------------
# ready to roll
Expand Down
2 changes: 2 additions & 0 deletions core/src/main/scripts/importer/metaImport.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def interface():
'report. For example, set this to a high number to '
'report all genes that could not be loaded, instead '
'of reporting "(GeneA, GeneB, GeneC, 213 more)".')
parser.add_argument('-update', '--update_generic_assay_entity', type=str, required=False, default="True",
help='Set as True to update the existing generic assay entities, set as False to keep the existing generic assay entities for generic assay')
parser = parser.parse_args()
return parser

Expand Down

0 comments on commit 86812c4

Please sign in to comment.