new quick and dirty script for filtering taxon for #82

geneontology · Mar 22, 2022 · 8d8cd00 · 8d8cd00
1 parent 5d03c76
commit 8d8cd00
Show file tree

Hide file tree

Showing 2 changed files with 128 additions and 2 deletions.
diff --git a/Makefile b/Makefile
@@ -29,8 +29,8 @@ IMPORTS = imports/pr_import.obo
 ## Run owltools over all prerequisites ($^), writing OBO to a temporary
 ## file, then drop lines beginning with "owl-axioms" to produce the target.
 neo.obo:  $(OBO_SRCS) $(IMPORTS)
 	owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ --merge-support-ontologies  -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@
 
-
-
+## datasets.json is created as a throwaway in the NEO versions of the
+## pipeline and is based on the go-site master data.
+## The file is downloaded to the target path and then touched.
 datasets.json: trigger
 	wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@
 

diff --git a/filter.pl b/filter.pl
@@ -0,0 +1,126 @@
+#!/usr/bin/perl -w
+####
+#### Cheap script to filter out all the "known" species from the
+#### collision-prone uniprot_reviewed file, as described in
+#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641
+#### .
+####
+#### Usage:
+####   perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+####
+
+## Bring in necessaries.
+use utf8;
+use strict;
+use Data::Dumper;
+use Getopt::Long;
+
+## Opts.
+## Defaults are empty strings so each option tests false until it is set
+## on the command line.
+my $verbose = '';
+my $help = '';
+my $metadata = '';
+my $input = '';
+## GetOptions returns false on an unknown or malformed option (after
+## warning on STDERR); stop here rather than run on a bad command line.
+GetOptions ('verbose' => \$verbose,
+            'help' => \$help,
+            'metadata=s' => \$metadata,
+            'input=s' => \$input)
+  or die "Bad command line options; try --help.\n";
+
+## Verbose-only logger: the given text goes to STDERR when --verbose was
+## passed; otherwise nothing is printed. A missing or false argument is
+## treated as the empty string.
+sub ll {
+  my ($msg) = @_;
+  $msg ||= '';
+  if( $verbose ){
+    print STDERR $msg;
+  }
+}
+ll("Verbose ON.\n");
+
+## When --help is given, render this file's own POD through perldoc and
+## stop with a success status before doing any real work.
+if( $help ){
+  my @perldoc_cmd = ('perldoc', __FILE__);
+  system(@perldoc_cmd);
+  exit 0;
+}
+
+###
+### Main.
+###
+
+## Grab anything that looks like a taxon.
+## %taxa_hash de-duplicates IDs that occur more than once in the metadata.
+my %taxa_hash;
+open(my $metadata_fh, '<', $metadata) or die "Cannot even open metadata: $metadata: $!";
+while( my $metadata_line = <$metadata_fh> ){
+  ## We are going from NCBITaxon -> taxon.
+  ## The /g match in a while loop picks up every ID on the line, so a
+  ## line carrying several IDs (e.g. minified JSON) loses none of them.
+  while( $metadata_line =~ /NCBITaxon:([0-9]+)/g ){
+    $taxa_hash{'taxon:'. $1} = 1;
+  }
+}
+close $metadata_fh;
+## Sorted so the verbose dump below is stable from run to run.
+my @taxa = sort keys(%taxa_hash);
+
+# ## Force error for testing.
+# push @taxa, 'taxon:399742';
+
+## Check:
+ll(Dumper \@taxa);
+
+## Filter input file.
+## Every line that mentions a known taxon is dropped; everything else is
+## echoed to STDOUT unchanged.
+##
+## All known taxa are folded into one alternation that is compiled once,
+## instead of re-testing every taxon against every line. The trailing
+## (?![0-9]) keeps a short ID such as taxon:9606 from also matching a
+## longer, unrelated ID such as taxon:96061.
+my $known_taxa_re;
+if( @taxa ){
+  my $alternatives = join('|', map { quotemeta($_) } @taxa);
+  $known_taxa_re = qr/(?:$alternatives)(?![0-9])/;
+}
+open(my $input_fh, '<', $input) or die "Cannot even open input: $input: $!";
+while( my $line = <$input_fh> ){
+  ## With no known taxa there is no pattern, so every line passes through.
+  if( defined($known_taxa_re) && $line =~ $known_taxa_re ){
+    ll('SKIPPING: '. $line);
+    next;
+  }
+  print STDOUT $line;
+}
+close $input_fh;
+
+###
+### Doc.
+###
+
+=head1 NAME
+
+filter.pl
+
+=head1 SYNOPSIS
+
+filter.pl [-h/--help] [-v/--verbose] [-m/--metadata FILE] [-i/--input FILE]
+
+=head1 DESCRIPTION
+
+This script takes a datasets.json file (from the NEO pipeline--long story), collects every NCBITaxon ID mentioned in it, and removes from a GPI (or any other) file each line that mentions one of those taxa. The surviving lines are written to STDOUT.
+
+Example usage:
+
+perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+
+=head1 OPTIONS
+
+=over
+
+=item -v/--verbose
+
+Verbose: print diagnostics (the collected taxon list and each skipped line) to STDERR.
+
+=item -h/--help
+
+Help.
+
+=item -m/--metadata FILE
+
+The location of the datasets.json file.
+
+=item -i/--input FILE
+
+The location of the file to be filtered.
+
+=back
+
+=head1 SEE ALSO
+
+https://github.com/geneontology/neo
+
+=cut