Skip to content

Commit

Permalink
new quick and dirty script for filtering taxon for #82
Browse files Browse the repository at this point in the history
  • Loading branch information
kltm committed Mar 22, 2022
1 parent 5d03c76 commit 8d8cd00
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 2 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ IMPORTS = imports/pr_import.obo
neo.obo: $(OBO_SRCS) $(IMPORTS)
owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ --merge-support-ontologies -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@



## datasets.json is created as a throwaway in the NEO versions of the
## pipeline and is based on the go-sitr master data.
datasets.json: trigger
wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@

Expand Down
126 changes: 126 additions & 0 deletions filter.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/perl -w
####
#### Cheap script to filter out all the "known" species from the
#### collision-prone uniprot_reviewed file, as described in
#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641
#### .
####
#### Usage:
#### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
####

## Bring in necessaries.
use utf8;
use strict;
use Data::Dumper;
use Getopt::Long;

## Opts.
my $verbose = '';
my $help = '';
my $metadata = '';
my $input = '';
GetOptions ('verbose' => \$verbose,
'help' => \$help,
'metadata=s' => \$metadata,
'input=s' => \$input);

## Just a little easy printin' when feeling verbose.
sub ll {
my $str = shift || '';
print STDERR $str if $verbose;
}
ll("Verbose ON.\n");

## Embedded help through perldoc.
if( $help ){
system('perldoc', __FILE__);
exit 0;
}

###
### Main.
###

## Grab anything that looks like a taxon.
my %taxa_hash;
open(METADATA, '<', $metadata) or die "Cannot even open metadata: $metadata: $!";
while( <METADATA> ){
## We are going from NCBITaxon -> taxon.
if( /NCBITaxon\:([0-9]+)/ ){
$taxa_hash{'taxon:'. $1} = 1;
}
}
close METADATA;
my @taxa = keys(%taxa_hash);

# ## Force error for testing.
# push @taxa, 'taxon:399742';

## Check:
ll(Dumper \@taxa);

## Filter input file.
open(INPUT, '<', $input) or die "Cannot even open input: $input: $!";
while( <INPUT> ){
#if( $_ ~~ @taxa ){
my $line = $_;
my $good_p = 1;
for( @taxa ){
if( $line =~ $_ ){
ll('SKIPPING: '. $line);
$good_p = 0;
last;
}
}
print STDOUT $line if $good_p;
}
close INPUT;

###
### Doc.
###

=head1 NAME
filter.pl
=head1 SYNOPSIS
filter.pl [-h/--help] [-v/--verbose] [-m/--metadata FILE] [-i/--input FILE]
=head1 DESCRIPTION
This script takes a datasets.json file (from the NEO pipeline--long story) and uses the taxon information in it to filter a GPI (or any other) file. Output to STDOUT.
Example usage:
perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
=head1 OPTIONS
=over
=item -v/--verbose
Verbose
=item -h/--help
Help.
=item -m/--metadata FILE
The location of the datasets.json file.
=item -i/--input FILE
The location of the file to be filtered.
=back
=head1 SEE ALSO
https://github.com/geneontology/neo
=cut

0 comments on commit 8d8cd00

Please sign in to comment.