Skip to content

Commit

Permalink
additional cleaning; add another (optional) filter file to add manual…
Browse files Browse the repository at this point in the history
… things that are not present in the datasets.json; work on #82
  • Loading branch information
kltm committed Apr 6, 2022
1 parent 81f9ef1 commit dea77bd
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ OBO = http://purl.obolibrary.org/obo
all: target all_obo neo.obo neo.owl

clean:
rm trigger datasets.json mirror/*gz target/*.obo || echo "not all files present, perhaps last build did not complete"
rm trigger datasets.json mirror/*gz mirror/*tmp target/*.obo || echo "not all files present, perhaps last build did not complete"

TEST_SRCS ?= sgd pombase
#SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria
Expand Down Expand Up @@ -62,7 +62,7 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz
mirror/uniprot_reviewed.gpi.gz: datasets.json
wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp
gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp
perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp
mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz
target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz
Expand Down
27 changes: 24 additions & 3 deletions filter.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,14 @@
#### Usage:
#### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
####
#### Makefile usage:
#### perl ./filter.pl -v --metadata ./datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
#### Makefile usage (ideal, if all things are in datasets.json):
#### perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
####
#### Makefile usage (with a list generated by grepping the taxa out of all the non-uniprot_reviewed files):
#### perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
####
#### This list can be generated by moving the uniprot_reviewed files
#### out of mirror/ and running: "zgrep -ohi 'taxon\:[0-9]*' *.gz | sort | uniq > /tmp/filter_list.txt"
####

## Bring in necessaries.
Expand All @@ -21,10 +27,12 @@
## Command-line option storage; every option defaults to the empty
## string, which is false in boolean tests.
my $verbose = '';
my $help = '';
my $metadata = '';
## Path given with --filter; empty means no filter file was supplied.
my $filter = '';
my $input = '';
## Parse the command line (presumably Getopt::Long's GetOptions; the
## import is not visible here). 'verbose' and 'help' are plain flags;
## the '=s' options each require a string argument.
GetOptions ('verbose' => \$verbose,
'help' => \$help,
'metadata=s' => \$metadata,
'filter=s' => \$filter,
'input=s' => \$input);

## Just a little easy printin' when feeling verbose.
Expand Down Expand Up @@ -54,8 +62,21 @@ sub ll {
}
}
close METADATA;
my @taxa = keys(%taxa_hash);

## Do the same for an optional filter list file (--filter): every taxon
## id found in it is added to %taxa_hash under the key 'taxon:<id>'.
##
## The list produced by the documented "zgrep -ohi 'taxon\:[0-9]*'"
## command contains bare "taxon:<id>" lines, so we match the "taxon:"
## prefix case-insensitively; that also still accepts the
## "NCBITaxon:<id>" form, since it contains "Taxon:<id>".
## Lines without an id (e.g. a bare "taxon:") are skipped.
if( $filter ){
    ## Lexical handle: scoped to this block, cannot clash with other
    ## bareword handles in the script.
    open(my $filter_fh, '<', $filter) or die "Cannot even open filter: $filter: $!";
    while( my $line = <$filter_fh> ){
        if( $line =~ /taxon\:([0-9]+)/i ){
            $taxa_hash{'taxon:'. $1} = 1;
        }
    }
    ## Only close what we actually opened; closing outside the "if"
    ## would act on an unopened handle when --filter is not given.
    close($filter_fh);
}

## Convert to something a little more usable.
my @taxa = keys(%taxa_hash);
# ## Force error for testing.
# push @taxa, 'taxon:399742';

Expand Down

0 comments on commit dea77bd

Please sign in to comment.