diff --git a/Makefile b/Makefile index 0e70364..78f3ab9 100644 --- a/Makefile +++ b/Makefile @@ -3,11 +3,10 @@ OBO = http://purl.obolibrary.org/obo all: target all_obo neo.obo neo.owl clean: - rm trigger datasets.json mirror/*gz target/*.obo || echo "not all files present, perhaps last build did not complete" + rm trigger datasets.json mirror/*gz mirror/*tmp target/*.obo || echo "not all files present, perhaps last build did not complete" TEST_SRCS ?= sgd pombase -#SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria -SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc goa_sars-cov-2 +SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc goa_sars-cov-2 uniprot_reviewed OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS)) all_obo: $(OBO_SRCS) @@ -27,8 +26,8 @@ IMPORTS = imports/pr_import.obo neo.obo: $(OBO_SRCS) $(IMPORTS) owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ --merge-support-ontologies -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@ - - +## datasets.json is created as a throwaway in the NEO versions of the +## pipeline and is based on the go-site master data. datasets.json: trigger wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@ @@ -45,14 +44,28 @@ mirror/goa_sars-cov-2.gpi.gz: target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz gzip -dc $< | ./gpi2obo.pl -s Scov2 -n sars-cov-2 > $@.tmp && mv $@.tmp $@ -## In support of including viruses and bacteria -## (https://github.com/geneontology/neo/issues/77). 
-## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -mirror/uniprot_reviewed_virus_bacteria.gpi.gz: - wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz -target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz - gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ - +# ## In support of including viruses and bacteria +# ## (https://github.com/geneontology/neo/issues/77). +# ## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz +# mirror/uniprot_reviewed_virus_bacteria.gpi.gz: +# wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz +# target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz +# gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ + +## In support of including all swissprot reviewed. +## Download and /filter out by species/. +## (https://github.com/geneontology/neo/issues/82). +## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz +## The filter_list.txt (and option) should not be needed in the future +## as we should be drawing exclusively from datasets.json. 
+mirror/uniprot_reviewed.gpi.gz: datasets.json + wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp + gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp + perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp + gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp + mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz +target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz + gzip -dc $< | ./gpi2obo.pl -F -n reviewed > $@.tmp && mv $@.tmp $@ # Sub-makefile # diff --git a/Makefile-gafs b/Makefile-gafs index 82bc72d..583f803 100644 --- a/Makefile-gafs +++ b/Makefile-gafs @@ -39,7 +39,7 @@ target/neo-fb.obo: mirror/gene_association.fb.gz mirror/Lmajor.gaf.gz: - wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Lmajor/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@ target/neo-genedb_lmajor.obo: mirror/Lmajor.gaf.gz @@ -47,13 +47,21 @@ target/neo-genedb_lmajor.obo: mirror/Lmajor.gaf.gz mirror/Tbruceibrucei927.gaf.gz: - wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Tbruceibrucei927/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@ target/neo-genedb_tbrucei.obo: mirror/Tbruceibrucei927.gaf.gz gzip -dc mirror/Tbruceibrucei927.gaf.gz | ./gaf2obo.pl -s genedb_tbrucei -n genedb_tbrucei > $@.tmp && mv $@.tmp $@ +mirror/Pfalciparum.gaf.gz: + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Pfalciparum/Pfalciparum.gaf.gz -O $@.tmp && mv $@.tmp $@ + + +target/neo-genedb_pfalciparum.obo: 
mirror/Pfalciparum.gaf.gz + gzip -dc mirror/Pfalciparum.gaf.gz | ./gaf2obo.pl -s genedb_pfalciparum -n genedb_pfalciparum > $@.tmp && mv $@.tmp $@ + + mirror/goa_chicken.gpi.gz: wget --no-check-certificate ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/CHICKEN/goa_chicken.gpi.gz -O $@.tmp && mv $@.tmp $@ @@ -415,7 +423,7 @@ target/neo-sgd.obo: mirror/gpi.sgd.gz mirror/gene_association.sgn.gz: - wget --no-check-certificate ftp://ftp.solgenomics.net/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate https://solgenomics.net/ftp/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@ target/neo-sgn.obo: mirror/gene_association.sgn.gz @@ -439,7 +447,7 @@ target/neo-wb.obo: mirror/c_elegans.PRJNA13758.current.gene_product_info.gpi.gz mirror/xenbase.gpi.gz: - wget --no-check-certificate https://github.com/geneontology/pipeline/files/7862197/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate https://ftp.xenbase.org/pub/GenePageReports/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@ target/neo-xenbase.obo: mirror/xenbase.gpi.gz diff --git a/filter.pl b/filter.pl new file mode 100755 index 0000000..7c019e6 --- /dev/null +++ b/filter.pl @@ -0,0 +1,149 @@ +#!/usr/bin/perl -w +#### +#### Cheap script to filter out all the "known" species from the +#### collision-prone uniprot_reviewed file, as described in +#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641 . 
+####
+#### Usage:
+####  perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+####
+#### Makefile usage (ideal, if all things are in datasets.json):
+####  perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+####
+#### Makefile usage (w/a list generated from grepping out all the non-reviews taxon):
+####  perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+####
+#### This list can be generated by taking the uniprot_reviewed files
+#### out of mirrors/ and: "zgrep -ohi 'taxon\:[0-9]*' *.gz | sort | uniq > /tmp/filter_list.txt"
+#### (entries in the list may be spelled either taxon:N or NCBITaxon:N).
+####
+
+## Bring in necessaries.
+use utf8;
+use strict;
+use warnings;
+use Data::Dumper;
+use Getopt::Long;
+
+## Opts.
+my $verbose = '';
+my $help = '';
+my $metadata = '';
+my $filter = '';
+my $input = '';
+GetOptions ('verbose' => \$verbose,
+            'help' => \$help,
+            'metadata=s' => \$metadata,
+            'filter=s' => \$filter,
+            'input=s' => \$input);
+
+## Just a little easy printin' when feeling verbose.
+sub ll {
+    my $str = shift || '';
+    print STDERR $str if $verbose;
+}
+ll("Verbose ON.\n");
+
+## Embedded help through perldoc.
+if( $help ){
+    system('perldoc', __FILE__);
+    exit 0;
+}
+
+## Add a 'taxon:N' key to %$taxa for every ID that PATTERN (which must
+## capture the number in $1) finds in FILE; LABEL names FILE in the error.
+sub read_taxa {
+    my ($file, $label, $pattern, $taxa) = @_;
+    open(my $fh, '<', $file) or die "Cannot even open $label: $file: $!";
+    while( my $line = <$fh> ){
+        ## We are going from NCBITaxon -> taxon; /g so that several IDs on
+        ## one line (e.g. JSON written on a single line) are all picked up.
+        $taxa->{'taxon:'. $1} = 1 while $line =~ /$pattern/g;
+    }
+    close $fh;
+    return;
+}
+
+###
+### Main.
+###
+
+## Grab anything that looks like a taxon.
+my %taxa_hash;
+read_taxa($metadata, 'metadata', qr/NCBITaxon\:([0-9]+)/, \%taxa_hash);
+
+## Do the same for an optional test list file (taxon:N or NCBITaxon:N).
+if( $filter ){
+    read_taxa($filter, 'filter', qr/taxon\:([0-9]+)/i, \%taxa_hash);
+}
+
+# ## Force error for testing.
+# $taxa_hash{'taxon:399742'} = 1;
+
+## Check (sorted, so the dump is stable from run to run):
+ll(Dumper([sort keys %taxa_hash])) if $verbose;
+
+## Filter input file.
+open(my $in_fh, '<', $input) or die "Cannot even open input: $input: $!";
+while( my $line = <$in_fh> ){
+    ## Look complete IDs up in the hash; matching the bare ID as a pattern
+    ## would let taxon:9606 also throw away taxon:96060.
+    if( grep { $taxa_hash{$_} } $line =~ /taxon\:[0-9]+/g ){
+        ll('SKIPPING: '. $line);
+        next;
+    }
+    print STDOUT $line;
+}
+close $in_fh;
+
+###
+### Doc.
+###
+
+=head1 NAME
+
+filter.pl
+
+=head1 SYNOPSIS
+
+filter.pl [-h/--help] [-v/--verbose] [-m/--metadata FILE] [-f/--filter FILE] [-i/--input FILE]
+
+=head1 DESCRIPTION
+
+This script takes a datasets.json file (from the NEO pipeline--long story) and uses the taxon information in it to filter a GPI (or any other) file. Output to STDOUT.
+
+Example usage:
+
+perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+
+=head1 OPTIONS
+
+=over
+
+=item -v/--verbose
+
+Verbose
+
+=item -h/--help
+
+Help.
+
+=item -m/--metadata FILE
+
+The location of the datasets.json file.
+
+=item -f/--filter FILE
+
+Optional file listing further taxon IDs (taxon:N or NCBITaxon:N) to filter out.
+
+=item -i/--input FILE
+
+The location of the file to be filtered.
+
+=back
+
+=head1 SEE ALSO
+
+https://github.com/geneontology/neo
+
+=cut
diff --git a/filter_list.txt b/filter_list.txt
new file mode 100644
index 0000000..5cef4d8
--- /dev/null
+++ b/filter_list.txt
@@ -0,0 +1,23 @@
+NCBITaxon:10090
+NCBITaxon:10116
+NCBITaxon:1280
+NCBITaxon:1314
+NCBITaxon:196620
+NCBITaxon:208964
+NCBITaxon:2697049
+NCBITaxon:282459
+NCBITaxon:3702
+NCBITaxon:383379
+NCBITaxon:44689
+NCBITaxon:471876
+NCBITaxon:4896
+NCBITaxon:5085
+NCBITaxon:5476
+NCBITaxon:559292
+NCBITaxon:588858
+NCBITaxon:6239
+NCBITaxon:7227
+NCBITaxon:7955
+NCBITaxon:83333
+NCBITaxon:9606
+NCBITaxon:9823