Issue 82 add all reviewed #87

Merged: 13 commits, Apr 7, 2022
39 changes: 26 additions & 13 deletions Makefile
@@ -3,11 +3,10 @@ OBO = http://purl.obolibrary.org/obo
all: target all_obo neo.obo neo.owl

clean:
rm trigger datasets.json mirror/*gz target/*.obo || echo "not all files present, perhaps last build did not complete"
rm trigger datasets.json mirror/*gz mirror/*tmp target/*.obo || echo "not all files present, perhaps last build did not complete"

TEST_SRCS ?= sgd pombase
#SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria
SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc goa_sars-cov-2
SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc goa_sars-cov-2 uniprot_reviewed

OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS))
all_obo: $(OBO_SRCS)
@@ -27,8 +26,8 @@ IMPORTS = imports/pr_import.obo
neo.obo: $(OBO_SRCS) $(IMPORTS)
owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ --merge-support-ontologies -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@



## datasets.json is created as a throwaway in the NEO versions of the
## pipeline and is based on the go-site master data.
datasets.json: trigger
wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@

@@ -45,14 +44,28 @@ mirror/goa_sars-cov-2.gpi.gz:
target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz
gzip -dc $< | ./gpi2obo.pl -s Scov2 -n sars-cov-2 > $@.tmp && mv $@.tmp $@

## In support of including viruses and bacteria
## (https://github.com/geneontology/neo/issues/77).
## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz
mirror/uniprot_reviewed_virus_bacteria.gpi.gz:
wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz
target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz
gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@

# ## In support of including viruses and bacteria
# ## (https://github.com/geneontology/neo/issues/77).
# ## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz
# mirror/uniprot_reviewed_virus_bacteria.gpi.gz:
# wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz
# target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz
# gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@

## In support of including all swissprot reviewed.
## Download and /filter out by species/.
## (https://github.com/geneontology/neo/issues/82).
## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz
## The filter_list.txt (and option) should not be needed in the future
## as we should be drawing exclusively from datasets.json.
mirror/uniprot_reviewed.gpi.gz: datasets.json
wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp
gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp
perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp
mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz
target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz
gzip -dc $< | ./gpi2obo.pl -F -n reviewed > $@.tmp && mv $@.tmp $@
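## Illustrative example (not part of the build): filter_list.txt lists
## NCBITaxon:9606, so filter.pl drops every uniprot_reviewed GPI line
## carrying taxon:9606; the intent is that human entries come only from
## the dedicated goa_human sources rather than colliding with them.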

# Sub-makefile
#
16 changes: 12 additions & 4 deletions Makefile-gafs
@@ -39,21 +39,29 @@ target/neo-fb.obo: mirror/gene_association.fb.gz


mirror/Lmajor.gaf.gz:
wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@
wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Lmajor/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@


target/neo-genedb_lmajor.obo: mirror/Lmajor.gaf.gz
gzip -dc mirror/Lmajor.gaf.gz | ./gaf2obo.pl -s genedb_lmajor -n genedb_lmajor > $@.tmp && mv $@.tmp $@


mirror/Tbruceibrucei927.gaf.gz:
wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@
wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Tbruceibrucei927/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@


target/neo-genedb_tbrucei.obo: mirror/Tbruceibrucei927.gaf.gz
gzip -dc mirror/Tbruceibrucei927.gaf.gz | ./gaf2obo.pl -s genedb_tbrucei -n genedb_tbrucei > $@.tmp && mv $@.tmp $@


mirror/Pfalciparum.gaf.gz:
wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Pfalciparum/Pfalciparum.gaf.gz -O $@.tmp && mv $@.tmp $@


target/neo-genedb_pfalciparum.obo: mirror/Pfalciparum.gaf.gz
gzip -dc mirror/Pfalciparum.gaf.gz | ./gaf2obo.pl -s genedb_pfalciparum -n genedb_pfalciparum > $@.tmp && mv $@.tmp $@


mirror/goa_chicken.gpi.gz:
wget --no-check-certificate ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/CHICKEN/goa_chicken.gpi.gz -O $@.tmp && mv $@.tmp $@

@@ -415,7 +423,7 @@ target/neo-sgd.obo: mirror/gpi.sgd.gz


mirror/gene_association.sgn.gz:
wget --no-check-certificate ftp://ftp.solgenomics.net/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@
wget --no-check-certificate https://solgenomics.net/ftp/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@


target/neo-sgn.obo: mirror/gene_association.sgn.gz
@@ -439,7 +447,7 @@ target/neo-wb.obo: mirror/c_elegans.PRJNA13758.current.gene_product_info.gpi.gz


mirror/xenbase.gpi.gz:
wget --no-check-certificate https://github.com/geneontology/pipeline/files/7862197/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@
wget --no-check-certificate https://ftp.xenbase.org/pub/GenePageReports/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@


target/neo-xenbase.obo: mirror/xenbase.gpi.gz
149 changes: 149 additions & 0 deletions filter.pl
@@ -0,0 +1,149 @@
#!/usr/bin/perl -w
####
#### Cheap script to filter out all the "known" species from the
#### collision-prone uniprot_reviewed file, as described in
#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641 .
####
#### Usage:
#### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
####
#### Makefile usage (ideal, if all things are in datasets.json):
#### perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
####
#### Makefile usage (with a list generated by grepping the taxa out of the non-uniprot_reviewed mirrors):
#### perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
####
#### The list can be generated by removing the uniprot_reviewed files
#### from mirror/ and then running: "zgrep -ohi 'taxon\:[0-9]*' *.gz | sort | uniq > /tmp/filter_list.txt"
####
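#### For reference, the filter list is just NCBITaxon CURIEs, one per
#### line (see filter_list.txt in this repo), e.g.:
####   NCBITaxon:9606
####   NCBITaxon:10090
#### Anything on a line matching NCBITaxon:<digits> gets picked up.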

## Bring in necessaries.
use utf8;
use strict;
use Data::Dumper;
use Getopt::Long;

## Opts.
my $verbose = '';
my $help = '';
my $metadata = '';
my $filter = '';
my $input = '';
GetOptions ('verbose' => \$verbose,
'help' => \$help,
'metadata=s' => \$metadata,
'filter=s' => \$filter,
'input=s' => \$input);

## Just a little easy printin' when feeling verbose.
sub ll {
my $str = shift || '';
print STDERR $str if $verbose;
}
ll("Verbose ON.\n");

## Embedded help through perldoc.
if( $help ){
system('perldoc', __FILE__);
exit 0;
}

###
### Main.
###

## Grab anything that looks like a taxon.
my %taxa_hash;
open(METADATA, '<', $metadata) or die "Cannot even open metadata: $metadata: $!";
while( <METADATA> ){
## We are going from NCBITaxon -> taxon.
if( /NCBITaxon\:([0-9]+)/ ){
$taxa_hash{'taxon:'. $1} = 1;
}
}
close METADATA;
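## For example, a metadata line containing the (hypothetical) fragment
##   "taxa": ["NCBITaxon:9606"]
## yields the key 'taxon:9606'; only the NCBITaxon:<digits> pattern
## matters here, not the surrounding JSON structure.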

## Do the same for an optional extra filter list file.
if( $filter ){
open(FILTER, '<', $filter) or die "Cannot even open filter: $filter: $!";
while( <FILTER> ){
## We are going from NCBITaxon -> taxon.
if( /NCBITaxon\:([0-9]+)/ ){
$taxa_hash{'taxon:'. $1} = 1;
}
}
close FILTER;
}

## Convert to something a little more usable.
my @taxa = keys(%taxa_hash);
# ## Force error for testing.
# push @taxa, 'taxon:399742';

## Check:
ll(Dumper \@taxa);

## Filter input file.
open(INPUT, '<', $input) or die "Cannot even open input: $input: $!";
while( <INPUT> ){
#if( $_ ~~ @taxa ){
my $line = $_;
my $good_p = 1;
for( @taxa ){
if( $line =~ $_ ){
ll('SKIPPING: '. $line);
$good_p = 0;
last;
}
}
print STDOUT $line if $good_p;
}
close INPUT;
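## Note on the loop above: a line is dropped as soon as it matches any
## known taxon string (a plain regex/substring match, no column parsing);
## e.g. a GPI line containing 'taxon:9606' is skipped, while a line whose
## taxon appears in neither datasets.json nor the filter list is printed.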

###
### Doc.
###

=head1 NAME

filter.pl

=head1 SYNOPSIS

filter.pl [-h/--help] [-v/--verbose] [-m/--metadata FILE] [-f/--filter FILE] [-i/--input FILE]

=head1 DESCRIPTION

This script takes a datasets.json file (from the NEO pipeline--long story) and uses the taxon information in it to filter matching lines out of a GPI (or any other) file; taxa from an optional extra filter list (--filter) are removed as well. Surviving lines are written to STDOUT.

Example usage:

perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi

=head1 OPTIONS

=over

=item -v/--verbose

Verbose

=item -h/--help

Help.

=item -m/--metadata FILE

The location of the datasets.json file.

=item -i/--input FILE

The location of the file to be filtered.

=item -f/--filter FILE

The location of an optional extra filter list of taxa (NCBITaxon IDs), merged with those taken from the metadata.

=back

=head1 SEE ALSO

https://github.com/geneontology/neo

=cut
23 changes: 23 additions & 0 deletions filter_list.txt
@@ -0,0 +1,23 @@
NCBITaxon:10090
NCBITaxon:10116
NCBITaxon:1280
NCBITaxon:1314
NCBITaxon:196620
NCBITaxon:208964
NCBITaxon:2697049
NCBITaxon:282459
NCBITaxon:3702
NCBITaxon:383379
NCBITaxon:44689
NCBITaxon:471876
NCBITaxon:4896
NCBITaxon:5085
NCBITaxon:5476
NCBITaxon:559292
NCBITaxon:588858
NCBITaxon:6239
NCBITaxon:7227
NCBITaxon:7955
NCBITaxon:83333
NCBITaxon:9606
NCBITaxon:9823