From e9d3181e54acd2f5fd11c75c2c671bcf49886b14 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Fri, 18 Mar 2022 14:19:31 -0700 Subject: [PATCH 01/12] reviewed-only trial for issue #82 --- Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 18007ca..b6881d6 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,8 @@ clean: TEST_SRCS ?= sgd pombase #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria -SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc +#SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc +SRCS ?= uniprot_reviewed OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS)) all_obo: $(OBO_SRCS) @@ -53,6 +54,13 @@ mirror/uniprot_reviewed_virus_bacteria.gpi.gz: target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ +## In support of including all swissprot reviewed. +## (https://github.com/geneontology/neo/issues/82). 
+## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz +mirror/uniprot_reviewed.gpi.gz: + wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz +target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz + gzip -dc $< | ./gpi2obo.pl -F -n reviewed > $@.tmp && mv $@.tmp $@ # Sub-makefile # From 5d03c76cf0be94c8048a410e59705499618f1c36 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Mon, 21 Mar 2022 15:00:22 -0700 Subject: [PATCH 02/12] try all files for #82 --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b6881d6..db5108f 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,8 @@ clean: TEST_SRCS ?= sgd pombase #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc -SRCS ?= uniprot_reviewed +#SRCS ?= uniprot_reviewed +SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS)) all_obo: $(OBO_SRCS) From 8d8cd009a60f77edc753bc06f52f8a07728758cb Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Mon, 21 Mar 2022 18:28:38 -0700 Subject: [PATCH 03/12] new quick and dirty script for filtering taxon for #82 --- Makefile | 4 +- filter.pl | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 filter.pl diff --git a/Makefile b/Makefile index db5108f..b10def4 100644 --- a/Makefile +++ b/Makefile @@ -29,8 +29,8 @@ IMPORTS = imports/pr_import.obo neo.obo: $(OBO_SRCS) $(IMPORTS) owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ 
--merge-support-ontologies -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@
 
-
-
+## datasets.json is created as a throwaway in the NEO versions of the
+## pipeline and is based on the go-sitr master data.
 datasets.json: trigger
 	wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@
 
diff --git a/filter.pl b/filter.pl
new file mode 100644
index 0000000..4090a44
--- /dev/null
+++ b/filter.pl
@@ -0,0 +1,126 @@
+#!/usr/bin/perl -w
+####
+#### Cheap script to filter out all the "known" species from the
+#### collision-prone uniprot_reviewed file, as described in
+#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641
+#### .
+####
+#### Usage:
+#### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+####
+
+## Bring in necessaries.
+use utf8;
+use strict;
+use Data::Dumper;
+use Getopt::Long;
+
+## Opts.
+my $verbose = '';
+my $help = '';
+my $metadata = '';
+my $input = '';
+GetOptions ('verbose' => \$verbose,
+            'help' => \$help,
+            'metadata=s' => \$metadata,
+            'input=s' => \$input);
+
+## Just a little easy printin' when feeling verbose.
+sub ll {
+    my $str = shift || '';
+    print STDERR $str if $verbose;
+}
+ll("Verbose ON.\n");
+
+## Embedded help through perldoc.
+if( $help ){
+    system('perldoc', __FILE__);
+    exit 0;
+}
+
+###
+### Main.
+###
+
+## Collect every NCBITaxon ID mentioned anywhere in the metadata file.
+my %taxa_hash;
+open(METADATA, '<', $metadata) or die "Cannot even open metadata: $metadata: $!";
+while(<METADATA>){
+    ## We are going from NCBITaxon -> taxon.
+    if( /NCBITaxon\:([0-9]+)/ ){
+        $taxa_hash{'taxon:'. $1} = 1;
+    }
+}
+close METADATA;
+my @taxa = keys(%taxa_hash);
+
+# ## Force error for testing.
+# push @taxa, 'taxon:399742';
+
+## Check:
+ll(Dumper \@taxa);
+
+## Filter input file: print only lines that mention none of the known taxa.
+open(INPUT, '<', $input) or die "Cannot even open input: $input: $!";
+while(<INPUT>){
+    #if( $_ ~~ @taxa ){
+    my $line = $_;
+    my $good_p = 1;
+    for( @taxa ){
+        if( $line =~ /\Q$_\E(?![0-9])/ ){ # whole ID only: taxon:9606 must not hit taxon:96061
+            ll('SKIPPING: '. $line);
+            $good_p = 0;
+            last;
+        }
+    }
+    print STDOUT $line if $good_p;
+}
+close INPUT;
+
+###
+### Doc.
+###
+
+=head1 NAME
+
+filter.pl
+
+=head1 SYNOPSIS
+
+filter.pl [-h/--help] [-v/--verbose] [-m/--metadata FILE] [-i/--input FILE]
+
+=head1 DESCRIPTION
+
+This script takes a datasets.json file (from the NEO pipeline--long story) and uses the taxon information in it to filter a GPI (or any other) file. Output to STDOUT.
+
+Example usage:
+
+perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
+
+=head1 OPTIONS
+
+=over
+
+=item -v/--verbose
+
+Verbose
+
+=item -h/--help
+
+Help.
+
+=item -m/--metadata FILE
+
+The location of the datasets.json file.
+
+=item -i/--input FILE
+
+The location of the file to be filtered.
+
+=back
+
+=head1 SEE ALSO
+
+https://github.com/geneontology/neo
+
+=cut

From ca57680d1f31a44ac31186e3d7bd3f810dbb7712 Mon Sep 17 00:00:00 2001
From: Seth Carbon
Date: Mon, 21 Mar 2022 18:32:20 -0700
Subject: [PATCH 04/12] re-adding ecocyc as we are heading towards filters; work on #82

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b10def4..d456139 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ TEST_SRCS ?= sgd pombase
 #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria
 #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc
 #SRCS ?= uniprot_reviewed
-SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed
+SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc uniprot_reviewed
 OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS))
 
 all_obo:
$(OBO_SRCS) From 6c6cc9578d2cbfde7f8de2bb48809377fd3b3729 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 14:36:58 -0700 Subject: [PATCH 05/12] a little cleaning before heading into #82 --- Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index d456139..b099ed9 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ TEST_SRCS ?= sgd pombase #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc #SRCS ?= uniprot_reviewed -SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc uniprot_reviewed +SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna goa_human_isoform goa_pig xenbase pseudocap ecocyc goa_sars-cov-2 uniprot_reviewed OBO_SRCS = $(patsubst %,target/neo-%.obo,$(SRCS)) all_obo: $(OBO_SRCS) @@ -47,13 +47,13 @@ mirror/goa_sars-cov-2.gpi.gz: target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz gzip -dc $< | ./gpi2obo.pl -s Scov2 -n sars-cov-2 > $@.tmp && mv $@.tmp $@ -## In support of including viruses and bacteria -## (https://github.com/geneontology/neo/issues/77). 
-## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -mirror/uniprot_reviewed_virus_bacteria.gpi.gz: - wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz -target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz - gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ +# ## In support of including viruses and bacteria +# ## (https://github.com/geneontology/neo/issues/77). +# ## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz +# mirror/uniprot_reviewed_virus_bacteria.gpi.gz: +# wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed_virus_bacteria.gpi.gz -O mirror/uniprot_reviewed_virus_bacteria.gpi.gz +# target/neo-uniprot_reviewed_virus_bacteria.obo: mirror/uniprot_reviewed_virus_bacteria.gpi.gz +# gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ ## In support of including all swissprot reviewed. ## (https://github.com/geneontology/neo/issues/82). From e8ea79bddc4593158b1156e733a5f1c24c6cd130 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 14:55:51 -0700 Subject: [PATCH 06/12] a draft for using the filter sctipr for #82 --- Makefile | 8 ++++++-- filter.pl | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) mode change 100644 => 100755 filter.pl diff --git a/Makefile b/Makefile index b099ed9..c86f0c4 100644 --- a/Makefile +++ b/Makefile @@ -58,8 +58,12 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz ## In support of including all swissprot reviewed. ## (https://github.com/geneontology/neo/issues/82). 
## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -mirror/uniprot_reviewed.gpi.gz: - wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz +mirror/uniprot_reviewed.gpi.gz: datasets.json + wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp + gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp + perl ./filter.pl -v --metadata ./datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp + gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp + mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz gzip -dc $< | ./gpi2obo.pl -F -n reviewed > $@.tmp && mv $@.tmp $@ diff --git a/filter.pl b/filter.pl old mode 100644 new mode 100755 index 4090a44..2fbb6df --- a/filter.pl +++ b/filter.pl @@ -2,12 +2,14 @@ #### #### Cheap script to filter out all the "known" species from the #### collision-prone uniprot_reviewed file, as described in -#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641 -#### . +#### https://github.com/geneontology/neo/issues/82#issuecomment-1074494641 . #### #### Usage: #### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi #### +#### Makefile usage: +#### perl ./filter.pl -v --metadata ./datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp +#### ## Bring in necessaries. 
use utf8; From a127d5512fbe3f9dba7f9c6b7ee33db6ca6859a1 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 15:42:42 -0700 Subject: [PATCH 07/12] a little cleaning for #82 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c86f0c4..242f3a7 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ neo.obo: $(OBO_SRCS) $(IMPORTS) owltools --create-ontology http://purl.obolibrary.org/obo/go/noctua/neo.owl $^ --merge-support-ontologies -o -f obo $@.tmp && grep -v ^owl-axioms $@.tmp > $@ ## datasets.json is created as a throwaway in the NEO versions of the -## pipeline and is based on the go-sitr master data. +## pipeline and is based on the go-site master data. datasets.json: trigger wget http://s3.amazonaws.com/go-build/metadata/datasets.json -O $@ && touch $@ @@ -61,7 +61,7 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz mirror/uniprot_reviewed.gpi.gz: datasets.json wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp - perl ./filter.pl -v --metadata ./datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp + perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz From a1a9fbbe60c51d35e8b8df5a02bc6dacc0b2d6e4 Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 15:43:09 -0700 Subject: [PATCH 08/12] an update to Makefile-gafs with new datasets.json --- Makefile-gafs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile-gafs b/Makefile-gafs index 
82bc72d..583f803 100644 --- a/Makefile-gafs +++ b/Makefile-gafs @@ -39,7 +39,7 @@ target/neo-fb.obo: mirror/gene_association.fb.gz mirror/Lmajor.gaf.gz: - wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Lmajor/Lmajor.gaf.gz -O $@.tmp && mv $@.tmp $@ target/neo-genedb_lmajor.obo: mirror/Lmajor.gaf.gz @@ -47,13 +47,21 @@ target/neo-genedb_lmajor.obo: mirror/Lmajor.gaf.gz mirror/Tbruceibrucei927.gaf.gz: - wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/project/pathogens/as44/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Tbruceibrucei927/Tbruceibrucei927.gaf.gz -O $@.tmp && mv $@.tmp $@ target/neo-genedb_tbrucei.obo: mirror/Tbruceibrucei927.gaf.gz gzip -dc mirror/Tbruceibrucei927.gaf.gz | ./gaf2obo.pl -s genedb_tbrucei -n genedb_tbrucei > $@.tmp && mv $@.tmp $@ +mirror/Pfalciparum.gaf.gz: + wget --no-check-certificate ftp://ftp.sanger.ac.uk/pub/genedb/releases/latest/Pfalciparum/Pfalciparum.gaf.gz -O $@.tmp && mv $@.tmp $@ + + +target/neo-genedb_pfalciparum.obo: mirror/Pfalciparum.gaf.gz + gzip -dc mirror/Pfalciparum.gaf.gz | ./gaf2obo.pl -s genedb_pfalciparum -n genedb_pfalciparum > $@.tmp && mv $@.tmp $@ + + mirror/goa_chicken.gpi.gz: wget --no-check-certificate ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/CHICKEN/goa_chicken.gpi.gz -O $@.tmp && mv $@.tmp $@ @@ -415,7 +423,7 @@ target/neo-sgd.obo: mirror/gpi.sgd.gz mirror/gene_association.sgn.gz: - wget --no-check-certificate ftp://ftp.solgenomics.net/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate https://solgenomics.net/ftp/ontology/GO/gene_association.sgn.gz -O $@.tmp && mv $@.tmp $@ target/neo-sgn.obo: mirror/gene_association.sgn.gz @@ -439,7 +447,7 @@ target/neo-wb.obo: mirror/c_elegans.PRJNA13758.current.gene_product_info.gpi.gz 
mirror/xenbase.gpi.gz: - wget --no-check-certificate https://github.com/geneontology/pipeline/files/7862197/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@ + wget --no-check-certificate https://ftp.xenbase.org/pub/GenePageReports/xenbase.gpi.gz -O $@.tmp && mv $@.tmp $@ target/neo-xenbase.obo: mirror/xenbase.gpi.gz From 81f9ef1052301c8c855e0e7d3bf0d9fd337cfddd Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 16:08:25 -0700 Subject: [PATCH 09/12] commentary --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 242f3a7..c7d19d7 100644 --- a/Makefile +++ b/Makefile @@ -56,6 +56,7 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz # gzip -dc $< | ./gpi2obo.pl -F -n reviewed_virus_bacteria > $@.tmp && mv $@.tmp $@ ## In support of including all swissprot reviewed. +## Download and /filter out by species/. ## (https://github.com/geneontology/neo/issues/82). ## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz mirror/uniprot_reviewed.gpi.gz: datasets.json From dea77bd337c40b794d61583845d817591954f52d Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 17:00:26 -0700 Subject: [PATCH 10/12] additional cleaning; add another (optional) filter file to add manual things that are not present in the datasets.json; work on #82 --- Makefile | 4 ++-- filter.pl | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index c7d19d7..e190fb1 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ OBO = http://purl.obolibrary.org/obo all: target all_obo neo.obo neo.owl clean: - rm trigger datasets.json mirror/*gz target/*.obo || echo "not all files present, perhaps last build did not complete" + rm trigger datasets.json mirror/*gz mirror/*tmp target/*.obo || echo "not all files present, perhaps last build did not complete" TEST_SRCS ?= sgd pombase #SRCS ?= sgd pombase mgi zfin rgd dictybase fb tair wb goa_human goa_human_complex goa_human_rna 
goa_human_isoform goa_pig xenbase pseudocap uniprot_reviewed_virus_bacteria
@@ -62,7 +62,7 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz
 mirror/uniprot_reviewed.gpi.gz: datasets.json
 	wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp
 	gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp
-	perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+	perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
 	gzip -c mirror/filtered_uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.gz.tmp
 	mv mirror/filtered_uniprot_reviewed.gpi.gz.tmp mirror/uniprot_reviewed.gpi.gz
 target/neo-uniprot_reviewed.obo: mirror/uniprot_reviewed.gpi.gz
diff --git a/filter.pl b/filter.pl
index 2fbb6df..7c019e6 100755
--- a/filter.pl
+++ b/filter.pl
@@ -7,8 +7,14 @@
 #### Usage:
 #### perl filter.pl -v --metadata /tmp/datasets.json --input /tmp/uniprot_reviewed.gpi > /tmp/clean_file.gpi
 ####
-#### Makefile usage:
-#### perl ./filter.pl -v --metadata ./datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+#### Makefile usage (ideal, if all things are in datasets.json):
+#### perl filter.pl -v --metadata datasets.json --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+####
+#### Makefile usage (w/a list generated by grepping the taxa out of all the non-uniprot_reviewed files):
+#### perl filter.pl -v --metadata datasets.json --filter filter_list.txt --input mirror/uniprot_reviewed.gpi.tmp > mirror/filtered_uniprot_reviewed.gpi.tmp
+####
+#### This list can be generated by taking the uniprot_reviewed files
+#### out of mirror/ and: "zgrep -ohi 'taxon\:[0-9]*' *.gz | sort | uniq > /tmp/filter_list.txt"
 ####
 
 ## Bring in necessaries.
@@ -21,10 +27,12 @@
 my $verbose = '';
 my $help = '';
 my $metadata = '';
+my $filter = '';
 my $input = '';
 GetOptions ('verbose' => \$verbose,
             'help' => \$help,
             'metadata=s' => \$metadata,
+            'filter=s' => \$filter,
             'input=s' => \$input);
 
 ## Just a little easy printin' when feeling verbose.
@@ -54,8 +62,21 @@ sub ll {
     }
 }
 close METADATA;
-my @taxa = keys(%taxa_hash);
 
+## Do the same for an optional test list file.
+if( $filter ){
+    open(FILTER, '<', $filter) or die "Cannot even open filter: $filter: $!";
+    while(<FILTER>){
+        ## Accept "NCBITaxon:N" as well as bare "taxon:N" (zgrep output); store as taxon:N.
+        if( /taxon\:([0-9]+)/i ){
+            $taxa_hash{'taxon:'. $1} = 1;
+        }
+    }
+    close FILTER; # Only close the handle when it was actually opened.
+}
+
+## Convert to something a little more usable.
+my @taxa = keys(%taxa_hash);
 # ## Force error for testing.
 # push @taxa, 'taxon:399742';
 

From 5a7f86caac69731cebeece92cbfe831d7d8f11b6 Mon Sep 17 00:00:00 2001
From: Seth Carbon
Date: Tue, 5 Apr 2022 17:05:16 -0700
Subject: [PATCH 11/12] comment

---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index e190fb1..c89c0d4 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,8 @@ target/neo-goa_sars-cov-2.obo: mirror/goa_sars-cov-2.gpi.gz
 ## Download and /filter out by species/.
 ## (https://github.com/geneontology/neo/issues/82).
 ## http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz
+## The filter_list.txt (and option) should not be needed in the future
+## as we should be drawing exclusively from datasets.json.
mirror/uniprot_reviewed.gpi.gz: datasets.json wget --no-check-certificate http://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_reviewed.gpi.gz -O mirror/uniprot_reviewed.gpi.gz.tmp gzip -dc mirror/uniprot_reviewed.gpi.gz.tmp > mirror/uniprot_reviewed.gpi.tmp From 6fe86a7f3871cc0c3e04bd9dd258ef8e6e070d9c Mon Sep 17 00:00:00 2001 From: Seth Carbon Date: Tue, 5 Apr 2022 17:16:55 -0700 Subject: [PATCH 12/12] whoops, forgot the list; for #82 --- filter_list.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 filter_list.txt diff --git a/filter_list.txt b/filter_list.txt new file mode 100644 index 0000000..5cef4d8 --- /dev/null +++ b/filter_list.txt @@ -0,0 +1,23 @@ +NCBITaxon:10090 +NCBITaxon:10116 +NCBITaxon:1280 +NCBITaxon:1314 +NCBITaxon:196620 +NCBITaxon:208964 +NCBITaxon:2697049 +NCBITaxon:282459 +NCBITaxon:3702 +NCBITaxon:383379 +NCBITaxon:44689 +NCBITaxon:471876 +NCBITaxon:4896 +NCBITaxon:5085 +NCBITaxon:5476 +NCBITaxon:559292 +NCBITaxon:588858 +NCBITaxon:6239 +NCBITaxon:7227 +NCBITaxon:7955 +NCBITaxon:83333 +NCBITaxon:9606 +NCBITaxon:9823