From f8ebd8a9057a8818e5db10cf76915f792f88976b Mon Sep 17 00:00:00 2001 From: Henrique Pinheiro Date: Tue, 6 Feb 2024 17:24:06 -0300 Subject: [PATCH 1/2] Usa occurrences.csv --- .../workflows/update-mongodb-occurrences.yml | 2 +- referencias/sources.json | 472 ------------------ src/ocorrencia.ts | 149 +++--- 3 files changed, 76 insertions(+), 547 deletions(-) delete mode 100644 referencias/sources.json diff --git a/.github/workflows/update-mongodb-occurrences.yml b/.github/workflows/update-mongodb-occurrences.yml index 422e912..cd3dcb2 100644 --- a/.github/workflows/update-mongodb-occurrences.yml +++ b/.github/workflows/update-mongodb-occurrences.yml @@ -8,7 +8,7 @@ on: 'src/lib/**', 'src/ocorrencia.ts', '.github/workflows/update-mongodb-occurrences.yml', - 'referencias/sources.json' + 'referencias/occurrences.csv' ] workflow_dispatch: diff --git a/referencias/sources.json b/referencias/sources.json deleted file mode 100644 index b2d64c8..0000000 --- a/referencias/sources.json +++ /dev/null @@ -1,472 +0,0 @@ -[ - { - "ipt": "jbrj", - "baseUrl": "https://ipt.jbrj.gov.br/jbrj/", - "datasets": ["jbrj_rb"] - }, - { - "ipt": "reflora", - "baseUrl": "https://ipt.jbrj.gov.br/reflora/", - "datasets": [ - "alcb_herbarium", - "ase_herbarium", - "b_herb", - "cen", - "cepec_herbarium", - "cesj", - "e_hv", - "eac_herbarium", - "ebc", - "esa_herbarium", - "gh_herb", - "hrcb", - "hstm", - "hucp", - "huefs", - "huem", - "huemg", - "huenf", - "hufu_herbarium", - "hvasf", - "ian", - "icn", - "k_reflora", - "mac", - "mg_herbarium", - "moh", - "nyh", - "p_reflora", - "pc_herb", - "pmsp", - "rbr", - "real", - "s_reflora", - "sof", - "spf_herbarium", - "us_reflora", - "w_reflora", - "huni", - "ibge", - "ufg" - ] - }, - { - "ipt": "jabot", - "baseUrl": "https://ipt.jbrj.gov.br/jabot/", - "datasets": [ - "acam", - "afr", - "bhcb", - "brba", - "cap", - "cbpm", - "cgms", - "cor", - "cri", - "ddms", - "dvpr", - "ect", - "fcab", - "flor", - "furb", - "hatm", - "hb_herbarium", - "hbr", - "hcf", - "hcjs", - "hcp", - "hdjf", - "heph", - "herbam", - "hevb", - "hf_herbarium", - "hipe", - "hisa", - "hj_herbarium", - "hrb", - "hrj", - "hstm", - "hto", - "hub", - "hueg", - "huenf", - "hufsp", - "huni", - "hupg", - "hurb", - "huva", - "hvat", - "hvc", - "ibge", - "ibiuemg", - "ifrv", - "jabu", - "jar", - "jbb", - "joi", - "lag", - "lpf", - "lspf", - "lusc", - "mbm", - "mbml", - "mcca", - "mfs", - "mufal", - "nit", - "oupr", - "pab", - "pel", - "perd", - "r_herbarium", - "ra_lap", - "rbr", - "rfa", - "rffp", - "rn_herbarium", - "ron", - "rspf", - "sames", - "shpr", - "sjrp", - "slui", - "spsc", - "srbm", - "tepb", - "ualf", - "ufrn", - "ufrr", - "unip", - "unop", - "upcb", - "vies" - ] - }, - { - "ipt": "taxonline", - "baseUrl": "https://ipt.sibbr.gov.br/taxonline/", - "datasets": [ - "ufpr_dzup_coleoptera_01", - "ufpr_dzup_hemiptera_01", - "ufpr_dzup_diptera_01", - "ufpr_dzup_hymenoptera_01", - "cesp_ufpr_01", - "mzuel_ictiologia_01", - "uel_mzuel_peixes_01", - "herpetologia_mhnci_01", - "ictiologia_mhnci_01", - "ufpr_dzup_aphoidea_01", - "mhnci_05", - "ufpr_colpar_dpat_01", - "ufpr_dzup_lepidoptera_01", - "ufpr_dzup_trichoptera_01", - "dzup_ascidiacea_01", - "mzuel_herpetologia_01", - "ufpr_mastologia_01", - "ufpr_dzup_orthoptera_01", - "unila_ictiologia_01", - "ufpr_palotina_fishcast_001", - "ufpr_dzup_formicidae_01", - "utfpr_ictiologia_01", - "ufpr_lafma_01", - "mzuel_carcinologia_01", - "mzuel_mastozoologia_01", - "ufpr_dzup_dermaptera_01", - "ufpr_dzup_neuroptera_01", - "ufpr_dzup_megaloptera_01", - "ufpr_dzup_phasmida_02", - "unicentro_ciac_01", - "ufpr_dzup_plecoptera_01", - "mzuel_ornitologia_01", - "ufpr_dzup_psocoptera_02", - "ufpr_dzup_mantodea_01", - "ufpr_dzup_mecoptera_01" - ] - }, - { - "ipt": "mpeg", - "baseUrl": "https://ipt.sibbr.gov.br/goeldi/", - "datasets": [ - "museu_paraense_emilio_goeldi_ornithology_collection", - "museu_paraense_emilio_goeldi-hymenoptera_collection", - "museu_paraense_emilio_goeldi-hexapoda_collection", - "museuparaenseemiliogoeldi-collection-entomologiacoleoptera", - "museu_paraense_emilio_goeldi_mastozoology_collection", - "museu_paraense_emilio_goeldi-amphibia_collection", - "museu_paraense_emilio_goeldi_lepidoptera_collection", - "museu_paraense_emilio_goeldi-diptera_collection", - "museu_paraense_emilio_goeldi_ictiology_collection", - "museuparaenseemiliogoeldi-collection-aracnologiaaraneae", - "museu_paraense_emilio_goeldi-lacertilia_collection", - "museu_paraense_emilio_goeldi_ophidia_collection", - "museu_paraense_emilio_goeldi_ornithology_collection-skin", - "museu_paraense_emilio_goeldi-ornithology_collection_osteological", - "museu_paraense_emilio_goeldi-paleoinvertebrate_collection", - "museuparaenseemiliogoeldi-collection-aracnologiaopiliones", - "museu_paraense_emilio_goeldi-paleovertebrate_collection", - "museu_paraense_emilio_goeldi-crustacea_collection", - "museu_paraense_emilio_goeldi-mollusca_collection", - "museuparaenseemiliogoeldi-collection-aracnologiascorpiones", - "museu_paraense_emilio_goeldi-chelonia_collection_", - "museu_paraense_emilio_goeldi-annelida_collection", - "museu_paraense_emilio_goeldi_ornithology_collection_eggs", - "museu_paraense_emilio_goeldi-nematoda_collection", - "museuparaenseemiliogoeldi-collection-aracnologiaamblypygi", - "museu_paraense_emilio_goeldi-ornithology_collection_nests", - "museuparaenseemiliogoeldi-collection-aracnologiaacari2", - "museu_paraense_emilio_goeldi-crocodilia_collection", - "museu_paraense_emilio_goeldi-myriapoda_collection", - "museuparaenseemiliogoeldi-collection-aracnologiaricinulei", - "museuparaenseemiliogoeldi-collection-aracnologiauropygi", - "museuparaenseemiliogoeldi-collection-aracnologiaschizomida", - "museuparaenseemiliogoeldi-collection-aracnologiasolifugae", - "museu_paraense_emilio_goeldi-acanthocephala_collection" - ] - }, - { - "ipt": "speciesLink", - "baseUrl": "http://ipt1.cria.org.br/ipt/", - "datasets": [ - "bhzb", - "botu", - "botu-fungi", - "ce-ufpe", - "cepann", - "cnmt", - "cpap", - "cvrd", - "dcbu", - "dsec", - "dzub", - "eafm", - "ean", - "efc", - "fuel", - "hamab", - "has", - "has-algae", - "hbra", - "hcdal", - "hfsl", - "hfsl-fungos", - "hpan", - "hpbr", - "hpl", - "hst", - "htsa", - "huco", - "hucpe", - "hucs", - "hucs-liquenoteca", - "hucs-micoteca", - "huesb", - "hufabc", - "hufsj", - "husc", - "huto", - "iac", - "ipa", - "irai", - "ise", - "lirp", - "mac", - "mar", - "mbml-anfibios", - "mbml-aves", - "mbml-peixes", - "mbml-repteis", - "mcp", - "moss", - "mpuc", - "nup", - "nx_herbario", - "nx-fanerogamas", - "paca-agp", - "paca-algas", - "paca-bryophytes", - "paca-fungi", - "paca-liquens", - "peufr", - "rpsp", - "smdb", - "soro", - "sp_fanerogamas", - "sp-algae", - "sp-bryophyta", - "sp-fungi", - "spfr", - "spsf", - "uec", - "uesc", - "ufacpz", - "ufmt", - "unopa", - "urm", - "vic", - "zuec-amp", - "zuec-api", - "zuec-apl", - "zuec-apl", - "zuec-asc", - "zuec-ast", - "zuec-ave", - "zuec-biv", - "zuec-bra", - "zuec-bry", - "zuec-clb", - "zuec-cni", - "zuec-col", - "zuec-cph", - "zuec-cru", - "zuec-dip", - "zuec-ech", - "zuec-eph", - "zuec-gas", - "zuec-gch", - "zuec-hem", - "zuec-hym", - "zuec-lep", - "zuec-mam", - "zuec-nem", - "zuec-neu", - "zuec-nma", - "zuec-oph", - "zuec-ort", - "zuec-pis", - "zuec-pla", - "zuec-pol", - "zuec-rep" - ] - }, - - { - "ipt": "museuNacional", - "baseUrl": "https://ipt.sibbr.gov.br/mnrj/", - "datasets": [ - "mnrj_arachnidakury", - "arachnida_ordens_menores_do_museu_nacional", - "arachnida_opiliones_do_museu_nacional", - "mnrj_amphibia", - "ascidiacea_mn_ufrj", - "mnrj_ornitologia", - "brachiopoda_mn_ufrj", - "bryozoa_mn_ufrj", - "mnrj_cnidaria", - "mnrj_carcinologia", - "mnrj-echino", - "mammnufrj", - "nematoda_mn_ufrj", - "nematomorpha_mn_ufrj", - "nemertea_mn_ufrj", - "platyhelminthes_mn_ufrj", - "ufjr_polychaeta", - "ufrj-ib_porifera", - "porifera_mn_ufrj", - "mnrj_reptilia", - "sipuncula_mn_ufrj", - "tardigrada_mn_ufrj", - "mnrj_entomo_blattaria", - "mnrj_entomo_coleoptera_01", - "mnrj_entomo_diptera_01", - "mnrj_entomo_diptera_aquat", - "mnrj_entomo_hemiptera", - "mnrj_entomo_hymenoptera", - "mnrj_entomo_lepidoptera", - "mnrj_entomo_orthoptera", - "mnrj_entomo_outras_ordens", - "entomologica_1", - "mnrj_ictiologia", - "af_fireflies_br_01", - "mnrj_mollusca", - "myriapoda_do_museu_nacional" - ] - }, - { - "ipt": "inpa", - "baseUrl": "https://ipt.sibbr.gov.br/inpa/", - "datasets": [ - "inpa_acari", - "inpa_amblypygi", - "inpa_blattaria", - "inpa_chilopoda", - "inpa_crustacea", - "inpa_diplopoda", - "inpa_diptera", - "inpa_diptera", - "inpa_ephemenoptera", - "inpa_hemiptera", - "inpa_herpetologia", - "inpa_hymenoptera", - "inpa_ictiologia", - "inpa_myxozoa", - "inpa_nematoda", - "inpa_onychophora", - "inpa_opiliones", - "inpa_orthoptera", - "inpa_palpigradi", - "inpa_pauropoda", - "inpa_phasmatodea", - "inpa_porifera", - "inpa_pseudoscorpiones", - "inpa_ricinulei", - "inpa_schizomida", - "inpa_scorpiones", - "inpa_solifugae", - "inpa_symphyla", - "inpa_thelyphonida" - ] - }, - { - "ipt": "sibbr", - "baseUrl": "https://ipt.sibbr.gov.br/sibbr/", - "datasets": [ - "funed_01", - "npm_macae01", - "sibbr_lepufrrj", - "acarosibsp", - "cepsul_01", - "univates_artropodofauna_01", - "unisinos_aves_01", - "aves_cahz_ufpb01", - "unir_avi01", - "herpetologia_univates", - "univates_ictiologia", - "ufro_ictiofauna", - "ictiologia_abam", - "unisinos_mammalia_01", - "uenfmz", - "ufla_mamiferos_01", - "mammalia_via_seca", - "mammaliaumida", - "ufes_mam_01", - "mamiferos_mucs", - "univates_mastologia_01", - "univates_ornitologia", - "uft_unt_peixes_01", - "ucs_peixes_01", - "ufes_tec_01", - "lissamphibia", - "unicamp_celei_01", - "helminthos", - "zufmsamp", - "zufmscru", - "zufmshem", - "zufmsmol", - "zufmsort", - "zufmspis", - "zufmsrep", - "coleoptera", - "scarabaeoidea", - "zufmsche", - "zufmschi", - "zufmsdip", - "zufmsfos", - "zufmshym", - "zufmsnem", - "zufmstec", - "reptiliaufmt", - "uepa_34", - "uel_mzuel_peixes" - ] - } -] diff --git a/src/ocorrencia.ts b/src/ocorrencia.ts index 19f8aa5..7e1eb36 100644 --- a/src/ocorrencia.ts +++ b/src/ocorrencia.ts @@ -1,6 +1,7 @@ import { MongoClient } from 'npm:mongodb' import { calculateObjectSize } from 'npm:bson' import cliProgress from 'npm:cli-progress' +import Papa from 'npm:papaparse' import { getEml, processaEml, processaZip, type DbIpt } from './lib/dwca.ts' @@ -40,9 +41,18 @@ async function safeInsertMany( } } -const iptSources = await Deno.readTextFile('./referencias/sources.json').then( - (contents) => JSON.parse(contents) -) +type IptSource = { + nome: string + repositorio: string + kingdom: 'Animalia' | 'Plantae' | 'Fungi' + tag: string + url: string +} +const { data: iptSources } = (await Deno.readTextFile( + './referencias/occurrences.csv' +).then((contents) => Papa.parse(contents, { header: true }))) as { + data: IptSource[] +} const client = new MongoClient(Deno.env.get('MONGO_URI') as string) await client.connect() @@ -79,80 +89,71 @@ await Promise.all([ ]) ]) -for (const { ipt: iptName, baseUrl, datasets } of iptSources) { - for (const set of datasets) { - if (!set) continue - console.debug(`Processing ${set}`) - const eml = await getEml(`${baseUrl}eml.do?r=${set}`) - const ipt = processaEml(eml) - const dbVersion = ((await iptsCol.findOne({ _id: ipt.id })) as DbIpt | null) - ?.version - if (dbVersion === ipt.version) { - console.debug(`${set} already on version ${ipt.version}`) - continue - } - console.log(`Version mismatch: DB[${dbVersion}] vs REMOTE[${ipt.version}]`) - console.debug(`Downloading ${set} [${baseUrl}archive.do?r=${set}]`) - const ocorrencias = await processaZip( - `${baseUrl}archive.do?r=${set}`, - true, - 5000 - ) - console.debug(`Cleaning ${set}`) - console.log( - `Deleted ${ - (await ocorrenciasCol.deleteMany({ iptId: ipt.id })).deletedCount - } entries` - ) - const bar = new cliProgress.SingleBar( - {}, - cliProgress.Presets.shades_classic - ) - bar.start(ocorrencias.length, 0) - for (const batch of ocorrencias) { - if (!batch || !batch.length) break - bar.increment(batch.length - Math.floor(batch.length / 4)) - await safeInsertMany( - ocorrenciasCol, - batch.map((ocorrencia) => { - const canonicalName = [ - ocorrencia[1].genus, - ocorrencia[1].genericName, - ocorrencia[1].subgenus, - ocorrencia[1].infragenericEpithet, - ocorrencia[1].specificEpithet, - ocorrencia[1].infraspecificEpithet, - ocorrencia[1].cultivarEpiteth - ] - .filter(Boolean) - .join(' ') - return { - iptId: ipt.id, - ipt: iptName, - canonicalName, - flatScientificName: ( - (ocorrencia[1].scientificName as string) ?? canonicalName - ) - .replace(/[^a-zA-Z0-9]/g, '') - .toLocaleLowerCase(), - ...ocorrencia[1] - } - }), - { - ordered: false +for (const { repositorio, kingdom, tag, url } of iptSources) { + console.debug(`Processing ${repositorio}:${tag}`) + const eml = await getEml(`${url}eml.do?r=${tag}`) + const ipt = processaEml(eml) + const dbVersion = ((await iptsCol.findOne({ _id: ipt.id })) as DbIpt | null) + ?.version + if (dbVersion === ipt.version) { + console.debug(`${repositorio}:${tag} already on version ${ipt.version}`) + continue + } + console.log(`Version mismatch: DB[${dbVersion}] vs REMOTE[${ipt.version}]`) + console.debug(`Downloading ${repositorio}:${tag} [${url}archive.do?r=${tag}]`) + const ocorrencias = await processaZip(`${url}archive.do?r=${tag}`, true, 5000) + console.debug(`Cleaning ${repositorio}:${tag}`) + console.log( + `Deleted ${ + (await ocorrenciasCol.deleteMany({ iptId: ipt.id })).deletedCount + } entries` + ) + const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic) + bar.start(ocorrencias.length, 0) + for (const batch of ocorrencias) { + if (!batch || !batch.length) break + bar.increment(batch.length - Math.floor(batch.length / 4)) + await safeInsertMany( + ocorrenciasCol, + batch.map((ocorrencia) => { + const canonicalName = [ + ocorrencia[1].genus, + ocorrencia[1].genericName, + ocorrencia[1].subgenus, + ocorrencia[1].infragenericEpithet, + ocorrencia[1].specificEpithet, + ocorrencia[1].infraspecificEpithet, + ocorrencia[1].cultivarEpiteth + ] + .filter(Boolean) + .join(' ') + return { + iptId: ipt.id, + ipt: repositorio, + canonicalName, + flatScientificName: ( + (ocorrencia[1].scientificName as string) ?? canonicalName + ) + .replace(/[^a-zA-Z0-9]/g, '') + .toLocaleLowerCase(), + ...ocorrencia[1] } - ) - bar.increment(Math.floor(batch.length / 4)) - } - bar.stop() - console.debug(`Inserting IPT ${set}`) - const { id: _id, ...iptDb } = ipt - await iptsCol.updateOne( - { _id: ipt.id }, - { $set: { _id, ...iptDb, tag: set, ipt: iptName } }, - { upsert: true } + }), + { + ordered: false + } ) + bar.increment(Math.floor(batch.length / 4)) } + bar.stop() + console.debug(`Inserting IPT ${repositorio}:${tag}`) + const { id: _id, ...iptDb } = ipt + await iptsCol.updateOne( + { _id: ipt.id }, + { $set: { _id, ...iptDb, tag, ipt: repositorio, kingdom } }, + { upsert: true } + ) } + console.debug('Done') client.close() From 0e35d4131e61cfa86d9b57e1ad20a6339aaf6e25 Mon Sep 17 00:00:00 2001 From: Henrique Pinheiro Date: Tue, 6 Feb 2024 18:04:34 -0300 Subject: [PATCH 2/2] =?UTF-8?q?Cria=20geoPoint=20para=20ocorr=C3=AAncias?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #60 --- src/ocorrencia.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/ocorrencia.ts b/src/ocorrencia.ts index 7e1eb36..f6ac4fd 100644 --- a/src/ocorrencia.ts +++ b/src/ocorrencia.ts @@ -116,6 +116,23 @@ for (const { repositorio, kingdom, tag, url } of iptSources) { await safeInsertMany( ocorrenciasCol, batch.map((ocorrencia) => { + if (ocorrencia[1].decimalLatitude && ocorrencia[1].decimalLongitude) { + const latitude = +ocorrencia[1].decimalLatitude + const longitude = +ocorrencia[1].decimalLongitude + if ( + !isNaN(latitude) && + !isNaN(longitude) && + latitude >= -90 && + latitude <= 90 && + longitude >= -180 && + longitude <= 180 + ) { + ocorrencia[1].geoPoint = { + type: 'Point', + coordinates: [longitude, latitude] + } + } + } const canonicalName = [ ocorrencia[1].genus, ocorrencia[1].genericName,