Skip to content

Commit

Permalink
update, fixed, notes, feb 2024 snapshot
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanbeaudette committed Feb 15, 2024
1 parent 46bab97 commit e70e895
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 23 deletions.
4 changes: 2 additions & 2 deletions MIR/art-from-MIR-spectral-DB.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ parseSpectra <- function(.txt, compressed = TRUE) {
base.path <- 'E:/MIR'

# compressed spectra
db.file <- file.path(base.path, 'MIR-compact-gz.sqlite')
# db.file <- file.path(base.path, 'MIR-compact-gz.sqlite')

# plain-text spectra
db.file <- file.path(base.path, 'MIR-compact-text.sqlite')
# db.file <- file.path(base.path, 'MIR-compact-text.sqlite')

# full
db.file <- file.path(base.path, 'MIR-compact.sqlite')
Expand Down
20 changes: 14 additions & 6 deletions MIR/build-compact-spectral-library.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dbExecute(db, .sql)
f <- list.files(wd.path, full.names = TRUE)

# 2023-01-07: 1594 collections
# 2024-02-14: 1386 collections
length(f)

## select a wavenumber template
Expand All @@ -92,8 +93,8 @@ wnTemplate <- seq(from = 4000, to = 600, by = -2)

## iterate over collections / write intermediate pieces to files
## parallel safe
# plain text: 6 minutes
# gz compression: 6 minutes
# plain text: 2 minutes
# gz compression: 2 minutes

plan(multisession)

Expand All @@ -114,7 +115,7 @@ plan(sequential)
## load intermediate files into DB table
## not likely safe to do in parallel
# plain text: ~2 minutes
# gz compressed: ~30 seconds
# gz compressed: ~10 seconds

.rds <- list.files(path = staging.dir, full.names = TRUE)

Expand All @@ -137,7 +138,7 @@ indexTable('mir_metadata', c('collection', 'sample'))
indexTable('mir_spec', 'sample')

## cleanup
# ~ 5 minutes
# ~ 2 minutes
dbExecute(db, 'VACUUM;')
dbExecute(db, 'VACUUM;')

Expand Down Expand Up @@ -171,15 +172,18 @@ dbDisconnect(db)
# uncompressed / txt: 9.5GB
# gzipped / txt: 4.2GB

# uncompressed / gz(txt): 4.5GB
# gzipped / gz(txt): 4.4GB
# uncompressed / gz(txt): 2.6GB
# gzipped / gz(txt): ?? GB (size not recorded)


# Dec 2022 MIR Library snapshot
# 650,714 Files
# * raw folders/files: 105GB
# * single zip archive: 88GB

# Feb 2024 Public MIR Library
# 373,818 Files
# * raw folders/files: 61GB

## compress final sqlite DB
# ~ 10 minutes
Expand All @@ -192,6 +196,10 @@ system.time(
unlink(staging.dir, recursive = TRUE)


## cleanup
rm(list = ls())
gc(reset = TRUE)




Expand Down
26 changes: 15 additions & 11 deletions MIR/collect-wavenumber-metadata.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@

library(furrr)
library(purrr)

## functions waiting for an R package
source('../code/snapshot-preparation/snapshot-functions.R')
Expand Down Expand Up @@ -29,28 +30,30 @@ system.time(
plan(sequential)

# test for error conditions
e <- whochsapply(z, '[', 'error')
e <- sapply(z, '[', 'error')
which(!sapply(e, is.null))

f[1054]



# extract results
z <- sapply(z, '[', 'result')

# flatten
z <- do.call('rbind', z)
row.names(z) <- NULL

## 2023-01-09: 325369 rows
## 2024-02-14: 186893 rows
nrow(z)
str(z)

# double check that there are three distinct WN sequences
stopifnot(length(unique(z$wn)) == 3)

# id for simpler description
z$wnID <- factor(z$wn, labels = 1:3)

## frequency
# 1 2 3
# 119871 205470 28
# 1 2 3
# 2472 55052 129369
table(z$wnID)


Expand Down Expand Up @@ -81,7 +84,7 @@ str(w, 1)
# List of 3
# $ : chr [1:1765] "4002" "4000" "3998" "3996" ...
# $ : chr [1:1765] "4002" "4000" "3998" "3996" ...
# $ : chr [1:3578] "7498" "7496" "7494" "7492" ...
# $ : chr [1:1765] "4001" "4000" "3998" "3996" ...

# ~ 19 elements in the integer wn sequence are off by 1
idx <- which(w[[1]] != w[[2]])
Expand Down Expand Up @@ -114,11 +117,12 @@ cbind(




## 2024-02-14: no longer a problem in public data
## what is going on with the strange sequence 600-7498?
# 28 samples
nrow(x <- z[z$wnID == 3, ])
knitr::kable(x[, 1:2], row.names = FALSE)
# nrow(x <- z[z$wnID == 3, ])
# knitr::kable(x[, 1:2], row.names = FALSE)

range(as.numeric(strsplit(x$wn[1], split = ',', fixed = TRUE)[[1]]))

## cleanup
Expand Down
Binary file added MIR/errors.rds
Binary file not shown.
6 changes: 6 additions & 0 deletions MIR/main.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,9 @@ source('collect-wavenumber-metadata.R')
source('build-compact-spectral-library.R')


## TODO:
## 1. create spectra -- sample LUT
## 2. index
## 3. simplify code if possible
## 4. wrapper / helper functions

2 changes: 1 addition & 1 deletion MIR/pre-process-collections.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ system.time(
plan(sequential)

# keep track of errors

saveRDS(e, file = 'errors.rds')

## cleanup
rm(list = ls())
Expand Down
Binary file modified MIR/pretty-spectra-dist-from-median-02.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified MIR/results/collection-metadata.csv.gz
Binary file not shown.
6 changes: 3 additions & 3 deletions MIR/results/wn-LUT.csv

Large diffs are not rendered by default.

0 comments on commit e70e895

Please sign in to comment.