From 6049fcaacf51ce4846bd2a057082263eb6a560ed Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 22 Nov 2023 16:33:45 +0000 Subject: [PATCH] fix(ci): add `pre-commit` to check `R/` path and apply --- .github/workflows/ci.yaml | 4 +- .pre-commit-config.yaml | 2 +- R/LCAT/Data_Processing_todf.R | 51 +- R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R | 464 +++++++++--------- R/README.md | 8 +- R/Resampling/Resampling.HADs.inR.R | 22 +- R/bias-correction-methods/ThreeCitiesQmap.RMD | 20 +- R/bias-correction-methods/WIP_EQM.RMD | 35 +- .../apply_qmapQuant_to_crpd_df_fn.R | 43 +- .../WIP Comparing HADs grids.Rmd | 35 +- .../WIP-Comparing-HADs-grids.Rmd | 35 +- .../WIP-Comparing-HADs-grids.md | 14 +- .../converting_city_crops_to_df.R | 118 ++--- README.md | 2 +- 14 files changed, 420 insertions(+), 433 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8ca3d40a..38e4d4e9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,11 +14,11 @@ env: on: pull_request: - branches: ['main', 'docker-config'] + branches: ['main', 'ruth-notebook-for-workshop'] paths-ignore: ['docs/**'] push: - branches: ['main', 'docker-config'] + branches: ['main', 'ruth-notebook-for-workshop'] paths-ignore: ['docs/**'] concurrency: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd658c3a..4ac55731 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: "R|data" +exclude: "data" repos: - repo: https://github.com/psf/black rev: "23.9.1" diff --git a/R/LCAT/Data_Processing_todf.R b/R/LCAT/Data_Processing_todf.R index 89d15d15..fdea70a4 100644 --- a/R/LCAT/Data_Processing_todf.R +++ b/R/LCAT/Data_Processing_todf.R @@ -1,12 +1,12 @@ -## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## +## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## Data Processing UK HadsGrid CPM data from raster to data.frame ## 0. 
About -# Many of the methods packages for applying bias correction to climate date take as input vector, matrix or data.frame, +# Many of the methods packages for applying bias correction to climate date take as input vector, matrix or data.frame, # rather than a spatial file or raster -# This may not be the most effective way of analysising this data +# This may not be the most effective way of analysising this data rm(list=ls()) @@ -47,16 +47,16 @@ cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) - foreach(x = x, + foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { + .errorhandling = 'pass') %dopar% { f <- paste0(dd,'shapefiles/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom.shp') UK.shape <-vect(f) crop.area <- UK.shape[which(UK.shape$nuts118cd==x)] - + var <- c("rainfall", "tasmax", "tasmin", "tas") - + hads19802010_read_crop_df_write(var = var, fp = paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), name1 = "HadsUK", @@ -64,45 +64,42 @@ registerDoParallel(cl) crop.area=crop.area, cropname=x, rd=rd) - + } - - stopCluster(cl) + + stopCluster(cl) gc() ### Processing CPM - + x <- regioncd - + cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) - - foreach(x = x, + + foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + f <- paste0(dd,'shapefiles/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom.shp') UK.shape <-vect(f) crop.area <- UK.shape[which(UK.shape$nuts118cd==x)] - + runs <- c("05", "07", "08", "06") var <- c("tasmax", "tasmin","pr", "tas") rd <- paste0(dd, "Interim/CPM/Data_as_df") - + cpm_read_crop_df_write(runs=runs, var=var, fp=paste0(dd, "Reprojected/UKCP2.2/"), - year1=2060, year2=2080, #Ran sep for each year segment - name1="CPM", crop = T, + year1=2060, year2=2080, #Ran sep for each year segment + name1="CPM", crop = T, crop.area = crop.area, cropname = x, rd=paste0(dd, "Interim/CPM/Data_as_df")) - - + + } - - stopCluster(cl) + + stopCluster(cl) gc() - - - diff --git a/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R index 7ab0cac2..639aa7be 100644 --- a/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R +++ b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R @@ -1,5 +1,5 @@ #This version of this file was used to process LCAT files. 
Later version of this files will be renamed and split -##Loading data as created in 'Data_Processing_todf.R' +##Loading data as created in 'Data_Processing_todf.R' #Requires library(tidyverse) @@ -11,7 +11,7 @@ apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to var, #Meterological variables Runs){ - i <- region + i <- region for(r in Runs){ for(v in var){ @@ -26,8 +26,8 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) @@ -36,12 +36,12 @@ for(r in Runs){ obs.df$x <- NULL obs.df$y <- NULL - #Remove the dates not in the cpm + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - - + + #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -49,44 +49,44 @@ for(r in Runs){ #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - #Subset file list to area + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - #Load in + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - - gc() - + + gc() + yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - #Load in + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols @@ -96,32 +96,32 @@ for(r in Runs){ names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - + remove("proj.df1") remove("proj.df2") ## **2. 
Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_") print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] @@ -146,8 +146,8 @@ for(r in Runs){ library(qmap) qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") @@ -170,232 +170,232 @@ for(r in Runs){ p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - + gc(reset=TRUE) - + } else { - -#### Precipitation - the HADs variable has is called 'rainfall' + +#### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" #Subset to Area #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - - + + #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming 
convention runs + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + gc() - + yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - + remove("proj.df1") remove("proj.df2") - + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_") print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - + ## **3. Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_") print(p) - - + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. 
- - + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_") print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_") print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - + gc(reset=TRUE) - + } } } } -###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! +###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - Runs, #Runs in form 'Run08'- matched input + Runs, #Runs in form 'Run08'- matched input N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 - - i <- region + + i <- region N.new.segments<- N.new.segments Runs <- Runs var <- var - + for(r in Runs){ for(v in var){ for(y in 1:N.new.segments){ if(v!="pr"){ dd <- "/mnt/vmfileshare/ClimateData/" - + #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the 
first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + #Subset the dataframe iteratively depending on y nrows.seg <- nrow(cal.df)/N.new.segments y_1 <- y-1 @@ -403,176 +403,176 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t nr1 <- round(nrows.seg*y_1) + 1 nr2 <- round(nrows.seg*y) cal.df <- cal.df[nr1:nr2,] - + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - + remove("proj.df1") remove("proj.df2") - - + + #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl(v,obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - + #Subset to the rows which are in above (some will be missing) obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - + gc() - - + + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - - + + ## **3. 
Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - + library(qmap) qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - + gc(reset=TRUE) - - + + } else { - - #### Precipitation - the HADs variable has is called 'rainfall' + + #### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" - + #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + #Subset the dataframe iteratively depending on y nrows.seg <- nrow(cal.df)/N.new.segments 
y_1 <- y-1 @@ -580,138 +580,136 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t nr1 <- round(nrows.seg*y_1) + 1 nr2 <- round(nrows.seg*y) cal.df <- cal.df[nr1:nr2,] - - + + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - + remove("proj.df1") remove("proj.df2") - - + + #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - + #Subset to the rows which are in above (some will be missing) obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - + gc() - - + + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - - + + ## **3. Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. 
doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - + qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - + + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - + gc(reset=TRUE) - - - + + + } } } } } - - diff --git a/R/README.md b/R/README.md index 390d95b9..6552b697 100644 --- a/R/README.md +++ b/R/README.md @@ -1,13 +1,13 @@ # Methods implemented in R -- **/Resampling** - code for Resampling data to different extents (grid sizes) +- **/Resampling** - code for Resampling data to different extents (grid sizes) - **/bias-correction-methods** - bias correction methods implemented in R -- **/comparing-r-and-python** - Comparing various pipeline aspects between R and python +- **/comparing-r-and-python** - Comparing various pipeline aspects between R and python ``` r -og_c <- terra::crop(og, scotland, snap="in") +og_c <- terra::crop(og, scotland, snap="in") plot(og_c$tasmax_1) ``` @@ -181,7 +181,7 @@ Ok there are some differences that I can see from the plot between the two resampled files! 
``` r -## Cropping to a small area to compare with the same orginal HADS file +## Cropping to a small area to compare with the same orginal HADS file i <- rast() ext(i) <- c(200000, 210000, 700000, 710000) ``` @@ -246,7 +246,7 @@ plot(og_ci$tasmax_1) ![](WIP-Comparing-HADs-grids_files/figure-gfm/unnamed-chunk-10-1.png) ``` r -ukcp_c <- terra::crop(ukcp.r, i) +ukcp_c <- terra::crop(ukcp.r, i) plot(ukcp_c$`tasmax_rcp85_land-cpm_uk_2.2km_01_day_19991201-20001130_31`) ``` diff --git a/R/pre-processing/converting_city_crops_to_df.R b/R/pre-processing/converting_city_crops_to_df.R index df9cee3c..7fa85d30 100644 --- a/R/pre-processing/converting_city_crops_to_df.R +++ b/R/pre-processing/converting_city_crops_to_df.R @@ -2,12 +2,12 @@ rm(list=ls()) -# Most input to bias correction methods in R need dfs +# Most input to bias correction methods in R need dfs # Might be better in future to seperate it out differently (ie not run hads and cpm in same loop, or by variable ) library(qmap) library(terra) -library(tidyverse) +library(tidyverse) library(doParallel) dd <- "/mnt/vmfileshare/ClimateData/" @@ -19,55 +19,55 @@ x <- c("London", "Manchester", "Glasgow") # Where to write the results (note subfolder added as name of city/x above) rd <- paste0(dd, "Interim/CPM/Data_as_df/three.cities/") -#file locations for cropped versions of CPM +#file locations for cropped versions of CPM fps <- paste0(dd, "Cropped/three.cities/CPM/") fps <- paste0(fps,x) -#Run the conversion to df in parallel +#Run the conversion to df in parallel cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) -foreach(x = x, +foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + fp <- fps[grepl(x, fps)] fp <- paste0(fp, "/") files <- list.files(fp) files.paths.all <- paste0(fp, files) - + #group in runs and in variable - + for(v in var){ for(r in run){ rr <- paste0("_",r,"_") files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(rr, files.paths.all)&grepl("CPM", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + fn <- paste0(rd, x, "/", v, "_","Run",r, ".csv") write.csv(df, fn, row.names = F) - - - } + + + } } } -stopCluster(cl) +stopCluster(cl) gc() @@ -78,74 +78,74 @@ rd <- paste0(dd, "Interim/HadsUK/Data_as_df/three.cities/") fps <- paste0(dd, "Cropped/three.cities/Hads.updated360/") fps <- paste0(fps,x) -#Run the conversion to df in parallel +#Run the conversion to df in parallel cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) -foreach(x = x, +foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + fp <- fps[grepl(x, fps)] fp <- paste0(fp, "/") files <- list.files(fp) files.paths.all <- paste0(fp, files) - + #group in runs and in variable - + for(v in var){ if(v!="pr"){ - + files.paths <- files.paths.all[grepl(v, files.paths.all)] - - # Read in 1st 
runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + fn <- paste0(rd, x, "/", v, ".csv") write.csv(df, fn, row.names = F) - - + + } else { - + files.paths <- files.paths.all[grepl("rainfall", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - - + + fn <- paste0(rd, x, "/", v, ".csv") write.csv(df, fn, row.names = F) - + } } } -stopCluster(cl) +stopCluster(cl) gc() diff --git a/README.md b/README.md index ac4a1b22..409dadcf 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ The UK Climate Projections 2018 (UKCP18) dataset offers insights into the potent [HadUK-Grid](https://www.metoffice.gov.uk/research/climate/maps-and-data/data/haduk-grid/haduk-grid) is a comprehensive collection of climate data for the UK, compiled from various land surface observations across the country. This data is organized into a uniform grid to ensure consistent coverage throughout the UK at up to 1km x 1km resolution. The dataset, spanning from 1836 to the present, includes a variety of climate variables such as air temperature, precipitation, sunshine, and wind speed, available on daily, monthly, seasonal, and annual timescales. ### Geographical Dataset -The geographical dataset can be used for visualising climate data. It mainly includes administrative boundaries published by the Office for National Statistics (ONS). The dataset is sharable under the [Open Government Licence v.3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and is available for download via this [link](https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022/FeatureServer/replicafilescache/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580.zip). We include a copy in the `data/Geofiles` folder for convenience. In addition, the clips for three cities' boundaries from the same dataset are copied to `three.cities` subfolder. +The geographical dataset can be used for visualising climate data. It mainly includes administrative boundaries published by the Office for National Statistics (ONS). 
The dataset is sharable under the [Open Government Licence v.3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and is available for download via this [link](https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022/FeatureServer/replicafilescache/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580.zip). We include a copy in the `data/Geofiles` folder for convenience. In addition, the clips for three cities' boundaries from the same dataset are copied to `three.cities` subfolder. ## Why Bias Correction?
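For reference, the empirical quantile mapping pattern that recurs throughout the LCAT and three-cities scripts touched in this patch comes down to the `fitQmapQUANT`/`doQmapQUANT` pair from the `qmap` package. The sketch below is an illustration only: it uses synthetic vectors rather than any of the project data frames, and assumes the `qmap` package is installed.

``` r
# Minimal empirical quantile mapping sketch with synthetic data (not project files)
library(qmap)

set.seed(1)
obs <- rgamma(360 * 30, shape = 2, scale = 3)  # stand-in "observed" daily series
mod <- rgamma(360 * 30, shape = 2, scale = 4)  # stand-in "modelled" daily series

# Fit the empirical CDF mapping over the calibration period
qm.fit <- fitQmapQUANT(obs, mod,
                       wet.day = FALSE, # TRUE (or a numeric threshold) for precipitation
                       qstep   = 0.01,
                       nboot   = 1)     # bootstrap samples for the observed quantiles

# Apply the mapping to the modelled series (linear or tricubic interpolation)
mod.bc.linear <- doQmapQUANT(mod, qm.fit, type = "linear")
mod.bc.tricub <- doQmapQUANT(mod, qm.fit, type = "tricub")
```

In the scripts above, `wet.day = FALSE` is used for the temperature variables and `wet.day = TRUE` for precipitation, so that modelled values below the observed wet-day threshold are set to zero.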