From d19e83c35f4c62aea7834c5fcea51c683c8d2fff Mon Sep 17 00:00:00 2001
From: AnnaKS123 <aks85@cam.ac.uk>
Date: Mon, 7 Aug 2023 15:40:31 +0100
Subject: [PATCH] added comments

---
 R/error_handling.R                   |  1 +
 R/health_burden.R                    | 23 ++++++---
 R/injury_death_to_yll.R              | 76 ++++++++++++++++++++++------
 R/population_attributable_fraction.R |  2 +-
 multi_city_script.R                  |  9 ++--
 5 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/R/error_handling.R b/R/error_handling.R
index b768b22e..ad1044f5 100644
--- a/R/error_handling.R
+++ b/R/error_handling.R
@@ -1,3 +1,4 @@
+#' CURRENTLY NOT BEING CALLED
 #' @export
 error_handling <- function(code, function_name, variable_name) {
   output <-
diff --git a/R/health_burden.R b/R/health_burden.R
index 292a84e1..83a815b8 100644
--- a/R/health_burden.R
+++ b/R/health_burden.R
@@ -20,8 +20,8 @@
 #'        - find the sum of the relative risks (RR) for the specific disease for each age and sex category for the non-reference scenario
 #'        - calculate the PIF (potential impact fraction), i.e the proportional change in the sum of relative risks between the reference
 #'          and the on-reference scenario for each age and sex category
-#'        - calculate the health burden for the non-reference scenario by multiplying the current burden of disease by the PIF 
-#'          (combine_health_and_pif.R)
+#'        - calculate the health burden (deaths and ylls) for the non-reference scenario compared to the reference scenario by multiplying
+#'          the current burden of disease by the PIF (combine_health_and_pif.R)
 #'      - if confidence intervals are required, loop through the upper and lower confidence interval limits
 #'        and calculate the health burden for deaths and YLLs using the upper and lower confidence relative risks. If no upper and 
 #'        lower relative risk values exist, use the mean value instead
@@ -127,12 +127,14 @@ health_burden <- function(ind_ap_pa, conf_int = F, combined_AP_PA = T){
         # calculate PIF for this scenario and convert to vector
         pif_scen <- ((pif_ref[,2] - pif_temp[,2]) / pif_ref[,2]) %>% pull() 
         
-        # Calculate ylls by multiplying current burden of disease for particular disease by the PIF value, i.e the expected change
+        #Calculate the difference in ylls between the non-reference and the reference scenario
+        # by multiplying current burden of disease for particular disease by the PIF value, i.e the expected change
         # between the reference scenario and the given scenario for each age and sex category
         yll_dfs <- combine_health_and_pif(pif_values=pif_scen, hc = gbd_ylls_disease)
         ylls[[yll_name]] <- yll_dfs
        
-         # Calculate deaths by multiplying current burden of disease for particular disease by the PIF value, i.e the expected change
+        # Calculate the difference in deaths between the non-reference and the reference scenario
+        # by multiplying current burden of disease for particular disease by the PIF value, i.e the expected change
         # between the reference scenario and the given scenario  for each age and sex category
         death_dfs <- combine_health_and_pif(pif_values=pif_scen,hc=gbd_deaths_disease)
         deaths[[deaths_name]] <- death_dfs
@@ -199,10 +201,15 @@ health_burden <- function(ind_ap_pa, conf_int = F, combined_AP_PA = T){
 
 
 
-#' Join disease health burden and injury
+#' Join disease health burden and injury data
 #' 
 #' Join the two data frames for health burden: that from disease, and that from road-traffic injury
 #' 
+#' This function performs the following steps:
+#'  - extract the yll and deaths data from the combined AP and PA pathways
+#'  - extract the yll and deaths data from the injury data
+#'  - create one dataframe for yll and one for deaths containing all the AP, PA and injury data
+#' 
 #' @param ind_ap_pa list (deaths, YLLs) of data frames of all demographic groups' burdens for diseases
 #' @param inj list (deaths, YLLs) of data frames of all demographic groups' burdens for road-traffic injury
 #' 
@@ -213,11 +220,11 @@ join_hb_and_injury <- function(ind_ap_pa,inj){
   
   deaths <- ind_ap_pa$deaths
   ylls <- ind_ap_pa$ylls
-  # Select deaths columns
+  # Select deaths columns from injury data
   inj_deaths <- dplyr::select(inj, c(age_cat, sex, contains("deaths")))
-  # Select yll columns
+  # Select yll columns from injury data
   inj_ylls <- dplyr::select(inj, c(age_cat, sex, contains("yll")))
-  # Join injuries data to global datasets
+  # Join injuries data to global deaths and yll datasets
   deaths <- dplyr::left_join(deaths, inj_deaths, by = c("age_cat", "sex"))
   ylls <- dplyr::left_join(ylls, inj_ylls, by = c("age_cat", "sex"))
   list(deaths=deaths,ylls=ylls)
diff --git a/R/injury_death_to_yll.R b/R/injury_death_to_yll.R
index 50e4c2b6..0bce841d 100644
--- a/R/injury_death_to_yll.R
+++ b/R/injury_death_to_yll.R
@@ -1,37 +1,68 @@
-#' Map injury death burden to YLL burden
+#' Map injury death burden to YLL (years of life lost) burden
 #' 
-#' Calculated the YLL burden from the death burden of injury based on the ratio in the GBD data.
+#' Calculate the YLL burden from the death burden of injury based on the ratio in the GBD data.
 #' 
-#' @param injuries data frame of injury deaths
+#' The function performs the following steps:
+#'  - join the estimated injury deaths with the global burden of disease (GBD) injury data by age and sex
+#'  - multiply the estimated injury deaths by the yll to injury death ratio in the GBD data to get YLL
+#'    from the estimated injury deaths
+#'  - extract and create matrices for deaths and ylls with one column for each scenario
+#'  - create dataframe A with ylls and deaths of reference scenario
+#'  - create dataframe B showing the differences in deaths and yll for each non-reference
+#'    scenario to the reference scenario
+#'  - if confidence intervals are required:
+#'    - create dataframe with ylls and deaths of reference scenario using both the lower and upper 
+#'      relative risk boundary values
+#'    - create dataframe showing the differences in deaths and yll for each non-reference
+#'      scenario to the reference scenario using both the lower and upper relative
+#'      risk boundary values
+#'    - add the confidence upper and lower interval boundary values to the two output dataframes A and B
+#' 
+#' 
+#' @param injuries data frame of injury deaths by age and sex category for each scenario incl. baseline 
 #' 
 #' @return list of injury deaths and YLLs (which are differences from reference scenario) plus the values in the reference scenario.
 #' 
 #' @export
 injury_death_to_yll <- function(injuries){
   
-  # injuries is a tibble, GBD_INJ_YLL is a data.frame, returns a tibble
+  
+  # join the injury deaths with the YLLs lost due to injuries in the global burden of disease data
   joined_injury <- dplyr::left_join(injuries, GBD_INJ_YLL[,c('sex_age','sex','yll_dth_ratio')], by=c("sex_age",'sex'))
   
+  # multiply the injury deaths by the yll to injury death ratio in the global burden of disease injury data
   joined_injury$YLL <- joined_injury$Deaths*joined_injury$yll_dth_ratio
+  # extract columns of interest
   death_and_yll <- dplyr::select(joined_injury, c('age_cat','sex','scenario','Deaths','YLL'))
   
+  # extract and create matrices for deaths and ylls with one column for each scenario
   x_deaths <- dplyr::select(death_and_yll, -YLL)
-  x_deaths <- spread(x_deaths,scenario, Deaths)
+  x_deaths <- spread(x_deaths,scenario, Deaths) # create one column for age_cat, sex and each scenario (incl baseline) for deaths
   x_yll <- dplyr::select(death_and_yll, -Deaths)
-  x_yll <- spread(x_yll,scenario, YLL)
+  x_yll <- spread(x_yll,scenario, YLL) # create one column for age_cat, sex and each scenario (incl baseline) for ylls
   
+  # set reference and other scenarios
   ref_scen <- REFERENCE_SCENARIO
-  if (REFERENCE_SCENARIO == 'Baseline'){
-    ref_scen = 'baseline'} 
+  if (REFERENCE_SCENARIO == 'Baseline'){ref_scen = 'baseline'} 
   ref_scen_index <- which(SCEN==ref_scen)
   calc_scen <- SCEN[SCEN!=ref_scen]
+  # find indexes of columns in x_deaths  (and therefore also x_yll) data that belong to the non-reference scenarios
   calc_scen_index <- which(colnames(x_deaths)%in%calc_scen)
   
+  # create dataframe with ylls and deaths of reference scenario
   ref_injuries <- as.data.frame(cbind(x_deaths[,1:2],deaths=x_deaths[[ref_scen]],ylls=x_yll[[ref_scen]]))
+  
+  # calculate the differences in injury deaths between the non-reference and the reference scenario
   deaths <- t(repmat(unlist(ref_injuries$deaths),NSCEN,1)) - x_deaths[,calc_scen_index,drop=F]
+  
+  # calculate the differences in injury ylls between the non-reference and the reference scenario
   ylls <- t(repmat(unlist(ref_injuries$ylls),NSCEN,1)) - x_yll[,calc_scen_index,drop=F]
+  
+  # create one dataframe showing the differences in deaths and yll for each non-reference scenario to the 
+  # reference scenario
   deaths_yll_injuries <- as.data.frame(cbind(as.data.frame(x_deaths[,1:2]),deaths, ylls))
   
+  # update columns names of deaths_yll_injuries to state whether the column shows injury deaths or ylls
   metric <- c("deaths", "yll")
   k <- 1
   for  (i in 1: 2)
@@ -41,31 +72,36 @@ injury_death_to_yll <- function(injuries){
     }
   
   # Repeat the above logic for lower and upper interval
-  
-  if (any(colnames(injuries) %in% c('Deaths_lb', 'Deaths_ub')))
-  {
+  if (any(colnames(injuries) %in% c('Deaths_lb', 'Deaths_ub'))){
     
-    # injuries is a tibble, GBD_INJ_YLL is a data.frame, returns a tibble
+    ## lower interval boundary
+    # join the injury deaths with the YLLs lost due to injuries in the global burden of disease data
     joined_injury_lb <- dplyr::left_join(injuries, GBD_INJ_YLL[,c('sex_age','sex','yll_dth_ratio')], by=c("sex_age",'sex'))
     
+    # multiply the injury deaths by the yll to injury death ratio in the global burden of disease injury data
     joined_injury_lb$YLL_lb <- joined_injury_lb$Deaths_lb * joined_injury_lb$yll_dth_ratio
     death_and_yll_lb <- dplyr::select(joined_injury_lb, c('age_cat','sex','scenario','Deaths_lb','YLL_lb'))
     
+    # extract and create matrices for deaths and ylls with one column for each scenario
     x_deaths_lb <- dplyr::select(death_and_yll_lb, -YLL_lb)
     x_deaths_lb <- spread(x_deaths_lb,scenario, Deaths_lb)
     x_yll_lb <- dplyr::select(death_and_yll_lb, -Deaths_lb)
     x_yll_lb <- spread(x_yll_lb,scenario, YLL_lb)
     
+    # set reference and other scenarios
     ref_scen_lb <- REFERENCE_SCENARIO
     ref_scen_index_lb <- which(SCEN==ref_scen_lb)
     calc_scen <- SCEN[SCEN!=ref_scen_lb]
     calc_scen_index <- which(colnames(x_deaths_lb)%in%calc_scen)
     
+    # create one dataframe showing the differences in deaths and yll for each non-reference scenario to the 
+    # reference scenario
     ref_injuries_lb <- as.data.frame(cbind(x_deaths_lb[,1:2],deaths_lb=x_deaths_lb[[ref_scen_lb]],ylls_lb=x_yll_lb[[ref_scen_lb]]))
     deaths_lb <- t(repmat(unlist(ref_injuries_lb$deaths_lb),NSCEN,1)) - x_deaths_lb[,calc_scen_index,drop=F]
     ylls_lb <- t(repmat(unlist(ref_injuries_lb$ylls_lb),NSCEN,1)) - x_yll_lb[,calc_scen_index,drop=F]
     deaths_yll_injuries_lb <- as.data.frame(cbind(as.data.frame(x_deaths_lb[,1:2]),deaths_lb, ylls_lb))
     
+    # update columns names of deaths_yll_injuries to state whether the column shows injury deaths or ylls
     metric <- c("deaths", "yll")
     k <- 1
     for  (i in 1: 2)
@@ -75,27 +111,36 @@ injury_death_to_yll <- function(injuries){
         k<-k+1
       }
     
-    # injuries is a tibble, GBD_INJ_YLL is a data.frame, returns a tibble
+    
+    
+    ## upper confidence interval boundary
+    # join the injury deaths with the YLLs lost due to injuries in the global burden of disease data
     joined_injury_ub <- dplyr::left_join(injuries, GBD_INJ_YLL[,c('sex_age','sex','yll_dth_ratio')], by=c("sex_age",'sex'))
     
+    # multiply the injury deaths by the yll to injury death ratio in the global burden of disease injury data
     joined_injury_ub$YLL_ub <- joined_injury_ub$Deaths_ub * joined_injury_ub$yll_dth_ratio
     death_and_yll_ub <- dplyr::select(joined_injury_ub, c('age_cat','sex','scenario','Deaths_ub','YLL_ub'))
     
+    # extract and create matrices for deaths and ylls with one column for each scenario
     x_deaths_ub <- dplyr::select(death_and_yll_ub, -YLL_ub)
     x_deaths_ub <- spread(x_deaths_ub,scenario, Deaths_ub)
     x_yll_ub <- dplyr::select(death_and_yll_ub, -Deaths_ub)
     x_yll_ub <- spread(x_yll_ub,scenario, YLL_ub)
     
+    # set reference and other scenarios
     ref_scen_ub <- REFERENCE_SCENARIO
     ref_scen_index_ub <- which(SCEN==ref_scen_ub)
     calc_scen <- SCEN[SCEN!=ref_scen_ub]
     calc_scen_index <- which(colnames(x_deaths_ub)%in%calc_scen)
-    
+
+    # create one dataframe showing the differences in deaths and yll for each non-reference scenario to the 
+    # reference scenario    
     ref_injuries_ub <- as.data.frame(cbind(x_deaths_ub[,1:2],deaths_ub=x_deaths_ub[[ref_scen_ub]],ylls_ub=x_yll_ub[[ref_scen_ub]]))
     deaths_ub <- t(repmat(unlist(ref_injuries_ub$deaths_ub),NSCEN,1)) - x_deaths_ub[,calc_scen_index,drop=F]
     ylls_ub <- t(repmat(unlist(ref_injuries_ub$ylls_ub),NSCEN,1)) - x_yll_ub[,calc_scen_index,drop=F]
     deaths_yll_injuries_ub <- as.data.frame(cbind(as.data.frame(x_deaths_ub[,1:2]),deaths_ub, ylls_ub))
     
+    # update columns names of deaths_yll_injuries to state whether the column shows injury deaths or ylls
     metric <- c("deaths", "yll")
     k <- 1
     for  (i in 1: 2)
@@ -105,8 +150,7 @@ injury_death_to_yll <- function(injuries){
         k<-k+1
       }
     
-    # Combine lower and upper datasets
-    
+    # add lower and upper boundary values to the reference and the yll and deaths injury differences datasets
     deaths_yll_injuries <- left_join(deaths_yll_injuries, deaths_yll_injuries_ub)
     deaths_yll_injuries <- left_join(deaths_yll_injuries, deaths_yll_injuries_lb)
     
diff --git a/R/population_attributable_fraction.R b/R/population_attributable_fraction.R
index 39720cdb..ab7a6a82 100644
--- a/R/population_attributable_fraction.R
+++ b/R/population_attributable_fraction.R
@@ -1,4 +1,4 @@
-#' Calculate population attributable fraction
+#' Calculate population attributable fraction - CURRENTLY NOT BEING USED
 #' 
 #' 
 #' 
diff --git a/multi_city_script.R b/multi_city_script.R
index 312f5345..68c3948c 100644
--- a/multi_city_script.R
+++ b/multi_city_script.R
@@ -85,16 +85,19 @@ comment <- "Added CO2 emission sampling"
 
 # scenario definition
 scenario_name <- "BOGOTA" # name of scenario to be called
-reference_scenario <- 'Baseline' # scenario the other scenarios are compared to
+# scenario the other scenarios are compared to, the reference scenario name should always
+# be the name of the scenario corresponding to the actual baseline burden of disease and 
+# other input data for the city 
+reference_scenario <- 'Baseline' 
 scenario_increase <- 0.05 # increase for each mode in each scenario (used in GLOBAL, BOGOTA, LATAM and AFRICA_INDIA scenarios)
-n_scenarios <- 3 # number of scnenarios (not including the baseline scenario)
+n_scenarios <- 3 # number of scenarios (not including the baseline scenario)
 
 # define which output results to plot, more than 6 cannot be plotted
 # potential outputs (in yll for all scenarios):  c('pa_ap_all_cause', 'pa_ap_IHD', 'pa_total_cancer', 'pa_ap_lung_cancer', 'ap_COPD', 
 #                       'pa_ap_stroke', 'pa_ap_T2D', 'ap_LRI', 'pa_breast_cancer', 'pa_colon_cancer', 'pa_endo_cancer',
 #                       'pa_liver_cancer', 'pa_ap_CVD', 'pa_total_dementia', 'pa_myeloma', 'pa_Parkinson',
 #                       'pa_head_neck_cancer', 'pa_stomach_cancer', 'inj')
-outputs_to_plot <- c('pa_ap_all_cause','inj','pa_ap_IHD', 'pa_total_cancer', 'pa_ap_lung_cancer', 'ap_COPD')
+outputs_to_plot <- c('pa_ap_all_cause', 'ap_COPD', 'pa_total_dementia', 'pa_myeloma', 'ap_LRI','pa_stomach_cancer' )
 
 
 ############################### No need to change the following ##################################