Skip to content

Commit

Permalink
Initial commit of 11 helper functions and all files to build the package
Browse files Browse the repository at this point in the history
  • Loading branch information
NeuroShepherd committed Apr 28, 2020
0 parents commit 8c412a3
Show file tree
Hide file tree
Showing 30 changed files with 779 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
^RankinLabTools\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
Empty file added .Rhistory
Empty file.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.Rproj.user
26 changes: 26 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Package: RankinLabTools
Title: General data cleaning tools for the Rankin lab
Version: 0.0.0.9000
Date: 2020-04-27
Authors@R:
person(given = "Patrick",
family = "Callahan",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-1769-7580"))
Maintainer: Patrick Callahan <[email protected]>
URL: https://orcid.org/0000-0003-1769-7580
Description: Provides general data-cleaning helper functions for the Rankin lab, including tools for combining current and past observations, merging MAC and UDS encounters recorded as separate rows, fuzzy joining by ID and date window, imputing sub-scale scores from partially missing item-level data, and batch-renaming variables with test-name prefixes or suffixes.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0
Imports:
dplyr,
readr,
rlang,
glue,
stringr,
fuzzyjoin,
lubridate
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2020
COPYRIGHT HOLDER: Patrick Callahan
21 changes: 21 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

Copyright (c) 2020 Patrick Callahan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
13 changes: 13 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(add_test_name_to_vars)
export(calculate_current_past_difference)
export(combine_current_and_past_observations)
export(combine_mac_uds_encounters)
export(custom_fuzzy_join)
export(impute_missing_values)
export(negative_values_to_na)
export(read_csv_by_string_search)
export(remove_rows_with_mult_NAs)
export(reverse_scoring)
export(select_by_test_name)
47 changes: 47 additions & 0 deletions R/Calculate Current Past Difference.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@


#' Calculate difference between vectors of columns
#'
#' Accepts two equal-length sets of columns and calculates the pairwise
#' difference (that is, past - current = difference), appending one new
#' "difference" column per pair. Intended to be used in conjunction with the
#' combine_current_and_past_observations() function.
#'
#' @param dataframe dataframe object
#' @param variables_past variables where the observation is in the past
#' @param variables_current variables where the observation is current
#' @param variables_output_prefix add a meaningful prefix to the name of the difference columns
#' @param PIDN defaults to PIDN; can be any other ID
#' @param DCDate defaults to DCDate; can be any other date variable
#'
#' @return dataframe with the past/current columns first and the new
#'   "difference" columns appended
#' @export
#'
#' @examples
calculate_current_past_difference <- function(dataframe, variables_past, variables_current, variables_output_prefix, PIDN=PIDN, DCDate = DCDate) {
  # NOTE(review): variables_output_prefix, PIDN, and DCDate are accepted but
  # not currently used in the computation below — confirm intended use.
  variables_current <- rlang::enquo(variables_current)
  variables_past <- rlang::enquo(variables_past)
  PIDN <- rlang::enquo(PIDN)
  DCDate <- rlang::enquo(DCDate)

  # Column counts of each selection; these must match for the pairwise
  # subtraction below to line up.
  length1 <- dataframe %>%
    dplyr::select(!!variables_past) %>%
    length()
  length2 <- dataframe %>%
    dplyr::select(!!variables_current) %>%
    length()
  if (length1 != length2) {
    # Abort rather than print-and-continue: proceeding would make the map2()
    # call below fail (or silently recycle) on mismatched column sets.
    stop("`variables_past` and `variables_current` must select the same number of columns.",
         call. = FALSE)
  }

  # Names for the new columns: reuse the "current" names with the word
  # "current" swapped for "difference".
  difference_names <- dataframe %>%
    dplyr::select(!!variables_current) %>%
    dplyr::rename_all(~gsub("current", "difference", .)) %>%
    colnames()

  dataframe <- dataframe %>%
    dplyr::select(!!variables_past, !!variables_current, dplyr::everything()) %>%
    # Pairwise PAST - CURRENT = DIFFERENCE across the two column sets
    dplyr::bind_cols(purrr::map2(.x = .[, 1:(length1)], .y = .[, (1 + length1):(length1 + length2)], .f = ~.x - .y)) %>%
    # Rename the appended columns to their "difference" names
    dplyr::rename_at(dplyr::vars(!colnames(dataframe)), ~paste(difference_names)) %>%
    dplyr::select(dplyr::everything(), !!variables_past, !!variables_current, dplyr::ends_with("difference"))

  return(dataframe)
}


35 changes: 35 additions & 0 deletions R/Combine Current and Past Observations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@


#' Combine Current and Past Observations
#'
#' Splits a dataframe into "Current" and "Past" records, keeps only the
#' earliest Past record per person, and joins the two back together so each
#' row pairs a current observation with its earliest past observation
#' (columns suffixed "_current" / "_past").
#'
#' @param CurrentPast variable indicating whether the observation is a "Current" or "Past" record; default column name expected is CurrentPast, but can be replaced with any column with "current" and "past"/"before" characters
#' @param dataframe dataframe object
#' @param DCDate date column; defaults to DCDate
#' @param PIDN identifier column; defaults to PIDN
#'
#' @return dataframe of current records left-joined to each person's earliest
#'   past record
#' @export
#'
#' @examples
combine_current_and_past_observations <- function(dataframe, CurrentPast = CurrentPast, DCDate=DCDate, PIDN=PIDN) {
  CurrentPast <- rlang::enquo(CurrentPast)
  DCDate <- rlang::enquo(DCDate)
  PIDN <- rlang::enquo(PIDN)

  current_dataframe <- dataframe %>%
    dplyr::filter(!!CurrentPast %in% c("CURRENT", "Current"))

  past_dataframe <- dataframe %>%
    dplyr::filter(!!CurrentPast %in% c("PAST", "Past", "Before", "BEFORE")) %>%
    # Bug fix: unquote the captured quosures so user-supplied column names
    # work; the original used bare DCDate/PIDN here, which only worked when
    # the columns were literally named DCDate and PIDN.
    dplyr::mutate(!!rlang::quo_name(DCDate) := lubridate::as_date(!!DCDate)) %>%
    # These three lines select only the earliest instance of completing a
    # PAST test within each person
    dplyr::group_by(!!PIDN) %>%
    dplyr::filter(!!DCDate == min(!!DCDate)) %>%
    dplyr::ungroup()

  # Join on whatever ID column the caller supplied (was hardcoded to "PIDN")
  id_name <- rlang::quo_name(PIDN)
  final_dataframe <- current_dataframe %>%
    dplyr::left_join(past_dataframe, by = stats::setNames(id_name, id_name), suffix = c("_current", "_past"))

  return(final_dataframe)
}


39 changes: 39 additions & 0 deletions R/Combine MAC and UDS encounters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

#' Combine MAC and UDS Encounters
#' This function allows you to combine MAC and UDS encounters for a particular test where the
#' encounters have been separated into different rows, but really occurred around the same time period.
#' You can determine what range of days qualifies as a "single visit" using the day_range argument.
#'
#' @param dataframe dataframe object
#' @param questions_to_combine select the variables that should be combined. This will be at least the item-level variables, and potentially your summary scores
#' @param day_range range of days over which MAC and UDS visits can be considered the "same" visit; default of 30 days
#' @param DCDate date identifier; defaults to DCDate variable
#' @param PIDN person identifier; defaults to PIDN
#'
#' @return dataframe collapsed to one row per person per visit window, with a
#'   newDate column identifying the window
#' @export
#'
#' @examples
combine_mac_uds_encounters <- function(dataframe, questions_to_combine, day_range = 30, DCDate = DCDate, PIDN = PIDN){
  # Capture bare column names so callers can pass unquoted variables
  questions_to_combine <- enquo(questions_to_combine)
  DCDate <- enquo(DCDate)
  PIDN <- enquo(PIDN)

  dataframe %<>%
    # -6 is treated as a missing-data code in the selected question columns
    dplyr::mutate_at(dplyr::vars(!!questions_to_combine), ~dplyr::na_if(.,-6)) %>%
    dplyr::mutate(DCDate = lubridate::as_date(!!DCDate)) %>%
    dplyr::arrange(!!PIDN,DCDate) %>%
    dplyr::group_by(!!PIDN) %>%
    # TRUE when the next visit for the same person falls within day_range
    # days; NA on each person's last row (lead() is NA there)
    dplyr::mutate(closeness_lag = dplyr::near(DCDate,dplyr::lead(DCDate),tol=day_range)) %>%
    dplyr::ungroup() %>%
    # Rows flagged as close adopt the following row's date so both collapse
    # to the same newDate; otherwise (or when closeness_lag is NA) the row
    # keeps its own date.
    # NOTE(review): this second lead() runs ungrouped; it appears safe only
    # because rows are arranged by PIDN/date and closeness_lag is NA at group
    # boundaries — confirm a cross-person date can never be adopted.
    dplyr::mutate(newDate = dplyr::case_when(closeness_lag == TRUE ~ dplyr::lead(DCDate),
                                             closeness_lag == FALSE ~ DCDate,
                                             is.na(closeness_lag) ~ DCDate)) %>%
    # Collapse each person/visit-window to a single row, taking the first
    # non-missing value of every column
    dplyr::group_by(!!PIDN,newDate) %>%
    dplyr::summarize_all(list(~dplyr::first(stats::na.omit(.)))) %>%
    dplyr::ungroup()

  return(dataframe)

}

43 changes: 43 additions & 0 deletions R/Custom Fuzzy Join.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

#' Custom Fuzzy Join by ID and Date Window
#'
#' Joins two dataframes on an exact ID match plus a fuzzy date match, where
#' visits whose dates fall within +/- `interval` days of each other are
#' treated as the same visit.
#'
#' @param df1 first dataframe argument
#' @param DCDate1 first date argument
#' @param PIDN1 first ID argument; defaults to PIDN
#' @param df2 second dataframe argument
#' @param DCDate2 second date argument
#' @param PIDN2 second ID argument; defaults to PIDN
#' @param mode used to specify if the join should be "left", "right", "full", "semi", or "anti"; defaults to "left"
#' @param interval absolute value of days over which to match visits; default of +/-90 days
#'
#' @return the fuzzily joined dataframe
#' @export
#'
#' @examples
custom_fuzzy_join <- function(df1, DCDate1, PIDN1 = PIDN,
                              df2, DCDate2, PIDN2 = PIDN,
                              mode = "left", interval = 90) {

  PIDN1 <- rlang::enquo(PIDN1)
  PIDN2 <- rlang::enquo(PIDN2)
  DCDate1 <- rlang::enquo(DCDate1)
  DCDate2 <- rlang::enquo(DCDate2)

  # Named vectors mapping df1's key columns onto df2's for the join
  joining_pidn <- rlang::set_names(rlang::quo_name(PIDN2), rlang::quo_name(PIDN1))
  joining_date <- rlang::set_names(rlang::quo_name(DCDate2), rlang::quo_name(DCDate1))

  # Bug fix: the original converted the date columns into throwaway objects
  # and then joined the *unconverted* frames. Convert the date columns inside
  # df1/df2 so the match_fun arithmetic operates on Date objects.
  df1 <- df1 %>%
    dplyr::mutate_at(dplyr::vars(!!DCDate1), ~lubridate::as_date(.))
  df2 <- df2 %>%
    dplyr::mutate_at(dplyr::vars(!!DCDate2), ~lubridate::as_date(.))

  df1 %>%
    fuzzyjoin::fuzzy_join(df2,
                          by = c(joining_pidn, joining_date),
                          # exact ID equality; dates match within `interval` days
                          match_fun = list(`==`, function(x, y) abs(x - y) < interval),
                          mode = mode)

}

35 changes: 35 additions & 0 deletions R/Final Variable Naming.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@


#' Add Test Name to Variables
#'
#' Adds a name to all variables in the dataframe that do not already contain
#' it; can specify if the name appears at the beginning or end of the
#' variables.
#'
#' @param dataframe dataframe object
#' @param test_name name to be added to all variables; must be quoted (e.g. "name")
#' @param location where to append the test_name; 0 = end of variables, 1 = beginning of variables; defaults to 0
#'
#' @return dataframe with the test name joined (by "_") to every column name
#'   that did not already contain it
#' @export
#'
#' @examples
add_test_name_to_vars <- function(dataframe, test_name, location = 0){

  # Create a vector of columns that *do not* already contain the test name.
  # fixed() matches the name literally, so regex metacharacters in
  # test_name are not interpreted as a pattern (the original passed it as a
  # regex).
  rename_me <- dataframe %>%
    colnames() %>%
    stringr::str_subset(stringr::fixed(test_name), negate = TRUE)

  # Use the rename_me vector to rename columns only where needed
  if (location == 1) {
    # Prepend: "score" -> "<test_name>_score"
    dataframe <- dataframe %>%
      dplyr::rename_at(dplyr::vars(dplyr::all_of(rename_me)), ~paste(test_name, ., sep = "_"))
  }

  if (location == 0) {
    # Append: "score" -> "score_<test_name>"
    dataframe <- dataframe %>%
      dplyr::rename_at(dplyr::vars(dplyr::all_of(rename_me)), ~paste(., test_name, sep = "_"))
  }

  return(dataframe)

}

57 changes: 57 additions & 0 deletions R/Impute Missing Values.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@


#' Impute for Missing Values
#'
#' Calculates an imputed sub-scale sum where item-level data is missing:
#' the row average of the existing values is multiplied by the number of
#' items in the scale, provided the proportion of missing items does not
#' exceed missing_threshold. Item-level values are never replaced.
#'
#' @param dataframe dataframe object
#' @param vars_to_impute vector of variables from which a score will be calculated/imputed
#' @param scale_name output name of the scale being calculated
#' @param missing_threshold maximum proportion of the scale questions allowed to be missing (i.e. the scale will not be calculated if the number of missing questions is greater than this value)
#' @param toggle_warning logical; set to FALSE to silence the warning raised
#'   when missing_threshold differs from the LAVA default of 0.2
#'
#' @return dataframe with four new columns: "<scale>_avg",
#'   "<scale>_quantity_missing", "<scale>_prop_missing_Q", and
#'   "<scale>_imputed_sum"
#' @export
#'
#' @examples
impute_missing_values <- function(dataframe, vars_to_impute, scale_name, missing_threshold = 0.2, toggle_warning = TRUE){
  if (missing_threshold != 0.2 && toggle_warning) {
    # warning() (not print) so the message goes through R's condition system
    warning("Default threshold is 0.2 (20%) in LAVA. Do you have a compelling reason to change this? (You can turn this warning off; toggle_warning = FALSE)",
            call. = FALSE)
  }

  # Build an output column name from the scale name: the first ":" is
  # replaced with "_" (LAVA scale names can contain colons) and the given
  # suffix appended.
  make_output_name <- function(suffix) {
    dplyr::quo_name(scale_name) %>%
      stringr::str_replace(":", "_") %>%
      paste0(suffix)
  }

  vars_to_impute <- rlang::enquo(vars_to_impute)
  length_imputed_columns <- dataframe %>%
    dplyr::select(!!vars_to_impute) %>%
    length()
  quantity_missing <- make_output_name("_quantity_missing")
  prop_missing_Q <- make_output_name("_prop_missing_Q")
  row_Avg <- make_output_name("_avg")
  row_Sum <- make_output_name("_imputed_sum")

  dataframe <- dataframe %>%
    # Negative values are missing-data codes: keep only values >= 0, the
    # rest become NA (the case_when default)
    dplyr::mutate_at(dplyr::vars(!!vars_to_impute), ~dplyr::case_when(. >= 0 ~ .)) %>%
    dplyr::mutate(!!row_Avg := dplyr::select(., !!vars_to_impute) %>% rowMeans(., na.rm = TRUE),
                  !!quantity_missing := dplyr::select(., !!vars_to_impute) %>% is.na() %>% rowSums(),
                  !!prop_missing_Q := !!dplyr::sym(quantity_missing) / length_imputed_columns,
                  # Impute the sum only within threshold: row average times
                  # the number of items in the sub-scale, rounded; otherwise NA
                  !!row_Sum := dplyr::case_when(!!dplyr::sym(prop_missing_Q) <= missing_threshold ~ round(!!dplyr::sym(row_Avg) * length_imputed_columns))
                  )

  return(dataframe)

  # When imputing values, item-level values should not be replaced!!!
  # To calculate new sub-scale scores: take the average of all existing values in the sub-scale, and multiply this
  # average by the total number of items in the sub-scale (that is, in R terms, the length() of the columns used
  # in the sub-scale)
}



29 changes: 29 additions & 0 deletions R/Read CSV by String.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@


#' Read CSV By String Search
#' This convenience function searches either the default directory or a specified directory for a (1) .csv file that at least partially matches the string_search variable. The file is then read-in to R.
#'
#' @param string_search string to search for in the directory
#' @param file_parent can specify the parent directory of the file if needed
#'
#' @return a tibble read from the single matching file
#' @export
#'
#' @examples
read_csv_by_string_search <- function(string_search, file_parent) {
  # Search the project root unless a sub-directory was supplied
  if (rlang::is_missing(file_parent)) {
    destination <- here::here()
  } else {
    destination <- here::here(file_parent)
  }

  # All paths in the directory whose name partially matches string_search
  pathway <- fs::dir_info(destination) %>%
    dplyr::filter_at(dplyr::vars(path), dplyr::all_vars(stringr::str_detect(., pattern = glue::glue("{string_search}")))) %>%
    dplyr::pull(path)

  # Abort on zero or multiple matches. The original only print()ed and then
  # fell through to read_csv() on an invalid path; the multiple-match message
  # also passed length(pathway) as a separate glue argument, so it was
  # concatenated without interpolation.
  if (length(pathway) < 1) {
    stop(glue::glue("No object with partial name = '{string_search}' found in this directory. Specify a different folder within this project using the file_parent argument."),
         call. = FALSE)
  }
  if (length(pathway) > 1) {
    stop(glue::glue("{length(pathway)} objects with partial name = '{string_search}' found in this directory. Check folder contents for duplicate files, and ensure your string_search argument is specific."),
         call. = FALSE)
  }

  return(readr::read_csv(paste0(pathway)))

}
30 changes: 30 additions & 0 deletions R/Remove Rows with Multiple NAs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@



#' Remove Rows with Multiple NAs
#'
#' Drops a row only when ALL of the specified columns are NA; any row with at
#' least one non-missing value among `columns` is kept.
#'
#' @param dataframe dataframe object
#' @param columns variables to check for missingness
#'
#' @return dataframe with the all-NA rows removed
#' @export
#'
#' @examples
remove_rows_with_mult_NAs <- function(dataframe, columns){
  selected_cols <- rlang::enquo(columns)

  # Keep a row if any of the selected columns holds a non-NA value
  kept_rows <- dplyr::filter_at(dataframe,
                                dplyr::vars(!!selected_cols),
                                dplyr::any_vars(!is.na(.)))

  return(kept_rows)

  # Possible future work: rename this function to drop_if_all_NA(), and add a
  # companion drop_if_any_NA() that drops a row when ANY listed variable is
  # missing — likely just a matter of swapping any_vars() for all_vars().
}


Loading

0 comments on commit 8c412a3

Please sign in to comment.