Skip to content

Commit

Permalink
Initial commit of 11 helper functions and all files to build the package
Browse files Browse the repository at this point in the history
  • Loading branch information
NeuroShepherd committed Apr 28, 2020
0 parents commit 8c412a3
Show file tree
Hide file tree
Showing 30 changed files with 779 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
^RankinLabTools\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
Empty file added .Rhistory
Empty file.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.Rproj.user
26 changes: 26 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Package: RankinLabTools
Title: General data cleaning tools for the Rankin lab
Version: 0.0.0.9000
Date: 2020-04-27
Authors@R:
person(given = "Patrick",
family = "Callahan",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-1769-7580"))
Maintainer: Patrick Callahan <[email protected]>
URL: https://orcid.org/0000-0003-1769-7580
Description: Provides general data-cleaning helper functions for the Rankin lab, including tools for combining current and past observations, merging MAC and UDS encounters recorded as separate rows, fuzzy joining by ID and date window, imputing sub-scale scores from partially missing item-level data, and batch-renaming variables with test-name prefixes or suffixes.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0
Imports:
dplyr,
readr,
rlang,
glue,
stringr,
fuzzyjoin,
lubridate
2 changes: 2 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2020
COPYRIGHT HOLDER: Patrick Callahan
21 changes: 21 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

Copyright (c) 2020 Patrick Callahan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
13 changes: 13 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(add_test_name_to_vars)
export(calculate_current_past_difference)
export(combine_current_and_past_observations)
export(combine_mac_uds_encounters)
export(custom_fuzzy_join)
export(impute_missing_values)
export(negative_values_to_na)
export(read_csv_by_string_search)
export(remove_rows_with_mult_NAs)
export(reverse_scoring)
export(select_by_test_name)
47 changes: 47 additions & 0 deletions R/Calculate Current Past Difference.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@


#' Calculate difference between vectors of columns
#'
#' Accepts two equal-length sets of columns and calculates the pairwise
#' difference (that is, past - current = difference), appending one new
#' "difference" column per pair. Intended to be used in conjunction with the
#' combine_current_and_past_observations() function.
#'
#' @param dataframe dataframe object
#' @param variables_past variables where the observation is in the past
#' @param variables_current variables where the observation is current
#' @param variables_output_prefix add a meaningful prefix to the name of the difference columns
#' @param PIDN defaults to PIDN; can be any other ID
#' @param DCDate defaults to DCDate; can be any other date variable
#'
#' @return dataframe with the past/current columns first and the new
#'   "difference" columns appended
#' @export
#'
#' @examples
calculate_current_past_difference <- function(dataframe, variables_past, variables_current, variables_output_prefix, PIDN=PIDN, DCDate = DCDate) {
  # NOTE(review): variables_output_prefix, PIDN, and DCDate are accepted but
  # not currently used in the computation below — confirm intended use.
  variables_current <- rlang::enquo(variables_current)
  variables_past <- rlang::enquo(variables_past)
  PIDN <- rlang::enquo(PIDN)
  DCDate <- rlang::enquo(DCDate)

  # Column counts of each selection; these must match for the pairwise
  # subtraction below to line up.
  length1 <- dataframe %>%
    dplyr::select(!!variables_past) %>%
    length()
  length2 <- dataframe %>%
    dplyr::select(!!variables_current) %>%
    length()
  if (length1 != length2) {
    # Abort rather than print-and-continue: proceeding would make the map2()
    # call below fail (or silently recycle) on mismatched column sets.
    stop("`variables_past` and `variables_current` must select the same number of columns.",
         call. = FALSE)
  }

  # Names for the new columns: reuse the "current" names with the word
  # "current" swapped for "difference".
  difference_names <- dataframe %>%
    dplyr::select(!!variables_current) %>%
    dplyr::rename_all(~gsub("current", "difference", .)) %>%
    colnames()

  dataframe <- dataframe %>%
    dplyr::select(!!variables_past, !!variables_current, dplyr::everything()) %>%
    # Pairwise PAST - CURRENT = DIFFERENCE across the two column sets
    dplyr::bind_cols(purrr::map2(.x = .[, 1:(length1)], .y = .[, (1 + length1):(length1 + length2)], .f = ~.x - .y)) %>%
    # Rename the appended columns to their "difference" names
    dplyr::rename_at(dplyr::vars(!colnames(dataframe)), ~paste(difference_names)) %>%
    dplyr::select(dplyr::everything(), !!variables_past, !!variables_current, dplyr::ends_with("difference"))

  return(dataframe)
}


35 changes: 35 additions & 0 deletions R/Combine Current and Past Observations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@


#' Combine Current and Past Observations
#'
#' Splits a dataframe into "Current" and "Past" records, keeps only the
#' earliest Past record per person, and joins the two back together so each
#' row pairs a current observation with its earliest past observation
#' (columns suffixed "_current" / "_past").
#'
#' @param CurrentPast variable indicating whether the observation is a "Current" or "Past" record; default column name expected is CurrentPast, but can be replaced with any column with "current" and "past"/"before" characters
#' @param dataframe dataframe object
#' @param DCDate date column; defaults to DCDate
#' @param PIDN identifier column; defaults to PIDN
#'
#' @return dataframe of current records left-joined to each person's earliest
#'   past record
#' @export
#'
#' @examples
combine_current_and_past_observations <- function(dataframe, CurrentPast = CurrentPast, DCDate=DCDate, PIDN=PIDN) {
  CurrentPast <- rlang::enquo(CurrentPast)
  DCDate <- rlang::enquo(DCDate)
  PIDN <- rlang::enquo(PIDN)

  current_dataframe <- dataframe %>%
    dplyr::filter(!!CurrentPast %in% c("CURRENT", "Current"))

  past_dataframe <- dataframe %>%
    dplyr::filter(!!CurrentPast %in% c("PAST", "Past", "Before", "BEFORE")) %>%
    # Bug fix: unquote the captured quosures so user-supplied column names
    # work; the original used bare DCDate/PIDN here, which only worked when
    # the columns were literally named DCDate and PIDN.
    dplyr::mutate(!!rlang::quo_name(DCDate) := lubridate::as_date(!!DCDate)) %>%
    # These three lines select only the earliest instance of completing a
    # PAST test within each person
    dplyr::group_by(!!PIDN) %>%
    dplyr::filter(!!DCDate == min(!!DCDate)) %>%
    dplyr::ungroup()

  # Join on whatever ID column the caller supplied (was hardcoded to "PIDN")
  id_name <- rlang::quo_name(PIDN)
  final_dataframe <- current_dataframe %>%
    dplyr::left_join(past_dataframe, by = stats::setNames(id_name, id_name), suffix = c("_current", "_past"))

  return(final_dataframe)
}


39 changes: 39 additions & 0 deletions R/Combine MAC and UDS encounters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

#' Combine MAC and UDS Encounters
#' This function allows you to combine MAC and UDS encounters for a particular test where the
#' encounters have been separated into different rows, but really occurred around the same time period.
#' You can determine what range of days qualifies as a "single visit" using the day_range argument.
#'
#' @param dataframe dataframe object
#' @param questions_to_combine select the variables that should be combined. This will be at least the item-level variables, and potentially your summary scores
#' @param day_range range of days over which MAC and UDS visits can be considered the "same" visit; default of 30 days
#' @param DCDate date identifier; defaults to DCDate variable
#' @param PIDN person identifier; defaults to PIDN
#'
#' @return dataframe collapsed to one row per person per visit window, with a
#'   newDate column identifying the window
#' @export
#'
#' @examples
combine_mac_uds_encounters <- function(dataframe, questions_to_combine, day_range = 30, DCDate = DCDate, PIDN = PIDN){
  # Capture bare column names so callers can pass unquoted variables
  questions_to_combine <- enquo(questions_to_combine)
  DCDate <- enquo(DCDate)
  PIDN <- enquo(PIDN)

  dataframe %<>%
    # -6 is treated as a missing-data code in the selected question columns
    dplyr::mutate_at(dplyr::vars(!!questions_to_combine), ~dplyr::na_if(.,-6)) %>%
    dplyr::mutate(DCDate = lubridate::as_date(!!DCDate)) %>%
    dplyr::arrange(!!PIDN,DCDate) %>%
    dplyr::group_by(!!PIDN) %>%
    # TRUE when the next visit for the same person falls within day_range
    # days; NA on each person's last row (lead() is NA there)
    dplyr::mutate(closeness_lag = dplyr::near(DCDate,dplyr::lead(DCDate),tol=day_range)) %>%
    dplyr::ungroup() %>%
    # Rows flagged as close adopt the following row's date so both collapse
    # to the same newDate; otherwise (or when closeness_lag is NA) the row
    # keeps its own date.
    # NOTE(review): this second lead() runs ungrouped; it appears safe only
    # because rows are arranged by PIDN/date and closeness_lag is NA at group
    # boundaries — confirm a cross-person date can never be adopted.
    dplyr::mutate(newDate = dplyr::case_when(closeness_lag == TRUE ~ dplyr::lead(DCDate),
                                             closeness_lag == FALSE ~ DCDate,
                                             is.na(closeness_lag) ~ DCDate)) %>%
    # Collapse each person/visit-window to a single row, taking the first
    # non-missing value of every column
    dplyr::group_by(!!PIDN,newDate) %>%
    dplyr::summarize_all(list(~dplyr::first(stats::na.omit(.)))) %>%
    dplyr::ungroup()

  return(dataframe)

}

43 changes: 43 additions & 0 deletions R/Custom Fuzzy Join.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

#' Custom Fuzzy Join by ID and Date Window
#'
#' Joins two dataframes on an exact ID match plus a fuzzy date match, where
#' visits whose dates fall within +/- `interval` days of each other are
#' treated as the same visit.
#'
#' @param df1 first dataframe argument
#' @param DCDate1 first date argument
#' @param PIDN1 first ID argument; defaults to PIDN
#' @param df2 second dataframe argument
#' @param DCDate2 second date argument
#' @param PIDN2 second ID argument; defaults to PIDN
#' @param mode used to specify if the join should be "left", "right", "full", "semi", or "anti"; defaults to "left"
#' @param interval absolute value of days over which to match visits; default of +/-90 days
#'
#' @return the fuzzily joined dataframe
#' @export
#'
#' @examples
custom_fuzzy_join <- function(df1, DCDate1, PIDN1 = PIDN,
                              df2, DCDate2, PIDN2 = PIDN,
                              mode = "left", interval = 90) {

  PIDN1 <- rlang::enquo(PIDN1)
  PIDN2 <- rlang::enquo(PIDN2)
  DCDate1 <- rlang::enquo(DCDate1)
  DCDate2 <- rlang::enquo(DCDate2)

  # Named vectors mapping df1's key columns onto df2's for the join
  joining_pidn <- rlang::set_names(rlang::quo_name(PIDN2), rlang::quo_name(PIDN1))
  joining_date <- rlang::set_names(rlang::quo_name(DCDate2), rlang::quo_name(DCDate1))

  # Bug fix: the original converted the date columns into throwaway objects
  # and then joined the *unconverted* frames. Convert the date columns inside
  # df1/df2 so the match_fun arithmetic operates on Date objects.
  df1 <- df1 %>%
    dplyr::mutate_at(dplyr::vars(!!DCDate1), ~lubridate::as_date(.))
  df2 <- df2 %>%
    dplyr::mutate_at(dplyr::vars(!!DCDate2), ~lubridate::as_date(.))

  df1 %>%
    fuzzyjoin::fuzzy_join(df2,
                          by = c(joining_pidn, joining_date),
                          # exact ID equality; dates match within `interval` days
                          match_fun = list(`==`, function(x, y) abs(x - y) < interval),
                          mode = mode)

}

35 changes: 35 additions & 0 deletions R/Final Variable Naming.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@


#' Add Test Name to Variables
#'
#' Adds a name to all variables in the dataframe that do not already contain
#' it; can specify if the name appears at the beginning or end of the
#' variables.
#'
#' @param dataframe dataframe object
#' @param test_name name to be added to all variables; must be quoted (e.g. "name")
#' @param location where to append the test_name; 0 = end of variables, 1 = beginning of variables; defaults to 0
#'
#' @return dataframe with the test name joined (by "_") to every column name
#'   that did not already contain it
#' @export
#'
#' @examples
add_test_name_to_vars <- function(dataframe, test_name, location = 0){

  # Create a vector of columns that *do not* already contain the test name.
  # fixed() matches the name literally, so regex metacharacters in
  # test_name are not interpreted as a pattern (the original passed it as a
  # regex).
  rename_me <- dataframe %>%
    colnames() %>%
    stringr::str_subset(stringr::fixed(test_name), negate = TRUE)

  # Use the rename_me vector to rename columns only where needed
  if (location == 1) {
    # Prepend: "score" -> "<test_name>_score"
    dataframe <- dataframe %>%
      dplyr::rename_at(dplyr::vars(dplyr::all_of(rename_me)), ~paste(test_name, ., sep = "_"))
  }

  if (location == 0) {
    # Append: "score" -> "score_<test_name>"
    dataframe <- dataframe %>%
      dplyr::rename_at(dplyr::vars(dplyr::all_of(rename_me)), ~paste(., test_name, sep = "_"))
  }

  return(dataframe)

}

57 changes: 57 additions & 0 deletions R/Impute Missing Values.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@


#' Impute for Missing Values
#'
#' Calculates an imputed sub-scale sum where item-level data is missing:
#' the row average of the existing values is multiplied by the number of
#' items in the scale, provided the proportion of missing items does not
#' exceed missing_threshold. Item-level values are never replaced.
#'
#' @param dataframe dataframe object
#' @param vars_to_impute vector of variables from which a score will be calculated/imputed
#' @param scale_name output name of the scale being calculated
#' @param missing_threshold maximum proportion of the scale questions allowed to be missing (i.e. the scale will not be calculated if the number of missing questions is greater than this value)
#' @param toggle_warning logical; set to FALSE to silence the warning raised
#'   when missing_threshold differs from the LAVA default of 0.2
#'
#' @return dataframe with four new columns: "<scale>_avg",
#'   "<scale>_quantity_missing", "<scale>_prop_missing_Q", and
#'   "<scale>_imputed_sum"
#' @export
#'
#' @examples
impute_missing_values <- function(dataframe, vars_to_impute, scale_name, missing_threshold = 0.2, toggle_warning = TRUE){
  if (missing_threshold != 0.2 && toggle_warning) {
    # warning() (not print) so the message goes through R's condition system
    warning("Default threshold is 0.2 (20%) in LAVA. Do you have a compelling reason to change this? (You can turn this warning off; toggle_warning = FALSE)",
            call. = FALSE)
  }

  # Build an output column name from the scale name: the first ":" is
  # replaced with "_" (LAVA scale names can contain colons) and the given
  # suffix appended.
  make_output_name <- function(suffix) {
    dplyr::quo_name(scale_name) %>%
      stringr::str_replace(":", "_") %>%
      paste0(suffix)
  }

  vars_to_impute <- rlang::enquo(vars_to_impute)
  length_imputed_columns <- dataframe %>%
    dplyr::select(!!vars_to_impute) %>%
    length()
  quantity_missing <- make_output_name("_quantity_missing")
  prop_missing_Q <- make_output_name("_prop_missing_Q")
  row_Avg <- make_output_name("_avg")
  row_Sum <- make_output_name("_imputed_sum")

  dataframe <- dataframe %>%
    # Negative values are missing-data codes: keep only values >= 0, the
    # rest become NA (the case_when default)
    dplyr::mutate_at(dplyr::vars(!!vars_to_impute), ~dplyr::case_when(. >= 0 ~ .)) %>%
    dplyr::mutate(!!row_Avg := dplyr::select(., !!vars_to_impute) %>% rowMeans(., na.rm = TRUE),
                  !!quantity_missing := dplyr::select(., !!vars_to_impute) %>% is.na() %>% rowSums(),
                  !!prop_missing_Q := !!dplyr::sym(quantity_missing) / length_imputed_columns,
                  # Impute the sum only within threshold: row average times
                  # the number of items in the sub-scale, rounded; otherwise NA
                  !!row_Sum := dplyr::case_when(!!dplyr::sym(prop_missing_Q) <= missing_threshold ~ round(!!dplyr::sym(row_Avg) * length_imputed_columns))
                  )

  return(dataframe)

  # When imputing values, item-level values should not be replaced!!!
  # To calculate new sub-scale scores: take the average of all existing values in the sub-scale, and multiply this
  # average by the total number of items in the sub-scale (that is, in R terms, the length() of the columns used
  # in the sub-scale)
}



29 changes: 29 additions & 0 deletions R/Read CSV by String.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@


#' Read CSV By String Search
#' This convenience function searches either the default directory or a specified directory for a (1) .csv file that at least partially matches the string_search variable. The file is then read-in to R.
#'
#' @param string_search string to search for in the directory
#' @param file_parent can specify the parent directory of the file if needed
#'
#' @return a tibble read from the single matching file
#' @export
#'
#' @examples
read_csv_by_string_search <- function(string_search, file_parent) {
  # Search the project root unless a sub-directory was supplied
  if (rlang::is_missing(file_parent)) {
    destination <- here::here()
  } else {
    destination <- here::here(file_parent)
  }

  # All paths in the directory whose name partially matches string_search
  pathway <- fs::dir_info(destination) %>%
    dplyr::filter_at(dplyr::vars(path), dplyr::all_vars(stringr::str_detect(., pattern = glue::glue("{string_search}")))) %>%
    dplyr::pull(path)

  # Abort on zero or multiple matches. The original only print()ed and then
  # fell through to read_csv() on an invalid path; the multiple-match message
  # also passed length(pathway) as a separate glue argument, so it was
  # concatenated without interpolation.
  if (length(pathway) < 1) {
    stop(glue::glue("No object with partial name = '{string_search}' found in this directory. Specify a different folder within this project using the file_parent argument."),
         call. = FALSE)
  }
  if (length(pathway) > 1) {
    stop(glue::glue("{length(pathway)} objects with partial name = '{string_search}' found in this directory. Check folder contents for duplicate files, and ensure your string_search argument is specific."),
         call. = FALSE)
  }

  return(readr::read_csv(paste0(pathway)))

}
30 changes: 30 additions & 0 deletions R/Remove Rows with Multiple NAs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@



#' Remove Rows with Multiple NAs
#'
#' Drops a row only when ALL of the specified columns are NA; any row with at
#' least one non-missing value among `columns` is kept.
#'
#' @param dataframe dataframe object
#' @param columns variables to check for missingness
#'
#' @return dataframe with the all-NA rows removed
#' @export
#'
#' @examples
remove_rows_with_mult_NAs <- function(dataframe, columns){
  selected_cols <- rlang::enquo(columns)

  # Keep a row if any of the selected columns holds a non-NA value
  kept_rows <- dplyr::filter_at(dataframe,
                                dplyr::vars(!!selected_cols),
                                dplyr::any_vars(!is.na(.)))

  return(kept_rows)

  # Possible future work: rename this function to drop_if_all_NA(), and add a
  # companion drop_if_any_NA() that drops a row when ANY listed variable is
  # missing — likely just a matter of swapping any_vars() for all_vars().
}


Loading

0 comments on commit 8c412a3

Please sign in to comment.