updates for release of version 1.0.3
nicholasjclark committed May 16, 2018
1 parent 0dff981 commit d4825a6
Showing 29 changed files with 168 additions and 332 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: MRFcov
Type: Package
Title: Markov Random Fields with additional covariates
Version: 1.0.2
Date/Publication: 2018-01-08
Version: 1.0.3
Date/Publication: 2018-05-17
URL: https://github.com/nicholasjclark/MRFcov
Authors@R: c(
person("Nicholas J", "Clark", email = "[email protected]", role = c("aut", "cre")),
63 changes: 19 additions & 44 deletions R/MRFcov.R
@@ -14,12 +9,9 @@
#'left-most variables are variables that are to be represented by nodes in the graph
#'@param lambda1 Positive numeric. If \code{fixed_lambda = TRUE}, this value is used as the
#'l1−regularization parameter for all node-specific regressions. If \code{fixed_lambda = FALSE},
#'this value is ignored and penalization parameters are optimized automatically.
#'this value is ignored and penalization parameters are optimized automatically, using
#'coordinate descent to minimize cross-validated error.
#'See \code{\link[glmnet]{cv.glmnet}} for further details of \code{lambda1} optimisation
#'@param lambda2 Positive numeric. Value for l2−regularization, where larger values lead
#'to stronger shrinking of coefficient magnitudes. Default is \code{0}, but larger values
#'may be necessary for large or particularly sparse datasets. Takes values between
#'\code{0} and \code{5}.
#'@param symmetrise The method to use for symmetrising corresponding parameter estimates
#'(which are taken from separate regressions). Options are \code{min} (take the coefficient with the
#'smallest absolute value), \code{max} (take the coefficient with the largest absolute value)
@@ -35,7 +32,8 @@
#'columns in \code{data}, corresponding to no additional covariates
#'@param n_cores Positive integer. The number of cores to spread the job across using
#'\code{\link[parallel]{makePSOCKcluster}}. Default is 1 (no parallelisation)
#'@param n_covariates Positive integer. The number of covariates in \code{data}, before cross-multiplication
#'@param n_covariates Positive integer. The number of covariates in \code{data}, before cross-multiplication.
#'Default is \code{ncol(data) - n_nodes}
#'@param family The response type. Responses can be quantitative continuous (\code{family = "gaussian"}),
#'non-negative counts (\code{family = "poisson"}) or binomial 1s and 0s (\code{family = "binomial"})
#'@param fixed_lambda Logical. If \code{FALSE}, node-specific regressions are optimized using the
@@ -88,7 +86,7 @@
#'effects of the covariate on interactions between \code{j} and all other species
#'(\code{/j}). Note that coefficients must be estimated on the same scale in order
#'for the resulting models to be unified in a Markov Random Field. Counts for \code{poisson}
#'variables will be standardised using the square root mean transformation
#'variables will therefore be standardised using the square root mean transformation
#'\code{x = x / sqrt(mean(x ^ 2))} so that they are on similar ranges. These transformed counts
#'will then be used in a \code{(family = "gaussian")} model and their respective scaling factors
#'will be returned so that coefficients can be unscaled before interpretation (this unscaling is
@@ -100,8 +98,8 @@
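A minimal sketch of the square-root-mean standardisation described just above, assuming raw_counts holds one poisson node's raw counts (the object names are illustrative, not taken from the package):

raw_counts <- c(0, 2, 5, 1, 0, 3, 8)        # hypothetical counts for one poisson node
scale_factor <- sqrt(mean(raw_counts ^ 2))  # returned by MRFcov so coefficients can later be unscaled
scaled_counts <- raw_counts / scale_factor  # transformed counts used in the family = "gaussian" model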
#'\cr
#'Note that since the number of parameters quickly increases with increasing
#'numbers of species and covariates, LASSO penalization is used to regularize
#'regressions based on values of regularization parameters \code{lambda1} and
#'\code{lambda2}. This can be done either by minimising the cross-validated
#'regressions based on values of the regularization parameter \code{lambda1}.
#'This can be done either by minimising the cross-validated
#'mean error for each node separately (using \code{\link[glmnet]{cv.glmnet}}) or by
#'running all regressions at a single \code{lambda1} value. The latter approach may be
#'useful for optimising all nodes as part of a joint graphical model, while the former
@@ -116,7 +114,7 @@
#'
#'@export
#'
MRFcov <- function(data, lambda1, lambda2, symmetrise,
MRFcov <- function(data, lambda1, symmetrise,
prep_covariates, n_nodes, n_cores, n_covariates,
family, fixed_lambda, bootstrap = FALSE) {

@@ -149,17 +147,6 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
fixed_lambda <- FALSE
}

if(missing(lambda2)) {
lambda2 <- 0
} else {
if(lambda2 < 0){
stop('Please provide a non-negative numeric value for lambda2')
}
if(lambda2 > 5){
lambda2 <- 5
}
}

if(missing(n_cores)) {
n_cores <- 1
} else {
@@ -213,12 +200,12 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
}

# Specify number of folds for cv.glmnet based on data size
if(nrow(data) < 50){
if(nrow(data) < 150){
# If fewer than 150 observations, use leave-one-out cv
n_folds <- nrow(data)
} else {
# If > 50 but < 100 observations, use 15-fold cv
if(nrow(data) < 100){
# If >= 150 but < 250 observations, use 15-fold cv
if(nrow(data) < 250){
n_folds <- 15
} else {
# else use the default for cv.glmnet (10-fold cv)
@@ -321,7 +308,7 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,

if(parallel_compliant){
clusterExport(NULL, c('mrf_data',
'lambda1','lambda2','n_nodes','family','fixed_lambda',
'lambda1','n_nodes','family','fixed_lambda',
'n_folds'),
envir = environment())

@@ -339,25 +326,19 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
penalized(response = mrf_data[,i],
penalized = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
lambda1 = lambda1, lambda2 = lambda2, steps = 1,
lambda1 = lambda1, steps = 1,
model = fam, standardize = TRUE, trace = F, maxiter = 25000)
})

} else {
#Use function 'cv.glmnet' from package glmnet if cross-validation is specified
#Each node-wise regression will be optimised separately using cv, reducing user-bias
clusterEvalQ(cl, library(glmnet))
alpha <- if(lambda2 == 0){
0
} else {
lambda2
}

mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
cv.glmnet(x = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
y = mrf_data[,i], family = family, alpha = alpha,
y = mrf_data[,i], family = family, alpha = 1,
nfolds = n_folds, weights = rep(1, nrow(mrf_data)),
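# candidate lambda1 values: 100 points log-spaced between 1e-5 and 5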
lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
#lambda = rev(seq(0.0001, 1, length.out = 100)),
intercept = TRUE, standardize = TRUE, maxit = 25000)

})
@@ -374,22 +355,16 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
mrf_mods <- lapply(seq_len(n_nodes), function(i) {
penalized(response = mrf_data[,i],
penalized = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
lambda1 = lambda1, lambda2 = lambda2, steps = 1,
lambda1 = lambda1, steps = 1,
model = fam, standardize = TRUE, trace = F, maxiter = 25000)
})

} else {
alpha <- if(lambda2 == 0){
0
} else {
lambda2
}

mrf_mods <- lapply(seq_len(n_nodes), function(i) {
cv.glmnet(x = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
y = mrf_data[,i], family = family, alpha = lambda2,
y = mrf_data[,i], family = family, alpha = 1,
nfolds = n_folds, weights = rep(1, nrow(mrf_data)),
lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
#lambda = rev(seq(0.0001, 1, length.out = 100)),
intercept = TRUE, standardize = TRUE, maxit = 25000)
})
}
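For orientation, a rough, self-contained sketch of the node-wise regression this file now fits when fixed_lambda = FALSE: a pure LASSO (alpha = 1) cv.glmnet fit of one node on the remaining columns. The simulated mrf_data matrix and the simple column drop are illustrative stand-ins for the design matrix and column handling that MRFcov builds internally, not the package's exact code.

library(glmnet)

# illustrative stand-in for the node/covariate matrix MRFcov assembles internally
set.seed(1)
mrf_data <- matrix(rbinom(200 * 4, size = 1, prob = 0.4), ncol = 4,
                   dimnames = list(NULL, paste0("sp", 1:4)))

i <- 1  # regression for the first node
node_mod <- cv.glmnet(x = mrf_data[, -i], y = mrf_data[, i],
                      family = "binomial",
                      alpha = 1,  # pure LASSO; the separate l2 (lambda2) penalty is dropped in 1.0.3
                      nfolds = 10,
                      lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
                      intercept = TRUE, standardize = TRUE, maxit = 25000)

coef(node_mod, s = "lambda.min")  # penalized coefficients linking node i to the other nodes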
57 changes: 21 additions & 36 deletions R/bootstrap_MRF.R
@@ -2,7 +2,9 @@
#'
#'This function runs \code{\link{MRFcov}} models multiple times using either a user-specified
#'range of l1 regularization values or cross-validation. To capture uncertainty
#'in parameter estimates, the dataset is bootstrapped a user-specified number of times in each iteration.
#'in parameter estimates, the dataset is shuffled and missing values are imputed
#'in each bootstrap iteration. Two parameters control the number of bootstrap iterations
#'to perform in order to facilitate optimal parallel computing.
#'
#'@importFrom magrittr %>%
#'@importFrom parallel makePSOCKcluster setDefaultCluster clusterExport stopCluster clusterEvalQ parLapply
@@ -16,16 +18,21 @@
#'@param n_bootstraps Positive integer. Represents the total number of bootstrap samples
#'to test in each of \code{n_its} or \code{seq(min_lambda1, max_lambda1, by_lambda1)} iterations.
#'Default is \code{10}.
#'@param n_its Positive integer. The number of iterations to perform \code{n_bootstraps} replicates
#'across. The use of two parameters provides more control for optimising parallel computations. It is
#'generally recommended to use a higher number for \code{n_its} relative to \code{n_bootstraps}, as
#'the job will be split across cores based on the \code{n_its} parameter.
#'If \code{fixed_lambda = TRUE}, this value is ignored and \code{n_its} is taken from the length of
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}. Default when \code{fixed_lambda = FALSE}
#'is \code{10}. Combined with the default \code{n_bootstraps} of \code{10}, this produces
#'\code{100} bootstrap replicates in total.
#'@param min_lambda1 Positive numeric. The lowest l1−regularization value to be tested.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically
#'@param max_lambda1 Positive numeric. The highest l1−regularization value to be tested.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically
#'@param by_lambda1 Positive numeric. The increment of the l1 test sequence. The test sequence is generated by calling
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically.
#'@param lambda2 Numeric \code{(>= 0)}. Value for l2−regularization, where larger values lead
#'to stronger shrinking of coefficient magnitudes. Default is 0, but larger values
#'may be necessary for large or particularly sparse datasets
#'@param symmetrise The method to use for symmetrising corresponding parameter estimates
#'(which are taken from separate regressions). Options are \code{min} (take the coefficient with the
#'smallest absolute value), \code{max} (take the coefficient with the largest absolute value)
@@ -48,14 +55,6 @@
#'at a single \code{lambda1} value (the same \code{lambda1} value is used for each separate
#'regression). Default is \code{FALSE}, meaning that arguments to \code{min_lambda1}, \code{max_lambda1}
#'and \code{by_lambda1} are ignored
#'@param n_its Positive integer. The number of iterations to perform \code{n_bootstraps} replicates
#'across. The use of two parameters provides more control for optimising parallel computations. It is
#'generally recommended to use a higher number for \code{n_its} relative to \code{n_bootstraps}, as
#'the job will be split across cores based on the \code{n_its} parameter.
#'If \code{fixed_lambda = TRUE}, this value is ignored and \code{n_its} is taken from the length of
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}. Default when \code{fixed_lambda = FALSE}
#'is \code{100}. This coincides with the default \code{n_bootstraps} of \code{10} to produce
#'\code{1000} bootstrap replicates in total.
#'@return A \code{list} containing:
#'\itemize{
#' \item \code{lambda_results}: if \code{fixed_lambda = TRUE}, a \code{list} of length \code{lambda1_seq} containing:
@@ -105,15 +104,14 @@
#'
#'# Perform 100 bootstrap replicates in total
#'bootedCRF <- bootstrap_MRF(data = Bird.parasites,
#' n_nodes = 4, n_bootstraps = 5,
#' n_its = 20,
#' n_nodes = 4,
#' family = 'binomial',
#' n_cores = 3)}
#'@export
#'
bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
max_lambda1, by_lambda1, lambda2, symmetrise,
n_nodes, n_cores, n_covariates, fixed_lambda, family, n_its){
bootstrap_MRF <- function(data, n_bootstraps, n_its, sample_seed, min_lambda1,
max_lambda1, by_lambda1, symmetrise,
n_nodes, n_cores, n_covariates, family, fixed_lambda){

#### Specify default parameter values and initiate warnings ####
if(!(family %in% c('gaussian', 'poisson', 'binomial')))
@@ -164,7 +162,7 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
}

if(missing(n_its)) {
n_its <- 100
n_its <- 10
}

#### If fixed_lambda = TRUE, check that lambda arguments are appropriate ####
Expand Down Expand Up @@ -201,28 +199,16 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
stop('Please provide a by_lambda1 that can be used as an
increment between min_lambda1 & max_lambda1')
}

if(missing(lambda2)) {
lambda2 <- 0
} else {
if(sign(lambda2) != 1){
stop('Please provide a non-negative value for lambda2
or use option "fixed_lambda = FALSE"')
}
}
} else{
lambda1 <- 1
if(missing(lambda2)){
lambda2 <- 0
}
}

#### Basic checks on data arguments ####
if(missing(n_nodes)) {
warning('n_nodes not specified. using ncol(data) as default, assuming no covariates',
call. = FALSE)
n_nodes <- ncol(data)
n_covariates = 0
n_covariates <- 0
} else {
if(sign(n_nodes) != 1){
stop('Please provide a positive integer for n_nodes')
@@ -364,7 +350,7 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
if(parallel_compliant){

#Export necessary data and variables to each cluster
clusterExport(NULL, c('lambda1_seq', 'booted_datas', 'lambda2',
clusterExport(NULL, c('lambda1_seq', 'booted_datas',
'symmetrise', 'n_nodes', 'n_bootstraps',
'n_covariates', 'fixed_lambda', 'family'), envir = environment())

@@ -389,13 +375,13 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
booted_mrfs <- lapply(seq_len(n_bootstraps), function(x) {
sample_data <- sample(seq_len(100), 1)
mod <- suppressWarnings(MRFcov(data = prepped_datas[[sample_data]], lambda1 = l,
lambda2 = lambda2,
symmetrise= symmetrise,
symmetrise = symmetrise,
n_nodes = n_nodes,
n_cores = 1,
prep_covariates = FALSE,
n_covariates = n_covariates,
fixed_lambda = fixed_lambda, family = family,
fixed_lambda = fixed_lambda,
family = family,
bootstrap = TRUE))

list(direct_coefs = mod$direct_coefs,
@@ -470,7 +456,6 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
booted_mrfs <- lapply(seq_len(n_bootstraps), function(x) {
sample_data <- sample(seq_len(100), 1)
mod <- suppressWarnings(MRFcov(data = prepped_datas[[sample_data]], lambda1 = l,
lambda2 = lambda2,
symmetrise = symmetrise,
n_nodes = n_nodes,
n_cores = 1,
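A usage sketch of how n_its and n_bootstraps now combine (their defaults of 10 each give 100 bootstrap replicates in total), following the Bird.parasites example from the documentation above; n_nodes, family and n_cores mirror that example, while n_its and n_bootstraps restate the new defaults:

library(MRFcov)
data("Bird.parasites")

# n_its x n_bootstraps replicates in total; the job is split across cores by
# n_its, so a larger n_its relative to n_bootstraps is generally recommended
bootedCRF <- bootstrap_MRF(data = Bird.parasites,
                           n_nodes = 4,
                           n_its = 10,
                           n_bootstraps = 10,
                           family = 'binomial',
                           n_cores = 3)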
8 changes: 4 additions & 4 deletions R/comp_rosalia_MRF.R
@@ -11,8 +11,8 @@
#'@param rosalia_mod A fitted \code{\link[rosalia]{rosalia}} model
#'@return A \code{list} of two objects:
#'\itemize{
#' \item \code{Positive_predictive_value}: The positive predictive value
#' \item \code{Negative_predictive_value}: The negative predictive value
#' \item \code{Positive_interactions}: The positive predictive value
#' \item \code{Negative_interactions}: The negative predictive value
#' }
#'
#'@seealso \code{\link{MRFcov}}, \code{\link[rosalia]{rosalia}}
@@ -46,6 +46,6 @@ pos.pred <- sum(true.pos, na.rm = TRUE)/

neg.pred <- sum(true.neg, na.rm = TRUE)/
(sum(true.neg, na.rm = TRUE) + sum(missed.neg, na.rm = TRUE))
return(list(Positive_predictive_value = pos.pred,
Negative_predictive_value = neg.pred))
return(list(Positive_interactions = pos.pred,
Negative_interactions = neg.pred))
}
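Downstream code reading the renamed return elements might look like the following, where comparison is a hypothetical object holding the list returned by this comparison function:

# `comparison` is a hypothetical object holding the returned list
comparison$Positive_interactions   # the positive predictive value
comparison$Negative_interactions   # the negative predictive value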