updates for release of version 1.0.3
nicholasjclark committed May 16, 2018
1 parent 0dff981 commit d4825a6
Showing 29 changed files with 168 additions and 332 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: MRFcov
Type: Package
Title: Markov Random Fields with additional covariates
Version: 1.0.2
Date/Publication: 2018-01-08
Version: 1.0.3
Date/Publication: 2018-05-17
URL: https://github.com/nicholasjclark/MRFcov
Authors@R: c(
person("Nicholas J", "Clark", email = "[email protected]", role = c("aut", "cre")),
63 changes: 19 additions & 44 deletions R/MRFcov.R
@@ -14,12 +9,9 @@
#'left-most variables are variables that are to be represented by nodes in the graph
#'@param lambda1 Positive numeric. If \code{fixed_lambda = TRUE}, this value is used as the
#'l1−regularization parameter for all node-specific regressions. If \code{fixed_lambda = FALSE},
#'this value is ignored and penalization parameters are optimized automatically.
#'this value is ignored and penalization parameters are optimized automatically, using
#'coordinate descent to minimize cross-validated error.
#'See \code{\link[glmnet]{cv.glmnet}} for further details of \code{lambda1} optimisation
#'@param lambda2 Positive numeric. Value for l2−regularization, where larger values lead
#'to stronger shrinking of coefficient magnitudes. Default is \code{0}, but larger values
#'may be necessary for large or particularly sparse datasets. Takes values between
#'\code{0} and \code{5}.
#'@param symmetrise The method to use for symmetrising corresponding parameter estimates
#'(which are taken from separate regressions). Options are \code{min} (take the coefficient with the
#'smallest absolute value), \code{max} (take the coefficient with the largest absolute value)
@@ -35,7 +32,8 @@
#'columns in \code{data}, corresponding to no additional covariates
#'@param n_cores Positive integer. The number of cores to spread the job across using
#'\code{\link[parallel]{makePSOCKcluster}}. Default is 1 (no parallelisation)
#'@param n_covariates Positive integer. The number of covariates in \code{data}, before cross-multiplication
#'@param n_covariates Positive integer. The number of covariates in \code{data}, before cross-multiplication.
#'Default is \code{ncol(data) - n_nodes}
#'@param family The response type. Responses can be quantitative continuous (\code{family = "gaussian"}),
#'non-negative counts (\code{family = "poisson"}) or binomial 1s and 0s (\code{family = "binomial"})
#'@param fixed_lambda Logical. If \code{FALSE}, node-specific regressions are optimized using the
@@ -88,7 +86,7 @@
#'effects of the covariate on interactions between \code{j} and all other species
#'(\code{/j}). Note that coefficients must be estimated on the same scale in order
#'for the resulting models to be unified in a Markov Random Field. Counts for \code{poisson}
#'variables will be standardised using the square root mean transformation
#'variables will therefore be standardised using the square root mean transformation
#'\code{x = x / sqrt(mean(x ^ 2))} so that they are on similar ranges. These transformed counts
#'will then be used in a \code{(family = "gaussian")} model and their respective scaling factors
#'will be returned so that coefficients can be unscaled before interpretation (this unscaling is
@@ -100,8 +98,8 @@
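A minimal sketch of the square-root-mean standardisation described just above, assuming raw_counts holds one poisson node's raw counts (the object names are illustrative, not taken from the package):

raw_counts <- c(0, 2, 5, 1, 0, 3, 8)        # hypothetical counts for one poisson node
scale_factor <- sqrt(mean(raw_counts ^ 2))  # returned by MRFcov so coefficients can later be unscaled
scaled_counts <- raw_counts / scale_factor  # transformed counts used in the family = "gaussian" model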
#'\cr
#'Note that since the number of parameters quickly increases with increasing
#'numbers of species and covariates, LASSO penalization is used to regularize
#'regressions based on values of regularization parameters \code{lambda1} and
#'\code{lambda2}. This can be done either by minimising the cross-validated
#'regressions based on values of the regularization parameter \code{lambda1}.
#'This can be done either by minimising the cross-validated
#'mean error for each node separately (using \code{\link[glmnet]{cv.glmnet}}) or by
#'running all regressions at a single \code{lambda1} value. The latter approach may be
#'useful for optimising all nodes as part of a joint graphical model, while the former
@@ -116,7 +114,7 @@
#'
#'@export
#'
MRFcov <- function(data, lambda1, lambda2, symmetrise,
MRFcov <- function(data, lambda1, symmetrise,
prep_covariates, n_nodes, n_cores, n_covariates,
family, fixed_lambda, bootstrap = FALSE) {

@@ -149,17 +147,6 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
fixed_lambda <- FALSE
}

if(missing(lambda2)) {
lambda2 <- 0
} else {
if(lambda2 < 0){
stop('Please provide a non-negative numeric value for lambda2')
}
if(lambda2 > 5){
lambda2 <- 5
}
}

if(missing(n_cores)) {
n_cores <- 1
} else {
@@ -213,12 +200,12 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
}

# Specify number of folds for cv.glmnet based on data size
if(nrow(data) < 50){
if(nrow(data) < 150){
# If fewer than 150 observations, use leave-one-out cv
n_folds <- nrow(data)
} else {
# If > 50 but < 100 observations, use 15-fold cv
if(nrow(data) < 100){
# If >= 150 but < 250 observations, use 15-fold cv
if(nrow(data) < 250){
n_folds <- 15
} else {
# else use the default for cv.glmnet (10-fold cv)
@@ -321,7 +308,7 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,

if(parallel_compliant){
clusterExport(NULL, c('mrf_data',
'lambda1','lambda2','n_nodes','family','fixed_lambda',
'lambda1','n_nodes','family','fixed_lambda',
'n_folds'),
envir = environment())

@@ -339,25 +326,19 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
penalized(response = mrf_data[,i],
penalized = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
lambda1 = lambda1, lambda2 = lambda2, steps = 1,
lambda1 = lambda1, steps = 1,
model = fam, standardize = TRUE, trace = F, maxiter = 25000)
})

} else {
#Use function 'cv.glmnet' from package glmnet if cross-validation is specified
#Each node-wise regression will be optimised separately using cv, reducing user-bias
clusterEvalQ(cl, library(glmnet))
alpha <- if(lambda2 == 0){
0
} else {
lambda2
}

mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
mrf_mods <- parLapply(NULL, seq_len(n_nodes), function(i) {
cv.glmnet(x = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
y = mrf_data[,i], family = family, alpha = alpha,
y = mrf_data[,i], family = family, alpha = 1,
nfolds = n_folds, weights = rep(1, nrow(mrf_data)),
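# candidate lambda1 values: 100 points log-spaced between 1e-5 and 5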
lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
#lambda = rev(seq(0.0001, 1, length.out = 100)),
intercept = TRUE, standardize = TRUE, maxit = 25000)

})
@@ -374,22 +355,16 @@ MRFcov <- function(data, lambda1, lambda2, symmetrise,
mrf_mods <- lapply(seq_len(n_nodes), function(i) {
penalized(response = mrf_data[,i],
penalized = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
lambda1 = lambda1, lambda2 = lambda2, steps = 1,
lambda1 = lambda1, steps = 1,
model = fam, standardize = TRUE, trace = F, maxiter = 25000)
})

} else {
alpha <- if(lambda2 == 0){
0
} else {
lambda2
}

mrf_mods <- lapply(seq_len(n_nodes), function(i) {
cv.glmnet(x = mrf_data[, -which(grepl(colnames(mrf_data)[i], colnames(mrf_data)) == T)],
y = mrf_data[,i], family = family, alpha = lambda2,
y = mrf_data[,i], family = family, alpha = 1,
nfolds = n_folds, weights = rep(1, nrow(mrf_data)),
lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
#lambda = rev(seq(0.0001, 1, length.out = 100)),
intercept = TRUE, standardize = TRUE, maxit = 25000)
})
}
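For orientation, a rough, self-contained sketch of the node-wise regression this file now fits when fixed_lambda = FALSE: a pure LASSO (alpha = 1) cv.glmnet fit of one node on the remaining columns. The simulated mrf_data matrix and the simple column drop are illustrative stand-ins for the design matrix and column handling that MRFcov builds internally, not the package's exact code.

library(glmnet)

# illustrative stand-in for the node/covariate matrix MRFcov assembles internally
set.seed(1)
mrf_data <- matrix(rbinom(200 * 4, size = 1, prob = 0.4), ncol = 4,
                   dimnames = list(NULL, paste0("sp", 1:4)))

i <- 1  # regression for the first node
node_mod <- cv.glmnet(x = mrf_data[, -i], y = mrf_data[, i],
                      family = "binomial",
                      alpha = 1,  # pure LASSO; the separate l2 (lambda2) penalty is dropped in 1.0.3
                      nfolds = 10,
                      lambda = exp(seq(log(0.00001), log(5), length.out = 100)),
                      intercept = TRUE, standardize = TRUE, maxit = 25000)

coef(node_mod, s = "lambda.min")  # penalized coefficients linking node i to the other nodes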
57 changes: 21 additions & 36 deletions R/bootstrap_MRF.R
@@ -2,7 +2,9 @@
#'
#'This function runs \code{\link{MRFcov}} models multiple times using either a user-specified
#'range of l1 regularization values or cross-validation. To capture uncertainty
#'in parameter estimates, the dataset is bootstrapped a user-specified number of times in each iteration.
#'in parameter estimates, the dataset is shuffled and missing values are imputed
#'in each bootstrap iteration. Two parameters control the number of bootstrap iterations
#'to perform in order to facilitate optimal parallel computing.
#'
#'@importFrom magrittr %>%
#'@importFrom parallel makePSOCKcluster setDefaultCluster clusterExport stopCluster clusterEvalQ parLapply
@@ -16,16 +18,21 @@
#'@param n_bootstraps Positive integer. Represents the total number of bootstrap samples
#'to test in each of \code{n_its} or \code{seq(min_lambda1, max_lambda1, by_lambda1)} iterations.
#'Default is \code{10}.
#'@param n_its Positive integer. The number of iterations to perform \code{n_bootstraps} replicates
#'across. The use of two parameters provides more control for optimising parallel computations. It is
#'generally recommended to use a higher number for \code{n_its} relative to \code{n_bootstraps}, as
#'the job will be split across cores based on the \code{n_its} parameter.
#'If \code{fixed_lambda = TRUE}, this value is ignored and \code{n_its} is taken from the length of
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}. Default when \code{fixed_lambda = FALSE}
#'is \code{10}. Combined with the default \code{n_bootstraps} of \code{10}, this produces
#'\code{100} bootstrap replicates in total.
#'@param min_lambda1 Positive numeric. The lowest l1−regularization value to be tested.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically
#'@param max_lambda1 Positive numeric. The highest l1−regularization value to be tested.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically
#'@param by_lambda1 Positive numeric. The increment of the l1 test sequence. The test sequence is generated by calling
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}.
#'If \code{fixed_lambda = FALSE}, this value is ignored and penalization parameters are optimized automatically.
#'@param lambda2 Numeric \code{(>= 0)}. Value for l2−regularization, where larger values lead
#'to stronger shrinking of coefficient magnitudes. Default is 0, but larger values
#'may be necessary for large or particularly sparse datasets
#'@param symmetrise The method to use for symmetrising corresponding parameter estimates
#'(which are taken from separate regressions). Options are \code{min} (take the coefficient with the
#'smallest absolute value), \code{max} (take the coefficient with the largest absolute value)
@@ -48,14 +55,6 @@
#'at a single \code{lambda1} value (the same \code{lambda1} value is used for each separate
#'regression). Default is \code{FALSE}, meaning that arguments to \code{min_lambda1}, \code{max_lambda1}
#'and \code{by_lambda1} are ignored
#'@param n_its Positive integer. The number of iterations to perform \code{n_bootstraps} replicates
#'across. The use of two parameters provides more control for optimising parallel computations. It is
#'generally recommended to use a higher number for \code{n_its} relative to \code{n_bootstraps}, as
#'the job will be split across cores based on the \code{n_its} parameter.
#'If \code{fixed_lambda = TRUE}, this value is ignored and \code{n_its} is taken from the length of
#'\code{lambda1_seq = seq(min_lambda1, max_lambda1, by_lambda1)}. Default when \code{fixed_lambda = FALSE}
#'is \code{100}. This coincides with the default \code{n_bootstraps} of \code{10} to produce
#'\code{1000} bootstrap replicates in total.
#'@return A \code{list} containing:
#'\itemize{
#' \item \code{lambda_results}: if \code{fixed_lambda = TRUE}, a \code{list} of length \code{lambda1_seq} containing:
@@ -105,15 +104,14 @@
#'
#'# Perform 100 bootstrap replicates in total
#'bootedCRF <- bootstrap_MRF(data = Bird.parasites,
#' n_nodes = 4, n_bootstraps = 5,
#' n_its = 20,
#' n_nodes = 4,
#' family = 'binomial',
#' n_cores = 3)}
#'@export
#'
bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
max_lambda1, by_lambda1, lambda2, symmetrise,
n_nodes, n_cores, n_covariates, fixed_lambda, family, n_its){
bootstrap_MRF <- function(data, n_bootstraps, n_its, sample_seed, min_lambda1,
max_lambda1, by_lambda1, symmetrise,
n_nodes, n_cores, n_covariates, family, fixed_lambda){

#### Specify default parameter values and initiate warnings ####
if(!(family %in% c('gaussian', 'poisson', 'binomial')))
@@ -164,7 +162,7 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
}

if(missing(n_its)) {
n_its <- 100
n_its <- 10
}

#### If fixed_lambda = TRUE, check that lambda arguments are appropriate ####
Expand Down Expand Up @@ -201,28 +199,16 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
stop('Please provide a by_lambda1 that can be used as an
increment between min_lambda1 & max_lambda1')
}

if(missing(lambda2)) {
lambda2 <- 0
} else {
if(sign(lambda2) != 1){
stop('Please provide a non-negative value for lambda2
or use option "fixed_lambda = FALSE"')
}
}
} else{
lambda1 <- 1
if(missing(lambda2)){
lambda2 <- 0
}
}

#### Basic checks on data arguments ####
if(missing(n_nodes)) {
warning('n_nodes not specified. using ncol(data) as default, assuming no covariates',
call. = FALSE)
n_nodes <- ncol(data)
n_covariates = 0
n_covariates <- 0
} else {
if(sign(n_nodes) != 1){
stop('Please provide a positive integer for n_nodes')
@@ -364,7 +350,7 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
if(parallel_compliant){

#Export necessary data and variables to each cluster
clusterExport(NULL, c('lambda1_seq', 'booted_datas', 'lambda2',
clusterExport(NULL, c('lambda1_seq', 'booted_datas',
'symmetrise', 'n_nodes', 'n_bootstraps',
'n_covariates', 'fixed_lambda', 'family'), envir = environment())

@@ -389,13 +375,13 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
booted_mrfs <- lapply(seq_len(n_bootstraps), function(x) {
sample_data <- sample(seq_len(100), 1)
mod <- suppressWarnings(MRFcov(data = prepped_datas[[sample_data]], lambda1 = l,
lambda2 = lambda2,
symmetrise= symmetrise,
symmetrise = symmetrise,
n_nodes = n_nodes,
n_cores = 1,
prep_covariates = FALSE,
n_covariates = n_covariates,
fixed_lambda = fixed_lambda, family = family,
fixed_lambda = fixed_lambda,
family = family,
bootstrap = TRUE))

list(direct_coefs = mod$direct_coefs,
@@ -470,7 +456,6 @@ bootstrap_MRF <- function(data, n_bootstraps, sample_seed, min_lambda1,
booted_mrfs <- lapply(seq_len(n_bootstraps), function(x) {
sample_data <- sample(seq_len(100), 1)
mod <- suppressWarnings(MRFcov(data = prepped_datas[[sample_data]], lambda1 = l,
lambda2 = lambda2,
symmetrise = symmetrise,
n_nodes = n_nodes,
n_cores = 1,
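A usage sketch of how n_its and n_bootstraps now combine (their defaults of 10 each give 100 bootstrap replicates in total), following the Bird.parasites example from the documentation above; n_nodes, family and n_cores mirror that example, while n_its and n_bootstraps restate the new defaults:

library(MRFcov)
data("Bird.parasites")

# n_its x n_bootstraps replicates in total; the job is split across cores by
# n_its, so a larger n_its relative to n_bootstraps is generally recommended
bootedCRF <- bootstrap_MRF(data = Bird.parasites,
                           n_nodes = 4,
                           n_its = 10,
                           n_bootstraps = 10,
                           family = 'binomial',
                           n_cores = 3)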
8 changes: 4 additions & 4 deletions R/comp_rosalia_MRF.R
@@ -11,8 +11,8 @@
#'@param rosalia_mod A fitted \code{\link[rosalia]{rosalia}} model
#'@return A \code{list} of two objects:
#'\itemize{
#' \item \code{Positive_predictive_value}: The positive predictive value
#' \item \code{Negative_predictive_value}: The negative predictive value
#' \item \code{Positive_interactions}: The positive predictive value
#' \item \code{Negative_interactions}: The negative predictive value
#' }
#'
#'@seealso \code{\link{MRFcov}}, \code{\link[rosalia]{rosalia}}
@@ -46,6 +46,6 @@ pos.pred <- sum(true.pos, na.rm = TRUE)/

neg.pred <- sum(true.neg, na.rm = TRUE)/
(sum(true.neg, na.rm = TRUE) + sum(missed.neg, na.rm = TRUE))
return(list(Positive_predictive_value = pos.pred,
Negative_predictive_value = neg.pred))
return(list(Positive_interactions = pos.pred,
Negative_interactions = neg.pred))
}
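Downstream code reading the renamed return elements might look like the following, where comparison is a hypothetical object holding the list returned by this comparison function:

# `comparison` is a hypothetical object holding the returned list
comparison$Positive_interactions   # the positive predictive value
comparison$Negative_interactions   # the negative predictive value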