How to create 100 maps with bootstrapping using a stacked ensemble fit with tidymodels


I am using tidymodels to stack five models (random forest, Cubist, XGBoost, support vector machine and neural network) for a final prediction that will result in a raster map. My data cannot be shared, so here is a reproducible example: farm data from the CAST package, where the goal is to map soil moisture. My plan is to produce 100 maps from the stacked models and compute a 90% confidence interval map. Below is the code for one map.

My way forward is to bootstrap the data 100 times so that for each data split: (1) the models are fine-tuned, (2) then stacked, (3) the final map is predicted, (4) the maps are saved on my computer (e.g. map001.tif, map002.tif, …map100.tif), and finally (5) each ensemble fit that produced a map is saved locally (e.g. fit001.rds, fit002.rds…fit100.rds). It should be possible to run a loop over the bootstrap resamples for that purpose, and that is where I am having issues. I would appreciate any help.
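This is roughly the loop structure I have in mind. It is only a sketch: fit_one_stack() is a placeholder for the single-map tuning/stacking pipeline shown below (not a real function), and red_data and rast_grid are created further down.

# Sketch only -- does not run as-is. fit_one_stack() stands in for the
# tuning + stacking pipeline below; red_data and rast_grid are defined there.
library(tidymodels)  # rsample provides bootstraps() and analysis()
library(terra)

set.seed(123)
boots <- bootstraps(red_data, times = 100, strata = VW)

for (i in seq_len(nrow(boots))) {
  boot_data <- analysis(boots$splits[[i]])        # one bootstrap sample
  fit_ens_i <- fit_one_stack(boot_data)           # (1)-(2) tune and stack (placeholder)
  map_i     <- predict(rast_grid, fit_ens_i)      # (3) predict the raster map
  writeRaster(map_i, sprintf("map%03d.tif", i))   # (4) save the map
  saveRDS(fit_ens_i, sprintf("fit%03d.rds", i))   # (5) save the ensemble fit
}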

# Helper packages
library(dplyr)
library(lubridate)  # year() and week() used below
library(CAST)
library(sf)
library(terra)      # rast(), raster predict(), writeRaster()

# For modelling
library(tidymodels)
library(tidyverse)
library(finetune)
library(stacks)
library(tidypredict)

# Run in parallel
library(doParallel)

# Load and split the data
set.seed(123)  # for reproducibility
data <- get(load(system.file("extdata","Cookfarm.RData",package="CAST")))
data_sp <- unique(data[,c("SOURCEID","Easting","Northing")])


# Select data for year 2012 and reduce to the modelling variables
sel_data <- data[data$altitude == -0.3 &
                   year(data$Date) == 2012 &
                   week(data$Date) %in% 10:12, ]

red_data <- sel_data %>% 
  dplyr::select("VW", "DEM", "TWI", "Precip_cum", "cday", "NDRE.M")

# Set number of folds and repetition
set.seed(123)
es_folds <- 
  vfold_cv(red_data, strata = VW, v = 10, repeats = 3)

# Creating two recipes 
# centered and scaled predictors for SVM

normalized_rec <- 
  recipe(VW ~ ., data = red_data) %>% 
  step_normalize(all_predictors()) 

poly_recipe <- 
  normalized_rec %>% 
  step_poly(all_predictors()) %>% 
  step_interact(~ all_predictors():all_predictors())

# Create a set of model specifications

nnet_spec <- 
  mlp(hidden_units = tune(), penalty = tune(), epochs = tune()) %>% 
  set_engine("nnet", MaxNWts = 2600) %>% 
  set_mode("regression")

svm_r_spec <- 
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")

rf_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger",importance = "permutation") %>% 
  set_mode("regression")

xgb_spec <- 
  boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(), 
             min_n = tune(), sample_size = tune(), trees = tune()) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

cubist_spec <- 
  cubist_rules(committees = tune(), neighbors = tune()) %>% 
  set_engine("Cubist") 

nnet_param <- 
  nnet_spec %>% 
  extract_parameter_set_dials() %>% 
  update(hidden_units = hidden_units(c(1, 27)))

#------------------------------------------------------------------------------#
#------------------- creating workflow set ------------------------------------#
#------------------------------------------------------------------------------#

normalized <- 
  workflow_set(
    preproc = list(normalized = normalized_rec), 
    models = list(SVM_radial = svm_r_spec,  
                  neural_network = nnet_spec)
    
  )

# add the neural network parameter object:
normalized <- 
  normalized %>% 
  option_add(param_info = nnet_param, 
             id = "normalized_neural_network")


# create another workflow for non linear models
model_vars <- 
  workflow_variables(outcomes = VW, 
                     predictors = everything())

no_pre_proc <- 
  workflow_set(
    preproc = list(simple = model_vars), 
    models = list(RF = rf_spec, boosting = xgb_spec, 
                  Cubist = cubist_spec)
  )

# Finally, combine the two workflow sets into one

all_workflows <- 
  bind_rows(no_pre_proc, normalized) %>% 
  # Make the workflow ID's a little more simple: 
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflows

#------------------------------------------------------------------------------#
#------------------- fine tuning models ---------------------------------------#
#------------------------------------------------------------------------------#

# Register a parallel backend
all_cores <- parallel::detectCores(logical = FALSE)
ncore <- all_cores-2
cl <- makePSOCKcluster(ncore)
registerDoParallel(cl)

# Use here race control
race_ctrl <-
  control_race(
    allow_par=TRUE,
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

race_results <-
  all_workflows %>%
  workflow_map(
    fn="tune_race_anova",
    seed = 123,
    resamples = es_folds,
    grid = 100,
    control = race_ctrl
  )

#------------------------------------------------------------------------------#
#------------------- Stack the models -----------------------------------------#
#------------------------------------------------------------------------------#

# Stack the models
es_stack <-
  stacks() %>%
  add_candidates(race_results)

# Blend the models
set.seed(123)
ens <- blend_predictions(es_stack, penalty = 10^seq(-2, -0.5, length = 20))

# Fit the ensemble models
fit_ens <- fit_members(ens)


#------------------------------------------------------------------------------#
#------------------- Making the maps ------------------------------------------#
#------------------------------------------------------------------------------#

# This is the grid of the study area
rast_grid <- rast(system.file("extdata", "predictors_2012-03-25.grd", package = "CAST"))

# Predict the map
r_vw_map <- predict(rast_grid, fit_ens)
plot(r_vw_map)

stopCluster(cl)
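Once the 100 maps exist on disk, my idea for the 90% confidence interval map is to stack them and take cell-wise 5th and 95th percentiles, something like this (assuming the map001.tif … map100.tif files written by the loop sketched above; the output file name is just an example):

# Combine the bootstrap maps into a 90% confidence interval map
# (assumes map001.tif ... map100.tif were written by the loop above)
library(terra)

map_files <- list.files(pattern = "^map\\d{3}\\.tif$", full.names = TRUE)
map_stack <- rast(map_files)  # one layer per bootstrap map

# cell-wise 5th and 95th percentiles across the bootstrap layers
ci_map <- app(map_stack, fun = quantile, probs = c(0.05, 0.95), na.rm = TRUE)
names(ci_map) <- c("lower_90", "upper_90")

plot(ci_map)
writeRaster(ci_map, "vw_ci90_map.tif")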
