I modified a function in an R6 class defining a pipeline step for supervised learning, specifically using cross-validation (CV). This class encapsulates a learner and performs cross-validation on the training data (see also this link for more details).
When running this function, I get the following error message:
Error in self$assert(xs) :
Assertion on 'xs' failed: Parameter 'keep_response' not available. Did you mean 'resampling.method' / 'resampling.folds' / 'resampling.repeats'?.
The problem comes from this line of code:
super$initialize(id, alist(resampling = private$.crossval_param_set, private$.learner$param_set), param_vals = param_vals, can_subset_cols = TRUE, task_type = task_type, tags = c("learner", "ensemble"))
I have put print()
statements everywhere, but every time the variable keep_response
appears. I am completely new to object-oriented programming in R, so your help would be greatly appreciated.
Here is a reproducible example to use the function:
data <- data.frame(ID = 1:1742, x = runif(1742, -130.88, -61.12), y = runif(1742, 12.12, 61.38), year = runif(1742, 2005, 2020), presence = rep(0:1, each=871), V1 = runif(1742, -3.66247, 2.95120), V2 = runif(1742, -1.6501, 7.5510))
data$presence <- as.factor(data$presence)
## summary(data)
task <- mlr3spatial::as_task_classif_st(x = data, target = "presence", positive = "1", coordinate_names = c("x", "y"), crs = "+proj=longlat +datum=WGS84 +no_defs +type=crs")
task$set_col_roles("ID", roles = "space")
task$set_col_roles("year", roles = "time")
source("H:/PipeOpLearnerCV_mod.R")
learner_glmboost <- mlr3::lrn("classif.glmboost", predict_type = "prob", family = "Binomial")
po_learner_glmboost <- PipeOpLearnerCV_mod$new(learner = learner_glmboost, param_vals = list(resampling.method = "sptcv_cstf", resampling.folds = 2, keep_response = FALSE))
Here is the function with my edits and print()
:
PipeOpLearnerCV_mod = R6Class("PipeOpLearnerCV_mod",
inherit = PipeOpTaskPreproc,
public = list(
initialize = function(learner, id = NULL, param_vals = list()){
private$.learner = as_learner(learner, clone = TRUE)
if(mlr3pipelines:::paradox_info$is_old){
private$.learner$param_set$set_id = ""
}
########################################################################
## My edits
id = ifelse(is.null(id), private$.learner$id, id)
print(id)
## id = id %??% private$.learner$id
# FIXME: can be changed when mlr-org/mlr3#470 has an answer
########################################
type = private$.learner$task_type
task_type = mlr_reflections$task_types[type, mult = "first"]$task
########################################################################
## My edits
private$.crossval_param_set = ps(
method = p_fct(levels = c("cv", "insample", "sptcv_cstf", "repeated_sptcv_cstf"), tags = c("train", "required")),
folds = p_int(lower = 2L, upper = Inf, tags = c("train", "required")),
repeats = p_int(lower = 1L, upper = Inf),
stratify = p_lgl(default = FALSE),
keep_response = p_lgl(tags = c("train", "required"))
)
## My edits
if(param_vals$resampling.method == "sptcv_cstf"){private$.crossval_param_set$values = list(method = param_vals$resampling.method,
folds = param_vals$resampling.folds,
keep_response = param_vals$keep_response)
print(private$.crossval_param_set$values)}
if(param_vals$resampling.method == "repeated_sptcv_cstf"){private$.crossval_param_set$values = list(method = param_vals$resampling.method,
folds = param_vals$resampling.folds,
repeats = param_vals$resampling.repeats,
keep_response = param_vals$keep_response)
print(private$.crossval_param_set$values)}
if(param_vals$resampling.method == "cv" || param_vals$resampling.method == "insample"){private$.crossval_param_set$values = list(method = "cv", folds = 3, keep_response = FALSE)
print(private$.crossval_param_set$values)}
if(mlr3pipelines:::paradox_info$is_old){
private$.crossval_param_set$set_id = "resampling"
}
# Dependencies in paradox have been broken from the start and this is known since at least a year:
# https://github.com/mlr-org/paradox/issues/216
# The following would make it _impossible_ to set "method" to "insample", because then "folds"
# is both _required_ (required tag above) and at the same time must be unset (because of this
# dependency). We will opt for the least annoying behaviour here and just not use dependencies
# in PipeOp ParamSets.
# private$.crossval_param_set$add_dep("folds", "method", CondEqual$new("cv")) # don't do this.
########################################################################
print("A")
a <- private$.crossval_param_set
print(a)
print("B")
b <- private$.learner$param_set
print(b)
print("C")
c <- param_vals
print(c)
print("D")
print(id)
print("E")
print(task_type)
super$initialize(id, alist(resampling = private$.crossval_param_set, private$.learner$param_set), param_vals = param_vals, can_subset_cols = TRUE, task_type = task_type, tags = c("learner", "ensemble"))
## super$initialize(id, alist(resampling = private$.crossval_param_set, private$.learner$param_set), param_vals = param_vals, can_subset_cols = TRUE, task_type = task_type, tags = c("learner", "ensemble"))
print("F")
}
),
active = list(
learner = function(val){
if(!missing(val)){
if(!identical(val, private$.learner)){
stop("$learner is read-only.")
}
}
private$.learner
},
learner_model = function(val){
if(!missing(val)){
if(!identical(val, private$.learner)){
stop("$learner_model is read-only.")
}
}
if(is.null(self$state) || is_noop(self$state)){
private$.learner
} else {
multiplicity_recurse(self$state, clone_with_state, learner = private$.learner)
}
},
predict_type = function(val){
if(!missing(val)){
assert_subset(val, names(mlr_reflections$learner_predict_types[[private$.learner$task_type]]))
private$.learner$predict_type = val
}
private$.learner$predict_type
}
),
private = list(
.train_task = function(task){
on.exit({private$.learner$state = NULL})
# Train a learner for predicting
self$state = private$.learner$train(task)$state
pv = private$.crossval_param_set$values
print(pv)
print("bbbbbb")
# Compute CV Predictions
if(pv$method != "insample"){
rdesc = mlr_resamplings$get(pv$method)
if(pv$method == "cv"){rdesc$param_set$values = list(folds = pv$folds)}
########################################################################
## My edits
if(pv$method == "sptcv_cstf"){rdesc$param_set$values = list(folds = pv$folds, stratify = pv$stratify)}
if(pv$method == "repeated_sptcv_cstf"){rdesc$param_set$values = list(folds = pv$folds, repeats = pv$repeats, stratify = pv$stratify)}
########################################################################
rr = resample(task, private$.learner, rdesc)
prds = as.data.table(rr$prediction(predict_sets = "test"))
} else {
prds = as.data.table(private$.learner$predict(task))
}
private$pred_to_task(prds, task)
},
.predict_task = function(task){
on.exit({private$.learner$state = NULL})
private$.learner$state = self$state
prediction = as.data.table(private$.learner$predict(task))
private$pred_to_task(prediction, task)
},
pred_to_task = function(prds, task){
if(!is.null(prds$truth)) prds[, truth := NULL]
if(!self$param_set$values$resampling.keep_response && self$learner$predict_type == "prob"){
prds[, response := NULL]
}
renaming = setdiff(colnames(prds), c("row_id", "row_ids"))
data.table::setnames(prds, renaming, sprintf("%s.%s", self$id, renaming))
# This can be simplified for mlr3 >= 0.11.0;
# will be always "row_ids"
row_id_col = intersect(colnames(prds), c("row_id", "row_ids"))
data.table::setnames(prds, old = row_id_col, new = task$backend$primary_key)
task$select(character(0))$cbind(prds)
},
.crossval_param_set = NULL,
.learner = NULL,
.additional_phash_input = function() private$.learner$phash
)
)
mlr_pipeops$add("learner_cv", PipeOpLearnerCV_mod, list(R6Class("Learner", public = list(id = "learner_cv", task_type = "classif", param_set = ps()))$new()))