R Segmentation Fault in Parallel Process with RestRserve and mcparallel
I’m developing a REST API using RestRserve in R to manage trading bots. The API allows starting and stopping bot processes. However, I’m encountering segmentation faults and other memory-related errors when starting and stopping bots multiple times. My goal is baisically to create a REST API that can spawn and manage long running processes (bots) in parallel.
Here’s a minimal example that reproduces the issue:
Minimal example
box::use(RestRserve[Application, BackendRserve])
box::use(R6)
box::use(later)
box::use(parallel)
box::use(uuid)
# Define the TradeBot class
TradeBot <- R6$R6Class(
"TradeBot",
public = list(
is_running = FALSE,
timer = NULL,
id = NULL,
bot_file = NULL,
initialize = function(id, bot_file) {
self$bot_file <- bot_file
self$is_running <- FALSE
self$timer <- NULL
self$id <- id
},
start = function() {
if (!self$is_running) {
self$is_running <- TRUE
private$run()
cat(sprintf("Bot %s startedn", self$id))
} else {
cat(sprintf("Bot %s is already runningn", self$id))
}
},
stop = function() {
if (self$is_running) {
self$is_running <- FALSE
if (!is.null(self$timer)) {
self$timer()
self$timer <- NULL
}
cat(sprintf("Bot %s stoppedn", self$id))
} else {
cat(sprintf("Bot %s is not runningn", self$id))
}
}
),
private = list(
check_status = function() {
tryCatch({
processes <- readRDS(self$bot_file)
if (is.null(processes[[self$id]])) {
self$stop()
return(FALSE)
}
return(TRUE)
}, error = function(e) {
cat(sprintf("Error checking status for Bot %s: %sn", self$id, e$message))
return(FALSE)
})
},
run = function() {
if (!private$check_status()) {
return(NULL)
}
cat(sprintf("Bot %s is runningn", self$id))
self$timer <- later$later(function() {
tryCatch({
private$run()
}, error = function(e) {
cat(sprintf("Error in Bot %s: %sn", self$id, e$message))
self$stop()
})
}, 1, loop = later$global_loop())
}
)
)
# Use file to store bot processes; simulates database
bot_file <- "bot_processes.Rds"
if (!file.exists(bot_file)) {
processes <- list()
saveRDS(processes, file = bot_file)
}
# Function to start a new bot process
start_bot_process <- function(bot_id) {
processes <- readRDS(bot_file)
if (!is.null(processes[[bot_id]])) {
return(FALSE) # Bot already exists
}
bot_process <- parallel$mcparallel({
tryCatch({
bot <- TradeBot$new(bot_id, bot_file)
bot$start()
while (bot$is_running) {
later$run_now(all = TRUE, loop = later$global_loop())
Sys.sleep(0.1) # Add a small delay to prevent excessive CPU usage
}
}, error = function(e) {
cat("Error in bot process:", e$message, "n")
}, finally = {
cat(sprintf("Bot %s process exitingn", bot_id))
})
})
processes[[bot_id]] <- list(pid = bot_process$pid)
saveRDS(processes, file = bot_file)
return(TRUE)
}
# Function to stop a bot process
stop_bot_process <- function(bot_id) {
processes <- readRDS(bot_file)
if (is.null(processes[[bot_id]])) {
return(FALSE) # Bot doesn't exist
}
# Signal the bot to stop
processes[[bot_id]] <- NULL
saveRDS(processes, file = bot_file)
# Wait for the process to finish
Sys.sleep(2)
return(TRUE)
}
# Create the REST API application
app <- Application$new()
app$add_get("/", function(request, response) {
response$set_body("Welcome to the Trade Bot API")
})
app$add_get("/start-bot", function(request, response) {
bot_id <- request$parameters_query[["id"]]
if (is.null(bot_id)) {
bot_id <- uuid$UUIDgenerate()
}
if (start_bot_process(bot_id)) {
response$set_body(sprintf("Bot %s started", bot_id))
} else {
response$set_status_code(400)
response$set_body(sprintf("Bot %s already running", bot_id))
}
})
app$add_get("/stop-bot", function(request, response) {
bot_id <- request$parameters_query[["id"]]
if (is.null(bot_id)) {
response$set_status_code(400)
response$set_body("Bot ID is required")
return(NULL)
}
if (stop_bot_process(bot_id)) {
response$set_body(sprintf("Bot %s stopped", bot_id))
} else {
response$set_status_code(400)
response$set_body(sprintf("Bot %s not found or could not be stopped", bot_id))
}
})
app$add_get("/list-bots", function(request, response) {
processes <- readRDS(bot_file)
bot_list <- names(processes)
response$set_body(paste("Running bots:", paste(bot_list, collapse = ", ")))
})
# Start the server
backend <- BackendRserve$new()
backend$start(app, http_port = 8080)
Problem
When I start and stop bots multiple times using curl commands, I encounter various memory-related errors:
(base) work@Derecks-MacBook-Air dev % Rscript rest-bots-independent/index.R
{"timestamp":"2024-08-14 21:05:32.656289","level":"INFO","name":"Application","pid":32644,"msg":"","context":{"http_port":8080,"endpoints":{"HEAD":["/","/start-bot","/stop-bot","/list-bots"],"GET":["/","/start-bot","/stop-bot","/list-bots"]}}}
-- running Rserve in this R session (pid=32644), 2 server(s) --
(This session will block until Rserve is shut down)
Bot 1a861785-533f-4cf1-b668-6986da137c23 is running
Bot 1a861785-533f-4cf1-b668-6986da137c23 started
Bot 1a861785-533f-4cf1-b668-6986da137c23 is running
Bot 1a861785-533f-4cf1-b668-6986da137c23 is running
Bot 1a861785-533f-4cf1-b668-6986da137c23 is running
Bot 1a861785-533f-4cf1-b668-6986da137c23 stopped
Bot 1a861785-533f-4cf1-b668-6986da137c23 process exiting
*** caught bus error ***
address 0x102f47028, cause 'invalid alignment'
*** caught ***
Bot 158977f5-0cd6-4302-ad01-3e905f79387e is running
Bot 158977f5-0cd6-4302-ad01-3e905f79387e started
Bot 158977f5-0cd6-4302-ad01-3e905f79387e is running
Bot 158977f5-0cd6-4302-ad01-3e905f79387e is running
Bot 158977f5-0cd6-4302-ad01-3e905f79387e is running
Bot 158977f5-0cd6-4302-ad01-3e905f79387e stopped
Bot 158977f5-0cd6-4302-ad01-3e905f79387e process exiting
*** caught segfault ***
address 0x4000000102f47018, cause 'invalid permissions'
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 is running
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 started
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 is running
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 is running
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 stopped
Bot 3a472bc4-8a32-4980-9120-52985ce41ce5 process exiting
*** caught segfault ***
address 0x4000000102f47228, cause 'invalid permissions'
Bot 6259a15d-03f1-4174-8155-e5e4677811aa is running
Bot 6259a15d-03f1-4174-8155-e5e4677811aa started
Bot 6259a15d-03f1-4174-8155-e5e4677811aa is running
Bot 6259a15d-03f1-4174-8155-e5e4677811aa stopped
Bot 6259a15d-03f1-4174-8155-e5e4677811aa process exiting
*** caught segfault ***
address 0x4000000102f47228, cause 'invalid permissions'
{"timestamp":"2024-08-14 21:06:37.898974","level":"ERROR","name":"Application","pid":33791,"msg":"","context":{"request_id":"fc61d990-5aaa-11ef-9041-8ad61b804eb5","message":{"error":"bad value","call":"deparse(call, nlines = 1L)","traceback":["FUN(request, response)","start_bot_process(bot_id)",["parallel$mcparallel({"," tryCatch({"," bot <- TradeBot$new(bot_id, bot_file)"," bot$start()"," while (bot$is_running) {"," later$run_now(all = TRUE, loop = later$global_loop())"," Sys.sleep(0.1)"," }"," }, error = function(e) {"," cat("Error in bot process:", e$message, "\n")"," }, finally = {"," cat(sprintf("Bot %s process exiting\n", bot_id))"," })","})"],"mcexit(1L, structure("fatal error in wrapper code", class = "try-error"))","try(sendMaster(send, FALSE), silent = TRUE)",["tryCatch(expr, error = function(e) {"," call <- conditionCall(e)"," if (!is.null(call)) {"," if (identical(call[[1L]], quote(doTryCatch))) "," call <- sys.call(-4L)"," dcall <- deparse(call, nlines = 1L)"," prefix <- paste("Error in", dcall, ": ")"," LONG <- 75L"," sm <- strsplit(conditionMessage(e), "\n")[[1L]]"," w <- 14L + nchar(dcall, type = "w") + nchar(sm[1L], type = "w")"," if (is.na(w)) "," w <- 14L + nchar(dcall, type = "b") + nchar(sm[1L], "," type = "b")"," if (w > LONG) "," prefix <- paste0(prefix, "\n ")"," }"," else prefix <- "Error : ""," msg <- paste0(prefix, conditionMessage(e), "\n")"," .Internal(seterrmessage(msg[1L]))"," if (!silent && isTRUE(getOption("show.error.messages"))) {"," cat(msg, file = outFile)"," .Internal(printDeferredWarnings())"," }"," invisible(structure(msg, class = "try-error", condition = e))","})"],"tryCatchList(expr, classes, parentenv, handlers)","tryCatchOne(expr, names, parentenv, handlers[[1L]])","value[[3L]](cond)","deparse(call, nlines = 1L)"]}}}
^CCaught break signal, shutting down Rserve.
[1] TRUE
(base) work@Derecks-MacBook-Air dev % curl "localhost:8080/start-bot"
Bot 1a861785-533f-4cf1-b668-6986da137c23 started%
(base) work@Derecks-MacBook-Air dev % curl "localhost:8080/stop-bot?id=1a861785-533f-4cf1-b668-6986da137c23"
Bot 1a861785-533f-4cf1-b668-6986da137c23 stopped%
These errors occur seemingly after starting and stopping bots every time.
Question
- What could be causing these segmentation faults and memory errors?
- How can I properly manage my parallel processes to prevent these crashes?
- Is there a better way to implement this bot management system using RestRserve and parallel processing in R?
Any insights or suggestions would be greatly appreciated. Thank you!
NB: related discussion: https://github.com/rexyai/RestRserve/discussions/216#discussioncomment-10331021