I'm attempting to calculate the time-dependent AUC for survival models using the survAUC package in RStudio. I'm encountering a problem with large datasets, where if the initial dataset exceeds 150,000 records and/or the training subset goes beyond 50,000 records, the process abruptly stops/crashes after some time.
Is there a solution to address this issue, or can someone recommend an alternative function? Your assistance is greatly appreciated. Thank you.
PS: I have 512 GB RAM.
My current workaround: to obtain the AUC, I first have to reduce the initial dataset by simple random sampling (SRS) with
# Simple random sample of 150,000 rows of data1, drawn without replacement.
srs_rows <- sample(seq_len(nrow(data1)), size = 150000)
data2 <- data1[srs_rows, ]
before creating the training subset:
# Draw 45,000 rows of the reduced data for training (without replacement);
# every row whose ID was not drawn goes to the test set.
train_rows <- sample(seq_len(nrow(data2)), size = 45000)
TRAIN_data <- data2[train_rows, ]
in_train <- data2$ID %in% TRAIN_data$ID
TEST_data <- data2[!in_train, ]
# Evaluation time points: 1/12 to 17 in steps of 1/12.
times <- seq(from = 1/12, to = 17, by = 1/12)
Code to reproduce the issue:
# Fix the RNG state so the simulated data set is reproducible.
set.seed(123)

# Number of simulated observations.
n <- 320000

# Draw n integer codes uniformly from 1..k (with replacement).
draw_levels <- function(k) {
  sample(1:k, n, replace = TRUE)
}

# Simulated data set. Arguments are evaluated in the order written, so the
# order of the columns below also fixes the order of the random draws.
data1 <- data.frame(
  # Exponential times (rate 0.02), clamped to the interval [0, 18].
  Time   = pmin(pmax(rexp(n, rate = 0.02), 0), 18),
  # Binary status variable.
  Status = sample(0:1, n, replace = TRUE),
  # Continuous covariates.
  V1  = runif(n, 18, 80),
  V2  = rnorm(n, mean = 170, sd = 10),
  V3  = rnorm(n, mean = 70, sd = 10),
  V4  = rnorm(n, mean = 110, sd = 10),
  # Integer-coded covariates (converted to factors further down).
  V5  = draw_levels(6),
  V6  = draw_levels(4),
  V7  = draw_levels(2),
  V8  = draw_levels(2),
  V9  = draw_levels(3),
  V10 = draw_levels(2),
  V11 = draw_levels(2),
  V12 = draw_levels(3),
  V13 = draw_levels(3),
  V14 = draw_levels(3),
  V15 = draw_levels(3),
  # Continuous covariates on (0, 1) and (0.5, 1).
  V16 = runif(n, 0, 1),
  V17 = runif(n, 0.5, 1),
  V18 = draw_levels(2),
  # Factors with 18 and 280 levels.
  V19 = as.factor(draw_levels(18)),
  V20 = as.factor(draw_levels(280)),
  # Row identifier.
  ID  = seq_len(n)
)

# Column positions to store as factors: 7:17 are V5-V15, 20:22 are V18-V20
# (V19 and V20 are already factors, so converting them again changes nothing).
f1 <- c(7:17, 20:22)
data1[f1] <- lapply(data1[f1], as.factor)
library(survival)
library(survminer)
library(survAUC)
# Train/test split: 100,000 randomly chosen rows (without replacement) form
# the training set; every row whose ID was not drawn forms the test set.
train_rows <- sample(seq_len(nrow(data1)), size = 100000)
TRAIN_data <- data1[train_rows, ]
in_train <- data1$ID %in% TRAIN_data$ID
TEST_data <- data1[!in_train, ]

# Evaluation time points: 1/12 to 17 in steps of 1/12.
times <- seq(from = 1/12, to = 17, by = 1/12)

# Cox model with V1-V18 as covariates, stratified by V19 and by V20.
formula1 <- formula(
  Surv(Time, Status) ~
    V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 +
    V10 + V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 +
    strata(V19) + strata(V20)
)

# AUC
# Fit on the training rows, then get predictions for the training data itself
# and for the held-out test data.
train.fit <- coxph(formula1, data = TRAIN_data)
lp <- predict(train.fit)
lpnew <- predict(train.fit, newdata = TEST_data)

# Survival responses for the training and the test rows.
Surv.rsp <- with(TRAIN_data, Surv(Time, Status))
Surv.rsp.new <- with(TEST_data, Surv(Time, Status))

# NOTE(review): this is the call reported to stop/crash once the data set
# exceeds roughly 150,000 rows or the training subset 50,000 rows; the cause
# is inside survAUC and is not established here -- TODO confirm.
AUC <- AUC.cd(Surv.rsp, Surv.rsp.new, lp, lpnew, times)

# Plot the AUC curve and add a horizontal reference line at 0.5.
plot(AUC)
abline(h = 0.5)
