survAUC large dataset issue

48 views Asked by At

I'm attempting to calculate the time-dependent AUC for survival models using the survAUC package in RStudio. I'm encountering a problem with large datasets, where if the initial dataset exceeds 150,000 records and/or the training subset goes beyond 50,000 records, the process abruptly stops/crashes after some time.

enter image description here

Is there a solution to address this issue, or can someone recommend an alternative function? Your assistance is greatly appreciated. Thank you.

PS: I have 512 GB RAM.

My current "solution": to get the AUC, I first have to reduce the initial dataset by taking a simple random sample (SRS) with

# Simple random sample, without replacement, of 150,000 rows of data1.
data2 <- data1[sample.int(nrow(data1), size = 150000), ]

before creating the training subset:

# Draw 45,000 rows of data2 (without replacement) as the training subset;
# every row whose ID was not drawn goes to the test subset.
TRAIN_data <- data2[sample.int(nrow(data2), size = 45000), ]
TEST_data <- data2[is.na(match(data2[["ID"]], TRAIN_data[["ID"]])), ]
# Evaluation grid: from 1/12 up to 17 in steps of 1/12.
times <- seq(from = 1 / 12, to = 17, by = 1 / 12)

Reproduce the issue:

# Fix the RNG state so the simulated dataset is reproducible
set.seed(123)

# Sample size of the simulated dataset
n <- 320000

# Simulated survival data. The columns are generated in the order listed,
# so the sequence of random draws (and hence the data) is tied to the seed.
data1 <- data.frame(
  # Exponential event times, clamped to the interval [0, 18]
  Time = pmin(18, pmax(0, rexp(n, rate = 0.02))),
  # Binary status variable (0/1)
  Status = sample.int(2L, n, replace = TRUE) - 1L,
  # Continuous covariates
  V1 = runif(n, min = 18, max = 80),
  V2 = rnorm(n, mean = 170, sd = 10),
  V3 = rnorm(n, mean = 70, sd = 10),
  V4 = rnorm(n, mean = 110, sd = 10),
  # Integer-coded categorical covariates
  V5 = sample.int(6L, n, replace = TRUE),
  V6 = sample.int(4L, n, replace = TRUE),
  V7 = sample.int(2L, n, replace = TRUE),
  V8 = sample.int(2L, n, replace = TRUE),
  V9 = sample.int(3L, n, replace = TRUE),
  V10 = sample.int(2L, n, replace = TRUE),
  V11 = sample.int(2L, n, replace = TRUE),
  V12 = sample.int(3L, n, replace = TRUE),
  V13 = sample.int(3L, n, replace = TRUE),
  V14 = sample.int(3L, n, replace = TRUE),
  V15 = sample.int(3L, n, replace = TRUE),
  # Continuous covariates on [0, 1] and [0.5, 1]
  V16 = runif(n, min = 0, max = 1),
  V17 = runif(n, min = 0.5, max = 1),
  V18 = sample.int(2L, n, replace = TRUE),
  # Factors with up to 18 and 280 levels
  V19 = as.factor(sample.int(18L, n, replace = TRUE)),
  V20 = as.factor(sample.int(280L, n, replace = TRUE)),
  # Row identifier
  ID = seq_len(n)
)

# Positions of the categorical columns (7-17 and 20-22); convert them all
# to factors in a single pass.
f1 <- c(7:17, 20:22)
data1[f1] <- lapply(data1[f1], as.factor)

library(survival)
library(survminer)
library(survAUC)
# Draw 100,000 rows of data1 (without replacement) as the training subset;
# every row whose ID was not drawn goes to the test subset.
TRAIN_data <- data1[sample.int(nrow(data1), size = 100000), ]
TEST_data <- data1[is.na(match(data1[["ID"]], TRAIN_data[["ID"]])), ]
# Evaluation grid: from 1/12 up to 17 in steps of 1/12.
times <- seq(from = 1 / 12, to = 17, by = 1 / 12)

# Model specification: V1-V18 enter as covariates, V19 and V20 as strata.
formula1 <- Surv(Time, Status) ~
  V1 + V2 + V3 + V4 + V5 + V6 +
  V7 + V8 + V9 + V10 + V11 + V12 +
  V13 + V14 + V15 + V16 + V17 + V18 +
  strata(V19) + strata(V20)

# Time-dependent AUC
# Fit the Cox model on the training subset.
train.fit <- coxph(formula1, data = TRAIN_data)

# Linear predictors for the training rows and for the held-out test rows.
lp <- predict(train.fit, type = "lp")
lpnew <- predict(train.fit, newdata = TEST_data, type = "lp")

# Survival responses of the training and test subsets.
Surv.rsp <- Surv(TRAIN_data[["Time"]], TRAIN_data[["Status"]])
Surv.rsp.new <- Surv(TEST_data[["Time"]], TEST_data[["Status"]])

# AUC evaluated at every point of the time grid.
AUC <- AUC.cd(
  Surv.rsp = Surv.rsp,
  Surv.rsp.new = Surv.rsp.new,
  lp = lp,
  lpnew = lpnew,
  times = times
)

# Plot the AUC curve with a horizontal reference line at 0.5.
plot(AUC)
abline(h = 0.5)
0

There are 0 answers