Non-Linear Relationship Plots with CIs from Bootstrap

在流行病学分析当中，我们经常会遇到或者想看一下是否存在非线性的关系。在这里，我整理了一下线性回归、Logistic回归、Cox回归中如何画一个效应指标对连续性变量$X$的图，并且使用bootstrap方法估计可信区间。
##### linear regression
mydata = plotRCS::cancer[,2:6]

fit1<-lm(size ~ ns(age, df = 3) + sex + race + metastasis, data = plotRCS::cancer)
dat1 <- data.frame(age = seq(20, 80), sex = "Male", race = "White", metastasis = "No")
dat2 <- data.frame(age = rep(46, 61), sex="Male", race = "White", metastasis = "No")
yhat1 <- predict(fit1, dat1)
yhat2 <- predict(fit1, dat2)
plot(dat1$age, -yhat2+yhat1, type ="l", ylim = c(-15, 90))

boot_func <- function(mydata, indices) {
  sample_data <- mydata[indices, ]
  myfit <- lm(size ~ ns(age, df = 3) + race + sex + metastasis, data = sample_data)
  yhat1 <- predict(myfit, dat1)
  yhat2 <- predict(myfit, dat2)
  yhatdiff <- yhat1 - yhat2
  return(yhatdiff)
}

# 运行bootstrap
set.seed(123)  # 设置随机种子以确保结果可重复
n_boot <- 1000  # 设置bootstrap的次数
boot_results <- boot(mydata, boot_func, R = n_boot)

# 提取结果
boot_diff <- t(boot_results$t)

# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))

# 准备用于绘图的数据
age_seq <- seq(20,80)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper, ydiff = -yhat2+yhat1)
lines(ci_data$age, ci_data$lower, lty = 2)
lines(ci_data$age, ci_data$upper, lty = 2)

#########################################

# logistic regression: OR = exp(beta)

mydata = plotRCS::cancer[,2:7]

fit2<-glm(status ~ ns(age, df = 3) + size + sex + race + metastasis, family = "binomial",data = plotRCS::cancer)
dat1 <- data.frame(age = seq(20, 80), sex = "Male", race = "White", metastasis = "No", size = 45)
dat2 <- data.frame(age = rep(46, 61), sex="Male", race = "White", metastasis = "No", size = 45)
logoddshat1 <- predict(fit2, dat1)
logoddshat2 <- predict(fit2, dat2)
plot(dat1$age, exp(logoddshat1-logoddshat2), type ="l", ylim = c(0, 10))

boot_func <- function(mydata, indices) {
  sample_data <- mydata[indices, ]
  myfit <- glm(status ~ ns(age, df = 3) + race + sex + metastasis+size, family = "binomial", data = sample_data)
  logoddshat1 <- predict(myfit, dat1)
  logoddshat2 <- predict(myfit, dat2)
  logoddshatdiff <- logoddshat1 - logoddshat2
  return(logoddshatdiff)
}

# 运行bootstrap
set.seed(123)  # 设置随机种子以确保结果可重复
n_boot <- 1000  # 设置bootstrap的次数
boot_results <- boot(mydata, boot_func, R = n_boot)

# 提取结果
boot_diff <- t(boot_results$t)

# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))

# 准备用于绘图的数据
age_seq <- seq(20,80)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper)
lines(ci_data$age, exp(ci_data$lower), lty = 2)
lines(ci_data$age, exp(ci_data$upper), lty = 2)
abline(h = 1, lty = 2, col = 2)

####################################################

# Cox regression: HR = exp(beta)

library(bshazard)
library(survival)

fit3 <- coxph(Surv(futime, death) ~ age, data = mgus)

basehzard <- bshazard(Surv(futime/365/100, death) ~ 1, data = mgus)
plot(basehzard, ylim = c(2,35), ylab = "Baseline Hazard Rate")

# 预测累积风险 (H(t))
cumulative_hazard <- predict(fit3, type = "expected")

# 计算生存概率
survival_prob <- exp(-cumulative_hazard) # which is the same as:
survival_predict <- predict(fit3, type = "survival")

lp_predict <- predict(fit3, type = "lp")
riskscore <- predict(fit3, type = "risk")

# 
mfit <- coxph(Surv(futime, death) ~ sex + pspline(age, df=4), data=mgus)
ptemp <- termplot(mfit, se=TRUE, plot=FALSE)
ageterm <- ptemp$age
center <- with(ageterm, y[x==64])

ytemp <- ageterm$y + outer(ageterm$se, c(0, -1.96, 1.96), '*')

matplot(ageterm$x, exp(ytemp - center), log='y',
        type='l', lty=c(1,2,2), col=1,
        xlab="Age at diagnosis", ylab="Relative Death Rate")
abline(h = 1, col = 2, lty = 2)

newdata1 <- data.frame(
  age = seq(34, 90, length = 100), sex = as.factor("female")
)
newdata2 <- data.frame(
  age = rep(64, length = 100), sex = as.factor("female")
)

lp1 <- predict(mfit, newdata = newdata1, type = "lp")
lp2 <- predict(mfit, newdata = newdata2, type = "lp")

lines(seq(34, 90, length = 100), exp(lp1-lp2), col = 2)

plot(seq(34, 90, length = 100), exp(lp1-lp2), col = 2, log = "y", type="l", ylim = c(0.02,20))

# bootstrap for confidence intervals
mydata <- mgus
boot_func <- function(mydata, indices) {
  sample_data <- mydata[indices, ]
  myfit <- coxph(Surv(futime, death) ~ sex + pspline(age, df=4), data=sample_data)
  lp1 <- predict(myfit, newdata1, type = "lp")
  lp2 <- predict(myfit, newdata2, type = "lp")
  lpdiff <- lp1-lp2
  return(lpdiff)
}

# 运行bootstrap
set.seed(123)  
n_boot <- 1000 
boot_results <- boot(mydata, boot_func, R = n_boot)

# 提取结果
boot_diff <- t(boot_results$t)

# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))

age_seq <- seq(34, 90, length = 100)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper)
lines(ci_data$age, exp(ci_data$lower), lty = 2, col = 2)
lines(ci_data$age, exp(ci_data$upper), lty = 2, col = 2)
abline(v = 64, lty = 2, col = 1)

####################