Non-Linear Relationship Plots with CIs from Bootstrap
在流行病学分析当中,我们经常会遇到或者想看一下是否存在非线性的关系。在这里,我整理了一下线性回归、Logistic回归、Cox回归中如何画一个效应指标对连续性变量$X$的图,并且使用bootstrap方法估计可信区间。
##### linear regression
mydata = plotRCS::cancer[,2:6]
fit1<-lm(size ~ ns(age, df = 3) + sex + race + metastasis, data = plotRCS::cancer)
dat1 <- data.frame(age = seq(20, 80), sex = "Male", race = "White", metastasis = "No")
dat2 <- data.frame(age = rep(46, 61), sex="Male", race = "White", metastasis = "No")
yhat1 <- predict(fit1, dat1)
yhat2 <- predict(fit1, dat2)
plot(dat1$age, -yhat2+yhat1, type ="l", ylim = c(-15, 90))
boot_func <- function(mydata, indices) {
sample_data <- mydata[indices, ]
myfit <- lm(size ~ ns(age, df = 3) + race + sex + metastasis, data = sample_data)
yhat1 <- predict(myfit, dat1)
yhat2 <- predict(myfit, dat2)
yhatdiff <- yhat1 - yhat2
return(yhatdiff)
}
# 运行bootstrap
set.seed(123) # 设置随机种子以确保结果可重复
n_boot <- 1000 # 设置bootstrap的次数
boot_results <- boot(mydata, boot_func, R = n_boot)
# 提取结果
boot_diff <- t(boot_results$t)
# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))
# 准备用于绘图的数据
age_seq <- seq(20,80)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper, ydiff = -yhat2+yhat1)
lines(ci_data$age, ci_data$lower, lty = 2)
lines(ci_data$age, ci_data$upper, lty = 2)
#########################################
# logistic regression: OR = exp(beta)
mydata = plotRCS::cancer[,2:7]
fit2<-glm(status ~ ns(age, df = 3) + size + sex + race + metastasis, family = "binomial",data = plotRCS::cancer)
dat1 <- data.frame(age = seq(20, 80), sex = "Male", race = "White", metastasis = "No", size = 45)
dat2 <- data.frame(age = rep(46, 61), sex="Male", race = "White", metastasis = "No", size = 45)
logoddshat1 <- predict(fit2, dat1)
logoddshat2 <- predict(fit2, dat2)
plot(dat1$age, exp(logoddshat1-logoddshat2), type ="l", ylim = c(0, 10))
boot_func <- function(mydata, indices) {
sample_data <- mydata[indices, ]
myfit <- glm(status ~ ns(age, df = 3) + race + sex + metastasis+size, family = "binomial", data = sample_data)
logoddshat1 <- predict(myfit, dat1)
logoddshat2 <- predict(myfit, dat2)
logoddshatdiff <- logoddshat1 - logoddshat2
return(logoddshatdiff)
}
# 运行bootstrap
set.seed(123) # 设置随机种子以确保结果可重复
n_boot <- 1000 # 设置bootstrap的次数
boot_results <- boot(mydata, boot_func, R = n_boot)
# 提取结果
boot_diff <- t(boot_results$t)
# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))
# 准备用于绘图的数据
age_seq <- seq(20,80)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper)
lines(ci_data$age, exp(ci_data$lower), lty = 2)
lines(ci_data$age, exp(ci_data$upper), lty = 2)
abline(h = 1, lty = 2, col = 2)
####################################################
# Cox regression: HR = exp(beta)
library(bshazard)
library(survival)
fit3 <- coxph(Surv(futime, death) ~ age, data = mgus)
basehzard <- bshazard(Surv(futime/365/100, death) ~ 1, data = mgus)
plot(basehzard, ylim = c(2,35), ylab = "Baseline Hazard Rate")
# 预测累积风险 (H(t))
cumulative_hazard <- predict(fit3, type = "expected")
# 计算生存概率
survival_prob <- exp(-cumulative_hazard) # which is the same as:
survival_predict <- predict(fit3, type = "survival")
lp_predict <- predict(fit3, type = "lp")
riskscore <- predict(fit3, type = "risk")
#
mfit <- coxph(Surv(futime, death) ~ sex + pspline(age, df=4), data=mgus)
ptemp <- termplot(mfit, se=TRUE, plot=FALSE)
ageterm <- ptemp$age
center <- with(ageterm, y[x==64])
ytemp <- ageterm$y + outer(ageterm$se, c(0, -1.96, 1.96), '*')
matplot(ageterm$x, exp(ytemp - center), log='y',
type='l', lty=c(1,2,2), col=1,
xlab="Age at diagnosis", ylab="Relative Death Rate")
abline(h = 1, col = 2, lty = 2)
newdata1 <- data.frame(
age = seq(34, 90, length = 100), sex = as.factor("female")
)
newdata2 <- data.frame(
age = rep(64, length = 100), sex = as.factor("female")
)
lp1 <- predict(mfit, newdata = newdata1, type = "lp")
lp2 <- predict(mfit, newdata = newdata2, type = "lp")
lines(seq(34, 90, length = 100), exp(lp1-lp2), col = 2)
plot(seq(34, 90, length = 100), exp(lp1-lp2), col = 2, log = "y", type="l", ylim = c(0.02,20))
# bootstrap for confidence intervals
mydata <- mgus
boot_func <- function(mydata, indices) {
sample_data <- mydata[indices, ]
myfit <- coxph(Surv(futime, death) ~ sex + pspline(age, df=4), data=sample_data)
lp1 <- predict(myfit, newdata1, type = "lp")
lp2 <- predict(myfit, newdata2, type = "lp")
lpdiff <- lp1-lp2
return(lpdiff)
}
# 运行bootstrap
set.seed(123)
n_boot <- 1000
boot_results <- boot(mydata, boot_func, R = n_boot)
# 提取结果
boot_diff <- t(boot_results$t)
# 计算每个age点的95%可信区间
ci_lower <- apply(boot_diff, 1, function(x) quantile(x, 0.025))
ci_upper <- apply(boot_diff, 1, function(x) quantile(x, 0.975))
age_seq <- seq(34, 90, length = 100)
ci_data <- data.frame(age = age_seq, lower = ci_lower, upper = ci_upper)
lines(ci_data$age, exp(ci_data$lower), lty = 2, col = 2)
lines(ci_data$age, exp(ci_data$upper), lty = 2, col = 2)
abline(v = 64, lty = 2, col = 1)
####################