我试图弄清楚如何描述我的连续变量。不幸的是,我并不了解所有的统计数据。我真的很感激你们能在这里帮助我。为了更好地说明我的问题,我编写了以下示例。
library(rms)
library(survival)
data(pbc)
d <- pbc
rm(pbc, pbcseq)
d$status <- ifelse(d$status != 0, 1, 0)
dd = datadist(d)
options(datadist='dd')
# linear model
f1 <- cph(Surv(time, status) ~ albumin, data=d)
p1 <- Predict(f1, fun=exp)
(a1 <- anova(f1))
Function(f1)
plot(p1, anova=a1, pval=TRUE, ylab="Hazard Ratio")
# rcs model
f2 <- cph(Surv(time, status) ~ rcs(albumin, 4), data=d)
p2 <- Predict(f2, fun=exp)
(a2 <- anova(f2))
Function(f2)
plot(p2, anova=a2, pval=TRUE, ylab="Hazard Ratio")
# minimal CI width
p1$diff <- p1$upper-p1$lower
min(p1$diff) # = 0.002321521
p1[which(p1$diff==min(p1$diff)),]$albumin # = 3.494002
describe(d$albumin) # mean = 3.497
p2$diff <- p2$upper-p2$lower
min(p2$diff) # = 0.2039817
p2[which(p2$diff==min(p2$diff)),]$albumin # = 3.502447
describe(d$albumin) # mean = 3.497
# both models in a single figure
p <- rbind(linear.model=p1, rcs.model=p2)
library(ggplot2)
df <- data.frame(albumin=p$albumin, yhat=p$yhat, lower=p$lower, upper=p$upper, predictor=p$.set.)
(g <- ggplot(data=df, aes(x=albumin, y=yhat, group=predictor, color=predictor)) + geom_line(size=1))
(g <- g + geom_ribbon(data=df, aes(ymin=lower, ymax=upper), alpha=0.2, linetype=0))
(g <- g + theme_bw())
(g <- g + xlab("Albumin"))
(g <- g + ylab("Hazard Ratio"))
(g <- g + theme(axis.line = element_line(color='black', size=1)))
(g <- g + theme(axis.ticks = element_line(color='black', size=1)))
(g <- g + theme( plot.background = element_blank() ))
(g <- g + theme( panel.grid.minor = element_blank() ))
(g <- g + theme( panel.border = element_blank() ))
- 为什么显示线性模型(p1)的图不是直线?
- 如何在同一图中绘制模型 f1 和 f2?
- 如何比较模型 f1 和 f2 以研究哪些模型更适合数据?...就像生存包中 coxph 的 anova()
- 为什么在线性 (f1) 模型中,接近白蛋白平均值的最小 CI 宽度更明显?
- 图中的 P 值是什么意思?我如何解释 anova(...) 的输出
更新 #1
根据 Harrell 的回答,我更新了上面的代码,显示了如何将两个预测变量的样条图组合在一个图中。最后一个问题:我如何比较两个 rms 模型anova(m1, m2)
,如下所示的生存包?
> m1 <- coxph(Surv(time, status) ~ albumin, data=d)
> m2 <- coxph(Surv(time, status) ~ pspline(albumin), data=d)
> anova(m1, m2) # compare models
Analysis of Deviance Table
Cox model: response is Surv(time, status)
Model 1: ~ albumin
Model 2: ~ pspline(albumin)
loglik Chisq Df P(>|Chi|)
1 -975.61
2 -973.26 4.6983 11 0.9449
> summary(m1)
Call:
coxph(formula = Surv(time, status) ~ albumin, data = d)
n= 418, number of events= 186
coef exp(coef) se(coef) z Pr(>|z|)
albumin -1.4695 0.2300 0.1714 -8.574 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
exp(coef) exp(-coef) lower .95 upper .95
albumin 0.23 4.347 0.1644 0.3219
Concordance= 0.688 (se = 0.023 )
Rsquare= 0.147 (max possible= 0.992 )
Likelihood ratio test= 66.6 on 1 df, p=3.331e-16
Wald test = 73.51 on 1 df, p=0
Score (logrank) test = 72.38 on 1 df, p=0
更新#2
我想我只是自己回答了我的“最后一个问题”(见下文)。我希望这不会意外显示正确。我认为我可以比较模型cph
,coxph
不是吗?计算自由度的方法df
是否正确?
> # using coxph from survival
> m1 <- coxph(Surv(time, status) ~ albumin, data=d)
> m2 <- coxph(Surv(time, status) ~ albumin + age, data=d)
> # loglik = a vector of length 2 containing the log-likelihood with the initial values and with the final values of the coefficients.
> m1$loglik[2]
[1] -975.6126
> m2$loglik[2]
[1] -973.2272
> (df <- abs(length(m1$coefficients) - length(m2$coefficients)))
[1] 1
> (LR <- 2 * (m2$loglik[2] - m1$loglik[2]))
[1] 4.770787
> pchisq(LR, df, lower=FALSE)
[1] 0.02894659
> anova(m2, m1)
Analysis of Deviance Table
Cox model: response is Surv(time, status)
Model 1: ~ albumin + age
Model 2: ~ albumin
loglik Chisq Df P(>|Chi|)
1 -973.23
2 -975.61 4.7708 1 0.02895 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> m1 <- cph(Surv(time, status) ~ albumin, data=d)
> m2 <- cph(Surv(time, status) ~ albumin + age, data=d)
> # loglik = a vector of length 2 containing the log-likelihood with the initial values and with the final values of the coefficients.
> m1$loglik[2]
[1] -975.6126
> m2$loglik[2]
[1] -973.2272
> (df <- abs(length(m1$coefficients) - length(m2$coefficients)))
[1] 1
> (LR <- 2 * (m2$loglik[2] - m1$loglik[2]))
[1] 4.770787
> pchisq(LR, df, lower=FALSE)
[1] 0.02894659
更新#3 我按照 DWin 的友好回答更改了示例,如下所示。这样,应该正确计算自由度:
library(Hmisc)
library(rms)
library(ggplot2)
library(gridExtra)
data(pbc)
d <- pbc
rm(pbc, pbcseq)
d$status <- ifelse(d$status != 0, 1, 0)
### log likelihood test using a coxph model
m1 <- coxph(Surv(time, status) ~ albumin, data=d)
m2 <- coxph(Surv(time, status) ~ albumin + age, data=d)
# loglik = a vector of length 2 containing the log-likelihood with the initial values and with the final values of the coefficients.
m1$loglik[2]
m2$loglik[2]
(df <- abs(sum(anova(m1)$Df, na.rm=TRUE) - sum(anova(m2)$Df, na.rm=TRUE)))
(LR <- 2 * (m2$loglik[2] - m1$loglik[2])) # the most parsimonious models have to be first
pchisq(LR, df, lower=FALSE)
anova(m2, m1)
### log likelihood test using a cph model
dd = datadist(d)
options(datadist='dd')
m3 <- cph(Surv(time, status) ~ albumin, data=d)
m4 <- cph(Surv(time, status) ~ albumin + age, data=d)
# loglik = a vector of length 2 containing the log-likelihood with the initial values and with the final values of the coefficients.
m3$loglik[2]
m4$loglik[2]
(df <- abs(print(anova(m3)[, "d.f."])[['TOTAL']] - print(anova(m4)[, "d.f."])[['TOTAL']]))
(LR <- 2 * (m4$loglik[2] - m3$loglik[2])) # the most parsimonious models have to be first
pchisq(LR, df, lower=FALSE)