First we upload required packages.
library(MASS)
library(survival)
Now, we simulate right censored data.
n=5000 # the size of a cohort
ncov=20 # the number of covariates
# simulating 20 covariates
ncov=20
z=matrix(0, nrow=n, ncol=ncov)
# coefficient vector
betavec=c(0.5, 0.5, 0.35, -.35, rep(0, 16))
for (i in 1:20){
z[, i]=rbinom(n, 1, 0.5)
}
mu=z%*%betavec
epsilon.star=rweibull(n, 0.82, 29.27)
epsilon=log(epsilon.star)/sqrt(1.57)
mytime=exp(1+mu+ exp(-0.5*mu^2)*epsilon)
cen.time=z[, 1]+z[, 2]+runif(n, 0, 130)
# This led to approximately 70% right censored data
myv=apply(cbind(mytime, cen.time), 1, min)
mydel=as.numeric(mytime<cen.time)
mydata=data.frame(myv, mydel, z)
This is the end of data generations. Now we apply different variable selection tecniques to obtain results. In the variable selection we fit the log-normal family of distributions to the (accelerated failure time model) AFT model. One can use other distributions that can be specified in the survreg function under the survival package.
myFormula <- as.formula(paste('Surv(myv, mydel)~', paste(paste0('X',1:ncov), collapse='+')))
out1=survreg(myFormula, data=mydata, dist="lognormal")
outfull=out1
null <- as.formula('Surv(myv, mydel)~1')
full <- as.formula(paste('Surv(myv, mydel)~', paste(paste0('X',1:ncov), collapse='+')))
out0=survreg(null, data=mydata, dist="lognormal")
## Variable selection using the AIC criteria with the backward elimination
out2.aic.b=suppressWarnings(stepAIC(out1, trace=FALSE, direction="backward"))
out3.aic.b=out2.aic.b$terms
cat('variable selection with AIC and backward elimination: ', noquote(attr(out3.aic.b, "term.labels")), "\n")
## variable selection with AIC and backward elimination: X1 X2 X3 X4 X18
## Variable selection using the AIC criteria with the forward selection
out2.aic.f=suppressWarnings(stepAIC(out0, trace=FALSE, direction='forward',
scope=list(lower=null, upper=full)))
out3.aic.f=out2.aic.f$terms
cat('variable selection with AIC and forward selection: ', noquote(attr(out3.aic.f, "term.labels")), "\n")
## variable selection with AIC and forward selection: X1 X3 X4 X2 X18
## Variable selection using the BIC criteria with the backward elimination
out2.bic.b=suppressWarnings(stepAIC(out1, trace=FALSE, direction="backward", k=log(nrow(mydata))))
out3.bic.b=out2.bic.b$terms
cat('variable selection with BIC and backward elimination: ', noquote(attr(out3.bic.b, "term.labels")), "\n")
## variable selection with BIC and backward elimination: X1
## Variable selection using the BIC criteria with the forward selection
out0=survreg(null, data=mydata, dist="lognormal")
out2.bic.f=suppressWarnings(stepAIC(out0, trace=FALSE, direction='forward',
scope=list(lower=null, upper=full), k=log(nrow(mydata))))
out3.bic.f=out2.bic.f$terms
cat('variable selection with BIC and forward selection: ', noquote(attr(out3.bic.f, "term.labels")), "\n")
## variable selection with BIC and forward selection: X1
## Variable selection using the p-value criteria with the backward elimination
p.thr=0.1 # pre-specified cut-off of p-values p.thr=0.1
pout1.old=suppressWarnings(drop1(outfull, test="Chisq"))
pout1=pout1.old[-1, ]
mx=max(pout1[, 4])
index=paste0('X',1:ncov)
while(mx>p.thr){
ind=row.names(pout1)[pout1[, 4]==mx]
ind.1=which(index==ind)
index=index[-ind.1]
myFormula <- as.formula(paste('Surv(myv, mydel)~', paste(paste0(index), collapse='+')))
out1=survreg(myFormula, data=mydata, dist="lognormal")
pout1.old=suppressWarnings(drop1(out1, test="Chisq"))
pout1=pout1.old[-1, ]
mx=max(pout1[, 4])
}
cat('variable selection with using backward elimination and with a p-value threshold of', p.thr,': ', noquote(index), "\n")
## variable selection with using backward elimination and with a p-value threshold of 0.1 : X1 X2 X3 X4
## Variable selection using the p-value criteria with the forward selection
p.thr=0.1# pre-specified cut-off of p-values p.thr=0.1
pout2.old=suppressWarnings(add1(out0, scope = full, test = "Chisq"))
pout2=pout2.old[-1, ]
mn=min(pout2[, 4])
index=NULL;
while(mn<p.thr){
ind=row.names(pout2)[pout2[, 4]==mn]
index=c(index, ind)
myFormula <- as.formula(paste('Surv(myv, mydel)~', paste(paste0(index), collapse='+')))
out1=survreg(myFormula, data=mydata, dist="lognormal")
pout2.old=suppressWarnings(add1(out1, scope =full, test = "Chisq"))
pout2=pout2.old[-1, ]
mn=min(pout2[, 4])
}
cat('variable selection with using forward selection and p-value less or equal to', p.thr, 'criteria: ', noquote(index), "\n")
## variable selection with using forward selection and p-value less or equal to 0.1 criteria: X1 X3 X4 X2
Variable selection using the LASSO technique in the Cox proportional hazard model
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
y <- cbind(time=mydata$myv, status=mydata$mydel)
x <- as.matrix(mydata[,-c(1, 2)])
cvfit <- cv.glmnet(x, y, family="cox") #first do 10-fold cross-validation to select lambda
m <- glmnet(x, y, family="cox", lambda=cvfit$lambda.min) #plugin the optimal lambda
cat('Cox-LASSO selected: ', paste0('X', which(m$beta!=0)), '\n')
## Cox-LASSO selected: X2 X14 X18