fz <- read.table("~/LifeData/FZ/feiglz.dat", header=T) dead <- rep(1,33) ; lwbc <- log(wbc) ; y <- log(tfz) fz.C.ag <- coxph( Surv(t, dead) ~ ag) # fit a Cox proportional hazard model with covariate 'ag' fz.C.lwbc <- coxph( Surv(t, dead) ~ lwbc) fz.C.2 <- coxph( Surv(t, dead) ~ lwbc+ag) ######### Residuals: mresid.ag <- residuals(fz.C.ag) # Martingale residuals are the default dresid.ag <- residuals(fz.C.ag, type="deviance") # deviance residuals sresid.ag <- residuals(fz.C.ag, type="schoenfeld") # Schoenfeld residuals for each covariate rC.ag <- dead-mresid.ag # Cox-Snell residuals: M= dead - rC, so rC=dead -M # The Cox-Snell residuals are used in a probability plot to # check overall fit. Remember - exponentially distributed. skm.rC <- summary( survfit(Surv(rC.ag,dead)) ) rCd <- skm.rC$time # CS resids for observed deaths or failures rCsurv <- skm.rC$surv # the estimated survival at death times # This is an efficient way to get the points we need for the exponential plot plot(rCd, -log(rCsurv), xlab="Cox-Snell residuals", ylab="Cumulative hazard", main="Exponential probability plot");abline(a=0,b=1) # -log(rCsurv) is the estimated H(t) =- log( S(t)) # As these should be standard exponential, they should follow the line y=x. # What do you think of the plot? # plot the martingale residuals against 'lwbc' to see whether there is an assocation, # and what form it takes. # M = dead - H(t) times exp(beta AG): if the model includes all the necessary #covariates, the scatter will be random. Otherwise, the pattern indicates #what funtion might be needed. plot(lwbc,mresid.ag, xlab="log (wbc)", ylab="Martingale residuals", main="Martingale residuals, Cox fit to AG vs lwbc") # With this plot, the relation with lwbc is more obvious that with the residual plot # shown for the exponential fit shown for the exponential model. # It might be useful to have some indication of what the pattern is. # the function scatter.smooth() does this: scatter.smooth(lwbc,mresid.ag, xlab="log (wbc)", ylab="Martingale residuals", main="Martingale residuals, Cox fit to AG vs lwbc") # A straight line is OK, but perhaps it reaches an upper limit about lwbc=10? # (the smooth line is created by generalised spline fitting) # plot the deviance residuals to see if there are outliers: plot(dresid.ag,ylab="Deviance residuals",main="Deviance residuals") # no extremely large or small values. # plot the deviance residuals vs lwbs to see where outliers are plot(lwbc,dresid.ag,xlab="log(wbc)",ylab="Deviance residuals", main="Deviance residuals vs log(wbc)") # smallest residual at lowest lwbc, highest at large lwbc # The largest values of dresid.ag =2.4 are for the same value of lwbc, # which is why there is one point, not two, as we see in the index plot. ######### several plots on one page. par(mfrow=c(2,2)) # split the page into 2 rows, 2 cols. plot(rCd, -log(rCsurv), xlab="Cox-Snell residuals", ylab="Cumulative hazard", main="Exponential probability plot");abline(a=0,b=1) scatter.smooth(lwbc,mresid.ag, xlab="log (wbc)", ylab="Martingale residuals", main="Martingale residuals, Cox fit to AG vs lwbc") plot(dresid.ag,ylab="Deviance residuals",main="Deviance residuals") plot(lwbc,dresid.ag,xlab="log(wbc)",ylab="Deviance residuals", main="Deviance residuals vs log(wbc)") # plots are entered in row 1, col 1, row 1, col2, row 2, col 1, row 2, col 2 dev.off() # Usually best to reset. ######## # Schoenfeld residuals plot( sort(t[dead==1]) , sresid.ag[dead==1], xlab="Ordered survival times") # Only observed failures (all people in this data set) # We see clearly that this is a binary variable. # Largest values at earlier times, so model not bad. PH.test <- cox.zph(fz.C.ag) ; plot(PH.test ) plot(PH.test, main="Check coefficient constant over time" ) # Does a straight line at zero fit within the limits (dotted) # Assess for influential observations bresid <- residuals(fz.C.ag, type="dfbetas") index <- seq(1,length(t)) # what values does index take? plot(index,bresid,type="h",ylab="Scaled change in coef if ith unit omitted", xlab="Observations i", main="Influence plot for model") # three points, observations 4, 27 and 29 change the coef by 0.3 standard errors # this is not regarded as a large change. # Now, we know we should include lwbc in the model, fz.C.2 <- coxph( Surv(t, dead) ~ lwbc+ag) # as before # now check to see how well this model fits, using the residual plots! # Warning: there are now two sets of Schoenfeld residuals: # residuals for each covariate, so modify procedures sresid2 <- residuals(fz.C.2, type="schoenfeld") round(sresid2,2) # the first are residuals for lwbc, the second for ag. plot( sort(t[dead==1]) , sresid2[,1][dead==1], xlab="Ordered survival times") # Largest values at earlier times, so model not bad. plot( sort(t[dead==1]) , sresid2[,2][dead==1], xlab="Ordered survival times") # We see clearly that this is for a binary variable. # Largest values at earlier times, so model not bad. PH2.test <- cox.zph(fz.C2) ; par(mfrow=c(2,1) ) # this divides the plot up into 2 rows, 1 column. plot(PH2.test, main="Check coefficient constant over time" ) # Does a straight line at zero fit within the limits (dotted) # Assess for influential observations bresid2 <- residuals(fz.C.2, type="dfbetas") #Again, for both covariates plot(index,bresid2[,1],type="h",ylab="Scaled change in lwbc coef", xlab="Observations i", main="Influence plot for model") # Observation 32 most influential on lwbc coefficient. plot(index,bresid2[,2],type="h",ylab="Scaled change in AG coef", xlab="Observations i", main="Influence plot for model") # Observation 33 most influential on AG coefficient. ################ EXTRA fz.C.2a <- coxph( Surv(t[1:32], dead[1:32]) ~ lwbc[1:32]+ag[1:32]) #remove observation 33 fz.C.2; fz.C.2a # look directly at the difference that removing remove observation 33 has for AG. # Quite a large change in 'coef' and in 'z p' fz.C.2b<- coxph( Surv(t[t!=43], dead[t!=43]) ~ lwbc[t!=43]+ag[t!=43]) #remove observation 32, which has t=43 fz.C.2; fz.C.2a; fz.C.2b ################ censoring and residuals. # To see what difference censoring makes, let's look at censoring some of the times: un <- runif(33) ; this generates 33 uniform random variables. newdead <- as.numeric(un < 0.6) # this gives about 60% dead, 40% censored. newt <- t # take a copy of t, then newt[newdead==0] <- 0.8*t[newdead==0] # make the censoring time shorter than the death time. # there is another way of doing this, in one line - try it if you like. plot(survfit(Surv(newt,newdead)~ ag) ) # see the censoring! newfz.C.ag <- coxph( Surv(newt, newdead) ~ ag) fz.C.ag ; newfz.C.ag # compare the two results. mresid.new.ag <- residuals(newfz.C.ag) dresid.new.ag <- residuals(newfz.C.ag, type="deviance" ) plot(lwbc[newdead==1],mresid.new.ag[newdead==1], xlab="log (wbc)", ylab="Martingale residuals", main="Martingale residuals, added censoring") points(lwbc[newdead==0],mresid.new.ag[newdead==0], pch=2) # notice where the censored points are - lower, in general, than observed deaths. plot(lwbc[newdead==1],dresid.new.ag[newdead==1], xlab="log (wbc)", ylab="Deviance residuals", main="Deviance residuals, added censoring") points(lwbc[newdead==0],dresid.new.ag[newdead==0], pch=2) # censored points are even lower, in general, than observed deaths.