# Script providing functions for training/testing models # on MDA genomic data. require(MASS) source("kcv.r") source("perf.r") source("fselect.r") rcv.lda = function(y, X, kfold=5, rep=10, ntop=25) { r = matrix(0, 2, 2) # confusion matrix p.error = vector(mode="numeric", rep) p.spec = vector(mode="numeric", rep) p.sens = vector(mode="numeric", rep) p.auc = vector(mode="numeric", rep) for (n in 1:rep) { used = rep(FALSE, n) prob = rep(0, n) cv = kcv.splits(y, k=kfold, stratified=TRUE) r.cv = matrix(0, 2, 2) cat(paste("Repeat", n, "of", rep, "\n")) for (k in 1:kfold) { itr = cv$cv.train[[k]] # index of elements for training ivd = cv$cv.valid[[k]] # index of elements for validation Xtr = X[,itr]; ytr = y[itr] Xvd = X[,ivd]; yvd = y[ivd] cat(paste(" Fold:", k, "\n")) cat(paste(" Feature selection: top", ntop, "probesets.\n")) bws = bwss(Xtr, ytr) g.idx = order(bws$bw, decreasing=T)[1:ntop] Xtr = Xtr[g.idx,] Xvd = Xvd[g.idx,] cat(paste(" LDA classification.\n")) used[ivd] = TRUE m = lda(x=t(Xtr), grouping=ytr) pred = as.numeric(as.character(predict(m, t(Xvd))$class)) r.cv = r.cv + perf.matrix(yvd, pred, lneg=0, lpos=1) prob[ivd] = predict(m, t(Xvd))$posterior[,2] } r = r + r.cv ## after each repeat, estimate the current performance r.cv = r.cv / kfold p.error[n] = perf.error(r.cv) p.sens[n] = perf.sensitivity(r.cv) p.spec[n] = perf.specificity(r.cv) prb = prob[used]; prb = prb[!is.na(prb)] yt = y[used]; yt = yt[!is.na(prob[used])] p.auc[n] = perf.auc(prb, yt, lneg=0, lpos=1) cat(paste("--> Error =", round(100*p.error[n],4), "Sensitivity =", round(100*p.sens[n],4), "Specificity =", round(100*p.spec[n],4), "AUC =", round(100*p.auc[n],4), "\n")) } r = r / (rep*kfold) return (list(cmat=r, err=p.error, sens=p.sens, spec=p.spec, auc=p.auc)) } do.training = function(y, X, kfold=5, rep=10) { r = matrix(0, 2, 2) # confusion matrix p.error = vector(mode="numeric", rep) p.spec = vector(mode="numeric", rep) p.sens = vector(mode="numeric", rep) p.auc = vector(mode="numeric", rep) w = rep(1, length(y)) w1 = 1; lambda = 0.001 w[y==1] = w1 ntop = c(10+(0:8)*5, 75) for (n in 1:rep) { cv = kcv.splits(y, k=kfold, stratified=TRUE) r.cv = matrix(0, 2, 2) used = rep(FALSE, n) prob = rep(0, n) cat(paste("Repeat", n, "of", rep, "\n")) for (k in 1:kfold) { itr = cv$cv.train[[k]] # index of elements for training ivd = cv$cv.valid[[k]] # index of elements for validation Xtr = X[,itr]; ytr = y[itr] Xvd = X[,ivd]; yvd = y[ivd] cat(paste(" Fold:", k, "\n")) cat(paste(" Feature selection: finding the best ntop: ")) best.err = 1.0 best.ntop = ntop[1] for (i in ntop) { fsr = rcv.lda(ytr, Xtr, kfold=5, rep=1, ntop=i) if (mean(fsr$err) < best.err) { # lowest error best.ntop = i } } cat(paste(best.ntop, "\n")) cat(" Feature selection: selecting on the full training set\n") bws = bwss(Xtr, ytr) g.idx = order(bws$bw, decreasing=T)[1:best.ntop] Xtr = Xtr[g.idx,] Xvd = Xvd[g.idx,] cat(paste(" LDA classification.\n")) used[ivd] = TRUE m = lda(x=t(Xtr), grouping=ytr) pred = as.numeric(as.character(predict(m, t(Xvd))$class)) r.cv = r.cv + perf.matrix(yvd, pred, lneg=0, lpos=1) prob[ivd] = predict(m, t(Xvd))$posterior[,2] } r = r + r.cv ## after each repeat, estimate the current performance r.cv = r.cv / kfold p.error[n] = perf.error(r.cv) p.sens[n] = perf.sensitivity(r.cv) p.spec[n] = perf.specificity(r.cv) prb = prob[used]; prb = prb[!is.na(prb)] yt = y[used]; yt = yt[!is.na(prob[used])] p.auc[n] = perf.auc(prb, yt, lneg=0, lpos=1) cat(paste("--> Error =", round(100*p.error[n],4), "Sensitivity =", round(100*p.sens[n],4), "Specificity =", round(100*p.spec[n],4), "AUC =", round(100*p.auc[n],4), "\n")) } r = r / (rep*kfold) return (list(cmat=r, err=p.error, sens=p.sens, spec=p.spec, auc=p.auc)) }