Chapter 8: Graphical Displays for Biomarker Data

Manuela Zucknick, Thomas Hielscher, Martin Sill, Axel Benner

All Figures in this Chapter, except the dynamical graphics in Section 8.4, have been created with the statistical computing environment R (version 2.15) and contributed R and Bioconductor packages available from www.r-project.org and www.bioconductor.org, respectively.

All the R code is contained in file Chapter8.R to be downloaded from the webpage. Some of the R code chunks require additional R functions, which are collected in file Chapter8_utility.R. Download this file from the webpage, save it in the same directory as the main R script and run source("Chapter8_utility.R").

Note that not all R code chunks are self-contained. In general, it is preferable to run all code chunks in consecutive order starting from Setup, because some code depends on the evaluation of previous code chunks.

### code chunk: Setup

rm(list = ls())
dir.create("figures", showWarnings = FALSE)

The example data set that was used throughout the Chapter can be downloaded from the NCBI Gene Expression Omnibus repository www.ncbi.nlm.nih.gov/geo under GEO accession number GSE24080. You will need the three Series Matrix Files (TXT format) and the EXCEL supplementary file GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls.gz containing clinical data. The following code chunk shows how the data can be downloaded from within R using the Bioconductor library GEOquery.

### code chunk: Fetch the data

if (file.exists("Rdata/expressionSets.Rdata")) {
    library(Biobase)
    load("Rdata/expressionSets.Rdata")

} else {
    library(GEOquery)
    library(RCurl)

    # tweak to make sure that only the first supplementary data file (=
    # clinical data) is downloaded by getGEOSuppFiles() instead of all files
    getDirListing <- function(url) {
        print(url)
        a <- getURL(url)
        b <- textConnection(a)
        d <- read.table(b, header = FALSE, nrows = 1)
        close(b)
        return(d)
    }
    assignInNamespace("getDirListing", getDirListing, "GEOquery")

    dir.create("GEOdata", showWarnings = FALSE)
    dir.create("Rdata", showWarnings = FALSE)

    # download processed data files from url and save locally
    geo_exprs_data <- getGEO("GSE24080", destdir = "GEOdata/")
    geo_exprs_suppl <- getGEOSuppFiles("GSE24080", makeDirectory = F, baseDir = "GEOdata/")

    # get data from locally saved copies
    geo_exprs_1 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-1.txt.gz")
    geo_exprs_2 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-2.txt.gz")
    geo_exprs_3 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-3.txt.gz")

    # convert all factors of pheno data to characters prior to combining eSets
    index1 <- sapply(pData(geo_exprs_1), is.factor)
    index2 <- sapply(pData(geo_exprs_2), is.factor)
    index3 <- sapply(pData(geo_exprs_3), is.factor)
    pData(geo_exprs_1)[, index1] <- sapply(pData(geo_exprs_1)[, index1], as.character)
    pData(geo_exprs_2)[, index2] <- sapply(pData(geo_exprs_2)[, index2], as.character)
    pData(geo_exprs_3)[, index3] <- sapply(pData(geo_exprs_3)[, index3], as.character)

    # combine expression sets
    geo_exprs <- combine(geo_exprs_1, geo_exprs_2, geo_exprs_3)

    gunzip("GEOdata/GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls.gz")

    library(gdata)
    geo_suppl <- read.xls("GEOdata/GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls", 
        row.names = 3, na.strings = c("NA", ""))
    cels <- paste(as.character(pData(geo_exprs)$title), "CEL", sep = ".")

    # phenotype data
    pData(geo_exprs) <- data.frame(pData(geo_exprs), geo_suppl[cels, ])
    # training data set
    eset_train <- geo_exprs[, geo_exprs$MAQC_Distribution_Status == "Training"]  #n=340
    # test data set
    eset_test <- geo_exprs[, geo_exprs$MAQC_Distribution_Status == "Validation"]  #n=214

    annotation(eset_train) <- annotation(eset_test) <- "hgu133plus2"
    save(eset_train, eset_test, file = "Rdata/expressionSets.Rdata")
}

After some data processing and filtering as shown in the following R code chunk, the data are ready for further analysis.

### code chunk: Preprocess the data

# unspecific filtering: select 10000 most varying probe sets
library(genefilter)

rowvars <- rowVars(exprs(eset_train))
top10000 <- which(rowvars >= sort(rowvars, decreasing = T)[10000])
eset_train <- eset_train[top10000, ]

# group patients according to number of focal lesions in MRI (Walker, JCO,
# 2007) training set:
eset_train$focal_lesions <- cut(eset_train$MRI, c(0, 7, max(eset_train$MRI, 
    na.rm = T)), include.lowest = T, labels = c("0-7", ">7"))
eset_train$beta2mg <- cut(as.numeric(as.character(eset_train$B2M)), c(0, 3.5, 
    max(as.numeric(as.character(eset_train$B2M)), na.rm = T)), include.lowest = T, 
    labels = c("<= 3.5 mg/dL", ">3.5 mg/dL"))
eset_train_FL <- eset_train[, !is.na(eset_train$focal_lesions) & !is.na(eset_train$beta2mg)]

# test set:
eset_test$focal_lesions <- cut(eset_test$MRI, c(0, 7, max(eset_test$MRI, na.rm = T)), 
    include.lowest = T, labels = c("0-7", ">7"))
B2M <- sub("&lt;", "", eset_test$B2M)
B2M <- sub("<", "", B2M)
eset_test$beta2mg <- cut(as.numeric(as.character(B2M)), c(0, 3.5, max(as.numeric(as.character(eset_test$B2M)), 
    na.rm = T)), include.lowest = T, labels = c("<= 3.5 mg/dL", ">3.5 mg/dL"))
eset_test_FL <- eset_test[, !is.na(eset_test$focal_lesions) & !is.na(eset_test$beta2mg)]

# further unspecific filtering to reduce the number of probes to make
# plots like heatmaps more useful
rowvars <- rowVars(exprs(eset_test))
top <- which(rowvars >= sort(rowvars, decreasing = T)[10000])
top2 <- which(rowvars >= sort(rowvars, decreasing = T)[500])
unsup.all <- t(scale(t(exprs(eset_test[top, ]))))
unsup <- t(scale(t(exprs(eset_test[top2, ]))))
pdata <- pData(eset_test)

Section 8.2 Unsupervised Methods

### code chunk: Figure 8.1

library(gplots)

# k-means clustering
set.seed(12345)
K <- 4
km <- kmeans(unsup, K)
kmcol <- (palette()[1:K])[as.vector(km$cluster)]

# heatmap
cols <- bluered(100)
heatmap.2(unsup, col = cols, scale = "none", margins = c(2, 2), key = TRUE, 
    trace = "none", density.info = "none", symbreaks = FALSE, labRow = "", labCol = "", 
    xlab = "Samples", ylab = "Features", RowSideColors = kmcol, keysize = 1.25)

### code chunk: Figure 8.2

library(cluster)

# silhouette
km.ord <- order(km$cluster)
km.sil <- km$cluster[km.ord]
unsup.sil <- unsup[km.ord, ]
sil <- silhouette(km.sil, dist(unsup.sil, "euclidean"))

# plot
plot(sil, col = 1:4, main = "", sub = "", do.clus.stat = FALSE, do.n.k = FALSE)
abline(v = 0)
abline(h = (nrow(unsup) - match(unique(km.sil), km.sil)[-1]) + 1)
text(x = 0.9, y = (nrow(unsup) - match(unique(km.sil), km.sil) - 5), pos = 1, 
    labels = paste("cluster", 1:4))
box()

### code chunk: Figure 8.3

library(clusterCons)

# consensus clustering
set.seed(1234)
cmr <- cluscomp(data.frame(unsup), algo = list("kmeans"), clmin = 2, clmax = 6, 
    rep = 50, merge = 0)
cm2 <- cmr$e1_kmeans_k2@cm
cm3 <- cmr$e1_kmeans_k3@cm
cm4 <- cmr$e1_kmeans_k4@cm
cm5 <- cmr$e1_kmeans_k5@cm
cm6 <- cmr$e1_kmeans_k6@cm
acs <- aucs(cmr)
dks <- deltak(acs)

# top-left plot
par(mar = c(3, 3, 2, 2))
plot(ecdf(cm2), main = "eCDF", ylab = "", xlab = "")
lines(ecdf(cm3), col = "purple")
lines(ecdf(cm4), col = "blue")
lines(ecdf(cm5), col = "green")
lines(ecdf(cm6), col = "orange")
legend(x = "bottomright", legend = c("K=2", "K=3", "K=4", "K=5", "K=6"), text.col = c("black", 
    "purple", "blue", "green", "orange"))

# bottom-left
line_number <- length(unique(dks[, 2]))
lines <- c(1:line_number)
points <- c(1:line_number)
p1 <- xyplot(deltak ~ k, group = dks[, 2], dks, scales = list(x = list(tick.number = 4)), 
    xlab = "cluster number (k)", ylab = "delta-K", type = c("o"), panel = function(...) {
        panel.xyplot(..., pch = points, lty = lines)
    })
plot(p1)

# right plot
cm <- cmr$e1_kmeans_k4@cm
colnames(cm) <- rownames(cm) <- rep("", nrow(unsup))
heatmap(cm)

### code chunk: Figure 8.4

library(RColorBrewer)
library(Heatplus)

# put together the covariables to be added to heatmap
covars <- pdata[, c("Cyto.Abn", "focal_lesions", "B2M")]
covars$focal_lesions <- as.numeric(covars$focal_lesions) - 1
covars$B2M <- log10(as.numeric(as.character(covars$B2M)))
covars$B2M[is.na(covars$B2M)] <- median(covars$B2M, na.rm = TRUE)  #imputation  of missing values is needed, since missing values are not allowed in picketplot()
colnames(covars) <- c("cytogenetic aberrations", ">7 focal lesions", "log10 beta-2-microglob.")

# hierarchical clustering and determination of groups
hcr <- hclust(dist(t(unsup), method = "euclidean"))
K2 <- 8
ctr <- cutree(hcr, k = K2)
h <- floor(sort(hcr$height, decreasing = TRUE)[K2 - 1])
unsup.plus <- unsup
colnames(unsup.plus) <- NULL
rownames(unsup.plus) <- NULL

# heatmap
covcols <- col2rgb(brewer.pal(K2, "Paired"), alpha = TRUE)/255
covcols["alpha", ] <- 0.6
covcols <- do.call(rgb, as.list(as.data.frame(t(covcols))))
heatmap_plus(unsup.plus, col = cols, h = h, do.dendro = TRUE, addvar = covars, 
    covariate = 3, cluscol = covcols)

tab <- table(covars$">7 focal lesions", ctr)

### code chunk: Figure 8.5

library(s4vd)

# biclustering
set.seed(7252011)
res <- s4vd(unsup, pcerv = 0.05, pceru = 0.01, merr = 10^(-2), col.overlap = F, 
    nbiclust = 3, cols.nc = F, rows.nc = TRUE)

# heatmap
BCheatmap(unsup, res, allrows = F, allcolumns = F, axisR = F, axisC = F)

### code chunk: Figure 8.6

# parallel coordinates plot
parallelCoordinates(unsup, res, 1, plotBoth = T, order = T, las = 2, bothlab = c(" ", 
    " "), cex.axis = 0.5)

### code chunk: Figure 8.7

par(mar = c(2, 2, 1, 1) + 0.5, mgp = c(1.5, 0.5, 0))
layout(rbind(c(1, 2), c(3, 4)), respect = TRUE, widths = c(1, 1), heights = c(1, 
    1))

# principal components analysis
pca1 <- prcomp(t(unsup), scale. = TRUE)

# top-left plot
biplot(pca1, expand = 5/4, cex = 0.5, xlabs = rep("o", ncol(unsup)), ylabs = rep("", 
    nrow(unsup)), col = c("black", rgb(0.1, 0.1, 0.1, 0.2)), arrow.len = 0.07, 
    asp = 1)

# top-right plot
barplot(100 * (pca1$sdev^2/sum(pca1$sdev^2))[1:10], names.arg = 1:10, main = "", 
    ylab = "% explained variance", xlab = "PC", cex.names = 0.9)

# bottom-left plot
kmclus <- as.factor(km$cluster)
kmcols <- col2rgb(palette()[as.numeric(kmclus)], alpha = TRUE)/255
kmcols["alpha", ] <- 0.3
kmcols <- do.call(rgb, as.list(as.data.frame(t(kmcols))))
lam <- pca1$sdev[1:5] * sqrt(nrow(pca1$x))
PC1 <- pca1$rotation[, 1] * lam[1]
PC2 <- pca1$rotation[, 2] * lam[2]
PC5 <- pca1$rotation[, 5] * lam[5]
xlim <- range(PC1) + c(-abs(min(PC1)) * 0.1, abs(max(PC1)) * 0.1)
ylim <- range(PC2) + c(-abs(min(PC2)) * 0.1, abs(max(PC2)) * 0.1)
plot(PC1, PC2, pch = 19, col = kmcols, xlim = xlim, ylim = ylim, asp = 1, main = "")  #shows variables colored by k-means cluster results
legend(x = "topleft", legend = 1:4, fill = 1:4)

# bottom-right plot
source("Chapter8_utility.R")
sex <- factor(pdata$SEX, levels = c("male", "female"))
gm <- by(t(unsup), INDICES = list(sex), FUN = mean, na.rm = TRUE)
genemeans <- cbind(gm[[1]], gm[[2]])
sexcol <- rep(NA, nrow(genemeans))
sexcol[gm$female <= gm$male] <- "purple"
sexcol[gm$female > gm$male] <- "orange"
GE.Icon(Genes = cbind(PC1, PC5), genedata = genemeans, gclr = sexcol, iconsize = 0.05, 
    xlab = "PC1", ylab = "PC5")
abline(h = 7.5, lty = 1, col = "gray")
abline(h = -7, lty = 1, col = "gray")
abline(v = -7.5, lty = 1, col = "gray")
legend(x = "topleft", legend = c("higher in females", "lower in females"), fill = c("orange", 
    "purple"), cex = 0.8)
par(pty = "m")

### code chunk: Figure 8.8

library(pcaMethods)

# pairwise feature loadings plots
pca3 <- pcaMethods::pca(t(unsup), nPcs = 5, method = "svd", scale = "uv")
plotPcs(pca3, pch = 19, cex = 0.5, type = "loadings")

### code chunk: Tables A.1 and A.2 in Section 8.5

library(xtable)
library(hgu133plus2cdf)
library(hgu133plus2.db)

# annotation
geneMap <- unlist(as.list(hgu133plus2MAP))

# Table A.1
out0 <- names(which(PC1 < -7.5))
out1 <- names(which(PC5 > 7.5))
out2 <- names(which(PC5 < -7))
fdata <- eset_test@featureData@data
out <- data.frame(Subgroup = rep("1", length(out0)), fdata[match(out0, rownames(fdata)), 
    c("ID", "Gene Symbol", "Gene Title", "ENTREZ_GENE_ID")], geneMap[match(out0, 
    names(geneMap))])
out$Gene.Symbol <- gsub("^.*/// ", "", out$Gene.Symbol)
genetit0 <- gsub("^.*/// ", "", out$Gene.Title)
out$Gene.Title <- ifelse(nchar(genetit0) > 40, paste(substr(genetit0, 1, 40), 
    "...", sep = ""), genetit0)
out$ENTREZ_GENE_ID <- gsub("^.*/// ", "", out$ENTREZ_GENE_ID)
colnames(out) <- c("Subset", "Affymetrix ID", "Gene symbol", "Gene title", "Entrez ID", 
    "Cytoband")
outA <- out

print(xtable(outA, caption = "Subset of genes identified in first principal component. Gene titles are abbreviated to show at most 40 characters."), 
    include.rownames = FALSE, size = "scriptsize")

# Table A.2
fdata <- eset_test@featureData@data
out <- data.frame(Subgroup = c(rep("2", length(out1)), rep("3", length(out2))), 
    fdata[match(c(out1, out2), rownames(fdata)), c("ID", "Gene Symbol", "Gene Title", 
        "ENTREZ_GENE_ID")], geneMap[match(c(out1, out2), names(geneMap))])
out$Gene.Symbol <- gsub("^.*/// ", "", out$Gene.Symbol)
genetit <- gsub("^.*/// ", "", out$Gene.Title)
out$Gene.Title <- ifelse(nchar(genetit) > 40, paste(substr(genetit, 1, 40), 
    "...", sep = ""), genetit)
out$ENTREZ_GENE_ID <- gsub("^.*/// ", "", out$ENTREZ_GENE_ID)
colnames(out) <- c("Subset", "Affymetrix ID", "Gene symbol", "Gene title", "Entrez ID", 
    "Cytoband")
outB <- out

print(xtable(outB, caption = "Subsets of genes identified in fifth principal component. Gene titles are abbreviated to show at most 40 characters."), 
    hline.after = c(-1, 0, length(out1), length(out1) + length(out2)), include.rownames = FALSE, 
    size = "scriptsize")

### code chunk: Figure 8.9

library(tm)
library(wordcloud)

# extract word counts
corpus <- Corpus(DataframeSource(data.frame(genetit0)))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, tolower)
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# word cloud
pal <- gray(seq(0.7, 0, by = -0.1))
set.seed(123)
wordcloud(d$word, d$freq, min.freq = 1, random.order = FALSE, rot.per = 0, colors = pal)

Section 8.3 Supervised Methods

### code chunk: Figure 8.10

library(limma)

# limma analysis
design <- model.matrix(~focal_lesions + beta2mg + Cyto.Abn, pData(eset_train_FL))
fit <- lmFit(exprs(eset_train_FL), design)
fit <- eBayes(fit)
tt <- topTable(fit, coef = "focal_lesions>7", number = nrow(eset_train_FL), 
    p.value = 0.05, adjust.method = "BH")
ttall <- topTable(fit, coef = "focal_lesions>7", number = nrow(eset_train_FL), 
    p.value = 1)
dect <- decideTests(fit, method = "separate", adjust.method = "BH", p.value = 0.05, 
    lfc = 0)

# volcano plot
volcanoplot(fit, coef = "focal_lesions>7", highlight = 5, names = featureData(eset_train_FL)@data$"Gene Symbol")

### code chunk: Figure 8.11

library(ggplot2)
library(reshape2)

# reshape data
tmp <- data.frame(pData(eset_train_FL)[, c("geo_accession", "focal_lesions")], 
    t(exprs(eset_train_FL)[tt$ID[1:5], , drop = F]))
tmp <- reshape2:::melt.data.frame(tmp, id.vars = c("geo_accession", "focal_lesions"))
levels(tmp$variable) <- paste(featureData(eset_train_FL)@data[tt$ID[1:5], "Gene Symbol"], 
    gsub("X", "", levels(tmp$variable)), sep = "\n")

# create boxplots with ggplot2 package
p <- ggplot(tmp, aes(focal_lesions, value))
pg <- p + geom_boxplot(outlier.colour = rgb(0, 0, 0, 0), outlier.size = 0) + 
    geom_jitter(position = position_jitter(width = 0.15), alpha = 0.5) + facet_wrap(~variable, 
    nrow = 1) + theme_bw() + scale_y_continuous("log2 expression") + scale_x_discrete("focal lesions")
print(pg)

### code chunk: Figure 8.12

library(glmnet)
source("Chapter8_utility.R")

# lasso model
set.seed(815)
lasso_cvobj <- cv.glmnet(t(exprs(eset_train_FL)), eset_train_FL$focal_lesions, 
    family = "binomial", standardize = F)
coefs <- coef(lasso_cvobj, s = lasso_cvobj$lambda.min)
coefs <- coefs[which(coefs != 0), ]
eset_test_FL$focal_lesions_predicted_prob <- as.numeric(predict(lasso_cvobj$glmnet.fit, 
    newx = t(exprs(eset_test_FL)[featureNames(eset_train_FL), ]), s = lasso_cvobj$lambda.min, 
    type = "response"))
eset_test_FL$focal_lesions_predicted_class <- predict(lasso_cvobj$glmnet.fit, 
    newx = t(exprs(eset_test_FL)[featureNames(eset_train_FL), ]), s = lasso_cvobj$lambda.min, 
    type = "class")
eset_test_FL$focal_lesions_predicted_class <- factor(eset_test_FL$focal_lesions_predicted_class, 
    labels = c("0-7", ">7"))

def.par <- par(no.readonly = TRUE)
par(mfrow = c(2, 1), oma = c(0, 0, 0, 0), mar = c(5, 4, 2, 2), mgp = c(2, 0.75, 
    0))

# top plot
plot(lasso_cvobj$glmnet.fit, label = T, "lambda", xlab = expression(paste("log ", 
    lambda)), xlim = c(-3, -1), ylim = c(-0.2, 0.15), dfplot = F)
abline(v = log(lasso_cvobj$lambda.min))
abline(h = 0)

# bottom plot
plot(lasso_cvobj, xlab = expression(paste("log ", lambda)), xlim = c(-3, -1), 
    ylim = c(1.2, 1.5), ylab = "Binomial model deviance")
mtext("number of non-zero coefficients", side = 3, line = 2)
par(def.par)

### code chunk: Figure 8.13

tmp <- cbind(1 - pData(eset_test_FL)$focal_lesions_predicted_prob, pData(eset_test_FL)$focal_lesions_predicted_prob)
plotprob(tmp, label = pData(eset_test_FL)$focal_lesions, main = "")

### code chunk: Figure 8.14

library(peperr)
library(gridExtra)
source("Chapter8_utility.R")

# estimate prediction errors
tmpdat <- data.matrix(t(exprs(eset_train_FL)))
tmpresp <- as.numeric(eset_train_FL$focal_lesions) - 1
peperr_obj <- peperr(response = tmpresp, x = tmpdat, fit.fun = fit.glmnet, args.fit = list(maxit = 1000, 
    standardize = F), complexity = complexity.glmnet, args.complexity = list(nfolds = 10, 
    maxit = 3000, standardize = F), trace = T, RNG = "fixed", seed = 815, aggregation.fun = aggregation.misclass, 
    indices = resample.indices(n = ncol(eset_train_FL), sample.n = 50, method = "sub632"))

# create boxplot with ggplot2 package
tmp <- data.frame(grp = "", error = unlist(peperr_obj$sample.error) * 100)
p <- ggplot(tmp, aes(grp, error))
pg <- p + geom_boxplot(outlier.colour = rgb(0, 0, 0, 0), outlier.size = 0) + 
    geom_jitter(position = position_jitter(width = 0.1)) + theme_bw() + scale_y_continuous("misclassification rate [%]", 
    limits = c(20, 50)) + scale_x_discrete("") + geom_hline(aes(yintercept = perr(peperr_obj, 
    "resample") * 100, colour = "2"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj, 
    "632p") * 100, colour = "3"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj, 
    "app") * 100, colour = "4"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj, 
    "nullmodel") * 100, colour = "5"), show_guide = TRUE) + scale_colour_discrete(guide = "legend", 
    name = "error type", breaks = c("2", "3", "4", "5"), labels = c("mean\nout-of-bag", 
        ".632plus", "apparent", "null model")) + opts(title = "Misclassification rate \n in bootstrap samples ")
print(pg)

# create histogram with ggplot2 package
p2 <- ggplot(data.frame(complx = peperr_obj$sample.complexity), aes(x = complx))
pg2 <- p2 + geom_histogram(binwidth = 0.02, fill = 0, colour = 1) + theme_bw() + 
    xlab(expression(lambda)) + ylab("frequency") + opts(panel.grid.major = theme_blank(), 
    panel.grid.minor = theme_blank(), panel.background = theme_blank()) + geom_vline(xintercept = peperr_obj$selected.complexity, 
    colour = 2) + opts(title = "Selected complexity \n in bootstrap samples", 
    axis.title.x = theme_text(size = 11)) + ggplot2::annotate("text", x = 0.14, 
    y = -0.5, label = "full data", colour = "red", size = 4)

# arrange plot layout with gridExtra package
grid.arrange(pg2, pg, ncol = 2)

### code chunk: Figure 8.15

library(globaltest)
library(KEGG.db)

# remove multiple probe sets and keep the one with strongest effect
unique_probesets <- ttall$ID[!duplicated(featureData(eset_train_FL)@data[ttall$ID, 
    "Gene Symbol"])]

# global test
gtKEGGres <- gtKEGG(focal_lesions, eset_train_FL[unique_probesets, ])
signif_keggs_gt <- which(gtKEGGres@extra[, 1] < 0.05)

# annotation and plot
library(hgu133plus2.db)
tmp <- gtKEGGres[[1]]
covariates(tmp, alias = hgu133plus2SYMBOL, legend = c("upregulated in patients with >7 lesions", 
    "downregulated in patients with >7 lesions"), what = "z-score")
title(paste(names(tmp), "-", alias(tmp)))

### code chunk: Figure 8.16

library(HTSanalyzeR)

# define results and hits
sel <- match(ttall$ID, fData(eset_train_FL)$ID)
t_all <- ttall$t
names(t_all) <- fData(eset_train_FL)$ENTREZ_GENE_ID[sel]
t_hits <- names(t_all)[which(ttall$adj.P.Val < 0.05)]
# select gene sets
KEGG_pathways <- KeggGeneSets(species = "Hs")

# gene set enrichment analysis
gscs <- list(`KEGG pathways` = KEGG_pathways)
gsca <- new("GSCA", listOfGeneSetCollections = gscs, geneList = t_all, hits = t_hits)
gsca <- preprocess(gsca, species = "Hs", initialIDs = "Entrez.gene", keepMultipleMappings = TRUE, 
    duplicateRemoverMethod = "max", orderAbsValue = FALSE)
set.seed(815)
gsca <- analyze(gsca, para = list(pValueCutoff = 0.05, pAdjustMethod = "BH", 
    nPermutations = 1000, minGeneSetSize = 15, exponent = 1))

# enrichment map
gsca2 <- appendGSTerms(gsca, keggGSCs = "KEGG pathways")
viewEnrichMap(gsca2, resultName = "GSEA.results", gscs = "KEGG pathways", ntop = 15, 
    allSig = FALSE, gsNameType = "term", displayEdgeLabel = FALSE, layout = "layout.fruchterman.reingold")

Section 8.4 Dynamical Graphics

The dynamical graphics in Figures 8.17 and 8.18 were created with the SEURAT software available on R-forge. More details can be found on the SEURAT project webpage http://seurat.r-forge.r-project.org and in the corresponding publication by Gribov et al. (2010) (URL http://dx.doi.org/10.1186/1755-8794-3-21).