All Figures in this Chapter, except the dynamical graphics in Section 8.4, have been created with the statistical computing environment R
(version 2.15) and contributed R
and Bioconductor
packages available from www.r-project.org and www.bioconductor.org, respectively.
All the R
code is contained in file Chapter8.R
to be downloaded from the webpage. Some of the R
code chunks require additional R
functions, which are collected in file Chapter8_utility.R
. Download this file from the webpage, save it in the same directory as the main R
script and run source("Chapter8_utility.R")
.
Note that not all R
code chunks are self-contained. In general, it is preferable to run all code chunks in consecutive order starting from Setup
, because some code depends on the evaluation of previous code chunks.
### code chunk: Setup
rm(list = ls())
dir.create("figures", showWarnings = FALSE)
The example data set that was used throughout the Chapter can be downloaded from the NCBI Gene Expression Omnibus repository www.ncbi.nlm.nih.gov/geo under GEO accession number GSE24080
. You will need the three Series Matrix Files (TXT format) and the EXCEL supplementary file GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls.gz
containing clinical data. The following code chunk shows how the data can be downloaded from within R
using the Bioconductor
library GEOquery
.
### code chunk: Fetch the data
if (file.exists("Rdata/expressionSets.Rdata")) {
library(Biobase)
load("Rdata/expressionSets.Rdata")
} else {
library(GEOquery)
library(RCurl)
# tweak to make sure that only the first supplementary data file (=
# clinical data) is downloaded by getGEOSuppFiles() instead of all files
getDirListing <- function(url) {
print(url)
a <- getURL(url)
b <- textConnection(a)
d <- read.table(b, header = FALSE, nrows = 1)
close(b)
return(d)
}
assignInNamespace("getDirListing", getDirListing, "GEOquery")
dir.create("GEOdata", showWarnings = FALSE)
dir.create("Rdata", showWarnings = FALSE)
# download processed data files from url and save locally
geo_exprs_data <- getGEO("GSE24080", destdir = "GEOdata/")
geo_exprs_suppl <- getGEOSuppFiles("GSE24080", makeDirectory = F, baseDir = "GEOdata/")
# get data from locally saved copies
geo_exprs_1 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-1.txt.gz")
geo_exprs_2 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-2.txt.gz")
geo_exprs_3 <- getGEO(filename = "GEOdata/GSE24080_series_matrix-3.txt.gz")
# convert all factors of pheno data to characters prior to combining eSets
index1 <- sapply(pData(geo_exprs_1), is.factor)
index2 <- sapply(pData(geo_exprs_2), is.factor)
index3 <- sapply(pData(geo_exprs_3), is.factor)
pData(geo_exprs_1)[, index1] <- sapply(pData(geo_exprs_1)[, index1], as.character)
pData(geo_exprs_2)[, index2] <- sapply(pData(geo_exprs_2)[, index2], as.character)
pData(geo_exprs_3)[, index3] <- sapply(pData(geo_exprs_3)[, index3], as.character)
# combine expression sets
geo_exprs <- combine(geo_exprs_1, geo_exprs_2, geo_exprs_3)
gunzip("GEOdata/GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls.gz")
library(gdata)
geo_suppl <- read.xls("GEOdata/GSE24080_MM_UAMS565_ClinInfo_27Jun2008_LS_clean.xls",
row.names = 3, na.strings = c("NA", ""))
cels <- paste(as.character(pData(geo_exprs)$title), "CEL", sep = ".")
# phenotype data
pData(geo_exprs) <- data.frame(pData(geo_exprs), geo_suppl[cels, ])
# training data set
eset_train <- geo_exprs[, geo_exprs$MAQC_Distribution_Status == "Training"] #n=340
# test data set
eset_test <- geo_exprs[, geo_exprs$MAQC_Distribution_Status == "Validation"] #n=214
annotation(eset_train) <- annotation(eset_test) <- "hgu133plus2"
save(eset_train, eset_test, file = "Rdata/expressionSets.Rdata")
}
After some data processing and filtering as shown in the following R
code chunk, the data are ready for further analysis.
### code chunk: Preprocess the data
# unspecific filtering: select 10000 most varying probe sets
library(genefilter)
rowvars <- rowVars(exprs(eset_train))
top10000 <- which(rowvars >= sort(rowvars, decreasing = T)[10000])
eset_train <- eset_train[top10000, ]
# group patients according to number of focal lesions in MRI (Walker, JCO,
# 2007) training set:
eset_train$focal_lesions <- cut(eset_train$MRI, c(0, 7, max(eset_train$MRI,
na.rm = T)), include.lowest = T, labels = c("0-7", ">7"))
eset_train$beta2mg <- cut(as.numeric(as.character(eset_train$B2M)), c(0, 3.5,
max(as.numeric(as.character(eset_train$B2M)), na.rm = T)), include.lowest = T,
labels = c("<= 3.5 mg/dL", ">3.5 mg/dL"))
eset_train_FL <- eset_train[, !is.na(eset_train$focal_lesions) & !is.na(eset_train$beta2mg)]
# test set:
eset_test$focal_lesions <- cut(eset_test$MRI, c(0, 7, max(eset_test$MRI, na.rm = T)),
include.lowest = T, labels = c("0-7", ">7"))
B2M <- sub("<", "", eset_test$B2M)
B2M <- sub("<", "", B2M)
eset_test$beta2mg <- cut(as.numeric(as.character(B2M)), c(0, 3.5, max(as.numeric(as.character(eset_test$B2M)),
na.rm = T)), include.lowest = T, labels = c("<= 3.5 mg/dL", ">3.5 mg/dL"))
eset_test_FL <- eset_test[, !is.na(eset_test$focal_lesions) & !is.na(eset_test$beta2mg)]
# further unspecific filtering to reduce the number of probes to make
# plots like heatmaps more useful
rowvars <- rowVars(exprs(eset_test))
top <- which(rowvars >= sort(rowvars, decreasing = T)[10000])
top2 <- which(rowvars >= sort(rowvars, decreasing = T)[500])
unsup.all <- t(scale(t(exprs(eset_test[top, ]))))
unsup <- t(scale(t(exprs(eset_test[top2, ]))))
pdata <- pData(eset_test)
### code chunk: Figure 8.1
library(gplots)
# k-means clustering
set.seed(12345)
K <- 4
km <- kmeans(unsup, K)
kmcol <- (palette()[1:K])[as.vector(km$cluster)]
# heatmap
cols <- bluered(100)
heatmap.2(unsup, col = cols, scale = "none", margins = c(2, 2), key = TRUE,
trace = "none", density.info = "none", symbreaks = FALSE, labRow = "", labCol = "",
xlab = "Samples", ylab = "Features", RowSideColors = kmcol, keysize = 1.25)
### code chunk: Figure 8.2
library(cluster)
# silhouette
km.ord <- order(km$cluster)
km.sil <- km$cluster[km.ord]
unsup.sil <- unsup[km.ord, ]
sil <- silhouette(km.sil, dist(unsup.sil, "euclidean"))
# plot
plot(sil, col = 1:4, main = "", sub = "", do.clus.stat = FALSE, do.n.k = FALSE)
abline(v = 0)
abline(h = (nrow(unsup) - match(unique(km.sil), km.sil)[-1]) + 1)
text(x = 0.9, y = (nrow(unsup) - match(unique(km.sil), km.sil) - 5), pos = 1,
labels = paste("cluster", 1:4))
box()
### code chunk: Figure 8.3
library(clusterCons)
# consensus clustering
set.seed(1234)
cmr <- cluscomp(data.frame(unsup), algo = list("kmeans"), clmin = 2, clmax = 6,
rep = 50, merge = 0)
cm2 <- cmr$e1_kmeans_k2@cm
cm3 <- cmr$e1_kmeans_k3@cm
cm4 <- cmr$e1_kmeans_k4@cm
cm5 <- cmr$e1_kmeans_k5@cm
cm6 <- cmr$e1_kmeans_k6@cm
acs <- aucs(cmr)
dks <- deltak(acs)
# top-left plot
par(mar = c(3, 3, 2, 2))
plot(ecdf(cm2), main = "eCDF", ylab = "", xlab = "")
lines(ecdf(cm3), col = "purple")
lines(ecdf(cm4), col = "blue")
lines(ecdf(cm5), col = "green")
lines(ecdf(cm6), col = "orange")
legend(x = "bottomright", legend = c("K=2", "K=3", "K=4", "K=5", "K=6"), text.col = c("black",
"purple", "blue", "green", "orange"))
# bottom-left
line_number <- length(unique(dks[, 2]))
lines <- c(1:line_number)
points <- c(1:line_number)
p1 <- xyplot(deltak ~ k, group = dks[, 2], dks, scales = list(x = list(tick.number = 4)),
xlab = "cluster number (k)", ylab = "delta-K", type = c("o"), panel = function(...) {
panel.xyplot(..., pch = points, lty = lines)
})
plot(p1)
# right plot
cm <- cmr$e1_kmeans_k4@cm
colnames(cm) <- rownames(cm) <- rep("", nrow(unsup))
heatmap(cm)
### code chunk: Figure 8.4
library(RColorBrewer)
library(Heatplus)
# put together the covariables to be added to heatmap
covars <- pdata[, c("Cyto.Abn", "focal_lesions", "B2M")]
covars$focal_lesions <- as.numeric(covars$focal_lesions) - 1
covars$B2M <- log10(as.numeric(as.character(covars$B2M)))
covars$B2M[is.na(covars$B2M)] <- median(covars$B2M, na.rm = TRUE) #imputation of missing values is needed, since missing values are not allowed in picketplot()
colnames(covars) <- c("cytogenetic aberrations", ">7 focal lesions", "log10 beta-2-microglob.")
# hierarchical clustering and determination of groups
hcr <- hclust(dist(t(unsup), method = "euclidean"))
K2 <- 8
ctr <- cutree(hcr, k = K2)
h <- floor(sort(hcr$height, decreasing = TRUE)[K2 - 1])
unsup.plus <- unsup
colnames(unsup.plus) <- NULL
rownames(unsup.plus) <- NULL
# heatmap
covcols <- col2rgb(brewer.pal(K2, "Paired"), alpha = TRUE)/255
covcols["alpha", ] <- 0.6
covcols <- do.call(rgb, as.list(as.data.frame(t(covcols))))
heatmap_plus(unsup.plus, col = cols, h = h, do.dendro = TRUE, addvar = covars,
covariate = 3, cluscol = covcols)
tab <- table(covars$">7 focal lesions", ctr)
### code chunk: Figure 8.5
library(s4vd)
# biclustering
set.seed(7252011)
res <- s4vd(unsup, pcerv = 0.05, pceru = 0.01, merr = 10^(-2), col.overlap = F,
nbiclust = 3, cols.nc = F, rows.nc = TRUE)
# heatmap
BCheatmap(unsup, res, allrows = F, allcolumns = F, axisR = F, axisC = F)
### code chunk: Figure 8.6
# parallel coordinates plot
parallelCoordinates(unsup, res, 1, plotBoth = T, order = T, las = 2, bothlab = c(" ",
" "), cex.axis = 0.5)
### code chunk: Figure 8.7
par(mar = c(2, 2, 1, 1) + 0.5, mgp = c(1.5, 0.5, 0))
layout(rbind(c(1, 2), c(3, 4)), respect = TRUE, widths = c(1, 1), heights = c(1,
1))
# principal components analysis
pca1 <- prcomp(t(unsup), scale. = TRUE)
# top-left plot
biplot(pca1, expand = 5/4, cex = 0.5, xlabs = rep("o", ncol(unsup)), ylabs = rep("",
nrow(unsup)), col = c("black", rgb(0.1, 0.1, 0.1, 0.2)), arrow.len = 0.07,
asp = 1)
# top-right plot
barplot(100 * (pca1$sdev^2/sum(pca1$sdev^2))[1:10], names.arg = 1:10, main = "",
ylab = "% explained variance", xlab = "PC", cex.names = 0.9)
# bottom-left plot
kmclus <- as.factor(km$cluster)
kmcols <- col2rgb(palette()[as.numeric(kmclus)], alpha = TRUE)/255
kmcols["alpha", ] <- 0.3
kmcols <- do.call(rgb, as.list(as.data.frame(t(kmcols))))
lam <- pca1$sdev[1:5] * sqrt(nrow(pca1$x))
PC1 <- pca1$rotation[, 1] * lam[1]
PC2 <- pca1$rotation[, 2] * lam[2]
PC5 <- pca1$rotation[, 5] * lam[5]
xlim <- range(PC1) + c(-abs(min(PC1)) * 0.1, abs(max(PC1)) * 0.1)
ylim <- range(PC2) + c(-abs(min(PC2)) * 0.1, abs(max(PC2)) * 0.1)
plot(PC1, PC2, pch = 19, col = kmcols, xlim = xlim, ylim = ylim, asp = 1, main = "") #shows variables colored by k-means cluster results
legend(x = "topleft", legend = 1:4, fill = 1:4)
# bottom-right plot
source("Chapter8_utility.R")
sex <- factor(pdata$SEX, levels = c("male", "female"))
gm <- by(t(unsup), INDICES = list(sex), FUN = mean, na.rm = TRUE)
genemeans <- cbind(gm[[1]], gm[[2]])
sexcol <- rep(NA, nrow(genemeans))
sexcol[gm$female <= gm$male] <- "purple"
sexcol[gm$female > gm$male] <- "orange"
GE.Icon(Genes = cbind(PC1, PC5), genedata = genemeans, gclr = sexcol, iconsize = 0.05,
xlab = "PC1", ylab = "PC5")
abline(h = 7.5, lty = 1, col = "gray")
abline(h = -7, lty = 1, col = "gray")
abline(v = -7.5, lty = 1, col = "gray")
legend(x = "topleft", legend = c("higher in females", "lower in females"), fill = c("orange",
"purple"), cex = 0.8)
par(pty = "m")
### code chunk: Figure 8.8
library(pcaMethods)
# pairwise feature loadings plots
pca3 <- pcaMethods::pca(t(unsup), nPcs = 5, method = "svd", scale = "uv")
plotPcs(pca3, pch = 19, cex = 0.5, type = "loadings")
### code chunk: Tables A.1 and A.2 in Section 8.5
library(xtable)
library(hgu133plus2cdf)
library(hgu133plus2.db)
# annotation
geneMap <- unlist(as.list(hgu133plus2MAP))
# Table A.1
out0 <- names(which(PC1 < -7.5))
out1 <- names(which(PC5 > 7.5))
out2 <- names(which(PC5 < -7))
fdata <- eset_test@featureData@data
out <- data.frame(Subgroup = rep("1", length(out0)), fdata[match(out0, rownames(fdata)),
c("ID", "Gene Symbol", "Gene Title", "ENTREZ_GENE_ID")], geneMap[match(out0,
names(geneMap))])
out$Gene.Symbol <- gsub("^.*/// ", "", out$Gene.Symbol)
genetit0 <- gsub("^.*/// ", "", out$Gene.Title)
out$Gene.Title <- ifelse(nchar(genetit0) > 40, paste(substr(genetit0, 1, 40),
"...", sep = ""), genetit0)
out$ENTREZ_GENE_ID <- gsub("^.*/// ", "", out$ENTREZ_GENE_ID)
colnames(out) <- c("Subset", "Affymetrix ID", "Gene symbol", "Gene title", "Entrez ID",
"Cytoband")
outA <- out
print(xtable(outA, caption = "Subset of genes identified in first principal component. Gene titles are abbreviated to show at most 40 characters."),
include.rownames = FALSE, size = "scriptsize")
# Table A.2
fdata <- eset_test@featureData@data
out <- data.frame(Subgroup = c(rep("2", length(out1)), rep("3", length(out2))),
fdata[match(c(out1, out2), rownames(fdata)), c("ID", "Gene Symbol", "Gene Title",
"ENTREZ_GENE_ID")], geneMap[match(c(out1, out2), names(geneMap))])
out$Gene.Symbol <- gsub("^.*/// ", "", out$Gene.Symbol)
genetit <- gsub("^.*/// ", "", out$Gene.Title)
out$Gene.Title <- ifelse(nchar(genetit) > 40, paste(substr(genetit, 1, 40),
"...", sep = ""), genetit)
out$ENTREZ_GENE_ID <- gsub("^.*/// ", "", out$ENTREZ_GENE_ID)
colnames(out) <- c("Subset", "Affymetrix ID", "Gene symbol", "Gene title", "Entrez ID",
"Cytoband")
outB <- out
print(xtable(outB, caption = "Subsets of genes identified in fifth principal component. Gene titles are abbreviated to show at most 40 characters."),
hline.after = c(-1, 0, length(out1), length(out1) + length(out2)), include.rownames = FALSE,
size = "scriptsize")
### code chunk: Figure 8.9
library(tm)
library(wordcloud)
# extract word counts
corpus <- Corpus(DataframeSource(data.frame(genetit0)))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, tolower)
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# word cloud
pal <- gray(seq(0.7, 0, by = -0.1))
set.seed(123)
wordcloud(d$word, d$freq, min.freq = 1, random.order = FALSE, rot.per = 0, colors = pal)
### code chunk: Figure 8.10
library(limma)
# limma analysis
design <- model.matrix(~focal_lesions + beta2mg + Cyto.Abn, pData(eset_train_FL))
fit <- lmFit(exprs(eset_train_FL), design)
fit <- eBayes(fit)
tt <- topTable(fit, coef = "focal_lesions>7", number = nrow(eset_train_FL),
p.value = 0.05, adjust.method = "BH")
ttall <- topTable(fit, coef = "focal_lesions>7", number = nrow(eset_train_FL),
p.value = 1)
dect <- decideTests(fit, method = "separate", adjust.method = "BH", p.value = 0.05,
lfc = 0)
# volcano plot
volcanoplot(fit, coef = "focal_lesions>7", highlight = 5, names = featureData(eset_train_FL)@data$"Gene Symbol")
### code chunk: Figure 8.11
library(ggplot2)
library(reshape2)
# reshape data
tmp <- data.frame(pData(eset_train_FL)[, c("geo_accession", "focal_lesions")],
t(exprs(eset_train_FL)[tt$ID[1:5], , drop = F]))
tmp <- reshape2:::melt.data.frame(tmp, id.vars = c("geo_accession", "focal_lesions"))
levels(tmp$variable) <- paste(featureData(eset_train_FL)@data[tt$ID[1:5], "Gene Symbol"],
gsub("X", "", levels(tmp$variable)), sep = "\n")
# create boxplots with ggplot2 package
p <- ggplot(tmp, aes(focal_lesions, value))
pg <- p + geom_boxplot(outlier.colour = rgb(0, 0, 0, 0), outlier.size = 0) +
geom_jitter(position = position_jitter(width = 0.15), alpha = 0.5) + facet_wrap(~variable,
nrow = 1) + theme_bw() + scale_y_continuous("log2 expression") + scale_x_discrete("focal lesions")
print(pg)
### code chunk: Figure 8.12
library(glmnet)
source("Chapter8_utility.R")
# lasso model
set.seed(815)
lasso_cvobj <- cv.glmnet(t(exprs(eset_train_FL)), eset_train_FL$focal_lesions,
family = "binomial", standardize = F)
coefs <- coef(lasso_cvobj, s = lasso_cvobj$lambda.min)
coefs <- coefs[which(coefs != 0), ]
eset_test_FL$focal_lesions_predicted_prob <- as.numeric(predict(lasso_cvobj$glmnet.fit,
newx = t(exprs(eset_test_FL)[featureNames(eset_train_FL), ]), s = lasso_cvobj$lambda.min,
type = "response"))
eset_test_FL$focal_lesions_predicted_class <- predict(lasso_cvobj$glmnet.fit,
newx = t(exprs(eset_test_FL)[featureNames(eset_train_FL), ]), s = lasso_cvobj$lambda.min,
type = "class")
eset_test_FL$focal_lesions_predicted_class <- factor(eset_test_FL$focal_lesions_predicted_class,
labels = c("0-7", ">7"))
def.par <- par(no.readonly = TRUE)
par(mfrow = c(2, 1), oma = c(0, 0, 0, 0), mar = c(5, 4, 2, 2), mgp = c(2, 0.75,
0))
# top plot
plot(lasso_cvobj$glmnet.fit, label = T, "lambda", xlab = expression(paste("log ",
lambda)), xlim = c(-3, -1), ylim = c(-0.2, 0.15), dfplot = F)
abline(v = log(lasso_cvobj$lambda.min))
abline(h = 0)
# bottom plot
plot(lasso_cvobj, xlab = expression(paste("log ", lambda)), xlim = c(-3, -1),
ylim = c(1.2, 1.5), ylab = "Binomial model deviance")
mtext("number of non-zero coefficients", side = 3, line = 2)
par(def.par)
### code chunk: Figure 8.13
tmp <- cbind(1 - pData(eset_test_FL)$focal_lesions_predicted_prob, pData(eset_test_FL)$focal_lesions_predicted_prob)
plotprob(tmp, label = pData(eset_test_FL)$focal_lesions, main = "")
### code chunk: Figure 8.14
library(peperr)
library(gridExtra)
source("Chapter8_utility.R")
# estimate prediction errors
tmpdat <- data.matrix(t(exprs(eset_train_FL)))
tmpresp <- as.numeric(eset_train_FL$focal_lesions) - 1
peperr_obj <- peperr(response = tmpresp, x = tmpdat, fit.fun = fit.glmnet, args.fit = list(maxit = 1000,
standardize = F), complexity = complexity.glmnet, args.complexity = list(nfolds = 10,
maxit = 3000, standardize = F), trace = T, RNG = "fixed", seed = 815, aggregation.fun = aggregation.misclass,
indices = resample.indices(n = ncol(eset_train_FL), sample.n = 50, method = "sub632"))
# create boxplot with ggplot2 package
tmp <- data.frame(grp = "", error = unlist(peperr_obj$sample.error) * 100)
p <- ggplot(tmp, aes(grp, error))
pg <- p + geom_boxplot(outlier.colour = rgb(0, 0, 0, 0), outlier.size = 0) +
geom_jitter(position = position_jitter(width = 0.1)) + theme_bw() + scale_y_continuous("misclassification rate [%]",
limits = c(20, 50)) + scale_x_discrete("") + geom_hline(aes(yintercept = perr(peperr_obj,
"resample") * 100, colour = "2"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj,
"632p") * 100, colour = "3"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj,
"app") * 100, colour = "4"), show_guide = TRUE) + geom_hline(aes(yintercept = perr(peperr_obj,
"nullmodel") * 100, colour = "5"), show_guide = TRUE) + scale_colour_discrete(guide = "legend",
name = "error type", breaks = c("2", "3", "4", "5"), labels = c("mean\nout-of-bag",
".632plus", "apparent", "null model")) + opts(title = "Misclassification rate \n in bootstrap samples ")
print(pg)
# create histogram with ggplot2 package
p2 <- ggplot(data.frame(complx = peperr_obj$sample.complexity), aes(x = complx))
pg2 <- p2 + geom_histogram(binwidth = 0.02, fill = 0, colour = 1) + theme_bw() +
xlab(expression(lambda)) + ylab("frequency") + opts(panel.grid.major = theme_blank(),
panel.grid.minor = theme_blank(), panel.background = theme_blank()) + geom_vline(xintercept = peperr_obj$selected.complexity,
colour = 2) + opts(title = "Selected complexity \n in bootstrap samples",
axis.title.x = theme_text(size = 11)) + ggplot2::annotate("text", x = 0.14,
y = -0.5, label = "full data", colour = "red", size = 4)
# arrange plot layout with gridExtra package
grid.arrange(pg2, pg, ncol = 2)
### code chunk: Figure 8.15
library(globaltest)
library(KEGG.db)
# remove multiple probe sets and keep the one with strongest effect
unique_probesets <- ttall$ID[!duplicated(featureData(eset_train_FL)@data[ttall$ID,
"Gene Symbol"])]
# global test
gtKEGGres <- gtKEGG(focal_lesions, eset_train_FL[unique_probesets, ])
signif_keggs_gt <- which(gtKEGGres@extra[, 1] < 0.05)
# annotation and plot
library(hgu133plus2.db)
tmp <- gtKEGGres[[1]]
covariates(tmp, alias = hgu133plus2SYMBOL, legend = c("upregulated in patients with >7 lesions",
"downregulated in patients with >7 lesions"), what = "z-score")
title(paste(names(tmp), "-", alias(tmp)))
### code chunk: Figure 8.16
library(HTSanalyzeR)
# define results and hits
sel <- match(ttall$ID, fData(eset_train_FL)$ID)
t_all <- ttall$t
names(t_all) <- fData(eset_train_FL)$ENTREZ_GENE_ID[sel]
t_hits <- names(t_all)[which(ttall$adj.P.Val < 0.05)]
# select gene sets
KEGG_pathways <- KeggGeneSets(species = "Hs")
# gene set enrichment analysis
gscs <- list(`KEGG pathways` = KEGG_pathways)
gsca <- new("GSCA", listOfGeneSetCollections = gscs, geneList = t_all, hits = t_hits)
gsca <- preprocess(gsca, species = "Hs", initialIDs = "Entrez.gene", keepMultipleMappings = TRUE,
duplicateRemoverMethod = "max", orderAbsValue = FALSE)
set.seed(815)
gsca <- analyze(gsca, para = list(pValueCutoff = 0.05, pAdjustMethod = "BH",
nPermutations = 1000, minGeneSetSize = 15, exponent = 1))
# enrichment map
gsca2 <- appendGSTerms(gsca, keggGSCs = "KEGG pathways")
viewEnrichMap(gsca2, resultName = "GSEA.results", gscs = "KEGG pathways", ntop = 15,
allSig = FALSE, gsNameType = "term", displayEdgeLabel = FALSE, layout = "layout.fruchterman.reingold")
The dynamical graphics in Figures 8.17 and 8.18 were created with the SEURAT software available on R-forge
. More details can be found on the SEURAT project webpage http://seurat.r-forge.r-project.org and in the corresponding publication by Gribov et al. (2010) (URL http://dx.doi.org/10.1186/1755-8794-3-21).