How to format TCGA datasets for MOSClip analysis

Need functions stored in functions-to-process-TCGA-data.R file.

library(maftools)
library(checkmate)
library(org.Hs.eg.db)
library(impute)
library(doParallel)
library(MethylMix)
library(TCGAbiolinks)
library(SummarizedExperiment)
source("downloadTCGA/functions-to-process-TCGA-data.R")

genome = "hg38"
tumor = "OV"
project = paste0("TCGA-", tumor)

# load data downloaded by TCGAbiolinks
load(paste0("downloadTCGA/", project, "-", genome, ".RData"))

Expression

# convert ensembl ids into entrez
geneAnnotation <- rowData(exprData)

ensembl2entrez <- mapIds(org.Hs.eg.db, 
                         keys = sub("\\..*", "", geneAnnotation$gene_id),
                         column = "ENTREZID", keytype = "ENSEMBL")
geneAnnotation$entrezID <- ensembl2entrez

rowData(exprData) <- geneAnnotation
genes <- geneAnnotation$gene_id[!is.na(geneAnnotation$entrezID)]
# select primary tumors
colAnnotation <- exprData@colData
primaryTumorSel  <- colAnnotation$definition=="Primary solid Tumor"
primaryTumor <- colnames(exprData)[primaryTumorSel]
exprData <- exprData[genes, primaryTumor, drop=F]
# take average for duplicated entrez ids
exp <- assay(exprData, 1)
row.names(exp) <- sub("\\..*", "", row.names(exprData))

avg <- tapply(row.names(exp), rowData(exprData)$entrezID, function(r){
  row = exp[r, ,drop=F]
  if (nrow(row)>1)
    row=colMeans(row, na.rm = T)
  row
})

expAvg <- do.call(rbind, avg)
row.names(expAvg) <- names(avg)

colnames(expAvg) <- substr(colnames(expAvg), 1, 12)

expAvg[1:6, 1:6]
##           TCGA-20-1687 TCGA-29-2414 TCGA-24-1416 TCGA-29-1697 TCGA-25-1635
## 1                    1           13            1            3            3
## 10                   1            3            0            6            8
## 100                307          556          352          373          530
## 1000              1112          517          319         7134         3755
## 10000              274          531           95          564          885
## 100009668            2            6            8           30            4
##           TCGA-20-1686
## 1                    1
## 10                   1
## 100                173
## 1000                77
## 10000              308
## 100009668           16

Mutation

maf <- read.maf(mutData)
## -Validating
## -Silent variants: 9351 
## -Summarizing
## --Mutiple centers found
## WUGSC;BI;BI;WUGSC--Possible FLAGS among top ten genes:
##   TTN
##   USH2A
##   MUC16
##   HMCN1
##   FLG
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.926s elapsed (2.266s cpu)
considerThisImpactType <- c("HIGH", "MODERATE")

ov.mutations <- prepareMutations(maf@data, impact=considerThisImpactType, 
                                 filterByThisEntrez=row.names(expAvg), 
                                 patients=colnames(expAvg))

ov.mutations$data[1:6,1:6]
##           TCGA-24-2033 TCGA-13-1488 TCGA-24-1844 TCGA-25-2400 TCGA-09-2045
## 1                    1            0            0            0            0
## 10                   0            1            1            0            0
## 100                  0            0            0            1            0
## 1000                 0            0            0            0            1
## 10003                0            0            0            0            0
## 100049587            0            0            0            0            0
##           TCGA-23-1029
## 1                    0
## 10                   0
## 100                  0
## 1000                 1
## 10003                0
## 100049587            0

CNV

ov.cnv <- lapply(cnvData, as.numeric)
ov.cnv <- data.frame(ov.cnv, check.names = F)
row.names(ov.cnv) <- row.names(cnvData)

for (i in names(ov.cnv)) {
  if (!identical(ov.cnv[[i]], as.numeric(cnvData[[i]])))
    warning("sample number ", i, " is not equal after numeric transformation")
}

ov.cnv[1:6,1:6]
##        TCGA-04-1331 TCGA-04-1332 TCGA-04-1335 TCGA-04-1336 TCGA-04-1337
## 116983           -1            0           -1            1            0
## 140625           -1            0           -1            1            0
## 375790           -1            0           -1            1            0
## 441869           -1            0           -1            1            0
## 55210            -1            0           -1            1            0
## 83858            -1            0           -1            1            0
##        TCGA-04-1338
## 116983           -2
## 140625           -2
## 375790           -2
## 441869           -2
## 55210            -2
## 83858            -2

Methylation

# Select primary solid tumors
sampleSelection <- colData(metData27)[[5]] == "Primary solid Tumor"
normalSelection <- colData(metData27)[[5]] == "Solid Tissue Normal"
samples <- colnames(metData27)[sampleSelection]
normals <- colnames(metData27)[normalSelection]
normData <- metData27[, normals]
metData <- metData27[, samples]

# Remove all rows with low values and impute missing values
thr <- ncol(assay(normData))*0.4
remove <- apply(assay(normData),1,sum) > thr
normAssay <- assay(normData)[!remove, ]
normAssay <- imputeKNN(normAssay,k=5)$data
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 3593 rows with more than 50 % entries missing;
##  mean imputation used for these rows
thr <- ncol(assay(metData))*0.4
remove <- apply(is.na(assay(metData)),1,sum) > thr
metAssay <- assay(metData)[!remove, ]
metAssay <- imputeKNN(metAssay,k=5)$data

# make clusters
cl <- makeCluster(16)
registerDoParallel(cl)
metClustValues <- ClusterProbes(metAssay, normAssay)
## Removing 2115 probes with SNPs.
## Clustering 15383 probes in CpG site clusters.
## Removing doubles for 1 samples.
## 
## Found 12195 CpG site clusters.
stopCluster(cl)
# connect methylation clusters to genes
geneNames <- sapply(strsplit(row.names(metClustValues$MET_Cancer_Clustered), "---"),
                    function(x) x[1])
assList <- tapply(row.names(metClustValues$MET_Cancer_Clustered), geneNames, function(g) g)
symbol2entrez <- mapIds(org.Hs.eg.db, keys=names(assList), column="ENTREZID",
                        keytype="SYMBOL")
names(assList) <- symbol2entrez
metClustValues$eMap <- assList[!is.na(names(assList))]

Survival annotation

fup<-createFollowUp(followUp, newTumorEvent)

Two-class annotation

subtypes <- PanCancerAtlas_subtypes()
table(subtypes[subtypes$cancer.type=="OVCA", "Subtype_mRNA"])
## Subtype_mRNA
## Differentiated Immunoreactive    Mesenchymal  Proliferative 
##            135            107            109            138
subtypes <- subtypes[subtypes$cancer.type=="OVCA",]

classes <- data.frame(row.names = substr(subtypes$pan.samplesID, 1, 12), 
                      classes=subtypes$Subtype_mRNA)
save(classes, fup, metClustValues, ov.cnv, ov.mutations, expAvg, file="downloadTCGA/TCGA-OV-pre-processed.RData")