Need functions stored in functions-to-process-TCGA-data.R file.
library(maftools)
library(checkmate)
library(org.Hs.eg.db)
library(impute)
library(doParallel)
library(MethylMix)
library(TCGAbiolinks)
library(SummarizedExperiment)
source("downloadTCGA/functions-to-process-TCGA-data.R")
genome = "hg38"
tumor = "OV"
project = paste0("TCGA-", tumor)
# load data downloaded by TCGAbiolinks
load(paste0("downloadTCGA/", project, "-", genome, ".RData"))
# convert ensembl ids into entrez
geneAnnotation <- rowData(exprData)
ensembl2entrez <- mapIds(org.Hs.eg.db,
keys = sub("\\..*", "", geneAnnotation$gene_id),
column = "ENTREZID", keytype = "ENSEMBL")
geneAnnotation$entrezID <- ensembl2entrez
rowData(exprData) <- geneAnnotation
genes <- geneAnnotation$gene_id[!is.na(geneAnnotation$entrezID)]
# select primary tumors
colAnnotation <- exprData@colData
primaryTumorSel <- colAnnotation$definition=="Primary solid Tumor"
primaryTumor <- colnames(exprData)[primaryTumorSel]
exprData <- exprData[genes, primaryTumor, drop=F]
# take average for duplicated entrez ids
exp <- assay(exprData, 1)
row.names(exp) <- sub("\\..*", "", row.names(exprData))
avg <- tapply(row.names(exp), rowData(exprData)$entrezID, function(r){
row = exp[r, ,drop=F]
if (nrow(row)>1)
row=colMeans(row, na.rm = T)
row
})
expAvg <- do.call(rbind, avg)
row.names(expAvg) <- names(avg)
colnames(expAvg) <- substr(colnames(expAvg), 1, 12)
expAvg[1:6, 1:6]
## TCGA-20-1687 TCGA-29-2414 TCGA-24-1416 TCGA-29-1697 TCGA-25-1635
## 1 1 13 1 3 3
## 10 1 3 0 6 8
## 100 307 556 352 373 530
## 1000 1112 517 319 7134 3755
## 10000 274 531 95 564 885
## 100009668 2 6 8 30 4
## TCGA-20-1686
## 1 1
## 10 1
## 100 173
## 1000 77
## 10000 308
## 100009668 16
maf <- read.maf(mutData)
## -Validating
## -Silent variants: 9351
## -Summarizing
## --Mutiple centers found
## WUGSC;BI;BI;WUGSC--Possible FLAGS among top ten genes:
## TTN
## USH2A
## MUC16
## HMCN1
## FLG
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.926s elapsed (2.266s cpu)
considerThisImpactType <- c("HIGH", "MODERATE")
ov.mutations <- prepareMutations(maf@data, impact=considerThisImpactType,
filterByThisEntrez=row.names(expAvg),
patients=colnames(expAvg))
ov.mutations$data[1:6,1:6]
## TCGA-24-2033 TCGA-13-1488 TCGA-24-1844 TCGA-25-2400 TCGA-09-2045
## 1 1 0 0 0 0
## 10 0 1 1 0 0
## 100 0 0 0 1 0
## 1000 0 0 0 0 1
## 10003 0 0 0 0 0
## 100049587 0 0 0 0 0
## TCGA-23-1029
## 1 0
## 10 0
## 100 0
## 1000 1
## 10003 0
## 100049587 0
ov.cnv <- lapply(cnvData, as.numeric)
ov.cnv <- data.frame(ov.cnv, check.names = F)
row.names(ov.cnv) <- row.names(cnvData)
for (i in names(ov.cnv)) {
if (!identical(ov.cnv[[i]], as.numeric(cnvData[[i]])))
warning("sample number ", i, " is not equal after numeric transformation")
}
ov.cnv[1:6,1:6]
## TCGA-04-1331 TCGA-04-1332 TCGA-04-1335 TCGA-04-1336 TCGA-04-1337
## 116983 -1 0 -1 1 0
## 140625 -1 0 -1 1 0
## 375790 -1 0 -1 1 0
## 441869 -1 0 -1 1 0
## 55210 -1 0 -1 1 0
## 83858 -1 0 -1 1 0
## TCGA-04-1338
## 116983 -2
## 140625 -2
## 375790 -2
## 441869 -2
## 55210 -2
## 83858 -2
# Select primary solid tumors
sampleSelection <- colData(metData27)[[5]] == "Primary solid Tumor"
normalSelection <- colData(metData27)[[5]] == "Solid Tissue Normal"
samples <- colnames(metData27)[sampleSelection]
normals <- colnames(metData27)[normalSelection]
normData <- metData27[, normals]
metData <- metData27[, samples]
# Remove all rows with low values and impute missing values
thr <- ncol(assay(normData))*0.4
remove <- apply(assay(normData),1,sum) > thr
normAssay <- assay(normData)[!remove, ]
normAssay <- imputeKNN(normAssay,k=5)$data
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 3593 rows with more than 50 % entries missing;
## mean imputation used for these rows
thr <- ncol(assay(metData))*0.4
remove <- apply(is.na(assay(metData)),1,sum) > thr
metAssay <- assay(metData)[!remove, ]
metAssay <- imputeKNN(metAssay,k=5)$data
# make clusters
cl <- makeCluster(16)
registerDoParallel(cl)
metClustValues <- ClusterProbes(metAssay, normAssay)
## Removing 2115 probes with SNPs.
## Clustering 15383 probes in CpG site clusters.
## Removing doubles for 1 samples.
##
## Found 12195 CpG site clusters.
stopCluster(cl)
# connect methylation clusters to genes
geneNames <- sapply(strsplit(row.names(metClustValues$MET_Cancer_Clustered), "---"),
function(x) x[1])
assList <- tapply(row.names(metClustValues$MET_Cancer_Clustered), geneNames, function(g) g)
symbol2entrez <- mapIds(org.Hs.eg.db, keys=names(assList), column="ENTREZID",
keytype="SYMBOL")
names(assList) <- symbol2entrez
metClustValues$eMap <- assList[!is.na(names(assList))]
fup<-createFollowUp(followUp, newTumorEvent)
subtypes <- PanCancerAtlas_subtypes()
table(subtypes[subtypes$cancer.type=="OVCA", "Subtype_mRNA"])
## Subtype_mRNA
## Differentiated Immunoreactive Mesenchymal Proliferative
## 135 107 109 138
subtypes <- subtypes[subtypes$cancer.type=="OVCA",]
classes <- data.frame(row.names = substr(subtypes$pan.samplesID, 1, 12),
classes=subtypes$Subtype_mRNA)
save(classes, fup, metClustValues, ov.cnv, ov.mutations, expAvg, file="downloadTCGA/TCGA-OV-pre-processed.RData")