1. Get the barcodes of the samples with co-occurring measurements

getData.R → subtype.tsv

Note that the GDC database has been updated since the original download, e.g.:

suppressPackageStartupMessages({
library(SummarizedExperiment)#1.22.0
library(TCGAbiolinks)#2.20.1
#library(VennDiagram)#1.6.20
})
##########SAMPLE IDs PER DATA TYPE#####################
# Query GDC for the TCGA-BRCA DNA methylation files produced on the
# Illumina Human Methylation 450 platform.
mthyltn <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "DNA Methylation",
  platform = "Illumina Human Methylation 450"
)
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-BRCA
## --------------------
## oo Filtering results
## --------------------
## ooo By platform
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## Warning: There are more than one file for the same case. Please verify query results. You can use the command View(getResults(query)) in rstudio
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
# Replace the query object with its table of matching files.
mthyltn <- getResults(mthyltn)
head(mthyltn, n = 3)
# Methylation barcodes truncated to their first 19 characters.
i <- substr(mthyltn$cases, start = 1, stop = 19)
# Query GDC for the TCGA-BRCA gene expression quantification files.
xprssn <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts" # 11/2022 option
)
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-BRCA
## --------------------
## oo Filtering results
## --------------------
## ooo By data.type
## ooo By workflow.type
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
#  workflow.type = "HTSeq - Counts")
# Replace the query object with its table of matching files.
xprssn <- getResults(xprssn)
# Expression barcodes truncated to their first 19 characters.
j <- substr(xprssn$cases, start = 1, stop = 19)
# Query GDC for the TCGA-BRCA miRNA expression quantification files.
mirnas <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification"
)
## --------------------------------------
## o GDCquery: Searching in GDC database
## --------------------------------------
## Genome of reference: hg38
## --------------------------------------------
## oo Accessing GDC. This might take a while...
## --------------------------------------------
## ooo Project: TCGA-BRCA
## --------------------
## oo Filtering results
## --------------------
## ooo By data.type
## ----------------
## oo Checking data
## ----------------
## ooo Check if there are duplicated cases
## ooo Check if there results for the query
## -------------------
## o Preparing output
## -------------------
# Replace the query object with its table of matching files.
mirnas <- getResults(mirnas)
# miRNA barcodes truncated to their first 19 characters.
k <- substr(mirnas$cases, start = 1, stop = 19)

############## CONCURRENT MEASURES ########################
# Keep only the barcodes present in all three data types (i, j, k), then
# pair each one with its first 12 characters as the patient identifier.
samples <- Reduce(intersect, list(i, j, k))
samples <- data.frame(
  sample = samples,
  patient = substr(samples, 1, 12)
)
head(samples)

This would have been easier with the tidyverse.

suppressPackageStartupMessages({library(tidyverse)})
# Attach the published subtype annotation to the samples shared by the
# three data types.
subtypes <- TCGAquery_subtype(tumor = "brca") # subtype per patient
## brca subtype information from:doi.org/10.1016/j.ccell.2018.03.014
# Left join: every shared sample is kept, even without a subtype record.
samples <- merge(samples, subtypes, by = "patient", all.x = TRUE)
# Number of distinct patients per PAM50 subtype.
samples %>%
  distinct(patient, BRCA_Subtype_PAM50) %>%
  count(BRCA_Subtype_PAM50)
# Derive the patient identifier from the first 12 characters of `cases`
# (the same rule used when `samples` was built) instead of renaming the
# 30th column by position: a positional rename silently targets the wrong
# column whenever the layout of the query result changes.
mthyltn$patient <- substr(mthyltn$cases, 1, 12)
temp <- mthyltn %>%
  select(cases, data_type, sample_type, patient)
temp$sample <- substr(temp$cases, 1, 19)
# Both tables carry `patient`, so this merge yields patient.x / patient.y.
samples <- merge(samples, temp, by = "sample", all.x = TRUE)
# Number of distinct patient / subtype combinations per sample type.
samples %>%
  distinct(patient.x, BRCA_Subtype_PAM50, sample_type) %>%
  count(sample_type)

2. RNAseq pre-processing

prepro-mRNA.R → RNAseqnormalized.tsv

3. miRNAseq pre-processing

prepro-miRNA.R → miRNAseqNormi.tsv

suppressPackageStartupMessages({library(data.table)})
# Load the normalised miRNA matrix. The warning below reports one more data
# column than header names; fread treats the first column as row names.
exampl <- fread("miRNAseqNormi.tsv")
## Warning in fread("miRNAseqNormi.tsv"): Detected 805 column names but the data
## has 806 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
# Rows x columns of the loaded table.
dim(exampl)
## [1] 604 806
# Preview the top-left 5 x 5 corner.
exampl[1:5, 1:5]

4. HM450 pre-processing

prepro-methy.R → methyNormi.tsv

5. Concatenate the three matrices for each subtype

Rscript concatena.R → Basal.mtrx, Her2.mtrx, …

6. Eigenvalue normalization

Rscript mfa_normi.R → Basal.eigeNormi, Her2.eigeNormi…

# Load the eigenvalue-normalised Her2 matrix. The warning below reports one
# more data column than header names; fread treats the first column as row
# names.
exampl <- fread("Her2.eigeNormi")
## Warning in fread("Her2.eigeNormi"): Detected 46 column names but the data has
## 47 columns (i.e. invalid file). Added 1 extra default column name for the first
## column which is guessed to be row names or an index. Use setnames() afterwards
## if this guess is not correct, or fix the file write command that created the
## file to create a valid file.
# Preview the top-left 5 x 5 corner.
exampl[1:5, 1:5]