Fit the SGCCA

1. Fit a penalization value per omic

Rscript fit_penalty.R 0.01 0.01 0.01→ 0.01.0.01.0.01.tsv

data matrices have to be in the same folder

update cl as needed

suppressPackageStartupMessages({
library(mixOmics)#6.16.3
library(data.table)#1.14.2
library(parallel)#4.1.1
})

penalty_cpgs=0.01#as.numeric(args[1])
penalty_transcripts=0.01#as.numeric(args[2])
penalty_mir=0.01#as.numeric(args[3])

#take model descriptors<-----------------recycled
describe=function(data,pc,pt,pm){
    #subsample observations
    #data=lapply(data,function(x) x[sample(1:n,subn),])
    resus=wrapper.sgcca(data,penalty=c(pc,pt,pm),scale=T,
        scheme="centroid")
    #get results description
    description=as.data.frame(do.call(rbind,resus$AVE$AVE_X))
    description$nfeatures=sapply(resus$loadings,function(x) sum(x!=0))
    description$omic=rownames(description)
    description$penalty=resus$penalty
    colnames(description)[1]="AVE"
return(description)}

######################DATA TO LIST OF MATRIXES PER MOLECULAR LEVEL

files=list.files()
files=files[grep("eigeN",files)]
#sizes=c(Basal=129,Her2=47,LumA=417,LumB=141,Normal=76)
sizes=c(Basal=129,Her2=47)#just to show how this work, uncomment the line above

#cl <- parallel::makeCluster(10)
cl <- parallel::makeCluster(3)#just to show how this work, uncomment the line above

clusterExport(cl, c("describe","penalty_cpgs","penalty_transcripts",
    "penalty_mir","wrapper.sgcca","files","sizes","fread"))
#resus=do.call(rbind,parLapply(cl,1:10,function(x) {
resus=do.call(rbind,parLapply(cl,1:3,function(x) {
    #1 more than the samples coz of the rownames 
    i=lapply(sizes,function(x) c(1,sample(2:x,10)))
    data=lapply(1:length(sizes),function(x) fread(files[x],select=i[[x]]))
    data=lapply(data,function(x) as.matrix(x[,2:ncol(x)],rownames=x$V1))
    data=do.call(cbind,data)
    #separate omics
    data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
        function(x) t(data[x[1]:x[2],]))
    names(data)=c("CpGs","transcripts","miRNAs")
    describe(data,penalty_cpgs,penalty_transcripts,penalty_mir)}))
stopCluster(cl)
head(resus)

1.1. Paste all together

#cat *.tsv>penalty_search.tsv

2. Chosing the penalization values with highest AVE and lowest nfeatures

choose_penalty.R → Figure S1

suppressPackageStartupMessages({
library(ggplot2)#3.3.5
library(gridExtra)#2.3
})

temp=read.table("penalty_search.tsv",sep='\t',header=T)
temp$omic=factor(temp$omic,levels=c("CpGs","transcripts","miRNAs"))
#temp=temp[order(temp$penalty),]
#plot median AVE vs meadian nfeatures
omics=levels(temp$omic)
omics=lapply(omics,function(x) temp[temp$omic==x,])
omics=lapply(omics,function(x) as.data.frame(apply(x,2,as.numeric)))

## Warning in apply(x, 2, as.numeric): NAs introduced by coercion

## Warning in apply(x, 2, as.numeric): NAs introduced by coercion

## Warning in apply(x, 2, as.numeric): NAs introduced by coercion

names(omics)=levels(temp$omic)
omics=lapply(omics,function(y) sapply(unique(y$penalty),function(x) 
apply(y[y$penalty==x,],2,median,na.rm=T)))#better than mean?
omics=lapply(omics,function(x) as.data.frame(t(x)))
#indi plots or CpGs will determine axis
plots=lapply(1:3,function(x) ggplot(omics[[x]],
    aes(x=nfeatures,y=AVE,col=penalty))+geom_point()+
    ggtitle(names(omics)[x])+theme(text=element_text(size=18))+
    scale_x_continuous(trans="log10")+geom_line())
#png("sparsity_search.png")
 grid.arrange(plots[[1]],plots[[2]],plots[[3]])

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

#dev.off()

3. Run an SGCCA per subtype, using the adjusted penalization values

Rscript sgcca.R Her2 → Her2.selected

subtype="Her2"#args[1]
data=fread(paste(subtype,"eigeNormi",sep='.'))

## Warning in fread(paste(subtype, "eigeNormi", sep = ".")): Detected 46 column
## names but the data has 47 columns (i.e. invalid file). Added 1 extra default
## column name for the first column which is guessed to be row names or an index.
## Use setnames() afterwards if this guess is not correct, or fix the file write
## command that created the file to create a valid file.

data=as.matrix(data[,2:ncol(data)],rownames=data$V1)
#separate omics
data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
    function(x) t(data[x[1]:x[2],]))
names(data)=c("CpGs","transcripts","miRNAs")
penalty=c(CpGs=0.02,transcripts=0.02,miRNAs=0.05)#output of choose_penalty.R
#ncomp=nrow(data$miRNAs)-1#the last comp has all loadings>0
ncomp=2#exchange #with the line above
final=wrapper.sgcca(X=data,penalty=penalty,scale=F,
    scheme="centroid",ncomp=ncomp)#ncomp to explain 50% of transcripts matrix according to mfa.R

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

## Warning in BinarySearch(x, sumabs): Didn't quite converge

#get selected features
selected=lapply(final$loadings,function(y) 
    apply(y,2,function(x) x[x!=0]))
selected=as.data.frame(do.call(rbind,lapply(selected,function(y) 
    do.call(rbind,lapply(1:length(y),function(x) 
        cbind(names(y)[x],y[[x]],names(y[[x]])))))))
colnames(selected)=c("component","final","variable")
head(selected)

#write.table(selected,paste(subtype,"selected",sep='.'),sep='\t',
#   quote=F,row.names=F)

4. SGCCA with 100 subsets of the data to check the stability

Rscript sgccaSubsample.R Her2 1→ Her2.1.selected

The same than sgcca.R but with a random subset of half the data

5. Count the features selected on the subsets to check the stability

Rscript joinSubsamples.R Her2 → Her2.subsampled

subsampled=read.table("Her2.subsampled",sep='\t',header=T)
dim(subsampled)

## [1] 9824  102

subsampled[1:5,1:5]