Rscript fit_penalty.R 0.01 0.01 0.01→ 0.01.0.01.0.01.tsv
data matrices have to be in the same folder
update cl as needed
suppressPackageStartupMessages({
library(mixOmics)#6.16.3
library(data.table)#1.14.2
library(parallel)#4.1.1
})
penalty_cpgs=0.01#as.numeric(args[1])
penalty_transcripts=0.01#as.numeric(args[2])
penalty_mir=0.01#as.numeric(args[3])
#take model descriptors<-----------------recycled
describe=function(data,pc,pt,pm){
#subsample observations
#data=lapply(data,function(x) x[sample(1:n,subn),])
resus=wrapper.sgcca(data,penalty=c(pc,pt,pm),scale=T,
scheme="centroid")
#get results description
description=as.data.frame(do.call(rbind,resus$AVE$AVE_X))
description$nfeatures=sapply(resus$loadings,function(x) sum(x!=0))
description$omic=rownames(description)
description$penalty=resus$penalty
colnames(description)[1]="AVE"
return(description)}
######################DATA TO LIST OF MATRIXES PER MOLECULAR LEVEL
files=list.files()
files=files[grep("eigeN",files)]
#sizes=c(Basal=129,Her2=47,LumA=417,LumB=141,Normal=76)
sizes=c(Basal=129,Her2=47)#just to show how this work, uncomment the line above
#cl <- parallel::makeCluster(10)
cl <- parallel::makeCluster(3)#just to show how this work, uncomment the line above
clusterExport(cl, c("describe","penalty_cpgs","penalty_transcripts",
"penalty_mir","wrapper.sgcca","files","sizes","fread"))
#resus=do.call(rbind,parLapply(cl,1:10,function(x) {
resus=do.call(rbind,parLapply(cl,1:3,function(x) {
#1 more than the samples coz of the rownames
i=lapply(sizes,function(x) c(1,sample(2:x,10)))
data=lapply(1:length(sizes),function(x) fread(files[x],select=i[[x]]))
data=lapply(data,function(x) as.matrix(x[,2:ncol(x)],rownames=x$V1))
data=do.call(cbind,data)
#separate omics
data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
function(x) t(data[x[1]:x[2],]))
names(data)=c("CpGs","transcripts","miRNAs")
describe(data,penalty_cpgs,penalty_transcripts,penalty_mir)}))
stopCluster(cl)
head(resus)
#cat *.tsv>penalty_search.tsv
choose_penalty.R → Figure S1
suppressPackageStartupMessages({
library(ggplot2)#3.3.5
library(gridExtra)#2.3
})
temp=read.table("penalty_search.tsv",sep='\t',header=T)
temp$omic=factor(temp$omic,levels=c("CpGs","transcripts","miRNAs"))
#temp=temp[order(temp$penalty),]
#plot median AVE vs meadian nfeatures
omics=levels(temp$omic)
omics=lapply(omics,function(x) temp[temp$omic==x,])
omics=lapply(omics,function(x) as.data.frame(apply(x,2,as.numeric)))
## Warning in apply(x, 2, as.numeric): NAs introduced by coercion
## Warning in apply(x, 2, as.numeric): NAs introduced by coercion
## Warning in apply(x, 2, as.numeric): NAs introduced by coercion
names(omics)=levels(temp$omic)
omics=lapply(omics,function(y) sapply(unique(y$penalty),function(x)
apply(y[y$penalty==x,],2,median,na.rm=T)))#better than mean?
omics=lapply(omics,function(x) as.data.frame(t(x)))
#indi plots or CpGs will determine axis
plots=lapply(1:3,function(x) ggplot(omics[[x]],
aes(x=nfeatures,y=AVE,col=penalty))+geom_point()+
ggtitle(names(omics)[x])+theme(text=element_text(size=18))+
scale_x_continuous(trans="log10")+geom_line())
#png("sparsity_search.png")
grid.arrange(plots[[1]],plots[[2]],plots[[3]])
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
#dev.off()
Rscript sgcca.R Her2 → Her2.selected
subtype="Her2"#args[1]
data=fread(paste(subtype,"eigeNormi",sep='.'))
## Warning in fread(paste(subtype, "eigeNormi", sep = ".")): Detected 46 column
## names but the data has 47 columns (i.e. invalid file). Added 1 extra default
## column name for the first column which is guessed to be row names or an index.
## Use setnames() afterwards if this guess is not correct, or fix the file write
## command that created the file to create a valid file.
data=as.matrix(data[,2:ncol(data)],rownames=data$V1)
#separate omics
data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
function(x) t(data[x[1]:x[2],]))
names(data)=c("CpGs","transcripts","miRNAs")
penalty=c(CpGs=0.02,transcripts=0.02,miRNAs=0.05)#output of choose_penalty.R
#ncomp=nrow(data$miRNAs)-1#the last comp has all loadings>0
ncomp=2#exchange #with the line above
final=wrapper.sgcca(X=data,penalty=penalty,scale=F,
scheme="centroid",ncomp=ncomp)#ncomp to explain 50% of transcripts matrix according to mfa.R
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
## Warning in BinarySearch(x, sumabs): Didn't quite converge
#get selected features
selected=lapply(final$loadings,function(y)
apply(y,2,function(x) x[x!=0]))
selected=as.data.frame(do.call(rbind,lapply(selected,function(y)
do.call(rbind,lapply(1:length(y),function(x)
cbind(names(y)[x],y[[x]],names(y[[x]])))))))
colnames(selected)=c("component","final","variable")
head(selected)
#write.table(selected,paste(subtype,"selected",sep='.'),sep='\t',
# quote=F,row.names=F)
Rscript sgccaSubsample.R Her2 1→ Her2.1.selected
The same than sgcca.R but with a random subset of half the data
Rscript joinSubsamples.R Her2 → Her2.subsampled
subsampled=read.table("Her2.subsampled",sep='\t',header=T)
dim(subsampled)
## [1] 9824 102
subsampled[1:5,1:5]