This is an extract of the scripts in steps 3 to produce the figures before section 3.5
selectedFeatures.R → supplementary figure 3, Basal.stable, Her2.stable, …
suppressPackageStartupMessages({
library(tidyverse)
#library(biomaRt)
#library(rentrez)
library(ggplot2)})
#load sgcca.R output
files=list.files("..",full.names = T)#chech folder order
files=files[grep("selected",files)]
sets=lapply(files,read_tsv,show_col_types = FALSE)
names(sets)=gsub("../","",gsub(".selected","",files))
head(sets["Basal"])
## $Basal
## # A tibble: 44,105 × 7
## variable component loading omic initial entrezgene_id ID
## <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 cg00001583 comp 77 0.0150 CpGs 0.00233 2494 IlmnID
## 2 cg00001583 comp 51 -0.0557 CpGs -0.000954 2494 IlmnID
## 3 cg00001583 comp 124 0.0125 CpGs 0.00278 2494 IlmnID
## 4 cg00001747 comp 3 -0.0962 CpGs 0.00631 NA <NA>
## 5 cg00001747 comp 49 -0.00716 CpGs -0.00105 NA <NA>
## 6 cg00001747 comp 70 0.0709 CpGs 0.0000535 NA <NA>
## 7 cg00001747 comp 111 -0.125 CpGs -0.00124 NA <NA>
## 8 cg00001793 comp 1 -0.0148 CpGs -0.00820 2120 IlmnID
## 9 cg00001793 comp 80 0.000969 CpGs 0.000375 2120 IlmnID
## 10 cg00001793 comp 61 -0.167 CpGs 0.00693 2120 IlmnID
## # … with 44,095 more rows
#load joinSubsamples.R output
files=list.files()
files=files[grep("subsampled",files)]
stability=lapply(files,read_tsv,show_col_types = FALSE)
names(stability)=gsub(".subsampled","",files)
#names(stability)==names(sets)#to be sure
stability=lapply(stability,function(x) as.data.frame(cbind(
"variable"=x$feature,
#sum across subsamples
"stability"=rowSums(!is.na(x[,3:ncol(x)])))))
#add stability info to sets
subtypes=names(sets)
sets=lapply(subtypes,function(x)
merge(sets[[x]],stability[[x]],by="variable"))
names(sets)=subtypes#to be sure
head(sets[["Basal"]])
##################PLOT
#join all together in a data.frame
stability=do.call(rbind,lapply(1:5,function(x)
as.data.frame(cbind("subtype"=names(stability)[x],
stability[[x]]))))
#nice omic identifiers
stability$omic=gsub("E","transcript",
gsub("h","miRNA",
gsub("c","CpG",substr(stability$variable,1,1))))
#force the order I wanna
stability$omic=factor(stability$omic,levels=c("CpG",
"transcript",
"miRNA"))
#categorical instead of numbers
stability$tokeep=c("non stable","stable")[
as.factor(stability$stability>=70)]#arbitrary threshold
#barplot
#png("stability.png")
stability%>%count(subtype,omic,tokeep)%>%
ggplot(aes(y=as.numeric(n),x=subtype,fill=tokeep))+
geom_bar(stat="identity",position="fill")+facet_wrap(~omic)+
xlab("")+ylab("%")+theme(text=element_text(size=16),
panel.background=element_blank(),legend.position="bottom",
legend.title = element_blank(),
axis.text.x=element_text(angle=45),
legend.margin=margin(-40,0,0,0))+#or the legend fall far bottom
scale_fill_manual(values=c("gray47","brown1"))
#dev.off()
#sets=lapply(sets,function(x) x%>%filter(stability>=70))
#lapply(names(sets),function(x)
# write_tsv(x=sets[[x]],file=paste(x,"stable",sep='.')))
functions_overrepre.R → BP.enrich, KEGG.enrich
#slow lines, tend to die
functions_overrepre_plots.R L1-L29 → figure 2
library(UpSetR)
bp=read_tsv("BP.enrichment",show_col_types = FALSE)
k=read_tsv("KEGG.enrichment",show_col_types = FALSE)
enriched=list(BP=bp,KEGG=k)
get_sets=function(enriched_table,exclusive){
#matrix subtypes vs function
g=table(unique(enriched_table[,c("ID","subtype")]))
#to get exclusive functions for another time
if(exclusive==T){
g=g[rowSums(g==0)==4,]
}
#upset() needs a list of IDs
sets=apply(g,2,function(y) names(which(y>0)))
return(sets)}
functions=lapply(enriched,get_sets,exclusive=F)
lapply(functions,function(x) upset(fromList(x),text.scale=1.5,order.by="degree"))
## $BP
##
## $KEGG
functions_overrepre_plots.R L32-L127 → figure 3B
suppressPackageStartupMessages({
library(GSEABase)
library(GO.db)
#library(ggplot2)
})
# as in https://support.bioconductor.org/p/128407/
#and https://support.bioconductor.org/p/83375/
fl="http://current.geneontology.org/ontology/subsets/goslim_agr.obo"
#subset used for humans in PMC6800510
slim <- getOBOCollection(fl)#53 ids only
df = select(GO.db, keys(GO.db), "ONTOLOGY")#ontology of all GOids
## 'select()' returned 1:1 mapping between keys and columns
#table(df$ONTOLOGY[df$GOID%in%ids(slim)])#found all slim ids
gomap=as.list(GOBPOFFSPRING)#descendents off every goid
found=names(gomap)[names(gomap)%in%ids(slim)]
#[1] 21
#sum(found%in%df$GOID[df$ONTOLOGY=="BP"])
#[1] 21 #actually only descendents of BP goids
gomap=gomap[names(gomap)%in%ids(slim)]
#format to easy to read data frames
slim=as.data.frame(do.call(rbind,lapply(1:21,function(x)
cbind(names(gomap)[x],gomap[[x]]))))
colnames(slim)=c("parent","child")
head(slim)
slimnames=as.data.frame(sapply(unique(slim$parent),function(x)
Term(GOTERM[[x]])))
slimnames$parent=rownames(slimnames)
colnames(slimnames)[1]="name"
head(slimnames)
exclusive=lapply(enriched,get_sets,exclusive=T)
exclusive$BP$Basal
## [1] "GO:0001889" "GO:0003299" "GO:0008584" "GO:0010977" "GO:0014887"
## [6] "GO:0014898" "GO:0021983" "GO:0030199" "GO:0030516" "GO:0030517"
## [11] "GO:0035282" "GO:0035860" "GO:0043586" "GO:0045058" "GO:0045765"
## [16] "GO:0046546" "GO:0048638" "GO:0048675" "GO:0050772" "GO:0051147"
## [21] "GO:0061387" "GO:0070593" "GO:1901342" "GO:1902742" "GO:1902894"
## [26] "GO:2000727"
#count categories per subtype
BP.classes=as.data.frame(do.call(rbind,lapply(1:5,
function(x) cbind(names(exclusive$BP)[x],exclusive$BP[[x]]))))
colnames(BP.classes)=c("subtype","child")
BP.classes=merge(merge(BP.classes,slim,by="child"),
slimnames,by="parent")
head(BP.classes)
bias=function(classes,subtype,class){
totals=table(classes[,c(class,subtype)])
s=colSums(totals)
ps=p.adjust(apply(totals,1,function(x)
fisher.test(rbind(x,s-x),simulate.p.value=T)$p.val),"bonferroni")#categories change depending on the correction used
#TRUE needed coz > 2 categories
return(rownames(totals)[ps<0.05])}
i=bias(BP.classes,3,4)
i
## [1] "cellular component organization" "DNA metabolic process"
## [3] "establishment of localization"
BP.classes%>%count(subtype,name)%>%
ggplot(aes(x=n,y=name,fill=subtype))+
geom_bar(stat="identity",position="fill")+
scale_x_continuous(labels=scales::percent)+
theme(text=element_text(size=18),axis.ticks=element_blank(),
panel.background=element_blank(),legend.title=element_blank(),
legend.position="bottom",legend.margin=margin(-25,0,0,-150))+
xlab("")+ylab("")+scale_fill_viridis_d(option = "plasma")+
annotate("text",x=1.08,y=sort(unique(BP.classes$name)),
label=BP.classes%>%count(name)%>%dplyr::select(n)%>%unlist)+
annotate("text",y=i,x=-.05,label="*",size=8,vjust=.8)
functions_overrepre_groups.R → Groups_per_component.tsv, Figure 4
suppressPackageStartupMessages({
library(igraph)
library(RCy3)#cytoscape has to be already open
})
coenriched=do.call(rbind,enriched)%>%group_by(subtype)%>%
group_map(~table(.x[,c("Description","component")]))
#matrix with the component intersections
intersection=lapply(coenriched,function(x) crossprod(t(x)))
#matrix with the component unions
union=lapply(coenriched,function(z) sapply(1:nrow(z),function(x)
sapply(1:nrow(z),function(y) sum(colSums(z[c(x,y),])>0))))#slow
#Jaccard index for the components
coenriched=lapply(1:5,function(x) intersection[[x]]/union[[x]])
names(coenriched)=unique(enriched$BP$subtype)
coenriched$Basal[1:3,1:3]
## Description
## Description Aldosterone synthesis and secretion
## Aldosterone synthesis and secretion 1
## AMPK signaling pathway 0
## anterior/posterior pattern specification 0
## Description
## Description AMPK signaling pathway
## Aldosterone synthesis and secretion 0
## AMPK signaling pathway 1
## anterior/posterior pattern specification 0
## Description
## Description anterior/posterior pattern specification
## Aldosterone synthesis and secretion 0
## AMPK signaling pathway 0
## anterior/posterior pattern specification 1
#don't forget 1-x to have identical sets together
trees=lapply(coenriched,function(x) hclust(as.dist(1-x)))
groups=lapply(trees,function(x) cutree(x,h=0))
groups=do.call(rbind,lapply(1:5,function(y)
data.frame(cbind("subtype"=unique(enriched$BP$subtype)[y],
"Description"=names(groups[[y]]),
"group"=groups[[y]]))))
#write_tsv(groups,"Groups_per_component.tsv")
head(groups)
functionsJaccard.R → funcJaccI.tsv, figure 3A
features=lapply(enriched,function(x)
x%>%dplyr::select(subtype,Description,geneID)%>%
separate_rows(geneID,sep='/',convert=T))
#which functions map the same genes for a pair of datasets?????
i=lapply(features,function(x)
x%>%distinct(subtype,Description)%>%count(Description)%>%
filter(n>1)%>%dplyr::select(Description)%>%unlist)
#use only functions found in at least 2 datsets
features=lapply(1:2,function(x)
features[[x]]%>%filter(Description%in%i[[x]]))
head(features)
## [[1]]
## # A tibble: 16,284 × 3
## subtype Description geneID
## <chr> <chr> <chr>
## 1 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA3
## 2 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHAC1
## 3 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA6
## 4 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA12
## 5 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA1
## 6 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA10
## 7 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA4
## 8 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA2
## 9 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA5
## 10 Basal cell-cell adhesion via plasma-membrane adhesion molecules PCDHA9
## # … with 16,274 more rows
##
## [[2]]
## # A tibble: 310 × 3
## subtype Description geneID
## <chr> <chr> <int>
## 1 Basal Dopaminergic synapse 2903
## 2 Basal Dopaminergic synapse 5579
## 3 Basal Dopaminergic synapse 5519
## 4 Basal Dopaminergic synapse 773
## 5 Basal Dopaminergic synapse 3763
## 6 Basal Dopaminergic synapse 2774
## 7 Basal Aldosterone synthesis and secretion 8912
## 8 Basal Aldosterone synthesis and secretion 2778
## 9 Basal Aldosterone synthesis and secretion 775
## 10 Basal Aldosterone synthesis and secretion 196883
## # … with 300 more rows
#jaccard index
jacc=function(set1,set2){
inter=length(intersect(set1,set2))
un=length(union(set1,set2))
return(inter/un)}
intersect_functions=function(data,fun){
set=data%>%filter(Description==fun)
subtys=unique(set$subtype)
#paired contrast
mat=do.call(rbind,lapply(1:(length(subtys)-1),function(x)
do.call(rbind,lapply((1+x):length(subtys),function(y)
c("func"=fun,"pair1"=subtys[x],"pair2"=subtys[y],
"index"=jacc(set$geneID[set$subtype==subtys[x]],
set$geneID[set$subtype==subtys[y]]))))))
return(mat)}
jindx=lapply(features,function(x)
data.frame(do.call(rbind,lapply(unique(x$Description),function(y)
intersect_functions(x,y)))))
#toplot
jindx=do.call(rbind,lapply(1:2,function(x)
cbind("type"=c("biological process","KEGG pathway")[x],
jindx[[x]])))
jindx$pair=paste(jindx$pair1,jindx$pair2)
#write_tsv(x=jindx,"funcJaccI.tsv")
head(jindx)
temp=jindx[jindx$index>0.5,]
#png("enrichJacc1.png")
ggplot(jindx,aes(x=as.numeric(index),y=pair))+
geom_violin()+geom_jitter(height=0.1)+
ggrepel::geom_text_repel(data=temp,
aes(y=pair,x=as.numeric(index),label=func),hjust=1,nudge_x=1.33,xlim=c(0,1.26),
force=33,force_pull=.05)+theme(text=element_text(size=16),
axis.ticks.y=element_blank(),panel.background=element_blank(),
plot.margin=unit(c(.1,5,.1,.1),"cm"))+coord_cartesian(clip="off")+
ylab("")+xlab("jaccard index")+geom_vline(xintercept=0.5,
color="firebrick")
#dev.off()