This is an extract of the scripts in steps 3 to produce the figures before section 3.5

1. Check stability

selectedFeatures.R → supplementary figure 3, Basal.stable, Her2.stable, …

suppressPackageStartupMessages({
library(tidyverse)
#library(biomaRt)
#library(rentrez)
library(ggplot2)})

#load sgcca.R output
files=list.files("..",full.names = T)#chech folder order
files=files[grep("selected",files)]
sets=lapply(files,read_tsv,show_col_types = FALSE)
names(sets)=gsub("../","",gsub(".selected","",files))
head(sets["Basal"])

## $Basal
## # A tibble: 44,105 × 7
##    variable   component   loading omic     initial entrezgene_id ID    
##    <chr>      <chr>         <dbl> <chr>      <dbl>         <dbl> <chr> 
##  1 cg00001583 comp 77    0.0150   CpGs   0.00233            2494 IlmnID
##  2 cg00001583 comp 51   -0.0557   CpGs  -0.000954           2494 IlmnID
##  3 cg00001583 comp 124   0.0125   CpGs   0.00278            2494 IlmnID
##  4 cg00001747 comp 3    -0.0962   CpGs   0.00631              NA <NA>  
##  5 cg00001747 comp 49   -0.00716  CpGs  -0.00105              NA <NA>  
##  6 cg00001747 comp 70    0.0709   CpGs   0.0000535            NA <NA>  
##  7 cg00001747 comp 111  -0.125    CpGs  -0.00124              NA <NA>  
##  8 cg00001793 comp 1    -0.0148   CpGs  -0.00820            2120 IlmnID
##  9 cg00001793 comp 80    0.000969 CpGs   0.000375           2120 IlmnID
## 10 cg00001793 comp 61   -0.167    CpGs   0.00693            2120 IlmnID
## # … with 44,095 more rows

#load joinSubsamples.R output
files=list.files()
files=files[grep("subsampled",files)]
stability=lapply(files,read_tsv,show_col_types = FALSE)
names(stability)=gsub(".subsampled","",files)
#names(stability)==names(sets)#to be sure
stability=lapply(stability,function(x) as.data.frame(cbind(
        "variable"=x$feature,
        #sum across subsamples
        "stability"=rowSums(!is.na(x[,3:ncol(x)])))))
#add stability info to sets
subtypes=names(sets)
sets=lapply(subtypes,function(x) 
    merge(sets[[x]],stability[[x]],by="variable"))
names(sets)=subtypes#to be sure

head(sets[["Basal"]])

##################PLOT
#join all together in a data.frame
stability=do.call(rbind,lapply(1:5,function(x) 
    as.data.frame(cbind("subtype"=names(stability)[x],
                                    stability[[x]]))))
#nice omic identifiers
stability$omic=gsub("E","transcript",
                gsub("h","miRNA",
                 gsub("c","CpG",substr(stability$variable,1,1))))
#force the order I wanna
stability$omic=factor(stability$omic,levels=c("CpG",
                                                "transcript",
                                                "miRNA"))
#categorical instead of numbers
stability$tokeep=c("non stable","stable")[
                    as.factor(stability$stability>=70)]#arbitrary threshold
#barplot
#png("stability.png")
stability%>%count(subtype,omic,tokeep)%>%
ggplot(aes(y=as.numeric(n),x=subtype,fill=tokeep))+
geom_bar(stat="identity",position="fill")+facet_wrap(~omic)+
xlab("")+ylab("%")+theme(text=element_text(size=16),
    panel.background=element_blank(),legend.position="bottom",
    legend.title = element_blank(),
    axis.text.x=element_text(angle=45),
    legend.margin=margin(-40,0,0,0))+#or the legend fall far bottom
scale_fill_manual(values=c("gray47","brown1"))

#dev.off()
#sets=lapply(sets,function(x) x%>%filter(stability>=70))
#lapply(names(sets),function(x)
#   write_tsv(x=sets[[x]],file=paste(x,"stable",sep='.')))

2. Functional enrichment (over-representation)

functions_overrepre.R → BP.enrich, KEGG.enrich

#slow lines, tend to die

3. Compare the sets of enriched functions

functions_overrepre_plots.R L1-L29 → figure 2

library(UpSetR)

bp=read_tsv("BP.enrichment",show_col_types = FALSE)
k=read_tsv("KEGG.enrichment",show_col_types = FALSE)
enriched=list(BP=bp,KEGG=k)
get_sets=function(enriched_table,exclusive){
#matrix subtypes vs function
 g=table(unique(enriched_table[,c("ID","subtype")]))
#to get exclusive functions for another time
 if(exclusive==T){
    g=g[rowSums(g==0)==4,]
 }
 #upset() needs a list of IDs
 sets=apply(g,2,function(y) names(which(y>0)))
return(sets)}
functions=lapply(enriched,get_sets,exclusive=F)
lapply(functions,function(x) upset(fromList(x),text.scale=1.5,order.by="degree"))

## $BP

## 
## $KEGG

3. Check if the categories of biological processes are over-represented

functions_overrepre_plots.R L32-L127 → figure 3B

suppressPackageStartupMessages({
library(GSEABase)
library(GO.db)
#library(ggplot2)
})
# as in https://support.bioconductor.org/p/128407/
#and https://support.bioconductor.org/p/83375/
fl="http://current.geneontology.org/ontology/subsets/goslim_agr.obo"
#subset used for humans in PMC6800510
slim <- getOBOCollection(fl)#53 ids only
df = select(GO.db, keys(GO.db), "ONTOLOGY")#ontology of all GOids

## 'select()' returned 1:1 mapping between keys and columns

#table(df$ONTOLOGY[df$GOID%in%ids(slim)])#found all slim ids
gomap=as.list(GOBPOFFSPRING)#descendents off every goid
found=names(gomap)[names(gomap)%in%ids(slim)]
#[1] 21
#sum(found%in%df$GOID[df$ONTOLOGY=="BP"])
#[1] 21 #actually only descendents of BP goids
gomap=gomap[names(gomap)%in%ids(slim)]
#format to easy to read data frames
slim=as.data.frame(do.call(rbind,lapply(1:21,function(x) 
    cbind(names(gomap)[x],gomap[[x]]))))
colnames(slim)=c("parent","child")
head(slim)

slimnames=as.data.frame(sapply(unique(slim$parent),function(x) 
    Term(GOTERM[[x]])))
slimnames$parent=rownames(slimnames)
colnames(slimnames)[1]="name"
head(slimnames)

exclusive=lapply(enriched,get_sets,exclusive=T)
exclusive$BP$Basal

##  [1] "GO:0001889" "GO:0003299" "GO:0008584" "GO:0010977" "GO:0014887"
##  [6] "GO:0014898" "GO:0021983" "GO:0030199" "GO:0030516" "GO:0030517"
## [11] "GO:0035282" "GO:0035860" "GO:0043586" "GO:0045058" "GO:0045765"
## [16] "GO:0046546" "GO:0048638" "GO:0048675" "GO:0050772" "GO:0051147"
## [21] "GO:0061387" "GO:0070593" "GO:1901342" "GO:1902742" "GO:1902894"
## [26] "GO:2000727"

#count categories per subtype
BP.classes=as.data.frame(do.call(rbind,lapply(1:5,
    function(x) cbind(names(exclusive$BP)[x],exclusive$BP[[x]]))))
colnames(BP.classes)=c("subtype","child")
BP.classes=merge(merge(BP.classes,slim,by="child"),
    slimnames,by="parent")
head(BP.classes)

bias=function(classes,subtype,class){
    totals=table(classes[,c(class,subtype)])
    s=colSums(totals)
    ps=p.adjust(apply(totals,1,function(x) 
    fisher.test(rbind(x,s-x),simulate.p.value=T)$p.val),"bonferroni")#categories change depending on the correction used
    #TRUE needed coz > 2 categories
    return(rownames(totals)[ps<0.05])}
i=bias(BP.classes,3,4)
i

## [1] "cellular component organization" "DNA metabolic process"          
## [3] "establishment of localization"

BP.classes%>%count(subtype,name)%>%
ggplot(aes(x=n,y=name,fill=subtype))+
geom_bar(stat="identity",position="fill")+
scale_x_continuous(labels=scales::percent)+
 theme(text=element_text(size=18),axis.ticks=element_blank(),
    panel.background=element_blank(),legend.title=element_blank(),
    legend.position="bottom",legend.margin=margin(-25,0,0,-150))+
 xlab("")+ylab("")+scale_fill_viridis_d(option = "plasma")+
annotate("text",x=1.08,y=sort(unique(BP.classes$name)),
    label=BP.classes%>%count(name)%>%dplyr::select(n)%>%unlist)+
annotate("text",y=i,x=-.05,label="*",size=8,vjust=.8)

4. Group the functions that are over-represented in the same SGCCA components

functions_overrepre_groups.R → Groups_per_component.tsv, Figure 4

suppressPackageStartupMessages({
library(igraph)
library(RCy3)#cytoscape has to be already open
})

coenriched=do.call(rbind,enriched)%>%group_by(subtype)%>%
    group_map(~table(.x[,c("Description","component")]))
#matrix with the component intersections
intersection=lapply(coenriched,function(x) crossprod(t(x)))
#matrix with the component unions
union=lapply(coenriched,function(z) sapply(1:nrow(z),function(x) 
    sapply(1:nrow(z),function(y) sum(colSums(z[c(x,y),])>0))))#slow
#Jaccard index for the components
coenriched=lapply(1:5,function(x) intersection[[x]]/union[[x]])
names(coenriched)=unique(enriched$BP$subtype)
coenriched$Basal[1:3,1:3]

##                                           Description
## Description                                Aldosterone synthesis and secretion
##   Aldosterone synthesis and secretion                                        1
##   AMPK signaling pathway                                                     0
##   anterior/posterior pattern specification                                   0
##                                           Description
## Description                                AMPK signaling pathway
##   Aldosterone synthesis and secretion                           0
##   AMPK signaling pathway                                        1
##   anterior/posterior pattern specification                      0
##                                           Description
## Description                                anterior/posterior pattern specification
##   Aldosterone synthesis and secretion                                             0
##   AMPK signaling pathway                                                          0
##   anterior/posterior pattern specification                                        1

#don't forget 1-x to have identical sets together
trees=lapply(coenriched,function(x) hclust(as.dist(1-x)))
groups=lapply(trees,function(x) cutree(x,h=0))
groups=do.call(rbind,lapply(1:5,function(y) 
    data.frame(cbind("subtype"=unique(enriched$BP$subtype)[y],
                     "Description"=names(groups[[y]]),
                     "group"=groups[[y]]))))
#write_tsv(groups,"Groups_per_component.tsv")
head(groups)

5. Check the feature similarity between two subtypes (or the normal tissue) with over-representation of the same function

functionsJaccard.R → funcJaccI.tsv, figure 3A

features=lapply(enriched,function(x) 
    x%>%dplyr::select(subtype,Description,geneID)%>%
    separate_rows(geneID,sep='/',convert=T))
#which functions map the same genes for a pair of datasets?????
i=lapply(features,function(x) 
    x%>%distinct(subtype,Description)%>%count(Description)%>%
    filter(n>1)%>%dplyr::select(Description)%>%unlist)
#use only functions found in at least 2 datsets
features=lapply(1:2,function(x) 
    features[[x]]%>%filter(Description%in%i[[x]]))
head(features)

## [[1]]
## # A tibble: 16,284 × 3
##    subtype Description                                               geneID 
##    <chr>   <chr>                                                     <chr>  
##  1 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA3 
##  2 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHAC1
##  3 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA6 
##  4 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA12
##  5 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA1 
##  6 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA10
##  7 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA4 
##  8 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA2 
##  9 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA5 
## 10 Basal   cell-cell adhesion via plasma-membrane adhesion molecules PCDHA9 
## # … with 16,274 more rows
## 
## [[2]]
## # A tibble: 310 × 3
##    subtype Description                         geneID
##    <chr>   <chr>                                <int>
##  1 Basal   Dopaminergic synapse                  2903
##  2 Basal   Dopaminergic synapse                  5579
##  3 Basal   Dopaminergic synapse                  5519
##  4 Basal   Dopaminergic synapse                   773
##  5 Basal   Dopaminergic synapse                  3763
##  6 Basal   Dopaminergic synapse                  2774
##  7 Basal   Aldosterone synthesis and secretion   8912
##  8 Basal   Aldosterone synthesis and secretion   2778
##  9 Basal   Aldosterone synthesis and secretion    775
## 10 Basal   Aldosterone synthesis and secretion 196883
## # … with 300 more rows

#jaccard index
jacc=function(set1,set2){
    inter=length(intersect(set1,set2))
    un=length(union(set1,set2))
return(inter/un)}
intersect_functions=function(data,fun){
    set=data%>%filter(Description==fun)
    subtys=unique(set$subtype)
    #paired contrast
    mat=do.call(rbind,lapply(1:(length(subtys)-1),function(x) 
        do.call(rbind,lapply((1+x):length(subtys),function(y) 
            c("func"=fun,"pair1"=subtys[x],"pair2"=subtys[y],
                "index"=jacc(set$geneID[set$subtype==subtys[x]],
                           set$geneID[set$subtype==subtys[y]]))))))
return(mat)}
jindx=lapply(features,function(x) 
    data.frame(do.call(rbind,lapply(unique(x$Description),function(y)
        intersect_functions(x,y)))))

#toplot
jindx=do.call(rbind,lapply(1:2,function(x) 
    cbind("type"=c("biological process","KEGG pathway")[x],
        jindx[[x]])))
jindx$pair=paste(jindx$pair1,jindx$pair2)
#write_tsv(x=jindx,"funcJaccI.tsv")
head(jindx)

temp=jindx[jindx$index>0.5,]
#png("enrichJacc1.png")
ggplot(jindx,aes(x=as.numeric(index),y=pair))+
geom_violin()+geom_jitter(height=0.1)+
ggrepel::geom_text_repel(data=temp,
 aes(y=pair,x=as.numeric(index),label=func),hjust=1,nudge_x=1.33,xlim=c(0,1.26),
 force=33,force_pull=.05)+theme(text=element_text(size=16),
 axis.ticks.y=element_blank(),panel.background=element_blank(),
 plot.margin=unit(c(.1,5,.1,.1),"cm"))+coord_cartesian(clip="off")+
ylab("")+xlab("jaccard index")+geom_vline(xintercept=0.5,
    color="firebrick")

#dev.off()