This is an extract of the scripts in step 4. The idea is to illustrate graph construction from one of the functions over-represented in the SGCCA

1. Get a matrix with all the features in the SGCCA components where the function is over-represented

Run in the same folder than subtype.eigeNormi & the enrichment results

Rscript get_matrix.R GO:0007156 Basal → GO:0007156.Basal.mtrx

suppressPackageStartupMessages({library(tidyverse)})

fun="GO:0007156"#args[1]
subty="Basal"#args[2]

#get the components linked to the function
if(length(grep("GO",fun))>0){
 enrich=read_tsv("BP.enrichment",show_col_types = FALSE)
}else{
 enrich=read_tsv("KEGG.enrichment",show_col_types = FALSE)  
}
comp=enrich%>%filter(ID==fun&subtype==subty)%>%
              dplyr::select(component)%>%unlist
head(comp)
## component1 component2 component3 component4 component5 component6 
## "comp 100" "comp 109" "comp 110" "comp 111" "comp 124"  "comp 13"
#get the features selected in those components
selected=read_tsv(paste(subty,"stable",sep='.'),,show_col_types = FALSE)
features=selected%>%filter(component%in%comp)%>%
         distinct(variable)%>%unlist
#print(paste("Function has",length(features),"features associated",sep=' '))
head(features)
##    variable1    variable2    variable3    variable4    variable5    variable6 
## "cg00001747" "cg00001793" "cg00002719" "cg00011616" "cg00015699" "cg00016406"
#get the data
data=data.table::fread(paste(subty,"eigeNormi",sep='.'))
## Warning in data.table::fread(paste(subty, "eigeNormi", sep = ".")): Detected
## 128 column names but the data has 129 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
data=data[data$V1%in%features,]
#write_tsv(data,paste(fun,subty,"mtrx",sep='.'))#needed for puma
data[1:3,1:3]

2. Run ARACNE with obtained matrix

https://github.com/CSB-IG/ARACNE-multicore

run.sh file was adjusted to the matrix names changing:

line 12 → nom=$(echo $ftsv | cut -d’.’ -f 1,2)

line 23 → n=$( (cd adj; ls) | head -1 | cut -d’.’ -f 3 )

mv GO\:0007156.Basal.mtrx ARACNE-multicore-master/launch/
cd ARACNE-multicore-master/launch/
bash run.sh GO\:0007156.Basal.mtrx
head GO\:0007156.Basal.sort
mv GO\:0007156.Basal.sort ../../
cd ../../
## mv: cannot stat 'GO:0007156.Basal.mtrx': No such file or directory
## Processing MI calculations for: GO:0007156.Basal.mtrx ...
## Column Index Name: V1
## ARACNe time: .03333333333333333333 minutes.
## Parameters to join: GO:0007156.Basal 164 node.list V1
## join ADJ matriz time: 0 minutes.
## Moving adjancy matrix
## Creating SIF: 0 minutes.
## Sorting: .01666666666666666666 minutes.
## cg14805347   cg13198321  0.658929
## cg13198321   cg14805347  0.658929
## cg23953697   cg02105519  0.63503
## cg02105519   cg23953697  0.63503
## cg27252395   cg04515001  0.623192
## cg04515001   cg27252395  0.623192
## cg27252395   cg05347898  0.584007
## cg05347898   cg27252395  0.584007
## cg05347898   cg04515001  0.578078
## cg04515001   cg05347898  0.578078

If something fails probably is due to an incorrect parsing of the parameters to join, they must be: matrix name, ARACNE kernel width & column index name

3. Keep just the edges with MI above a threshold that is specific for the edge type

Rscript MIfilter.R GO:0007156.Basal.sortGO:0007156.Basal.filtered.alt

If ARACNE MI distributions are not significantly different, MIfilter.R outputs file.filtered & file.filtered.alt.

file.filtered.alt uses a single threshold no matter the edge type

edges=read_tsv("GO:0007156.Basal.filtered.alt",show_col_types=F)
head(edges)

4. Plot the interactions returned by MIfilter.R

Run in the same folder than the results of differential expression/methylation & the file mapping CpG probes to genes

Also Cytoscape has to be already open

Rscript plotGraph.R GO:0007156.Basal.filtered.altGO:0007156.Basal.cys

5. Get info about the nodes and edges plotted

Cytoscape has to be already open

Rscript annotateGraph.R GO:0007156.Basal.cys

suppressPackageStartupMessages(library(igraph))
library(RCy3)

net="GO:0007156.Basal.cys"#commandArgs(trailingOnly=TRUE)
id=unlist(strsplit(net,'.',fixed=T))[1]
subty=unlist(strsplit(net,'.',fixed=T))[2]

openSession(net)
## Opening /home/msoledad/param-rm/GO:0007156.Basal.cys...
g=createIgraphFromNetwork(subty)
chosen=read_tsv("chosen.tsv",show_col_types=F)
head(chosen)
#get the cluster that contains the function
chosen=chosen%>%filter(ID==id&subtype==subty)
cl=chosen$group
funs=chosen$Description

#get related functions
groups=read_tsv("Groups_per_component.tsv",show_col_types=F)
groups=groups%>%filter(subtype==subty,group==cl)
if(nrow(groups)>1){
    funs=groups%>%select(Description)%>%unlist}
funs
## [1] "homophilic cell adhesion via plasma membrane adhesion molecules"
names=data.frame(cbind("name"=V(g)$name,"label"=V(g)$label))

library(biomaRt)
mart=useEnsembl("ensembl",dataset="hsapiens_gene_ensembl",
    version=105)
## Ensembl site unresponsive, trying useast mirror
#https://dec2021.archive.ensembl.org
myannot=getBM(attributes = c("hgnc_symbol","wikigene_description"),
    filter="hgnc_symbol",values=names$label,mart=mart)
head(myannot)
enriched=list(BP=read_tsv("BP.enrichment",show_col_types=F),
    KEGG=read_tsv("KEGG.enrichment",show_col_types=F))
known_genes=lapply(enriched,function(x) 
    x%>%filter(Description%in%funs&subtype==subty)%>%
    dplyr::select(Description,geneID)%>%separate_rows(geneID,sep='/'))
head(known_genes$BP)
#Checking functions in pubmed
library(rentrez)

#search only the nodes that aren't responsible for the enrichment
query=names$label[!names$label%in%known_genes$label]
query=unique(unlist(strsplit(query,",")))
knownfun=lapply(funs,function(x) #for every function in the net
            lapply(query,function(y) #and every non-functional feature
            #search papers connecting them
            entrez_search(db="pubmed",term=paste(x,y,sep=' AND '))))
as.data.frame(do.call(rbind,lapply(knownfun,function(x) do.call(rbind,lapply(x,function(y) cbind(paste(y$ids,collapse=','),y$QueryTranslation))))))[1:2,]
#Checking edges on pubmed
edges=data.frame(get.edgelist(g))
colnames(edges)[2]="name"
edges=merge(edges,names,by="name")[,2:3]
colnames(edges)[2]="pair1"
colnames(edges)[1]="name"
edges=unique(merge(edges,names,by="name")[,2:3])
colnames(edges)[2]="pair2"
edges=edges%>%separate_rows(pair2,sep=',',convert=T)%>%separate_rows(pair2,sep=',',convert=T)
known=apply(edges,1,function(x) 
    entrez_search(db="pubmed",
                  term=paste(x[1],x[2],sep=' AND ')))
head(known,2)
## [[1]]
## Entrez search result with 0 hits (object contains 0 IDs and no web_history object)
##  Search term (as translated):  RPS6 AND cg00001747 
## 
## [[2]]
## Entrez search result with 0 hits (object contains 0 IDs and no web_history object)
##  Search term (as translated):  PHGDH AND cg00123762