This is an extract of the scripts in step 4. The idea is to illustrate graph construction from one of the functions over-represented in the SGCCA
Run in the same folder than subtype.eigeNormi & the enrichment results
Rscript get_matrix.R GO:0007156 Basal → GO:0007156.Basal.mtrx
suppressPackageStartupMessages({library(tidyverse)})
fun="GO:0007156"#args[1]
subty="Basal"#args[2]
#get the components linked to the function
if(length(grep("GO",fun))>0){
enrich=read_tsv("BP.enrichment",show_col_types = FALSE)
}else{
enrich=read_tsv("KEGG.enrichment",show_col_types = FALSE)
}
comp=enrich%>%filter(ID==fun&subtype==subty)%>%
dplyr::select(component)%>%unlist
head(comp)
## component1 component2 component3 component4 component5 component6
## "comp 100" "comp 109" "comp 110" "comp 111" "comp 124" "comp 13"
#get the features selected in those components
selected=read_tsv(paste(subty,"stable",sep='.'),,show_col_types = FALSE)
features=selected%>%filter(component%in%comp)%>%
distinct(variable)%>%unlist
#print(paste("Function has",length(features),"features associated",sep=' '))
head(features)
## variable1 variable2 variable3 variable4 variable5 variable6
## "cg00001747" "cg00001793" "cg00002719" "cg00011616" "cg00015699" "cg00016406"
#get the data
data=data.table::fread(paste(subty,"eigeNormi",sep='.'))
## Warning in data.table::fread(paste(subty, "eigeNormi", sep = ".")): Detected
## 128 column names but the data has 129 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
data=data[data$V1%in%features,]
#write_tsv(data,paste(fun,subty,"mtrx",sep='.'))#needed for puma
data[1:3,1:3]
https://github.com/CSB-IG/ARACNE-multicore
run.sh file was adjusted to the matrix names changing:
line 12 → nom=$(echo $ftsv | cut -d’.’ -f 1,2)
line 23 → n=$( (cd adj; ls) | head -1 | cut -d’.’ -f 3 )
mv GO\:0007156.Basal.mtrx ARACNE-multicore-master/launch/
cd ARACNE-multicore-master/launch/
bash run.sh GO\:0007156.Basal.mtrx
head GO\:0007156.Basal.sort
mv GO\:0007156.Basal.sort ../../
cd ../../
## mv: cannot stat 'GO:0007156.Basal.mtrx': No such file or directory
## Processing MI calculations for: GO:0007156.Basal.mtrx ...
## Column Index Name: V1
## ARACNe time: .03333333333333333333 minutes.
## Parameters to join: GO:0007156.Basal 164 node.list V1
## join ADJ matriz time: 0 minutes.
## Moving adjancy matrix
## Creating SIF: 0 minutes.
## Sorting: .01666666666666666666 minutes.
## cg14805347 cg13198321 0.658929
## cg13198321 cg14805347 0.658929
## cg23953697 cg02105519 0.63503
## cg02105519 cg23953697 0.63503
## cg27252395 cg04515001 0.623192
## cg04515001 cg27252395 0.623192
## cg27252395 cg05347898 0.584007
## cg05347898 cg27252395 0.584007
## cg05347898 cg04515001 0.578078
## cg04515001 cg05347898 0.578078
If something fails probably is due to an incorrect parsing of the parameters to join, they must be: matrix name, ARACNE kernel width & column index name
Rscript MIfilter.R GO:0007156.Basal.sort → GO:0007156.Basal.filtered.alt
If ARACNE MI distributions are not significantly different, MIfilter.R outputs file.filtered & file.filtered.alt.
file.filtered.alt uses a single threshold no matter the edge type
edges=read_tsv("GO:0007156.Basal.filtered.alt",show_col_types=F)
head(edges)
Run in the same folder than the results of differential expression/methylation & the file mapping CpG probes to genes
Also Cytoscape has to be already open
Rscript plotGraph.R GO:0007156.Basal.filtered.alt → GO:0007156.Basal.cys
Cytoscape has to be already open
Rscript annotateGraph.R GO:0007156.Basal.cys
suppressPackageStartupMessages(library(igraph))
library(RCy3)
net="GO:0007156.Basal.cys"#commandArgs(trailingOnly=TRUE)
id=unlist(strsplit(net,'.',fixed=T))[1]
subty=unlist(strsplit(net,'.',fixed=T))[2]
openSession(net)
## Opening /home/msoledad/param-rm/GO:0007156.Basal.cys...
g=createIgraphFromNetwork(subty)
chosen=read_tsv("chosen.tsv",show_col_types=F)
head(chosen)
#get the cluster that contains the function
chosen=chosen%>%filter(ID==id&subtype==subty)
cl=chosen$group
funs=chosen$Description
#get related functions
groups=read_tsv("Groups_per_component.tsv",show_col_types=F)
groups=groups%>%filter(subtype==subty,group==cl)
if(nrow(groups)>1){
funs=groups%>%select(Description)%>%unlist}
funs
## [1] "homophilic cell adhesion via plasma membrane adhesion molecules"
names=data.frame(cbind("name"=V(g)$name,"label"=V(g)$label))
library(biomaRt)
mart=useEnsembl("ensembl",dataset="hsapiens_gene_ensembl",
version=105)
## Ensembl site unresponsive, trying useast mirror
#https://dec2021.archive.ensembl.org
myannot=getBM(attributes = c("hgnc_symbol","wikigene_description"),
filter="hgnc_symbol",values=names$label,mart=mart)
head(myannot)
enriched=list(BP=read_tsv("BP.enrichment",show_col_types=F),
KEGG=read_tsv("KEGG.enrichment",show_col_types=F))
known_genes=lapply(enriched,function(x)
x%>%filter(Description%in%funs&subtype==subty)%>%
dplyr::select(Description,geneID)%>%separate_rows(geneID,sep='/'))
head(known_genes$BP)
#Checking functions in pubmed
library(rentrez)
#search only the nodes that aren't responsible for the enrichment
query=names$label[!names$label%in%known_genes$label]
query=unique(unlist(strsplit(query,",")))
knownfun=lapply(funs,function(x) #for every function in the net
lapply(query,function(y) #and every non-functional feature
#search papers connecting them
entrez_search(db="pubmed",term=paste(x,y,sep=' AND '))))
as.data.frame(do.call(rbind,lapply(knownfun,function(x) do.call(rbind,lapply(x,function(y) cbind(paste(y$ids,collapse=','),y$QueryTranslation))))))[1:2,]
#Checking edges on pubmed
edges=data.frame(get.edgelist(g))
colnames(edges)[2]="name"
edges=merge(edges,names,by="name")[,2:3]
colnames(edges)[2]="pair1"
colnames(edges)[1]="name"
edges=unique(merge(edges,names,by="name")[,2:3])
colnames(edges)[2]="pair2"
edges=edges%>%separate_rows(pair2,sep=',',convert=T)%>%separate_rows(pair2,sep=',',convert=T)
known=apply(edges,1,function(x)
entrez_search(db="pubmed",
term=paste(x[1],x[2],sep=' AND ')))
head(known,2)
## [[1]]
## Entrez search result with 0 hits (object contains 0 IDs and no web_history object)
## Search term (as translated): RPS6 AND cg00001747
##
## [[2]]
## Entrez search result with 0 hits (object contains 0 IDs and no web_history object)
## Search term (as translated): PHGDH AND cg00123762