---
title: 'RNA-seq Analysis 2:'
author: "Pietà Schofield"
subtitle: Functional Annotation Analysis
output:
  html_document:
    code_folding: show
    fig_caption: yes
    highlight: textmate
    theme: spacelab
    toc: yes
    toc_depth: 3
    toc_float:
      collapsed: no
      smooth_scroll: yes
  pdf_document:
    toc: yes
    toc_depth: '3'
---


```{r setup,echo=FALSE,include=FALSE}
require(RefManageR)
BibOptions(style="html",longnamesfirst=F)
if(!file.exists(".pieta.bib")){
 download.file("http://www.compbio.dundee.ac.uk/user/pschofield/biblio.bib",".pieta.bib")
}
bib <- ReadBib(".pieta.bib",check=FALSE)
require(knitr)
opts_chunk$set( message=F, warning=F, comment=NA)
#
# This little bit of trickery is rather complex knitr-foo thanks to Ramnath Vaidyanathan
#
# http://ramnathv.github.io/posts/verbatim-chunks-knitr/index.html
#
# It enables code chunks to be included with the surrounding brackets and options
# for illustrative purposes
#
knitr::opts_chunk$set(tidy = F)
knit_hooks$set(source = function(x, options){
  if (!is.null(options$verbatim) && options$verbatim){
    opts = gsub(",\\s*verbatim\\s*=\\s*TRUE\\s*", "", options$params.src)
    bef = sprintf('\n\n    ```{r %s}\n', opts, "\n")
    stringr::str_c(
      bef, 
      knitr:::indent_block(paste(x, collapse = '\n'), "    "), 
      "\n    ```\n"
    )
  } else {
    stringr::str_c("\n\n```", tolower(options$engine), "\n", 
      paste(x, collapse = '\n'), "\n```\n\n"
    )
  }
})
#
# a little bit of R from Yihui Xieto put in verbatim inline R
#
# http://stackoverflow.com/questions/20409172/how-to-display-verbatim-inline-r-code-with-backticks-using-rmarkdown
#
rinline <- function(code) {
  sprintf('``` `r %s` ```', code)
}

if(Sys.info()["sysname"]!="Windows"){
  homeDir <- paste0(system("echo ${HOME}",intern=T),"/")
}else{
  homeDir <- paste0(getwd(),"/")
}
rnaLocal <- paste0(homeDir,"RNAseq/")
```

# Functional Annotation Packages

There is an increasing number of packages for accessing and analysing functional annotation enrichments associated with set of significant genes. For example looking for over represented Gene Ontology (GO) terms or over represented pathways from KEGG. On of the simplest to use is the package [STRINGdb](https://www.bioconductor.org/packages/3.3/bioc/html/STRINGdb.html) 

We need the output from the feature analysis for example the results from the differential gene expression analysis. This might exist already as a saved RData object but here we will quickly recreate it using edgeR from the featureCounts


```{r doEdgeR, echo=TRUE}
# download the data if it is not already on the system
# 
if(!file.exists(paste0(rnaLocal,"Rdata/fullSC.RData"))){
  dir.create(paste0(rnaLocal,"Rdata/"),recursive=T,showWarnings = F)
  download.file("http://www.compbio.dundee.ac.uk/user/pschofield/data/fullSC.RData",
                paste0(rnaLocal,"Rdata/fullSC.RData"))
}
load( paste0(rnaLocal,"Rdata/fullSC.RData"))
countMatrix <- fc$counts
colnames(countMatrix)<-gsub("[.]*","",colnames(countMatrix))
colnames(countMatrix)<-gsub("sbam$","",colnames(countMatrix))
colnames(countMatrix)<-gsub("Snf2","Mut",colnames(countMatrix))

# if dont already have the results calculate them otherwise load them
if(!file.exists(paste0(rnaLocal,"Rdata/dgeSC.RData"))){
  require(edgeR)
  conditions <- as.factor(substr(colnames(countMatrix),1,1))
  conditions <- relevel(conditions,ref = "W")
  design <- model.matrix( ~ conditions -1)
  dge <- DGEList(countMatrix, group=conditions)
  dge <- calcNormFactors(dge,method="TMM")
  dge <- estimateDisp(dge,design)
  fit <- glmFit(dge,design)
  lrt <- glmLRT(fit,contrast = c(-1,1))
  tt <- topTags(lrt, n=10000)
  dge_scerevisiae <- tt$table[,c("FDR","logFC")]
  dge_scerevisiae$geneId <- rownames(dge_scerevisiae)
  save(dge_scerevisiae,file=paste0(rnaLocal,"Rdata/dgeSC.RData"))
}else{
  load(paste0(rnaLocal,"Rdata/dgeSC.RData"))
}
```

A reminder of what the results look like

```{r stringDBdata, echo=FALSE}
#load(dataFile)
knitr::kable(head(dge_scerevisiae))
```

One of the oldest challenges in bioinformatics is the problem caused by id proliferations and inconsistencies. For example the same gene will have different id in the ENSEMBL, NCBI and Entrezgene databases. It is possible to get some of these from the ENSEMBL database using biomaRt

```{r biomaRt}
require(biomaRt)

# get a list of the available species from ENSEMBL
ens <- useMart("ensembl")
marts <- listDatasets(ens)

knitr::kable(marts[which(grepl("^S",marts$description)),])
```

So there is a dataset called `r paste0(marts$dataset[which(grepl("^Sacc",marts$description))])` that is the one we want

```{r linkBM}
scmart <- useDataset("scerevisiae_gene_ensembl", ens)
```

We can see a full list of the data available in the *scerevisiae_gene_ensembl* dataset.

```{r headAtt}
attributePages(scmart)
knitr::kable(head(listAttributes(scmart,"feature_page")))
knitr::kable(head(listAttributes(scmart,"sequences")))
```

We can get some of this information for the signicantly differentially expressed genes

```{r getBMsig}
# again if scripting this it may be something we don't want to do everytime we run the script
# but being mindful of the fact we might change parameters in the upstream analysis that would
# mean the set of significant genes changes and this step would need refreshing
# 
if(!file.exists(paste0(homeDir,"Rdata/annSC.RData"))){
  annData <- getBM(
     attributes = c("ensembl_gene_id",
                    "external_gene_name",
                    "description",
                    "entrezgene",
                    "refseq_dna",
                    "refseq_mrna"),
    filter="ensembl_gene_id",
    values=rownames(dge_scerevisiae)[which(dge_scerevisiae$FDR<=0.05)],
    mart=scmart)
  save(annData,file=paste0(rnaLocal,"Rdata/annSC.RData"))
}else{
  load(paste0(rnaLocal,"Rdata/annSC.RData"))
}

head(annData)
```

We can combine this data with the gene expression data using merge

```{r mergeDB}
annRes <- merge(annData,dge_scerevisiae,by.x="ensembl_gene_id",by.y="geneId")

knitr::kable(head(annRes[order(abs(annRes$logFC),decreasing = T),]))
```

And satisfyingly and fortunately the biggest foldchange between the WT and the SNF2 deletion mutant is the SNF2 gene itself.

## STRINGdb

The first step after loading the packages is to create an object that links to STRING, this requires knowing the species id of the organism of interest of which there are over 2000. 

```{r stringDBex }
require(STRINGdb)

# List the names and ids of the organisms
species <- get_STRING_species(version="10", species_name=NULL)

# find the organisms starting with "Sacc"
species[which(grepl("^Sacc",species$official_name)),]

# create a new STRING_db object for Saccharomyces cerevisiae
string_db <- STRINGdb$new(version="10",species=4932)
```

Once we have the STRINGdb we need to convert or map the ids we have from ENSEMBL for our genes to the gene ids in STRINGdb. We can use either the entrez gene id or the ensembl gene id but they have differening success rates so better to use the one that maximises the number of mapped genes

```{r stringDBmap}
# map to STRING ids
annMap = string_db$map(annRes, "entrezgene", removeUnmappedRows = TRUE )

annMap = string_db$map(annRes, "ensembl_gene_id", removeUnmappedRows = TRUE )
```

STRINGdb then lets us draw a gene interaction network for a set of genes. Here we select the STRING_id for the top 25 differentially expressed genes 

```{r ppint}
# get the STRING_id for the top 50 genes
hits = annMap$STRING_id[1:25]  

# plot the STRING network png 
string_db$plot_network( hits )
```

It is also possible to add to this diagram the foldchange information in the form of coloured halos around the notes that represent the foldchange as an intensity on a Red to Green colour scale.

```{r stringPoshPlot}
# filter by p-value and add a color column 
# (i.e. green down-regulated gened and red for up-regulated genes)
annMap05 = string_db$add_diff_exp_color( subset(annMap, FDR<0.05), logFcColStr="logFC" )    

# post payload information to the STRING server
payload_id = string_db$post_payload( annMap05$STRING_id, colors=annMap05$color )

# display a STRING network png with the "halo"
string_db$plot_network( hits, payload_id=payload_id )
```

## GO and KEGG Annotations

STRINGdb has the function `get_enrichment()` that can calculate enriched annotations from GO, KEGG, InterPro and pfam for sets of genes

```{r stringGO}
hits <- example1_mapped$STRING_id
######## compute enrichment in GO annotations for Biological Process ########
goProcess <- string_db$get_enrichment(hits,  category = "Process", 
                                      methodMT = "fdr", iea = TRUE )
write.csv(goProcess[which(goProcess$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/goProcessR.csv"))
goFunction <- string_db$get_enrichment(hits,  category = "Function" , 
                                      methodMT = "fdr", iea = TRUE )
write.csv(goFunction[which(goFunction$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/goFunctionR.csv"))
goComponent <- string_db$get_enrichment(hits,  category = "Component" , 
                                      methodMT = "fdr", iea = TRUE )
write.csv(goComponent[which(goComponent$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/goComponentR.csv"))
keggPath <- string_db$get_enrichment(hits,  category = "KEGG" , 
                                      methodMT = "fdr", iea = TRUE )
write.csv(keggPath[which(keggPath$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/keggPathR.csv"))
interPro <- string_db$get_enrichment(hits,  category = "InterPro" , 
                                      methodMT = "fdr", iea = TRUE )
write.csv(interPro[which(interPro$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/interProR.csv"))
pfamDom <- string_db$get_enrichment(hits,  category = "Pfam" , 
                                      methodMT = "fdr", iea = TRUE )
write.csv(pfamDom[which(pfamDom$pvalue_fdr<=0.05),],
          file=paste0(rnaLocal,"RData/pfamDomR.csv"))
```

## Visualisation

There are also packages for visualising these kinds of results. One such is the [pathview]() package that is used to visualise expression data on KEGG pathways

```{r pathview, eval=TRUE}
require(pathview)
gene.data <- as.numeric(annRes$logFC)
names(gene.data) <- annRes$geneId
pathID <- "00260"
pview <- pathview(gene.data=gene.data,
                  gene.idtype="kegg",
                  pathway.id=pathID,
                  species="sce",
                  out.suffix="kegg",
                  kegg.native=T,
                  same.layer=T)
```

![KEGG pathway diagram](`r paste0(getwd(),"/sce00260.kegg.png")`)

```{r }
sessionInfo()
```