\documentclass[final]{beamer} \usepackage{setspace} \mode { \usetheme{DAG} } <>= library(knitr) opts_chunk$set( message=F, warning=F ) posterPath <- "/Users/pschofield/git_tree/dag_pages/" figPath <- paste0(posterPath,"figure/") @ \usepackage{ragged2e} \usepackage{times} \usepackage{amsmath,amssymb} \usepackage[english]{babel} \usepackage[latin1]{inputenc} \usepackage[orientation=portrait,size=a0,scale=1.5]{beamerposter} \title[Fancy Posters]{RNA-seq Data Analysis With R-bioconductor: An Exercise in Reproducable Research} \author[Schofield]{Piet\`a Schofield} \institute[DAG]{Data Analysis Group\\ Computational Biology\\ College of Life Sciences\\ University of Dundee} \date{October 2014} \addtobeamertemplate{block begin}{}{\justifying} %\graphicspath{/Users/pschofield/git_tree/dag_pages/figure/} \begin{document} \begin{frame}[fragile] \begin{columns}[t] \begin{column}{0.48\textwidth} \begin{block}{Introduction \emph{knitr}} This poster is a demonstration of reproducible research. The poster was created as a 300 line R-noweb script. The script contains all the words that appear on the poster and also all the code required to process the raw data and to produce the figures that appear on the poster. R-noweb is an augmented form of \LaTeX. Similarly \texttt{Rmarkdown} scripts can be written to produce HTML and MS Word documents. They are compiled by the R \texttt{knitr} package\\ \vspace{2mm} \begin{minipage}{0.55\textwidth} <>= setwd(posterPath) logknit <- knit("rnaseq_poster.rnw") loglatex <- system("pdflatex rnaseq_poster.tex",intern=T) system("open rnaseq_poster.pdf") @ \end{minipage} \hspace{2mm} \begin{minipage}{0.4\textwidth} For completeness this is the code that is used to compile the poster \emph{NB although this code is included in the script it is not excuted at compile time as this would create and infinitely recursive process} \end{minipage} \end{block} \begin{block}{Quality Checking \emph{seqTools}} If you have Illumina data generally it is in the form of FASTQ files. The first step is to examine the quality of the data, this can be done using the \texttt{seqTools} package.\\ First set up the paths to the raw data and use the \texttt{fastqq} function to generate quality metrics for all the files. \vspace{1mm} <>= require(pathview) require(Rsubread,quiet=TRUE) require(pvclust) require(seqTools,quiet=TRUE) require(STRINGdb,quiet=TRUE) @ <>= homePath <- "/Users/pschofield/" projPath <- paste0(homePath, "Projects/rna_seq/") fastqPath <- paste0(homePath,"Projects/BS32010_2015/web/data") reportPath <- paste0(projPath,"bam/") files <- list.files(fastqPath, pattern="^(S|F).*fq[.]gz", full=TRUE) names(files) <- sub(".fq.gz","",basename(files)) if(!exists("fq")) fq<-fastqq(files,k=6,probeLabel=names(files)) @ then visualise some of the quality metrics on all these files, here we show just a couple though it would be best to look at the metrics for each file. For example:\\ \vspace{3mm} \begin{minipage}{0.47\textwidth} View the gc content distribution of all the reads <>= plotGCcontent(fq) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.5\textwidth} View the phred quality distribution for each base by position <>= plotMergedPhredQuant(fq) @ \end{minipage}\\ \end{block} \vfill \begin{block}{Read Alignment \emph{Rsubread}} The reads must be aligned to a reference genome. We can use the bioconductor package \texttt{Rsubread} to make a reference index from a FASTA file available from the ENSEMBL website and align the reads in the FASTQ files. The \texttt{subjunc} function is used to do the alignment as it copes with reads containing splice junctions. <>= ref <- paste0(projPath,"annot/Scerevisiae_R64_1_1_dna.fa") indexname <- paste0(projPath,"annot/sc_R64") if(!file.exists(paste0(indexname,".log"))) buildindex(basename=indexname,reference=ref) @ <>= lapply(seq(1,length(files)),function(x){ subjunc(indexname,files[x],nthread=4,input_format="gzFASTQ", output_file=paste0(gsub("[.]fq.gz","",files[x]),"_subjunc.bam")) }) @ Then the aligned reads assigned to annotated features using the \texttt{featureCounts} function\\ \begin{minipage}{0.55\textwidth} <>= bamFiles <- list.files(fastqPath, pattern=".*bam$",full=T) annoFile <- paste0(projPath, "annot/Scerevisiae_R64_1_1.gtf") if(!exists("fc")){ fc <- invisible(featureCounts(bamFiles,annot.ext=annoFile, isGTFAnnotationFile=TRUE, isPairedEnd=FALSE)) } fcounts <- fc$counts colnames(fcounts) <- gsub("_subjunc[.]bam","", basename(bamFiles)) fcounts <- fcounts[rowSums(fcounts)>0,] pvcFC <- pvclust(fcounts) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.4\textwidth} <>= plot(pvcFC, main=paste0("Cluster of Sample \n", "by expression profile")) @ \end{minipage}\\ The clustering indicates that generally the replicates cluster by treatment, however one of the snf2 replicates stands out. This might indicate a problem with the replicate, or an unexplain source of biological variation and warrents further investigation. \end{block} \begin{block}{Differential Expression \emph{edgeR}} In order to perform a differential expression analysis the data files must be distinguished by treatment, or tissue or whatever factor is relevant. This information can be extracted from the file names <>= treatment=relevel(as.factor(ifelse(grepl("^W",colnames(fcounts)),"WT","SNF2")),"WT") @ There are many differential expression packages available in R-bioconductor, here we will use the \texttt{edgeR} package. The data must be transformed into a data structure used by \texttt{edgeR}.\\ \vspace{2mm} \begin{minipage}{0.55\textwidth} <>= require(edgeR, quiet=TRUE) dge <- DGEList(counts=fcounts[rowSums(fcounts)>0,], group=treatment) @ The data must also be normalized to account for the difference in library sizes and the variance model. <>= dge <- calcNormFactors(dge) @ The differential expression test can then be performed <>= require(ggplot2,quiet=TRUE) mm <- model.matrix(~treatment) dge <- estimateGLMCommonDisp(dge,mm) dge <- estimateGLMTrendedDisp(dge,mm) dge <- estimateGLMTagwiseDisp(dge,mm) fit <- glmFit(dge,mm) lrt <- glmLRT(fit,coef=2) tt <- as.data.frame(topTags(lrt,n=10000)) gp <- ggplot(tt, aes(x=logFC,y=log2(FDR))) gp <- gp + geom_point(size=1, aes(colour = logCPM)) gp <- gp + scale_colour_gradient(high = 2, low = 4 ) gp <- gp + geom_abline(intercept=log2(0.05), slope=0,colour=7) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.4\textwidth} <>= plot(gp) @ We can use the \texttt{ggplot2} package to plot the false discovery rate adjusted significance against the Log2 foldchange \end{minipage} \end{block} \end{column} \begin{column}{0.48\textwidth} \begin{block}{Significant Genes \emph{biomaRt}} The significantly differentially expressed genes can be extracted from the topTags listing first we will get the annotations for the significant genes using \texttt{biomaRt}\\ <>= require(biomaRt) if(!exists("ttAnno")){ mart <- useMart("ensembl","scerevisiae_gene_ensembl") anno <- getBM(attributes=c("ensembl_gene_id","external_gene_name", "description","entrezgene"), filter="ensembl_gene_id",values=rownames(tt)[which(tt$FDR<=0.05)],mart=mart) colnames(anno) <- c("GeneId","GeneName","Description","EntrezId") ttAnno <- merge(anno,tt[,c("logFC","FDR")],by.x="GeneId",by.y="row.names") ttAnno$Description <- paste0("\\parbox{0.5\\textwidth}{",gsub("%"," percent ",ttAnno$Description),"}") ttAnno$FDR <- sapply(ttAnno$FDR,function(x) sprintf("%e",x)) } @ \vspace{3mm} \begin{minipage}{0.7\textwidth} <>= downGenes <- ttAnno$GeneId[which(ttAnno$logFC<=0)] upGenes <- ttAnno$GeneId[which(ttAnno$logFC>=0)] @ \end{minipage} \hspace{2mm} \begin{minipage}{0.25\textwidth} <>= sumtab<-as.data.frame(cbind(Expression=c("Up regulated","Down regulated"),Count=c(length(upGenes), length(downGenes)))) kable(sumtab) @ \end{minipage}\\ \vspace{3mm} Gratifyingly the most significantly differentially expressed gene with also the highest foldchange is YOR290C (SNF2) which is strongly down regulated in the mutant. However the top five most strongly up regulated annotations are all "Dubious ORF"s. \\ <>= upTop <- ttAnno[which(ttAnno$GeneId %in% upGenes),] kable(head(upTop[rev(order(upTop$logFC)),],5),row.names=FALSE) @ \vspace{3mm} similarly for down regulated genes \vspace{3mm} <>= downTop <- ttAnno[which(ttAnno$GeneId %in% downGenes),] kable(head(downTop[order(downTop$logFC),],5),row.names=FALSE) @ \end{block} \begin{block}{Network Analysis \emph{STRINGdb}} There are now many packages in R-bioconductor that can be used to investigate gene set and pathway enrichment of the significant genes. Here we show the example of \texttt{STRINGdb} \\ \vspace{3mm} \begin{minipage}{0.4\textwidth} <>= string_db <- invisible(STRINGdb$new( version="9_05", species=4932, score_threshold=0, input_directory="")) downMap <- string_db$map(downTop,"GeneId", removeUnmappedRows=T) upMap <- string_db$map(upTop,"GeneId", removeUnmappedRows=T) allMap <- string_db$map(ttAnno,"GeneId", removeUnmappedRows=T) hits <- head(downMap$STRING_id[ order(downMap$logFC)],100) @ <>= string_db$plot_network(hits) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.55\textwidth} <>= string_db$plot_network(hits) @ \end{minipage}\\ The \texttt{STRINGdb} can also be used to look for enriched GO terms and enriched KEGG pathways\\ \vspace{3mm} \begin{minipage}{0.4\textwidth} <>= backgroundV <- allMap$STRING_id[1:2000] string_db$set_background(backgroundV) hitsUP <- upMap$STRING_id hitsDown <- downMap$STRING_id upGOBP <- string_db$get_enrichment( hitsUP, category = "Process", methodMT = "fdr", iea = TRUE ) upKEGG <- string_db$get_enrichment( hitsUP, category = "KEGG", methodMT = "fdr", iea = TRUE ) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.57\textwidth} <>= colnames(upGOBP) <- gsub("_"," ",colnames(upGOBP)) kable(head(upGOBP,4)) @ <>= colnames(upKEGG) <- gsub("_"," ",colnames(upKEGG)) kable(head(upKEGG,4)) @ \end{minipage} \end{block} \begin{block}{Pathway Analysis \emph{pathview}} \begin{minipage}{0.4\textwidth} Then using the \texttt{pathview} package it is possible to visualise the second top hit in the enriched pathway, \Sexpr{paste0("sce:",gsub("sce","",head(upKEGG[1,1])))} \\ <>= gene.data <- as.numeric(ttAnno$logFC) names(gene.data) <- ttAnno$GeneId pathID <- gsub("sce","",head(upKEGG[1,1])) pathview(gene.data=gene.data, gene.idtype="kegg", pathway.id=pathID, species="sce", out.suffix="kegg", kegg.native=T, same.layer=T) @ <>= system(paste0("mv sce",pathID,".kegg.png pathdisp_kegg.png")) @ \end{minipage} \hspace{2mm} \begin{minipage}{0.55\textwidth} \includegraphics[width=\textwidth]{pathdisp_kegg.png} \end{minipage} \end{block} \vfill \end{column} \end{columns} \end{frame} \end{document}