Genome Annotation and Visualisation using R and Bioconductor

Mark Dunning

Last modified: 16 Mar 2016

Previously

Aims for this session

biomaRt

biomaRt

Connecting to biomaRt

library(biomaRt)
head(listMarts(host="www.ensembl.org"), 5)    
##                biomart               version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 84
## 2     ENSEMBL_MART_SNP  Ensembl Variation 84
## 3 ENSEMBL_MART_FUNCGEN Ensembl Regulation 84
## 4    ENSEMBL_MART_VEGA               Vega 64
ensembl <- useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl")

Connecting to biomaRt

ensembl <- useEnsembl(biomart="ensembl",dataset="hsapiens_gene_ensembl")
head(listDatasets(ensembl),10)
##                           dataset
## 1          oanatinus_gene_ensembl
## 2         cporcellus_gene_ensembl
## 3         gaculeatus_gene_ensembl
## 4  itridecemlineatus_gene_ensembl
## 5          lafricana_gene_ensembl
## 6         choffmanni_gene_ensembl
## 7          csavignyi_gene_ensembl
## 8             fcatus_gene_ensembl
## 9        rnorvegicus_gene_ensembl
## 10         psinensis_gene_ensembl
##                                   description         version
## 1      Ornithorhynchus anatinus genes (OANA5)           OANA5
## 2             Cavia porcellus genes (cavPor3)         cavPor3
## 3      Gasterosteus aculeatus genes (BROADS1)         BROADS1
## 4  Ictidomys tridecemlineatus genes (spetri2)         spetri2
## 5          Loxodonta africana genes (loxAfr3)         loxAfr3
## 6         Choloepus hoffmanni genes (choHof1)         choHof1
## 7              Ciona savignyi genes (CSAV2.0)         CSAV2.0
## 8         Felis catus genes (Felis_catus_6.2) Felis_catus_6.2
## 9          Rattus norvegicus genes (Rnor_6.0)        Rnor_6.0
## 10     Pelodiscus sinensis genes (PelSin_1.0)      PelSin_1.0

An example query

Say we want to find out more information about a given Entrez gene(s).

head(listFilters(ensembl), 5)     
##              name     description
## 1 chromosome_name Chromosome name
## 2           start Gene Start (bp)
## 3             end   Gene End (bp)
## 4      band_start      Band Start
## 5        band_end        Band End
flt <- listFilters(ensembl)
flt[grep("entrez", flt[,1]),]
##                               name
## 28                 with_entrezgene
## 29 with_entrezgene_transcript_name
## 84                      entrezgene
## 85      entrezgene_transcript_name
##                                                  description
## 28                                     with EntrezGene ID(s)
## 29                        with EntrezGene Transcript Name(s)
## 84                            EntrezGene ID(s) [e.g. 115286]
## 85 EntrezGene transcript name ID(s) [e.g. CTD-2350J17.1-002]

Attributes

head(listAttributes(ensembl), 25)
##                               name
## 1                  ensembl_gene_id
## 2            ensembl_transcript_id
## 3               ensembl_peptide_id
## 4                  ensembl_exon_id
## 5                      description
## 6                  chromosome_name
## 7                   start_position
## 8                     end_position
## 9                           strand
## 10                            band
## 11                transcript_start
## 12                  transcript_end
## 13        transcription_start_site
## 14               transcript_length
## 15                  transcript_tsl
## 16        transcript_gencode_basic
## 17               transcript_appris
## 18              external_gene_name
## 19            external_gene_source
## 20        external_transcript_name
## 21 external_transcript_source_name
## 22                transcript_count
## 23           percentage_gc_content
## 24                    gene_biotype
## 25              transcript_biotype
##                                   description
## 1                             Ensembl Gene ID
## 2                       Ensembl Transcript ID
## 3                          Ensembl Protein ID
## 4                             Ensembl Exon ID
## 5                                 Description
## 6                             Chromosome Name
## 7                             Gene Start (bp)
## 8                               Gene End (bp)
## 9                                      Strand
## 10                                       Band
## 11                      Transcript Start (bp)
## 12                        Transcript End (bp)
## 13             Transcription Start Site (TSS)
## 14 Transcript length (including UTRs and CDS)
## 15             Transcript Support Level (TSL)
## 16                   GENCODE basic annotation
## 17                          APPRIS annotation
## 18                       Associated Gene Name
## 19                     Associated Gene Source
## 20                 Associated Transcript Name
## 21               Associated Transcript Source
## 22                           Transcript count
## 23                               % GC content
## 24                                  Gene type
## 25                            Transcript type

Forming the query

entrez <- c("673", "837")
myfilter <- "entrezgene"
attr = c("entrezgene", "hgnc_symbol", "ensembl_gene_id","description")
allAttr <- listAttributes(ensembl)
attr %in% allAttr[,1]
## [1] TRUE TRUE TRUE TRUE
myInfo <- getBM(filters="entrezgene",
    values=entrez,
    attributes=attr,
    mart=ensembl)

View the results

myInfo
##   entrezgene hgnc_symbol ensembl_gene_id
## 1        673        BRAF ENSG00000157764
## 2        837       CASP4 ENSG00000196954
##                                                                        description
## 1 B-Raf proto-oncogene, serine/threonine kinase [Source:HGNC Symbol;Acc:HGNC:1097]
## 2                                     caspase 4 [Source:HGNC Symbol;Acc:HGNC:1505]

Using multiple filters

myfilters <- c("chromosome_name", "start", "end")
myvalues <- list(16, 1100000, 1250000)
head(allAttr[grep("start", allAttr[,1]),])
##                         name                    description
## 7             start_position                Gene Start (bp)
## 11          transcript_start          Transcript Start (bp)
## 13  transcription_start_site Transcription Start Site (TSS)
## 130              pirsf_start                    PIRSF start
## 133        superfamily_start              SUPERFAMILY start
## 136              smart_start                    SMART start
attr <- c("ensembl_gene_id", "hgnc_symbol","entrezgene","chromosome_name", "start_position", "end_position")

Make the query

myInfo <- getBM(attributes = attr,
  filters = myfilters,
  values=myvalues,mart=ensembl)
myInfo
##    ensembl_gene_id hgnc_symbol entrezgene chromosome_name start_position
## 1  ENSG00000260532                     NA              16        1111627
## 2  ENSG00000273551                     NA              16        1148224
## 3  ENSG00000172236      TPSAB1       7177              16        1240696
## 4  ENSG00000260702                     NA              16        1103280
## 5  ENSG00000196557     CACNA1H       8912              16        1153241
## 6  ENSG00000261294                     NA              16        1206560
## 7  ENSG00000277010                     NA              16        1223639
## 8  ENSG00000197253       TPSB2      64499              16        1227272
## 9  ENSG00000260403                     NA              16        1156976
## 10 ENSG00000259910                     NA              16        1159548
## 11 ENSG00000116176       TPSG1      25823              16        1221651
##    end_position
## 1       1113399
## 2       1148754
## 3       1242554
## 4       1105461
## 5       1221771
## 6       1207124
## 7       1224143
## 8       1230184
## 9       1157974
## 10      1160176
## 11      1225257

Reversing the query

myfilters <- "ensembl_gene_id"
values = c("ENSG00000261713","ENSG00000261720","ENSG00000181791")
attr <- c("ensembl_gene_id","chromosome_name","start_position", "end_position","entrezgene")
getBM(attributes = attr, filters = myfilters, values = values,
ensembl
)
##   ensembl_gene_id chromosome_name start_position end_position entrezgene
## 1 ENSG00000261713              16        1064093      1078731     146336
## 2 ENSG00000261720              16        1065240      1066502         NA

Bioconductor Annotation Resources

Organism-level Packages

library(org.Hs.eg.db)

Filtering an organism package

keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
length(keys(org.Hs.eg.db,keytype="ENTREZID"))
## [1] 59500
head(keys(org.Hs.eg.db,keytype="ENTREZID"))
## [1] "1"  "2"  "3"  "9"  "10" "11"

Selecting attributes

columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

Example query

entrez <- c("673", "837")
select(org.Hs.eg.db, keys=entrez,
  keytype="ENTREZID",
  columns=c("SYMBOL","CHRLOC","CHRLOCEND"))
## Warning in .deprecatedColsMessage(): Accessing gene location information via 'CHR','CHRLOC','CHRLOCEND'
##   is deprecated. Please use a range based accessor like genes(), or
##   select() with columns values like TXCHROM and TXSTART on a TxDb or
##   OrganismDb object instead.
## 'select()' returned 1:many mapping between keys and columns
##   ENTREZID SYMBOL     CHRLOC CHRLOCCHR  CHRLOCEND
## 1      673   BRAF -140433812         7 -140624564
## 2      837  CASP4 -104813593        11 -104827422
## 3      837  CASP4 -104813593        11 -104839325

Another query

Give me the Symbols of every gene with GO ontology GO:0003674

head(select(org.Hs.eg.db, keys = "GO:0003674",
keytype = "GO", columns = "SYMBOL"))
##           GO EVIDENCE ONTOLOGY  SYMBOL
## 1 GO:0003674       ND       MF    A1BG
## 2 GO:0003674       ND       MF   AP2A2
## 3 GO:0003674       ND       MF    AIF1
## 4 GO:0003674       ND       MF    AIM1
## 5 GO:0003674       ND       MF   BCL7A
## 6 GO:0003674       ND       MF CEACAM1

Managing gene models: GenomicFeatures

Pre-built packages

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

The transcriptDB object

txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1

keys for the object

keytypes(txdb)
## [1] "CDSID"    "CDSNAME"  "EXONID"   "EXONNAME" "GENEID"   "TXID"    
## [7] "TXNAME"
columns(txdb)
##  [1] "CDSCHROM"   "CDSEND"     "CDSID"      "CDSNAME"    "CDSSTART"  
##  [6] "CDSSTRAND"  "EXONCHROM"  "EXONEND"    "EXONID"     "EXONNAME"  
## [11] "EXONRANK"   "EXONSTART"  "EXONSTRAND" "GENEID"     "TXCHROM"   
## [16] "TXEND"      "TXID"       "TXNAME"     "TXSTART"    "TXSTRAND"  
## [21] "TXTYPE"

Making a query

select(txdb, keys=entrez,
keytype="GENEID",
columns=c("TXID",
"TXCHROM", "TXSTART",
"TXEND"))
## 'select()' returned 1:many mapping between keys and columns
##   GENEID  TXID TXCHROM   TXSTART     TXEND
## 1    673 31502    chr7 140433813 140624564
## 2    837 44976   chr11 104813594 104827422
## 3    837 44977   chr11 104813594 104839325
## 4    837 44978   chr11 104815475 104839325
## 5    837 44979   chr11 104819547 104839325
## 6    837 44980   chr11 104822124 104839325

Querying the exons

mygene <- select(txdb, keys = "673", keytype = "GENEID",
columns = c("EXONID", "EXONCHROM", "EXONSTART","EXONEND","EXONSTRAND"))
## 'select()' returned 1:many mapping between keys and columns
mygene
##    GENEID EXONID EXONCHROM EXONSTRAND EXONSTART   EXONEND
## 1     673 112179      chr7          - 140624366 140624564
## 2     673 112178      chr7          - 140549911 140550012
## 3     673 112177      chr7          - 140534409 140534672
## 4     673 112176      chr7          - 140508692 140508795
## 5     673 112175      chr7          - 140507760 140507862
## 6     673 112174      chr7          - 140501212 140501360
## 7     673 112173      chr7          - 140500162 140500281
## 8     673 112172      chr7          - 140494108 140494267
## 9     673 112171      chr7          - 140487348 140487384
## 10    673 112170      chr7          - 140482821 140482957
## 11    673 112169      chr7          - 140481376 140481493
## 12    673 112168      chr7          - 140477791 140477875
## 13    673 112167      chr7          - 140476712 140476888
## 14    673 112166      chr7          - 140453987 140454033
## 15    673 112165      chr7          - 140453075 140453193
## 16    673 112164      chr7          - 140449087 140449218
## 17    673 112163      chr7          - 140439612 140439746
## 18    673 112162      chr7          - 140433813 140434570

Exon Structure

GRanges(mygene$EXONCHROM, IRanges(mygene$EXONSTART,
mygene$EXONEND),strand=mygene$EXONSTRAND,exon_id=mygene$EXONID)
## GRanges object with 18 ranges and 1 metadata column:
##        seqnames                 ranges strand   |   exon_id
##           <Rle>              <IRanges>  <Rle>   | <integer>
##    [1]     chr7 [140624366, 140624564]      -   |    112179
##    [2]     chr7 [140549911, 140550012]      -   |    112178
##    [3]     chr7 [140534409, 140534672]      -   |    112177
##    [4]     chr7 [140508692, 140508795]      -   |    112176
##    [5]     chr7 [140507760, 140507862]      -   |    112175
##    ...      ...                    ...    ... ...       ...
##   [14]     chr7 [140453987, 140454033]      -   |    112166
##   [15]     chr7 [140453075, 140453193]      -   |    112165
##   [16]     chr7 [140449087, 140449218]      -   |    112164
##   [17]     chr7 [140439612, 140439746]      -   |    112163
##   [18]     chr7 [140433813, 140434570]      -   |    112162
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Convenience Functions

trs <- transcripts(txdb)
trs
## GRanges object with 82960 ranges and 2 metadata columns:
##                 seqnames           ranges strand   |     tx_id     tx_name
##                    <Rle>        <IRanges>  <Rle>   | <integer> <character>
##       [1]           chr1 [ 11874,  14409]      +   |         1  uc001aaa.3
##       [2]           chr1 [ 11874,  14409]      +   |         2  uc010nxq.1
##       [3]           chr1 [ 11874,  14409]      +   |         3  uc010nxr.1
##       [4]           chr1 [ 69091,  70008]      +   |         4  uc001aal.1
##       [5]           chr1 [321084, 321115]      +   |         5  uc001aaq.2
##       ...            ...              ...    ... ...       ...         ...
##   [82956] chrUn_gl000237   [    1,  2686]      -   |     82956  uc011mgu.1
##   [82957] chrUn_gl000241   [20433, 36875]      -   |     82957  uc011mgv.2
##   [82958] chrUn_gl000243   [11501, 11530]      +   |     82958  uc011mgw.1
##   [82959] chrUn_gl000243   [13608, 13637]      +   |     82959  uc022brq.1
##   [82960] chrUn_gl000247   [ 5787,  5816]      -   |     82960  uc022brr.1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Retrieve all exons at once

exs <- exons(txdb)
exs
## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand   |   exon_id
##                     <Rle>      <IRanges>  <Rle>   | <integer>
##        [1]           chr1 [11874, 12227]      +   |         1
##        [2]           chr1 [12595, 12721]      +   |         2
##        [3]           chr1 [12613, 12721]      +   |         3
##        [4]           chr1 [12646, 12697]      +   |         4
##        [5]           chr1 [13221, 14409]      +   |         5
##        ...            ...            ...    ... ...       ...
##   [289965] chrUn_gl000241 [35706, 35859]      -   |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      -   |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      +   |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      +   |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      -   |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Group by genes

exons <- exonsBy(txdb, "gene")
is(exons)
## [1] "GRangesList"                      "CompressedList"                  
## [3] "GenomicRangesList"                "GenomicRangesORGRangesList"      
## [5] "List"                             "GenomicRangesORGenomicRangesList"
## [7] "Vector"                           "Annotated"
length(exons)
## [1] 23459

see also transcriptsBy, intronsByTranscript, fiveUTRsByTranscript, threeUTRsByTranscript

How to subset this object

exons[["673"]]
## GRanges object with 18 ranges and 2 metadata columns:
##        seqnames                 ranges strand   |   exon_id   exon_name
##           <Rle>              <IRanges>  <Rle>   | <integer> <character>
##    [1]     chr7 [140433813, 140434570]      -   |    112162        <NA>
##    [2]     chr7 [140439612, 140439746]      -   |    112163        <NA>
##    [3]     chr7 [140449087, 140449218]      -   |    112164        <NA>
##    [4]     chr7 [140453075, 140453193]      -   |    112165        <NA>
##    [5]     chr7 [140453987, 140454033]      -   |    112166        <NA>
##    ...      ...                    ...    ... ...       ...         ...
##   [14]     chr7 [140507760, 140507862]      -   |    112175        <NA>
##   [15]     chr7 [140508692, 140508795]      -   |    112176        <NA>
##   [16]     chr7 [140534409, 140534672]      -   |    112177        <NA>
##   [17]     chr7 [140549911, 140550012]      -   |    112178        <NA>
##   [18]     chr7 [140624366, 140624564]      -   |    112179        <NA>
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Implications

Examples

Retrieve the subset of reads that overlap a particular gene.

gr <- exons[["49"]]
library(GenomicAlignments)
## Loading required package: SummarizedExperiment
## Loading required package: Biostrings
## Loading required package: XVector
## Loading required package: Rsamtools
system.time(bam.sub <- readGAlignments(file = mybam,
    use.names = TRUE, param = ScanBamParam(which = gr)))
##    user  system elapsed 
##   0.103   0.000   0.105

Examine the output

bam.sub
## GAlignments object with 1917 alignments and 0 metadata columns:
##                     seqnames strand       cigar    qwidth     start
##                        <Rle>  <Rle> <character> <integer> <integer>
##    SRR076681.239386       22      -       1S67M        68  51176595
##    SRR078452.251117       22      -         68M        68  51176597
##    SRR076696.585674       22      -         68M        68  51176597
##    SRR078501.824091       22      +         68M        68  51176605
##    SRR078568.818440       22      +         68M        68  51176606
##                 ...      ...    ...         ...       ...       ...
##     SRR076132.39409       22      -         68M        68  51183674
##    SRR076898.252854       22      -         68M        68  51183679
##    SRR076176.943759       22      -         68M        68  51183687
##     SRR076340.66381       22      -         68M        68  51183699
##   SRR076936.1030386       22      -         68M        68  51183724
##                           end     width     njunc
##                     <integer> <integer> <integer>
##    SRR076681.239386  51176661        67         0
##    SRR078452.251117  51176664        68         0
##    SRR076696.585674  51176664        68         0
##    SRR078501.824091  51176672        68         0
##    SRR078568.818440  51176673        68         0
##                 ...       ...       ...       ...
##     SRR076132.39409  51183741        68         0
##    SRR076898.252854  51183746        68         0
##    SRR076176.943759  51183754        68         0
##     SRR076340.66381  51183766        68         0
##   SRR076936.1030386  51183791        68         0
##   -------
##   seqinfo: 86 sequences from an unspecified genome

Retrieving gene sequences

library(BSgenome.Hsapiens.UCSC.hg19)
hg19 <- BSgenome.Hsapiens.UCSC.hg19
system.time(seqs <- getSeq(hg19, exons[["49"]]))
##    user  system elapsed 
##   0.249   0.000   0.252
seqs
##   A DNAStringSet instance of length 6
##     width seq
## [1]    89 AGTGCCAGGAGTATGGTTGAGATGCTACCAA...CCGTGGTTGCTAAAGATAACGCCACGTGTGA
## [2]   204 TGGCCCCTGTGGGTTACGGTTCAGGCAAAAC...TCACTGCTGCTCACTGCTTCGTCGGCAAAAA
## [3]   284 TAATGTGCATGACTGGAGACTGGTTTTCGGA...GTGGCCGGCTGGGGATATATAGAAGAGAAAG
## [4]   666 TAATGTGCATGACTGGAGACTGGTTTTCGGA...TGTGGCCGTATGACAGTGCCTTCCACTCTCT
## [5]   146 CCCCCAGGCCATCATCTATACTGATGGAGGC...GTATCCTGTAGGCAAGATCGACACCTGCCAG
## [6]   647 GGAGACAGCGGCGGGCCTCTCATGTGCAAAG...ATAAATAAATAAACATATATATATAGATATA
width(exons[["49"]])
## [1]  89 204 284 666 146 647

Alternative counting

bam <- readGAlignments(file = mybam)
countOverlaps(gr, bam)
## [1]  37  46 175 182 212 297

Other sources of annotation

library(rtracklayer)
download.file("http://www.nimblegen.com/downloads/annotation/ez_exome_v3/SeqCapEZ_Exome_v3.0_Design_Annotation_files.zip",destfile="Nimblgen-regions.zip")
unzip("Nimblgen-regions.zip")
nimb <- import("SeqCap_EZ_Exome_v3_primary.bed")
nimb
## UCSC track 'target_region'
## UCSCData object with 242232 ranges and 1 metadata column:
##            seqnames               ranges strand   |
##               <Rle>            <IRanges>  <Rle>   |
##        [1]     chr1       [14426, 14627]      *   |
##        [2]     chr1       [14638, 14883]      *   |
##        [3]     chr1       [14903, 15103]      *   |
##        [4]     chr1       [15670, 15990]      *   |
##        [5]     chr1       [16590, 17074]      *   |
##        ...      ...                  ...    ... ...
##   [242228]     chrY [59355662, 59356146]      *   |
##   [242229]     chrY [59356745, 59357067]      *   |
##   [242230]     chrY [59357675, 59357797]      *   |
##   [242231]     chrY [59357856, 59358098]      *   |
##   [242232]     chrY [59358152, 59358273]      *   |
##                                                                   name
##                                                            <character>
##        [1] gn|RP11-34P13.2;ens|ENSG00000227232;vega|OTTHUMG00000000958
##        [2] gn|RP11-34P13.2;ens|ENSG00000227232;vega|OTTHUMG00000000958
##        [3] gn|RP11-34P13.2;ens|ENSG00000227232;vega|OTTHUMG00000000958
##        [4] gn|RP11-34P13.2;ens|ENSG00000227232;vega|OTTHUMG00000000958
##        [5] gn|RP11-34P13.2;ens|ENSG00000227232;vega|OTTHUMG00000000958
##        ...                                                         ...
##   [242228]       gn|WASH6P;ens|ENSG00000182484;vega|OTTHUMG00000022677
##   [242229]       gn|WASH6P;ens|ENSG00000182484;vega|OTTHUMG00000022677
##   [242230]       gn|WASH6P;ens|ENSG00000182484;vega|OTTHUMG00000022677
##   [242231]       gn|WASH6P;ens|ENSG00000182484;vega|OTTHUMG00000022677
##   [242232]       gn|WASH6P;ens|ENSG00000182484;vega|OTTHUMG00000022677
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

Practical time

Exploring RNA-seq results

Visualisation

More-advanced graphics in R

ggplot2 overview

ggplot2 cheat-sheet

cheatsheet

Plot Comparison

x <- 1:10
y <- 2*x
plot(x,y)

library(ggplot2)
df <-data.frame(x,y)
ggplot(df, aes(x=x,y=y)) + geom_point()

Plot construction

library(reshape2)
df <- data.frame(A = rnorm(5,3), B=rnorm(5,1))
df[1:3,]
##          A         B
## 1 4.190048 1.1398759
## 2 3.706241 1.0264480
## 3 2.665927 0.4725375
df2 <- melt(df)
## No id variables; using all as measure variables
df2
##    variable      value
## 1         A  4.1900481
## 2         A  3.7062414
## 3         A  2.6659272
## 4         A  1.7585936
## 5         A  3.1936497
## 6         B  1.1398759
## 7         B  1.0264480
## 8         B  0.4725375
## 9         B  1.2188030
## 10        B -1.5492710

Plot construction

ggplot(df2, aes(x = variable,y=value)) + geom_boxplot()

Plot construction

ggplot(df2, aes(x = variable,y=value,fill=variable)) + geom_boxplot()

Updating a plot

df <- data.frame(A = rnorm(5,3), B=rnorm(5,1),C=rnorm(5,2))
df2 <- melt(df)
## No id variables; using all as measure variables
ggplot(df2, aes(x = variable,y=value,fill=variable)) + geom_boxplot()

Introducing ggbio

The autoplot function

library(ggbio)
autoplot(bam.sub)

Can choose a summary statistic

autoplot(bam.sub,stat="coverage")

Plotting gene structure

autoplot(txdb,which=exons[["49"]])

Combining plots

tracks(autoplot(txdb,which=exons[["49"]]),
autoplot(bam.sub,stat="coverage"))

Different layouts available

manhat

Karyogram

Karyogram

kgram

Circular

circos

Practical time