--- title: 'Mapping homologous gene symbols' author: 'Mathew Chamberlain' date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Mapping homologous gene symbols} %\VignetteEngine{knitr::rmarkdown_notangle} %\VignetteEncoding{UTF-8} --- This vignette shows how to map homologous gene symbols from Macaca fascicularis to the human genome. ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk knitr::knit_hooks$set(time_it = local({ now <- NULL function(before, options) { if (before) { now <<- Sys.time() } else { res <- difftime(Sys.time(), now, units = "secs") all_times[[options$label]] <<- res } } })) knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, time_it = TRUE ) orthologs <- readRDS(file = "fls/orthologs.rds") ``` # Load the essential packages ```{r setup packages, eval = F} require(biomaRt) require(tidyverse) require(SignacX) ``` After mapping the reads to the Macaca fasccicularis genome, we load the genes, which were generated from the output of the [cellranger pipeline](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) from 10X Genomics. # Load Macaca fascicularis genes ```{r load genes, echo = T, eval = T} features.tsv <- read.delim("fls/features.tsv.gz", header=FALSE, stringsAsFactors=FALSE) head(features.tsv) ``` # Map homologous genes ```{r map genes, eval = F} # get human and cyno gene symbols human.R95 <- useMart(host='jan2019.archive.ensembl.org', biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl') cyno.R95 <- useMart(host='jan2019.archive.ensembl.org', biomart='ENSEMBL_MART_ENSEMBL', dataset='mfascicularis_gene_ensembl') # values = listeENSID: list of cynomolgus ensembl IDs to be retrieved. listeENSID = features.tsv$V1 orthologs <- getLDS(attributes = c("ensembl_gene_id","external_gene_name"), filters = "ensembl_gene_id", values = listeENSID, mart = cyno.R95, attributesL = c("hgnc_symbol","ensembl_gene_id"), martL = human.R95) orthologs <- as_tibble(orthologs) colnames(orthologs) <- c("GeneID","cynoSymbol","HumanSymbol","HumanGeneID") # keep only 1:1 orthologs one2one <- orthologs %>% group_by(GeneID) %>% summarise(n()) %>% filter(`n()`<=1) %>% dplyr::select(`GeneID`) %>% pull() orthologs <- orthologs %>% filter(GeneID %in% one2one) # replace empty HumanSymbol (where there isn't a gene name for a homologous gene) with NA orthologs <- orthologs %>% mutate(HumanSymbol=replace(HumanSymbol,HumanSymbol=="", NA)) orthologs <- orthologs %>% mutate(cynoSymbol=replace(cynoSymbol,cynoSymbol=="", NA)) idx = match(listeENSID, orthologs$GeneID) xx = orthologs$HumanSymbol[idx] logik = !is.na(orthologs$HumanSymbol[idx]) # sum(logik) returns 17,365 homologous genes xx = xx[logik] orthologs = orthologs[!is.na(orthologs$HumanSymbol),] # note: several of these genes are not unique mappers; we will aggregate them later or make them unique. To aggregate, where E is the sparse expression matrix with rownames set to xx: # E = Matrix.utils::aggregate.Matrix(E, row.names(E)) ``` Now we have mapped homologous gene symbols across species: ```{r display orthologs} head(orthologs) ``` After mapping homologous genes, Signac can be used to classify the cell types. ```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "fls/tutorial_times_Crabeating_vignette.csv") saveRDS(orthologs, file = "fls/orthologs.rds") ```
**Session Info** ```{r, echo=FALSE} sessionInfo() ```