## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, warning = TRUE, comment = "#>") run_vignette <- requireNamespace("GSEABase", quietly = TRUE) && requireNamespace("GO.db", quietly = TRUE) && requireNamespace("reactome.db", quietly = TRUE) && requireNamespace("org.Hs.eg.db", quietly = TRUE) && requireNamespace("ggplot2", quietly = TRUE) && requireNamespace("forcats", quietly = TRUE) ## ----setupr, message=FALSE---------------------------------------------------- library("BaseSet", quietly = TRUE) library("dplyr", quietly = TRUE) ## ----prepare_GO, message=FALSE, eval=run_vignette----------------------------- # We load some libraries library("org.Hs.eg.db", quietly = TRUE) library("GO.db", quietly = TRUE) library("ggplot2", quietly = TRUE) # Prepare the data h2GO_TS <- tidySet(org.Hs.egGO) h2GO <- as.data.frame(org.Hs.egGO) ## ----evidence_ontology, eval=run_vignette, fig.alt="Horizontal bar plot with the number of evidence code per ontology."---- library("forcats", include.only = "fct_reorder2", quietly = TRUE) h2GO %>% group_by(Evidence, Ontology) %>% count(name = "Freq") %>% ungroup() %>% mutate(Evidence = fct_reorder2(Evidence, Ontology, -Freq), Ontology = case_match(Ontology, "CC" ~ "Cellular Component", "MF" ~ "Molecular Function", "BP" ~ "Biological Process", .default = NA)) %>% ggplot() + geom_col(aes(Evidence, Freq)) + facet_grid(~Ontology) + theme_minimal() + coord_flip() + labs(x = element_blank(), y = element_blank(), title = "Evidence codes for each ontology") ## ----nEvidence_plot, eval=run_vignette, fig.alt="Bar plot with the number of relationships that with a given number of evidences: most only have 1 evidence code but some have up to 7"---- h2GO_TS %>% relations() %>% group_by(elements, sets) %>% count(sort = TRUE, name = "Annotations") %>% ungroup() %>% count(Annotations, sort = TRUE) %>% ggplot() + geom_col(aes(Annotations, n)) + theme_minimal() + labs(x = "Evidence codes", y = "Annotations", title = "Evidence codes for each annotation", subtitle = "in human") + scale_x_continuous(breaks = 1:7) ## ----numbers, eval=run_vignette----------------------------------------------- # Add all the genes and GO terms h2GO_TS <- add_elements(h2GO_TS, keys(org.Hs.eg.db)) %>% add_sets(grep("^GO:", keys(GO.db), value = TRUE)) sizes_element <- element_size(h2GO_TS) %>% arrange(desc(size)) sum(sizes_element$size == 0) sum(sizes_element$size != 0) sizes_set <- set_size(h2GO_TS) %>% arrange(desc(size)) sum(sizes_set$size == 0) sum(sizes_set$size != 0) ## ----plots_GO, eval=run_vignette, fig.alt=c("Histogram of number of sets per element: there are many genes on many ontologies.", "Histogram of number of elements per set: There is one set that is huge but then there many than have few elements.")---- sizes_element %>% filter(size != 0) %>% ggplot() + geom_histogram(aes(size), binwidth = 1) + theme_minimal() + labs(x = "# sets per element", y = "Count") sizes_set %>% filter(size != 0) %>% ggplot() + geom_histogram(aes(size), binwidth = 1) + theme_minimal() + labs(x = "# elements per set", y = "Count") ## ----distr_sizes, eval=run_vignette------------------------------------------- head(sizes_set, 10) ## ----fuzzy_setup, eval=run_vignette------------------------------------------- nr <- h2GO_TS %>% relations() %>% dplyr::select(sets, Evidence) %>% distinct() %>% mutate(fuzzy = case_match(Evidence, "EXP" ~ 0.9, "IDA" ~ 0.8, "IPI" ~ 0.8, "IMP" ~ 0.75, "IGI" ~ 0.7, "IEP" ~ 0.65, "HEP" ~ 0.6, "HDA" ~ 0.6, "HMP" ~ 0.5, "IBA" ~ 0.45, "ISS" ~ 0.4, "ISO" ~ 0.32, "ISA" ~ 0.32, "ISM" ~ 0.3, "RCA" ~ 0.2, "TAS" ~ 0.15, "NAS" ~ 0.1, "IC" ~ 0.02, "ND" ~ 0.02, "IEA" ~ 0.01, .default = 0.01)) %>% dplyr::select(sets = "sets", elements = "Evidence", fuzzy = fuzzy) ## ----fuzzy_setup2, eval=run_vignette------------------------------------------ ts <- h2GO_TS %>% relations() %>% dplyr::select(-Evidence) %>% rbind(nr) %>% tidySet() %>% mutate_element(Type = ifelse(grepl("^[0-9]+$", elements), "gene", "evidence")) ## ----cardinality, eval=run_vignette------------------------------------------- ts %>% dplyr::filter(Type != "Gene") %>% cardinality() %>% arrange(desc(cardinality)) %>% head() ## ----size_go, eval=run_vignette----------------------------------------------- ts %>% filter(sets %in% c("GO:0008152", "GO:0003674", "GO:0005575"), Type != "gene") %>% set_size() ## ----evidence_go, eval=run_vignette------------------------------------------- go_terms <- c("GO:0008152", "GO:0003674", "GO:0005575") ts %>% filter(sets %in% go_terms & Type != "gene") ## ----prepare_reactome, eval=run_vignette-------------------------------------- # We load some libraries library("reactome.db") # Prepare the data (is easier, there isn't any ontoogy or evidence column) h2p <- as.data.frame(reactomeEXTID2PATHID) colnames(h2p) <- c("sets", "elements") # Filter only for human pathways h2p <- h2p[grepl("^R-HSA-", h2p$sets), ] # There are duplicate relations with different evidence codes!!: summary(duplicated(h2p[, c("elements", "sets")])) h2p <- unique(h2p) # Create a TidySet and h2p_TS <- tidySet(h2p) %>% # Add all the genes add_elements(keys(org.Hs.eg.db)) ## ----numbers_pathways, eval=run_vignette-------------------------------------- sizes_element <- element_size(h2p_TS) %>% arrange(desc(size)) sum(sizes_element$size == 0) sum(sizes_element$size != 0) sizes_set <- set_size(h2p_TS) %>% arrange(desc(size)) ## ----pathways_plots, eval=run_vignette, fig.alt=c("Genes per pathway.", "Pathway per gene.")---- sizes_element %>% filter(size != 0) %>% ggplot() + geom_histogram(aes(size), binwidth = 1) + scale_y_log10() + theme_minimal() + labs(x = "# sets per element", y = "Count") sizes_set %>% ggplot() + geom_histogram(aes(size), binwidth = 1) + scale_y_log10() + theme_minimal() + labs(x = "# elements per set", y = "Count") ## ----distr_sizes_pathways, eval=run_vignette---------------------------------- head(sizes_set, 10) ## ----sessionInfo, echo=FALSE-------------------------------------------------- sessionInfo()