## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----echo = FALSE------------------------------------------------------------- options(crayon.enabled = FALSE, cli.num_colors = 0) ## ----------------------------------------------------------------------------- library(metasnf) class(cort_t) dim(cort_t) str(cort_t[1:5, 1:5]) cort_t[1:5, 1:5] ## ----------------------------------------------------------------------------- dim(income) str(income[1:5, ]) income[1:5, ] ## ----------------------------------------------------------------------------- df_list <- list( anxiety, depress, cort_t, cort_sa, subc_v, income, pubertal ) # The number of rows in each data frame: lapply(df_list, dim) # Whether or not each data frame has missing values: lapply(df_list, function(x) { any(is.na(x)) } ) ## ----------------------------------------------------------------------------- complete_uids <- get_complete_uids(df_list, uid = "unique_id") print(length(complete_uids)) # Reducing data frames to only common observations with no missing data anxiety <- anxiety[anxiety$"unique_id" %in% complete_uids, ] depress <- depress[depress$"unique_id" %in% complete_uids, ] cort_t <- cort_t[cort_t$"unique_id" %in% complete_uids, ] cort_sa <- cort_sa[cort_sa$"unique_id" %in% complete_uids, ] subc_v <- subc_v[subc_v$"unique_id" %in% complete_uids, ] income <- income[income$"unique_id" %in% complete_uids, ] pubertal <- pubertal[pubertal$"unique_id" %in% complete_uids, ] ## ----------------------------------------------------------------------------- # Note that you do not need to explicitly name every single named element # (data = ..., name = ..., etc.) input_dl <- data_list( list( data = cort_t, name = "cortical_thickness", domain = "neuroimaging", type = "continuous" ), list( data = cort_sa, name = "cortical_surface_area", domain = "neuroimaging", type = "continuous" ), list( data = subc_v, name = "subcortical_volume", domain = "neuroimaging", type = "continuous" ), list( data = income, name = "household_income", domain = "demographics", type = "continuous" ), list( data = pubertal, name = "pubertal_status", domain = "demographics", type = "continuous" ), uid = "unique_id" ) ## ----------------------------------------------------------------------------- summary(input_dl) ## ----------------------------------------------------------------------------- target_dl <- data_list( list(anxiety, "anxiety", "behaviour", "ordinal"), list(depress, "depressed", "behaviour", "ordinal"), uid = "unique_id" ) summary(target_dl) ## ----------------------------------------------------------------------------- set.seed(42) my_sc <- snf_config( dl = input_dl, n_solutions = 20, min_k = 20, max_k = 50 ) my_sc ## ----------------------------------------------------------------------------- my_sc$"settings_df" ## ----------------------------------------------------------------------------- head(as.data.frame(my_sc$"settings_df")) ## ----------------------------------------------------------------------------- names(my_sc) ## ----------------------------------------------------------------------------- sol_df <- batch_snf(input_dl, my_sc) sol_df ## ----------------------------------------------------------------------------- cluster_solutions <- t(sol_df) cluster_solutions ## ----------------------------------------------------------------------------- sol_aris <- calc_aris(sol_df) head(sol_aris) ## ----eval = FALSE------------------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # # To make heatmaps with metasnf # BiocManager::install("ComplexHeatmap") # # # To use heatmap shiny app (later in vignette) with metasnf # BiocManager::install("InteractiveComplexHeatmap") ## ----------------------------------------------------------------------------- meta_cluster_order <- get_matrix_order(sol_aris) # Just a vector of numbers meta_cluster_order ## ----eval = FALSE------------------------------------------------------------- # ari_hm <- meta_cluster_heatmap( # sol_aris, # order = meta_cluster_order # ) # # ari_hm ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = ari_hm, # path = "vignettes/meta_cluster_heatmap.png", # width = 330, # height = 300, # res = 100 # ) ## ----eval = FALSE------------------------------------------------------------- # shiny_annotator(ari_hm) ## ----------------------------------------------------------------------------- split_vec <- c(2, 5, 12, 17) ## ----------------------------------------------------------------------------- mc_sol_df <- label_meta_clusters( sol_df, order = meta_cluster_order, split_vector = split_vec ) mc_sol_df ## ----eval = FALSE------------------------------------------------------------- # ari_mc_hm <- meta_cluster_heatmap( # sol_aris, # order = meta_cluster_order, # split_vector = split_vec # ) # # ari_mc_hm ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = ari_mc_hm, # path = "vignettes/ari_mc_hm.png", # width = 330, # height = 300, # res = 100 # ) ## ----------------------------------------------------------------------------- # Only looking at our out-of-model p-values ext_sol_df <- extend_solutions( mc_sol_df, target_dl = target_dl ) ext_sol_df # Re-running to calculate the p-value for every single input and out-of-model # feature: ext_sol_df <- extend_solutions( mc_sol_df, dl = input_dl, target_dl = target_dl ) ext_sol_df ## ----eval = FALSE------------------------------------------------------------- # # No features are used to calculate summary p-values # ext_sol_df_no_summaries <- extend_solutions( # mc_sol_df, # dl = c(input_dl, target_dl) # ) # # # Every features are used to calculate summary p-values # ext_sol_df_all_summaries <- extend_solutions( # mc_sol_df, # target_dl = c(input_dl, target_dl) # ) ## ----eval = FALSE------------------------------------------------------------- # annotated_ari_hm <- meta_cluster_heatmap( # sol_aris, # order = meta_cluster_order, # split_vector = split_vec, # data = as.data.frame(ext_sol_df), # top_hm = list( # "Depression p-value" = "cbcl_depress_r_pval", # "Anxiety p-value" = "cbcl_anxiety_r_pval", # "Overall outcomes p-value" = "mean_pval" # ), # bottom_bar = list( # "Number of Clusters" = "nclust" # ), # annotation_colours = list( # "Depression p-value" = colour_scale( # ext_sol_df$"cbcl_depress_r_pval", # min_colour = "purple", # max_colour = "black" # ), # "Anxiety p-value" = colour_scale( # ext_sol_df$"cbcl_anxiety_r_pval", # min_colour = "green", # max_colour = "black" # ), # "Overall outcomes p-value" = colour_scale( # ext_sol_df$"mean_pval", # min_colour = "lightblue", # max_colour = "black" # ) # ) # ) ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = annotated_ari_hm, # path = "vignettes/annotated_ari_hm.png", # width = 700, # height = 500, # res = 100 # ) ## ----eval = FALSE------------------------------------------------------------- # annotation_data <- ext_sol_df |> # as.data.frame(keep_attributes = TRUE) |> # dplyr::mutate( # key_subjects_cluster_together = dplyr::case_when( # uid_NDAR_INVLF3TNDUZ == uid_NDAR_INVLDQH8ATK ~ TRUE, # TRUE ~ FALSE # ) # ) # # annotated_ari_hm2 <- meta_cluster_heatmap( # sol_aris, # order = meta_cluster_order, # split_vector = split_vec, # data = annotation_data, # top_hm = list( # "Depression p-value" = "cbcl_depress_r_pval", # "Anxiety p-value" = "cbcl_anxiety_r_pval", # "Key Subjects Clustered Together" = "key_subjects_cluster_together" # ), # bottom_bar = list( # "Number of Clusters" = "nclust" # ), # annotation_colours = list( # "Depression p-value" = colour_scale( # ext_sol_df$"cbcl_depress_r_pval", # min_colour = "purple", # max_colour = "black" # ), # "Anxiety p-value" = colour_scale( # ext_sol_df$"cbcl_anxiety_r_pval", # min_colour = "purple", # max_colour = "black" # ), # "Key Subjects Clustered Together" = c( # "TRUE" = "blue", # "FALSE" = "black" # ) # ) # ) ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = annotated_ari_hm2, # path = "vignettes/annotated_ari_hm2.png", # width = 700, # height = 500, # res = 100 # ) ## ----eval = FALSE------------------------------------------------------------- # rep_solutions <- get_representative_solutions(sol_aris, ext_sol_df) # # mc_manhattan <- mc_manhattan_plot( # rep_solutions, # dl = input_dl, # target_dl = target_dl, # hide_x_labels = TRUE, # point_size = 2, # text_size = 12, # threshold = 0.05, # neg_log_pval_thresh = 5 # ) ## ----eval = FALSE, echo = FALSE----------------------------------------------- # ggplot2::ggsave( # "vignettes/mc_manhattan.png", # mc_manhattan, # height = 4, # width = 8, # dpi = 100 # ) ## ----eval = FALSE------------------------------------------------------------- # rep_solutions_no_cort <- dplyr::select(rep_solutions, -dplyr::contains("mrisdp")) # # mc_manhattan2 <- mc_manhattan_plot( # ext_sol_df = rep_solutions_no_cort, # dl = input_dl, # target_dl = target_dl, # point_size = 4, # threshold = 0.01, # text_size = 12, # domain_colours = c( # "neuroimaging" = "cadetblue", # "demographics" = "purple", # "behaviour" = "firebrick" # ) # ) ## ----eval = FALSE, echo = FALSE----------------------------------------------- # ggplot2::ggsave( # "vignettes/mc_manhattan2.png", # mc_manhattan2, # height = 8, # width = 12, # dpi = 100 # ) ## ----eval = FALSE------------------------------------------------------------- # config_hm <- config_heatmap( # sc = my_sc, # order = meta_cluster_order, # hide_fixed = TRUE # ) ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = config_hm, # path = "vignettes/config_hm.png", # width = 400, # height = 500, # res = 75 # ) ## ----eval = FALSE------------------------------------------------------------- # annotation_data <- ext_sol_df |> # as.data.frame(keep_attributes = TRUE) |> # dplyr::mutate( # key_subjects_cluster_together = dplyr::case_when( # uid_NDAR_INVLF3TNDUZ == uid_NDAR_INVLDQH8ATK ~ TRUE, # TRUE ~ FALSE # ) # ) # # annotation_data$"clust_alg" <- as.factor(annotation_data$"clust_alg") # # annotated_ari_hm3 <- meta_cluster_heatmap( # sol_aris, # order = meta_cluster_order, # split_vector = split_vec, # data = annotation_data, # top_hm = list( # "Depression p-value" = "cbcl_depress_r_pval", # "Anxiety p-value" = "cbcl_anxiety_r_pval", # "Key Subjects Clustered Together" = "key_subjects_cluster_together" # ), # left_hm = list( # "Clustering Algorithm" = "clust_alg" # from the original settings # ), # bottom_bar = list( # "Number of Clusters" = "nclust" # also from the original settings # ), # annotation_colours = list( # "Depression p-value" = colour_scale( # ext_sol_df$"cbcl_depress_r_pval", # min_colour = "purple", # max_colour = "black" # ), # "Anxiety p-value" = colour_scale( # ext_sol_df$"cbcl_anxiety_r_pval", # min_colour = "purple", # max_colour = "black" # ), # "Key Subjects Clustered Together" = c( # "TRUE" = "blue", # "FALSE" = "black" # ) # ) # ) # # annotated_ari_hm3 ## ----------------------------------------------------------------------------- sol_df <- batch_snf(dl = input_dl, sc = my_sc, return_sim_mats = TRUE) ## ----eval = FALSE------------------------------------------------------------- # silhouette_scores <- calculate_silhouettes(sol_df) # dunn_indices <- calculate_dunn_indices(sol_df) # db_indices <- calculate_db_indices(sol_df) ## ----------------------------------------------------------------------------- ext_sol_df <- extend_solutions(sol_df, target_dl) ## ----eval = FALSE------------------------------------------------------------- # pval_hm <- pval_heatmap(ext_sol_df, order = meta_cluster_order) # # pval_hm ## ----eval = FALSE, echo = FALSE----------------------------------------------- # save_heatmap( # heatmap = pval_hm, # path = "vignettes/pval_heatmap_ordered.png", # width = 400, # height = 500, # res = 100 # ) ## ----------------------------------------------------------------------------- # All the observations present in all data frames with no NAs all_observations <- uids(input_dl) all_observations # Remove the "uid_" prefix to allow merges with the original data all_observations <- gsub("uid_", "", all_observations) # data frame assigning 80% of observations to train and 20% to test assigned_splits <- train_test_assign(train_frac = 0.8, uids = all_observations) # Pulling the training and testing observations specifically train_obs <- assigned_splits$"train" test_obs <- assigned_splits$"test" # Partition a training set train_cort_t <- cort_t[cort_t$"unique_id" %in% train_obs, ] train_cort_sa <- cort_sa[cort_sa$"unique_id" %in% train_obs, ] train_subc_v <- subc_v[subc_v$"unique_id" %in% train_obs, ] train_income <- income[income$"unique_id" %in% train_obs, ] train_pubertal <- pubertal[pubertal$"unique_id" %in% train_obs, ] train_anxiety <- anxiety[anxiety$"unique_id" %in% train_obs, ] train_depress <- depress[depress$"unique_id" %in% train_obs, ] # Partition a test set test_cort_t <- cort_t[cort_t$"unique_id" %in% test_obs, ] test_cort_sa <- cort_sa[cort_sa$"unique_id" %in% test_obs, ] test_subc_v <- subc_v[subc_v$"unique_id" %in% test_obs, ] test_income <- income[income$"unique_id" %in% test_obs, ] test_pubertal <- pubertal[pubertal$"unique_id" %in% test_obs, ] test_anxiety <- anxiety[anxiety$"unique_id" %in% test_obs, ] test_depress <- depress[depress$"unique_id" %in% test_obs, ] # A data list with just training observations train_dl <- data_list( list(train_cort_t, "cortical_thickness", "neuroimaging", "continuous"), list(train_cort_sa, "cortical_sa", "neuroimaging", "continuous"), list(train_subc_v, "subcortical_volume", "neuroimaging", "continuous"), list(train_income, "household_income", "demographics", "continuous"), list(train_pubertal, "pubertal_status", "demographics", "continuous"), uid = "unique_id" ) # A data list with training and testing observations full_dl <- data_list( list(cort_t, "cortical_thickness", "neuroimaging", "continuous"), list(cort_sa, "cortical_surface_area", "neuroimaging", "continuous"), list(subc_v, "subcortical_volume", "neuroimaging", "continuous"), list(income, "household_income", "demographics", "continuous"), list(pubertal, "pubertal_status", "demographics", "continuous"), uid = "unique_id" ) # Construct the target lists train_target_dl <- data_list( list(train_anxiety, "anxiety", "behaviour", "ordinal"), list(train_depress, "depressed", "behaviour", "ordinal"), uid = "unique_id" ) # Find a clustering solution in your training data set.seed(42) my_sc <- snf_config( train_dl, n_solutions = 5, min_k = 10, max_k = 30 ) train_sol_df <- batch_snf( train_dl, my_sc ) ext_sol_df <- extend_solutions( train_sol_df, train_target_dl ) # The first row had the lowest minimum p-value across our outcomes lowest_min_pval <- min(ext_sol_df$"min_pval") which(ext_sol_df$"min_pval" == lowest_min_pval) # Keep track of your top solution top_row <- ext_sol_df[1, ] # Use the solutions data frame from the training observations and the data list from # the training and testing observations to propagate labels to the test observations propagated_labels <- label_propagate(top_row, full_dl) head(propagated_labels) tail(propagated_labels) ## ----------------------------------------------------------------------------- propagated_labels_all <- label_propagate( ext_sol_df, full_dl ) head(propagated_labels_all) tail(propagated_labels_all)