Assuming all software dependencies and ROKET (Little et al., 2023) installation are installed, we can begin.
# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
"survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
library(package = pack,character.only = TRUE)
}
#> Loading required package: usethis
#> Registered S3 method overwritten by 'httr':
#> method from
#> print.response rmutil
# List package's exported functions
ls("package:ROKET")
#> [1] "kOT_sim_AGG" "kOT_sim_OT" "kOT_sim_REG" "kOT_sim_make" "kernTEST"
#> [6] "run_myOT" "run_myOTs"
# Fix seed
set.seed(2)
For \(i = 1,\ldots,N\) and \(g = 1,\ldots,G\), let
We would like to calculate the distance between the \(i\)th and \(j\)th samples in terms of mutated genes \(\mathbf{Z}_i\) and \(\mathbf{Z}_j\), denoted by \(d\left(\mathbf{Z}_i,\mathbf{Z}_j\right)\).
Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:
For the \(i\)th and \(j\)th samples, let
If the mass of the two vectors are equal or normalized such that \(\sum_g Z_{ig} = \sum_g Z_{jg}\), we could use balanced optimal transport.
If the mass of the two vectors are not equal, \(\sum_g Z_{ig} \neq \sum_g Z_{jg}\), we could use unbalanced optimal transport with penalty parameters.
The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:
NN
for sample size,PP
for number of pathways,GG
for number of genes, andbnd_same
the lower bound gene similarity for genes
sharing the same pathway and 1 - bnd_same
, an upper bound
on gene similarity for genes not sharing the same pathwayIdeally, PP
should be much less than GG
to
allow multiple genes to be allocated to each pathway.
# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))
# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))
# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))
# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75
# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
GENE = seq(GG))
table(GP$PATH)
#>
#> 1 2 3 4
#> 9 6 7 8
# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1
tmp_mat = t(combn(seq(GG),2))
for(ii in seq(nrow(tmp_mat))){
G1 = tmp_mat[ii,1]
G2 = tmp_mat[ii,2]
same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]
if( same )
GS[G1,G2] = runif(1,bnd_same,1)
else
GS[G1,G2] = runif(1,0,1 - bnd_same)
}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)
Let’s take a look at the gene similarity matrix.
#> Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
The function show_tile()
is inherent to this vignette
and not part of the ROKET package.
# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)
# Ensure each sample has at least one mutated gene
if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)
show_tile(MAT = ZZ,
LABEL = "Mutation Status: Gene by Sample",
TYPE = "MUT",DIGITS = 0)
# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))
We can look at the distribution of gene mutation frequencies.
freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat
ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
geom_bar(stat = "identity") +
xlab("Gene") + ylab("Mutation Frequency") +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.
DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))
hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
LABEL = "Euclidean Pairwise Distances",
TYPE = "DIST",DIGITS = 2)
To demonstrate ROKET’s functionality, the code below
will run balanced OT (pre-normalizing input vectors) between two
samples. Regardless of the values specified by LAMBDA1
and
LAMBDA2
arguments, we need to set
balance = TRUE
. The OT cost matrix argument corresponds to
1 - GS
, one minus the gene similarity matrix.
# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
TYPE = "DIST",LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
Let’s try again but with unbalanced OT and \(\lambda = 0.5\). We need to set
balance = FALSE
and specify LAMBDA1 = 0.5
and
LAMBDA2 = 0.5
.
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
ROKET can run optimal transport calculations across all \(N\) choose 2 samples. Below is code to run balanced OT.
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
LABEL = "Balanced OT Distances",
TYPE = "DIST",DIGITS = 1)
We can run calculations for various \(\lambda\) values. For shorthand, \(\lambda\) = Inf corresponds to balanced OT.
LAMs = c(0,0.5,1.0,5.0)
for(LAM in LAMs){
# LAM = LAMs[2]
BAL = ifelse(LAM == 0,TRUE,FALSE)
LAM2 = ifelse(BAL,1,LAM)
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
LABEL = ifelse(BAL,"OT Distances (Balanced)",
sprintf("OT Distances (Lambda = %s)",LAM))
LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))
gg = show_tile(MAT = outOTs[ord,ord],
LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
print(gg)
DD[,,LABEL2] = outOTs
rm(outOTs)
}
We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.
nms = dimnames(DD)[[3]]; nms
#> [1] "EUC" "OT_Balanced" "OT_LAM0.5" "OT_LAM1" "OT_LAM5"
for(nm in nms){
# nm = nms[2]
hout = hclust(as.dist(DD[,,nm]))
hdend = as.dendrogram(hout)
dend_data = dendro_data(hdend,type = "rectangle")
gg = ggplot(dend_data$segments) +
geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
ggtitle(nm) + xlab("") + ylab("") +
geom_text(data = dend_data$labels,
mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
theme_dendro() + coord_flip() +
theme(plot.title = element_text(hjust = 0.5))
print(gg)
rm(hout,hdend,dend_data,gg)
}
The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.
For a binary or continuous outcome, \(Y_i\), fitted with a generalized linear model, we have
\[ g\left(E\left[Y_i \middle| \mathbf{X}_i,\mathbf{Z}_i\right]\right) = \mathbf{X}_i^\text{T}\mathbf{\beta} + f(\mathbf{Z}_i) = \mathbf{X}_i^\text{T}\mathbf{\beta} + \mathbf{K}_i^\text{T}\mathbf{\alpha} \]
and for Cox proportional hazards for survival outcomes, we have
\[ h(t;\mathbf{X}_i,\mathbf{Z}_i) = h_0(t) \exp\left\{\mathbf{X}_i^\text{T}\mathbf{\beta} + f(\mathbf{Z}_i)\right\} = h_0(t) \exp\left\{\mathbf{X}_i^\text{T}\mathbf{\beta} + \mathbf{K}_i^\text{T}\mathbf{\alpha}\right\} \]
where
The matrix \(\mathbf{K}\) is constructed from the distance matrix (Euclidean or optimal transport), \(\mathbf{D}\), by double centering the rows and columns of \(\mathbf{D}\) with the formula
\[ \tilde{\mathbf{K}} = -\frac{1}{2} \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) \mathbf{D}^2 \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) \]
where
Since \(\tilde{\mathbf{K}}\) is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at \(\mathbf{K}\).
For a single candidate distance matrix, transform the distance matrix
to a kernel matrix and convert the object into a list, denoted by the
object KK
, code is provided below from the R package, MiRKAT
(Plantinga et al., 2017; Zhao et al.,
2015).
If there are multiple distance matrices, in our case, contained in an
array DD
, store them into a list of kernel matrices
KK
. Example code is below.
The user needs to pre-specify and fit the null model,
\[ H_0: \mathbf{\alpha} = 0, \]
of baseline covariates, \(\mathbf{X}_i\), to one of the three outcome
models to obtain continuous, logistic, or martingale residuals
(RESI
). Some example code is provided below.
# Continuous
out_LM = lm(Y ~ .,data = data.frame(X))
RESI = residuals(out_LM)
# Logistic
out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
RESI = residuals(out_LOG)
# Survival
out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
RESI = residuals(out_CX)
With one or multiple candidate kernels, ROKET will take
aKK
) (defined
below),RESI
), andnPERMS
)to calculate individual kernel p-values as well as omnibus tests. An
omnibus test assesses the significance of the minimum p-value kernel
among kernels considered. The function kernTEST
requires
names(RESI)
and dimension names per object as well as
row/column names per kernel within aKK
are named for sample
order consistency. Sample code is provided below. Setting
verbose = TRUE
allows the user to track the permutation’s
progress, especially when requesting hundreds of thousands of
permutations.
nPERMS = 1e5
nKK = length(KK)
# Array of kernels
aKK = array(data = NA,dim = dim(DD),
dimnames = dimnames(DD))
for(nm in dimnames(DD)[[3]]){
aKK[,,nm] = KK[[nm]]
}
# Create OMNI matrix
OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
OMNI["EUC","EUC"] = 1
OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
OMNI
# Hypothesis Testing
ROKET::kernTEST(RESI = RESI,
KK = aKK,
OMNI = OMNI,
nPERMS = nPERMS,
ncores = 1)
The final output contains individual kernel p-values and the omnibus p-value.
Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!
sessionInfo()
#> R version 4.2.2 (2022-10-31 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19045)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=English_United States.utf8
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.utf8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ROKET_1.0.0 MiRKAT_1.2.3 ggdendro_0.2.0 survival_3.4-0 reshape2_1.4.4
#> [6] ggplot2_3.4.0 smarter_1.0.1 devtools_2.4.5 usethis_2.1.6
#>
#> loaded via a namespace (and not attached):
#> [1] minqa_1.2.8 colorspace_2.1-0 ellipsis_0.3.2
#> [4] fs_1.5.2 rmutil_1.1.10 clue_0.3-65
#> [7] farver_2.1.1 MatrixModels_0.5-1 remotes_2.4.2
#> [10] statip_0.2.3 ggrepel_0.9.3 fansi_1.0.4
#> [13] codetools_0.2-18 splines_4.2.2 cachem_1.0.8
#> [16] knitr_1.42 pkgload_1.3.2 jsonlite_1.8.5
#> [19] nloptr_2.1.1 kernlab_0.9-33 cluster_2.1.4
#> [22] stabledist_0.7-1 shiny_1.7.4 httr_1.4.6
#> [25] compiler_4.2.2 lazyeval_0.2.2 Matrix_1.5-1
#> [28] fastmap_1.1.1 cli_3.6.1 later_1.3.0
#> [31] htmltools_0.5.4 quantreg_5.95 prettyunits_1.1.1
#> [34] tools_4.2.2 modeest_2.4.0 gtable_0.3.4
#> [37] glue_1.6.2 dplyr_1.1.2 Rcpp_1.0.10
#> [40] jquerylib_0.1.4 vctrs_0.6.2 ape_5.8
#> [43] nlme_3.1-160 iterators_1.0.14 timeDate_4041.110
#> [46] xfun_0.42 stringr_1.5.0 rbibutils_2.3
#> [49] ps_1.7.2 lme4_1.1-36 spatial_7.3-15
#> [52] mime_0.12 miniUI_0.1.1.1 CompQuadForm_1.4.3
#> [55] lifecycle_1.0.3 GUniFrac_1.8 gtools_3.9.4
#> [58] statmod_1.5.0 PearsonDS_1.3.1 MASS_7.3-58.1
#> [61] scales_1.2.1 timeSeries_4041.111 promises_1.2.0.1
#> [64] parallel_4.2.2 inline_0.3.21 SparseM_1.81
#> [67] yaml_2.3.7 memoise_2.0.1 sass_0.4.5
#> [70] fBasics_4041.97 segmented_2.1-3 rpart_4.1.19
#> [73] stringi_1.7.12 highr_0.10 foreach_1.5.2
#> [76] permute_0.9-7 caTools_1.18.2 boot_1.3-28
#> [79] pkgbuild_1.4.0 stable_1.1.6 Rdpack_2.6.2
#> [82] rlang_1.1.1 pkgconfig_2.0.3 matrixStats_0.63.0
#> [85] bitops_1.0-7 evaluate_0.20 lattice_0.20-45
#> [88] purrr_1.0.1 labeling_0.4.3 htmlwidgets_1.6.1
#> [91] processx_3.8.0 tidyselect_1.2.0 plyr_1.8.8
#> [94] magrittr_2.0.3 R6_2.5.1 reformulas_0.4.0
#> [97] gplots_3.1.3 generics_0.1.3 profvis_0.3.7
#> [100] pillar_1.9.0 withr_2.5.0 mgcv_1.8-41
#> [103] mixtools_2.0.0 RCurl_1.98-1.12 tibble_3.2.1
#> [106] crayon_1.5.2 KernSmooth_2.23-20 utf8_1.2.3
#> [109] plotly_4.10.1 rmarkdown_2.20 urlchecker_1.0.1
#> [112] grid_4.2.2 data.table_1.14.8 callr_3.7.3
#> [115] vegan_2.6-10 digest_0.6.31 xtable_1.8-4
#> [118] tidyr_1.3.0 httpuv_1.6.7 munsell_0.5.0
#> [121] viridisLite_0.4.2 bslib_0.4.2 sessioninfo_1.2.2