`mclust` is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulation from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation and the Bayesian Information Criterion (BIC) in comprehensive strategies for clustering, density estimation and discriminant analysis. Additional functionality is available for displaying and visualizing fitted models along with clustering, classification, and density estimation results.
This document gives a quick tour of `mclust` (version 5.2.3) functionalities. It was written in R Markdown, using the knitr package for production. See `help(package="mclust")` for further details and the references provided by `citation("mclust")`.
library(mclust)
## Package 'mclust' version 5.2.3
## Type 'citation("mclust")' for citing this R package in publications.
# Load the diabetes data and set the known diagnosis labels aside; they are
# not used for fitting, only for comparison with the clustering result.
data(diabetes)
class <- diabetes$class
table(class)
## class
## Chemical Normal Overt
## 36 76 33
# Features to cluster: every column except the first one.
X <- diabetes[, -1]
head(X)
## glucose insulin sspg
## 1 80 356 124
## 2 97 289 117
## 3 105 319 143
## 4 90 356 199
## 5 90 323 240
## 6 86 381 157
# Plot the features with the known class passed as the grouping.
clPairs(X, class)
# Compute BIC for the candidate models, then plot and summarise the result.
BIC <- mclustBIC(X)
plot(BIC)
summary(BIC)
## Best BIC values:
## VVV,3 VVE,3 EVE,4
## BIC -4760.091 -4775.53693 -4793.26143
## BIC diff 0.000 -15.44628 -33.17079
# Fit the mixture model, handing the already computed BIC object to Mclust()
# through `x`, and print the fit together with the estimated parameters.
mod1 <- Mclust(X, x = BIC)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -2307.883 145 29 -4760.091 -4776.086
##
## Clustering table:
## 1 2 3
## 82 33 30
##
## Mixing probabilities:
## 1 2 3
## 0.5603211 0.2244432 0.2152356
##
## Means:
## [,1] [,2] [,3]
## glucose 91.39558 105.1109 219.21971
## insulin 358.61206 516.2814 1040.59177
## sspg 166.02012 320.2471 98.56807
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 61.81664 97.41582 34.42346
## insulin 97.41582 2106.98136 378.95467
## sspg 34.42346 378.95467 2669.14406
## [,,2]
## glucose insulin sspg
## glucose 152.2496 789.1576 -483.0501
## insulin 789.1576 6476.1400 -2752.2840
## sspg -483.0501 -2752.2840 26029.0307
## [,,3]
## glucose insulin sspg
## glucose 6350.858 26190.11 -4448.25
## insulin 26190.111 122126.21 -22772.10
## sspg -4448.250 -22772.10 5913.76
# Plot the classification from the fitted model, then cross-tabulate the
# known classes (rows) against the model's cluster labels (columns).
plot(mod1, what = "classification")
table(class, mod1$classification)
##
## class 1 2 3
## Chemical 8 26 2
## Normal 74 2 0
## Overt 0 5 28
# Draw the uncertainty plot for three pairs of variables on a 2 x 2 grid.
# par() returns the settings it replaces, so they are saved in `op` and
# restored afterwards. This puts the device back in whatever layout it had
# before, instead of unconditionally forcing a 1 x 1 layout.
op <- par(mfrow = c(2, 2))
plot(mod1, what = "uncertainty", dimens = c(2, 1), main = "")
plot(mod1, what = "uncertainty", dimens = c(3, 1), main = "")
plot(mod1, what = "uncertainty", dimens = c(2, 3), main = "")
par(op)
# Compute ICL on the same data, then summarise and plot it.
ICL <- mclustICL(X)
summary(ICL)
## Best ICL values:
## VVV,3 VVE,3 EVE,4
## ICL -4776.086 -4793.27143 -4809.16868
## ICL diff 0.000 -17.18553 -33.08278
plot(ICL)
# Bootstrap likelihood-ratio test for the number of mixture components,
# restricted to the VVV model.
# NOTE(review): no seed is set in the visible code, so the bootstrap
# p-values are presumably not exactly reproducible between runs.
LRT <- mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = VVV
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 361.186445 0.001
## 2 vs 3 114.703559 0.001
## 3 vs 4 7.437806 0.937
# Switch to the iris data: species labels become the known classes and the
# first four columns become the feature matrix.
data(iris)
class <- iris$Species
table(class)
## class
## setosa versicolor virginica
## 50 50 50
X <- iris[, 1:4]
head(X)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# Fit a discriminant analysis model of type "EDDA" to the labelled data and
# print its summary.
mod2 <- MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## EDDA model summary:
##
## log.likelihood n df BIC
## -187.7097 150 36 -555.8024
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Training classification summary:
##
## Predicted
## Class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 0 50
##
## Training error = 0.02
# Graphical displays of the fitted EDDA model: scatterplot and classification.
plot(mod2, what = "scatterplot")
plot(mod2, what = "classification")
# Switch to the banknote data: `Status` holds the known class, and the
# features are every column except the first one.
data(banknote)
class <- banknote$Status
table(class)
## class
## counterfeit genuine
## 100 100
X <- banknote[, -1]
head(X)
## Length Left Right Bottom Top Diagonal
## 1 214.8 131.0 131.1 9.0 9.7 141.0
## 2 214.6 129.7 129.7 8.1 9.5 141.7
## 3 214.8 129.7 129.7 8.7 9.6 142.2
## 4 214.8 129.7 129.6 7.5 10.4 142.0
## 5 215.0 129.6 129.7 10.4 7.7 141.8
## 6 215.7 130.8 130.5 9.0 10.1 141.4
# Fit the discriminant analysis model with MclustDA()'s default model type
# and print its summary.
mod3 <- MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## MclustDA model summary:
##
## log.likelihood n df BIC
## -646.0798 200 66 -1641.849
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Training classification summary:
##
## Predicted
## Class counterfeit genuine
## counterfeit 100 0
## genuine 0 100
##
## Training error = 0
# Graphical displays of the fitted MclustDA model: scatterplot and
# classification.
plot(mod3, what = "scatterplot")
plot(mod3, what = "classification")
# 10-fold cross-validation of both classifiers, reporting the error and its
# standard error. The two components are selected by name rather than by
# position, so the result does not depend on where cvMclustDA() places them
# in its return value.
unlist(cvMclustDA(mod2, nfold = 10)[c("error", "se")])
## error se
## 0.02666667 0.01474055
unlist(cvMclustDA(mod3, nfold = 10)[c("error", "se")])
## error se
## 0 0
# Density estimation for the acidity data.
data(acidity)
mod4 <- densityMclust(acidity)
summary(mod4)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust E (univariate, equal variance) model with 2 components:
##
## log.likelihood n df BIC ICL
## -185.9493 155 4 -392.0723 -398.5554
##
## Clustering table:
## 1 2
## 98 57
# Displays for the acidity density fit: BIC, the estimated density drawn
# with the data (breaks = 15), and the two diagnostic plots.
plot(mod4, what = "BIC")
plot(mod4, what = "density", data = acidity, breaks = 15)
plot(mod4, what = "diagnostic", type = "cdf")
plot(mod4, what = "diagnostic", type = "qq")
# Density estimation for the faithful data.
data(faithful)
mod5 <- densityMclust(faithful)
summary(mod5)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -1126.361 272 11 -2314.386 -2360.865
##
## Clustering table:
## 1 2 3
## 130 97 45
# Displays for the faithful density fit: BIC, then the density using the
# default, "image" and "persp" plot types.
plot(mod5, what = "BIC")
plot(mod5, what = "density")
plot(mod5, what = "density", type = "image", col = "dodgerblue3", grid = 100)
plot(mod5, what = "density", type = "persp")
# Resample the diabetes clustering model with 999 replications of type "bs"
# and report the resampling standard errors.
boot1 <- MclustBootstrap(mod1, nboot = 999, type = "bs")
summary(boot1, what = "se")
## ----------------------------------------------------------
## Resampling standard errors
## ----------------------------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
## Type = nonparametric bootstrap
##
## Mixing probabilities:
## 1 2 3
## 0.05113766 0.04692931 0.03933802
##
## Means:
## 1 2 3
## glucose 0.9957175 3.719799 17.89535
## insulin 7.4288265 26.570738 78.69333
## sspg 7.5554329 33.373813 17.75690
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 11.15564 49.75742 52.08716
## insulin 49.75742 460.16740 348.06758
## sspg 52.08716 348.06758 588.68142
## [,,2]
## glucose insulin sspg
## glucose 66.46209 490.338 509.2021
## insulin 490.33795 3721.300 3466.7450
## sspg 509.20209 3466.745 7151.5723
## [,,3]
## glucose insulin sspg
## glucose 1125.646 5973.188 1766.345
## insulin 5973.188 37421.632 11074.325
## sspg 1766.345 11074.325 3247.057
# Resampling confidence intervals from the same bootstrap object.
summary(boot1, what = "ci")
## ----------------------------------------------------------
## Resampling confidence intervals
## ----------------------------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
## Type = nonparametric bootstrap
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2 3
## 2.5% 0.4518969 0.1335669 0.1403584
## 97.5% 0.6532272 0.3277191 0.2909213
##
## Means:
## [,,1]
## glucose insulin sspg
## 2.5% 89.29221 343.9336 152.7377
## 97.5% 93.40953 373.1924 182.0874
## [,,2]
## glucose insulin sspg
## 2.5% 98.98162 477.4897 255.8220
## 97.5% 113.56784 579.2478 387.5803
## [,,3]
## glucose insulin sspg
## 2.5% 188.4082 892.765 66.9118
## 97.5% 256.5064 1193.908 135.9513
##
## Variances:
## [,,1]
## glucose insulin sspg
## 2.5% 39.18232 1182.991 1600.026
## 97.5% 83.67443 2976.011 3975.154
## [,,2]
## glucose insulin sspg
## 2.5% 60.76569 1821.772 13133.20
## 97.5% 338.34355 17802.967 40317.37
## [,,3]
## glucose insulin sspg
## 2.5% 3820.959 53981.77 1591.799
## 97.5% 8233.711 195338.49 12223.792
# Same resampling (999 replications, type "bs") for the acidity density
# model, reporting the resampling standard errors.
boot4 <- MclustBootstrap(mod4, nboot = 999, type = "bs")
summary(boot4, what = "se")
## ----------------------------------------------------------
## Resampling standard errors
## ----------------------------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
## Type = nonparametric bootstrap
##
## Mixing probabilities:
## 1 2
## 0.04026519 0.04026519
##
## Means:
## 1 2
## 0.04442527 0.07054702
##
## Variances:
## 1 2
## 0.02363732 0.02363732
# Resampling confidence intervals from the same bootstrap object.
summary(boot4, what = "ci")
## ----------------------------------------------------------
## Resampling confidence intervals
## ----------------------------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
## Type = nonparametric bootstrap
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2
## 2.5% 0.5427730 0.2951715
## 97.5% 0.7048285 0.4572270
##
## Means:
## 1 2
## 2.5% 4.280837 6.177355
## 97.5% 4.453510 6.445052
##
## Variances:
## 1 2
## 2.5% 0.1385877 0.1385877
## 97.5% 0.2315903 0.2315903
# Dimension reduction for the diabetes clustering model.
# NOTE(review): the recycling warnings echoed after this call name the
# expression x/sqrt(crossprod(x)), which does not appear in this script;
# presumably they originate inside the package.
mod1dr <- MclustDR(mod1)
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Print the estimated basis vectors and eigenvalues of the reduction.
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3
## glucose -0.986054 0.24922 0.9588647
## insulin 0.157645 -0.11513 -0.2837395
## sspg -0.053353 -0.96158 -0.0083946
##
## Dir1 Dir2 Dir3
## Eigenvalues 1.3749 0.77725 0.65829
## Cum. % 48.9207 76.57662 100.00000
# Plot the reduction: pairs display, then boundaries on a 200-point grid.
plot(mod1dr, what = "pairs")
plot(mod1dr, what = "boundaries", ngrid = 200)
# Redo the reduction with lambda = 1; this overwrites the previous `mod1dr`.
mod1dr <- MclustDR(mod1, lambda = 1)
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Summary of the lambda = 1 reduction (basis vectors and eigenvalues).
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2
## glucose 0.81116 0.92578
## insulin -0.56210 -0.19371
## sspg -0.16147 -0.32467
##
## Dir1 Dir2
## Eigenvalues 1.0574 0.3968
## Cum. % 72.7144 100.0000
# Plot the lambda = 1 reduction: scatterplot, then boundaries on a
# 200-point grid.
plot(mod1dr, what = "scatterplot")
plot(mod1dr, what = "boundaries", ngrid = 200)
# Dimension reduction for the EDDA classifier fitted to the iris data.
mod2dr <- MclustDR(mod2)
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Print the estimated basis vectors and eigenvalues of the reduction.
summary(mod2dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: EDDA
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4
## Sepal.Length 0.17425 -0.193663 0.64081 -0.46231
## Sepal.Width 0.45292 0.066561 0.34852 0.57110
## Petal.Length -0.61629 -0.311030 -0.42366 0.46256
## Petal.Width -0.62024 0.928076 0.53703 -0.49613
##
## Dir1 Dir2 Dir3 Dir4
## Eigenvalues 0.94747 0.68835 0.076141 0.052607
## Cum. % 53.69408 92.70374 97.018700 100.000000
# Plot the EDDA reduction: scatterplot, then boundaries on a 200-point grid.
plot(mod2dr, what = "scatterplot")
plot(mod2dr, what = "boundaries", ngrid = 200)
# Dimension reduction for the MclustDA classifier fitted to the banknote data.
mod3dr <- MclustDR(mod3)
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in x/sqrt(crossprod(x)): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
# Print the estimated basis vectors and eigenvalues of the reduction.
summary(mod3dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: MclustDA
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Length -0.10027 -0.327553 0.79718 -0.033721 -0.317043 0.084618
## Left -0.21760 -0.305350 -0.30266 -0.893676 0.371043 -0.565611
## Right 0.29180 -0.018877 -0.49600 0.406605 -0.861020 0.481331
## Bottom 0.57603 0.445501 0.12002 -0.034570 0.004359 -0.078688
## Top 0.57555 0.385645 0.10093 -0.103629 0.136005 0.625416
## Diagonal -0.44088 0.672251 -0.04781 -0.151473 -0.044035 0.209542
##
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Eigenvalues 0.87241 0.55372 0.48603 0.13301 0.053113 0.027239
## Cum. % 41.04429 67.09530 89.96182 96.21965 98.718473 100.000000
# Plot the MclustDA reduction: scatterplot, then boundaries on a 200-point
# grid.
plot(mod3dr, what = "scatterplot")
plot(mod3dr, what = "boundaries", ngrid = 200)