ModelMatrixModel

model.matrix function in R is a convenient way to transform training dataset for modeling. But it does not save any parameter used in transformation, so it is hard to apply the same transformation to test dataset or new dataset. ModelMatrixModel package is created to solve the problem.

setup

#devtools::install_github("xinyongtian/R_ModelMatrixModel") #install from github
rm(list=ls())
library(ModelMatrixModel)
set.seed(10)
traindf= data.frame(x1 = sample(LETTERS[1:5], replace = T, 20),
                 x2 = rnorm(20, 100, 5),
                 x3 = factor(sample(c("U","L","P"), replace = T, 20)),
                 y = rnorm(20, 10, 2))
set.seed(20)
newdf=data.frame(x1 = sample(LETTERS[1:5], replace = T, 3),
                 x2 = rnorm(3, 100, 5),
                 x3 = sample(c("U","L","P"), replace = T, 3)) 

head(traindf)
#>   x1        x2 x3         y
#> 1  C 102.41489  L  9.198725
#> 2  A  97.01845  U  9.330887
#> 3  B  89.07357  P 12.735908
#> 4  D  96.62567  L 14.275534
#> 5  C  89.40469  U 11.011639
#> 6  B  93.67401  P 11.572685
sapply(traindf,class) #input categorical variable can be either character or factor
#>          x1          x2          x3           y 
#> "character"   "numeric"    "factor"   "numeric"

problem with model.matrix() function

f1=formula("~x1+x2")
head(model.matrix(f1, traindf),2)
#>   (Intercept) x1B x1C x1D x1E        x2
#> 1           1   0   1   0   0 102.41489
#> 2           1   0   0   0   0  97.01845
head(model.matrix(f1, newdf),2)
#>   (Intercept) x1B x1C       x2
#> 1           1   0   1 93.33703
#> 2           1   1   0 97.76717

Note the number of columns is different in the two outputs, which will be problematic when applying the built model to new data . To avoid that, column x1 in both dataset needs to be transformed to factor with exact same levels. That will be cumbersome if there are many categorical columns. In addition, other transforming parameters, in transformation like orthogonal polynomials, also need to be saved.

ModelMatixModel comes to rescue

fit data to create ModelMatixModel object

f2=formula("~ 1+x1+x2") # "1" is need in order to output intercept column
mm=ModelMatrixModel( f2,traindf,remove_1st_dummy =T,sparse = F)

class(mm)
#> [1] "ModelMatrixModel"
head(mm$x,2) #note "_Intercept_" is intercept column
#>   _Intercept_ x1B x1C x1D x1E        x2
#> 1           1   0   1   0   0 102.41489
#> 2           1   0   0   0   0  97.01845

transform new data

mm_pred=predict(mm,newdf)
head(mm_pred$x,2)
#>   _Intercept_ x1B x1C x1D x1E       x2
#> 1           1   0   1   0   0 93.33703
#> 2           1   1   0   0   0 97.76717

dummy variable

keep first dummy variable

mm=ModelMatrixModel(~x1+x2+x3,traindf,remove_1st_dummy = F)

default is to keep first dummy variable

data.frame(as.matrix(head(mm$x,2)))
#>   x1A x1B x1C x1D x1E        x2 x3L x3P x3U
#> 1   0   0   1   0   0 102.41489   1   0   0
#> 2   1   0   0   0   0  97.01845   0   0   1
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>   x1A x1B x1C x1D x1E       x2 x3L x3P x3U
#> 1   0   0   1   0   0 93.33703   0   1   0
#> 2   0   1   0   0   0 97.76717   0   0   1

dummy variable with interaction

keep 1st dummy variable

mm=ModelMatrixModel(~x2+x3+x2:x3,traindf) 
data.frame(as.matrix(head(mm$x,2))) # ':' in column name  is replaced with '_X_'
#>          x2 x3L x3P x3U x2_X_x3L x2_X_x3P x2_X_x3U
#> 1 102.41489   1   0   0 102.4149        0  0.00000
#> 2  97.01845   0   0   1   0.0000        0 97.01845
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>         x2 x3L x3P x3U x2_X_x3L x2_X_x3P x2_X_x3U
#> 1 93.33703   0   1   0        0 93.33703  0.00000
#> 2 97.76717   0   0   1        0  0.00000 97.76717

remove 1st dummy variable

mm=ModelMatrixModel(~x2*x3,traindf,remove_1st_dummy = T) 
data.frame(as.matrix(head(mm$x,2)))
#>          x2 x3P x3U x2_X_x3P x2_X_x3U
#> 1 102.41489   0   0        0  0.00000
#> 2  97.01845   0   1        0 97.01845
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>         x2 x3P x3U x2_X_x3P x2_X_x3U
#> 1 93.33703   1   0 93.33703  0.00000
#> 2 97.76717   0   1  0.00000 97.76717

invalid level in new data

It is a common categorical column in new data contains in valid level, it can be handled as following

mm=ModelMatrixModel(~x2+x3,traindf) 
data.frame(as.matrix(head(mm$x,2)))
#>          x2 x3L x3P x3U
#> 1 102.41489   1   0   0
#> 2  97.01845   0   0   1
newdf2=newdf
newdf2[1,'x3']='z'  #create invalid level
mm_pred=predict(mm,newdf2,handleInvalid = "keep")

default is to keep the invalid row ,i.e. set all dummy variables as 0. if handleInvalid = “error”, throw error.

data.frame(as.matrix(head(mm_pred$x,2))) 
#>         x2 x3L x3P x3U
#> 1 93.33703   0   0   0
#> 2 97.76717   0   0   1

poly() in formula

ModelMatrixModel can save orthogonal polynomials parameter.

mm=ModelMatrixModel(~poly(x2,3)+x3,traindf) 
data.frame(as.matrix(head(mm$x,2)))
#>   poly_x2__3_1 poly_x2__3_2 poly_x2__3_3 x3L x3P x3U
#> 1   0.30172460    0.1984401  -0.11179204   1   0   0
#> 2   0.02600198   -0.2101237   0.01653188   0   0   1
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>   poly_x2__3_1 poly_x2__3_2 poly_x2__3_3 x3L x3P x3U
#> 1  -0.16209394   -0.1138245   0.29216357   0   1   0
#> 2   0.06425658   -0.1924877  -0.06097619   0   0   1

also works raw polynomial transformation

mm=ModelMatrixModel(~poly(x2,3,raw=T)+x3, traindf) 
data.frame(as.matrix(head(mm$x,2)))
#>   poly_x2__3__raw___T_1 poly_x2__3__raw___T_2 poly_x2__3__raw___T_3 x3L x3P x3U
#> 1             102.41489             10488.810             1074210.4   1   0   0
#> 2              97.01845              9412.579              913193.8   0   0   1
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>   poly_x2__3__raw___T_1 poly_x2__3__raw___T_2 poly_x2__3__raw___T_3 x3L x3P x3U
#> 1              93.33703              8711.801              813133.7   0   1   0
#> 2              97.76717              9558.419              934499.5   0   0   1

scale and center

training dataset can be scaled, and same scale parameters then can be applied to new dataset.

mm=ModelMatrixModel(~x2+x3,traindf,scale = T,center = T) 
data.frame(as.matrix(head(mm$x,2)))
#>         x2        x3L        x3P        x3U
#> 1 1.315187  1.4888474 -0.7958224 -0.6380775
#> 2 0.113340 -0.6380775 -0.7958224  1.4888474
mm_pred=predict(mm,newdf)
data.frame(as.matrix(head(mm_pred$x,2)))
#>           x2 x3L        x3P        x3U
#> 1 -0.7065511   0  1.1937336 -0.6380775
#> 2  0.2800879   0 -0.7958224  1.4888474