The `model.matrix` function in R is a convenient way to transform a training dataset for modeling. However, it does not save any of the parameters used in the transformation, so it is hard to apply the same transformation to a test dataset or to new data. The ModelMatrixModel package was created to solve this problem.
# Install the package from GitHub (run once):
# devtools::install_github("xinyongtian/R_ModelMatrixModel")

# NOTE: the workspace is intentionally NOT cleared with `rm(list = ls())`.
# Wiping the caller's global environment is a destructive side effect, and
# nothing in this walkthrough relies on starting from an empty workspace.
library(ModelMatrixModel)
# Training data: a character column (x1), a numeric column (x2), a factor
# column (x3) and a numeric response (y). The seed makes the draw repeatable.
set.seed(10)
traindf <- data.frame(
  x1 = sample(LETTERS[1:5], size = 20, replace = TRUE),
  x2 = rnorm(20, mean = 100, sd = 5),
  x3 = factor(sample(c("U", "L", "P"), size = 20, replace = TRUE)),
  y = rnorm(20, mean = 10, sd = 2)
)

# New data to transform later: only 3 rows, so not every level of x1 shows up.
set.seed(20)
newdf <- data.frame(
  x1 = sample(LETTERS[1:5], size = 3, replace = TRUE),
  x2 = rnorm(3, mean = 100, sd = 5),
  x3 = sample(c("U", "L", "P"), size = 3, replace = TRUE)
)

head(traindf)
#> x1 x2 x3 y
#> 1 C 102.41489 L 9.198725
#> 2 A 97.01845 U 9.330887
#> 3 B 89.07357 P 12.735908
#> 4 D 96.62567 L 14.275534
#> 5 C 89.40469 U 11.011639
#> 6 B 93.67401 P 11.572685
sapply(traindf, class) # input categorical variable can be either character or factor
#> x1 x2 x3 y
#> "character" "numeric" "factor" "numeric"

# Base R: model.matrix() derives the dummy columns from whatever levels are
# present in the data it is given, so the two calls below disagree.
f1 <- ~ x1 + x2
head(model.matrix(f1, traindf), 2)
#> (Intercept) x1B x1C x1D x1E x2
#> 1 1 0 1 0 0 102.41489
#> 2 1 0 0 0 0 97.01845
head(model.matrix(f1, newdf), 2)
#> (Intercept) x1B x1C x2
#> 1 1 0 1 93.33703
#> 2 1 1 0 97.76717
Note that the number of columns differs between the two outputs, which is problematic when applying a model built on the training data to new data. To avoid that, column x1 in both datasets needs to be converted to a factor with exactly the same levels. That becomes cumbersome when there are many categorical columns. In addition, the parameters of other transformations, such as orthogonal polynomials, also need to be saved.
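By default, the first dummy variable is kept (no level is dropped); pass `remove_1st_dummy = TRUE` to drop it, as in the second example below.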
# Default dummy coding: every level of x3 gets its own column, and the x2:x3
# interaction is expanded once per level.
mm <- ModelMatrixModel(~ x2 + x3 + x2:x3, traindf)
data.frame(as.matrix(head(mm$x, 2))) # ':' in column name is replaced with '_X_'
#> x2 x3L x3P x3U x2_X_x3L x2_X_x3P x2_X_x3U
#> 1 102.41489 1 0 0 102.4149 0 0.00000
#> 2 97.01845 0 0 1 0.0000 0 97.01845
# Apply the fitted transformation to the new data; the columns match the
# training matrix shown above.
mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
#> x2 x3L x3P x3U x2_X_x3L x2_X_x3P x2_X_x3U
#> 1 93.33703 0 1 0 0 93.33703 0.00000
#> 2 97.76717 0 0 1 0 0.00000 97.76717
# Same model written with `*`, this time dropping the first dummy level (x3L).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
mm <- ModelMatrixModel(~ x2 * x3, traindf, remove_1st_dummy = TRUE)
data.frame(as.matrix(head(mm$x, 2)))
#> x2 x3P x3U x2_X_x3P x2_X_x3U
#> 1 102.41489 0 0 0 0.00000
#> 2 97.01845 0 1 0 97.01845
mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
#> x2 x3P x3U x2_X_x3P x2_X_x3U
#> 1 93.33703 1 0 93.33703 0.00000
#> 2 97.76717 0 1 0.00000 97.76717
It is common for a categorical column in new data to contain an invalid level, i.e. one not seen in the training data. This can be handled as follows.
# Fit on the training data with the default dummy coding.
mm <- ModelMatrixModel(~ x2 + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))
#> x2 x3L x3P x3U
#> 1 102.41489 1 0 0
#> 2 97.01845 0 0 1
# Copy the new data and overwrite one x3 value with a level that does not
# occur in the training data.
newdf2 <- newdf
newdf2[1, "x3"] <- "z" # create invalid level
mm_pred <- predict(mm, newdf2, handleInvalid = "keep")
With `handleInvalid = "keep"` (the default), the row with the invalid level is kept and all of its dummy variables are set to 0. With `handleInvalid = "error"`, an error is thrown instead.
ModelMatrixModel can save the parameters of an orthogonal polynomial transformation.
# Degree-3 orthogonal polynomial of x2 plus the dummy-coded x3.
mm <- ModelMatrixModel(~ poly(x2, 3) + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))
#> poly_x2__3_1 poly_x2__3_2 poly_x2__3_3 x3L x3P x3U
#> 1 0.30172460 0.1984401 -0.11179204 1 0 0
#> 2 0.02600198 -0.2101237 0.01653188 0 0 1
# Transform the new data with the fitted model.
mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
#> poly_x2__3_1 poly_x2__3_2 poly_x2__3_3 x3L x3P x3U
#> 1 -0.16209394 -0.1138245 0.29216357 0 1 0
#> 2 0.06425658 -0.1924877 -0.06097619 0 0 1
It also works with raw polynomial transformations.
# Raw (non-orthogonal) degree-3 polynomial of x2 plus the dummy-coded x3.
# NOTE: `raw = T` is kept as written inside the formula. The column names in
# the output below contain the term text (`..._raw___T_1`), so spelling it
# `TRUE` would presumably change those names.
mm <- ModelMatrixModel(~ poly(x2, 3, raw = T) + x3, traindf)
data.frame(as.matrix(head(mm$x, 2)))
#> poly_x2__3__raw___T_1 poly_x2__3__raw___T_2 poly_x2__3__raw___T_3 x3L x3P x3U
#> 1 102.41489 10488.810 1074210.4 1 0 0
#> 2 97.01845 9412.579 913193.8 0 0 1
# Transform the new data with the fitted model.
mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
#> poly_x2__3__raw___T_1 poly_x2__3__raw___T_2 poly_x2__3__raw___T_3 x3L x3P x3U
#> 1 93.33703 8711.801 813133.7 0 1 0
#> 2 97.76717 9558.419 934499.5 0 0 1
The training dataset can be centered and scaled, and the same scaling parameters can then be applied to a new dataset.
# Center and scale the columns while fitting on the training data.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
mm <- ModelMatrixModel(~ x2 + x3, traindf, scale = TRUE, center = TRUE)
data.frame(as.matrix(head(mm$x, 2)))
#> x2 x3L x3P x3U
#> 1 1.315187 1.4888474 -0.7958224 -0.6380775
#> 2 0.113340 -0.6380775 -0.7958224 1.4888474
# Transform the new data with the fitted model.
mm_pred <- predict(mm, newdf)
data.frame(as.matrix(head(mm_pred$x, 2)))
#> x2 x3L x3P x3U
#> 1 -0.7065511 0 1.1937336 -0.6380775
#> 2 0.2800879 0 -0.7958224 1.4888474