In this lab, we show how to do classification using tree-based ensemble methods: bagging, random forests, boosting, and XGBoost.

library(tidyverse)
# load credit card data
credit.data <- read_csv("https://www.dropbox.com/s/tnoo06n8m842uit/credit_card_default.csv?dl=1")
credit.data<- rename(credit.data, default=`default payment next month`)
# convert categorical data to factor
credit.data$SEX<- as.factor(credit.data$SEX)
credit.data$EDUCATION<- as.factor(credit.data$EDUCATION)
credit.data$MARRIAGE<- as.factor(credit.data$MARRIAGE)
# random splitting
index <- sample(nrow(credit.data),nrow(credit.data)*0.80)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]

# load MNIST data
digit<- data.matrix(read_csv("https://www.dropbox.com/s/ulujvi2a4ykfzju/train.csv?dl=1"))
# random splitting
index<- sample(1:nrow(digit), 0.6*nrow(digit))
train<- digit[index,]
test<- digit[-index,]
# split X and y; rescale X to [0, 1]
train.x <- train[,-1] #remove 'label' column
test.x<- test[,-1]
train.y <- train[,1] #label column
test.y<- test[,1]
train.x <- train.x/255
test.x <- test.x/255
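
Note that both splits above use sample() without a fixed seed, so the numbers below will vary from run to run. A minimal sketch to make the splits reproducible (the seed value is arbitrary):

# Optional: fix the random seed before sampling so the splits are reproducible
set.seed(2021)
index <- sample(nrow(credit.data), nrow(credit.data)*0.80)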

Bagging

library(ipred)
credit.bag<- bagging(as.factor(default)~., data = credit.train, nbagg=100, coob=T, method="class")
predprob.test.bag<- predict(credit.bag, newdata = credit.test, type="prob")[,2]  # P(default = 1)
table(credit.test$default, (predprob.test.bag>0.2)*1, dnn = c("True", "Pred"))  # cutoff 0.2
##     Pred
## True    0    1
##    0 3133 1521
##    1  415  931
library(ROCR)
pred <- prediction(predprob.test.bag, credit.test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7601595

The class predictions are generated by specifying type="class".

predclass.test.bag<- predict(credit.bag, newdata = credit.test, type="class")
table(credit.test$default, predclass.test.bag, dnn = c("True", "Pred"))
##     Pred
## True    0    1
##    0 4387  267
##    1  850  496

Comparing with a single tree. For a fair comparison, we use an asymmetric loss here as well (a 5:1 weight on false negatives, in the same spirit as the 0.2 cutoff used above).

library(rpart)
credit.rpart <- rpart(formula = default ~ ., data = credit.train, 
                      method = "class", parms=list(loss=matrix(c(0,5,1,0), nrow=2)))
credit.test.pred.tree1<- predict(credit.rpart, credit.test, type="class")
table(credit.test$default, credit.test.pred.tree1, dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0 3032 1622
##     1  373  973

Which one is better?
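
With asymmetric loss, the two confusion matrices are hard to compare by eye. A minimal sketch of comparing them by average asymmetric cost (the asym.cost helper and its 5:1 weights are our own illustrative choices, not from any package):

# Hypothetical helper: average cost, false negatives weighted 5x false positives
asym.cost <- function(truth, pred, w.fn = 5, w.fp = 1) {
  mean(w.fn*(truth == 1 & pred == 0) + w.fp*(truth == 0 & pred == 1))
}
asym.cost(credit.test$default, (predprob.test.bag > 0.2)*1)  # bagging
asym.cost(credit.test$default, as.numeric(as.character(credit.test.pred.tree1)))  # single tree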

Note that parms=list(loss=matrix(c(0,10,1,0), nrow=2)) in bagging() does not work. As far as I know, bagging() cannot handle asymmetric loss, although other packages might. However, we can write our own bagging algorithm based on rpart(), and it is fairly straightforward.

Let’s write our own bagging with parallel computing

library(doParallel)  # for parallel backend to foreach
library(foreach)     # for parallel processing with for loops
detectCores()        # detect how many cores are on your PC
## [1] 16
cl <- makeCluster(8) # use 8 workers
registerDoParallel(cl) # register the parallel backend
ptc <- proc.time() # time stamp
mybag.predictions <- foreach(1:100, .packages = "rpart", .combine = cbind) %dopar% {
    # bootstrap copy of training data
    index <- sample(nrow(credit.train), replace = TRUE)
    credit_train_boot <- credit.train[index, ]  
    
    # fit tree to bootstrap copy
    bagged_tree <- rpart(formula = default ~ ., data = credit_train_boot, method = "class",
                         parms = list(loss=matrix(c(0,5,1,0), nrow = 2))) 
    
    predict(bagged_tree, newdata = credit.test, type="class")
}
stopCluster(cl)
proc.time()-ptc  # time stamp
##    user  system elapsed 
##    0.34    0.03   24.72

Let’s look at the results from parallel computing and reshape them into what we want.

dim(mybag.predictions)
## [1] 6000  100
mybag.predictions[1:10,1:10]
##    result.1 result.2 result.3 result.4 result.5 result.6 result.7 result.8
## 1         1        2        2        2        2        1        2        1
## 2         2        2        2        2        2        2        2        2
## 3         1        1        1        1        2        1        2        1
## 4         1        1        2        1        2        1        2        1
## 5         2        2        2        2        2        2        2        2
## 6         2        2        2        2        2        2        2        2
## 7         1        1        1        1        1        1        1        1
## 8         1        1        1        1        1        1        1        1
## 9         2        2        2        2        2        2        2        2
## 10        1        1        1        1        1        1        1        1
##    result.9 result.10
## 1         1         2
## 2         2         2
## 3         1         2
## 4         1         2
## 5         2         2
## 6         2         2
## 7         1         1
## 8         1         1
## 9         2         2
## 10        1         1
mybag.predictions <- mybag.predictions-1  # recode the factor codes 1/2 to classes 0/1
mybag.pred <- apply(mybag.predictions, 1, mean)  # fraction of trees voting for class 1
summary(mybag.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.5700  0.4788  1.0000  1.0000
mybag.pred.class <- (mybag.pred>0.5)*1  # majority vote
table(credit.test$default, mybag.pred.class, dnn=c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0 2598 2056
##     1  298 1048



Random Forests

library(randomForest)
credit.rf<- randomForest(as.factor(default)~., data = credit.train, cutoff=c(4/5,1/5)) # classwt does not change the results
credit.rf
## 
## Call:
##  randomForest(formula = as.factor(default) ~ ., data = credit.train,      cutoff = c(4/5, 1/5)) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 31.28%
## Confusion matrix:
##       0    1 class.error
## 0 12838 5872   0.3138429
## 1  1636 3654   0.3092628

The argument cutoff specifies the cutoff probability for each class (here Y = 0/1) in prediction. In our example we specify (4/5, 1/5), meaning that the vote fraction needed to predict class 0 is at least 4/5 and for class 1 at least 1/5, instead of half and half. Therefore, cutoff essentially encodes an asymmetric loss, so the misclassification rate (MR) or OOB error rate may not be a good measure of prediction accuracy; you have to calculate your own asymmetric cost.
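
For example, a weighted cost can be computed directly from the OOB confusion matrix above; a minimal sketch (the 5:1 weights are our own illustrative choice):

# Sketch: weighted OOB cost, false negatives weighted 5x (illustrative weights)
cm <- credit.rf$confusion[, c("0", "1")]
(5*cm["1", "0"] + 1*cm["0", "1"]) / sum(cm)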

We can again easily plot the error rates versus the number of trees (ntree).

plot(credit.rf, lwd=rep(2, 3))
legend("right", legend = c("OOB Error", "FPR", "FNR"), lwd=rep(2, 3), lty = c(1,2,3), col = c("black", "red", "green"))

You may try different specifications of cutoff and see how the confusion matrix changes. In addition, for prediction purposes you can also apply your own cutoff to the probabilistic predictions, as we learned in logistic regression.

Prediction

credit.rf.pred<- predict(credit.rf, newdata=credit.test, type = "prob")[,2]
library(ROCR)
pred <- prediction(credit.rf.pred, credit.test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7680649

Below is the confusion matrix based on the class predictions.

credit.rf.class.test<- predict(credit.rf, newdata=credit.test, type = "class")
table(credit.test$default, credit.rf.class.test, dnn = c("True", "Pred"))
##     Pred
## True    0    1
##    0 3228 1426
##    1  415  931

Below is the confusion matrix based on the probabilistic predictions with a user-specified cutoff (the overall default rate in the training sample).

credit.rf.class.test<- (credit.rf.pred>mean(credit.train$default))*1
table(credit.test$default, credit.rf.class.test, dnn = c("True", "Pred"))
##     Pred
## True    0    1
##    0 3442 1212
##    1  458  888

Variable importance

credit.rf$importance
##           MeanDecreaseGini
## LIMIT_BAL        423.26464
## SEX               79.94966
## EDUCATION        157.88440
## MARRIAGE          90.62258
## AGE              460.98686
## PAY_0            779.53534
## PAY_2            373.80639
## PAY_3            235.54445
## PAY_4            181.19049
## PAY_5            172.83620
## PAY_6            155.66403
## BILL_AMT1        489.66804
## BILL_AMT2        446.47195
## BILL_AMT3        424.90634
## BILL_AMT4        413.26852
## BILL_AMT5        411.29915
## BILL_AMT6        416.97039
## PAY_AMT1         415.68683
## PAY_AMT2         400.67271
## PAY_AMT3         381.79284
## PAY_AMT4         356.95726
## PAY_AMT5         360.10407
## PAY_AMT6         375.32128

We can visualize it with the vip package.

library(vip)
vip(credit.rf, num_features = 15, geom = "point")



Boosting

Modeling

library(gbm)
credit.boost<- gbm(default~., data = credit.train, distribution = "bernoulli", 
                   n.trees = 5000, cv.folds = 5, n.cores = 5)

The argument distribution specifies the type of likelihood function, n.trees specifies the maximum number of trees, and cv.folds is optional. We use cross-validation to choose the best number of trees; recall that boosting can easily overfit the data. The last argument, n.cores, specifies the number of cores used for parallel computing.

# relative influence
summary(credit.boost)

##                 var    rel.inf
## PAY_0         PAY_0 28.7831033
## PAY_2         PAY_2  6.8544376
## BILL_AMT1 BILL_AMT1  5.8828048
## BILL_AMT3 BILL_AMT3  5.2269399
## BILL_AMT2 BILL_AMT2  5.0725585
## AGE             AGE  4.4599509
## PAY_3         PAY_3  3.9919860
## BILL_AMT6 BILL_AMT6  3.8336082
## BILL_AMT5 BILL_AMT5  3.7972611
## LIMIT_BAL LIMIT_BAL  3.7824630
## BILL_AMT4 BILL_AMT4  3.2437609
## PAY_AMT1   PAY_AMT1  3.1754604
## PAY_5         PAY_5  3.1418165
## PAY_AMT2   PAY_AMT2  3.1248074
## PAY_AMT3   PAY_AMT3  2.8268863
## PAY_AMT6   PAY_AMT6  2.6192079
## PAY_AMT4   PAY_AMT4  2.3484584
## PAY_6         PAY_6  2.2699214
## PAY_AMT5   PAY_AMT5  2.1545556
## PAY_4         PAY_4  1.8943737
## EDUCATION EDUCATION  0.6643631
## MARRIAGE   MARRIAGE  0.5458253
## SEX             SEX  0.3054499

Below is the cross-validation figure for choosing the optimal number of trees. The black curve shows the training error and the green curve the CV error.

best.iter <- (gbm.perf(credit.boost, method = "cv"))

best.iter
## [1] 405

Prediction

# predicted probability
pred.credit.boost<- predict(credit.boost, newdata = credit.test, n.trees = best.iter, type="response")
# AUC
pred <- prediction(pred.credit.boost, credit.test$default)
# perf <- performance(pred, "tpr", "fpr")
# plot(perf, colorize=TRUE)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7798897

The predicted value is the probability if we specify type="response", as in logistic regression.

Below we show the confusion matrix using the cutoff as the sample proportion.

pred.credit.boost.class<- (pred.credit.boost>mean(credit.train$default))*1
table(credit.test$default, pred.credit.boost.class, dnn = c("True", "Pred"))
##     Pred
## True    0    1
##    0 3766  888
##    1  516  830

Compare with the logit model

#Fit logistic regression model
credit.glm<- glm(default~., data = credit.train, family=binomial)
pred.glm<- predict(credit.glm, credit.test, type="response")
# AUC
pred <- prediction(pred.glm, credit.test$default)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7206418
#Get binary prediction
credit.test.pred.glm<- (pred.glm>mean(credit.train$default))*1
#Confusion matrix
table(credit.test$default, credit.test.pred.glm, dnn=c("True","Pred"))
##     Pred
## True    0    1
##    0 3182 1472
##    1  482  864



XGBoost

library(xgboost)
ptm <- proc.time()
fit.xgboost.class<- xgboost(data = model.matrix(~., credit.train[,-24])[,-1], label = credit.train$default, 
                            eta = 0.1, nthread = 6, nrounds = 100, objective = "binary:logistic", verbose = 0)
proc.time()-ptm
##    user  system elapsed 
##    3.39    0.19    0.77
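
Above we fixed nrounds = 100 without tuning. As with gbm(), we can choose the number of rounds by cross-validation; below is a minimal sketch using xgb.cv() with early stopping (the settings are illustrative, not tuned):

# Sketch: 5-fold CV with early stopping to choose nrounds (illustrative settings)
dtrain <- xgb.DMatrix(model.matrix(~., credit.train[,-24])[,-1], label = credit.train$default)
cv.xgb <- xgb.cv(params = list(objective = "binary:logistic", eta = 0.1),
                 data = dtrain, nrounds = 200, nfold = 5,
                 early_stopping_rounds = 10, verbose = 0)
cv.xgb$best_iteration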

Prediction

pred.credit.xgboost<- predict(fit.xgboost.class, newdata = model.matrix(~., credit.test[,-24])[,-1])
# AUC
pred <- prediction(pred.credit.xgboost, credit.test$default)
# perf <- performance(pred, "tpr", "fpr")
# plot(perf, colorize=TRUE)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7836254

Confusion matrix

Below we show the confusion matrix using the cutoff as the sample proportion.

pred.credit.xgboost.class<- (pred.credit.xgboost>mean(credit.train$default))*1
table(credit.test$default, pred.credit.xgboost.class, dnn = c("True", "Pred"))
##     Pred
## True    0    1
##    0 3664  990
##    1  482  864
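
As with random forests and gbm(), we can look at variable importance. A quick sketch using xgboost's built-in, gain-based importance measure:

# Gain-based variable importance from the fitted booster
head(xgb.importance(model = fit.xgboost.class))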

XGBoost for large datasets – MNIST data

We can’t see the advantage of XGBoost in the above example (small data); it even takes longer than gbm(). However, its fast computing speed becomes apparent in large-scale data analysis.

ptm <- proc.time()
xgboost.digit.fit<- xgboost(data = train.x, label = train.y, eta = 0.2,  num_class=10,
                    nthread = 8, nrounds = 100, objective = "multi:softmax", verbose = 0)
proc.time()-ptm
##    user  system elapsed 
##  579.08    1.11   75.08
xgboost.digit.pred = predict(xgboost.digit.fit, test.x)
mean(xgboost.digit.pred==test.y)
## [1] 0.9702381

Over 97% accuracy!

Compare with gbm().

It will take a very long time, possibly hours, depending on the computing power.

ptm <- proc.time()
gbm.digit.fit<- gbm(train.y~., data=data.frame(train.y, train.x), distribution = "multinomial", 
                    shrinkage = 0.1, n.trees = 500, cv.folds=5, n.cores = 8)
proc.time()-ptm

# performance
best.iter <- (gbm.perf(gbm.digit.fit, method = "cv"))
gbm.digit.pred <- predict(gbm.digit.fit, data.frame(test.y, test.x), n.trees = best.iter, type = "response")  # n x 10 x 1 array of class probabilities
gbm.digit.pred <- apply(gbm.digit.pred, 1, which.max)-1  # most probable class; digit labels run 0-9
mean(gbm.digit.pred==test.y)
