In this lab, we show how to do classification using tree-based ensemble methods.
library(tidyverse)
# load credit card data
credit.data <- read_csv("https://www.dropbox.com/s/tnoo06n8m842uit/credit_card_default.csv?dl=1")
credit.data<- rename(credit.data, default=`default payment next month`)
# convert categorical data to factor
credit.data$SEX<- as.factor(credit.data$SEX)
credit.data$EDUCATION<- as.factor(credit.data$EDUCATION)
credit.data$MARRIAGE<- as.factor(credit.data$MARRIAGE)
# random splitting
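# hypothetical seed: the split below is random, so fixing a seed makes your run reproducible
# (the outputs shown in this lab were generated without one, so your numbers will differ slightly)
set.seed(2021)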
index <- sample(nrow(credit.data),nrow(credit.data)*0.80)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]
# load MNIST data
digit<- data.matrix(read_csv("https://www.dropbox.com/s/ulujvi2a4ykfzju/train.csv?dl=1"))
# random splitting
index<- sample(1:nrow(digit), 0.6*nrow(digit))
train<- digit[index,]
test<- digit[-index,]
# standardize X
train.x <- train[,-1] #remove 'label' column
test.x<- test[,-1]
train.y <- train[,1] #label column
test.y<- test[,1]
train.x <- train.x/255
test.x <- test.x/255
library(ipred)
credit.bag<- bagging(as.factor(default)~., data = credit.train, nbagg=100, coob=T, method="class")
predprob.test.bag<- predict(credit.bag, newdata = credit.test, type="prob")[,2]
table(credit.test$default, (predprob.test.bag>0.2)*1, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 3133 1521
## 1 415 931
library(ROCR)
pred <- prediction(predprob.test.bag, credit.test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7601595
The classification results are generated by specifying type="class".
predclass.test.bag<- predict(credit.bag, newdata = credit.test, type="class")
table(credit.test$default, predclass.test.bag, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 4387 267
## 1 850 496
Now we compare with a single tree. For a fair comparison, we keep the loss asymmetric and use the same 5:1 loss matrix in rpart().
library(rpart)
# loss matrix (rows = truth, columns = prediction): FN (true 1, pred 0) costs 5; FP costs 1
credit.rpart <- rpart(formula = default ~ ., data = credit.train,
                      method = "class", parms=list(loss=matrix(c(0,5,1,0), nrow=2)))
credit.test.pred.tree1<- predict(credit.rpart, credit.test, type="class")
table(credit.test$default, credit.test.pred.tree1, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 3032 1622
## 1 373 973
Which one is better? One way to answer this is to compare their asymmetric costs; see the sketch right after this paragraph. Note that parms=list(loss=matrix(c(0,5,1,0), nrow=2)) does not work in bagging(). As far as I know, bagging() cannot handle asymmetric loss (there may be other packages that can). However, we can write our own bagging algorithm based on rpart, and it is pretty straightforward.
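Here is a minimal sketch of that comparison: a small helper, cost() (defined here for this lab, not part of any package), computes the average asymmetric cost with the 5:1 weights from the loss matrix above. From the two confusion matrices, it comes to (5*415 + 1521)/6000 ≈ 0.599 for bagging with cutoff 0.2 versus (5*373 + 1622)/6000 ≈ 0.581 for the single tree, so the single tree happens to do slightly better here.
# helper defined for this lab: average asymmetric cost (FN costs w.FN, FP costs w.FP)
cost <- function(obs, pred, w.FN = 5, w.FP = 1) {
  mean(w.FN*(obs == 1 & pred == 0) + w.FP*(obs == 0 & pred == 1))
}
cost(credit.test$default, (predprob.test.bag > 0.2)*1)  # bagging, cutoff 0.2
cost(credit.test$default, credit.test.pred.tree1)       # single tree, 5:1 loss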
library(doParallel) # for parallel backend to foreach
library(foreach) # for parallel processing with for loops
detectCores() # detect how many cores are on your PC
## [1] 16
cl <- makeCluster(8) # use 8 workers
registerDoParallel(cl) # register the parallel backend
ptc <- proc.time() # time stamp
mybag.predictions <- foreach(i = 1:100, .packages = "rpart", .combine = cbind) %dopar% {
# bootstrap copy of training data
index <- sample(nrow(credit.train), replace = TRUE)
credit_train_boot <- credit.train[index, ]
# fit a tree with the 5:1 loss to the bootstrap copy
bagged_tree <- rpart(formula = default ~ ., data = credit_train_boot, method = "class",
parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
# return class predictions; .combine = cbind coerces the factors to integer codes 1/2
predict(bagged_tree, newdata = credit.test, type="class")
}
stopCluster(cl)
proc.time()-ptc # time stamp
## user system elapsed
## 0.34 0.03 24.72
Let’s look at the results from parallel computing and reshape them into what we want. Each column holds one tree’s class predictions, which cbind coerced from the factor levels 0/1 to the integer codes 1/2.
dim(mybag.predictions)
## [1] 6000 100
mybag.predictions[1:10,1:10]
## result.1 result.2 result.3 result.4 result.5 result.6 result.7 result.8
## 1 1 2 2 2 2 1 2 1
## 2 2 2 2 2 2 2 2 2
## 3 1 1 1 1 2 1 2 1
## 4 1 1 2 1 2 1 2 1
## 5 2 2 2 2 2 2 2 2
## 6 2 2 2 2 2 2 2 2
## 7 1 1 1 1 1 1 1 1
## 8 1 1 1 1 1 1 1 1
## 9 2 2 2 2 2 2 2 2
## 10 1 1 1 1 1 1 1 1
## result.9 result.10
## 1 1 2
## 2 2 2
## 3 1 2
## 4 1 2
## 5 2 2
## 6 2 2
## 7 1 1
## 8 1 1
## 9 2 2
## 10 1 1
mybag.predictions <- mybag.predictions-1 # map integer codes 1/2 back to classes 0/1
mybag.pred <- apply(mybag.predictions, 1, mean) # fraction of trees voting "default" (class 1)
summary(mybag.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.5700 0.4788 1.0000 1.0000
mybag.pred.class <- (mybag.pred>0.5)*1 # majority vote: predict 1 if more than half the trees vote 1
table(credit.test$default, mybag.pred.class, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 2598 2056
## 1 298 1048
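Using the cost() helper from before, the 5:1 asymmetric cost of our hand-rolled bagging is (5*298 + 2056)/6000 ≈ 0.591, in the same range as the numbers we saw above.
cost(credit.test$default, mybag.pred.class) # 5:1 asymmetric cost of the hand-rolled bagging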
library(randomForest)
credit.rf<- randomForest(as.factor(default)~., data = credit.train, cutoff=c(4/5,1/5)) # classwt does not change the results
credit.rf
##
## Call:
## randomForest(formula = as.factor(default) ~ ., data = credit.train, cutoff = c(4/5, 1/5))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 31.28%
## Confusion matrix:
## 0 1 class.error
## 0 12838 5872 0.3138429
## 1 1636 3654 0.3092628
The argument cutoff specifies the cutoff probability for each class (e.g. Y=0/1) in prediction. In our example we specify (4/5, 1/5), meaning that the vote share needed to predict 0 is at least 4/5 and to predict 1 is at least 1/5, instead of half-half. Therefore, cutoff essentially plays the role of the asymmetric loss, so the misclassification rate or OOB error rate may not be a good measure of prediction accuracy, and you have to calculate your own asymmetric cost.
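For example, with the 5:1 weights and the cost() helper defined earlier, the OOB confusion matrix above gives (5*1636 + 5872)/24000 ≈ 0.586:
# 5:1 asymmetric OOB cost; credit.rf$predicted holds the OOB class predictions
cost(credit.train$default, credit.rf$predicted)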
We can again easily plot the error rate vs. ntree.
plot(credit.rf, lwd=rep(2, 3))
legend("right", legend = c("OOB Error", "FPR", "FNR"), lwd=rep(2, 3), lty = c(1,2,3), col = c("black", "red", "green"))
You may try different specifications of cutoff and see how the confusion matrix changes; a minimal sketch follows. In addition, for prediction you can also apply your own cutoff to the probabilistic predictions, as we learned in logistic regression.
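A minimal sketch: predict.randomForest also accepts a cutoff argument, so we can rescore the same forest with, say, an even (hypothetical) half-half cutoff and compare the confusion matrix.
# same forest, rescored with a hypothetical half-half cutoff
credit.rf.even <- predict(credit.rf, newdata = credit.test, cutoff = c(1/2, 1/2))
table(credit.test$default, credit.rf.even, dnn = c("True", "Pred"))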
credit.rf.pred<- predict(credit.rf, newdata=credit.test, type = "prob")[,2]
library(ROCR)
pred <- prediction(credit.rf.pred, credit.test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7680649
Below is the confusion matrix based on the class predictions.
credit.rf.class.test<- predict(credit.rf, newdata=credit.test, type = "class")
table(credit.test$default, credit.rf.class.test, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 3228 1426
## 1 415 931
Below is the confusion matrix based on probabilistic prediction with a user-specified cutoff (the overall default rate in the training data).
credit.rf.class.test<- (credit.rf.pred>mean(credit.train$default))*1
table(credit.test$default, credit.rf.class.test, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 3442 1212
## 1 458 888
credit.rf$importance
## MeanDecreaseGini
## LIMIT_BAL 423.26464
## SEX 79.94966
## EDUCATION 157.88440
## MARRIAGE 90.62258
## AGE 460.98686
## PAY_0 779.53534
## PAY_2 373.80639
## PAY_3 235.54445
## PAY_4 181.19049
## PAY_5 172.83620
## PAY_6 155.66403
## BILL_AMT1 489.66804
## BILL_AMT2 446.47195
## BILL_AMT3 424.90634
## BILL_AMT4 413.26852
## BILL_AMT5 411.29915
## BILL_AMT6 416.97039
## PAY_AMT1 415.68683
## PAY_AMT2 400.67271
## PAY_AMT3 381.79284
## PAY_AMT4 356.95726
## PAY_AMT5 360.10407
## PAY_AMT6 375.32128
We can visualize the variable importance with the vip package.
library(vip)
vip(credit.rf, num_features = 15, geom = "point")
library(gbm)
credit.boost<- gbm(default~., data = credit.train, distribution = "bernoulli",
n.trees = 5000, cv.folds = 5, n.cores = 5)
The argument distribution specifies the type of likelihood function, n.trees specifies the maximum number of trees, and cv.folds is optional: we use cross-validation to choose the best number of trees, since boosting can easily overfit the data. The last argument, n.cores, specifies the number of cores used for parallel computing.
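Two more arguments that are commonly tuned are shrinkage (the learning rate, default 0.1) and interaction.depth (the depth of each tree, default 1). A sketch with hypothetical, untuned values:
# sketch only: hypothetical values for the learning rate and tree depth, not tuned for this data
credit.boost2 <- gbm(default~., data = credit.train, distribution = "bernoulli",
                     n.trees = 5000, shrinkage = 0.05, interaction.depth = 3,
                     cv.folds = 5, n.cores = 5)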
# relative influence
summary(credit.boost)
## var rel.inf
## PAY_0 PAY_0 28.7831033
## PAY_2 PAY_2 6.8544376
## BILL_AMT1 BILL_AMT1 5.8828048
## BILL_AMT3 BILL_AMT3 5.2269399
## BILL_AMT2 BILL_AMT2 5.0725585
## AGE AGE 4.4599509
## PAY_3 PAY_3 3.9919860
## BILL_AMT6 BILL_AMT6 3.8336082
## BILL_AMT5 BILL_AMT5 3.7972611
## LIMIT_BAL LIMIT_BAL 3.7824630
## BILL_AMT4 BILL_AMT4 3.2437609
## PAY_AMT1 PAY_AMT1 3.1754604
## PAY_5 PAY_5 3.1418165
## PAY_AMT2 PAY_AMT2 3.1248074
## PAY_AMT3 PAY_AMT3 2.8268863
## PAY_AMT6 PAY_AMT6 2.6192079
## PAY_AMT4 PAY_AMT4 2.3484584
## PAY_6 PAY_6 2.2699214
## PAY_AMT5 PAY_AMT5 2.1545556
## PAY_4 PAY_4 1.8943737
## EDUCATION EDUCATION 0.6643631
## MARRIAGE MARRIAGE 0.5458253
## SEX SEX 0.3054499
Below is the figure of cross-validation for choosing the optimal number of trees: the black curve is the training error and the green curve the CV error.
best.iter <- (gbm.perf(credit.boost, method = "cv"))
best.iter
## [1] 405
# predicted probability
pred.credit.boost<- predict(credit.boost, newdata = credit.test, n.trees = best.iter, type="response")
# AUC
pred <- prediction(pred.credit.boost, credit.test$default)
# perf <- performance(pred, "tpr", "fpr")
# plot(perf, colorize=TRUE)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7798897
The predicted value is the probability if we specify type="response", as in logistic regression.
Below we show the confusion matrix using the cutoff as the sample proportion.
pred.credit.boost.class<- (pred.credit.boost>mean(credit.train$default))*1
table(credit.test$default, pred.credit.boost.class, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 3766 888
## 1 516 830
#Fit logistic regression model
credit.glm<- glm(default~., data = credit.train, family=binomial)
pred.glm<- predict(credit.glm, credit.test, type="response")
# AUC
pred <- prediction(pred.glm, credit.test$default)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7206418
#Get binary prediction
credit.test.pred.glm<- (pred.glm>mean(credit.train$default))*1
#Confusion matrix
table(credit.test$default, credit.test.pred.glm, dnn=c("True","Pred"))
## Pred
## True 0 1
## 0 3182 1472
## 1 482 864
library(xgboost)
ptm <- proc.time()
# xgboost needs a numeric matrix: model.matrix() dummy-encodes the factors
# (drop the intercept column; column 24 of the data is the response `default`)
fit.xgboost.class<- xgboost(data = model.matrix(~., credit.train[,-24])[,-1], label = credit.train$default,
eta = 0.1, nthread = 6, nrounds = 100, objective = "binary:logistic", verbose = 0)
proc.time()-ptm
## user system elapsed
## 3.39 0.19 0.77
pred.credit.xgboost<- predict(fit.xgboost.class, newdata = model.matrix(~., credit.test[,-24])[,-1])
# AUC
pred <- prediction(pred.credit.xgboost, credit.test$default)
# perf <- performance(pred, "tpr", "fpr")
# plot(perf, colorize=TRUE)
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7836254
Below we show the confusion matrix using the cutoff as the sample proportion.
pred.credit.xgboost.class<- (pred.credit.xgboost>mean(credit.train$default))*1
table(credit.test$default, pred.credit.xgboost.class, dnn = c("True", "Pred"))
## Pred
## True 0 1
## 0 3664 990
## 1 482 864
We can’t see the advantage of XGBoost in the above example (small data); it even takes longer than gbm(). However, its fast computing speed shows up in large-scale data analysis.
ptm <- proc.time()
# objective multi:softmax does multiclass classification and predicts the class label directly
xgboost.digit.fit<- xgboost(data = train.x, label = train.y, eta = 0.2, num_class=10,
nthread = 8, nrounds = 100, objective = "multi:softmax", verbose = 0)
proc.time()-ptm
## user system elapsed
## 579.08 1.11 75.08
xgboost.digit.pred = predict(xgboost.digit.fit, test.x)
mean(xgboost.digit.pred==test.y)
## [1] 0.9702381
Over 97% accuracy!
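To see which digits get confused with which, we can cross-tabulate the truth against the predictions (a quick sketch; with multi:softmax the prediction is already a class label):
# rows are the true digits, columns the predicted digits
table(True = test.y, Pred = xgboost.digit.pred)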
For comparison, we can fit the same model with gbm(). It will take a very long time, possibly hours depending on the computing power.
ptm <- proc.time()
gbm.digit.fit<- gbm(train.y~., data=data.frame(train.y, train.x), distribution = "multinomial",
shrinkage = 0.1, n.trees = 500, cv.folds=5, n.cores = 8)
proc.time()-ptm
# performance
best.iter <- (gbm.perf(gbm.digit.fit, method = "cv"))
gbm.digit.pred <- predict(gbm.digit.fit, data.frame(test.y, test.x), n.trees = best.iter, type = "response")
# predict() returns an n x 10 x 1 array of class probabilities; pick the most probable digit (labels 0-9)
gbm.digit.pred <- apply(gbm.digit.pred, 1, which.max)-1
mean(gbm.digit.pred==test.y)