1 加载经常用的R包

library(pacman)
# 读数据
p_load(readxl,writexl,data.table,openxlsx,haven,rvest)

# 数据探索
p_load(tidyverse,DT,skimr,DataExplorer,explore,vtable,stringr,kableExtra,lubridate)

# 模型
p_load(grf,glmnet,caret,tidytext,fpp2,forecast,car,tseries,hdm,tidymodels,broom)

# 可视化
p_load(patchwork,ggrepel,ggcorrplot,gghighlight,ggthemes,shiny)

# 其它常用包
p_load(magrittr,listviewer,devtools,here,janitor,reticulate,jsonlite)

之前分别介绍了机器学习概要、数据预处理(这个最重要)以及模型调参。本次重点介绍模型训练与集成。

继续回到之前提到的相亲数据，在实际业务开展时，发现相亲失败时不仅客户会心情低落，对于组织相亲的人来说，也会很难过。那么是否可以提升模型预测的精确度，增加相亲成功率呢？(优秀)

2 读入数据

data <- data.table::fread(here::here("Machine_Learning_and_Causal_Inference/data/相亲数据重新编码.csv"))

data

3 数据预处理(最重要且最费时间)

3.1 删除缺失值

data <- data %>% drop_na()
data %>% dim()  # 5723   29

## [1] 5723   29

3.2 删除近零方差

零方差或者近零方差的变量传递不了什么信息，因为几乎所有人的取值都一样。可以利用caret包中的nearZeroVar()函数，一行代码就能找出近零方差的变量，操作过程非常简单。

nearZeroVar(data) # 没有近零变量

## integer(0)

不用的代码注释掉

# data_training_dropvariable <- data_training[,-nearZeroVar(data_training)]
# data_testing_dropvariable <- data_testing[,-nearZeroVar(data_training)]
# data_training_dropvariable

3.3 数据类型变换

data %>%
  mutate(决定   = factor(决定,
                         levels = c(0, 1),
                         labels = c("拒绝", "接受"))) %>%
  mutate(性别   = factor(性别,
                         levels = c(0, 1),
                         labels = c("女", "男"))) %>%
  mutate(种族   = factor(
    种族,
    levels = c(1, 2, 3, 4, 5, 6),
    labels = c("非洲裔", "欧洲裔", "拉丁裔", "亚裔", "印第安土著", "其他")
  )) %>%
  mutate(从事领域 = factor(
    从事领域,
    levels = 1:18,
    labels = c(
      "法律",
      "数学",
      "社会科学或心理学",
      "医学或药物学或生物技术",
      "工程学",
      "写作或新闻",
      "历史或宗教或哲学",
      "商业或经济或金融",
      "教育或学术",
      "生物科学或化学或物理",
      "社会工作",
      "大学在读或未择方向",
      "政治学或国际事务",
      "电影",
      "艺术管理",
      "语言",
      "建筑学",
      "其他"
    )
  )) %>%
  mutate(对方决定  = factor(对方决定,
                            levels = 0:1,
                            labels = c("拒绝", "接收"))) %>%
  mutate(对方种族  = factor(
    对方种族,
    levels = c(1, 2, 3, 4, 5, 6),
    labels = c("非洲裔", "欧洲裔", "拉丁裔", "亚裔", "印第安土著", "其他")
  )) %>%
  mutate(是否同一种族  = factor(
    是否同一种族,
    levels = c(0, 1),
    labels = c("非同一种族", "同一种族") 
  )) -> data

data %>% map(unique)

## $决定
## [1] 接受 拒绝
## Levels: 拒绝 接受
## 
## $性别
## [1] 女 男
## Levels: 女 男
## 
## $吸引力
##  [1]  6.0  7.0  5.0  4.0  8.0  9.0  3.0 10.0  2.0  1.0  0.0  6.5  7.5  9.5  8.5
## [16]  9.9  3.5
## 
## $共同爱好
##  [1]  5.0  6.0  8.0  4.0  7.0  3.0  2.0  9.0 10.0  1.0  0.0  7.5  6.5  8.5  5.5
## 
## $幽默
##  [1]  7.0  8.0  4.0  6.0  9.0  3.0  5.0 10.0  2.0  1.0  0.0  5.5  6.5  9.5  7.5
## [16]  8.5
## 
## $真诚
##  [1]  9.0  8.0  6.0  7.0  5.0 10.0  4.0  3.0  2.0  1.0  0.0  8.5  7.5
## 
## $雄心
##  [1]  6.0  5.0  8.0 10.0  9.0  3.0  7.0  4.0  2.0  1.0  0.0  9.5  7.5  8.5
## 
## $智力
##  [1]  7.0  8.0  6.0  9.0 10.0  5.0  4.0  3.0  2.0  1.0  0.0  6.5  8.5  7.5  5.5
## 
## $好感
##  [1]  7.0  6.0  8.0  5.0  9.0  4.0 10.0  2.0  3.0  6.5  1.0  8.5  9.5  0.0  7.5
## [16]  5.5  4.5  9.7
## 
## $成功率自估
##  [1]  6.0  5.0  7.0  4.0  3.0  8.0  1.0 10.0  2.0  9.0  0.0  6.5  7.5  8.5  9.5
## [16]  5.5  3.5  4.5
## 
## $日常出门频率
## [1] 1 4 2 3 5 7 6
## 
## $对宗教的看重程度
##  [1]  4  5  1  3  2  8 10  6  7  9
## 
## $对种族的看重程度
##  [1]  2  8  1  4  7  3  9 10  5  6  0
## 
## $年龄
##  [1] 21 24 25 23 22 26 27 30 28 29 34 35 32 20 19 18 33 36 31 42 38 55
## 
## $种族
## [1] 亚裔   欧洲裔 其他   拉丁裔 非洲裔
## Levels: 非洲裔 欧洲裔 拉丁裔 亚裔 印第安土著 其他
## 
## $从事领域
##  [1] 法律                   数学                   政治学或国际事务      
##  [4] 商业或经济或金融       工程学                 教育或学术            
##  [7] 社会科学或心理学       社会工作               大学在读或未择方向    
## [10] 医学或药物学或生物技术 历史或宗教或哲学       写作或新闻            
## [13] 生物科学或化学或物理   电影                   语言                  
## [16] 艺术管理               建筑学                 其他                  
## 18 Levels: 法律 数学 社会科学或心理学 医学或药物学或生物技术 ... 其他
## 
## $对方决定
## [1] 拒绝 接收
## Levels: 拒绝 接收
## 
## $好感得分
##  [1]  7.0  8.0  2.0  6.5  6.0 10.0  9.0  4.0  5.0  3.0  1.0  9.5  7.5  4.5  8.5
## [16]  5.5  0.0  9.7
## 
## $对方评估成功率
##  [1]  4.0  7.0  6.0  1.0  5.0  8.0  2.0 10.0  3.0  9.0  4.5  6.5  5.5  0.0  7.5
## [16]  8.5  9.5  3.5
## 
## $吸引力得分
##  [1]  6.0  7.0  8.0  3.0 10.0  9.0  5.0  4.0  2.0  1.0  0.0  6.5  7.5  8.5  9.5
## [16]  9.9  3.5
## 
## $共同爱好得分
##  [1]  6.0  5.0  8.0  7.0  9.0  4.0 10.0  3.0  2.0  1.0  7.5  6.5  8.5  0.0  5.5
## 
## $幽默得分
##  [1]  8.0  7.0  6.0  5.0  9.0 10.0  3.0  4.0  2.0  1.0  5.5  6.5  9.5  0.0  7.5
## [16]  8.5 11.0
## 
## $真诚得分
##  [1]  8.0  7.0  6.0 10.0  9.0  3.0  5.0  4.0  2.0  1.0  4.5  8.5  0.0  7.5
## 
## $雄心得分
##  [1]  8.0  7.0  9.0  6.0 10.0  5.0  4.0  2.0  3.0  1.0  5.5  9.5  0.0  7.5  8.5
## 
## $智力得分
##  [1]  8.0 10.0  9.0  7.0  5.0  6.0  4.0  3.0  1.0  2.0  6.5  8.5  0.0  7.5  5.5
## 
## $对方年龄
##  [1] 27 22 23 24 25 30 28 21 26 29 32 35 34 18 20 19 33 31 36 42 38 55
## 
## $对方种族
## [1] 欧洲裔 拉丁裔 亚裔   其他   非洲裔
## Levels: 非洲裔 欧洲裔 拉丁裔 亚裔 印第安土著 其他
## 
## $是否同一种族
## [1] 非同一种族 同一种族  
## Levels: 非同一种族 同一种族
## 
## $日常约会频率
## [1] 7 5 3 4 6 1 2

3.4 数据划分

set.seed(1234)
data_id <- createDataPartition(y = data$决定,p = 0.75,times = 1,list = FALSE)
data_training <- data[data_id,]
data_testing <- data[-data_id,]

data_training$决定 %>% table() %>% prop.table()

## .
##      拒绝      接受 
## 0.5599814 0.4400186

data_testing$决定 %>% table() %>% prop.table()

## .
##      拒绝      接受 
## 0.5601399 0.4398601

data$决定 %>% table() %>% prop.table()

## .
##     拒绝     接受 
## 0.560021 0.439979

3.5 删除共线性变量

caret包中的findCorrelation()函数会自动找到高度共线性的变量，并给出建议删除的变量。

但需要注意，这个函数对输入的数据要求比较高：

首先，数据中不能有缺失值，所以在此之前需要先处理缺失值；
其次，只能包含数值型变量。

data_training %>% 
  select(-nearZeroVar(data_training)) %>% 
  select(where(is.numeric)) %>% 
  cor() -> data_cor
data_cor %>% 
  round(1) %>% ggcorrplot::ggcorrplot(lab = TRUE,hc.order = TRUE,type = "lower") + mytheme

data_high_cor <- findCorrelation(data_cor,cutoff = 0.75,verbose = TRUE,names = TRUE)

## All correlations <= 0.75

data_high_cor   # 这个数据真好，多重共线性也没有！现实里别想有这么好的数据集

## character(0)

3.6 标准化(可选择)

为什么要标准化？很简单，看看年龄，几十万岁，但是智力这个变量最高也才10分，这两列变量的量纲不同，为了防止年龄的权重过高，就需要将这些特征进行标准化才能学习各个变量真实的权重。需要注意的是：只能拿训练集的均值和标准差来对测试集进行标准化。

# data_proprocess_std <- preProcess(data_training,method = c("scale","center"))
# 
# data_training_std <- predict(data_proprocess_value,data_training)
# data_testing_std <- predict(data_proprocess_value,data_testing)

总结：目前就对数据进行了两项处理：删除缺失值和数据类型转换.。

4 模型调参(默认参数)

可以使用网格搜索和随机搜索，就是后者慢！

set.seed(1234)  # 设置种子
fit_control <- trainControl(
  method = "cv",
  number = 5,             # 5折交叉验证
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

5 逻辑回归

逻辑回归可以说是最基础的分类模型，它度量的是Y=1的可能性。图1为经典逻辑回归的一个例子，自变量包括5个，因变量为“是否女神”。逻辑回归模型利用训练集对不同的自变量赋予不同的权重，这些自变量线性组合得到z。z通过logit函数转换，就得到了“女神的概率”。

knitr::include_graphics(here::here("Machine_Learning_and_Causal_Inference/fig/logis.png"))

Figure 5.1: 逻辑回归

再看看数据

data_training

data_testing

data %>% str()

## Classes 'data.table' and 'data.frame':   5723 obs. of  29 variables:
##  $ 决定            : Factor w/ 2 levels "拒绝","接受": 2 2 2 2 1 2 1 2 2 1 ...
##  $ 性别            : Factor w/ 2 levels "女","男": 1 1 1 1 1 1 1 1 1 1 ...
##  $ 吸引力          : num  6 7 7 5 4 7 4 7 5 5 ...
##  $ 共同爱好        : num  5 6 8 6 4 7 6 8 8 3 ...
##  $ 幽默            : num  7 8 7 7 4 4 6 9 8 4 ...
##  $ 真诚            : num  9 8 6 6 9 6 9 6 6 7 ...
##  $ 雄心            : num  6 5 6 6 6 6 5 8 10 6 ...
##  $ 智力            : num  7 7 8 7 7 7 7 8 6 8 ...
##  $ 好感            : num  7 7 7 6 6 6 6 7 6 6 ...
##  $ 成功率自估      : num  6 5 6 6 5 5 7 7 6 4 ...
##  $ 日常出门频率    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 对宗教的看重程度: int  4 4 4 4 4 4 4 4 4 5 ...
##  $ 对种族的看重程度: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ 年龄            : int  21 21 21 21 21 21 21 21 21 24 ...
##  $ 种族            : Factor w/ 6 levels "非洲裔","欧洲裔",..: 4 4 4 4 4 4 4 4 4 2 ...
##  $ 从事领域        : Factor w/ 18 levels "法律","数学",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ 对方决定        : Factor w/ 2 levels "拒绝","接收": 1 1 2 2 2 1 1 2 1 1 ...
##  $ 好感得分        : num  7 8 7 8 7 2 7 6.5 6 7 ...
##  $ 对方评估成功率  : num  4 4 7 6 6 1 5 8 6 2 ...
##  $ 吸引力得分      : num  6 7 7 8 7 3 6 7 6 8 ...
##  $ 共同爱好得分    : num  6 5 8 7 7 7 6 9 6 4 ...
##  $ 幽默得分        : num  8 7 8 6 8 5 6 8 6 9 ...
##  $ 真诚得分        : num  8 8 8 7 7 6 7 7 6 7 ...
##  $ 雄心得分        : num  8 7 9 9 7 8 8 8 6 7 ...
##  $ 智力得分        : num  8 10 9 9 8 7 5 8 6 6 ...
##  $ 对方年龄        : int  27 22 23 24 25 30 27 28 24 27 ...
##  $ 对方种族        : Factor w/ 6 levels "非洲裔","欧洲裔",..: 2 2 2 3 2 2 2 2 2 2 ...
##  $ 是否同一种族    : Factor w/ 2 levels "非同一种族","同一种族": 1 1 1 1 1 1 1 1 1 2 ...
##  $ 日常约会频率    : int  7 7 7 7 7 7 7 7 7 5 ...
##  - attr(*, ".internal.selfref")=<externalptr>

那么，在caret包中如何实现逻辑回归呢？代码只要几行即可，如下所示：

set.seed(1234)
model_logis <- train(决定 ~ .,
                     data = data_training,
                     method = "glm",     # 方法
                     trControl = fit_control,
                     family = "binomial",
                     metric = "ROC")  # 指标

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

model_logis_pre <- predict(model_logis,newdata = data_testing)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

confusionMatrix(model_logis_pre,data_testing$决定)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 拒绝 接受
##       拒绝  642  166
##       接受  159  463
##                                           
##                Accuracy : 0.7727          
##                  95% CI : (0.7501, 0.7942)
##     No Information Rate : 0.5601          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5382          
##                                           
##  Mcnemar's Test P-Value : 0.7393          
##                                           
##             Sensitivity : 0.8015          
##             Specificity : 0.7361          
##          Pos Pred Value : 0.7946          
##          Neg Pred Value : 0.7444          
##              Prevalence : 0.5601          
##          Detection Rate : 0.4490          
##    Detection Prevalence : 0.5650          
##       Balanced Accuracy : 0.7688          
##                                           
##        'Positive' Class : 拒绝            
##

6 决策树

决策树是机器学习中常用的基础树模型。前面介绍了一个如何判断是否为女神的例子，下面就利用决策树来介绍一个男生追女神的故事（见下图）。首先判断女生是不是女神，如果是，则看女神是否单身。对于单身女神，又可以分为喜欢我的和不喜欢我的。对于喜欢我的单身女神，果断选择追，其他情况下都选择不追。这就是决策树模型的最终输出呈现。

knitr::include_graphics(here::here("Machine_Learning_and_Causal_Inference/fig/tree.png"))

Figure 6.1: 决策树

那么，在caret包中如何实现决策树呢？在method中设置参数为”rpart”即可。

set.seed(1234)
model_tree <- train(决定 ~ .,
                     data = data_training,
                     method = "rpart",     # 方法
                     trControl = fit_control,
                     metric = "ROC")  # 指标

model_tree_pre <- predict(model_tree,newdata = data_testing)

confusionMatrix(model_tree_pre,data_testing$决定)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 拒绝 接受
##       拒绝  705  240
##       接受   96  389
##                                           
##                Accuracy : 0.765           
##                  95% CI : (0.7422, 0.7868)
##     No Information Rate : 0.5601          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5112          
##                                           
##  Mcnemar's Test P-Value : 6.128e-15       
##                                           
##             Sensitivity : 0.8801          
##             Specificity : 0.6184          
##          Pos Pred Value : 0.7460          
##          Neg Pred Value : 0.8021          
##              Prevalence : 0.5601          
##          Detection Rate : 0.4930          
##    Detection Prevalence : 0.6608          
##       Balanced Accuracy : 0.7493          
##                                           
##        'Positive' Class : 拒绝            
##

7 随机森林

随机森林是通过将多棵决策树集成的一种算法，它的基本单元为决策树。下图为随机森林建模的步骤，这里依然沿用男生追女神的例子。

knitr::include_graphics(here::here("Machine_Learning_and_Causal_Inference/fig/rf.png"))

Figure 7.1: 随机森林

首先，从训练样本中重抽样m组样本，每组样本都是一个子训练集；然后，对每个子训练集样本都构造出一棵决策树，每棵树都有一个决策结果。最后，使用投票法决定最终输出结果。N棵树会有N个分类结果，根据“少数服从多数”原则，投票次数最多的类别为最终的输出。

比如现在有3棵决策树：一棵树认为追女神A，两棵树认为不追女神A，那么根据投票法，到底追不追女神A呢？

那么，在caret包中如何实现随机森林呢？只需要在method中设置为”rf”（random forest的缩写），就可以了。

set.seed(1234)
# model_rf <- train(决定 ~ .,
#                      data = data_training,
#                      method = "rf",     # 方法
#                      trControl = fit_control,
#                      metric = "ROC")  # 指标

# 先保存随机森林的结果

# save(model_rf,file = here::here("Machine_Learning_and_Causal_Inference/result/model_training_integration-model_rf.RData"))

load(file = here::here("Machine_Learning_and_Causal_Inference/result/model_training_integration-model_rf.RData"))
model_rf_pre <- predict(model_rf,newdata = data_testing)

confusionMatrix(model_rf_pre,data_testing$决定)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 拒绝 接受
##       拒绝  669  150
##       接受  132  479
##                                           
##                Accuracy : 0.8028          
##                  95% CI : (0.7812, 0.8231)
##     No Information Rate : 0.5601          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5986          
##                                           
##  Mcnemar's Test P-Value : 0.3114          
##                                           
##             Sensitivity : 0.8352          
##             Specificity : 0.7615          
##          Pos Pred Value : 0.8168          
##          Neg Pred Value : 0.7840          
##              Prevalence : 0.5601          
##          Detection Rate : 0.4678          
##    Detection Prevalence : 0.5727          
##       Balanced Accuracy : 0.7984          
##                                           
##        'Positive' Class : 拒绝            
##

8 模型集成

一个分类器学习可能会犯错，但是多个分类器一起学习可能会取长补短，这是模型集成的思想，一句话概括就是“三个臭皮匠顶个诸葛亮”。

用的模型集成方法分为投票法、平均法和堆叠集成。其中投票法适用于分类问题，平均法适用于回归问题。其中，平均法的结果由几个分类器的结果平均而得，可以采用简单平均和加权平均。

8.1 投票法

投票法的思想是“少数服从多数”，“群众的眼光是雪亮的”。和随机森林的思路很像，只是这里的分类器可以是不同的分类器，不仅仅是决策树（见下图）。假设分类器1认为杨幂是女神，分类器2认为杨幂是女神，分类器3认为杨幂不是女神。那么最后这3个分类器经过开会投票表决，决定最终结果为杨幂是女神。这就是投票法的思想。

knitr::include_graphics(here::here("Machine_Learning_and_Causal_Inference/fig/vote.png"))

Figure 8.1: 投票法

results <-  data.frame(model_logis_pre, model_tree_pre, model_rf_pre)
results <-  map(results,as.character)

# major_results <-  apply(results, 1, function(x) {
#   tb = sort(table(x), decreasing = T)
#   if(tb[1] %in% tb[2]) {
#     return(sample(c(names(tb)[1], names(tb)[2]), 1))
#   } else {
#     return(names(tb)[1])
#   }
# })  

map(1:length(results[[1]]),function(i){
  fct_count(c(results[[1]][[i]],results[[2]][[i]],results[[3]][[i]])) %>% 
    arrange(desc(n)) %>% 
    slice(1)
}) %>% bind_rows() %>%
  pull(f)  -> results_vote

results_vote %>% class()

## [1] "factor"

# all.equal(results_vote,major_results)

results_vote %>% levels()

## [1] "接受" "拒绝"

data_testing$决定 %>% levels()

## [1] "拒绝" "接受"

confusionMatrix(results_vote, data_testing$决定)

## Warning in confusionMatrix.default(results_vote, data_testing$决定): Levels are
## not in the same order for reference and data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 拒绝 接受
##       拒绝  674  166
##       接受  127  463
##                                           
##                Accuracy : 0.7951          
##                  95% CI : (0.7732, 0.8158)
##     No Information Rate : 0.5601          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5814          
##                                           
##  Mcnemar's Test P-Value : 0.02642         
##                                           
##             Sensitivity : 0.8414          
##             Specificity : 0.7361          
##          Pos Pred Value : 0.8024          
##          Neg Pred Value : 0.7847          
##              Prevalence : 0.5601          
##          Detection Rate : 0.4713          
##    Detection Prevalence : 0.5874          
##       Balanced Accuracy : 0.7888          
##                                           
##        'Positive' Class : 拒绝            
##

首先将几个分类器得到的结果整合在一个数据框中，然后对每行样本都进行投票表决，得到最终结果。但问题是，投票法得到的预测精度还不如随机森林，为什么呢？

这里预测精度降低的原因很简单，就是有个别分类器在拉后腿。所以需要更有效的方式来进行模型集成，即堆叠集成法。

8.2 堆叠法

堆叠集成思路是，首先利用机器学习的不同模型得到不同预测结果，不同模型得到的预测结果就像组装前的零部件。然后将预测结果作为自变量输入模型进行拟合，也就是将这些零部件组装在一起，而如何组装就取决于不同的模型了（见下图）。

knitr::include_graphics(here::here("Machine_Learning_and_Causal_Inference/fig/pile.png"))

Figure 8.2: 堆叠法

那么在R中如何实现呢？首先将各个模型得到的分类结果及真实的分类组合成一个数据框；然后将各个模型的分类结果作为自变量，真实的分类作为因变量，利用模型进行拟合预测。这里，在组装这个阶段利用随机森林模型。

set.seed(1234)
combPre <- data.frame(model_logis_pre = model_logis_pre, 
                      model_tree_pre = model_tree_pre,
                      model_rf_pre =model_rf_pre, 
                     决定 = data_testing$决定)
combfit <- train(决定~., 
                   method = "rf", 
                   data = combPre,
                   trControl = fit_control,
                   metric = "ROC")

## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

results_pile <-  predict(combfit, newdata = data_testing)
confusionMatrix(results_pile, data_testing$决定)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 拒绝 接受
##       拒绝  669  150
##       接受  132  479
##                                           
##                Accuracy : 0.8028          
##                  95% CI : (0.7812, 0.8231)
##     No Information Rate : 0.5601          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5986          
##                                           
##  Mcnemar's Test P-Value : 0.3114          
##                                           
##             Sensitivity : 0.8352          
##             Specificity : 0.7615          
##          Pos Pred Value : 0.8168          
##          Neg Pred Value : 0.7840          
##              Prevalence : 0.5601          
##          Detection Rate : 0.4678          
##    Detection Prevalence : 0.5727          
##       Balanced Accuracy : 0.7984          
##                                           
##        'Positive' Class : 拒绝            
##

8.3 AdaBoost

AdaBoost算法的核心思想是：区别对待不同训练样本。首先，秉承“人人平等”的原则，对所有训练样本都赋予相等的权重。然后，对每个训练样本都进行训练，得到训练精度。秉承“帮助弱者”的原则，对训练精度低的样本赋予更大的权重，让模型能更注意提高这部分样本的训练精度。最后，将各个样本训练出来的结果进行加权投票或加权平均。

下边代码训练时间也有点长，先使用save函数保存结果

set.seed(1234)
fit4 <-  train(决定 ~., 
               data = data_training, 
               method = "gam")  # 训练模型
result_ada <- predict(fit4, newdata = data_testing)  # 在测试集上预测
confusionMatrix(result_ada, data_testing$决定)  # 利用混淆矩阵评估模型

save(result_ada,here::here("Machine_Learning_and_Causal_Inference/model_training_integration-result_ada"))

load(here::here("Machine_Learning_and_Causal_Inference/model_training_integration-result_ada"))

最后，来总结一下本节用到的模型及模型集成的预测精度。可以看出，堆叠集成法是提高预测精度的利器。

confusionMatrix(model_logis_pre, data_testing$决定)[[3]][[1]]  # 利用混淆矩阵评估模型

## [1] 0.7727273

confusionMatrix(model_rf_pre, data_testing$决定)[[3]][[1]]  # 利用混淆矩阵评估模型

## [1] 0.8027972

confusionMatrix(model_tree_pre, data_testing$决定)[[3]][[1]]  # 利用混淆矩阵评估模型

## [1] 0.765035

confusionMatrix(results_vote, data_testing$决定)[[3]][[1]]  # 利用混淆矩阵评估模型

## Warning in confusionMatrix.default(results_vote, data_testing$决定): Levels are
## not in the same order for reference and data. Refactoring data to match.

## [1] 0.7951049

confusionMatrix(results_pile, data_testing$决定)[[3]][[1]]  # 利用混淆矩阵评估模型

## [1] 0.8027972

result_df_compare <- tibble(
  logis = confusionMatrix(model_logis_pre, data_testing$决定)[[3]][[1]],
  rf = confusionMatrix(model_rf_pre, data_testing$决定)[[3]][[1]],  # 利用混淆矩阵评估模型
  tree = confusionMatrix(model_tree_pre, data_testing$决定)[[3]][[1]],  # 利用混淆矩阵评估模型
  vote = confusionMatrix(results_vote, data_testing$决定)[[3]][[1]] , # 利用混淆矩阵评估模型
  pile = confusionMatrix(results_pile, data_testing$决定)[[3]][[1]],  # 利用混淆矩阵评估模型
)

## Warning in confusionMatrix.default(results_vote, data_testing$决定): Levels are
## not in the same order for reference and data. Refactoring data to match.

result_df_compare %>% t() %>% as.data.frame() %>% rownames_to_column(var = "model") %>% 
  rename(value = V1) %>% 
  arrange(value)

机器学习-模型训练与集成

Studennt. Li Junjie

2020-08-19