Introduction to Ensemble Learning

Ensemble learning accomplishes a learning task by building and combining multiple learners; it is also known as a multi-classifier system or committee-based learning.

The idea of ensemble learning can be summarized succinctly: combine the strengths of classifiers that discriminate the samples differently so as to minimize the overall error rate. A set of individual learners is generated first and then combined with some strategy. Each individual learner is produced from the training data by an existing learning algorithm, such as a decision tree or a BP neural network. If an ensemble contains only individual learners of the same type, it is homogeneous; its individual learners are called base learners, the corresponding algorithm is the base learning algorithm, and base learners are also referred to as weak learners. If an ensemble contains individual learners of different types, it is heterogeneous, and its individual learners are called component learners.

The base learners are usually required to be mutually independent and to have accuracy above 50%; combining base classifiers that are both accurate and diverse tends to work best (a quantitative justification is sketched after the list below). Depending on how the individual learners are generated, ensemble methods fall roughly into two categories:

  • Sequential methods, in which the individual learners have strong dependencies and must be generated one after another, e.g. Boosting
  • Parallel methods, in which the individual learners have no strong dependencies and can be generated simultaneously, e.g. Bagging and Random Forest
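
To see why independence and better-than-chance accuracy matter, consider $T$ binary classifiers that each err with probability $\epsilon<0.5$ and are combined by majority voting. Under the (idealized) assumption that their errors are independent, Hoeffding's inequality gives a bound that appears in standard treatments of ensemble learning; this is only a sketch of the argument, since real base learners are never truly independent:

$$P\big(H(x)\neq f(x)\big)=\sum_{k=0}^{\lfloor T/2\rfloor}\binom{T}{k}(1-\epsilon)^k\epsilon^{T-k}\leq \exp\!\left(-\frac{1}{2}T(1-2\epsilon)^2\right)$$

so the ensemble error decreases exponentially as the number of base learners $T$ grows.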

Ensemble Learning Methods

Boosting

Boosting is a family of algorithms that can promote weak learners to strong learners; the best-known representative is AdaBoost. The general workflow is as follows:

  1. Train a base learner from the initial training set
  2. Adjust the training sample distribution according to the base learner's performance, paying more attention to the samples it got wrong, and train the next base learner on the adjusted distribution
  3. Repeat step 2 until the number of base learners reaches the specified value $T$, then combine the $T$ base learners by weighted voting
Figure: the AdaBoost algorithm
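
Concretely, in round $t$ AdaBoost computes the weighted error of the current weak learner, turns it into the classifier weight $\alpha_t$, and reweights the samples; these are exactly the quantities (error_rate, alpha, sample_weight) computed in the code below:

$$\epsilon_t=\Pr_{i\sim D_t}\big[h_t(x_i)\neq y_i\big],\qquad \alpha_t=\frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t}$$

$$D_{t+1}(i)=\frac{D_t(i)\exp\big(-\alpha_t\,y_i\,h_t(x_i)\big)}{Z_t},\qquad H(x)=\mathrm{sign}\!\left(\sum_{t=1}^{T}\alpha_t h_t(x)\right)$$

where $Z_t$ is a normalization factor that makes $D_{t+1}$ a valid distribution.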

Code Implementation

import imageio
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


class Adaboost_Demonstration:
    def __init__(self, X, y, learning_rate=1.):
        """
        X is an N*2 matrix, y is a 1-D vector whose values are 1 or -1
        :param X: data points
        :param y: data point labels
        """
        self.X = X
        self.y = y
        # shrinkage applied to each weak classifier to reduce overfitting
        self.learning_rate = learning_rate
        # number of samples
        self.num_samples = len(self.X)
        # initialize the sample weights (uniform)
        self.sample_weight = np.full(self.num_samples, 1 / self.num_samples)
        # store all weak classifier objects
        self.classifiers = []
        # store the error rate of each step
        self.errors_list = []
        # store the weight (alpha) of each weak classifier
        self.alphas = []

    def predict(self, data=None, labels=None, reduction="sign"):
        """
        Predict the class of the data points
        :param reduction: "sign" takes the sign of the weighted sum of the weak classifiers, "mean" takes the average
        """
        if data is None:
            data = self.X
            labels = self.y
        # compute the weighted linear combination of the weak classifiers
        predictions = np.zeros([len(data)]).astype("float")
        for classifier, alpha in zip(self.classifiers, self.alphas):
            predictions += alpha * classifier.predict(data)
        # take the sign of the result
        if reduction == "sign":
            predictions = np.sign(predictions)
        # average the result
        elif reduction == "mean":
            predictions /= len(self.classifiers)
        # compute the f1 score when labels are available
        if labels is not None and reduction == "sign":
            f1 = f1_score(labels, predictions)
            return predictions, f1
        else:
            return predictions

    def contour_plot(self, data=None, labels=None, interval=0.2, title="adaboost", mode="3d", epoch=None):
        """
        Contour-plot visualization
        :param interval: grid spacing of the contour plot
        :param title: title of the plot
        :param mode: "3d" or "2d" visualization
        :param epoch: current iteration number
        """
        if data is None:
            data = self.X
            labels = self.y
        if labels is None:
            labels = np.ones([len(data)])
        # build the grid
        x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
        y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, interval), np.arange(y_min, y_max, interval))
        # stack the flattened X and Y coordinates of the grid for the contour computation
        X_grid = np.concatenate([np.expand_dims(np.ravel(xx), axis=-1),
                                 np.expand_dims(np.ravel(yy), axis=-1)], axis=-1)
        # X_grid has shape [batch (number of grid points), 2]
        # compute the decision boundary (contour values)
        Z_grid = self.predict(data=X_grid, reduction="mean")
        Z_grid = Z_grid.reshape(xx.shape)
        # visualization
        if mode == "3d":
            # scatter plot of the data points
            scatter = go.Scatter3d(x=data[:, 0], y=data[:, 1], z=self.predict(data=data, reduction="mean"),
                                   mode='markers',
                                   marker=dict(color=labels, size=5, symbol='circle',
                                               line=dict(color='rgb(204, 204, 204)', width=1),
                                               opacity=0.9))
            # 3D surface of the contour values
            surface = go.Surface(x=xx, y=yy, z=Z_grid, opacity=0.9)
            plot_data = [scatter, surface]
            layout = go.Layout(title=title)
            # set the camera angle
            camera = dict(up=dict(x=0, y=0, z=1),
                          center=dict(x=0, y=0, z=0),
                          eye=dict(x=1, y=1, z=0.8))
            fig = go.Figure(data=plot_data, layout=layout)
            fig['layout'].update(scene=dict(camera=camera))
            iplot(fig, image="png", filename=title)
        if mode == "2d":
            # contour plot
            plt.contourf(xx, yy, Z_grid, cmap=plt.cm.RdBu, alpha=.8)
            # scatter plot
            plt.scatter(data[:, 0], data[:, 1], c=labels,
                        cmap=ListedColormap(['#FF0000', '#0000FF']), edgecolors='k')
            plt.title(title)
            # plt.show()
            plt.savefig('imgs/' + 'res' + str(epoch))

    def __next__(self, reduction="mean", plot=True, plot_mode="2d", epoch=None):
        # define the weak classifier (a decision stump, i.e. a one-level decision tree)
        # classifier = DecisionTreeClassifier(
        #     max_depth=2, min_samples_split=20,
        #     min_samples_leaf=5)
        classifier = DecisionTreeClassifier(max_depth=1)
        # fit the weak classifier on the weighted samples
        classifier.fit(self.X, self.y, sample_weight=self.sample_weight)
        # get the weak classifier's predictions, i.e. h(x)
        predictions = classifier.predict(self.X)
        # compute the weighted error rate
        error_rate = np.average(predictions != self.y, weights=self.sample_weight)
        self.errors_list.append(error_rate)
        # compute alpha, the weight of this weak classifier
        alpha = self.learning_rate * (np.log((1 - error_rate) / error_rate)) / 2
        # compute the sample weights for step t+1
        self.sample_weight *= np.exp(-alpha * self.y * predictions)
        # normalize; the normalization factor is Z = sum(self.sample_weight)
        self.sample_weight /= np.sum(self.sample_weight)
        # record the current weak classifier
        self.classifiers.append(classifier)
        # record the current weak classifier's weight
        self.alphas.append(alpha)
        # compute the f1 score of the current ensemble
        _, f1 = self.predict()
        # plot
        if plot:
            return self.contour_plot(
                title="adaboost step " + str(len(self.classifiers)) + " f1 score: {:.2f}".format(f1),
                mode=plot_mode, epoch=epoch)
        else:
            return f1


def convert_gif(epochs):
    outfile = 'gifs/adaboost.gif'
    filenames = []
    for i in range(epochs):
        filename = 'imgs/res' + str(i) + '.png'
        filenames.append(filename)
    frames = []
    for image_name in filenames:
        img = imageio.imread(image_name)
        frames.append(img)
    imageio.mimsave(outfile, frames, 'GIF', duration=0.1)


if __name__ == '__main__':
    # demo: the 'imgs/' directory must exist before running
    X, y = make_moons(n_samples=300, noise=0.2, random_state=3)
    y[np.where(y == 0)] = -1
    model = Adaboost_Demonstration(X, y)
    epochs = 100
    for i in range(epochs):
        model.__next__(plot=False, epoch=i)
    model.contour_plot(mode="2d")
    # convert_gif(epochs)

Results

Figure: results of running the AdaBoost implementation

Bagging

Bagging is the best-known representative of parallel ensemble methods. Its workflow is as follows:

  1. Perform bootstrap sampling: draw, with replacement, $T$ sampled sets of $m$ examples each from the given training set of $m$ examples
  2. Train a base learner on each sampled set
  3. Combine the base learners with some strategy, typically voting
Figure: the Bagging algorithm

Algorithm complexity: the complexity of training the base classifiers plus the complexity of voting.

Because each base learner uses only part of the training samples, the remaining (out-of-bag) samples can serve as a validation set for an out-of-bag estimate of generalization performance.
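
As a minimal illustration (an addition to the original post, with arbitrary parameter values), scikit-learn's BaggingClassifier wraps this whole procedure, and setting oob_score=True produces the out-of-bag estimate described above; the default base learner is a decision tree:

from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier

X, y = make_moons(n_samples=300, noise=0.2, random_state=3)

# T = 50 bootstrap samples, each the same size as the training set
bagging = BaggingClassifier(n_estimators=50, oob_score=True, random_state=0)
bagging.fit(X, y)

print("out-of-bag accuracy estimate:", bagging.oob_score_)
print("training accuracy:", bagging.score(X, y))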

Random Forest

Random Forest is Bagging applied to many decision trees with extra randomness in the splits. It usually converges to a lower generalization error than plain Bagging and trains faster. Its workflow is as follows:

  1. Draw $T$ bootstrap samples of $m$ examples each
  2. On each sample, build an unpruned decision tree in which each split considers a random subset of $k$ attributes; together the trees form the random forest. The recommended value is $k=\log_2 d$, where $d$ is the total number of attributes
  3. To classify a new sample, run it through the random forest and take the class with the most votes
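
As a quick sketch (again an illustrative addition with arbitrary parameters), scikit-learn's RandomForestClassifier exposes the attribute-subset size through max_features; max_features="log2" corresponds to the $k=\log_2 d$ recommendation above:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

iris = load_iris()

# 100 unpruned trees; each split considers log2(d) randomly chosen features
forest = RandomForestClassifier(n_estimators=100, max_features="log2", random_state=0)

scores = cross_val_score(forest, iris.data, iris.target, cv=5)
print("5-fold CV accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))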

Combination Strategies

Averaging

  1. Simple averaging: $$H(x)=\frac{1}{T}\sum^T_{i=1}h_i(x)$$
  2. Weighted averaging: $$H(x)=\sum^T_{i=1}w_ih_i(x)$$
    where $w_i$ is the weight of $h_i$, usually with $w_i\geq0,\sum^T_{i=1}w_i=1$
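
A tiny NumPy sketch of both rules (an illustrative addition; the per-learner outputs below are made up):

import numpy as np

# outputs h_i(x) of T = 3 learners on 4 samples (made-up numbers)
H = np.array([[0.2, 0.9, 0.4, 0.7],
              [0.3, 0.8, 0.5, 0.6],
              [0.1, 1.0, 0.3, 0.9]])

simple_avg = H.mean(axis=0)                      # H(x) = (1/T) * sum_i h_i(x)

w = np.array([0.5, 0.3, 0.2])                    # weights, w_i >= 0, summing to 1
weighted_avg = np.average(H, axis=0, weights=w)  # H(x) = sum_i w_i * h_i(x)

print(simple_avg, weighted_avg)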

Voting

  1. Majority voting:

    $$H(x)=\begin{cases}c_j, & \text{if }\ \sum^T_{i=1} h^j_i(x)>0.5\sum^N_{k=1}\sum^T_{i=1}h_i^k(x)\\ \text{reject}, & \text{otherwise}\end{cases}$$

  2. Plurality voting: $$H(x)=c_{\underset{j}{\arg\max}}\sum^T_{i=1}h^j_i(x)$$

  3. Weighted voting: $$H(x)=c_{\underset{j}{\arg\max}}\sum^T_{i=1}w_ih^j_i(x)$$
    where $h_i^j(x)$ is the output of learner $h_i$ on class label $c_j$
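
A corresponding NumPy sketch of plurality and weighted voting over one-hot class votes (again an illustrative addition with made-up votes):

import numpy as np

# h_i^j(x): T = 3 classifiers, N = 3 classes, one-hot votes for a single sample x
votes = np.array([[1, 0, 0],   # classifier 1 votes for class 0
                  [0, 1, 0],   # classifier 2 votes for class 1
                  [0, 1, 0]])  # classifier 3 votes for class 1

plurality = np.argmax(votes.sum(axis=0))  # class with the most votes

w = np.array([0.6, 0.25, 0.15])           # classifier weights
weighted = np.argmax(w @ votes)           # argmax_j of sum_i w_i * h_i^j(x)

print(plurality, weighted)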

Learning-based Combination

When plenty of training data is available, the combination itself can be learned by another learner. The individual learners are called first-level (primary) learners, and the learner used to combine them is called the second-level learner or meta-learner. The representative algorithm is Stacking, which is an ensemble method in its own right. Its workflow is as follows:

  1. Train the first-level learners on the initial training set and use them to generate a new dataset
  2. In the new dataset, the outputs of the first-level learners serve as input features (the original labels are kept as labels) for training the second-level learner
Figure: the Stacking algorithm
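
A minimal sketch using scikit-learn's StackingClassifier (an illustrative addition; the choice of first-level learners and of logistic regression as the meta-learner is arbitrary):

from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

iris = load_iris()

# first-level (primary) learners
estimators = [
    ("rf", RandomForestClassifier(n_estimators=50, random_state=0)),
    ("svc", SVC(probability=True, random_state=0)),
]

# second-level learner (meta-learner) trained on the primary learners' outputs
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

print(cross_val_score(stack, iris.data, iris.target, cv=5).mean())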

Comparison

| | Bagging | Boosting |
| --- | --- | --- |
| Dependence | No strong dependence between base learners; they can be generated in parallel | Strong dependence between base learners; they must be generated sequentially |
| Training sets | Sampling with replacement; the training sets are independent of each other | The training set is unchanged in every round; only the sample weights change |
| Sample weights | Uniform sampling; all samples have equal weight | Adjusted according to the error rate; misclassified samples get larger weights |
| Prediction functions | All prediction functions have equal weight | Each weak classifier has its own weight; classifiers with smaller error get larger weights |
| Main effect | Improves accuracy mainly by reducing variance | Improves accuracy mainly by reducing bias (and variance) |

Pros and cons of AdaBoost:

  • Low generalization error and relatively resistant to overfitting
  • Easy to implement
  • Sensitive to outliers, which affects the weak classifiers produced in later rounds

Pros and cons of Random Forest:

  • Training can be parallelized, which is a speed advantage on large-scale datasets
  • Randomly choosing the candidate split features keeps training efficient even when the sample dimensionality is high
  • Thanks to the random sampling, the trained model has low variance and generalizes well
  • Simple to implement and insensitive to partially missing features
  • Prone to overfitting on particularly noisy features
  • Features with many distinct values have a larger influence on the forest's decisions, which may hurt the model

Using sklearn to compare decision trees, extremely randomized trees (ExtraTrees), random forests, and AdaBoost on the Iris dataset

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.datasets import load_iris
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from sklearn.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
n_estimators = 30  # number of trees in the forests / maximum number of boosting rounds
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()

plot_idx = 1

models = [
    DecisionTreeClassifier(max_depth=None),
    RandomForestClassifier(n_estimators=n_estimators),
    ExtraTreesClassifier(n_estimators=n_estimators),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators),
]

for pair in ([0, 1], [0, 2], [2, 3]):  # classify on different feature pairs
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std

        # Train
        model.fit(X, y)

        scores = model.score(X, y)
        # Create a title for each column and the console by using str() and
        # slicing away useless parts of the string
        model_title = str(type(model)).split(".")[-1][:-2][: -len("Classifier")]

        model_details = model_title
        if hasattr(model, "estimators_"):
            model_details += " with {} estimators".format(len(model.estimators_))
        print(model_details + " with features", pair, "has a score of", scores)

        plt.subplot(3, 4, plot_idx)
        if plot_idx <= len(models):
            # Add a title at the top of each column
            plt.title(model_title, fontsize=9)

        # Now plot the decision boundary using a fine mesh as input to a
        # filled contour plot
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(
            np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)
        )

        # Plot either a single DecisionTreeClassifier or alpha blend the
        # decision surfaces of the ensemble of classifiers
        if isinstance(model, DecisionTreeClassifier):
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            cs = plt.contourf(xx, yy, Z, cmap=cmap)
        else:
            # Choose the alpha blend level with respect to the number of
            # estimators that are in use (noting that AdaBoost can use fewer
            # estimators than its maximum if it achieves a good enough fit
            # early on)
            estimator_alpha = 1.0 / len(model.estimators_)
            for tree in model.estimators_:
                Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
                Z = Z.reshape(xx.shape)
                cs = plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)

        # Build a coarser grid to plot a set of ensemble classifications
        # to show how these are different to what we see in the decision
        # surfaces. These points are regularly spaced and do not have a
        # black outline
        xx_coarser, yy_coarser = np.meshgrid(
            np.arange(x_min, x_max, plot_step_coarser),
            np.arange(y_min, y_max, plot_step_coarser),
        )
        Z_points_coarser = model.predict(
            np.c_[xx_coarser.ravel(), yy_coarser.ravel()]
        ).reshape(xx_coarser.shape)
        cs_points = plt.scatter(
            xx_coarser,
            yy_coarser,
            s=15,
            c=Z_points_coarser,
            cmap=cmap,
            edgecolors="none",
        )

        # Plot the training points, these are clustered together and have a
        # black outline
        plt.scatter(
            X[:, 0],
            X[:, 1],
            c=y,
            cmap=ListedColormap(["r", "y", "b"]),
            edgecolor="k",
            s=20,
        )
        plot_idx += 1  # move on to the next plot in sequence

plt.suptitle("Classifiers on feature subsets of the Iris dataset", fontsize=12)
plt.axis("tight")
plt.tight_layout(h_pad=0.2, w_pad=0.2, pad=2.5)
plt.show()

Results

DecisionTree with features [0, 1] has a score of 0.9266666666666666
RandomForest with 30 estimators with features [0, 1] has a score of 0.9266666666666666
ExtraTrees with 30 estimators with features [0, 1] has a score of 0.9266666666666666
AdaBoost with 30 estimators with features [0, 1] has a score of 0.8533333333333334
DecisionTree with features [0, 2] has a score of 0.9933333333333333
RandomForest with 30 estimators with features [0, 2] has a score of 0.9933333333333333
ExtraTrees with 30 estimators with features [0, 2] has a score of 0.9933333333333333
AdaBoost with 30 estimators with features [0, 2] has a score of 0.9933333333333333
DecisionTree with features [2, 3] has a score of 0.9933333333333333
RandomForest with 30 estimators with features [2, 3] has a score of 0.9933333333333333
ExtraTrees with 30 estimators with features [2, 3] has a score of 0.9933333333333333
AdaBoost with 30 estimators with features [2, 3] has a score of 0.9933333333333333
Figure: comparison of the four classifiers

Further Reading

  1. TreeBoost
  2. XGBoost
  3. GBDT
