Series Notes

  1. Decision Tree
  2. Ensemble Learning
  3. GBDT / XGBoost / LightGBM

If I have seen further than others, it is by standing on the shoulders of giants.

GBDT Code Example

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# load data
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# data preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
params = {
    "n_estimators": 500,      # number of boosting iterations
    "max_depth": 4,           # maximum depth of each tree
    "min_samples_split": 5,   # minimum samples required to split an internal node
    "learning_rate": 0.01,    # shrinkage applied to each tree's contribution
    "loss": "squared_error",  # loss function to optimize
}

# fit regression model
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))  # 3009.1324

# plot training deviance
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    # reg.loss_ was removed in recent scikit-learn; compute the test error directly
    test_score[i] = mean_squared_error(y_test, y_pred)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()
Result: the GBDT regression deviance plot (training vs. test deviance over boosting iterations).
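
The script imports permutation_importance but never calls it. As a follow-up, a minimal sketch of how it could be applied, assuming reg, X_test, y_test, and diabetes from the block above are still in scope:

from sklearn.inspection import permutation_importance

# permutation importance of each feature on the held-out test set
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=13
)
for i in result.importances_mean.argsort()[::-1]:
    print("{:<10} {:.4f} +/- {:.4f}".format(
        diabetes.feature_names[i],
        result.importances_mean[i],
        result.importances_std[i],
    ))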

XGBoost Code Example

import xgboost as xgb

# read in data (LibSVM-format files from the xgboost demo directory)
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
# specify parameters via map ('eta' is the learning rate / shrinkage)
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
num_round = 2  # number of boosting rounds
bst = xgb.train(param, dtrain, num_round)
# make prediction (probabilities under binary:logistic)
preds = bst.predict(dtest)
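
xgboost also ships a scikit-learn-style wrapper. A minimal sketch of the same two-round training through that interface, on illustrative random data rather than the agaricus demo files:

import numpy as np
import xgboost as xgb

# illustrative synthetic data: 500 samples, 10 features, binary target
X = np.random.rand(500, 10)
y = np.random.randint(2, size=500)

clf = xgb.XGBClassifier(
    n_estimators=2,      # plays the role of num_round above
    max_depth=2,
    learning_rate=1.0,   # plays the role of eta above
    objective='binary:logistic',
)
clf.fit(X, y)
labels = clf.predict(X[:7])       # hard class labels
proba = clf.predict_proba(X[:7])  # class probabilities, like bst.predict above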

LightGBM Code Example

import lightgbm as lgb
import numpy as np
import scipy
import h5py

# Data Interface
# train_data = lgb.Dataset('train.svm.bin')   # from a LibSVM/binary file
data = np.random.rand(500, 10)                # 500 entities, each contains 10 features
label = np.random.randint(2, size=500)        # binary target
train_data = lgb.Dataset(data, label=label)   # from numpy arrays

'''
# Other ways to create a Dataset
csr = scipy.sparse.csr_matrix((dat, (row, col)))  # from a SciPy sparse matrix
train_data = lgb.Dataset(csr)

# from a custom Sequence that reads batches out of an HDF5 file
class HDFSequence(lgb.Sequence):
    def __init__(self, hdf_dataset, batch_size):
        self.data = hdf_dataset
        self.batch_size = batch_size

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

f = h5py.File('train.hdf5', 'r')
train_data = lgb.Dataset(HDFSequence(f['X'], 8192), label=f['Y'][:])

# load a LibSVM text file, then save it in the faster binary format
train_data = lgb.Dataset('train.svm.txt')
train_data.save_binary('train.bin')

# a validation Dataset should reference the training Dataset
validation_data = train_data.create_valid('validation.svm')
# validation_data = lgb.Dataset('validation.svm', reference=train_data)

# named features and categorical columns
train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])

# per-sample weights
w = np.random.rand(500, )
train_data = lgb.Dataset(data, label=label, weight=w)
'''

# validation set (random here, so the example runs end to end)
valid_x = np.random.rand(100, 10)
valid_y = np.random.randint(2, size=100)
validation_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)

# Setting Parameters
param = {'num_leaves': 31, 'objective': 'binary'}
param['metric'] = 'auc'

# Training
num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])
bst.save_model('model.txt')                # save to a text file
json_model = bst.dump_model()              # dump to a JSON-serializable dict
bst = lgb.Booster(model_file='model.txt')  # init model from file

# CV
lgb.cv(param, train_data, num_round, nfold=5)

# Early Stopping
bst = lgb.train(
    param, train_data, num_round,
    valid_sets=[validation_data],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
bst.save_model('model.txt', num_iteration=bst.best_iteration)

# Prediction
data = np.random.rand(7, 10)  # 7 entities, each contains 10 features
ypred = bst.predict(data)
# ypred = bst.predict(data, num_iteration=bst.best_iteration)  # if early stopping
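
For completeness, a minimal sketch of the same training-with-early-stopping flow through LightGBM's scikit-learn-style wrapper (hyperparameters mirror the param dict above; the random arrays are illustrative):

import numpy as np
import lightgbm as lgb

X_train = np.random.rand(500, 10)
y_train = np.random.randint(2, size=500)
X_val = np.random.rand(100, 10)
y_val = np.random.randint(2, size=100)

clf = lgb.LGBMClassifier(num_leaves=31, n_estimators=100, objective='binary')
clf.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
print(clf.best_iteration_)  # best round found by early stopping
ypred = clf.predict(np.random.rand(7, 10))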