 
import os

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load the Boston house-price dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this line needs scikit-learn < 1.2 -- confirm the pinned version or
# switch to another data source.
bhPrice = datasets.load_boston()

# 2. Split the data into training and test sets (10% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    bhPrice.data, bhPrice.target, random_state=42, test_size=0.1
)

# 3. Standardize the features. The scaler is fitted on the training data only
#    and then re-used, unchanged, to transform the test data.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# 4. Hyper-parameters for GradientBoostingRegressor.
#    The loss is deliberately left at its default (least squares): the
#    explicit 'ls' spelling was deprecated in scikit-learn 1.0 and removed in
#    1.2, whereas the default is least squares in every release.
gbr_params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'min_samples_split': 5,
    'learning_rate': 0.01,
}

# 5. Build the GradientBoostingRegressor and fit it on the training data.
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train_std, y_train)

# 6. Print the coefficient of determination (R^2) on the test set.
print("模型准确度: %.3f" % gbr.score(X_test_std, y_test))

# 7. Print the mean squared error on the test set.
mse = mean_squared_error(y_test, gbr.predict(X_test_std))
print("基于测试数据的均方误差(MSE): {:.4f}".format(mse))

# 8. Read the impurity-based importances from feature_importances_ and sort
#    them ascending, so the most important feature ends up as the top bar.
feature_importance = gbr.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

# 9. Plot the feature importances.
fig = plt.figure(figsize=(16, 8))
fig.canvas.manager.set_window_title("GradientBoostingRegressor集成模型")

# Use the SimHei font file when it is present so the Chinese labels render.
# Pointing FontProperties at a missing file fails when the figure is drawn,
# so fall back to looking the font up by family name instead.
font_path = "C:\\Windows\\Fonts\\SimHei.ttf"
if os.path.isfile(font_path):
    font = FontProperties(fname=font_path)
else:
    font = FontProperties(family="SimHei")

plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(bhPrice.feature_names)[sorted_idx])
plt.title('特征重要性(MDI)', fontproperties=font)

# 10. Test-set loss after each boosting stage. The model is trained with the
#     least-squares loss, so the per-stage loss is the mean squared error;
#     mean_squared_error replaces the estimator's loss_ attribute, which was
#     deprecated in scikit-learn 1.1 and removed in 1.3.
test_score = np.array(
    [mean_squared_error(y_test, y_pred)
     for y_pred in gbr.staged_predict(X_test_std)],
    dtype=np.float64,
)
# 1-based stage numbers for the x axis, sized from the stages actually scored.
stages = np.arange(1, test_score.shape[0] + 1)

# 11. Plot the training and test deviance curves.
plt.subplot(1, 2, 2)
plt.title('偏差(Deviance)', fontproperties=font)
plt.plot(stages, gbr.train_score_, 'b-', label='训练偏差')
plt.plot(stages, test_score, 'r-', label='测试偏差')
plt.legend(loc='upper right', prop=font)
plt.xlabel('迭代次数', fontproperties=font)
plt.ylabel('偏差', fontproperties=font)

fig.tight_layout()
plt.show()
 