管道模型
上一节
下一节
管道模型(Pipeline)的基本功能是把一系列算法打包在一起,让它们各司其职,形成一条流水线,就像汽车工厂的组装流水线一样。
未使用管道模型进行训练和评估
- 数据载入
# Synthetic clustered-data generator
from sklearn.datasets import make_blobs
# Train/test splitting utility
from sklearn.model_selection import train_test_split
# Feature standardization
from sklearn.preprocessing import StandardScaler
# Multi-layer perceptron classifier
from sklearn.neural_network import MLPClassifier
# Plotting
import matplotlib.pyplot as plt

# TODO: 1. Generate the dataset and split it into training and test sets
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5, random_state=16)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

# TODO: 2. Preprocess the data
# The scaler is fitted on the training set only and then applied to both
# splits, so the test set does not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
- 数据的可视化
# Select Microsoft YaHei for sans-serif text; the labels below are Chinese.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Overlay the raw and the standardized training points on the same axes.
raw_x, raw_y = X_train[:, 0], X_train[:, 1]
scaled_x, scaled_y = X_train_scaled[:, 0], X_train_scaled[:, 1]
plt.scatter(raw_x, raw_y, label='原始数据')
plt.scatter(scaled_x, scaled_y, marker='^', edgecolor='k', label='预处理后数据')

plt.title('预处理前后训练集的分布情况图')
plt.legend(loc='best')
plt.show()
- 使用网格搜索输出评分
# Grid-search utility
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter of the MLP
params = {
    'hidden_layer_sizes': [[50], [100], [100, 100]],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
}

# Build the grid-search model.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# From 0.22 onwards the default behaviour is the same as the former iid=False.
mlp = MLPClassifier(max_iter=1600, random_state=16)
grid = GridSearchCV(mlp, param_grid=params, cv=3, verbose=1, n_jobs=8)

# Fit on the pre-scaled training data.
# NOTE: X_train_scaled was standardized with statistics from the whole
# training set, so each validation fold has already influenced the scaling
# applied to its training folds; the cross-validation score is therefore
# slightly optimistic.
grid.fit(X_train_scaled, y_train)

# Report the best cross-validation score and the parameters that achieved it
print('模型最佳得分:{:.2f}'.format(grid.best_score_))
print('模型最佳参数:{}'.format(grid.best_params_))
使用管道模型实现训练和评分
# Pipeline utility
from sklearn.pipeline import Pipeline

# Chain standardization and the neural network into a single estimator;
# each step is a (name, estimator) pair.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=1600, random_state=38)),
]
pipeline = Pipeline(pipeline_steps)

# Fit the whole pipeline on the unscaled training set
pipeline.fit(X_train, y_train)
使用管道模型进行网格搜索
# Synthetic clustered-data generator
from sklearn.datasets import make_blobs
# Train/test splitting utility
from sklearn.model_selection import train_test_split
# Feature standardization
from sklearn.preprocessing import StandardScaler
# Multi-layer perceptron classifier
from sklearn.neural_network import MLPClassifier
# Grid-search utility
from sklearn.model_selection import GridSearchCV
# Pipeline utility
from sklearn.pipeline import Pipeline

# TODO: 1. Generate the dataset and split it into training and test sets
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5, random_state=16)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

# TODO: 2. Create the pipeline with a preprocessing step `scaler` and a
# neural-network step `mlp`.
# Grid keys use the `<step name>__<parameter>` form so that each value is
# routed to the `mlp` step of the pipeline.
params = {
    'mlp__hidden_layer_sizes': [[50], [100], [100, 100]],
    'mlp__alpha': [0.0001, 0.001, 0.01, 0.1],
}
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(max_iter=1600, random_state=16)),
    ],
    verbose=1,
)

# TODO: 3. Create the grid-search model and print the results.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# From 0.22 onwards the default behaviour is the same as the former iid=False.
grid = GridSearchCV(pipeline, param_grid=params, cv=5, n_jobs=8, verbose=1)

# The raw training data is passed in; the scaler is a pipeline step, so it is
# fitted as part of each pipeline fit performed by the search.
grid.fit(X_train, y_train)

print('交叉验证评分:{:.2f}'.format(grid.best_score_))
print('模型最优参数:{}'.format(grid.best_params_))
print('测试集得分:{}'.format(grid.score(X_test, y_test)))

