1. Constructing a decision tree on the iris dataset
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
iris = load_iris()
print("feature names", iris.feature_names)
print("target names", iris.target_names)
X = iris.data[:, 2:]
y = iris.target
print("data shape", iris.data.shape)
print("X shape", X.shape)
In this example only the last two features of each sample, "petal length" and "petal width", are used (selected with the slice iris.data[:, 2:]).
print("data shape", iris.data.shape)
print("X shape", X.shape)
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)
Create the decision tree and fit it. Setting max_depth=2 caps the tree at a depth of 2.
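As a quick check that the constraint took effect, the fitted estimator can be asked for its actual depth and leaf count:
print("tree depth:", tree_clf.get_depth())        # at most 2 because of max_depth=2
print("number of leaves:", tree_clf.get_n_leaves())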
Draw the decision tree. sklearn provides the plot_tree interface for this.
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(
    tree_clf,
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    filled=True
)
# Save the figure
fig.savefig("decision_tree.png")
How feature importance is implemented in the source
In sklearn, the core routine that computes feature importances is compute_feature_importances in the Cython file _tree.pyx.
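The idea is that every split node contributes its weighted impurity decrease to the feature it splits on, and the per-feature totals are normalized to sum to 1. Below is a minimal, hypothetical re-implementation for illustration only (it reads the fitted tree_ arrays; the real routine is the Cython code in _tree.pyx). Its output should match feature_importances_ up to floating-point rounding.
import numpy as np

def sketch_feature_importances(fitted_clf):
    # Illustrative sketch, not the sklearn source: accumulate each split's
    # weighted impurity decrease on the feature used at that split.
    t = fitted_clf.tree_
    importances = np.zeros(fitted_clf.n_features_in_)
    for node in range(t.node_count):
        left, right = t.children_left[node], t.children_right[node]
        if left == -1:  # leaf: no split, no contribution
            continue
        importances[t.feature[node]] += (
            t.weighted_n_node_samples[node] * t.impurity[node]
            - t.weighted_n_node_samples[left] * t.impurity[left]
            - t.weighted_n_node_samples[right] * t.impurity[right]
        )
    importances /= t.weighted_n_node_samples[0]  # average per training sample
    if importances.sum() > 0:
        importances /= importances.sum()  # normalize so the values sum to 1
    return importances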
import pandas as pd
dt = DecisionTreeClassifier()
dt.fit(X, y)
importance = dt.feature_importances_
feature_names = iris.feature_names[2:]  # X only holds the last two features
df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
df = df.sort_values('Importance', ascending=False)
print(df)
2. Constructing a decision tree on the wine dataset
from sklearn import tree
import pandas as pd
from sklearn.datasets import load_wine  # wine dataset
from sklearn.model_selection import train_test_split
wine = load_wine()
print(wine.data)
print(wine.data.shape)
print(wine.target)
print(wine.feature_names)
print(wine.target_names)
df = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)  # join features and labels column-wise (axis=1 concatenates horizontally)
print(df)
# split into training and test sets
xtrain,xtest,ytrain,ytest = train_test_split(wine.data,wine.target,test_size=0.3)
# create the decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=42, splitter='random', max_depth=3, min_samples_leaf=10, min_samples_split=10)  # instantiate
# train
clf = clf.fit(xtrain,ytrain)
# score on the test set; the score can vary between runs because the train/test split above is not seeded (and the classifier's random splits would also vary if its random_state were not fixed)
result = clf.score(xtest,ytest)
print('test set score', result)
print('training set score', clf.score(xtrain, ytrain))
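Because the scores above shift from run to run, here is a minimal sketch of pinning both sources of randomness; the seed value 0 and the names xtr/xte/clf_fixed are arbitrary choices for illustration.
xtr, xte, ytr, yte = train_test_split(wine.data, wine.target, test_size=0.3, random_state=0)  # seeded split
clf_fixed = tree.DecisionTreeClassifier(criterion='entropy', random_state=42, splitter='random',
                                        max_depth=3, min_samples_leaf=10, min_samples_split=10)
clf_fixed.fit(xtr, ytr)
print('reproducible test score', clf_fixed.score(xte, yte))  # identical value on every run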
# visualization
from sklearn.tree import export_graphviz
import graphviz  # install separately: pip install graphviz
feature_name = ['alcohol', 'malic acid', 'ash', 'alkalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline']
dot = tree.export_graphviz(clf
                           , feature_names=feature_name
                           , class_names=['Gin', 'Sherry', 'Vermouth']
                           , filled=True  # color nodes by class
                           , rounded=True)  # rounded node corners
graph = graphviz.Source(dot)
graph.render('wine_tree', format='png', cleanup=True)  # writes wine_tree.png; in a notebook, displaying graph renders the tree inline
# feature importances
importance = clf.feature_importances_
feature_importance = [(name, round(imp, 3)) for name, imp in zip(feature_name, importance)]
# sort by importance, descending
feature_importance = sorted(feature_importance,key=lambda x :x[1],reverse=True)
# print each feature with its importance
for pair in feature_importance:
    print('variable: {:20} importance: {}'.format(*pair))
# plot a hyperparameter curve: test score as a function of the random_state seed
import matplotlib.pyplot as plt
test = []
for i in range(100):
    clf = tree.DecisionTreeClassifier(max_depth=2
                                      # ,criterion='gini'
                                      ,criterion='entropy'
                                      ,random_state=i
                                      ,splitter='random')
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    test.append(score)
plt.plot(range(100),test)
plt.xlabel('random_state')  # the loop varies the seed, not max_depth
plt.ylabel('score')
plt.show()
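The same loop pattern can sweep a structural hyperparameter instead of the seed. A sketch that varies max_depth from 1 to 10 on the same split (random_state=30 is an arbitrary fixed seed so that only the depth changes between iterations):
depth_scores = []
for d in range(1, 11):
    clf = tree.DecisionTreeClassifier(max_depth=d, criterion='entropy', random_state=30, splitter='random')
    clf.fit(xtrain, ytrain)
    depth_scores.append(clf.score(xtest, ytest))
plt.plot(range(1, 11), depth_scores, marker='o')
plt.xlabel('max_depth')
plt.ylabel('score')
plt.show()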