经典案例：泰坦尼克号乘客生存预测（决策树）

"""Titanic survival prediction with a decision tree.

Pipeline: read the passenger spreadsheet, keep three features
(``pclass``, ``age``, ``sex``), fill missing ages with the mean age,
one-hot encode the categorical features, train a depth-limited decision
tree, print its test accuracy and export the tree in Graphviz DOT format.
"""
import pandas as pd

# Input spreadsheet, the columns used by the model, and the DOT output file.
DATA_PATH = r'tietan.xls'
FEATURE_COLUMNS = ['pclass', 'age', 'sex']
TARGET_COLUMN = 'survived'
DOT_PATH = 'tree.dot'


def prepare_features(data):
    """Return the model features with missing ages replaced by the mean age.

    Args:
        data: DataFrame containing at least the columns in FEATURE_COLUMNS.

    Returns:
        A new DataFrame holding only FEATURE_COLUMNS; ``data`` itself is
        left unmodified.
    """
    # Work on an explicit copy: filling in place on a slice of ``data``
    # is chained assignment and may warn or silently not take effect.
    feature = data.loc[:, FEATURE_COLUMNS].copy()
    feature['age'] = feature['age'].fillna(feature['age'].mean())
    return feature


def main():
    """Run the full pipeline: load, encode, train, score and export."""
    # Third-party packages are imported here so that importing this module
    # (e.g. to reuse prepare_features) does not require them.
    import graphviz
    from sklearn import tree
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    # 1. Read the data.
    data = pd.read_excel(DATA_PATH)
    print(data)

    # 2. Inspect the selected features before filling, so that the
    #    number of missing ages is visible. info() prints by itself.
    data.loc[:, FEATURE_COLUMNS].info()

    # 3. Fill missing ages with the mean age.
    feature = prepare_features(data)

    # 4. Convert each row to a dict so categorical values can be one-hot
    #    encoded.
    X = feature.to_dict(orient='records')
    print(X)

    # 5. One-hot encode; sparse=False returns a dense array instead of a
    #    sparse matrix.
    dic = DictVectorizer(sparse=False)
    x = dic.fit_transform(X)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old versions that lack get_feature_names_out().
    if hasattr(dic, 'get_feature_names_out'):
        feature_name = list(dic.get_feature_names_out())
    else:
        feature_name = dic.get_feature_names()

    # 6. Split into training and test sets.
    y = data[TARGET_COLUMN]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.8)

    # 7. Train the model and report accuracy on the held-out set.
    dc = DecisionTreeClassifier(criterion='entropy', max_depth=5)
    dc.fit(x_train, y_train)
    print(dc.score(x_test, y_test))

    # 8. Export the fitted tree, labelling nodes with the encoded feature
    #    names.
    tree.export_graphviz(dc, out_file=DOT_PATH, feature_names=feature_name)

    # 9. Load the DOT file. Source('tree.dot') would treat the file name
    #    itself as DOT source text, so use from_file instead.
    a = graphviz.Source.from_file(DOT_PATH)
    print(a.source)


if __name__ == '__main__':
    main()


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部