泰坦尼克训练数据集
链接:https://pan.baidu.com/s/175zFJE1NS2QoSP8P1omo_g
提取码:t1k8
代码如下(运行前请把代码中 train.csv 和 test.csv 的文件路径改成你本地的实际路径):
# 1. Inspect the training set
import pandas as pd

# NOTE: adjust both CSV paths below to wherever the files live on your machine.
data_train = pd.read_csv("E:\\Desktop\\train.csv")
print(data_train.shape)
data_train.info()

# 2. Inspect the test set
data_test = pd.read_csv("E:\\Desktop\\test.csv")
print(data_test.shape)
data_test.info()

# 3. Missing-value handling (fill using random-forest predictions)
from sklearn.ensemble import RandomForestRegressor

# Take the feature to impute (Age, handled first) together with the other
# features, which are assumed to contain no missing values.
age_df = data_train[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

# Split passengers into known-age rows (regressor training set) and
# unknown-age rows (the rows whose Age will be predicted).
age_know = age_df[age_df.Age.notnull()].values
age_unknow = age_df[age_df.Age.isnull()].values

# Column 0 is Age (the target); the remaining columns are the predictors.
X = age_know[:, 1:]  # training features
y = age_know[:, 0]  # training target

# Fit a random-forest regressor on the rows with a known age.
RF_clf = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=-1)
RF_clf.fit(X, y)

# Predict Age for the rows where it is missing.
age_predicted = RF_clf.predict(age_unknow[:, 1:])

# Write the predictions back into the original missing slots. The boolean
# mask selects rows in the same order that age_unknow was built from.
data_train.loc[data_train.Age.isnull(), 'Age'] = age_predicted
data_train.info()

# Pclass is read as an int; convert it to a string so that it is one-hot
# encoded as a category by the later get_dummies step.
data_train['Pclass'] = data_train['Pclass'].astype(str)
data_test['Pclass'] = data_test['Pclass'].astype(str)

# Drop the Cabin feature.
data_train = data_train.drop(['Cabin'], axis=1)
data_train.info()

# Drop the remaining rows that still contain missing values
# (in this dataset that is the Embarked feature).
data_train = data_train.dropna(axis=0)
data_train.info()
data_train.head(6)

# 4. Extract the categorical features and one-hot encode them
cate_df = data_train[['Pclass', 'Sex', 'Embarked']]
cate_onehot_df = pd.get_dummies(cate_df)
cate_onehot_df.head(3)

# Numeric features that will be standardized.
df_train = data_train[['Age', 'Fare', 'SibSp', 'Parch']]

# 5. Standardize the numeric features
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(df_train)

# fit_transform returns a bare ndarray, so the row labels are lost. Reattach
# the original index: data_train has gaps in its index after dropna, and
# pd.concat(axis=1) aligns on index labels. A fresh 0..n-1 index would pair
# the scaled features with the wrong passengers.
df_train = pd.DataFrame(X_train, columns=df_train.columns, index=df_train.index)
df_train.head()

# Concatenate label, one-hot categorical features and scaled numeric features.
df_train = pd.concat([data_train['Survived'], cate_onehot_df, df_train], axis=1)

# Safety net: filter out any rows that still contain NaN.
df_train = df_train.dropna(axis=0)
print(df_train.shape)
df_train.head()

# 7. Train a logistic-regression model and validate it
# Convert the training DataFrame to a plain float matrix. The explicit cast
# matters: when the one-hot columns are bool (the pandas >= 2.0 default for
# get_dummies), .values on the mixed frame is an object array, and
# scikit-learn rejects object-dtype labels as an unknown label type.
df_train_mat = df_train.values.astype(float)

# Column 0 is the Survived label; the remaining columns are the features.
X = df_train_mat[:, 1:]  # features
y = df_train_mat[:, 0]  # labels

# Hold out 30% of the rows as a validation set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train an L1-regularized logistic-regression model.
from sklearn import linear_model

LR_clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
LR_clf.fit(X_train, y_train)

# Evaluate on the validation set: hard predictions plus the predicted
# probability of the positive class (column 1 of predict_proba).
y_predict = LR_clf.predict(X_test)
y_predict_prob = LR_clf.predict_proba(X_test)[:, 1]

from sklearn.metrics import classification_report

print('查准率、查全率、F1值:')
print(classification_report(y_test, y_predict, target_names=None))

from sklearn.metrics import roc_auc_score

print('AUC值:')
print(roc_auc_score(y_test, y_predict_prob))

from sklearn.metrics import confusion_matrix

print('混淆矩阵:')
print(confusion_matrix(y_test, y_predict, labels=None))

# 8. Output the feature labels
# Feature names: every column of df_train after the leading Survived label,
# i.e. the same column order as the feature matrix the model was trained on.
feature_list = df_train.columns[1:].tolist()
print(feature_list)

# Weights the LR model learned for each feature.
weight_array = LR_clf.coef_  # 2-D array
weight = weight_array[0]  # first row: a 1-D array with one weight per feature
print(weight)

# Tabulate feature/weight pairs, largest weight first.
df = pd.DataFrame({'feature': feature_list, 'weight': weight}).sort_values(
    'weight', ascending=False
)
df.head(12)
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
