实验汇总 11:泰坦尼克号模型构建

# 导入所需库
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
import os
from sklearn import preprocessing
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Location of the Titanic dataset and the local cache file name.
data_url = "http://hbiostat.org/data/repo/titanic3.xls"
data_file_path = r"titanic3.xls"

# Download the dataset only when it is not already cached locally.
# (The original cell was collapsed onto one line and was not valid Python.)
if not os.path.isfile(data_file_path):
    result = urllib.request.urlretrieve(data_url, data_file_path)
    print('downloaded:', result)
else:
    print(data_file_path, 'data file already exists.')
titanic3.xls data file already exists.
# Load the full Titanic dataset from the cached Excel file into a DataFrame.
df_data = pd.read_excel(data_file_path)
# Preview the first two records (notebook cell output).
df_data.head(2)
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
011Allen, Miss. Elisabeth Waltonfemale29.00000024160211.3375B5S2NaNSt Louis, MO
111Allison, Master. Hudson Trevormale0.916712113781151.5500C22 C26S11NaNMontreal, PQ / Chesterville, ON
# Keep only the label ('survived', first) plus the feature columns used for
# modelling; the remaining columns (ticket, cabin, boat, ...) are discarded.
selected_cols = [
    "survived",
    "pclass", "name", "sex", "age", "sibsp", "parch", "fare", "embarked",
]
selected_df_data = df_data.loc[:, selected_cols]
# Preview the first two records (notebook cell output).
selected_df_data.head(2)
survived pclass name sex age sibsp parch fare embarked
011Allen, Miss. Elisabeth Waltonfemale29.000000211.3375S
111Allison, Master. Hudson Trevormale0.916712151.5500S
def prepare_data(df_data):
    """Clean the selected Titanic columns and return (features, labels).

    Steps (the original cell was collapsed onto one invalid line; this is
    the reconstructed, properly formatted version):
      - drop the non-numeric 'name' column;
      - fill missing 'age' / 'fare' values with the column mean;
      - encode 'sex' as 0 (female) / 1 (male);
      - fill missing 'embarked' with 'S', then encode C/Q/S as 0/1/2;
      - min-max scale every feature column into [0, 1].

    Args:
        df_data: DataFrame with the columns in `selected_cols`
            (label 'survived' first, then the feature columns).

    Returns:
        (norm_features, label): scaled feature ndarray (columns 1..end)
        and the label vector (column 0, 'survived').
    """
    # drop() returns a new frame, so the caller's DataFrame is not mutated.
    df = df_data.drop(['name'], axis=1)

    # Fill missing numeric values with the column mean.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)

    # Convert categorical text columns to small integer codes.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    df['embarked'] = df['embarked'].fillna('S')
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

    # Split into features (columns 1..end) and label (column 0).
    ndarray_data = df.values
    features = ndarray_data[:, 1:]
    label = ndarray_data[:, 0]

    # Scale each feature column into [0, 1].
    # NOTE: relies on the module-level `from sklearn import preprocessing`.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    norm_features = minmax_scale.fit_transform(features)
    return norm_features, label

1.4 打乱样本顺序

通过Pandas的抽样函数sample实现,frac为百分比。
frac=1 表示 100%,即抽取全部数据并打乱顺序,得到随机排列的样本。

# Shuffle every row (frac=1 keeps 100% of the data, in random order),
# then turn the frame into normalized feature/label arrays.
selected_df_data = selected_df_data.sample(frac=1)
x_data, y_data = prepare_data(selected_df_data)

# 80/20 train/test split on the shuffled arrays.
train_size = int(len(x_data) * 0.8)
print(train_size)

# Training portion: first 80% of rows.
x_train, y_train = x_data[:train_size], y_data[:train_size]
# Test portion: remaining 20%.
x_test, y_test = x_data[train_size:], y_data[train_size:]
1047

输入层:7个神经元
隐藏层1:64个神经元
隐藏层2:32个神经元
输出层:1个神经元

# Build the classifier as one Sequential layer stack:
#   7 inputs -> Dense(64, relu) -> Dropout(0.3)
#            -> Dense(32, relu) -> Dropout(0.3)
#            -> Dense(1, sigmoid)  (survival probability)
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_dim=7, activation="relu"),
    tf.keras.layers.Dropout(rate=0.3),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(rate=0.3),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_3 (Dense)              (None, 64)                512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
=================================================================
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________

输入层:7个神经元
隐藏层1:64个神经元
隐藏层2:32个神经元
输出层:1个神经元

3.1 定义模型训练模式

# Binary-classification setup: cross-entropy loss, Adam optimizer,
# accuracy reported during training.
model.compile(loss="binary_crossentropy",
              optimizer="Adam",
              metrics=["accuracy"])

3.2 训练模型

# Train for 200 epochs in mini-batches of 32; the last 20% of the training
# split is held out for per-epoch validation.  The returned history holds
# loss/accuracy curves used by the plots below.
train_history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=200,
    validation_split=0.2,
    verbose=1,
)
Epoch 196/200
27/27 [==============================] - 0s 3ms/step - loss: 0.4276 - accuracy: 0.7969 - val_loss: 0.4232 - val_accuracy: 0.8190
Epoch 197/200
27/27 [==============================] - 0s 3ms/step - loss: 0.4256 - accuracy: 0.8124 - val_loss: 0.4205 - val_accuracy: 0.8143
Epoch 198/200
27/27 [==============================] - 0s 3ms/step - loss: 0.4316 - accuracy: 0.7969 - val_loss: 0.4215 - val_accuracy: 0.8333
Epoch 199/200
27/27 [==============================] - 0s 3ms/step - loss: 0.4358 - accuracy: 0.8053 - val_loss: 0.4231 - val_accuracy: 0.8381
Epoch 200/200
27/27 [==============================] - 0s 3ms/step - loss: 0.4214 - accuracy: 0.8136 - val_loss: 0.4230 - val_accuracy: 0.8381

训练过程数据可视化

# Training loss (blue) vs validation loss (red), one point per epoch.
for metric, colour in (("loss", "b"), ("val_loss", "r")):
    plt.plot(train_history.history[metric], color=colour)

在这里插入图片描述

# Training accuracy (blue) vs validation accuracy (red), one point per epoch.
for metric, colour in (("accuracy", "b"), ("val_accuracy", "r")):
    plt.plot(train_history.history[metric], color=colour)

在这里插入图片描述

# Final evaluation on the held-out 20% test split: [loss, accuracy].
result = model.evaluate(x_test, y_test, verbose=2)
test_loss, test_accuracy = result
print("测试集损失={:.10f},测试集准确率={:.10f}".format(test_loss, test_accuracy))
9/9 - 0s - loss: 0.4039 - accuracy: 0.8244
测试集损失=0.4038968980,测试集准确率=0.8244274855
# NOTE(review): this repeats the evaluation cell above verbatim; the model
# and test split are unchanged, so the printed numbers are identical.
result = model.evaluate(x_test,y_test,verbose=2)
print("测试集损失={:.10f},测试集准确率={:.10f}".format(result[0],result[1]))
9/9 - 0s - loss: 0.4039 - accuracy: 0.8244
测试集损失=0.4038968980,测试集准确率=0.8244274855
# Echo the column order that prepare_data expects ('name' still included).
selected_cols
['survived','pclass','name','sex','age','sibsp','parch','fare','embarked']
# Passenger records for Jack and Rose, in the same column order as
# selected_cols.  The leading 'survived' value is only a placeholder label;
# prepare_data strips it off before prediction.
Jack_info = [0, 3, 'Jack', 'male', 23, 1, 0, 5.0000, 'S']
Rose_info = [1, 1, 'Rose', 'female', 20, 1, 0, 100.0000, 'S']

# Assemble the two records into a DataFrame matching the training schema.
new_passenger_pd = pd.DataFrame(
    [Jack_info, Rose_info],
    columns=selected_cols,
)
# Display the new frame (notebook cell output).
new_passenger_pd
survived pclass name sex age sibsp parch fare embarked
003Jackmale23105.0S
111Rosefemale2010100.0S
# Run the trained model on the two new passengers.
# y_np holds the placeholder labels and is intentionally unused here.
x_np, y_np = prepare_data(new_passenger_pd)
surv_probability = model.predict(x_np)
# (The original cell called array_p.tolist() and discarded the result —
# removed as a dead statement; it had no effect on the output below.)
# Wrap the raw probabilities in a labelled DataFrame for display.
array_p = pd.DataFrame(surv_probability, columns=['存活率'], index=['Jack', 'Rose'])
array_p
存活率
Jack0.066661
Rose0.999976


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部