Data Mining: ECG Heartbeat Classification
Task 1: Day One Experience
Learning diary
Today I mainly focused on getting the baseline model to run end to end and on understanding its code.
Problems:
- My grasp of machine learning theory is still shallow
- in particular ensemble learning and multi-class classification
- I did not manage to set aside enough time to dig into the code
# This section imports the packages used below
import os
import gc
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/testA.csv')
print(train.head())
# Data preprocessing: downcast each column to the smallest dtype that can hold it, to save memory
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
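To see what the downcasting actually does, here is a tiny sketch of my own (the toy DataFrame and its column names are made up, not part of the baseline) that runs reduce_mem_usage on a small frame and prints the resulting dtypes:

# Toy demo (my own addition, not part of the baseline)
demo = pd.DataFrame({
    'small_int': np.arange(100, dtype=np.int64),            # values 0..99 fit into int8
    'small_float': np.random.rand(100).astype(np.float64),  # values in [0, 1) fit into float16
    'text': ['a'] * 100,                                     # object column becomes a pandas category
})
demo = reduce_mem_usage(demo)
print(demo.dtypes)

One thing worth keeping in mind: float16 only carries roughly three decimal digits of precision, so the signal values lose a little accuracy after downcasting.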
# Simple preprocessing: expand the comma-separated heartbeat signal string into one numeric column per sample point
train_list = []
for items in train.values:
    train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])
train = pd.DataFrame(np.array(train_list))
train.columns = ['id'] + ['s_' + str(i) for i in range(len(train_list[0]) - 2)] + ['label']
train = reduce_mem_usage(train)

test_list = []
for items in test.values:
    test_list.append([items[0]] + [float(i) for i in items[1].split(',')])
test = pd.DataFrame(np.array(test_list))
test.columns = ['id'] + ['s_' + str(i) for i in range(len(test_list[0]) - 1)]
test = reduce_mem_usage(test)
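matplotlib is imported above but never used in the baseline, so as a quick visual check of my own I plotted one preprocessed heartbeat signal; the column selection below assumes the s_0, s_1, ... naming created in the preprocessing step:

# Quick visual check (my own addition): plot the first heartbeat signal
signal_cols = [c for c in train.columns if c.startswith('s_')]
plt.figure(figsize=(10, 3))
plt.plot(train.loc[0, signal_cols].astype(float).values)
plt.title('Heartbeat signal of the first training sample')
plt.xlabel('sample point')
plt.ylabel('amplitude')
plt.show()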
# Prepare the training / test data
x_train = train.drop(['id', 'label'], axis=1)  # drop the id and label columns, keep only the signal features
y_train = train['label']                       # keep only the label
x_test = test.drop(['id'], axis=1)             # keep only the signal features
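Before training I added a small sanity check of my own to confirm the shapes and the class balance (the baseline below assumes four classes via num_class=4):

# Sanity check (my own addition): shapes and label distribution
print('x_train:', x_train.shape, 'x_test:', x_test.shape)
print(y_train.value_counts())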
# Model training
def abs_sum(y_pre, y_tru):
    # Competition metric: total absolute difference between the predicted
    # probability matrix and the one-hot encoded true labels
    y_pre = np.array(y_pre)
    y_tru = np.array(y_tru)
    loss = sum(sum(abs(y_pre - y_tru)))
    return loss

def cv_model(clf, train_x, train_y, test_x, clf_name):
    # 5-fold cross-validation: train one model per fold, score it on the
    # held-out fold, and average the test predictions over the folds
    folds = 5
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    test = np.zeros((test_x.shape[0], 4))
    cv_scores = []
    onehot_encoder = OneHotEncoder(sparse=False)
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], \
                                     train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'num_class': 4,
                'num_leaves': 2 ** 5,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': seed,
                'nthread': 24,  # changed from the original 28 to 24
                'n_jobs': 24,
                'verbose': -1,
            }
            model = clf.train(params,
                              train_set=train_matrix,
                              valid_sets=valid_matrix,
                              num_boost_round=2000,
                              verbose_eval=100,
                              early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        val_y = np.array(val_y).reshape(-1, 1)
        val_y = onehot_encoder.fit_transform(val_y)
        print('Predicted probability matrix:')
        print(test_pred)
        test += test_pred
        score = abs_sum(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    test = test / kf.n_splits
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

lgb_test = lgb_model(x_train, y_train, x_test)
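The evaluation metric was the part I found least intuitive, so here is a tiny worked example with made-up numbers showing what abs_sum computes, plus how the averaged 4-column probability matrix returned by cv_model can be turned into hard class predictions:

# Worked example (made-up numbers): abs-sum metric on two samples
y_true_onehot = np.array([[1, 0, 0, 0],
                          [0, 0, 1, 0]])
y_pred_proba = np.array([[0.7, 0.1, 0.1, 0.1],
                         [0.2, 0.1, 0.6, 0.1]])
# per-element absolute errors: 0.3+0.1+0.1+0.1 + 0.2+0.1+0.4+0.1 = 1.4
print(abs_sum(y_pred_proba, y_true_onehot))

# Hard class predictions from the averaged probability matrix
pred_labels = np.argmax(lgb_test, axis=1)
print(pred_labels[:10])

Because abs_sum only takes absolute differences, the argument order does not matter; the baseline calls it as abs_sum(val_y, val_pred) with the one-hot labels first.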
Screenshot of the model training process:

Final summary:
I now have a rough understanding of the data preprocessing part. In the model training part, however, my lack of grounding in ensemble learning theory still leaves me with big questions about how the model is actually trained and how its parameters should be set.
