项目实战1:红酒质量分析

一、数据介绍

        数据下载网址:https://archive.ics.uci.edu/ml/datasets/Wine+Quality

        https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/

        论文下载网址:https://www.sciencedirect.com/science/article/pii/S0167923609001377

        数据说明:包括两个数据集,分别与葡萄牙北部的红葡萄酒和白葡萄酒样品有关,目标是根据物理化学测试结果对葡萄酒质量进行建模。下文代码使用的是其中的白葡萄酒数据集(winequality-white)。

二、建模步骤

        (1)加载CSV数据

        (2)将字符串类型的数据转换为浮点型

        (3)数据归一化(最小-最大值归一化,Min-Max)

        (4)交叉验证

        (5)通过RMSE评估算法性能

        下面是具体的实现代码:

# -*- coding: utf-8 -*-
"""
Created on Mon May  9 15:23:51 2022

@author: xiaofeng
"""

# 1. load csv
# 2. convert string to float
# 3. normalization
# 4. cross validation
# 5. evaluate our algo (RMSE)

# 1. Import standard lib
from math import sqrt
from csv import reader
from random import randrange
from random import seed


# 2. Load our csv file
def load_csv(filename):
    """Read a CSV file and return its non-empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being the list of raw string fields
        produced by ``csv.reader``. Rows the reader yields as empty
        (blank lines) are skipped.
    """
    # newline="" lets the csv module do its own line-ending handling,
    # which is what the csv documentation asks for.
    with open(filename, "r", newline="") as file:
        return [row for row in reader(file) if row]


# 3. Convert our datatype
def string_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column to convert.

    Raises:
        ValueError: If a value in the column cannot be parsed by ``float``.
    """
    for row in dataset:
        # Strip surrounding whitespace before parsing the number.
        row[column] = float(row[column].strip())


# 4. Find the min and max of our dataset
def find_min_and_max(dataset):
    """Collect the minimum and maximum value of every column.

    Args:
        dataset: List of rows; the number of columns is taken from the
            first row.

    Returns:
        A list with one ``[min_value, max_value]`` pair per column, in
        column order. An empty dataset yields an empty list.
    """
    if not dataset:
        return []
    min_max_list = []
    for i in range(len(dataset[0])):
        column_values = [row[i] for row in dataset]
        min_max_list.append([min(column_values), max(column_values)])
    return min_max_list


# 5. Normalize our data
def normalization(dataset, min_max_list):
    """Rescale every value with min-max normalization, in place.

    Each value becomes ``(value - min) / (max - min)`` using the pair
    stored for its column in ``min_max_list``.

    Args:
        dataset: List of rows of numbers; modified in place.
        min_max_list: One ``[min_value, max_value]`` pair per column.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low, high = min_max_list[i]
            span = high - low
            # A constant column has zero range; map it to 0.0 instead of
            # dividing by zero.
            row[i] = (value - low) / span if span else 0.0


# 6. Split our data
def k_fold_cross_validation_split(dataset, n_folds):
    """Randomly split the dataset into ``n_folds`` equally sized folds.

    Rows are drawn without replacement using ``random.randrange``, so the
    result is reproducible once the ``random`` module has been seeded.

    Args:
        dataset: List of rows; it is not modified (a shallow copy is
            consumed instead).
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` folds, each holding
        ``len(dataset) // n_folds`` rows. Rows left over after the integer
        division are not placed in any fold.
    """
    splitted_dataset = []
    remaining_rows = list(dataset)
    every_fold_size = len(dataset) // n_folds
    for _ in range(n_folds):
        fold = []
        while len(fold) < every_fold_size:
            index = randrange(len(remaining_rows))
            fold.append(remaining_rows.pop(index))
        splitted_dataset.append(fold)
    return splitted_dataset


# 7. Use the root mean squared error to score our model
def rmse_method(actual_data, predicted_data):
    """Compute the root mean squared error between two sequences.

    Args:
        actual_data: Sequence of true values.
        predicted_data: Sequence of predicted values, indexed in step with
            ``actual_data``.

    Returns:
        ``sqrt(mean((predicted - actual) ** 2))`` as a float.

    Raises:
        ValueError: If ``actual_data`` is empty, because the mean is
            undefined in that case.
    """
    n_samples = len(actual_data)
    if n_samples == 0:
        raise ValueError("rmse_method() requires at least one sample")
    sum_of_error = 0.0
    for i in range(n_samples):
        predicted_error = predicted_data[i] - actual_data[i]
        sum_of_error += predicted_error ** 2
    return sqrt(sum_of_error / float(n_samples))


# 8. How good is our algo, measured with cross validation
def how_good_is_our_algo(dataset, algo, n_folds, *args):
    """Evaluate an algorithm with k-fold cross validation and RMSE.

    Each fold is used once as the test set while the rows of all other
    folds form the training set.

    Args:
        dataset: List of rows whose last element is the target value.
        algo: Callable invoked as ``algo(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of cross-validation folds.
        *args: Extra positional arguments forwarded to ``algo``.

    Returns:
        A list with one RMSE score per fold.
    """
    folds = k_fold_cross_validation_split(dataset, n_folds)
    scores = []
    for held_out_index, fold in enumerate(folds):
        # Exclude the held-out fold by position, not by equality, so two
        # folds with identical contents cannot be confused; flatten the
        # remaining folds in a single pass.
        train_set = [
            row
            for index, other_fold in enumerate(folds)
            if index != held_out_index
            for row in other_fold
        ]
        test_set = []
        for row in fold:
            row_copy = list(row)
            # Hide the target so the algorithm cannot read it.
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algo(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(rmse_method(actual, predicted))
    return scores


# 9. Make a prediction
def predict(row, coefficients):
    """Predict the target of one row with a linear model.

    Args:
        row: Feature values followed by the target in the last position;
            the last element is never read.
        coefficients: ``coefficients[0]`` is the intercept and
            ``coefficients[i + 1]`` is the weight of ``row[i]``.

    Returns:
        The intercept plus the weighted sum of the features.
    """
    y_hat = coefficients[0]
    for i in range(len(row) - 1):
        y_hat += coefficients[i + 1] * row[i]
    return y_hat


# 10. Use stochastic gradient descent to calculate the coefficients
def sgd_method_to_calculate_coefficient(training_data, learning_rate, n_epoch):
    """Fit linear regression coefficients with stochastic gradient descent.

    The coefficients start at zero and are updated after every training
    row, for ``n_epoch`` passes over the training data.

    Args:
        training_data: Non-empty list of rows whose last element is the
            target value.
        learning_rate: Step size applied to every update.
        n_epoch: Number of passes over ``training_data``.

    Returns:
        A list holding the intercept followed by one weight per feature.
    """
    # One slot per column: the target's slot is reused for the intercept.
    coefficients_list = [0.0 for _ in range(len(training_data[0]))]
    for _ in range(n_epoch):
        for row in training_data:
            y_hat = predict(row, coefficients_list)
            error = y_hat - row[-1]
            # The intercept update has no feature factor.
            coefficients_list[0] = coefficients_list[0] - learning_rate * error
            for i in range(len(row) - 1):
                coefficients_list[i + 1] = (
                    coefficients_list[i + 1] - learning_rate * error * row[i]
                )
    return coefficients_list


# 11. Use the linear regression algo
def using_sgd_method_to_calculate_linear_regression(
    training_data, testing_data, learning_rate, n_epoch
):
    """Train linear regression with SGD and predict the test rows.

    Args:
        training_data: Rows used to fit the coefficients; the last element
            of each row is the target.
        testing_data: Rows to predict; only their feature values are read.
        learning_rate: Step size passed to the SGD fit.
        n_epoch: Number of training passes passed to the SGD fit.

    Returns:
        A list with one prediction per row of ``testing_data``.
    """
    coefficients_list = sgd_method_to_calculate_coefficient(
        training_data, learning_rate, n_epoch
    )
    return [predict(row, coefficients_list) for row in testing_data]


# Main entry point
if __name__ == "__main__":
    # 12. Load our real wine quality data.
    # Seed the random module so the fold split is reproducible.
    seed(1)
    # Raw string: the path contains backslashes that are not valid escape
    # sequences; the runtime value is unchanged.
    dataset_list = load_csv(r".\dataset\Wine+Quality\winequality-white(chuli).csv")
    for i in range(len(dataset_list[0])):
        string_to_float(dataset_list, i)

    # 13. Normalization
    min_and_max = find_min_and_max(dataset_list)
    normalization(dataset_list, min_and_max)

    # 14. How good is our algo
    n_folds = 5
    learning_rate = 0.01
    n_epoch = 50
    algo_score = how_good_is_our_algo(
        dataset_list,
        using_sgd_method_to_calculate_linear_regression,
        n_folds,
        learning_rate,
        n_epoch,
    )
    print("Our algo's score is %s" % algo_score)
    print("The mean of our algo's RMSE is %.3f" % (sum(algo_score) / float(len(algo_score))))

三、数据和代码下载

链接:https://pan.baidu.com/s/1iN65MCrfuVcdQw0gVyoF2A 
提取码:rxrg 
--来自百度网盘超级会员V7的分享


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部