2015美国数学建模a代码c语言,美国数学建模比赛2020 MCM C题代码详解

# -*- coding: utf-8 -*-

import pandas as pd

import numpy as np

import xlrd

import vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.preprocessing import StandardScaler

from openpyxl import load_workbook # 写入excel

from wordcloud import WordCloud as wc

import jieba # 结巴分词

import matplotlib.pyplot as plt # 绘图

from collections import defaultdict # 字典,用于词频统计

from PIL import Image # 打开图片,用于词云背景层

import cv2

from pyecharts.charts import Bar

import datetime

from pandas import Series

# #######################################################################################

# 1. Load the raw review data and apply basic cleaning.

# data = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\2018Q3.csv")
hair_dryer = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer.csv")

# Remove exact duplicate rows. drop_duplicates() returns a new frame instead
# of modifying the caller, so the result must be assigned back; the index is
# rebuilt so later positional joins line up with the surviving rows.
print('hair_dryer删除重复数据前大小:', hair_dryer.shape)
hair_dryer = hair_dryer.drop_duplicates().reset_index(drop=True)
print('hair_dryer删除重复数据后大小:', hair_dryer.shape)

# Take the review text only after de-duplication so it stays row-aligned
# with hair_dryer.
hair_dryer_review = hair_dryer['review_body']

# Report the number of missing values per column (the original computed
# this and discarded it).
print(hair_dryer.isnull().sum())

# #######################################################################################

# 2. Compute a sentiment score for every review.
analyzer = SentimentIntensityAnalyzer()

# Missing review bodies are loaded by pandas as float NaN, and the analyzer
# then fails with "'float' object has no attribute 'split'". Blank them out
# and force everything to str before scoring.
hair_dryer_review = hair_dryer_review.fillna('').astype(str)

# One dict of scores per review, in row order.
hair_dryer_sentiments = [
    analyzer.polarity_scores(review_body) for review_body in hair_dryer_review
]

# Append the scores as new columns. The score frame is given the reviews'
# own index so the join pairs each score with its row even when the index
# is not a plain 0..n-1 range.
hair_dryer = hair_dryer.join(
    pd.DataFrame(hair_dryer_sentiments, index=hair_dryer_review.index)
)

# #######################################################################

# 4. Parse the review_date column into datetimes for the time-based analysis.
hair_dryer = hair_dryer.assign(review_date=pd.to_datetime(hair_dryer['review_date']))

# 5. Use the parsed dates as the index. A Series (not a column label) is
# passed, so the review_date column itself stays in the frame.
hair_dryer = hair_dryer.set_index(hair_dryer['review_date'])

# Inert string literal (never executed): an earlier exploratory snippet that
# derives a numeric weekday column and resamples the reviews by day and by
# month. The trailing "weekday" table appears to be pasted output of
# per-weekday row counts -- TODO confirm which run produced it.
"""

hair_dryer['weekday'] = hair_dryer['review_date'].dt.weekday

df1 = hair_dryer.set_index('review_date')

df1.resample('D').size().sort_values(ascending=False).head(100)

df2 = df1.resample('M').size().to_period()

# df2 = df2.reset_index(df2['review_date'], inplace=True)

print(hair_dryer['weekday'])

weekday

0 1777

1 1702

2 1641

3 1849

4 1531

5 1536

6 1434

"""

# ###################################################################

# Map a date to the English name of its weekday.
def getWeek(x):
    """Return the English weekday name for a date-like value.

    Args:
        x: Any object with a ``weekday()`` method that returns 0 for Monday
            through 6 for Sunday (``datetime.date``, ``datetime.datetime``,
            ``pandas.Timestamp``).

    Returns:
        One of ``'Monday'`` ... ``'Sunday'``.

    Raises:
        KeyError: If ``x.weekday()`` yields a value outside 0-6.
    """
    weekDict = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday',
    }
    # The original also built a "MM-DD" string that was never used; only the
    # weekday name has ever been returned.
    return weekDict[x.weekday()]

# Label every review with the name of the weekday it was posted on.
hair_dryer['weekday'] = hair_dryer['review_date'].apply(getWeek)
# print("weekday_hair_dryer", hair_dryer['weekday'])
weekday_hair_dryer = hair_dryer['weekday']
# hair_dryer = hair_dryer.join(pd.DataFrame(weekday_hair_dryer))

# Calendar month (1-12) of every review. The frame stores it as 'months';
# the standalone Series keeps the name 'month'.
month_values = pd.to_datetime(hair_dryer['review_date']).dt.month
# print("hair_dryer['month']", hair_dryer['month'])
month_hair_dryer = month_values.rename('month')
hair_dryer['months'] = month_values

# #######################################################################################

# Growth in purchase volume (original section heading); the code measures it
# as the number of reviews per calendar day.
hair_dryer['date'] = pd.to_datetime(hair_dryer['review_date']).dt.strftime('%Y-%m-%d')
date_num = hair_dryer['date']

# Reviews per day. The original additionally ran
# date_num.apply(pd.value_counts), which calls value_counts on every single
# date string and discards the result; that call has been removed.
data_counts = date_num.value_counts()
# print(data_counts)

date_num_describe = date_num.describe()
# Output recorded by the original author for print(date_num_describe):
#   count   11470
#   unique  2307
#   top     2010-08-05
#   freq    146

hair_dryer_describe = hair_dryer.describe()
# print(hair_dryer_describe)

# Correlate numeric columns only: the frame also holds text and date
# columns, on which DataFrame.corr() raises in recent pandas versions.
hair_dryer_corr = hair_dryer.select_dtypes(include='number').corr()
# print(hair_dryer_corr)

# Exports kept disabled, as in the original:
# data_counts.to_csv('hair_dryer_data_counts.csv')
# hair_dryer.to_csv('hair_dryer_new.csv')

# Manual step: the data is pre-processed by hand outside this script.
# Once that is done, load the resulting file.
hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")
# hair_dryer = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer.csv")
# hair_dryer_review = hair_dryer['review_body']

# Count reviews per product id and order the counts from largest to smallest.
reviews_per_product = hair_dryer_new['product_id'].value_counts()
hair_dryer_productid = reviews_per_product.sort_values(ascending=False)
print("hair_dryer_productid", hair_dryer_productid)

# The top 10 and bottom 10 products can be read straight off this series.
# Open question from the author: how to pair them with their mean rating
# (done in Excel; BCG matrix).
# hair_dryer_productid.to_csv('hair_dryer_productid.csv')
# hair_dryer_productid.plot(kind='scatter')
# ValueError: plot kind scatter can only be used for data frames

# #################################################################################

# Count how many times a given word occurs.
# The snippet below is an inert string literal (never executed): it reads
# english.txt, splits it on single spaces, prompts for a word and prints how
# often that word appears. Its text is kept verbatim.

"""

import sys

File_tuple1 = open(r'english.txt') #打开目标文件

File_tuple2 = File_tuple1.read()

File_tuple1.close()

File_list = File_tuple2.split(' ') #以空格来划分文件中的单词

#print(File_list)

x = input('请输入要查询的单词:')

a = 0

i = 0

for i in range(len(File_list)):

if File_list[i]==x:

a+=1

print (x,'在english.txt中出现的次数为',a, '次。')

"""

# ################################################################################################

# Prediction
# hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")

# Inspect the distribution of the dependent variable (star_rating).
star_rating_hairdryer = hair_dryer_new['star_rating']
# print("star_rating_hairdryer", hair_dryer_new['star_rating'].value_counts())

# Bar chart of the number of reviews per star rating. The original called
# sns.countplot, but `sns` is never imported in this file (NameError); the
# same chart is drawn here with pandas/matplotlib, which are imported.
star_rating_hairdryer.value_counts().sort_index().plot(kind='bar')
plt.xlabel('star_rating')
plt.ylabel('count')
# plt.tick_params(axis='x', labelsize=6)
plt.show()

# Word cloud 1 ###############################################

# Read the whole review corpus; the context manager closes the file handle,
# which the original left open.
with open(r"C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_review.txt",
          'r', encoding='UTF-8') as hair_dryer_review_text:
    hair_dryer_text = hair_dryer_review_text.read()

# Tokenise with jieba and re-join with spaces so WordCloud can split on them.
cut_hair_dryer_text = " ".join(jieba.cut(hair_dryer_text))

# Image whose shape is used as the word-cloud mask.
# NOTE(review): cv2.imread returns None instead of raising when the file
# cannot be read -- confirm mask.jpg is present in the working directory.
color_mask_hair_dryer = cv2.imread('mask.jpg')

cloud = wc(
    # Font used to render the words.
    # NOTE(review): the configured cloud is now actually used, so this file
    # must exist -- verify the on-disk file name of the font.
    font_path="C:\\Windows\\Fonts\\Times New Roman.TTF",
    # Background colour.
    background_color='white',
    # Shape of the cloud.
    mask=color_mask_hair_dryer,
    # Maximum number of words.
    max_words=2000,
    # Largest font size.
    max_font_size=40,
)

# Build the cloud from the tokenised text using the configured instance.
# The original called wc().generate(...), which created a second, default
# WordCloud and silently ignored every setting above.
wordcloud = cloud.generate(cut_hair_dryer_text)

# Disabled alternative from the original (frequency-dictionary based); its
# second line had lost its leading '#' and broke the file:
# wordcloud = wd(width=1000, height=860, margin=2, font_path="simsun.ttf",
#                background_color="white", max_font_size=180,
#                mask=myimg).fit_words(wordfrequency)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes
plt.show()


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部