NLP学习（十一）-NLP实战之电影评分数据的情感分析-Python3

2023-11-23 21:27:42

电影评论数据集
百度网盘地址
https://pan.baidu.com/s/15ReMZi0gGo0MA5pn-1h3LQ
提取码：qknb
字典参考

1、基于词袋模型的逻辑回归情感分类

# -*- coding: UTF-8 -*-
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import itertools
import jieba
import os
# Directory containing this script; data files are resolved relative to it.
cur_dir = os.path.dirname(os.path.abspath(__file__))
print(cur_dir)

# Hyperparameters
stopwords_path = os.path.join(cur_dir, '../testdata/chineseStopWords.txt')  # path of the stop-word dictionary

# Load the stop words (one entry per line, surrounding whitespace stripped).
# The context manager closes the file as soon as the list is built instead of
# leaving the open handle to the garbage collector.
with open(stopwords_path, encoding="utf-8") as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]


########################### Bag-of-words features ############################################
# Rebuild each comment as a new sentence
def clean_text(text):
    """Reduce a raw comment to a space-separated string of Chinese tokens.

    Only characters in the range U+4E00..U+9FA5 are kept, which discards HTML
    markup, punctuation, digits and Latin letters in a single step. The
    remaining text is segmented with jieba (``cut_all=False``), entries found
    in the module-level ``stopwords`` list are dropped, and the surviving
    tokens are joined with single spaces.

    :param text: the raw comment. Non-string values are treated as empty.
    :return: the cleaned tokens joined by spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # An empty CSV field typically reaches this function as float NaN
        # (pandas' missing-value marker); re.findall would raise TypeError.
        return ''
    chinese_only = ''.join(re.findall('[\u4e00-\u9fa5]', text))
    tokens = jieba.lcut(chinese_only, cut_all=False)
    return ' '.join(token for token in tokens if token not in stopwords)


# Confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: 2-D array of counts (indexed as ``cm[i, j]`` and via ``cm.shape``).
    :param classes: tick labels used for both axes.
    :param title: title placed above the plot.
    :param cmap: matplotlib colormap used for the cells.
    :return: None; the caller is responsible for ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text, the rest black,
    # so the number stays readable against the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Read the data. The path is resolved from the script directory, like the
    # stop-word file, so the script no longer depends on the working directory.
    df = pd.read_csv(os.path.join(cur_dir, '../testdata/ratings.csv'), sep=',', escapechar='\\')
    print(df.head(5))

    # Clean the data: apply clean_text to every entry of the comment Series.
    df['clean_comment'] = df.comment.apply(clean_text)
    print(df['clean_comment'])

    # Extract bag-of-words features with sklearn's CountVectorizer.
    vectorizer = CountVectorizer(max_features=50)
    train_data_features = vectorizer.fit_transform(df.clean_comment).toarray()
    print(train_data_features)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.rating, test_size=0.2, random_state=0)
    print(X_train, X_test, y_train, y_test)

    # Train the classifier.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    print(y_pred)
    print(y_test)

    cnf_matrix = confusion_matrix(y_test, y_pred)
    print(cnf_matrix)

    # NOTE(review): the metrics below read only rows/columns 0 and 1 of the
    # matrix, i.e. they assume a two-class label. Confirm that `rating` is
    # binary in ratings.csv.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

2、基于word2vec词向量模型的逻辑回归情感分类

# -*- coding: UTF-8 -*-
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
import warnings
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import itertools
warnings.filterwarnings("ignore")
import jieba
import os
# Directory containing this script; data files are resolved relative to it.
cur_dir = os.path.dirname(os.path.abspath(__file__))
print(cur_dir)

# Hyperparameters
stopwords_path = os.path.join(cur_dir, '../testdata/chineseStopWords.txt')  # path of the stop-word dictionary

# Load the stop words (one entry per line). The context manager closes the
# file as soon as the list is built.
with open(stopwords_path, encoding="utf-8") as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]


def clean_text(text, remove_stopwords=False):
    """Reduce a raw comment to a space-separated string of Chinese tokens.

    Only characters in the range U+4E00..U+9FA5 are kept; the result is
    segmented with jieba (``cut_all=False``) and entries of the module-level
    ``stopwords`` list are removed.

    :param text: the raw comment. Non-string values are treated as empty.
    :param remove_stopwords: kept for backward compatibility only; it is not
        consulted, stop words are always removed.
    :return: the cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # An empty CSV field typically arrives as float NaN; re.findall would
        # raise TypeError on it.
        return ''
    words = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', text)), cut_all=False)
    words = [w for w in words if w not in stopwords]
    return ' '.join(words)


def split_sentences(review):
    """Split a review into sentences and clean each one.

    Relies on the module-level ``tokenizer`` created in the ``__main__``
    section below.

    :param review: the review text; it is converted with ``str()`` first.
    :return: list of cleaned sentence strings (one per non-empty sentence).
    """
    raw_sentences = tokenizer.tokenize(str(review).strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences


def to_review_vector(review):
    """Represent a review as the average of its word2vec word vectors.

    Relies on the module-level ``model`` trained in the ``__main__`` section.

    :param review: the raw review text.
    :return: ``pd.Series`` of length ``model.wv.vector_size``; all zeros when
        no token of the review is in the model vocabulary.
    """
    # clean_text returns ONE space-joined string, so it must be split back
    # into tokens; iterating the string directly would walk single characters.
    tokens = clean_text(review, remove_stopwords=True).split()
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return pd.Series(np.zeros(model.wv.vector_size))
    # Average over the words actually found. float64 keeps the dtype the
    # previous zero-initialised accumulator produced.
    return pd.Series(np.mean(vectors, axis=0, dtype=np.float64))


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    :param cm: 2-D array of counts (indexed as ``cm[i, j]`` and via ``cm.shape``).
    :param classes: tick labels used for both axes.
    :param title: title placed above the plot.
    :param cmap: matplotlib colormap used for the cells.
    :return: None; the caller is responsible for ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


if __name__ == '__main__':
    # Read the data. The path is resolved from the script directory, like the
    # stop-word file, so the script no longer depends on the working directory.
    df = pd.read_csv(os.path.join(cur_dir, '../testdata/ratings.csv'), sep=',', escapechar='\\')

    # Clean the data.
    df['clean_review'] = df.comment.apply(clean_text)
    review_part = df['clean_review']

    # Sentence splitting and tokenisation with nltk.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Flatten the per-review sentence lists in linear time
    # (sum(list_of_lists, []) re-copies the accumulator on every step).
    sentences = list(itertools.chain.from_iterable(review_part.apply(split_sentences)))
    sentences_list = [nltk.word_tokenize(str(line).strip()) for line in sentences]

    # word2vec
    num_features = 300  # Word vector dimensionality
    min_word_count = 40  # Minimum word count
    num_workers = 4  # Number of threads to run in parallel
    context = 10  # Context window size
    # NOTE(review): model_name is not used; the model is saved under the
    # fixed file name below.
    model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
    # NOTE(review): `size=` and `init_sims` are the gensim 3.x spellings;
    # gensim 4 renamed `size` to `vector_size` - adjust when upgrading.
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features, min_count=min_word_count, window=context)
    model.init_sims(replace=True)
    model.save('word2vec.models')

    # The text column is `comment` (used above). The label is assumed to be
    # `rating`, as in the bag-of-words script that reads the same CSV -
    # TODO confirm against ratings.csv.
    train_data_features = df.comment.apply(to_review_vector)
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.rating, test_size=0.2, random_state=0)

    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # NOTE(review): the metrics below read only rows/columns 0 and 1 of the
    # matrix, i.e. they assume a two-class label - confirm.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    print("accuracy metric in the testing dataset: ", (cnf_matrix[1, 1] + cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[1, 1] + cnf_matrix[1, 0] + cnf_matrix[0, 1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

本文来自互联网用户投稿，文章观点仅代表作者本人，不代表本站立场，不承担相关法律责任。如若转载，请注明出处。 如若内容造成侵权/违法违规/事实不符，请点击【内容举报】进行投诉反馈！

标签：技术

Duilib中list控件支持ctrl和shif多行选中的实现

[ICML2015]Batch Normalization:Accelerating Deep Network Training by Reducing Internal Covariate Shif

win10系统微软输入法于eclipse ctrl+shif+f冲突间接处理办法

Codeforces Round #259 (Div. 2) B. Little Pony and Sort by Shif

读LDD3，内存映射与DMA--PAGE_SHIF…

VMware虚拟机安装XP【要先分区，再设置BOOT 启动CD，shif+上移】

更换iBus五笔的左与右Shif

sublime ctrl+shif+f 没用解决办法

idea 对 ctrl + z 的撤销是 ctrl + shif + z

计算机最早的设计师应用于,计算机应用基础选择题doc.doc

win10自带截图神器：Win+Shift+S

Python基础之文件目录操作

python简述目录_Python基础之文件目录操作(示例代码)

tp5 如何做数据采集

任务2-7(服务器字体+阿里巴巴矢量库)

html标签（1)：h1~h6,p,br,pre,hr

TI 电量计介绍与芯片选型指南

几款TI电源芯片简介

TI DSP芯片C2000系列读取FLASH数据

德州仪器(Ti)平台嵌入式开发基础

TI三相电机智能栅极驱动芯片特点分类

省选模拟（12.08） T3 圈圈圈圈圈圈圈圈

Hadoop生态圈技术栈（上）

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之6.Impala交互式查询

小猿圈之Linux下Mysql 操作命令

大数据Hadoop生态圈常用面试题

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之4.Hive DDL、DQL和数据操作

备战Noip2018模拟赛11（B组）T3 Monogatari 物语

【智能优化算法-圆圈搜索算法】基于圆圈搜索算法Circle Search Algorithm求解单目标优化问题附matlab代码

NYOJ 78 圈水池

递归问题跑道汽车绕圈问题 Python实现

Hadoop生态圈（三）：MapReduce

NLP学习（十一）-NLP实战之电影评分数据的情感分析-Python3

1、基于词袋模型的逻辑回归情感分类

2、基于word2vec词向量模型的逻辑回归情感分类

相关文章