Information Content Security: Weibo Topic Detection and Analysis
Contents
- Information Content Security: Weibo Topic Detection and Analysis
- Introduction
- Code
Information Content Security: Weibo Topic Detection and Analysis
Introduction
Based on the paper 《一种中文微博新闻话题检测的方法》 (A Method for Detecting News Topics in Chinese Microblogs) by 郑斐然, 苗夺谦, 张志飞, and 高灿, with some modifications. The script slices the Weibo corpus into time windows, normalizes each post, segments it with jieba and filters stopwords, picks the most frequent words in each window as candidate topic words, and groups them into topics by incremental clustering.
Code
```python
# Information Content Security experiment: (Weibo) topic detection and analysis
# Reference: 郑斐然, 苗夺谦, 张志飞, 高灿, 《一种中文微博新闻话题检测的方法》
import jieba


# Return True if the character is a Chinese character
def is_chinese(char):
    return u'\u4e00' <= char <= u'\u9fa5'
```
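Note that is_chinese() only accepts code points in the basic CJK Unified Ideographs block (U+4E00 to U+9FA5); extension blocks and full-width punctuation fall outside it. A quick check:

```python
print(is_chinese('汉'), is_chinese('A'), is_chinese('。'))  # True False False
```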
```python
# Normalization: keep only the essential information
def standardize(text):
    st_text = ""
    flag = 0
    for char in text:
        if flag == 0:
            # Keep Chinese characters
            if is_chinese(char):
                st_text = st_text + char
            # Skip "#topic#"-style hashtag segments
            elif char == "#":
                flag = 1
        else:
            if char == "#":
                flag = 0
    # Log the original post and its normalized form
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n原语料内容:\n%s" % text)
        s.write("\n-------------------------------------------------------------------")
        s.write("\n标准化处理结果:\n%s" % st_text)
    return st_text
```
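A quick way to see what standardize() keeps: everything except Chinese characters is stripped, and any #topic# span is dropped. A minimal check, assuming the log-file global st is set the way __main__ sets it:

```python
st = '处理结果.txt'  # standardize() appends its log here, as in __main__
print(standardize("#热点#今天天气真好!123 abc"))  # -> 今天天气真好
```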
```python
# Split the corpus into time windows and preprocess each post
def set_time_window():
    # Time-window size (unit: hours)
    time_window = 1
    # Posts grouped by time window: key = time string, value = posts in that window
    time_window_dict = {}
    # One line of the corpus file is one post
    with open(material, 'r', encoding='utf-8') as file:
        for line in file:
            count = 0
            flag = 0
            time = ""
            text = ""
            for c in line:
                count += 1
                # Columns 55-63 hold the digits of the publication time
                if 55 <= count <= 63 and c.isdigit():
                    time = time + c
                # The post text starts at column 73 and ends at the next single quote
                if count >= 73:
                    if c == "'":
                        flag = 1
                    if flag == 0:
                        text = text + c
                    else:
                        break
            st_text = standardize(text)
            time_window_dict.setdefault(time, []).append(st_text)
    return time_window_dict
```
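The fixed column offsets above (55 to 63 for the timestamp digits, 73 onward for the post text) are tied to one particular corpus dump and will silently mis-parse anything else. A hypothetical, more robust variant using a regular expression; the line layout (an ISO-style timestamp followed by the post text in single quotes) and the parse_line name are assumptions, not the actual corpus format:

```python
import re

# Assumed layout: ... 2023-11-05 08:15 ... 'post text'
LINE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2}) (\d{2}):\d{2}.*?'([^']*)'")

def parse_line(line):
    m = LINE_RE.search(line)
    if not m:
        return None, None
    # MMDDHH key, matching the k[0:2], k[2:4], k[4:6] slicing in the output step
    time_key = m.group(2) + m.group(3) + m.group(4)
    return time_key, m.group(5)
```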
```python
# Word segmentation
def divide(st_text):
    di_list = jieba.lcut(st_text, cut_all=False)  # precise-mode segmentation
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------------")
        s.write("\n分词结果:\n")
        s.write('/'.join(di_list))
    return di_list
```
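divide() relies on jieba's precise mode (cut_all=False), which returns a single non-overlapping segmentation rather than every possible word. A quick sanity check:

```python
import jieba

print(jieba.lcut("南京市长江大桥", cut_all=False))  # e.g. ['南京市', '长江大桥']
```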
```python
# Stopword filtering
def delete_stopword(di_list):
    with open(cn_stopword_list, 'r', encoding='utf-8') as c:
        # One stopword per line; use a set for exact-match lookups
        stopwords = set(c.read().split())
    de_list = [word for word in di_list if word not in stopwords]
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\n过滤结果:\n")
        s.write('/'.join(de_list))
    return de_list
```
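The stopword table is loaded into a set of whole tokens on purpose. Testing membership against the raw file text would be substring matching, which also drops real words that merely occur inside some stopword. A hypothetical illustration:

```python
stopword_text = "的 了 不可以"                # pretend stopword file content
print("可以" in stopword_text)               # True: substring hit, wrongly filtered
print("可以" in set(stopword_text.split()))  # False: exact match, kept as intended
```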
```python
# Topic-word detection
def sub_word_detect(processed_dict):
    # Count word frequencies per time window and keep the ten most frequent
    fre_dict = {}
    for k in processed_dict.keys():
        temp = {}
        for de in processed_dict[k]:
            for word in de:
                temp[word] = temp.get(word, 0) + 1
        t1 = sorted(temp.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        del t1[10:]
        fre_dict[k] = t1
    # The paper selects topic words by a compound weight built from a growth
    # coefficient over consecutive time windows; this corpus lacks consecutive
    # windows, so raw frequency is used instead (see the sketch below).
    topic_detect_dict = fre_dict
    return topic_detect_dict
```
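The original listing carries a commented-out, partly garbled fragment that computes the paper's growth coefficient G = (f_i * k) / Σf_u over a review window of k previous time windows, and a compound weight w = ln(G) + a * ln(f_i / f_max) with tuning parameter a = 1.3. A minimal sketch of one plausible reading, assuming window keys are zero-padded digit strings whose consecutive windows differ by 1, and that frequencies come as a {word: count} dict per window (the listing above stores a trimmed list instead); compound_weights is an illustrative name, not part of the script:

```python
import math

def compound_weights(fre_dict, key, k=3, a=1.3):
    """Score words in window `key` by w = ln(G) + a * ln(f_i / f_max)."""
    cur = fre_dict[key]            # {word: frequency} for this window
    f_max = max(cur.values())
    u = int(key)
    weights = {}
    for word, fi in cur.items():
        # Total frequency of `word` over the k preceding windows
        past = sum(fre_dict.get(str(u - r).zfill(len(key)), {}).get(word, 0)
                   for r in range(1, k + 1))
        if past == 0:
            continue               # brand-new word: growth coefficient undefined
        G = (fi * k) / past        # growth coefficient
        weights[word] = math.log(G) + a * math.log(fi / f_max)
    return weights
```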
```python
# Topic-word clustering
def clustering(processed_dict, topic_detect_dict):
    cluster = {}  # final clusters, keyed by time window
    D = 8  # distance threshold
    # Incremental clustering
    for k in topic_detect_dict.keys():
        for t in topic_detect_dict[k]:  # t = (word, frequency in this window)
            if cluster.get(k):
                pw = t[1]
                ad = []  # distance from the word to each existing cluster
                for c in cluster[k]:  # c = one cluster in this window
                    p = []  # p(ci|w) for each word in the cluster
                    for word in c:
                        # p(ci|w): posts containing both the unclassified word
                        # and this cluster word, normalized by the word's frequency
                        pcw = 0
                        for de in processed_dict[k]:
                            if word in de and t[0] in de:
                                pcw += 1
                        p.append(round(pcw / pw, 5))
                    # Distance from the word to the cluster
                    if max(p) == 0:
                        d = 99999
                    else:
                        d = round(1 / max(p), 5)
                    ad.append(d)
                # Start a new cluster or join the nearest one
                mind = min(ad)
                index = ad.index(mind)
                if mind > D:
                    cluster[k].append([t[0]])
                else:
                    cluster[k][index].append(t[0])
            else:
                # First word of the window seeds the initial cluster
                cluster[k] = [[t[0]]]
    return cluster


if __name__ == '__main__':
    # Corpus, Chinese stopword table, and log file
    material = '5千条微博语料.txt'
    cn_stopword_list = 'baidu_stopwords.txt'
    st = '处理结果.txt'
    # Corpus preprocessing
    time_window_dict = set_time_window()
    # Segmentation and stopword filtering; key = publication time,
    # value = list of token lists, one per post
    processed_dict = {}
    for k in time_window_dict.keys():
        for t in time_window_dict[k]:
            di_list = divide(t)
            de_list = delete_stopword(di_list)
            processed_dict.setdefault(k, []).append(de_list)
    topic_detect_dict = sub_word_detect(processed_dict)
    topic_dict = clustering(processed_dict, topic_detect_dict)
    # Output
    for k in sorted(topic_dict.keys()):
        print("---------------------------------------------")
        print("时间:%s-%s %s" % (k[0:2], k[2:4], k[4:6]))
        for c in topic_dict[k]:
            print("话题:", c)
```
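To make the distance used in clustering() concrete: for a candidate word w and an existing cluster, p(ci|w) is the number of posts containing both w and a cluster word, divided by w's frequency, and the distance is 1 / max(p). A toy illustration on hypothetical data (the posts and words are made up):

```python
posts = [["央行", "利率", "下调"], ["央行", "降息"], ["天气", "晴朗"]]
w, freq_w = "利率", 1                       # "利率" appears in one post
cluster_words = ["央行", "降息"]            # an existing cluster
p = [sum(1 for post in posts if cw in post and w in post) / freq_w
     for cw in cluster_words]
d = 1 / max(p) if max(p) > 0 else 99999
print(d)  # 1.0: well under the threshold D = 8, so "利率" joins this cluster
```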
