Information Content Security: Weibo Topic Detection and Analysis
Contents
- Information Content Security: Weibo Topic Detection and Analysis
- Introduction
- Code
Information Content Security: Weibo Topic Detection and Analysis
Introduction
Based on the paper 《一种中文微博新闻话题检测的方法》 (A Method for Detecting News Topics in Chinese Microblogs) by 郑斐然, 苗夺谦, 张志飞, and 高灿, with some modifications. The script slices the Weibo corpus into time windows, normalizes each post, segments it with jieba and filters stopwords, picks the most frequent words in each window as candidate topic words, and groups them into topics by incremental clustering.
Code
```python
# Information Content Security experiment: (Weibo) topic detection and analysis
# Reference: 郑斐然, 苗夺谦, 张志飞, 高灿, 《一种中文微博新闻话题检测的方法》
import jieba


# Return True if the character is a Chinese character
def is_chinese(char):
    return u'\u4e00' <= char <= u'\u9fa5'
```
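Note that is_chinese() only accepts code points in the basic CJK Unified Ideographs block (U+4E00 to U+9FA5); extension blocks and full-width punctuation fall outside it. A quick check:

```python
print(is_chinese('汉'), is_chinese('A'), is_chinese('。'))  # True False False
```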
```python
# Normalization: keep only the essential information
def standardize(text):
    st_text = ""
    flag = 0
    for char in text:
        if flag == 0:
            # Keep Chinese characters
            if is_chinese(char):
                st_text = st_text + char
            # Skip "#topic#"-style hashtag segments
            elif char == "#":
                flag = 1
        else:
            if char == "#":
                flag = 0
    # Log the original post and its normalized form
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n原语料内容:\n%s" % text)
        s.write("\n-------------------------------------------------------------------")
        s.write("\n标准化处理结果:\n%s" % st_text)
    return st_text
```
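A quick way to see what standardize() keeps: everything except Chinese characters is stripped, and any #topic# span is dropped. A minimal check, assuming the log-file global st is set the way __main__ sets it:

```python
st = '处理结果.txt'  # standardize() appends its log here, as in __main__
print(standardize("#热点#今天天气真好!123 abc"))  # -> 今天天气真好
```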
```python
# Split the corpus into time windows and preprocess each post
def set_time_window():
    # Time-window size (unit: hours)
    time_window = 1
    # Posts grouped by time window: key = time string, value = posts in that window
    time_window_dict = {}
    # One line of the corpus file is one post
    with open(material, 'r', encoding='utf-8') as file:
        for line in file:
            count = 0
            flag = 0
            time = ""
            text = ""
            for c in line:
                count += 1
                # Columns 55-63 hold the digits of the publication time
                if 55 <= count <= 63 and c.isdigit():
                    time = time + c
                # The post text starts at column 73 and ends at the next single quote
                if count >= 73:
                    if c == "'":
                        flag = 1
                    if flag == 0:
                        text = text + c
                    else:
                        break
            st_text = standardize(text)
            time_window_dict.setdefault(time, []).append(st_text)
    return time_window_dict
```
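The fixed column offsets above (55 to 63 for the timestamp digits, 73 onward for the post text) are tied to one particular corpus dump and will silently mis-parse anything else. A hypothetical, more robust variant using a regular expression; the line layout (an ISO-style timestamp followed by the post text in single quotes) and the parse_line name are assumptions, not the actual corpus format:

```python
import re

# Assumed layout: ... 2023-11-05 08:15 ... 'post text'
LINE_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2}) (\d{2}):\d{2}.*?'([^']*)'")

def parse_line(line):
    m = LINE_RE.search(line)
    if not m:
        return None, None
    # MMDDHH key, matching the k[0:2], k[2:4], k[4:6] slicing in the output step
    time_key = m.group(2) + m.group(3) + m.group(4)
    return time_key, m.group(5)
```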
```python
# Word segmentation
def divide(st_text):
    di_list = jieba.lcut(st_text, cut_all=False)  # precise-mode segmentation
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------------")
        s.write("\n分词结果:\n")
        s.write('/'.join(di_list))
    return di_list
```
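divide() relies on jieba's precise mode (cut_all=False), which returns a single non-overlapping segmentation rather than every possible word. A quick sanity check:

```python
import jieba

print(jieba.lcut("南京市长江大桥", cut_all=False))  # e.g. ['南京市', '长江大桥']
```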
```python
# Stopword filtering
def delete_stopword(di_list):
    with open(cn_stopword_list, 'r', encoding='utf-8') as c:
        # One stopword per line; use a set for exact-match lookups
        stopwords = set(c.read().split())
    de_list = [word for word in di_list if word not in stopwords]
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\n过滤结果:\n")
        s.write('/'.join(de_list))
    return de_list
```
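The stopword table is loaded into a set of whole tokens on purpose. Testing membership against the raw file text would be substring matching, which also drops real words that merely occur inside some stopword. A hypothetical illustration:

```python
stopword_text = "的 了 不可以"                # pretend stopword file content
print("可以" in stopword_text)               # True: substring hit, wrongly filtered
print("可以" in set(stopword_text.split()))  # False: exact match, kept as intended
```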
```python
# Topic-word detection
def sub_word_detect(processed_dict):
    # Count word frequencies per time window and keep the ten most frequent
    fre_dict = {}
    for k in processed_dict.keys():
        temp = {}
        for de in processed_dict[k]:
            for word in de:
                temp[word] = temp.get(word, 0) + 1
        t1 = sorted(temp.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        del t1[10:]
        fre_dict[k] = t1
    # The paper selects topic words by a compound weight built from a growth
    # coefficient over consecutive time windows; this corpus lacks consecutive
    # windows, so raw frequency is used instead (see the sketch below).
    topic_detect_dict = fre_dict
    return topic_detect_dict
```
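The original listing carries a commented-out, partly garbled fragment that computes the paper's growth coefficient G = (f_i * k) / Σf_u over a review window of k previous time windows, and a compound weight w = ln(G) + a * ln(f_i / f_max) with tuning parameter a = 1.3. A minimal sketch of one plausible reading, assuming window keys are zero-padded digit strings whose consecutive windows differ by 1, and that frequencies come as a {word: count} dict per window (the listing above stores a trimmed list instead); compound_weights is an illustrative name, not part of the script:

```python
import math

def compound_weights(fre_dict, key, k=3, a=1.3):
    """Score words in window `key` by w = ln(G) + a * ln(f_i / f_max)."""
    cur = fre_dict[key]            # {word: frequency} for this window
    f_max = max(cur.values())
    u = int(key)
    weights = {}
    for word, fi in cur.items():
        # Total frequency of `word` over the k preceding windows
        past = sum(fre_dict.get(str(u - r).zfill(len(key)), {}).get(word, 0)
                   for r in range(1, k + 1))
        if past == 0:
            continue               # brand-new word: growth coefficient undefined
        G = (fi * k) / past        # growth coefficient
        weights[word] = math.log(G) + a * math.log(fi / f_max)
    return weights
```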
```python
# Topic-word clustering
def clustering(processed_dict, topic_detect_dict):
    cluster = {}  # final clusters, keyed by time window
    D = 8  # distance threshold
    # Incremental clustering
    for k in topic_detect_dict.keys():
        for t in topic_detect_dict[k]:  # t = (word, frequency in this window)
            if cluster.get(k):
                pw = t[1]
                ad = []  # distance from the word to each existing cluster
                for c in cluster[k]:  # c = one cluster in this window
                    p = []  # p(ci|w) for each word in the cluster
                    for word in c:
                        # p(ci|w): posts containing both the unclassified word
                        # and this cluster word, normalized by the word's frequency
                        pcw = 0
                        for de in processed_dict[k]:
                            if word in de and t[0] in de:
                                pcw += 1
                        p.append(round(pcw / pw, 5))
                    # Distance from the word to the cluster
                    if max(p) == 0:
                        d = 99999
                    else:
                        d = round(1 / max(p), 5)
                    ad.append(d)
                # Start a new cluster or join the nearest one
                mind = min(ad)
                index = ad.index(mind)
                if mind > D:
                    cluster[k].append([t[0]])
                else:
                    cluster[k][index].append(t[0])
            else:
                # First word of the window seeds the initial cluster
                cluster[k] = [[t[0]]]
    return cluster


if __name__ == '__main__':
    # Corpus, Chinese stopword table, and log file
    material = '5千条微博语料.txt'
    cn_stopword_list = 'baidu_stopwords.txt'
    st = '处理结果.txt'
    # Corpus preprocessing
    time_window_dict = set_time_window()
    # Segmentation and stopword filtering; key = publication time,
    # value = list of token lists, one per post
    processed_dict = {}
    for k in time_window_dict.keys():
        for t in time_window_dict[k]:
            di_list = divide(t)
            de_list = delete_stopword(di_list)
            processed_dict.setdefault(k, []).append(de_list)
    topic_detect_dict = sub_word_detect(processed_dict)
    topic_dict = clustering(processed_dict, topic_detect_dict)
    # Output
    for k in sorted(topic_dict.keys()):
        print("---------------------------------------------")
        print("时间:%s-%s %s" % (k[0:2], k[2:4], k[4:6]))
        for c in topic_dict[k]:
            print("话题:", c)
```
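To make the distance used in clustering() concrete: for a candidate word w and an existing cluster, p(ci|w) is the number of posts containing both w and a cluster word, divided by w's frequency, and the distance is 1 / max(p). A toy illustration on hypothetical data (the posts and words are made up):

```python
posts = [["央行", "利率", "下调"], ["央行", "降息"], ["天气", "晴朗"]]
w, freq_w = "利率", 1                       # "利率" appears in one post
cluster_words = ["央行", "降息"]            # an existing cluster
p = [sum(1 for post in posts if cw in post and w in post) / freq_w
     for cw in cluster_words]
d = 1 / max(p) if max(p) > 0 else 99999
print(d)  # 1.0: well under the threshold D = 8, so "利率" joins this cluster
```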
