#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import json
from gensim import corpora, models, similarities
import pickle
import random
import math
import jieba
import re
def cal_entropy(topic):
    # Shannon entropy of a list of (id, score) pairs; scores are normalized
    # into a probability distribution first and must be strictly positive.
    p = [score for _, score in topic]
    s = sum(p)
    p = [score / s for score in p]
    entropy = sum([-1 * score * math.log(score) for score in p])
    return entropy
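# Interpretation: low entropy means the probability mass sits on a few topics.
# This is used twice below: a low-entropy word is a discriminative topic word,
# and a low-entropy article clearly belongs to a small set of topics.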
wordslist = ["我在玉龙雪山我我我我", "我喜欢玉龙雪山", "我还要去玉龙雪山"]
# Tokenize with jieba
textTest = [[word for word in jieba.cut(words)] for words in wordslist]
# Build a dictionary from the tokenized texts
dictionary = corpora.Dictionary(textTest, prune_at=2000000)
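# Note: the three sentences above are only a small jieba/Dictionary demo; the
# dictionary actually used for training is rebuilt from the full corpus below.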
- print("读取文件")
- with open("news.csv", encoding='utf-8') as f :
- lines= f.readlines()
- print("读取结束")
- lines=[line.strip() for line in lines]
- lines = lines[1:]
train = []
article_info = []
print("Parsing data")
i = 0
biaodian = re.compile(r'[:,。?!]')  # punctuation to strip
html = re.compile(r'</?.+?/?>')  # naive HTML-tag pattern; raw string avoids invalid escapes
for line in lines:
    i = i + 1
    # Naive split on '",' stands in for real CSV parsing (the csv module would be safer)
    data = line.split('",')
    if len(data[2]) < 5 or len(data[4]) < 40:
        continue
    # Title words, abstract words, body words
    # In a real scenario: weight the fields and remove stopwords
    abstract = data[4].replace("<p>", "")
    abstract = abstract.replace("</p>", "")
    abstract = abstract.replace("</li>", "")
    abstract = abstract.replace("的", "")  # crude stopword removal of the particle 的
    abstract = abstract.replace('"', "")
    abstract = biaodian.sub('', abstract)
    abstract = html.sub('', abstract)
    train.append([word for word in jieba.cut(abstract)])
    article_info.append({"article_id": i, "__url": data[1]})
print(article_info[:10])  # sanity-check the first few parsed records
lines = []  # release the raw lines
print("Done parsing")
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print("Start training LDA")
num_topics = 100
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
print("LDA training finished")
# Compute each word's distribution over topics
all_word_num = len(dictionary.keys())
word_topic_distribute = {}
for topic_num in range(0, num_topics):
    topic_list = lda.show_topic(topic_num, topn=all_word_num)
    for word, score in topic_list:
        if word not in word_topic_distribute:
            word_topic_distribute[word] = []
        word_topic_distribute[word].append([topic_num, score])
word_entropy = [[word, cal_entropy(topic_distribute)]
                for word, topic_distribute in word_topic_distribute.items()]
word_entropy = sorted(word_entropy, key=lambda s: s[1])
word_entropy = ["{}\t{}".format(s[0], s[1]) for s in word_entropy]
with open("word_entropy", "w", encoding="utf-8") as f:
    f.writelines("\n".join(word_entropy))
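# Words at the top of word_entropy (lowest entropy) concentrate their mass in
# few topics, so they are the most topic-specific terms.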
# Compute each article's distribution over topics
article_result = []
for feature, info in zip(corpus, article_info):
    topic = lda.get_document_topics(feature)
    topic = [(s[0], float(s[1])) for s in topic]
    info['topic_lda'] = topic
    info['entropy'] = cal_entropy(topic)
    article_result.append(info)
article_result = sorted(article_result, key=lambda s: s['entropy'])
article_result = [json.dumps(s, ensure_ascii=False) for s in article_result]
print("Saving data")
with open("article_topics", "w", encoding="utf-8") as f:
    f.writelines("\n".join(article_result))
with open("lda.model", "wb") as f:
    pickle.dump(lda, f)
with open("dictionary", "wb") as f:
    pickle.dump(dictionary, f)
print("Done saving")