#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import json
import math
import pickle

from gensim import corpora, models


def cal_entropy(topic):
    """Shannon entropy of a list of (id, score) pairs.

    Scores are normalized into a probability distribution first; low entropy
    means the mass is concentrated on a few topics.
    """
    p = [score for _, score in topic]
    s = sum(p)
    p = [score / s for score in p]
    return sum(-score * math.log(score) for score in p)
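
# Sanity check (illustration only): entropy is maximal for a uniform
# distribution and zero when all the mass sits on a single topic.
#   cal_entropy([(0, 0.5), (1, 0.5)])  # ln(2) ~= 0.693
#   cal_entropy([(0, 1.0)])            # 0.0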

print("reading file")
with open("news.csv", encoding="utf-8") as f:
    lines = f.readlines()
print("finished reading")
lines = [line.strip() for line in lines]
lines = lines[1:]  # drop the CSV header row
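
# The split below assumes each row looks roughly like (hypothetical example):
#   "1","http://example.com/a","title","segmented words","body text"
# i.e. every field is double-quoted, so splitting on '",' separates fields.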
train = []
article_info = []
print("parsing data")
i = 0
for line in lines:
    i += 1
    data = line.split('",')
    if len(data[2]) < 5 or len(data[4]) < 40:
        continue  # skip rows with suspiciously short fields
    # title words, abstract words, body words
    # in a real scenario: weight the fields and remove stop words
    # tokenize on whitespace: gensim expects one token list per document
    train.append(data[3].split())
    article_info.append({"article_id": i, "__url": data[1]})
lines = []  # release the raw text
print("parsing finished")
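
# corpora.Dictionary assigns an integer id to every token; doc2bow then turns
# a token list into a sparse [(token_id, count), ...] bag-of-words vector.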
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print("starting LDA training")
num_topics = 100
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
print("LDA training finished")
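
# Note: gensim's LdaModel defaults to a single pass over the corpus; for a
# small corpus a few more passes usually give a more stable model, e.g.:
#   lda = models.LdaModel(corpus=corpus, id2word=dictionary,
#                         num_topics=num_topics, passes=10)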

# compute each word's distribution over topics
all_word_num = len(dictionary.keys())
word_topic_distribute = {}
for topic_num in range(num_topics):
    topic_list = lda.show_topic(topic_num, topn=all_word_num)
    for word, score in topic_list:
        if word not in word_topic_distribute:
            word_topic_distribute[word] = []
        word_topic_distribute[word].append([topic_num, score])
word_entropy = [[word, cal_entropy(dist)] for word, dist in word_topic_distribute.items()]
word_entropy = sorted(word_entropy, key=lambda s: s[1])
word_entropy = ["{}\t{}".format(s[0], s[1]) for s in word_entropy]
with open("word_entropy", "w", encoding="utf-8") as f:
    f.writelines("\n".join(word_entropy))
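
# Each line of word_entropy is "word<TAB>entropy": the lower the entropy, the
# more a word concentrates in a few topics, i.e. the more topic-specific it is.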

# compute each article's distribution over topics
article_result = []
for feature, info in zip(corpus, article_info):  # don't shadow the article_info list
    topic = lda.get_document_topics(feature)
    topic = [(s[0], float(s[1])) for s in topic]  # numpy float -> plain float for JSON
    info["topic_lda"] = topic
    info["entropy"] = cal_entropy(topic)
    article_result.append(info)
article_result = sorted(article_result, key=lambda s: s["entropy"])
article_result = [json.dumps(s, ensure_ascii=False) for s in article_result]
with open("article_topics", "w", encoding="utf-8") as f:
    f.writelines("\n".join(article_result))
with open("lda.model", "wb") as f:
    pickle.dump(lda, f)
with open("dictionary", "wb") as f:
    pickle.dump(dictionary, f)
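
# Reloading the saved artifacts later (a minimal sketch):
#   with open("lda.model", "rb") as f:
#       lda = pickle.load(f)
#   with open("dictionary", "rb") as f:
#       dictionary = pickle.load(f)
#   bow = dictionary.doc2bow(["some", "segmented", "tokens"])
#   print(lda.get_document_topics(bow))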