|
@@ -6,12 +6,23 @@ from gensim import corpora, models, similarities
|
6
|
6
|
import pickle
|
7
|
7
|
import random
|
8
|
8
|
import math
|
|
9
|
+import jieba
|
|
10
|
+import re
|
|
11
|
+
|
|
12
|
+
|
9
|
13
|
def cal_entropy(topic):
    """Return the Shannon entropy (in nats) of a topic distribution.

    Args:
        topic: iterable of (id, score) pairs; scores are non-negative
            weights and are normalized to probabilities here.

    Returns:
        float: the entropy; 0.0 for an empty or all-zero distribution
            (the original crashed with ZeroDivisionError / math.log(0)
            ValueError in those cases).
    """
    scores = [score for _, score in topic]
    total = sum(scores)
    if total <= 0:
        # Empty topic list or all-zero scores carry no information.
        return 0.0
    # Normalize to probabilities, dropping zero terms: lim p->0 of
    # -p*log(p) is 0, and math.log(0) would raise ValueError.
    probs = [s / total for s in scores if s > 0]
    return sum(-p * math.log(p) for p in probs)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
# Standalone sanity check: tokenize three sample sentences with jieba
# and build a gensim Dictionary from them.
# NOTE(review): `textTest` is never used again and `dictionary` is
# rebuilt from the real corpus further down — this looks like leftover
# experimentation; confirm before removing.
wordslist = ["我在玉龙雪山我我我我","我喜欢玉龙雪山","我还要去玉龙雪山"]
# Tokenize each sentence (original comment: 切词).
textTest = [[word for word in jieba.cut(words)] for words in wordslist]
# Build the dictionary (original comment: 生成字典).
dictionary = corpora.Dictionary(textTest,prune_at=2000000)
|
15
|
26
|
|
16
|
27
|
print("读取文件")
|
17
|
28
|
with open("news.csv", encoding='utf-8') as f :
|
|
@@ -23,6 +34,9 @@ train=[]
|
23
|
34
|
# Per-article metadata collected alongside the training corpus.
article_info = []
print("解析数据")
# Running line counter, used as the article id while parsing.
i = 0
# Full-width punctuation to strip from abstracts (colon, comma,
# period, question mark, exclamation mark).
biaodian = re.compile(r'[:,。?!]')
# Crude HTML tag stripper. Raw string fixes the invalid '\/' escape
# sequences of the original non-raw pattern (SyntaxWarning in current
# Python, slated to become an error); matching behavior is unchanged.
html = re.compile(r'</?.+?/?>')
|
|
39
|
+
|
26
|
40
|
for line in lines:
|
27
|
41
|
i = i+1
|
28
|
42
|
data = line.split('",')
|
|
@@ -30,12 +44,20 @@ for line in lines:
|
30
|
44
|
continue
|
31
|
45
|
#标题词,摘要词,正文词
|
32
|
46
|
#真实场景中,加权,去停用词
|
33
|
|
- train.append(data[3])
|
|
47
|
+
|
|
48
|
+ abstract = data[4].replace("<p>", "")
|
|
49
|
+ abstract = abstract.replace("</p>", "")
|
|
50
|
+ abstract = abstract.replace("</li>", "")
|
|
51
|
+ abstract = abstract.replace("的", "")
|
|
52
|
+ abstract = abstract.replace('"', "")
|
|
53
|
+ abstract = biaodian.sub('', abstract, )
|
|
54
|
+ abstract = html.sub('', abstract, )
|
|
55
|
+ train.append([word for word in jieba.cut(abstract)])
|
34
|
56
|
article_info.append({"article_id":i,"__url":data[1]})
|
35
|
57
|
# Release the raw file lines; parsing is finished.
lines=[]
print("解析结束")
# Build the gensim dictionary and bag-of-words corpus from the
# tokenized abstracts in `train` (this replaces the demo dictionary
# built near the top of the script).
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print("开始训练lda")
# Train an LDA model with a fixed topic count over the whole corpus.
num_topics=100
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
|
|
@@ -45,33 +67,35 @@ print("lda训练结束")
|
45
|
67
|
# Invert the topic->word tables: for every vocabulary word, collect the
# [topic_id, score] pairs it receives across all topics.
all_word_num = len(dictionary.keys())
word_topic_distribute = {}
for topic_id in range(num_topics):
    for word, score in lda.show_topic(topic_id, topn=all_word_num):
        word_topic_distribute.setdefault(word, []).append([topic_id, score])
|
53
|
75
|
# Rank every word by the entropy of its topic distribution (low entropy
# = topic-specific word) and dump the ranking to a TSV-style file.
word_entropy = [
    [word, cal_entropy(dist)]
    for word, dist in word_topic_distribute.items()
]
word_entropy.sort(key=lambda pair: pair[1])
word_entropy = ["{}\t{}".format(word, entropy) for word, entropy in word_entropy]
with open("word_entropy", "w", encoding="utf-8") as f:
    f.write("\n".join(word_entropy))
|
58
|
80
|
# Compute each article's topic distribution and its entropy, then rank
# articles from most topic-focused (low entropy) to most diffuse.
article_result = []
# Loop variable renamed to `info`: the original used `article_info`,
# shadowing the module-level list it was iterating over.
for feature, info in zip(corpus, article_info):
    topic = lda.get_document_topics(feature)
    # Cast numpy float scores to plain floats so json.dumps accepts them.
    topic = [(t[0], float(t[1])) for t in topic]
    info['topic_lda'] = topic
    info['entropy'] = cal_entropy(topic)
    article_result.append(info)

article_result = sorted(article_result, key=lambda s: s['entropy'])
article_result = [json.dumps(s, ensure_ascii=False) for s in article_result]
|
|
91
|
print("保存数据开始")
# One JSON object per line. Explicit UTF-8: the JSON keeps non-ASCII
# characters (ensure_ascii=False), and the platform default encoding
# (e.g. cp1252 on Windows) would raise UnicodeEncodeError; the rest of
# the script already reads/writes with encoding="utf-8".
with open("article_topics", "w", encoding="utf-8") as f:
    f.writelines("\n".join(article_result))
# Pickle the trained model and dictionary for later reuse.
with open("lda.model", "wb") as f:
    pickle.dump(lda, f)
with open("dictionary", "wb") as f:
    pickle.dump(dictionary, f)
print("保存数据结束")
|
75
|
99
|
|
76
|
100
|
|
77
|
101
|
|