yufeng0528 · 4 years ago
commit affe6b093a
2 changed files with 49 additions and 24 deletions
  1. lda/save_annoy.py (+2 -1)
  2. lda/train_lda.py (+47 -23)

lda/save_annoy.py (+2 -1)

@@ -17,9 +17,10 @@ for line in lines:
     for index,score in data["topic_lda"]:
         feature[index]=score
         t.add_item(aid, feature)
+
 t.build(10)
 t.save('article.ann')
 with open("index_url","wb+") as f:
-    pickle.dump(url, f)
+    pickle.dump(index_url, f)

lda/train_lda.py (+47 -23)

@@ -6,12 +6,23 @@ from gensim import corpora, models, similarities
 import pickle
 import random
 import math
+import jieba
+import re
+
+
 def cal_entropy(topic):
-	p=[score for _,score in topic]
-	s=sum(p)
-	p=[score/s for score in p]
-	entropy=sum([-1*score*math.log(score) for score in p])
-	return entropy
+    p=[score for _,score in topic]
+    s=sum(p)
+    p=[score/s for score in p]
+    entropy=sum([-1*score*math.log(score) for score in p])
+    return entropy
+
+
+wordslist = ["我在玉龙雪山我我我我","我喜欢玉龙雪山","我还要去玉龙雪山"]
+# word segmentation
+textTest = [[word for word in jieba.cut(words)] for words in wordslist]
+# build the dictionary
+dictionary = corpora.Dictionary(textTest,prune_at=2000000)
 
 print("reading file")
 with open("news.csv", encoding='utf-8') as f :
@@ -23,6 +34,9 @@ train=[]
 article_info=[]
 print("parsing data")
 i = 0
+biaodian = re.compile(r'[:,。?!]')
+html = re.compile(r'</?.+?/?>')
+
 for line in lines:
     i = i+1
     data = line.split('",')
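
`biaodian` ("punctuation") strips full-width Chinese punctuation, and `html` strips any remaining tag-like spans. Roughly:

    biaodian.sub('', '你好,世界。')      # -> '你好世界'
    html.sub('', '<li>深度学习</li>')    # -> '深度学习'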
@@ -30,12 +44,20 @@ for line in lines:
         continue
     #title words, abstract words, body words
     #in a real scenario: weight the fields and remove stop words
-    train.append(data[3])
+
+    abstract = data[4].replace("<p>", "")
+    abstract = abstract.replace("</p>", "")
+    abstract = abstract.replace("</li>", "")
+    abstract = abstract.replace("的", "")
+    abstract = abstract.replace('"', "")
+    abstract = biaodian.sub('', abstract)
+    abstract = html.sub('', abstract)
+    train.append([word for word in jieba.cut(abstract)])
     article_info.append({"article_id":i,"__url":data[1]})
 lines=[]
 print("parsing finished")
-dictionary = corpora.Dictionary([train])
-corpus = [dictionary.doc2bow(text) for text in [train]]
+dictionary = corpora.Dictionary(train)
+corpus = [dictionary.doc2bow(text) for text in train]
 print("starting LDA training")
 num_topics=100
 lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
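
This is the substantive fix in the commit. `train` now holds one jieba token list per article, so `corpora.Dictionary(train)` indexes each article as its own document; the old `corpora.Dictionary([train])` wrapped the corpus in an extra list, collapsing everything into a single document whose "words" were entire raw strings. A small sketch of the resulting bag-of-words representation (integer ids are assigned by the dictionary):

    docs = [["我", "喜欢", "雪山"], ["我", "要", "去", "雪山"]]
    dictionary = corpora.Dictionary(docs)
    dictionary.doc2bow(["我", "喜欢", "雪山", "雪山"])
    # -> list of (token_id, count) pairs, e.g. [(0, 1), (1, 1), (2, 2)]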
@@ -45,33 +67,35 @@ print("LDA training finished")
 all_word_num=len(dictionary.keys())
 word_topic_distribute={}
 for topic_num in range(0,num_topics):
-	topic_list=lda.show_topic(topic_num,topn=all_word_num)
-	for [word,score] in topic_list:
-		if word not in word_topic_distribute:
-			word_topic_distribute[word]=[]
-		word_topic_distribute[word].append([topic_num,score])
+    topic_list = lda.show_topic(topic_num,topn=all_word_num)
+    for [word,score] in topic_list:
+        if word not in word_topic_distribute:
+            word_topic_distribute[word]=[]
+        word_topic_distribute[word].append([topic_num,score])
 word_entropy=[ [word,cal_entropy(topic_distribute)] for word,topic_distribute in word_topic_distribute.items()]
 word_entropy=sorted(word_entropy,key=lambda s:s[1])
 word_entropy=["{}\t{}".format(s[0],s[1]) for s in word_entropy]
 with open("word_entropy","w", encoding="utf-8") as f :
-	f.writelines("\n".join(word_entropy))
+    f.writelines("\n".join(word_entropy))
 #compute each article's topic distribution
 article_result=[]
 for feature,article_info in  zip(corpus,article_info):
-	topic=lda.get_document_topics(feature)
-	topic=[(s[0],float(s[1])) for s in topic]
-	article_info['topic_lda']=topic
-	article_info['entropy']=cal_entropy(topic)
-	article_result.append(article_info)
+    topic=lda.get_document_topics(feature)
+    topic=[(s[0],float(s[1])) for s in topic]
+    article_info['topic_lda']=topic
+    article_info['entropy']=cal_entropy(topic)
+    article_result.append(article_info)
+
 article_result=sorted(article_result,key=lambda s:s['entropy'])
 article_result=[json.dumps(s,ensure_ascii=False) for s in article_result]
+print("saving data: start")
 with open("article_topics","w") as f:
-	f.writelines("\n".join(article_result))
+    f.writelines("\n".join(article_result))
 with open("lda.model","wb") as f:
-	pickle.dump(lda,f)
+    pickle.dump(lda,f)
 with open("dictionary","wb") as f:
-	pickle.dump(dictionary,f)
-
+    pickle.dump(dictionary,f)
+print("saving data: done")
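
Finally, a sketch of how the saved artifacts might be reused to score a new article. The file names come from the script; the query text is illustrative. (gensim models also provide `save()`/`load()`, but plain pickle matches what the script writes.)

    import pickle
    import jieba

    with open("lda.model", "rb") as f:
        lda = pickle.load(f)
    with open("dictionary", "rb") as f:
        dictionary = pickle.load(f)

    tokens = list(jieba.cut("我想去玉龙雪山"))
    bow = dictionary.doc2bow(tokens)
    print(lda.get_document_topics(bow))   # [(topic_id, probability), ...]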