Browse Source

Merge branch 'master' of http://git.yaozhitech.com/yufeng/machine_learn

yufeng 4 years ago
parent
commit
e177e60e84
2 changed files with 48 additions and 23 deletions
  1. 1 0
      lda/save_annoy.py
  2. 47 23
      lda/train_lda.py

+ 1 - 0
lda/save_annoy.py

@@ -17,6 +17,7 @@ for line in lines:
17 17
     for index,score in data["topic_lda"]:
18 18
         feature[index]=score
19 19
         t.add_item(aid, feature)
20
+
20 21
 t.build(10) 
21 22
 t.save('article.ann')
22 23
 with open("index_url","wb+") as f:

+ 47 - 23
lda/train_lda.py

@@ -6,12 +6,23 @@ from gensim import corpora, models, similarities
6 6
 import pickle
7 7
 import random
8 8
 import math
9
+import jieba
10
+import re
11
+
12
+
9 13
 def cal_entropy(topic):
10
-	p=[score for _,score in topic]
11
-	s=sum(p)
12
-	p=[score/s for score in p]
13
-	entropy=sum([-1*score*math.log(score) for score in p])
14
-	return entropy
14
+    p=[score for _,score in topic]
15
+    s=sum(p)
16
+    p=[score/s for score in p]
17
+    entropy=sum([-1*score*math.log(score) for score in p])
18
+    return entropy
19
+
20
+
21
+wordslist = ["我在玉龙雪山我我我我","我喜欢玉龙雪山","我还要去玉龙雪山"]
22
+# Tokenize each sample sentence into words with jieba
23
+textTest = [[word for word in jieba.cut(words)] for words in wordslist]
24
+# Build the dictionary from the tokenized sample sentences
25
+dictionary = corpora.Dictionary(textTest,prune_at=2000000)
15 26
 
16 27
 print("读取文件")
17 28
 with open("news.csv", encoding='utf-8') as f :
@@ -23,6 +34,9 @@ train=[]
23 34
 article_info=[]
24 35
 print("解析数据")
25 36
 i = 0
37
+biaodian = re.compile(r'[:,。?!]')
38
+html = re.compile('<\/?.+?\/?>')
39
+
26 40
 for line in lines:
27 41
     i = i+1
28 42
     data = line.split('",')
@@ -30,13 +44,21 @@ for line in lines:
30 44
         continue
31 45
     # Title words, abstract words, body words
32 46
     # In a real-world setting: apply weighting and remove stop words
33
-    train.append(data[3])
47
+
48
+    abstract = data[4].replace("<p>", "")
49
+    abstract = abstract.replace("</p>", "")
50
+    abstract = abstract.replace("</li>", "")
51
+    abstract = abstract.replace("的", "")
52
+    abstract = abstract.replace('"', "")
53
+    abstract = biaodian.sub('', abstract, )
54
+    abstract = html.sub('', abstract, )
55
+    train.append([word for word in jieba.cut(abstract)])
34 56
     article_info.append({"article_id":i,"__url":data[1]})
35 57
 print(article_info[:10])
36 58
 lines=[]
37 59
 print("解析结束")
38
-dictionary = corpora.Dictionary([train])
39
-corpus = [dictionary.doc2bow(text) for text in [train]]
60
+dictionary = corpora.Dictionary(train)
61
+corpus = [dictionary.doc2bow(text) for text in train]
40 62
 print("开始训练lda")
41 63
 num_topics=100
42 64
 lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
@@ -46,33 +68,35 @@ print("lda训练结束")
46 68
 all_word_num=len(dictionary.keys())
47 69
 word_topic_distribute={}
48 70
 for topic_num in range(0,num_topics):
49
-	topic_list=lda.show_topic(topic_num,topn=all_word_num)
50
-	for [word,score] in topic_list:
51
-		if word not in word_topic_distribute:
52
-			word_topic_distribute[word]=[]
53
-		word_topic_distribute[word].append([topic_num,score])
71
+    topic_list = lda.show_topic(topic_num,topn=all_word_num)
72
+    for [word,score] in topic_list:
73
+        if word not in word_topic_distribute:
74
+             word_topic_distribute[word]=[]
75
+        word_topic_distribute[word].append([topic_num,score])
54 76
 word_entropy=[ [word,cal_entropy(topic_distribute)] for word,topic_distribute in word_topic_distribute.items()]
55 77
 word_entropy=sorted(word_entropy,key=lambda s:s[1])
56 78
 word_entropy=["{}\t{}".format(s[0],s[1]) for s in word_entropy]
57 79
 with open("word_entropy","w", encoding="utf-8") as f :
58
-	f.writelines("\n".join(word_entropy))
80
+    f.writelines("\n".join(word_entropy))
59 81
 # Compute the topic distribution of each article
60 82
 article_result=[]
61 83
 for feature,article_info in  zip(corpus,article_info):
62
-	topic=lda.get_document_topics(feature)
63
-	topic=[(s[0],float(s[1])) for s in topic]
64
-	article_info['topic_lda']=topic
65
-	article_info['entropy']=cal_entropy(topic)
66
-	article_result.append(article_info)
84
+    topic=lda.get_document_topics(feature)
85
+    topic=[(s[0],float(s[1])) for s in topic]
86
+    article_info['topic_lda']=topic
87
+    article_info['entropy']=cal_entropy(topic)
88
+    article_result.append(article_info)
89
+
67 90
 article_result=sorted(article_result,key=lambda s:s['entropy'])
68 91
 article_result=[json.dumps(s,ensure_ascii=False) for s in article_result]
92
+print("保存数据开始")
69 93
 with open("article_topics","w") as f:
70
-	f.writelines("\n".join(article_result))
94
+    f.writelines("\n".join(article_result))
71 95
 with open("lda.model","wb") as f:
72
-	pickle.dump(lda,f)
96
+    pickle.dump(lda,f)
73 97
 with open("dictionary","wb") as f:
74
-	pickle.dump(dictionary,f)
75
-
98
+    pickle.dump(dictionary,f)
99
+print("保存数据结束")
76 100
 
77 101
 
78 102