yufeng0528 4 年之前
父節點
當前提交
8a531eab23
共有 2 個文件被更改,包括 110 次插入0 次删除
  1. 32 0
      lda/find_similar.py
  2. 78 0
      lda/train_lda.py

+ 32 - 0
lda/find_similar.py

@@ -0,0 +1,32 @@
1
+#!/usr/bin/python
2
+# -*- coding: UTF-8 -*-
3
+import sys
4
+from annoy import AnnoyIndex
5
+import random
6
+import pickle
7
+import json
8
# Load the Annoy index built from LDA topic vectors and, for each article in
# "article_topics" (one JSON object per line), write its nearest neighbours
# (angular distance over topic space) to "relation_article".

# Vector dimensionality of the index; must equal the num_topics used when the
# index was built (100 in train_lda.py).
length = 100

u = AnnoyIndex(length, metric="angular")
u.load('article.ann')

# index -> url mapping pickled at index-build time.
# BUG FIX: pickle files must be opened in binary mode under Python 3;
# text mode makes pickle.load raise TypeError.
with open("index_url", "rb") as f:
    index_url = pickle.load(f)

# Explicit utf-8: the file carries Chinese text and was written with
# ensure_ascii=False, so the locale default encoding is not safe.
with open("article_topics", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

results = []
for line in lines:
    data = json.loads(line)
    url = data["__url"]
    aid = data["article_id"]
    # Ask for 6 neighbours because the closest item is the article itself.
    index, distance = u.get_nns_by_item(aid, 6, include_distances=True)
    urls = ["{}\t{}".format(index_url[i], d) for i, d in zip(index, distance)]
    results.append("原始url\t{}".format(url))
    results.append("相关推荐")
    results.append("\n".join(urls) + "\n")

with open("relation_article", "w", encoding="utf-8") as f:
    f.writelines("\n".join(results))

+ 78 - 0
lda/train_lda.py

@@ -0,0 +1,78 @@
1
+#!/usr/bin/python
2
+# -*- coding: UTF-8 -*-
3
+import sys
4
+import json
5
+from gensim import corpora, models, similarities
6
+import pickle
7
+import random
8
+import math
9
def cal_entropy(topic):
    """Return the Shannon entropy (natural log) of a topic distribution.

    topic: iterable of (topic_id, score) pairs. Scores need not be
    normalised; they are rescaled to sum to 1 here. A low entropy means
    the distribution is concentrated on a few topics.
    """
    scores = [score for _, score in topic]
    total = sum(scores)
    if total <= 0:
        # Empty or all-zero distribution carries no information.
        return 0.0
    probs = [score / total for score in scores]
    # lim p->0 of -p*log(p) is 0, so zero-probability entries are skipped
    # (the original would raise ValueError on math.log(0)).
    return sum(-p * math.log(p) for p in probs if p > 0)
15
+
16
# Train an LDA topic model over a CSV of news articles, then dump
# per-word and per-article topic entropies plus the model artefacts.

print("读取文件")
with open("news.csv", encoding='utf-8') as f:
    lines = f.readlines()
print("读取结束")
lines = [line.strip() for line in lines]
lines = lines[1:]  # drop the CSV header row

train = []         # one token list per retained article
article_info = []  # parallel metadata: article_id + url
print("解析数据")
for i, line in enumerate(lines, start=1):
    # Crude split on the quoted-field delimiter. NOTE(review): the stdlib
    # csv module would be safer — confirm the exact file format.
    data = line.split('",')
    # Guard against short rows (the original indexed data[4] unconditionally
    # and could raise IndexError), then filter out thin articles.
    if len(data) < 5 or len(data[2]) < 5 or len(data[4]) < 40:
        continue
    # data[3]: title/abstract/body words. In a real system these would be
    # weighted and stop-words removed.
    # NOTE(review): assumes tokens are whitespace-separated — confirm.
    # BUG FIX: each article must be its own token list; the original kept
    # whole-article strings and later wrapped `train` in an extra list,
    # collapsing the entire corpus into a single bogus document.
    train.append(data[3].split())
    article_info.append({"article_id": i, "__url": data[1]})
lines = []  # release the raw text
print("解析结束")

# One bag-of-words per article (not [train], which was the bug).
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print("开始训练lda")
num_topics = 100
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
print("lda训练结束")

# Topic distribution of every word: low-entropy words are concentrated in
# few topics and therefore the most topic-discriminative.
all_word_num = len(dictionary.keys())
word_topic_distribute = {}
for topic_num in range(0, num_topics):
    topic_list = lda.show_topic(topic_num, topn=all_word_num)
    for word, score in topic_list:
        word_topic_distribute.setdefault(word, []).append([topic_num, score])
word_entropy = [[word, cal_entropy(dist)] for word, dist in word_topic_distribute.items()]
word_entropy = sorted(word_entropy, key=lambda s: s[1])
word_entropy = ["{}\t{}".format(s[0], s[1]) for s in word_entropy]
with open("word_entropy", "w", encoding="utf-8") as f:
    f.writelines("\n".join(word_entropy))

# Topic distribution of every article, sorted by entropy (most focused first).
article_result = []
for feature, info in zip(corpus, article_info):  # renamed: the original shadowed article_info
    topic = lda.get_document_topics(feature)
    topic = [(s[0], float(s[1])) for s in topic]
    info['topic_lda'] = topic
    info['entropy'] = cal_entropy(topic)
    article_result.append(info)
article_result = sorted(article_result, key=lambda s: s['entropy'])
article_result = [json.dumps(s, ensure_ascii=False) for s in article_result]
# Explicit utf-8: ensure_ascii=False keeps the Chinese characters verbatim,
# so the locale default encoding is not safe (find_similar.py reads this file).
with open("article_topics", "w", encoding="utf-8") as f:
    f.writelines("\n".join(article_result))
with open("lda.model", "wb") as f:
    pickle.dump(lda, f)
with open("dictionary", "wb") as f:
    pickle.dump(dictionary, f)
74
+
75
+
76
+
77
+
78
+