|
@@ -0,0 +1,25 @@
|
|
1
|
+from annoy import AnnoyIndex
|
|
2
|
+import random
|
|
3
|
+import pickle
|
|
4
|
+import json
|
|
5
|
# Build an Annoy index over per-article LDA topic vectors and save an
# article_id -> URL lookup table next to it.

# Dimensionality of the dense topic vector handed to Annoy for every article.
length = 100
t = AnnoyIndex(length, metric="angular")
index_url = {}

# "article_topics" is parsed as one JSON document per line.
with open("article_topics") as f:
    lines = [line.strip() for line in f]

for line in lines:
    if not line:
        # A blank line (e.g. a trailing newline at the end of the file) is
        # not valid JSON and would make json.loads raise; skip it.
        continue
    data = json.loads(line)
    url = data["__url"]
    aid = data["article_id"]
    index_url[aid] = url

    # Expand the sparse (topic index, score) pairs into a dense vector;
    # topics that are not listed keep a score of 0.
    feature = [0] * length
    for index, score in data["topic_lda"]:
        feature[index] = score

    # NOTE(review): Annoy expects item ids to be non-negative integers --
    # presumably article_id already is one; confirm against the data source.
    t.add_item(aid, feature)

t.build(10)
t.save('article.ann')

# pickle produces bytes, so the output file has to be opened in binary mode;
# text mode ("w") raises TypeError on Python 3.
with open("index_url", "wb") as f:
    pickle.dump(index_url, f)
|
|
24
|
+
|
|
25
|
+
|