find_similar.py 785 B

123456789101112131415161718192021222324252627282930313233
  1. #!/usr/bin/python
  2. # -*- coding: UTF-8 -*-
  3. import sys
  4. from annoy import AnnoyIndex
  5. import random
  6. import pickle
  7. import json
  8. length = 100
  9. u = AnnoyIndex(length,metric="angular")
  10. u.load('article.ann')
  11. with open("index_url", "rb+") as f:
  12. index_url=pickle.load(f)
  13. with open("article_topics") as f:
  14. lines=f.readlines()
  15. lines=[line.strip() for line in lines]
  16. results=[]
  17. for line in lines:
  18. data=json.loads(line)
  19. url=data["__url"]
  20. aid=data["article_id"]
  21. index,distance=u.get_nns_by_item(aid,6,include_distances=True)
  22. urls=[ "{}\t{}".format(index_url[i],d) for [i,d] in zip(index,distance)]
  23. results.append("原始url\t{}".format(url))
  24. results.append("相关推荐")
  25. results.append("\n".join(urls)+"\n")
  26. with open("relation_article","w") as f :
  27. f.writelines("\n".join(results))