train_lda.py 3.2 KB

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import json
from gensim import corpora, models, similarities
import pickle
import random
import math
import jieba
import re

def cal_entropy(topic):
    # topic is a list of (topic_id, score) pairs; normalise the scores and
    # return the Shannon entropy of the resulting distribution
    p = [score for _, score in topic]
    s = sum(p)
    p = [score / s for score in p]
    entropy = sum([-1 * score * math.log(score) for score in p])
    return entropy
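# Quick sanity check: a uniform two-topic distribution gives the maximum
# two-topic entropy, cal_entropy([(0, 0.5), (1, 0.5)]) == math.log(2) ≈ 0.693,
# while a fully concentrated one, cal_entropy([(0, 1.0)]), gives 0.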
# small jieba/gensim demo; this dictionary is rebuilt from the real corpus below
wordslist = ["我在玉龙雪山我我我我", "我喜欢玉龙雪山", "我还要去玉龙雪山"]
# segment each sentence with jieba
textTest = [[word for word in jieba.cut(words)] for words in wordslist]
# build a gensim dictionary from the segmented sentences
dictionary = corpora.Dictionary(textTest, prune_at=2000000)
print("reading file")
with open("news.csv", encoding='utf-8') as f:
    lines = f.readlines()
print("finished reading")
lines = [line.strip() for line in lines]
lines = lines[1:]  # drop the CSV header row
train = []
article_info = []
print("parsing data")
i = 0
biaodian = re.compile(r'[:,。?!]')  # full-width punctuation to strip
html = re.compile(r'</?.+?/?>')      # simple HTML tag pattern
for line in lines:
    i = i + 1
    data = line.split('",')
    # skip malformed or too-short rows
    if len(data) < 5 or len(data[2]) < 5 or len(data[4]) < 40:
        continue
    # title words, abstract words, body words
    # in a real scenario: weight the fields and remove stop words
    abstract = data[4].replace("<p>", "")
    abstract = abstract.replace("</p>", "")
    abstract = abstract.replace("</li>", "")
    abstract = abstract.replace("的", "")  # crude stop-word removal of the particle 的
    abstract = abstract.replace('"', "")
    abstract = biaodian.sub('', abstract)
    abstract = html.sub('', abstract)
    train.append([word for word in jieba.cut(abstract)])
    article_info.append({"article_id": i, "__url": data[1]})
print(article_info[:10])
lines = []  # free the raw lines
print("finished parsing")
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
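# doc2bow turns a token list into a sparse bag-of-words: a list of
# (token_id, count) pairs, e.g. a document repeating one word twice maps to
# [(id_of_that_word, 2)]; only tokens known to the dictionary are kept.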
print("start training LDA")
num_topics = 100
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
print("LDA training finished")
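# Note: optional LdaModel arguments such as passes (number of training passes)
# or random_state (for reproducible topics) may be worth setting; they are
# left at gensim's defaults here.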
# compute each word's distribution over topics
all_word_num = len(dictionary.keys())
word_topic_distribute = {}
for topic_num in range(0, num_topics):
    topic_list = lda.show_topic(topic_num, topn=all_word_num)
    for word, score in topic_list:
        if word not in word_topic_distribute:
            word_topic_distribute[word] = []
        word_topic_distribute[word].append([topic_num, score])
word_entropy = [[word, cal_entropy(topic_distribute)] for word, topic_distribute in word_topic_distribute.items()]
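# Interpretation: low entropy means a word's probability mass is concentrated
# in a few topics (topic-discriminative); high-entropy words are spread evenly
# across topics and carry little topic signal.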
word_entropy = sorted(word_entropy, key=lambda s: s[1])
word_entropy = ["{}\t{}".format(s[0], s[1]) for s in word_entropy]
with open("word_entropy", "w", encoding="utf-8") as f:
    f.writelines("\n".join(word_entropy))
# compute each article's distribution over topics
article_result = []
# use a separate loop variable so the article_info list itself is not shadowed
for feature, info in zip(corpus, article_info):
    topic = lda.get_document_topics(feature)
    topic = [(s[0], float(s[1])) for s in topic]
    info['topic_lda'] = topic
    info['entropy'] = cal_entropy(topic)
    article_result.append(info)
article_result = sorted(article_result, key=lambda s: s['entropy'])
article_result = [json.dumps(s, ensure_ascii=False) for s in article_result]
print("start saving data")
with open("article_topics", "w", encoding="utf-8") as f:
    f.writelines("\n".join(article_result))
with open("lda.model", "wb") as f:
    pickle.dump(lda, f)
with open("dictionary", "wb") as f:
    pickle.dump(dictionary, f)
print("finished saving data")
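
# Usage sketch (an assumed example, not part of the training pipeline above):
# reload the pickled model and dictionary and infer the topic mix of a new
# document; "我想去玉龙雪山" is just a made-up sample sentence.
with open("lda.model", "rb") as f:
    lda_loaded = pickle.load(f)
with open("dictionary", "rb") as f:
    dictionary_loaded = pickle.load(f)
sample_doc = "我想去玉龙雪山"
bow = dictionary_loaded.doc2bow([word for word in jieba.cut(sample_doc)])
print(lda_loaded.get_document_topics(bow))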