tv_real_recom_fix.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #encoding=utf-8
  2. from __future__ import division
  3. import re
  4. import sys
  5. import time
  6. import math
  7. import jieba
  8. import datetime
  9. import numpy as np
  10. from fty_util.common import Mysql
  11. start = time.time()
  12. tf = {} #{id:[{},{},..],...}
  13. idf_bre = {}
  14. idf_aft = {}
  15. tv_data = [] #新电视剧的变量数据
  16. score_mat = {} #电视剧得分矩阵{id:[{},{}...],...}
  17. tags = {} #标签库
  18. weight = [5,2,1,1,1,1,2]
  19. if len(sys.argv) > 1:
  20. tv_id = int(sys.argv[1])
  21. else:
  22. print '请输入电视剧id'
  23. sys.exit()
  24. conn = Mysql.createOfflineConn()
  25. sql1 = 'select * from tmp.ad_tv_recom_idf'
  26. tmp = Mysql.selectAll(sql1, conn=conn)
  27. tv_sum = tmp[0][3] #历史总电视剧数
  28. for i in range(len(tmp)):
  29. arr = tmp[i]
  30. idf_bre[arr[1]] = arr[2]
  31. sql2 = 'select * from tmp.ad_tv_recom_var_stat'
  32. tmp = Mysql.selectAll(sql2, conn=conn)
  33. var_stat = [word.split(',') for word in tmp[0]] #各维度的词统计
  34. ff = open('ad_tv_recom_score_matrix.txt','r')
  35. title = '' #文本的列标题
  36. for line in ff.readlines():
  37. arr = line.strip('\n').split('\t')
  38. if arr[0] == 'id':
  39. title = line
  40. else:
  41. k = int(arr[0])
  42. score_mat.setdefault(k,[])
  43. for j in arr[1:]:
  44. obj = {}
  45. if len(j):
  46. for ss in j.split(','):
  47. tmp_arr = ss.split(':')
  48. obj[int(tmp_arr[0])] = tmp_arr[1]
  49. score_mat[k].append(obj)
  50. ff.close()
  51. sql4 = 'select tag from odl.ad_type_lib'
  52. tmp = Mysql.selectAll(sql4, conn=conn)
  53. for word in tmp:
  54. tags[word[0]] = 1
  55. dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
  56. sql = "select %s from odl.ad_tv_lib where tv_id=%d" %(', '.join(dims),tv_id)
  57. tv_data = Mysql.selectAll(sql, conn=conn)
  58. def find_tag(sentence): #sentence为电视剧的描述信息
  59. seg = jieba.cut(sentence)
  60. res = {}
  61. for word in seg:
  62. if tags.get(word):
  63. res.setdefault(word,1)
  64. return u' '.join(res.keys())
  65. for i in range(len(tv_data)):
  66. tv_data[i] = list(tv_data[i])
  67. tv_data[i][0] = int(tv_data[i][0])
  68. key = tv_data[i][0]
  69. arr = tv_data[i][1:]
  70. tmp = [] #每个电视剧的所有关键词
  71. dim_tmp = [] #每个电视剧的每个维度的关键词统计[[{},{}..],..]
  72. if key not in score_mat:
  73. tv_sum += 1 #所有电视剧的数量
  74. if not arr[1]:
  75. arr[1] = ''
  76. tv_data[i][2] = ''
  77. else:
  78. arr[1] = find_tag(arr[1])
  79. tv_data[i][2] = arr[1]
  80. for j in range(len(arr)):
  81. obj = {}
  82. if not arr[j]:
  83. wd = u''
  84. else:
  85. wd = arr[j]
  86. words = wd.split(u' ')
  87. words = list(set(words))
  88. if u'' in words:
  89. words.remove(u'')
  90. tmp.extend(words)
  91. for word in words:
  92. obj.setdefault(word, 0)
  93. obj[word] += 1
  94. dim_tmp.append(obj)
  95. n = len(tmp) #每个电视剧的总词数
  96. for l in range(len(dim_tmp)):
  97. obj_j = dim_tmp[l]
  98. for k in obj_j:
  99. if n: obj_j[k] /= n
  100. else: obj_j[k] = 0
  101. if k not in var_stat[l]: #判断新剧的关键词是否在历史关键词库中
  102. var_stat[l].append(k)
  103. tf[key] = dim_tmp
  104. for ww in list(set(tmp)):
  105. if not idf_bre.has_key(ww):
  106. idf_bre[ww] = 1
  107. else:
  108. if key not in score_mat:
  109. idf_bre[ww] += 1
  110. for key in idf_bre:
  111. idf_aft[key] = math.log10(tv_sum/idf_bre[key])
  112. #对历史电视剧的得分矩阵重新计算
  113. length = sum([len(v) for v in var_stat])
  114. for key in score_mat:
  115. tmp_arr = score_mat[key]
  116. tmp = np.zeros(length)
  117. ll = 0
  118. for i in range(len(var_stat)):
  119. if i > 0: ll += len(var_stat[i-1])
  120. mat = tmp_arr[i]
  121. for k,v in mat.items():
  122. tmp[ll+k] = v
  123. score_mat[key] = tmp
  124. #计算电视剧矩阵得分
  125. def tv_score(weight, tf, idf):
  126. res = {}
  127. row = sum([len(v) for v in var_stat])
  128. for i in tf:
  129. tv_arr = tf[i]
  130. mm = 0 #每个词的位置
  131. res.setdefault(i,np.zeros(row))
  132. for j in range(len(tv_arr)):
  133. if j>0: mm += len(var_stat[j-1])
  134. for word,value in tv_arr[j].items():
  135. score = weight[j]*value*idf[word]
  136. nn = var_stat[j].index(word) + mm
  137. res[i][nn] = score
  138. return res
  139. def cos_distance(vec1, vec2):
  140. v11 = vec1*vec1
  141. v12 = vec1*vec2
  142. v22 = vec2*vec2
  143. mer = sum(v12[v12>0])
  144. denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
  145. if not denominator:
  146. return 0
  147. return mer/denominator
  148. def tv_sim(tv_id,data): #tv_id:要计算的电视剧(1,2,3...),data:电视剧得分矩阵({1:[],2:[]})
  149. res = []
  150. vec1 = data[tv_id]
  151. for key,tv_arr in data.items():
  152. cos = cos_distance(vec1,tv_arr)
  153. res.append([key,cos])
  154. return dict(enumerate(sorted(res,key=lambda x:x[1],reverse=True)[0:400]))
  155. dat = tv_score(weight,tf,idf_aft)
  156. score_mat_new = dict(score_mat,**dat) #将新剧和老剧的得分合并
  157. #将结果和中间数据保存到数据库中
  158. for key in dat:
  159. res = tv_sim(key, score_mat_new)
  160. sim_arr = ','.join([str(i[0]) for i in res.values()])
  161. sql = 'replace into idl.ad_tv_4sim_wmd values ("%d","%s")' %(key,sim_arr)
  162. Mysql.execute(sql, conn=conn)
  163. vv = []
  164. for key,tv_arr in tf.items():
  165. tmp = []
  166. tmp.append(int(key))
  167. for tv_obj in tv_arr:
  168. ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
  169. tmp.append(ss)
  170. if key not in score_mat:
  171. sql = 'replace into tmp.ad_tv_recom_tf values("%s","%s","%s","%s","%s","%s","%s","%s")' % tuple(tmp)
  172. Mysql.execute(sql, conn=conn)
  173. delete = 'delete from tmp.ad_tv_recom_idf'
  174. Mysql.execute(delete, conn=conn)
  175. tmp_ll = list(idf_bre.items())
  176. vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],tv_sum) for i in range(len(tmp_ll))]
  177. sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
  178. Mysql.insertMany(sql, vv, conn=conn)
  179. delete = 'delete from tmp.ad_tv_recom_var_stat'
  180. Mysql.execute(delete, conn=conn)
  181. dim_arr = [(','.join(tmp_arr) for tmp_arr in var_stat)]
  182. sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
  183. Mysql.insertMany(sql, dim_arr, conn=conn)
  184. f1 = open('ad_tv_recom_score_matrix.txt','a')
  185. for tv_id,np_arr in dat.items():
  186. nn = 0
  187. if tv_id not in score_mat:
  188. res = str(tv_id)
  189. for arr in var_stat:
  190. tmp = np_arr[nn:(nn+len(arr))]
  191. nn += len(arr)
  192. res += '\t' + ','.join([str(i)+':'+str(tmp[i]) for i in np.nonzero(tmp)[0]])
  193. f1.write(res+'\n')
  194. f1.close()
  195. Mysql.close(conn)