# tv_outline_recom.py
  1. #encoding=utf-8
  2. #author:wdw110
  3. #功能:离线计算电视剧的相似剧
  4. from __future__ import division
  5. import re
  6. import math
  7. import jieba
  8. import numpy as np
  9. from fty_util.common import Mysql
  10. tv_tf = {} #{id:[[{},{},..],..],...}
  11. idf = {}
  12. idf_aft = {}
  13. var_stat = [[],[],[],[],[],[],[]] #各维度的词统计
  14. seq2id = {}
  15. weight = [5,2,1,1,1,1,2]
  16. tags = {} #标签库
  17. conn = Mysql.createOfflineConn()
  18. dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
  19. sql = "select %s from odl.ad_tv_lib where is_use=1" %(', '.join(dims))
  20. tv_data = Mysql.selectAll(sql, conn=conn)
  21. sql2 = 'select tag from odl.ad_type_lib'
  22. tmp = Mysql.selectAll(sql2, conn=conn)
  23. for word in tmp:
  24. tags[word[0]] = 1
  25. def find_tag(sentence): #sentence为电视剧的描述信息
  26. seg = jieba.cut(sentence)
  27. res = {}
  28. for word in seg:
  29. if tags.get(word):
  30. res.setdefault(word,1)
  31. return u' '.join(res.keys())
  32. for i in range(len(tv_data)):
  33. tv_id = int(tv_data[i][0])
  34. tv_data[i] = list(tv_data[i])
  35. arr = tv_data[i][1:]
  36. tmp = [] #每个电视剧的所有关键词
  37. word_count = {} #每个电视剧的关键词的数量
  38. dim_tmp = [] #每个电视剧的每个维度的关键词统计[[{},{}..],..]
  39. tv_tf.setdefault(tv_id,[])
  40. seq2id[i] = tv_id
  41. if not arr[1]:
  42. arr[1] = ''
  43. tv_data[i][2] = ''
  44. else:
  45. arr[1] = find_tag(arr[1])
  46. tv_data[i][2] = arr[1]
  47. for i in range(len(arr)):
  48. obj = {}
  49. if not arr[i]:
  50. wd = u''
  51. else:
  52. wd = arr[i]
  53. words = wd.split(u' ')
  54. #print words
  55. for word in words:
  56. if word:
  57. obj[word] = 1
  58. word_count.setdefault(word,0)
  59. word_count[word] += 1
  60. dim_tmp.append(obj)
  61. var_stat[i].extend(obj.keys())
  62. tmp.extend(obj.keys())
  63. n = len(tmp) #每个电视剧的总词数
  64. for obj_j in dim_tmp:
  65. for k in obj_j:
  66. obj_j[k] = word_count[k]/n
  67. tv_tf[tv_id] = dim_tmp
  68. for word in list(set(tmp)):
  69. idf.setdefault(word,0)
  70. idf[word] += 1
  71. N = len(tv_tf) #总电视剧数量
  72. for key in idf:
  73. idf_aft[key] = math.log10(N/idf[key])
  74. for i in range(len(var_stat)):
  75. var_stat[i] = list(set(var_stat[i])) #去重处理
  76. #计算电视剧矩阵得分
  77. def tv_score(weight, tf, idf):
  78. col = len(tf)
  79. row = sum([len(v) for v in var_stat])
  80. res = np.zeros((col, row))
  81. score_arr = {}
  82. for i in range(col):
  83. tv_arr = tf[seq2id[i]]
  84. mm = 0 #每个词的位置
  85. score_arr.setdefault(i,[])
  86. for j in range(len(tv_arr)):
  87. tmp2 = np.zeros(len(var_stat[j])) #每个维度的向量
  88. if j>0: mm += len(var_stat[j-1])
  89. for word,value in tv_arr[j].items():
  90. score = weight[j]*value*idf[word]
  91. ll = var_stat[j].index(word)
  92. nn = ll + mm
  93. res[i,nn] = score
  94. tmp2[ll] = score
  95. score_arr[i].append(tmp2)
  96. return res,score_arr
  97. def cos_distance(vec1, vec2):
  98. v11 = vec1*vec1
  99. v12 = vec1*vec2
  100. v22 = vec2*vec2
  101. mer = sum(v12[v12>0])
  102. denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
  103. if not denominator:
  104. return 0
  105. return mer/denominator
  106. def tv_sim(data): #tv_id:要计算的电视剧(1,2,3...),data:电视剧得分矩阵({1:[],2:[]})
  107. n,m = data.shape
  108. res = np.zeros((n,n))
  109. result = []
  110. x = range(1,n+1)
  111. for i in range(n):
  112. res[i,i] = 1
  113. for j in range(i+1,n):
  114. res[i,j] = cos_distance(data[i,],data[j,])
  115. res[j,i] = res[i,j]
  116. index_arr = np.argsort(-res[i,])
  117. sort_arr = res[i,][index_arr]
  118. id_arr = np.array([seq2id[i] for i in index_arr])
  119. tmp = zip(id_arr,sort_arr)
  120. result.append(dict(enumerate(tmp[0:100])))
  121. return result
  122. dat,score_mat = tv_score(weight,tv_tf,idf_aft)
  123. res_sim = tv_sim(dat)
  124. #将结果和中间数据保存到数据库中
  125. '''
  126. sql = 'delete from idl.ad_tv_cos'
  127. cursor.execute(sql)
  128. db.commit()
  129. vv = []
  130. for i in range(len(res_sim)):
  131. sim_arr = []
  132. for key,value in res_sim[i].items():
  133. sim_arr.append(value[0])
  134. vv.append((seq2id[i],str(res_sim[i]),str(sim_arr)))
  135. sql = 'insert into idl.ad_tv_cos values (%s,%s,%s)'
  136. for i in range(int(len(vv)/1000)+1):
  137. tmp = vv[i*1000:(i+1)*1000]
  138. cursor.executemany(sql,tmp)
  139. db.commit()
  140. '''
  141. delete = 'delete from tmp.ad_tv_recom_idf'
  142. Mysql.execute(delete, conn=conn)
  143. tmp_ll = list(idf.items())
  144. vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],N) for i in range(len(tmp_ll))]
  145. sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
  146. Mysql.insertMany(sql, vv, conn=conn)
  147. delete = 'delete from tmp.ad_tv_recom_tf'
  148. Mysql.execute(delete, conn=conn)
  149. vv = []
  150. for key,tv_arr in tv_tf.items():
  151. tmp = []
  152. tmp.append(int(key))
  153. for tv_obj in tv_arr:
  154. ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
  155. tmp.append(ss)
  156. vv.append(tuple(tmp))
  157. sql = 'insert into tmp.ad_tv_recom_tf values(%s,%s,%s,%s,%s,%s,%s,%s)'
  158. Mysql.insertMany(sql, vv, conn=conn)
  159. delete = 'delete from tmp.ad_tv_recom_var_stat'
  160. Mysql.execute(delete, conn=conn)
  161. dim_arr = [(','.join(tmp_arr) for tmp_arr in var_stat)]
  162. sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
  163. Mysql.insertMany(sql, dim_arr, conn=conn)
  164. Mysql.close(conn)
  165. #将结果保存到本地
  166. f1 = open('ad_tv_recom_score_matrix.txt','w')
  167. f1.write('id\ttype\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
  168. for i in range(dat.shape[0]):
  169. ss = str(seq2id[i])
  170. for tt in score_mat[i]:
  171. ss += '\t'+','.join([str(i)+':'+str(tt[i]) for i in np.nonzero(tt)[0]])
  172. f1.write(ss+'\n')
  173. f1.close()
  174. '''
  175. def en2str(word):
  176. return word.encode('utf-8')
  177. f2 = open('ad_tv_recom_var_stat.txt','w')
  178. f2.write('type\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
  179. ss = '\t'.join([','.join(map(en2str,tmp_arr)) for tmp_arr in var_stat])
  180. f2.write(ss+'\n')
  181. f2.close()
  182. f3 = open('data/tv_outline_cos1.txt','w')
  183. for i in range(len(res_sim)):
  184. sim_arr = []
  185. for key,value in res_sim[i].items():
  186. sim_arr.append(value[0])
  187. f3.write(str(seq2id[i])+'\t'+str(res_sim[i])+'\t'+str(sim_arr)+'\n')
  188. f1.close()
  189. f3.close()
  190. '''