123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229 |
- #encoding=utf-8
- from __future__ import division
- import re
- import sys
- import time
- import math
- import jieba
- import datetime
- import numpy as np
- from fty_util.common import Mysql
# ---------------------------------------------------------------------------
# Setup: shared state, command-line argument, and loading of the intermediate
# data (idf table, per-dimension vocabulary, stored score matrix, tag library)
# plus the row(s) of the series to be processed.
# ---------------------------------------------------------------------------

# Wall-clock start of the run (recorded only; nothing below reads it).
start = time.time()

tf = {}         # {tv_id: [{word: tf}, ...]}, one dict per dimension
idf_bre = {}    # word -> count as stored in tmp.ad_tv_recom_idf
idf_aft = {}    # word -> log10-based idf, filled after the counts are updated
tv_data = []    # variable data of the new series
score_mat = {}  # series score matrix {tv_id: [{index: score}, ...]}
tags = {}       # tag library
weight = [5, 2, 1, 1, 1, 1, 2]  # one weight per dimension after tv_id

if len(sys.argv) > 1:
    tv_id = int(sys.argv[1])
else:
    # Parenthesised single-string form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('请输入电视剧id')
    # NOTE(review): exits with the default status although the required
    # argument is missing; left unchanged so existing callers see no difference.
    sys.exit()

conn = Mysql.createOfflineConn()

# idf table: column 1 is the word, column 2 its count, column 3 the total
# number of historical series (read from the first row).
sql1 = 'select * from tmp.ad_tv_recom_idf'
tmp = Mysql.selectAll(sql1, conn=conn)
tv_sum = tmp[0][3]  # total number of historical series
for idf_row in tmp:
    idf_bre[idf_row[1]] = idf_row[2]

# Per-dimension word statistics: each column of the first row is a
# comma-separated word list.
sql2 = 'select * from tmp.ad_tv_recom_var_stat'
tmp = Mysql.selectAll(sql2, conn=conn)
var_stat = [word.split(',') for word in tmp[0]]

# Stored score matrix: tab-separated, first field is the series id, every
# following field is a comma-separated list of "index:score" pairs for one
# dimension (possibly empty).  The line whose first field is 'id' is the header.
title = ''  # column header line of the text file
with open('ad_tv_recom_score_matrix.txt', 'r') as ff:
    for line in ff:
        arr = line.strip('\n').split('\t')
        if arr[0] == 'id':
            title = line
            continue
        k = int(arr[0])
        dim_scores = score_mat.setdefault(k, [])
        for j in arr[1:]:
            obj = {}
            if j:
                for ss in j.split(','):
                    tmp_arr = ss.split(':')
                    # The score is kept as text here; it is converted when it
                    # is later written into a numpy array.
                    obj[int(tmp_arr[0])] = tmp_arr[1]
            # Appended even when empty so that every dimension keeps its slot.
            dim_scores.append(obj)

sql4 = 'select tag from odl.ad_type_lib'
tmp = Mysql.selectAll(sql4, conn=conn)
for word in tmp:
    tags[word[0]] = 1

dims = ['tv_id', 'types', 'description', 'director', 'main_actors', 'scriptwriter', 'filmer', 'decade']
# tv_id went through int() above, so the %d interpolation cannot carry SQL text.
sql = "select %s from odl.ad_tv_lib where tv_id=%d" % (', '.join(dims), tv_id)
tv_data = Mysql.selectAll(sql, conn=conn)
def find_tag(sentence):
    """Extract the known tags that occur in a series description.

    ``sentence`` is the description text of a series.  It is segmented with
    ``jieba.cut``; every segment that has a truthy entry in the module-level
    ``tags`` dict is kept once.  The kept words are returned joined by a
    single space, in the iteration order of the intermediate dict.
    """
    matched = {word: 1 for word in jieba.cut(sentence) if tags.get(word)}
    return u' '.join(matched)
# ---------------------------------------------------------------------------
# Build the term frequencies of the requested series, update the word counts
# and the per-dimension vocabulary, derive the idf values, and expand the
# stored score matrix into dense vectors.
# ---------------------------------------------------------------------------

# Set mirror of var_stat so the "is this word already known?" test does not
# scan the list for every word; kept in sync with var_stat below.
known_words = [set(dim_vocab) for dim_vocab in var_stat]

for i, tv_row in enumerate(tv_data):
    tv_row = list(tv_row)
    tv_row[0] = int(tv_row[0])
    tv_data[i] = tv_row
    key = tv_row[0]
    arr = tv_row[1:]  # dimension values without the leading tv_id

    is_new = key not in score_mat
    if is_new:
        tv_sum += 1  # total number of series

    # The description (second dimension) is reduced to the tags found in it.
    arr[1] = find_tag(arr[1]) if arr[1] else ''
    tv_row[2] = arr[1]

    tmp = []        # all keywords of this series, across dimensions
    dim_words = []  # de-duplicated keywords, one list per dimension
    for value in arr:
        wd = value if value else u''
        words = list(set(wd.split(u' ')))
        if u'' in words:
            words.remove(u'')
        dim_words.append(words)
        tmp.extend(words)

    n = len(tmp)  # total number of words of this series
    # Words are de-duplicated per dimension, so each one counts once and its
    # term frequency is 1/n.  When n is 0 there are no words to assign.
    share = 1.0 / n if n else 0

    dim_tmp = []  # keyword statistics per dimension: [{word: tf}, ...]
    for l, words in enumerate(dim_words):
        obj = dict.fromkeys(words, share)
        for k in obj:
            # Register keywords that are not yet in the historical vocabulary.
            if k not in known_words[l]:
                known_words[l].add(k)
                var_stat[l].append(k)
        dim_tmp.append(obj)
    tf[key] = dim_tmp

    for ww in set(tmp):
        if ww not in idf_bre:
            idf_bre[ww] = 1
        elif is_new:
            # Counts are only increased for series not already in the matrix.
            idf_bre[ww] += 1

# True division comes from the file-level `from __future__ import division`.
for key in idf_bre:
    idf_aft[key] = math.log10(tv_sum / idf_bre[key])

# Recompute the score matrix of the historical series: the stored
# per-dimension {index: score} dicts are laid out in one dense vector whose
# dimension offsets follow the (possibly grown) vocabulary sizes.
length = sum(len(v) for v in var_stat)
dim_starts = []
running = 0
for dim_vocab in var_stat:
    dim_starts.append(running)
    running += len(dim_vocab)

for key in score_mat:
    tmp_arr = score_mat[key]
    tmp = np.zeros(length)
    for i, dim_start in enumerate(dim_starts):
        for k, v in tmp_arr[i].items():
            tmp[dim_start + k] = v
    score_mat[key] = tmp
# Compute the score vectors of the series.
def tv_score(weight, tf, idf):
    """Turn per-dimension term frequencies into dense weighted score vectors.

    ``weight`` holds one factor per dimension, ``tf`` maps a series id to a
    list of ``{word: tf}`` dicts (one per dimension) and ``idf`` maps a word
    to its idf value.  Each word's score is ``weight[j] * tf * idf[word]`` and
    is stored at the word's position in the module-level ``var_stat``
    vocabulary of its dimension, shifted by the sizes of the preceding
    dimensions.  Returns ``{series id: numpy vector}``.
    """
    dim_sizes = [len(v) for v in var_stat]
    row = sum(dim_sizes)
    res = {}
    for tv_key, tv_arr in tf.items():
        vec = np.zeros(row)
        mm = 0  # offset of the current dimension inside the vector
        for j, dim_tf in enumerate(tv_arr):
            if j > 0:
                mm += dim_sizes[j - 1]
            for word in dim_tf:
                nn = var_stat[j].index(word) + mm
                vec[nn] = weight[j] * dim_tf[word] * idf[word]
        res[tv_key] = vec
    return res
def cos_distance(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The result is ``dot(vec1, vec2) / (|vec1| * |vec2|)``.  The norms are
    multiplied; adding them (as an earlier version did) makes the value
    depend on the vectors' magnitudes, so a vector compared with itself
    would not score 1.

    Args:
        vec1, vec2: numpy arrays or plain sequences of numbers with the
            same length.

    Returns:
        The similarity as a float, or 0 when either vector has zero length
        (the similarity is undefined there).
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    denominator = math.sqrt(np.dot(a, a)) * math.sqrt(np.dot(b, b))
    if not denominator:
        return 0
    return float(np.dot(a, b)) / denominator
def tv_sim(tv_id, data, top_n=400):
    """Rank the series in ``data`` by similarity to series ``tv_id``.

    Args:
        tv_id: id of the series to compare against; must be a key of ``data``.
        data: score matrix ``{series id: score vector}``.
        top_n: number of best matches to keep (default 400, the previously
            hard-coded limit); ``None`` keeps all of them.

    Returns:
        ``{rank: [series id, similarity]}`` with rank 0 being the most
        similar entry.  Every key of ``data`` is scored, including ``tv_id``
        itself.

    Raises:
        KeyError: if ``tv_id`` is not in ``data``.
    """
    vec1 = data[tv_id]
    scored = [[key, cos_distance(vec1, tv_arr)] for key, tv_arr in data.items()]
    # Stable descending sort: ties keep the iteration order of ``data``.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return dict(enumerate(scored[:top_n]))
# ---------------------------------------------------------------------------
# Score the requested series, rank it against all series, and save the
# result and the intermediate data (tf, idf, vocabulary, score matrix).
# ---------------------------------------------------------------------------
dat = tv_score(weight, tf, idf_aft)

# Merge the scores of the new and the old series.  The keys are integers, so
# they cannot be passed as keyword arguments (`dict(a, **b)`); copy and update.
score_mat_new = dict(score_mat)
score_mat_new.update(dat)

# Save the result and the intermediate data to the database.
for key in dat:
    res = tv_sim(key, score_mat_new)
    # Walk the ranks explicitly instead of relying on dict iteration order.
    sim_arr = ','.join([str(res[rank][0]) for rank in sorted(res)])
    sql = 'replace into idl.ad_tv_4sim_wmd values ("%d","%s")' % (key, sim_arr)
    Mysql.execute(sql, conn=conn)

# Term frequencies are stored only for series that were not in the matrix yet.
# NOTE(review): the values are interpolated straight into the statement; a
# word containing a double quote would break it.  Consider a parameterized
# call if Mysql.execute supports one.
for key, tv_arr in tf.items():
    if key in score_mat:
        continue
    tmp = [int(key)]
    for tv_obj in tv_arr:
        ss = ';'.join([k.encode('utf-8') + ':' + str(v) for k, v in tv_obj.items()])
        tmp.append(ss)
    sql = 'replace into tmp.ad_tv_recom_tf values("%s","%s","%s","%s","%s","%s","%s","%s")' % tuple(tmp)
    Mysql.execute(sql, conn=conn)

# Rewrite the idf table with the updated counts and series total.
delete = 'delete from tmp.ad_tv_recom_idf'
Mysql.execute(delete, conn=conn)
vv = [(i + 1, word, count, tv_sum) for i, (word, count) in enumerate(idf_bre.items())]
sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
Mysql.insertMany(sql, vv, conn=conn)

# Rewrite the per-dimension vocabulary as a single row with one
# comma-separated column per dimension.  The row must be a tuple, like the
# idf rows above; a bare generator expression is not a parameter row.
delete = 'delete from tmp.ad_tv_recom_var_stat'
Mysql.execute(delete, conn=conn)
dim_arr = [tuple(','.join(dim_vocab) for dim_vocab in var_stat)]
sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
Mysql.insertMany(sql, dim_arr, conn=conn)

# Append the new series to the score-matrix file in its sparse text form:
# id, then per dimension a comma-separated list of "index:score" pairs for
# the non-zero entries, all tab-separated.
with open('ad_tv_recom_score_matrix.txt', 'a') as f1:
    for new_id, np_arr in dat.items():
        if new_id in score_mat:
            continue
        nn = 0
        fields = [str(new_id)]
        for dim_vocab in var_stat:
            tmp = np_arr[nn:nn + len(dim_vocab)]
            nn += len(dim_vocab)
            fields.append(','.join([str(i) + ':' + str(tmp[i]) for i in np.nonzero(tmp)[0]]))
        f1.write('\t'.join(fields) + '\n')

Mysql.close(conn)
|