123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
- #encoding=utf-8
- #author:wdw110
# Purpose: offline computation of the similar series for each TV series
- from __future__ import division
- import re
- import math
- import jieba
- import numpy as np
- from fty_util.common import Mysql
# ---------------------------------------------------------------------------
# Shared module state, filled in by the statements further down the script.
# ---------------------------------------------------------------------------
tv_tf = {}  # tv_id -> [ {word: tf}, ... ] with one dict per dimension
idf = {}  # word -> number of series whose keywords contain the word
idf_aft = {}  # word -> log10(N / idf[word])
var_stat = [[], [], [], [], [], [], []]  # per-dimension word lists
seq2id = {}  # row position in tv_data -> tv_id
weight = [5, 2, 1, 1, 1, 1, 2]  # per-dimension weights used by tv_score
tags = {}  # tag library: tag -> 1

conn = Mysql.createOfflineConn()

# Load every TV series that is flagged as in use.
dims = [
    'tv_id',
    'types',
    'description',
    'director',
    'main_actors',
    'scriptwriter',
    'filmer',
    'decade',
]
sql = "select %s from odl.ad_tv_lib where is_use=1" % ', '.join(dims)
tv_data = Mysql.selectAll(sql, conn=conn)

# Load the tag library; only the first column of each row is used.
sql2 = 'select tag from odl.ad_type_lib'
tmp = Mysql.selectAll(sql2, conn=conn)
tags.update((tag_row[0], 1) for tag_row in tmp)
def find_tag(sentence):
    """Return the known tags found in *sentence*, joined by single spaces.

    *sentence* is the description text of a TV series. It is segmented with
    ``jieba.cut`` and every segment that has a truthy entry in the
    module-level ``tags`` dict is kept once.
    """
    matched = {}
    for token in jieba.cut(sentence):
        if not tags.get(token):
            continue
        # A dict is used as a de-duplicating container; the value is unused.
        matched.setdefault(token, 1)
    return u' '.join(matched.keys())
# Build the per-series term frequencies, the document frequencies and the
# per-dimension vocabularies in a single pass over the loaded rows.
for seq, record in enumerate(tv_data):
    tv_id = int(record[0])
    record = list(record)
    tv_data[seq] = record
    fields = record[1:]  # every column except tv_id
    tv_tf.setdefault(tv_id, [])
    seq2id[seq] = tv_id

    # The description column is replaced by the tags found in its text.
    if fields[1]:
        fields[1] = find_tag(fields[1])
        record[2] = fields[1]
    else:
        fields[1] = ''
        record[2] = ''

    all_words = []  # keywords of this series, one entry per (dimension, word)
    word_count = {}  # occurrences of each keyword over all dimensions
    dim_words = []  # one {word: value} dict per dimension
    for dim, text in enumerate(fields):
        tokens = (text if text else u'').split(u' ')
        seen = {}
        for token in tokens:
            if not token:
                continue
            seen[token] = 1
            word_count[token] = word_count.get(token, 0) + 1
        dim_words.append(seen)
        var_stat[dim].extend(seen.keys())
        all_words.extend(seen.keys())

    total = len(all_words)  # total number of keywords of this series
    for seen in dim_words:
        for token in seen:
            # True division: the file enables it via __future__.division.
            seen[token] = word_count[token] / total
    tv_tf[tv_id] = dim_words

    # Each distinct keyword counts once per series towards idf.
    for token in set(all_words):
        idf[token] = idf.get(token, 0) + 1

N = len(tv_tf)  # total number of TV series
for term, doc_freq in idf.items():
    idf_aft[term] = math.log10(N / doc_freq)

# Remove duplicates from every per-dimension vocabulary.
for dim, vocabulary in enumerate(var_stat):
    var_stat[dim] = list(set(vocabulary))
# Compute the score matrix of the TV series
def tv_score(weight, tf, idf):
    """Build the weighted TF-IDF score matrix of all TV series.

    Args:
        weight: per-dimension weights, indexed like the dimension list
            stored for each series in ``tf``.
        tf: mapping ``tv_id -> [ {word: term frequency}, ... ]`` with one
            dict per dimension.
        idf: mapping ``word -> inverse document frequency``.

    Returns:
        A tuple ``(res, score_arr)``. ``res`` is a 2-D array with one row
        per series (row ``i`` belongs to ``seq2id[i]``) and one column per
        word of the module-level ``var_stat`` vocabularies, laid out
        dimension after dimension. ``score_arr`` maps the row index to a
        list of 1-D arrays, one per dimension, holding the same scores.

    Raises:
        KeyError: if a word of ``tf`` is missing from ``idf`` or from the
            matching ``var_stat`` vocabulary.
    """
    # Resolve word -> position once per dimension instead of calling
    # list.index() for every word of every series. setdefault keeps the
    # first position, which is what list.index() returned.
    positions = []
    offsets = []  # first column of each dimension inside ``res``
    total_columns = 0
    for vocabulary in var_stat:
        lookup = {}
        for pos, word in enumerate(vocabulary):
            lookup.setdefault(word, pos)
        positions.append(lookup)
        offsets.append(total_columns)
        total_columns += len(vocabulary)

    col = len(tf)
    res = np.zeros((col, total_columns))
    score_arr = {}
    for i in range(col):
        per_dimension = []
        for j, dim_tf in enumerate(tf[seq2id[i]]):
            dim_scores = np.zeros(len(var_stat[j]))  # vector of one dimension
            for word, value in dim_tf.items():
                score = weight[j] * value * idf[word]
                ll = positions[j][word]
                res[i, offsets[j] + ll] = score
                dim_scores[ll] = score
            per_dimension.append(dim_scores)
        score_arr[i] = per_dimension
    return res, score_arr
def cos_distance(vec1, vec2):
    """Return the cosine similarity of two 1-D numpy vectors.

    The numerator is the sum of the positive element-wise products and the
    denominator is the product of the two Euclidean norms. The previous
    implementation added the norms, so a vector compared with itself did
    not score 1.

    Returns 0 when either vector has zero norm.
    """
    products = vec1 * vec2
    numerator = products[products > 0].sum()
    norm1 = math.sqrt((vec1 * vec1).sum())
    norm2 = math.sqrt((vec2 * vec2).sum())
    denominator = norm1 * norm2
    if not denominator:
        return 0
    return numerator / denominator
def tv_sim(data):
    """Rank, for every series, the series that are most similar to it.

    Args:
        data: 2-D score matrix with one row per TV series; row ``i``
            belongs to the series ``seq2id[i]``.

    Returns:
        A list with one dict per row of ``data``. Each dict maps the rank
        (0 = most similar) to a ``(tv_id, similarity)`` tuple and holds at
        most 100 entries. The similarity of a series with itself is set
        to 1, so the series itself takes part in the ranking.
    """
    n, _ = data.shape
    res = np.zeros((n, n))
    result = []
    for i in range(n):
        res[i, i] = 1
        # Columns j < i of this row were filled while processing row j.
        for j in range(i + 1, n):
            res[i, j] = cos_distance(data[i], data[j])
            res[j, i] = res[i, j]
        index_arr = np.argsort(-res[i])  # descending similarity
        sort_arr = res[i][index_arr]
        id_arr = np.array([seq2id[idx] for idx in index_arr])
        # list(...) keeps this working where zip() returns an iterator;
        # slicing first avoids pairing entries that are dropped anyway.
        top = list(zip(id_arr[:100], sort_arr[:100]))
        result.append(dict(enumerate(top)))
    return result
# Score every series, then rank the most similar series for each of them.
dat, score_mat = tv_score(weight=weight, tf=tv_tf, idf=idf_aft)
res_sim = tv_sim(data=dat)
# Save the results and the intermediate data to the database
- '''
- sql = 'delete from idl.ad_tv_cos'
- cursor.execute(sql)
- db.commit()
- vv = []
- for i in range(len(res_sim)):
- sim_arr = []
- for key,value in res_sim[i].items():
- sim_arr.append(value[0])
- vv.append((seq2id[i],str(res_sim[i]),str(sim_arr)))
- sql = 'insert into idl.ad_tv_cos values (%s,%s,%s)'
- for i in range(int(len(vv)/1000)+1):
- tmp = vv[i*1000:(i+1)*1000]
- cursor.executemany(sql,tmp)
- db.commit()
- '''
# Replace the contents of the three intermediate tables. The connection is
# closed even when one of the statements fails.
try:
    # Document frequency per word, together with the number of series.
    delete = 'delete from tmp.ad_tv_recom_idf'
    Mysql.execute(delete, conn=conn)
    tmp_ll = list(idf.items())
    vv = [
        (seq + 1, word, doc_freq, N)
        for seq, (word, doc_freq) in enumerate(tmp_ll)
    ]
    sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
    Mysql.insertMany(sql, vv, conn=conn)

    # Term frequencies: one row per series, one "word:tf;..." string per
    # dimension.
    delete = 'delete from tmp.ad_tv_recom_tf'
    Mysql.execute(delete, conn=conn)
    vv = []
    for key, tv_arr in tv_tf.items():
        tmp = [int(key)]
        for tv_obj in tv_arr:
            # NOTE(review): concatenating the encoded word with ':' only
            # works on Python 2, where encode() returns a str.
            ss = ';'.join(
                [k.encode('utf-8') + ':' + str(v) for k, v in tv_obj.items()]
            )
            tmp.append(ss)
        vv.append(tuple(tmp))
    sql = 'insert into tmp.ad_tv_recom_tf values(%s,%s,%s,%s,%s,%s,%s,%s)'
    Mysql.insertMany(sql, vv, conn=conn)

    # Vocabulary per dimension: a single row with one comma-joined string
    # per dimension. The row has to be a real tuple; the previous code put
    # a generator object into the list instead of the seven values.
    delete = 'delete from tmp.ad_tv_recom_var_stat'
    Mysql.execute(delete, conn=conn)
    dim_arr = [tuple(','.join(tmp_arr) for tmp_arr in var_stat)]
    sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
    Mysql.insertMany(sql, dim_arr, conn=conn)
finally:
    Mysql.close(conn)
# Save the per-dimension scores to a local file: one line per series with
# the tv_id followed by one "position:score,..." column per dimension.
# The context manager closes the file even if writing fails.
with open('ad_tv_recom_score_matrix.txt', 'w') as f1:
    # Every row carries the id plus one column per dimension, so the
    # header names the trailing decade column as well.
    f1.write(
        'id\ttype\ttag\tdirector\tmain_actors\tscriptwritter\tproduction'
        '\tdecade\n'
    )
    for row in range(dat.shape[0]):
        columns = [str(seq2id[row])]
        for tt in score_mat[row]:
            # Only the non-zero entries of each dimension vector are kept.
            columns.append(
                ','.join(
                    [str(pos) + ':' + str(tt[pos]) for pos in np.nonzero(tt)[0]]
                )
            )
        f1.write('\t'.join(columns) + '\n')
- '''
- def en2str(word):
- return word.encode('utf-8')
- f2 = open('ad_tv_recom_var_stat.txt','w')
- f2.write('type\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
- ss = '\t'.join([','.join(map(en2str,tmp_arr)) for tmp_arr in var_stat])
- f2.write(ss+'\n')
- f2.close()
- f3 = open('data/tv_outline_cos1.txt','w')
- for i in range(len(res_sim)):
- sim_arr = []
- for key,value in res_sim[i].items():
- sim_arr.append(value[0])
- f3.write(str(seq2id[i])+'\t'+str(res_sim[i])+'\t'+str(sim_arr)+'\n')
- f1.close()
- f3.close()
- '''
|