#encoding=utf-8
#author:wdw110
#Purpose: offline computation of similar TV series (weighted TF-IDF vectors
#         compared with cosine similarity).

from __future__ import division

import re
import math

import jieba
import numpy as np

from fty_util.common import Mysql

tv_tf = {}       # {tv_id: [{word: tf}, ...]} - one dict per dimension
idf = {}         # {word: number of series whose fields contain the word}
idf_aft = {}     # {word: log10(N / idf[word])}
var_stat = [[], [], [], [], [], [], []]  # per-dimension vocabulary
seq2id = {}      # {row sequence number: tv_id}
weight = [5, 2, 1, 1, 1, 1, 2]  # per-dimension weight, same order as dims[1:]
tags = {}        # tag library, used as a set ({tag: 1})

conn = Mysql.createOfflineConn()
dims = ['tv_id', 'types', 'description', 'director', 'main_actors',
        'scriptwriter', 'filmer', 'decade']
sql = "select %s from odl.ad_tv_lib where is_use=1" % (', '.join(dims))
# Rows are copied into lists up front because the description column is
# overwritten in place below; this also works when the helper returns tuples.
tv_data = [list(record) for record in Mysql.selectAll(sql, conn=conn)]

sql2 = 'select tag from odl.ad_type_lib'
tmp = Mysql.selectAll(sql2, conn=conn)
for word in tmp:
    tags[word[0]] = 1


def find_tag(sentence):
    """Return the known tags found in *sentence*, joined by single spaces.

    *sentence* is the description text of one TV series. It is segmented with
    ``jieba.cut`` and only tokens present in the module-level ``tags`` library
    are kept; each tag appears at most once in the result.
    """
    found = {}
    for token in jieba.cut(sentence):
        if tags.get(token):
            found.setdefault(token, 1)
    return u' '.join(found.keys())


for i, record in enumerate(tv_data):
    tv_id = int(record[0])
    seq2id[i] = tv_id

    # Replace the free-text description with the tags extracted from it, both
    # in the stored row and in the working copy of the dimension fields.
    record[2] = find_tag(record[2]) if record[2] else ''
    arr = record[1:]

    tmp = []          # all keywords of this series (unique within a dimension)
    word_count = {}   # occurrences of each keyword across all dimensions
    dim_tmp = []      # per-dimension keyword dicts: [{word: tf}, ...]

    # A separate index name is used here so the outer row index is not shadowed.
    for dim_idx, field in enumerate(arr):
        obj = {}
        for word in (field or u'').split(u' '):
            if word:
                obj[word] = 1
                word_count[word] = word_count.get(word, 0) + 1
        dim_tmp.append(obj)
        var_stat[dim_idx].extend(obj.keys())
        tmp.extend(obj.keys())

    n = len(tmp)  # total keyword count of this series
    # n can only be 0 when every dict is empty, so the division is never hit
    # with a zero denominator.
    for obj_j in dim_tmp:
        for k in obj_j:
            obj_j[k] = word_count[k] / n
    tv_tf[tv_id] = dim_tmp

    # Document frequency: each series counts once per distinct keyword.
    for word in set(tmp):
        idf[word] = idf.get(word, 0) + 1

N = len(tv_tf)  # total number of TV series
for key in idf:
    idf_aft[key] = math.log10(N / idf[key])

# De-duplicate every per-dimension vocabulary.
for i in range(len(var_stat)):
    var_stat[i] = list(set(var_stat[i]))


def tv_score(weight, tf, idf):
    """Build the weighted TF-IDF score matrix for all series.

    Parameters:
        weight: sequence of per-dimension weights.
        tf:     {tv_id: [{word: tf}, ...]} with one dict per dimension.
        idf:    {word: idf value} used to scale each term frequency.

    Returns:
        (res, score_arr) where ``res`` is a ``len(tf) x total-vocabulary``
        array whose columns are the concatenated per-dimension vocabularies
        from ``var_stat``, and ``score_arr`` maps each row number to a list
        with one score vector per dimension.

    Reads the module-level ``var_stat`` and ``seq2id``.
    """
    col = len(tf)
    row = sum(len(vocab) for vocab in var_stat)
    res = np.zeros((col, row))
    score_arr = {}

    # word -> column lookup per dimension, built once instead of calling
    # list.index() for every word. setdefault keeps the first occurrence,
    # which is exactly what list.index() returns.
    positions = []
    for vocab in var_stat:
        lookup = {}
        for k, word in enumerate(vocab):
            lookup.setdefault(word, k)
        positions.append(lookup)

    for i in range(col):
        tv_arr = tf[seq2id[i]]
        mm = 0  # column offset of the current dimension inside ``res``
        vectors = score_arr.setdefault(i, [])
        for j, dim_tf in enumerate(tv_arr):
            if j > 0:
                mm += len(var_stat[j - 1])
            tmp2 = np.zeros(len(var_stat[j]))  # score vector of this dimension
            for word, value in dim_tf.items():
                score = weight[j] * value * idf[word]
                ll = positions[j][word]
                res[i, mm + ll] = score
                tmp2[ll] = score
            vectors.append(tmp2)
    return res, score_arr


def cos_distance(vec1, vec2):
    """Return the cosine similarity of two numpy vectors.

    Despite the name, larger values mean "more similar". Only positive
    element-wise products are summed in the numerator, which equals the plain
    dot product whenever both vectors are non-negative. Returns 0 when either
    vector has zero norm.
    """
    v12 = vec1 * vec2
    mer = v12[v12 > 0].sum()
    # Cosine similarity divides by the PRODUCT of the two norms; the previous
    # code added them, which does not yield a cosine and is not bounded by 1.
    denominator = (math.sqrt((vec1 * vec1).sum())
                   * math.sqrt((vec2 * vec2).sum()))
    if not denominator:
        return 0
    return mer / denominator


def tv_sim(data):
    """Compute, for every row of *data*, its 100 most similar rows.

    Parameters:
        data: 2-D score matrix, one row per series (row i <-> seq2id[i]).

    Returns:
        A list with one dict per row: {rank: (tv_id, similarity)}, ranks
        starting at 0 and sorted by descending similarity. A row's own entry
        is included, with similarity fixed to 1.
    """
    n = data.shape[0]
    res = np.zeros((n, n))
    result = []
    for i in range(n):
        res[i, i] = 1
        for j in range(i + 1, n):
            res[i, j] = res[j, i] = cos_distance(data[i], data[j])
        # Row i is complete here: columns < i were filled symmetrically by
        # earlier iterations. The slice is taken before pairing so that this
        # works whether zip() returns a list or an iterator.
        index_arr = np.argsort(-res[i])[:100]
        sort_arr = res[i][index_arr]
        id_arr = np.array([seq2id[k] for k in index_arr])
        result.append(dict(enumerate(zip(id_arr, sort_arr))))
    return result


dat, score_mat = tv_score(weight, tv_tf, idf_aft)
# Top-100 similar series per row. In the visible code only the disabled
# legacy blocks below consume this result.
res_sim = tv_sim(dat)

# Save the results and the intermediate data to the database.

# --- Disabled legacy code (was kept inside a bare string literal) -----------
# sql = 'delete from idl.ad_tv_cos'
# cursor.execute(sql)
# db.commit()
#
# vv = []
# for i in range(len(res_sim)):
#     sim_arr = []
#     for key,value in res_sim[i].items():
#         sim_arr.append(value[0])
#     vv.append((seq2id[i],str(res_sim[i]),str(sim_arr)))
#
# sql = 'insert into idl.ad_tv_cos values (%s,%s,%s)'
# for i in range(int(len(vv)/1000)+1):
#     tmp = vv[i*1000:(i+1)*1000]
#     cursor.executemany(sql,tmp)
#     db.commit()
# -----------------------------------------------------------------------------

try:
    # Document frequencies: (sequence number, word, document count, N).
    delete = 'delete from tmp.ad_tv_recom_idf'
    Mysql.execute(delete, conn=conn)
    tmp_ll = list(idf.items())
    vv = [(seq, word, doc_count, N)
          for seq, (word, doc_count) in enumerate(tmp_ll, 1)]
    sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
    Mysql.insertMany(sql, vv, conn=conn)

    # Term frequencies: one row per series, one "word:tf;word:tf" string per
    # dimension.
    delete = 'delete from tmp.ad_tv_recom_tf'
    Mysql.execute(delete, conn=conn)
    vv = []
    for key, tv_arr in tv_tf.items():
        tf_row = [int(key)]
        for tv_obj in tv_arr:
            # NOTE(review): encode() + ':' relies on Python 2 str semantics.
            tf_row.append(';'.join([k.encode('utf-8') + ':' + str(v)
                                    for k, v in tv_obj.items()]))
        vv.append(tuple(tf_row))
    sql = 'insert into tmp.ad_tv_recom_tf values(%s,%s,%s,%s,%s,%s,%s,%s)'
    Mysql.insertMany(sql, vv, conn=conn)

    # Per-dimension vocabularies: a single row with one comma-joined string
    # per dimension. The row must be a real tuple; the previous code wrapped a
    # generator expression in a list, so the insert received one generator
    # object instead of seven values.
    delete = 'delete from tmp.ad_tv_recom_var_stat'
    Mysql.execute(delete, conn=conn)
    dim_arr = [tuple(','.join(tmp_arr) for tmp_arr in var_stat)]
    sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
    Mysql.insertMany(sql, dim_arr, conn=conn)
finally:
    # Release the connection even when one of the writes above fails.
    Mysql.close(conn)

# Save the score matrix to a local file: one line per series, one
# tab-separated column per dimension holding "index:score" pairs for the
# non-zero entries.
# NOTE(review): the header names 7 columns while every data line has 8
# (id + 7 dimensions); it is left unchanged in case a reader depends on it.
with open('ad_tv_recom_score_matrix.txt', 'w') as f1:
    f1.write('id\ttype\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
    for i in range(dat.shape[0]):
        cols = [str(seq2id[i])]
        for tt in score_mat[i]:
            cols.append(','.join([str(pos) + ':' + str(tt[pos])
                                  for pos in np.nonzero(tt)[0]]))
        f1.write('\t'.join(cols) + '\n')

# --- Disabled legacy code (was kept inside a bare string literal) -----------
# def en2str(word):
#     return word.encode('utf-8')
#
# f2 = open('ad_tv_recom_var_stat.txt','w')
# f2.write('type\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
# ss = '\t'.join([','.join(map(en2str,tmp_arr)) for tmp_arr in var_stat])
# f2.write(ss+'\n')
# f2.close()
#
# f3 = open('data/tv_outline_cos1.txt','w')
# for i in range(len(res_sim)):
#     sim_arr = []
#     for key,value in res_sim[i].items():
#         sim_arr.append(value[0])
#     f3.write(str(seq2id[i])+'\t'+str(res_sim[i])+'\t'+str(sim_arr)+'\n')
#
# f1.close()
# f3.close()
# -----------------------------------------------------------------------------