#encoding=utf-8 #author:wdw110 #功能:预测计算电视剧收视指数和在电视台的收视率 from __future__ import division import re import sys import time import copy import datetime import numpy as np from fty_util.common import Mysql tv_pre = [] #电视剧收视预测值 tv_data_linear = {} tv_data_sim = {} tv_stage = {} channel_rate = {} type_rate = {} model_var = [{},{},{},{},{}] rules = u' ' if len(sys.argv) > 1: tv_id = int(sys.argv[1]) else: print '请输入电视剧id' sys.exit() conn = Mysql.createOfflineConn() bound = [0,2] #固定收视指数的最小和最大值 sql = 'select * from tmp.ad_tv_sr_pre_var' tmp_data = Mysql.selectAll(sql, conn=conn) for i in range(len(tmp_data)): var_name = tmp_data[i][1] tv_sr_arr = map(float,tmp_data[i][3].strip('[|]').split(',')) year = tmp_data[i][4] var_loc = tmp_data[i][5]-1 model_var[var_loc].setdefault(var_name,{}) model_var[var_loc][var_name][year] = sum(tv_sr_arr)/len(tv_sr_arr) sql = 'select tv_id,tv_name,director,scriptwriter,main_actors,filmer,categories from odl.ad_tv_lib where tv_id=%d' % tv_id tmp_data = Mysql.selectAll(sql, conn=conn) if tmp_data: tv_id = int(tmp_data[0][0]) tv_name = tmp_data[0][1] year = datetime.datetime.now().year director = tmp_data[0][2] if tmp_data[0][2] else '' scriptwriter = tmp_data[0][3] if tmp_data[0][3] else '' actors = tmp_data[0][4] if tmp_data[0][4] else '' filmer = tmp_data[0][5] if tmp_data[0][5] else '' type1 = tmp_data[0][6] if tmp_data[0][6] else '' tv_data_linear[tv_id] = [tv_name,director,scriptwriter,actors,filmer,type1,year] else: print 'tv_id:%d不在ad_tv_lib表中!' % tv_id sys.exit() def trans(val): res = (val - bound[0])/(bound[1]-bound[0])*10.0 if res < 1.0: res = 1.0 elif res > 10.0: res = 10.0 return round(res,2) def tv_sr_pre(var_arr,year): #变量数组,各变量的名字 '''线性回归模型''' coef = np.array([0.2103148,0.5182419,0.7822451,0.4921597,0.3865043,-1.3566513]) model_avg = [] #每个变量的近一年的平均值 model_val = np.ones(len(var_arr)+1) for i in range(len(model_var)): tmp_obj = model_var[i] sum1,num = 0,0 for var_name in tmp_obj: if year in tmp_obj[var_name]: sum1 += tmp_obj[var_name][year] num += 1 elif year-1 in tmp_obj[var_name]: sum1 += tmp_obj[var_name][year-1] num += 1 tmp_avg = sum1/num if num else 0 model_avg.append(tmp_avg) for i in range(len(var_arr)): p_arr = var_arr[i].split(u' ') for peo in p_arr: if peo: if model_var[i].has_key(peo): if year in model_var[i][peo]: model_val[i] = model_var[i][peo][year] else: max_year = max(model_var[i][peo].keys()) model_val[i] = model_var[i][peo][max_year] else: model_val[i] = model_avg[i] #变量不在数据库时用均值代替 result = np.dot(coef,model_val) return result now = datetime.datetime.now() aDay = datetime.timedelta(days=-365) date_line = (now + aDay).date() #默认时间为当前时间的近一年 sql = "select * from idl.ad_pub_station_rate_stats" station_data = Mysql.selectAll(sql, conn=conn) sql = "select * from tmp.ad_pub_station_type_rate" type_data = Mysql.selectAll(sql, conn=conn) for i in range(len(station_data)): channel = station_data[i][1] aud_rating = station_data[i][2] tv_date = station_data[i][3] channel_rate.setdefault(channel,[]) if tv_date >= date_line: channel_rate[channel].append(aud_rating) for i in range(len(type_data)): channel = type_data[i][1] Type = type_data[i][2] aud_rating = type_data[i][3] tv_date = type_data[i][4] type_rate.setdefault(channel,{}) type_rate[channel].setdefault(Type,[]) if tv_date >= date_line: type_rate[channel][Type].append(aud_rating) tv2type = copy.deepcopy(type_rate) for channel,value in type_rate.items(): for ty,v_arr in value.items(): tv2type[channel][ty] = sum(v_arr)/len(v_arr) if len(v_arr) else 0.0 for tv_id in tv_data_linear: tv_name = tv_data_linear[tv_id][0] var_arr = tv_data_linear[tv_id][1:-1] type1 = tv_data_linear[tv_id][5] year = tv_data_linear[tv_id][-1] tv_station = {} ty_arr = type1.split(u' ') for channel in channel_rate: tmp,n = 0,0 value = tv2type.get(channel,{}) for tt in ty_arr: if tt and value.has_key(tt): tmp += value[tt] n += 1 tv_station[channel] = tmp/n if n else 0 tv_sr = tv_sr_pre(var_arr,year) for channel,vv in tv_station.items(): channel_avg = sum(channel_rate[channel])/len(channel_rate[channel]) if channel_rate[channel] else 0.0 #判断是否有该类型的平均收视率 if vv: station_rate = 0.6 * tv_sr * channel_avg + 0.4 * vv else: station_rate = tv_sr * channel_avg tv_pre.append((tv_id,tv_name,str(tv_sr),str(trans(tv_sr)),channel,str(station_rate))) delete = 'delete from idl.ad_tv_sr_pre where tv_id=%d' % tv_id Mysql.execute(delete, conn=conn) sql = 'insert into idl.ad_tv_sr_pre(tv_id,tv_name,tv_sr_pre,tv_sr_pre_trans,channel,station_tv_pre) values(%s,%s,%s,%s,%s,%s)' Mysql.insertMany(sql, tv_pre, conn=conn) sql = 'update tv_lib.gc_tv_series ts set ts.point = %s where ts.tv_id = %s' %(str(tv_pre[0][3]),tv_id) Mysql.update(sql, conn=conn) Mysql.close(conn)