123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 |
- #encoding=utf-8
- #author:wdw110
- #功能:预测计算电视剧收视指数和在电视台的收视率
- from __future__ import division
- import re
- import sys
- import time
- import copy
- import datetime
- import numpy as np
- from fty_util.common import Mysql
- tv_pre = [] #电视剧收视预测值
- tv_data_linear = {}
- tv_data_sim = {}
- tv_stage = {}
- channel_rate = {}
- type_rate = {}
- model_var = [{},{},{},{},{}]
- rules = u' '
- if len(sys.argv) > 1:
- tv_id = int(sys.argv[1])
- else:
- print '请输入电视剧id'
- sys.exit()
- conn = Mysql.createOfflineConn()
- bound = [0,2] #固定收视指数的最小和最大值
- sql = 'select * from tmp.ad_tv_sr_pre_var'
- tmp_data = Mysql.selectAll(sql, conn=conn)
- for i in range(len(tmp_data)):
- var_name = tmp_data[i][1]
- tv_sr_arr = map(float,tmp_data[i][3].strip('[|]').split(','))
- year = tmp_data[i][4]
- var_loc = tmp_data[i][5]-1
- model_var[var_loc].setdefault(var_name,{})
- model_var[var_loc][var_name][year] = sum(tv_sr_arr)/len(tv_sr_arr)
- sql = 'select tv_id,tv_name,director,scriptwriter,main_actors,filmer,categories from odl.ad_tv_lib where tv_id=%d' % tv_id
- tmp_data = Mysql.selectAll(sql, conn=conn)
- if tmp_data:
- tv_id = int(tmp_data[0][0])
- tv_name = tmp_data[0][1]
- year = datetime.datetime.now().year
- director = tmp_data[0][2] if tmp_data[0][2] else ''
- scriptwriter = tmp_data[0][3] if tmp_data[0][3] else ''
- actors = tmp_data[0][4] if tmp_data[0][4] else ''
- filmer = tmp_data[0][5] if tmp_data[0][5] else ''
- type1 = tmp_data[0][6] if tmp_data[0][6] else ''
- tv_data_linear[tv_id] = [tv_name,director,scriptwriter,actors,filmer,type1,year]
- else:
- print 'tv_id:%d不在ad_tv_lib表中!' % tv_id
- sys.exit()
- def trans(val):
- res = (val - bound[0])/(bound[1]-bound[0])*10.0
- if res < 1.0:
- res = 1.0
- elif res > 10.0:
- res = 10.0
- return round(res,2)
- def tv_sr_pre(var_arr,year): #变量数组,各变量的名字
- '''线性回归模型'''
- coef = np.array([0.2103148,0.5182419,0.7822451,0.4921597,0.3865043,-1.3566513])
- model_avg = [] #每个变量的近一年的平均值
- model_val = np.ones(len(var_arr)+1)
- for i in range(len(model_var)):
- tmp_obj = model_var[i]
- sum1,num = 0,0
- for var_name in tmp_obj:
- if year in tmp_obj[var_name]:
- sum1 += tmp_obj[var_name][year]
- num += 1
- elif year-1 in tmp_obj[var_name]:
- sum1 += tmp_obj[var_name][year-1]
- num += 1
- tmp_avg = sum1/num if num else 0
- model_avg.append(tmp_avg)
- for i in range(len(var_arr)):
- p_arr = var_arr[i].split(u' ')
- for peo in p_arr:
- if peo:
- if model_var[i].has_key(peo):
- if year in model_var[i][peo]:
- model_val[i] = model_var[i][peo][year]
- else:
- max_year = max(model_var[i][peo].keys())
- model_val[i] = model_var[i][peo][max_year]
- else:
- model_val[i] = model_avg[i] #变量不在数据库时用均值代替
- result = np.dot(coef,model_val)
- return result
- now = datetime.datetime.now()
- aDay = datetime.timedelta(days=-365)
- date_line = (now + aDay).date()
- #默认时间为当前时间的近一年
- sql = "select * from idl.ad_pub_station_rate_stats"
- station_data = Mysql.selectAll(sql, conn=conn)
- sql = "select * from tmp.ad_pub_station_type_rate"
- type_data = Mysql.selectAll(sql, conn=conn)
- for i in range(len(station_data)):
- channel = station_data[i][1]
- aud_rating = station_data[i][2]
- tv_date = station_data[i][3]
-
- channel_rate.setdefault(channel,[])
- if tv_date >= date_line:
- channel_rate[channel].append(aud_rating)
- for i in range(len(type_data)):
- channel = type_data[i][1]
- Type = type_data[i][2]
- aud_rating = type_data[i][3]
- tv_date = type_data[i][4]
-
- type_rate.setdefault(channel,{})
- type_rate[channel].setdefault(Type,[])
- if tv_date >= date_line:
- type_rate[channel][Type].append(aud_rating)
- tv2type = copy.deepcopy(type_rate)
- for channel,value in type_rate.items():
- for ty,v_arr in value.items():
- tv2type[channel][ty] = sum(v_arr)/len(v_arr) if len(v_arr) else 0.0
- for tv_id in tv_data_linear:
- tv_name = tv_data_linear[tv_id][0]
- var_arr = tv_data_linear[tv_id][1:-1]
- type1 = tv_data_linear[tv_id][5]
- year = tv_data_linear[tv_id][-1]
- tv_station = {}
- ty_arr = type1.split(u' ')
- for channel in channel_rate:
- tmp,n = 0,0
- value = tv2type.get(channel,{})
- for tt in ty_arr:
- if tt and value.has_key(tt):
- tmp += value[tt]
- n += 1
- tv_station[channel] = tmp/n if n else 0
- tv_sr = tv_sr_pre(var_arr,year)
- for channel,vv in tv_station.items():
- channel_avg = sum(channel_rate[channel])/len(channel_rate[channel]) if channel_rate[channel] else 0.0
- #判断是否有该类型的平均收视率
- if vv:
- station_rate = 0.6 * tv_sr * channel_avg + 0.4 * vv
- else:
- station_rate = tv_sr * channel_avg
- tv_pre.append((tv_id,tv_name,str(tv_sr),str(trans(tv_sr)),channel,str(station_rate)))
- delete = 'delete from idl.ad_tv_sr_pre where tv_id=%d' % tv_id
- Mysql.execute(delete, conn=conn)
- sql = 'insert into idl.ad_tv_sr_pre(tv_id,tv_name,tv_sr_pre,tv_sr_pre_trans,channel,station_tv_pre) values(%s,%s,%s,%s,%s,%s)'
- Mysql.insertMany(sql, tv_pre, conn=conn)
- sql = 'update tv_lib.gc_tv_series ts set ts.point = %s where ts.tv_id = %s' %(str(tv_pre[0][3]),tv_id)
- Mysql.update(sql, conn=conn)
- Mysql.close(conn)
|