online_ad_tv_sr_pre.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #encoding=utf-8
  2. #author:wdw110
  3. #功能:预测计算电视剧收视指数和在电视台的收视率
  4. from __future__ import division
  5. import re
  6. import sys
  7. import time
  8. import copy
  9. import datetime
  10. import numpy as np
  11. from fty_util.common import Mysql
  12. tv_pre = [] #电视剧收视预测值
  13. tv_data_linear = {}
  14. tv_data_sim = {}
  15. tv_stage = {}
  16. channel_rate = {}
  17. type_rate = {}
  18. model_var = [{},{},{},{},{}]
  19. rules = u' '
  20. if len(sys.argv) > 1:
  21. tv_id = int(sys.argv[1])
  22. else:
  23. print '请输入电视剧id'
  24. sys.exit()
  25. conn = Mysql.createOfflineConn()
  26. bound = [0,2] #固定收视指数的最小和最大值
  27. sql = 'select * from tmp.ad_tv_sr_pre_var'
  28. tmp_data = Mysql.selectAll(sql, conn=conn)
  29. for i in range(len(tmp_data)):
  30. var_name = tmp_data[i][1]
  31. tv_sr_arr = map(float,tmp_data[i][3].strip('[|]').split(','))
  32. year = tmp_data[i][4]
  33. var_loc = tmp_data[i][5]-1
  34. model_var[var_loc].setdefault(var_name,{})
  35. model_var[var_loc][var_name][year] = sum(tv_sr_arr)/len(tv_sr_arr)
  36. sql = 'select tv_id,tv_name,director,scriptwriter,main_actors,filmer,categories from odl.ad_tv_lib where tv_id=%d' % tv_id
  37. tmp_data = Mysql.selectAll(sql, conn=conn)
  38. if tmp_data:
  39. tv_id = int(tmp_data[0][0])
  40. tv_name = tmp_data[0][1]
  41. year = datetime.datetime.now().year
  42. director = tmp_data[0][2] if tmp_data[0][2] else ''
  43. scriptwriter = tmp_data[0][3] if tmp_data[0][3] else ''
  44. actors = tmp_data[0][4] if tmp_data[0][4] else ''
  45. filmer = tmp_data[0][5] if tmp_data[0][5] else ''
  46. type1 = tmp_data[0][6] if tmp_data[0][6] else ''
  47. tv_data_linear[tv_id] = [tv_name,director,scriptwriter,actors,filmer,type1,year]
  48. else:
  49. print 'tv_id:%d不在ad_tv_lib表中!' % tv_id
  50. sys.exit()
  51. def trans(val):
  52. res = (val - bound[0])/(bound[1]-bound[0])*10.0
  53. if res < 1.0:
  54. res = 1.0
  55. elif res > 10.0:
  56. res = 10.0
  57. return round(res,2)
  58. def tv_sr_pre(var_arr,year): #变量数组,各变量的名字
  59. '''线性回归模型'''
  60. coef = np.array([0.2103148,0.5182419,0.7822451,0.4921597,0.3865043,-1.3566513])
  61. model_avg = [] #每个变量的近一年的平均值
  62. model_val = np.ones(len(var_arr)+1)
  63. for i in range(len(model_var)):
  64. tmp_obj = model_var[i]
  65. sum1,num = 0,0
  66. for var_name in tmp_obj:
  67. if year in tmp_obj[var_name]:
  68. sum1 += tmp_obj[var_name][year]
  69. num += 1
  70. elif year-1 in tmp_obj[var_name]:
  71. sum1 += tmp_obj[var_name][year-1]
  72. num += 1
  73. tmp_avg = sum1/num if num else 0
  74. model_avg.append(tmp_avg)
  75. for i in range(len(var_arr)):
  76. p_arr = var_arr[i].split(u' ')
  77. for peo in p_arr:
  78. if peo:
  79. if model_var[i].has_key(peo):
  80. if year in model_var[i][peo]:
  81. model_val[i] = model_var[i][peo][year]
  82. else:
  83. max_year = max(model_var[i][peo].keys())
  84. model_val[i] = model_var[i][peo][max_year]
  85. else:
  86. model_val[i] = model_avg[i] #变量不在数据库时用均值代替
  87. result = np.dot(coef,model_val)
  88. return result
  89. now = datetime.datetime.now()
  90. aDay = datetime.timedelta(days=-365)
  91. date_line = (now + aDay).date()
  92. #默认时间为当前时间的近一年
  93. sql = "select * from idl.ad_pub_station_rate_stats"
  94. station_data = Mysql.selectAll(sql, conn=conn)
  95. sql = "select * from tmp.ad_pub_station_type_rate"
  96. type_data = Mysql.selectAll(sql, conn=conn)
  97. for i in range(len(station_data)):
  98. channel = station_data[i][1]
  99. aud_rating = station_data[i][2]
  100. tv_date = station_data[i][3]
  101. channel_rate.setdefault(channel,[])
  102. if tv_date >= date_line:
  103. channel_rate[channel].append(aud_rating)
  104. for i in range(len(type_data)):
  105. channel = type_data[i][1]
  106. Type = type_data[i][2]
  107. aud_rating = type_data[i][3]
  108. tv_date = type_data[i][4]
  109. type_rate.setdefault(channel,{})
  110. type_rate[channel].setdefault(Type,[])
  111. if tv_date >= date_line:
  112. type_rate[channel][Type].append(aud_rating)
  113. tv2type = copy.deepcopy(type_rate)
  114. for channel,value in type_rate.items():
  115. for ty,v_arr in value.items():
  116. tv2type[channel][ty] = sum(v_arr)/len(v_arr) if len(v_arr) else 0.0
  117. for tv_id in tv_data_linear:
  118. tv_name = tv_data_linear[tv_id][0]
  119. var_arr = tv_data_linear[tv_id][1:-1]
  120. type1 = tv_data_linear[tv_id][5]
  121. year = tv_data_linear[tv_id][-1]
  122. tv_station = {}
  123. ty_arr = type1.split(u' ')
  124. for channel in channel_rate:
  125. tmp,n = 0,0
  126. value = tv2type.get(channel,{})
  127. for tt in ty_arr:
  128. if tt and value.has_key(tt):
  129. tmp += value[tt]
  130. n += 1
  131. tv_station[channel] = tmp/n if n else 0
  132. tv_sr = tv_sr_pre(var_arr,year)
  133. for channel,vv in tv_station.items():
  134. channel_avg = sum(channel_rate[channel])/len(channel_rate[channel]) if channel_rate[channel] else 0.0
  135. #判断是否有该类型的平均收视率
  136. if vv:
  137. station_rate = 0.6 * tv_sr * channel_avg + 0.4 * vv
  138. else:
  139. station_rate = tv_sr * channel_avg
  140. tv_pre.append((tv_id,tv_name,str(tv_sr),str(trans(tv_sr)),channel,str(station_rate)))
  141. delete = 'delete from idl.ad_tv_sr_pre where tv_id=%d' % tv_id
  142. Mysql.execute(delete, conn=conn)
  143. sql = 'insert into idl.ad_tv_sr_pre(tv_id,tv_name,tv_sr_pre,tv_sr_pre_trans,channel,station_tv_pre) values(%s,%s,%s,%s,%s,%s)'
  144. Mysql.insertMany(sql, tv_pre, conn=conn)
  145. sql = 'update tv_lib.gc_tv_series ts set ts.point = %s where ts.tv_id = %s' %(str(tv_pre[0][3]),tv_id)
  146. Mysql.update(sql, conn=conn)
  147. Mysql.close(conn)