123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- #encoding=utf-8
- #author:wdw110
- #功能: 对卫视和地方的电视剧收视数据进行去噪声处理
- from __future__ import division
- import math
- import copy
- import mysql.connector
- import time
- import numpy as np
- from fty_util.common import Mysql
- tbs = ['ad_television','area_ad_television']
- choose = ['ad','area']
- def Stat(Number): #Number:0和1
- tv_data = []
- tv_play = {}
- tv_station = {}
- dateline = str(time.localtime().tm_year-1)
- conn = Mysql.createOfflineConn()
- sql = "select id,tv_name,channel,theater_attribute,epi_num,tv_date,start_time,end_time,audience_rating,avg_rating,market_rating from odl.%s where year(tv_date)>=%s" % (tbs[Number], dateline)
- data = Mysql.selectAll(sql, conn=conn)
- for i in range(len(data)):
- dd = list(data[i])
- tv_id = data[i][0]
- tv_name = data[i][1]
- channel = data[i][2]
- theater = data[i][3]
- year = data[i][5].year
- if dd[8]>=0 and dd[9]>=0 and dd[10]>=0 and dd[3]:
- key = (tv_name, channel, theater, year)
- tv_play[tv_id] = dd[1:]
- tv_station.setdefault(key,[[],[]])
- tv_station[key][0].append(tv_id)
- tmp_data = map(float,data[i][8:11])
- tv_station[key][1].append(tmp_data)
- def fivenum(arr):
- """Tukey's five number"""
- arr = np.sort(arr)
- res = []
- n = len(arr)
- if n == 0:
- print '数组不能为空'
- return res
- else:
- n4 = math.floor((n+3)/2)/2
- d = [0,n4-1,(n-1)/2,n-n4,n-1]
- d_floor = [int(math.floor(i)) for i in d]
- d_ceil = [int(math.ceil(i)) for i in d]
- res = list(0.5 * (arr[d_floor]+arr[d_ceil]))
- return res
- def denoise(arr):
- """异常值检测:在区间[Q1-1.5*(Q3-Q1),Q3+1.5*(Q3-Q1)]
- 之外的点均为异常值,其中,Q1为四分之一分位点,Q3为四分之三分位点"""
- five_arr = fivenum(arr)
- Q1 = five_arr[1]
- Q3 = five_arr[3]
- L1 = Q1-1.5*(Q3-Q1) #区间下界
- L2 = Q3+1.5*(Q3-Q1) #区间上界
- res = []
- for i in range(len(arr)):
- if arr[i]<=L2:
- res.append(arr[i])
- else:
- res.append(-1)
- return res
- result = []
- for key in tv_station:
- id_arr = tv_station[key][0]
- vv = np.array(tv_station[key][1])
- tmp_arr = np.array(map(denoise,vv.transpose())).transpose()
- for i in range(len(id_arr)):
- tv_id = id_arr[i]
- tmp = [tv_id] + tv_play[tv_id] + map(float,list(tmp_arr[i]))
- result.append(tmp)
- #写入yxb.ad_tv_rating_denoise
- delete = 'delete from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], dateline)
- try:
- Mysql.execute(delete, conn=conn)
- except mysql.connector.errors.ProgrammingError as e:
- pass
- ind1 = 'DROP INDEX id ON yxb.%s_tv_rating_denoise' % choose[Number]
- ind2 = 'DROP INDEX tv_date ON yxb.%s_tv_rating_denoise' % choose[Number]
- try:
- Mysql.execute(ind1, conn=conn)
- except mysql.connector.errors.ProgrammingError as e:
- pass
- try:
- Mysql.execute(ind2, conn=conn)
- except mysql.connector.errors.ProgrammingError as e:
- pass
- sql = 'insert into yxb.%s_tv_rating_denoise' % choose[Number] + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
- for i in range(int(len(result)/1000)+1):
- tmp = result[i*1000:(i+1)*1000]
- Mysql.insertMany(sql, tmp, conn=conn)
- sql1 = 'CREATE INDEX id ON yxb.%s_tv_rating_denoise(id)' % choose[Number]
- sql2 = 'CREATE INDEX tv_date ON yxb.%s_tv_rating_denoise (tv_date,theater_attribute)' % choose[Number]
- Mysql.execute(sql1, conn=conn)
- Mysql.execute(sql2, conn=conn)
- #写入odl.ad_tv_rating_denoise
- delete = 'delete from odl.%s_tv_rating_denoise where year(tv_date)>=%s' % (choose[Number],dateline)
- try:
- Mysql.execute(delete, conn=conn)
- except mysql.connector.errors.ProgrammingError as e:
- pass
- sql = 'insert into odl.%s_tv_rating_denoise select * from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], choose[Number], dateline)
- try:
- Mysql.execute(sql, conn=conn)
- except mysql.connector.errors.ProgrammingError as e:
- pass
- Mysql.close(conn)
- for i in range(0,2):
- Stat(i)
|