#encoding=utf-8 #author:wdw110 #功能: 对卫视和地方的电视剧收视数据进行去噪声处理 from __future__ import division import math import copy import mysql.connector import time import numpy as np from fty_util.common import Mysql tbs = ['ad_television','area_ad_television'] choose = ['ad','area'] def Stat(Number): #Number:0和1 tv_data = [] tv_play = {} tv_station = {} dateline = str(time.localtime().tm_year-1) conn = Mysql.createOfflineConn() sql = "select id,tv_name,channel,theater_attribute,epi_num,tv_date,start_time,end_time,audience_rating,avg_rating,market_rating from odl.%s where year(tv_date)>=%s" % (tbs[Number], dateline) data = Mysql.selectAll(sql, conn=conn) for i in range(len(data)): dd = list(data[i]) tv_id = data[i][0] tv_name = data[i][1] channel = data[i][2] theater = data[i][3] year = data[i][5].year if dd[8]>=0 and dd[9]>=0 and dd[10]>=0 and dd[3]: key = (tv_name, channel, theater, year) tv_play[tv_id] = dd[1:] tv_station.setdefault(key,[[],[]]) tv_station[key][0].append(tv_id) tmp_data = map(float,data[i][8:11]) tv_station[key][1].append(tmp_data) def fivenum(arr): """Tukey's five number""" arr = np.sort(arr) res = [] n = len(arr) if n == 0: print '数组不能为空' return res else: n4 = math.floor((n+3)/2)/2 d = [0,n4-1,(n-1)/2,n-n4,n-1] d_floor = [int(math.floor(i)) for i in d] d_ceil = [int(math.ceil(i)) for i in d] res = list(0.5 * (arr[d_floor]+arr[d_ceil])) return res def denoise(arr): """异常值检测:在区间[Q1-1.5*(Q3-Q1),Q3+1.5*(Q3-Q1)] 之外的点均为异常值,其中,Q1为四分之一分位点,Q3为四分之三分位点""" five_arr = fivenum(arr) Q1 = five_arr[1] Q3 = five_arr[3] L1 = Q1-1.5*(Q3-Q1) #区间下界 L2 = Q3+1.5*(Q3-Q1) #区间上界 res = [] for i in range(len(arr)): if arr[i]<=L2: res.append(arr[i]) else: res.append(-1) return res result = [] for key in tv_station: id_arr = tv_station[key][0] vv = np.array(tv_station[key][1]) tmp_arr = np.array(map(denoise,vv.transpose())).transpose() for i in range(len(id_arr)): tv_id = id_arr[i] tmp = [tv_id] + tv_play[tv_id] + map(float,list(tmp_arr[i])) result.append(tmp) #写入yxb.ad_tv_rating_denoise delete = 'delete from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], dateline) try: Mysql.execute(delete, conn=conn) except mysql.connector.errors.ProgrammingError as e: pass ind1 = 'DROP INDEX id ON yxb.%s_tv_rating_denoise' % choose[Number] ind2 = 'DROP INDEX tv_date ON yxb.%s_tv_rating_denoise' % choose[Number] try: Mysql.execute(ind1, conn=conn) except mysql.connector.errors.ProgrammingError as e: pass try: Mysql.execute(ind2, conn=conn) except mysql.connector.errors.ProgrammingError as e: pass sql = 'insert into yxb.%s_tv_rating_denoise' % choose[Number] + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' for i in range(int(len(result)/1000)+1): tmp = result[i*1000:(i+1)*1000] Mysql.insertMany(sql, tmp, conn=conn) sql1 = 'CREATE INDEX id ON yxb.%s_tv_rating_denoise(id)' % choose[Number] sql2 = 'CREATE INDEX tv_date ON yxb.%s_tv_rating_denoise (tv_date,theater_attribute)' % choose[Number] Mysql.execute(sql1, conn=conn) Mysql.execute(sql2, conn=conn) #写入odl.ad_tv_rating_denoise delete = 'delete from odl.%s_tv_rating_denoise where year(tv_date)>=%s' % (choose[Number],dateline) try: Mysql.execute(delete, conn=conn) except mysql.connector.errors.ProgrammingError as e: pass sql = 'insert into odl.%s_tv_rating_denoise select * from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], choose[Number], dateline) try: Mysql.execute(sql, conn=conn) except mysql.connector.errors.ProgrammingError as e: pass Mysql.close(conn) for i in range(0,2): Stat(i)