123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- #!/usr/bin/env python
- #coding=utf-8
- """备案、发行表数据处理
- """
- import datetime
- import re
- import sys
- import time
- from fty_util.common import Mysql
- reload(sys)
- sys.setdefaultencoding('utf8')
- def parse_date(field, date_format):
- """
- 日期转换
- """
- time_format = datetime.datetime.strptime(field, date_format)
- time_format = time_format.strftime(u'%Y-%m-%d')
- return time_format
- def parse_field(field):
- """
- 处理字段,除了p_detail字段
- """
- if field is None or len(field) == 0:
- return ""
- else:
- field = strQ2B(field)
- return field.replace(' ', '')
- def strQ2B(ustring):
- """
- 全角转半角
- """
- tstring = ""
- for uchar in ustring:
- inside_code = ord(uchar)
- # 全角空格直接转换
- if inside_code == 12288:
- inside_code = 32
- if inside_code == 160:
- inside_code = 32
- # 全角字符(除空格)根据关系转化
- elif (inside_code >= 65281 and inside_code <= 65374):
- inside_code -= 65248
-
- tstring += unichr(inside_code)
- return tstring
- """
- 从 odl.dsj_gongshi(电视剧备案数据)和odl.faxing(电视剧发行数据)表中提取数据到odl.ad_tv_record_distribution表中,作为数据分析来源数据
- """
- conn = Mysql.createOfflineConn()
- # 备案、发行表
- sql = """
- select id, name, area, theme, company, commit_company, show_date, license_id, form, num, film_date, film_period, `desc`, \
- province_advice, relative_depart_advice, remark, scrapy_url, scrapy_date, scrapy_title, p_detail, scrapy_detail_url, \
- union_company, scriptwriter, director from odl.dsj_gongshi order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- # conn_max = dev_mysql_conn.Mysql()
- sql_max = """
- select max(tv_id) tv_id from odl.ad_tv_record_distribution
- """
- row_max = Mysql.getOne(sql_max, conn=conn)
- max_id = 0
- if row_max is not None and len(row_max) > 0:
- max_id = row_max[0]
- for row in rows:
- _id = row['id']
- if _id <= max_id:
- continue
- # 将p_detail字段转换为commit_company、show_date、license_id三个字段
- p_detail = parse_field(row['p_detail'])
- commit_company = ''
- show_date = ''
- license_id = ''
- # 如果p_detail字段为空,则直接处理
- if p_detail is None or len(p_detail) == 0:
- commit_company = parse_field(row['commit_company'])
- show_date = parse_field(row['show_date'])
- show_date = show_date.replace(u'年', '-').replace(u'月', '-')
- show_date = parse_date(show_date, u'%Y-%m-')
- license_id = parse_field(row['license_id'])
- else:
- try:
- str = p_detail.decode('utf8')
- xx=ur"\d+[\u5e74]\d+[\u6708]"
- p = re.compile(xx)
- date = p.findall(str)[0]
- company_license = str.split(date)
- commit_company = company_license[0]
- show_date = date
- show_date = parse_date(show_date, '%Y年%m月')
- license_id = company_license[-1]
- except Exception, e:
- pass
- # 原始格式 2016.11
- film_date = parse_field(row['film_date'])
- try:
- film_date_pattern = re.compile('\d+')
- year_month = film_date_pattern.findall(film_date)
- year = year_month[0]
- month = year_month[1]
- film_date = parse_date(str(year) + '.' + str(month), '%Y.%m')
- except Exception, e:
- film_date = ''
- # film_date = parse_date(film_date, '%Y.%m')
- film_period = parse_field(row['film_period'])
- name = parse_field(row['name'])
- area = parse_field(row['area'])
- theme = parse_field(row['theme'])
- company = parse_field(row['company'])
- if company is None or len(company) == 0:
- company = commit_company
- form = parse_field(row['form'])
- num = parse_field(row['num'])
- desc = parse_field(row['desc'])
- province_advice = parse_field(row['province_advice'])
- relative_depart_advice = parse_field(row['relative_depart_advice'])
- remark = parse_field(row['remark'])
- scrapy_url = parse_field(row['scrapy_url'])
- scrapy_date = row['scrapy_date']
- scrapy_title = parse_field(row['scrapy_title'])
- scrapy_detail_url = parse_field(row['scrapy_detail_url'])
- union_company = parse_field(row['union_company'])
- scriptwriter = parse_field(row['scriptwriter'])
- director = parse_field(row['director'])
- # 变更后的名称
- current_name = ''
- # 变更后的公司
- current_company = ''
- # 变更后的集数
- current_num = 0
- # conn_change_type1 = dev_mysql_conn.Mysql()
- # 电视剧名称变更
- sql_change_type1 = """
- select original_name, current_name from odl.dsj_change where original_name = '%s' and change_type = 1
- """
- sql_change_type1 = sql_change_type1 % (name)
- sql_change_type1_rows = Mysql.getAll(sql_change_type1, conn=conn)
-
- if len(sql_change_type1_rows) == 1:
- current_name = sql_change_type1_rows[0]['current_name']
- elif len(sql_change_type1_rows) > 1:
- # 如果多余一条记录,则存放在dict中
- name_dict = {}
- for row in sql_change_type1_rows:
- original_name = row['original_name']
- current_name = row['current_name']
- name_dict[original_name] = current_name
- while True:
- if len(name_dict) > 1:
- current_name = name_dict.get(name)
- if current_name is None or len(current_name) == 0:
- current_name = ''
- break
- del name_dict[name]
- else:
- current_name = name_dict.get(current_name)
- break
- # conn_change_type2 = dev_mysql_conn.Mysql()
- # 类型2变更
- sql_change_type2 = """
- select name, original_company, current_company from odl.dsj_change where name = '%s' and change_type = 2
- """
- sql_change_type2 = sql_change_type2 % (name)
- sql_change_type2_rows = Mysql.getAll(sql_change_type2, conn=conn)
-
- if len(sql_change_type2_rows) == 1:
- current_company = sql_change_type2_rows[0]['current_company']
- elif len(sql_change_type2_rows) > 1:
- company_dict = {}
- for row in sql_change_type2_rows:
- original_company = row['original_company']
- current_company = row['current_company']
- company_dict[original_company] = current_company
- while True:
- if len(company_dict) > 1:
- current_company = company_dict.get(company)
- if current_company is None or len(current_company) == 0:
- current_company = ''
- break
- del company_dict[company]
- else:
- current_company = company_dict.get(current_company)
- break
- # conn_change_type3 = dev_mysql_conn.Mysql()
- # 类型3变更
- sql_change_type3 = """
- select name, original_num, current_num from odl.dsj_change where name = '%s' and change_type = 3
- """
- sql_change_type3 = sql_change_type3 % (name)
- sql_change_type3_rows = Mysql.getAll(sql_change_type3, conn=conn)
- if len(sql_change_type3_rows) == 1:
- current_num = sql_change_type3_rows[0]['current_num']
- elif len(sql_change_type3_rows) > 1:
- num_dict = {}
- for row in sql_change_type3_rows:
- original_num = row['original_num']
- current_num = row['current_num']
- num_dict[original_num] = current_num
- while True:
- if len(num_dict) > 1:
- current_num = num_dict.get(num)
- if current_num is None or len(current_num) == 0:
- current_num = 0
- break
- del num_dict[num]
- else:
- current_num = num_dict.get(current_num)
- break
- if current_name is None or len(current_name) == 0:
- current_name = name
- # 发行数据查询
- sql_distribution = """
- select name, company, num, pub_date from odl.faxing where name = '%s'
- """
- sql_distribution = sql_distribution % (current_name)
- try:
- # conn_distribution = dev_mysql_conn.Mysql()
- sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
- except Exception, e:
- # conn_distribution = dev_mysql_conn.Mysql()
- sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
- is_distribute = 0
- pub_date = ''
- if len(sql_distribution_rows) >= 1:
- pub_date = sql_distribution_rows[0]['pub_date']
- is_distribute = 1
- sql_insert = """
- insert into odl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, \
- film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute) values ('%s', '%s', '%s', '%s', '%s', \
- '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
- """
- sql_insert = sql_insert % (_id, name, current_name, area, theme, theme, theme, company, show_date, form, num, film_date, film_period, desc.replace("'", "\\'"),
- scriptwriter, director, pub_date, is_distribute)
- print sql_insert
- try_times = 0
- while True:
- if try_times > 3:
- break
- try:
- # conn1 = dev_mysql_conn.Mysql()
- Mysql.insertOne(sql_insert, conn=conn)
- try_times = 0
- break
- except Exception, e:
- try_times += 1
- print e
- if try_times > 3:
- break
- Mysql.close(conn)
|