lianghua
/
py_script


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
							#!/usr/bin/env python
#coding=utf-8

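"""Clean the scraped category data.

Workflow: scrape iQiyi data and Tencent data -> deduplicate and merge the two category lists -> write the categories into the association table, with each tv_id mapped to one category
"""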

import datetime
import os
import sys
import time

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

# iQiyi data cleaning
def iqiyi_content_clean():
    """Placeholder for the iQiyi cleaning step; currently does nothing."""
    return None

# Tencent data cleaning
def tengxun_content_clean():
    """Placeholder for the Tencent cleaning step; currently does nothing."""
    return None

def _merge_types(*type_strings):
    """Merge space-separated category strings into one de-duplicated list.

    Args:
        *type_strings: Space-separated category strings. ``None`` and empty
            strings are skipped.

    Returns:
        list: Category names in first-seen order, without duplicates or
        empty entries.
    """
    merged = []
    seen = set()
    for type_string in type_strings:
        if not type_string:
            continue
        for type_name in type_string.split(' '):
            # Repeated, leading or trailing spaces yield '' tokens; skip them
            # so they are not stored as bogus categories.
            if type_name and type_name not in seen:
                seen.add(type_name)
                merged.append(type_name)
    return merged


# Fetch the scraped iQiyi and Tencent Video categories.
SELECT_TYPES_SQL = """
    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
"""

# Template for writing the merged categories back onto the same row.
# NOTE(review): the values are interpolated straight into the statement, so a
# category containing a quote will break (or alter) the query. Switch to bound
# parameters if Mysql.execute supports them -- confirm against fty_util.
UPDATE_TYPES_SQL = """
        update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
    """

conn = Mysql.createOfflineConn()
try:
    rows = Mysql.getAll(SELECT_TYPES_SQL, conn=conn)

    # `rows or ()` guards against the helper returning a falsy value when the
    # table is empty -- TODO confirm what Mysql.getAll returns in that case.
    for row in rows or ():
        all_types = _merge_types(row['iqiyi_types'], row['tengxun_types'])
        update_sql = UPDATE_TYPES_SQL % (' '.join(all_types), row['id'])
        Mysql.execute(update_sql, conn=conn)
finally:
    # Release the connection even if a query raises part-way through.
    Mysql.close(conn)