#!/usr/bin/env python
# coding=utf-8
"""Clean up the scraped TV category data.

Pipeline: scrape the iQiyi data and the Tencent Video data -> merge the two
category lists with duplicates removed -> write the categories into the
association table so that every tv_id maps to a category.

This script performs the merge step: for every row of
``scrapy.tv_category_scrapy`` it combines ``iqiyi_types`` and
``tengxun_types`` into the ``types`` column.
"""
import datetime
import os
import sys
import time

from fty_util.common import Mysql

# Python 2 only: make implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Fetch the categories scraped from iQiyi and from Tencent Video.
SELECT_SQL = """
    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
"""

# NOTE(review): the values are interpolated straight into the statement, so a
# category name containing a quote breaks (or alters) the query. Switch to a
# parameterized query if Mysql.execute supports one -- its signature is not
# visible from this file.
UPDATE_SQL = """
    update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
"""


def _merge_types(*type_strings):
    """Merge space-separated category strings into one de-duplicated list.

    Each argument is a space-separated string of category names, or
    None / an empty string when a source has no categories. Empty tokens
    (produced by leading, trailing or repeated spaces) are dropped, and the
    first-seen order is kept so the result is deterministic.

    Returns the list of unique category names.
    """
    merged = []
    seen = set()
    for type_string in type_strings:
        if not type_string:
            continue
        for type_name in type_string.split(' '):
            if type_name and type_name not in seen:
                seen.add(type_name)
                merged.append(type_name)
    return merged


conn = Mysql.createOfflineConn()
try:
    # `or ()` guards against a falsy "no rows" result -- presumably getAll can
    # return one when the table is empty; verify against fty_util.
    rows = Mysql.getAll(SELECT_SQL, conn=conn) or ()
    for row in rows:
        all_types = _merge_types(row['iqiyi_types'], row['tengxun_types'])
        update_sql = UPDATE_SQL % (' '.join(all_types), row['id'])
        Mysql.execute(update_sql, conn=conn)
finally:
    # Release the connection even when a query fails part-way through.
    Mysql.close(conn)