#!/usr/bin/env python
#coding=utf-8
"""Clean the scraped TV category data.

Reads every row of ``scrapy.tv_category_scrapy``, rewrites the
space-separated ``types`` column so that synonymous category labels are
collapsed into a single canonical label, and writes the result back.
"""

import datetime
import os
import sys
import time

if sys.version_info[0] == 2:
    # Python 2 only: switch the implicit str/unicode conversion to UTF-8 so
    # that labels read from the database compare equal to the byte-string
    # literals below. The names used here do not exist on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

# Labels that are dropped from the cleaned value altogether.
_DROPPED_TYPES = ('生活',)

# (aliases, canonical label) pairs, checked in order.
# Plain tuples are used on purpose: membership is tested with ``==`` rather
# than by hash, so a unicode label still matches a byte-string literal on
# Python 2 (where the two can be equal but hash differently).
_TYPE_ALIASES = (
    (('军旅', '军事'), '军旅'),
    (('惊悚', '恐怖'), '恐怖'),
    (('魔幻', '奇幻'), '奇幻'),
    (('偶像', '时装'), '偶像'),
    (('喜剧', '搞笑'), '喜剧'),
    (('悬疑', '冒险', '侦探'), '悬疑'),
    (('言情', '情感', '爱情'), '情感'),
    (('战争', '抗日', '革命'), '战争'),
    (('警匪', '犯罪', '刑侦'), '罪案'),
    (('传记', '人物', '传奇', '纪实'), '传记'),
)

_SELECT_SQL = """ select id, types from scrapy.tv_category_scrapy """
_UPDATE_SQL = """ update scrapy.tv_category_scrapy set types = '%s' where id = '%s' """


def _canonical_type(label):
    """Return the canonical label for *label*, or *label* itself if unmapped."""
    for aliases, canonical in _TYPE_ALIASES:
        if label in aliases:
            return canonical
    return label


def clean_types(types):
    """Normalize a space-separated string of category labels.

    Each label is mapped to its canonical form, dropped labels and empty
    tokens (produced by repeated or trailing spaces) are removed, and
    duplicates are collapsed.

    Args:
        types: Space-separated category labels, e.g. ``'军事 搞笑'``.

    Returns:
        The cleaned labels joined by single spaces, in the order in which
        each canonical label first appears. May be an empty string.
    """
    cleaned = []
    for label in types.split(' '):
        if not label or label in _DROPPED_TYPES:
            continue
        canonical = _canonical_type(label)
        # A list keeps first-seen order (a set would make the stored value
        # depend on hash order) and de-duplicates by equality.
        if canonical not in cleaned:
            cleaned.append(canonical)
    return ' '.join(cleaned)


def main():
    """Rewrite the ``types`` column of every scraped category row."""
    # Imported here so that clean_types() can be imported and exercised
    # without the project's database layer being available.
    from fty_util.common import Mysql

    conn = Mysql.createOfflineConn()
    try:
        # Query the scraped iQiyi and Tencent Video categories.
        rows = Mysql.getAll(_SELECT_SQL, conn=conn)
        # ``or ()`` tolerates a falsy result for an empty table.
        # NOTE(review): confirm what Mysql.getAll returns when no rows match.
        for row in rows or ():
            types = row['types']
            if not types:
                continue
            # NOTE(review): values are interpolated straight into the SQL
            # text. If Mysql.execute supports bound parameters, prefer them;
            # a label containing a quote would break this statement.
            sql = _UPDATE_SQL % (clean_types(types), row['id'])
            Mysql.execute(sql, conn=conn)
    finally:
        # Release the connection even when a query raises.
        Mysql.close(conn)


if __name__ == '__main__':
    main()