12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- #!/usr/bin/env python
- #coding=utf-8
- """清洗爬取到的分类数据
- """
- import datetime
- import os
- import sys
- import time
- from fty_util.common import Mysql
# Python 2 only: make utf-8 the implicit codec so that the byte-string
# literals below can be compared with (and joined to) unicode values.
reload(sys)
sys.setdefaultencoding('utf8')

# Category name that is dropped entirely during cleaning.
_DROPPED_TYPE = '生活'

# Ordered (aliases, canonical name) pairs: any alias on the left is stored
# as the canonical name on the right.  Names matching no alias are kept
# unchanged.
_TYPE_ALIASES = (
    (('军旅', '军事'), '军旅'),
    (('惊悚', '恐怖'), '恐怖'),
    (('魔幻', '奇幻'), '奇幻'),
    (('偶像', '时装'), '偶像'),
    (('喜剧', '搞笑'), '喜剧'),
    (('悬疑', '冒险', '侦探'), '悬疑'),
    (('言情', '情感', '爱情'), '情感'),
    (('战争', '抗日', '革命'), '战争'),
    (('警匪', '犯罪', '刑侦'), '罪案'),
    (('传记', '人物', '传奇', '纪实'), '传记'),
)

# Fetch the scraped iQiyi and Tencent Video categories.
_SELECT_SQL = """
select id, types from scrapy.tv_category_scrapy
"""

_UPDATE_SQL = """
update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
"""


def _canonical_type(name):
    """Return the canonical category name for *name*.

    Membership is tested on tuples, which compares with ``==`` exactly like
    the original ``elif`` chain.  A dict/set lookup is deliberately avoided:
    in Python 2 a unicode value and a byte-string literal can compare equal
    yet hash differently, so a hashed lookup could miss.
    """
    for aliases, canonical in _TYPE_ALIASES:
        if name in aliases:
            return canonical
    return name


def _normalize_types(types):
    """Return the cleaned, space-separated category string for *types*.

    *types* is a space-separated list of category names.  Empty tokens
    (produced by leading, trailing or repeated spaces) and the dropped
    category are skipped, aliases are merged into their canonical name,
    and duplicates are removed.  The result is sorted so that repeated
    runs write the same value instead of depending on set iteration order.
    """
    cleaned = set()
    for name in types.split(' '):
        if not name or name == _DROPPED_TYPE:
            continue
        cleaned.add(_canonical_type(name))
    return ' '.join(sorted(cleaned))


conn = Mysql.createOfflineConn()
try:
    rows = Mysql.getAll(_SELECT_SQL, conn=conn)
    # NOTE(review): guards against getAll returning a falsy value when the
    # table is empty -- confirm the helper's actual return contract.
    for row in rows or ():
        types = row['types']
        if not types:
            # Nothing to clean for NULL / empty values.
            continue
        # NOTE(review): the values are interpolated straight into the SQL
        # text, so a scraped type containing a single quote would break (or
        # alter) the statement.  Switch to bound parameters if
        # Mysql.execute supports them -- confirm against the helper's API.
        sql = _UPDATE_SQL % (_normalize_types(types), row['id'])
        Mysql.execute(sql, conn=conn)
finally:
    # Release the connection even when a query or update raises.
    Mysql.close(conn)
|