1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- #!/usr/bin/env python
- #coding=utf-8
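"""Clean the scraped category data.

Workflow: scrape the iQiyi data and the Tencent data -> de-duplicate and merge
the two category lists -> write the categories to the association table, with
one category entry per tv_id.
"""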
- import datetime
- import os
- import sys
- import time
- from fty_util.common import Mysql
- reload(sys)
- sys.setdefaultencoding('utf8')
# Clean the scraped iQiyi data.
def iqiyi_content_clean():
    """Placeholder for the iQiyi cleaning step; currently does nothing."""
    return None
# Clean the scraped Tencent data.
def tengxun_content_clean():
    """Placeholder for the Tencent cleaning step; currently does nothing."""
    return None
# Merge the iQiyi and Tencent Video categories of every scraped row into one
# de-duplicated, space-separated list and store it in the `types` column.
conn = Mysql.createOfflineConn()
try:
    # Fetch the scraped iQiyi and Tencent Video categories.
    sql = """
    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
    """
    rows = Mysql.getAll(sql, conn=conn)

    # The statement text never changes, so build the template once.
    # NOTE(review): the values are interpolated straight into the SQL text, so
    # a scraped category containing a quote breaks (or injects into) the
    # statement. Switch to a parameterized call if Mysql.execute supports
    # one -- its signature is not visible from this file.
    update_sql = """
    update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
    """

    # `rows or ()` guards against the helper returning None/False for an empty
    # result set (assumption -- confirm against fty_util.common.Mysql).
    for row in rows or ():
        all_types = set()
        for source_types in (row['iqiyi_types'], row['tengxun_types']):
            # Skip a source whose column is NULL or empty.
            if source_types:
                # split() without a separator drops the empty tokens that
                # split(' ') yields for repeated/leading/trailing spaces.
                all_types.update(source_types.split())

        # Sort so the stored value is deterministic across runs; iterating a
        # set directly yields an arbitrary order.
        merged_types = ' '.join(sorted(all_types))
        Mysql.execute(update_sql % (merged_types, row['id']), conn=conn)
finally:
    # Release the connection even when one of the queries raises.
    Mysql.close(conn)
|