scrapy_category_clean.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. #!/usr/bin/env python
  2. #coding=utf-8
  3. """清洗爬取到的分类数据
  4. 流程:爬取爱奇艺数据,爬取腾讯数据 -> 将两个分类去重合并 -> 将分类处理到关联表中,每个tv_id对应一个分类
  5. """
  6. import datetime
  7. import os
  8. import sys
  9. import time
  10. from fty_util.common import Mysql
  11. reload(sys)
  12. sys.setdefaultencoding('utf8')
  13. conn = Mysql.createOfflineConn()
  14. # 查询爬取到的爱奇艺分类和腾讯视频分类
  15. sql = """
  16. select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
  17. """
  18. rows = Mysql.getAll(sql, conn=conn)
  19. for row in rows:
  20. _id = row['id']
  21. tv_name = row['tv_name']
  22. iqiyi_types = row['iqiyi_types']
  23. tengxun_types = row['tengxun_types']
  24. all_types = set()
  25. if iqiyi_types is not None and len(iqiyi_types) > 0:
  26. for iqiyi_type in iqiyi_types.split(' '):
  27. all_types.add(iqiyi_type)
  28. if tengxun_types is not None and len(tengxun_types) > 0:
  29. for tengxun_type in tengxun_types.split(' '):
  30. all_types.add(tengxun_type)
  31. sql = """
  32. update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
  33. """
  34. sql = sql % (' '.join(all_types), _id)
  35. Mysql.execute(sql, conn=conn)
  36. Mysql.close(conn)