# scrapy_dianshiju_clean.py
#!/usr/bin/env python
#coding=utf-8
"""Clean the crawled TV-show category data.

Flow: crawl iqiyi data and Tencent (tengxun) data -> merge the two
category lists with deduplication -> write the merged categories into
the association table, one category string per tv_id.
"""
import datetime
import os
import sys
import time
from fty_util.common import Mysql
# Python 2 only: re-expose setdefaultencoding and force UTF-8 so implicit
# str/unicode conversions of the Chinese category names do not raise.
reload(sys)
sys.setdefaultencoding('utf8')
  13. # 爱奇艺数据清洗
  14. def iqiyi_content_clean():
  15. pass
  16. # 腾讯数据清洗
  17. def tengxun_content_clean():
  18. pass
  19. conn = Mysql.createOfflineConn()
  20. # 查询爬取到的爱奇艺分类和腾讯视频分类
  21. sql = """
  22. select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
  23. """
  24. rows = Mysql.getAll(sql, conn=conn)
  25. for row in rows:
  26. _id = row['id']
  27. tv_name = row['tv_name']
  28. iqiyi_types = row['iqiyi_types']
  29. tengxun_types = row['tengxun_types']
  30. all_types = set()
  31. if iqiyi_types is not None and len(iqiyi_types) > 0:
  32. for iqiyi_type in iqiyi_types.split(' '):
  33. all_types.add(iqiyi_type)
  34. if tengxun_types is not None and len(tengxun_types) > 0:
  35. for tengxun_type in tengxun_types.split(' '):
  36. all_types.add(tengxun_type)
  37. sql = """
  38. update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
  39. """
  40. sql = sql % (' '.join(all_types), _id)
  41. Mysql.execute(sql, conn=conn)
  42. Mysql.close(conn)