scrapy_category_update.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #!/usr/bin/env python
  2. #coding=utf-8
  3. """清洗爬取到的分类数据
  4. """
  5. import datetime
  6. import os
  7. import sys
  8. import time
  9. from fty_util.common import Mysql
  10. reload(sys)
  11. sys.setdefaultencoding('utf8')
  12. conn = Mysql.createOfflineConn()
  13. # 查询爬取到的爱奇艺分类和腾讯视频分类
  14. sql = """
  15. select id, types from scrapy.tv_category_scrapy
  16. """
  17. rows = Mysql.getAll(sql, conn=conn)
  18. for row in rows:
  19. _id = row['id']
  20. types = row['types']
  21. if types is not None and len(types) > 0:
  22. types_set = set()
  23. for _type in types.split(' '):
  24. if _type == '生活':
  25. continue
  26. elif _type == '军旅' or _type == '军事':
  27. types_set.add('军旅')
  28. elif _type == '惊悚' or _type == '恐怖':
  29. types_set.add('恐怖')
  30. elif _type == '魔幻' or _type == '奇幻':
  31. types_set.add('奇幻')
  32. elif _type == '偶像' or _type == '时装':
  33. types_set.add('偶像')
  34. elif _type == '喜剧' or _type == '搞笑':
  35. types_set.add('喜剧')
  36. elif _type == '悬疑' or _type == '冒险' or _type == '侦探':
  37. types_set.add('悬疑')
  38. elif _type == '言情' or _type == '情感' or _type == '爱情':
  39. types_set.add('情感')
  40. elif _type == '战争' or _type == '抗日' or _type == '革命':
  41. types_set.add('战争')
  42. elif _type == '警匪' or _type == '犯罪' or _type == '刑侦':
  43. types_set.add('罪案')
  44. elif _type == '传记' or _type == '人物' or _type == '传奇' or _type == '纪实':
  45. types_set.add('传记')
  46. else:
  47. types_set.add(_type)
  48. sql = """
  49. update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
  50. """
  51. sql = sql % (' '.join(types_set), _id)
  52. Mysql.execute(sql, conn=conn)
  53. Mysql.close(conn)