transform_categories.py

#!/usr/bin/env python
# coding=utf-8
import sys

from fty_util.common import Mysql

# Python 2: reload restores sys.setdefaultencoding (removed at startup) so
# the Chinese category strings handled below default to utf-8.
reload(sys)
sys.setdefaultencoding('utf8')

conn = Mysql.createOfflineConn()

# Build a lookup from every raw (scraped) category name to its standard
# category name.
cat_dict = {}
sql = """
select standard_category, reflect_category from odl.basic_categories
"""
categories = Mysql.getAll(sql, conn=conn)
for category in categories:
    standard_category = category['standard_category']
    reflect_category = category['reflect_category']
    cat_dict[reflect_category] = standard_category

# Fetch rows whose merged category column has not been computed yet.
sql = """
select tv_id, iqiyi_types, iqiyi_types_new, tengxun_types, tengxun_types_new,
    baike_types, baike_types_new, manual_types, manual_types_new
from scrapy.types_analyse
where iqiyi_tengxun_after_baike_after_manual is null
    or iqiyi_tengxun_after_baike_after_manual = ''
order by tv_id asc
"""
rows = Mysql.getAll(sql, conn=conn)
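
# For each pending show, build one standard-category set per source
# (iqiyi, tengxun, baike, manual), merge them, and write the results back.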
for row in rows:
    tv_id = row['tv_id']
    iqiyi_types = row['iqiyi_types']
    iqiyi_types_new = row['iqiyi_types_new']
    tengxun_types = row['tengxun_types']
    tengxun_types_new = row['tengxun_types_new']
    baike_types = row['baike_types']
    baike_types_new = row['baike_types_new']
    manual_types = row['manual_types']
    manual_types_new = row['manual_types_new']
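
    # Per source: reuse the *_types_new column when it is already filled,
    # otherwise map the raw space-separated tags through cat_dict
    # (tags with no mapping are dropped).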
    iqiyi_types_set = set()
    if iqiyi_types_new:
        # Already normalized on a previous run; keep the stored categories
        # instead of overwriting them with an empty string below.
        iqiyi_types_set.update(iqiyi_types_new.split(' '))
    elif iqiyi_types:
        for _type in iqiyi_types.split(' '):
            cate = cat_dict.get(_type)
            if cate is not None:
                iqiyi_types_set.add(cate)

    tengxun_types_set = set()
    if tengxun_types_new:
        tengxun_types_set.update(tengxun_types_new.split(' '))
    elif tengxun_types:
        for _type in tengxun_types.split(' '):
            cate = cat_dict.get(_type)
            if cate is not None:
                tengxun_types_set.add(cate)

    baike_types_set = set()
    if baike_types_new:
        baike_types_set.update(baike_types_new.split(' '))
    elif baike_types:
        for _type in baike_types.split(' '):
            cate = cat_dict.get(_type)
            if cate is not None:
                baike_types_set.add(cate)

    manual_types_set = set()
    if manual_types_new:
        manual_types_set.update(manual_types_new.split(' '))
    elif manual_types:
        for _type in manual_types.split(' '):
            cate = cat_dict.get(_type)
            if cate is not None:
                manual_types_set.add(cate)
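
    # Merge the per-source sets: start with iqiyi + tengxun and widen with
    # baike, then manual, keeping the first union that contains more than
    # two categories. If none does, all_types stays empty and the row is
    # picked up again on the next run.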
    all_types = set()
    if len(iqiyi_types_set | tengxun_types_set) > 2:
        all_types = iqiyi_types_set | tengxun_types_set
    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set) > 2:
        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set
    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set) > 2:
        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set
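
    # Write the normalized per-source columns and the merged result back to
    # scrapy.types_analyse.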
    sql = """
    update scrapy.types_analyse set iqiyi_types_new = %s, tengxun_types_new = %s,
        baike_types_new = %s, manual_types_new = %s,
        iqiyi_tengxun_after_baike_after_manual = %s
    where tv_id = %s
    """
    value = (' '.join(iqiyi_types_set), ' '.join(tengxun_types_set),
             ' '.join(baike_types_set), ' '.join(manual_types_set),
             ' '.join(all_types), tv_id)
    Mysql.update(sql, param=value, conn=conn)

    # Update the yxb.ad_tv_lib table (keyed by id).
    sql = """
    update yxb.ad_tv_lib set categories = %s where id = %s
    """
    value = (' '.join(all_types), tv_id)
    Mysql.update(sql, param=value, conn=conn)

    # Update the odl.ad_tv_lib table (keyed by tv_id).
    sql = """
    update odl.ad_tv_lib set categories = %s where tv_id = %s
    """
    value = (' '.join(all_types), tv_id)
    Mysql.update(sql, param=value, conn=conn)