get_data.py

#!/usr/bin/python
# -*- coding:utf-8 -*-
import sys
import os

# Make the parent directory importable so the shared util package can be found.
sys.path.append(os.path.abspath('..'))
from util import pgsql_util  # assumed to return query results as dict-like rows
import json

# Articles joined with their tags. The two %s placeholders are filled by
# get_articles() with an extra WHERE clause and a LIMIT/OFFSET clause.
# position('复制' in a.title) = 0 excludes titles containing '复制' ("copy"),
# i.e. duplicated articles. The aggregated tags are selected but not used
# downstream.
query_articles_sql = '''
select a.aid, a.title, a.cities, a.cid, a.other_info, array_agg(t.tag_value),
       days, a.recom, a."rank", read_count
from articles a
LEFT JOIN article_tags t on a.aid = t.aid
where %s and a.atype = '0' and cid <> 20
  and position('复制' in a.title) = 0 and read_count > 3
  and a.stock_aid is NULL
GROUP BY a.aid
%s
'''

# Paid-order count for all articles whose stock_aid points at the given aid.
query_sell_main_sql = '''
select count(*)
FROM zhima.order
where aid in (select aid from articles where stock_aid = '%s')
and pay_status = 7
'''

# Total stock for all articles whose stock_aid points at the given aid.
query_stock_main_sql = '''
select sum(maxt::int)
FROM zm_alipay
where aid in (select aid from articles where stock_aid = '%s')
and status in (1,2)
'''

# Paid-order count for a single article.
query_sell_main_sql_2 = '''
select count(*)
FROM zhima.order
where aid = '%s'
and pay_status = 7
'''

# Total stock for a single article.
query_stock_main_sql_2 = '''
select sum(maxt::int)
FROM zm_alipay
where aid = '%s'
and status in (1,2)
'''

def get_articles(filter, limit):
    """Fetch articles matching the extra WHERE clause `filter` and the
    LIMIT/OFFSET fragment `limit`, and return a list of attribute dicts."""
    rows = pgsql_util.get_rows(query_articles_sql % (filter, limit))
    attr_list = []
    for row in rows:
        attr = {}
        attr['aid'] = row['aid']
        attr['city'] = row['cities']
        attr['cid'] = row['cid']
        other_info = json.loads(row['other_info'])
        attr['dtype'] = other_info.get('dtype')
        attr['recruit'] = other_info.get('recruit')
        attr['country'] = other_info.get('country')
        # Strip the currency suffix '元' (yuan); skip articles without a price.
        price = other_info.get('price') or ''
        attr['price'] = price.replace('元', '')
        if attr['price'] == '':
            continue
        attr['days'] = row['days']
        attr['recom'] = row['recom']
        attr['rank'] = row['rank']
        attr['read_count'] = row['read_count']
        # Articles in 株洲 (Zhuzhou) share stock, so their sales and stock are
        # aggregated over all child articles whose stock_aid equals this aid.
        if row['cities'] == '株洲':
            sell_count = pgsql_util.get_rows(query_sell_main_sql % (row['aid']))[0]['count']
            stock_count = pgsql_util.get_rows(query_stock_main_sql % (row['aid']))[0]['sum']
        else:
            sell_count = pgsql_util.get_rows(query_sell_main_sql_2 % (row['aid']))[0]['count']
            stock_count = pgsql_util.get_rows(query_stock_main_sql_2 % (row['aid']))[0]['sum']
        attr['sell_count'] = sell_count
        attr['stock_count'] = stock_count
        attr_list.append(attr)
        print(attr)
    return attr_list
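
# Illustrative shape of one attr dict produced by get_articles() (values made up):
# {'aid': 123, 'city': '上海', 'cid': 24, 'dtype': 1, 'recruit': 2, 'country': '国内',
#  'price': '99', 'days': 2, 'recom': 'yes', 'rank': 3, 'read_count': 120,
#  'sell_count': 8, 'stock_count': 20}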

def to_a1(attr_list):
    """Convert attribute dicts into flat numeric feature vectors."""
    # Replace missing numeric fields with 0 before encoding.
    for attr in attr_list:
        if attr['recruit'] is None:
            attr['recruit'] = 0
        if attr['stock_count'] is None:
            attr['stock_count'] = 0
        if attr['read_count'] is None:
            attr['read_count'] = 0
        if attr['rank'] is None:
            attr['rank'] = 0
    new_attr_list = []
    for attr in attr_list:
        print(attr['aid'])
        new_attr = []
        city = attr['city']
        cid = attr['cid']
        dtype = attr['dtype']
        recruit = attr['recruit']
        country = attr['country']
        price = attr['price'].replace('起', '')  # strip the "starting from" suffix
        days = attr['days']
        recom = attr['recom']
        rank = attr['rank']
        read_count = attr['read_count']
        sell_count = attr['sell_count']
        stock_count = attr['stock_count']
        new_attr.extend(to_list_attr(city, open_city_list))
        new_attr.extend(to_list_attr(cid, open_cid_list))
        new_attr.extend(to_list_attr(dtype, open_dtype_list))
        new_attr.extend(to_list_attr(recruit, open_recruit_list))
        new_attr.extend(to_list_attr(country, open_country_list))
        new_attr.append(float(price))
        new_attr.append(float(days))
        new_attr.extend(to_list_attr(recom, open_recom_list))
        new_attr.append(rank)
        new_attr.append(read_count)
        new_attr.append(stock_count)
        new_attr.append(sell_count)
        print(new_attr)
        new_attr_list.append(new_attr)
    return new_attr_list
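
# Layout of each encoded vector, in order: one-hot(city) + one-hot(cid) +
# one-hot(dtype) + one-hot(recruit) + one-hot(country) + [price, days] +
# one-hot(recom) + [rank, read_count, stock_count, sell_count].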

# Categorical vocabularies used for one-hot encoding.
open_city_list = ['北京', '长沙', '成都', '重庆', '佛山', '福州', '广州', '杭州', '合肥', '济南',
                  '南京', '宁波', '青岛', '上海', '深圳', '苏州', '天津', '武汉', '厦门', '西安',
                  '郑州', '株洲']
open_cid_list = [23, 24, 25, 26, 27, 28, 29]
open_dtype_list = [0, 1, 2, 3, 4, 5]
open_recruit_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
open_country_list = ['港澳台', '国内', '新加坡', '英国', '日本', '泰国', '美国']
open_recom_list = ['yes', 'no']

def city_attr(city):
    return to_list_attr(city, open_city_list)

def cid_attr(cid):
    return to_list_attr(cid, open_cid_list)

def to_list_attr(item, a_list):
    """One-hot encode `item` against the vocabulary `a_list`."""
    c_list = [0 for i in a_list]
    try:
        idx = a_list.index(item)
    except ValueError:
        # Unknown items (including None) fall back to index 0.
        idx = 0
    c_list[idx] = 1
    return c_list
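
# Illustrative example: to_list_attr('上海', open_city_list) returns a 22-element
# vector with a 1 at the position of '上海' and 0 elsewhere, while
# to_list_attr(None, open_city_list) silently maps to index 0 ('北京').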

def to_file(data_list, file_name):
    """Write one sample per line as [feature_vector, [label]]."""
    with open(file_name, "w") as f:
        for line in data_list:
            line = [line[:-2], [line[-1]]]
            f.write(str(line) + "\n")
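
# Each written line pairs the feature vector minus its last two entries with
# sell_count as the label, e.g. (made-up numbers): [[0, 1, 0, ..., 3, 120], [8]].
# Note that stock_count (line[-2]) is dropped at this point.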

def ge_train():
    attr_list = get_articles("a.crt_time > '2018-01-01' ", "limit 5000")
    new_attr_list = to_a1(attr_list)
    to_file(new_attr_list, "train_data")

def ge_test():
    attr_list = get_articles("a.crt_time > '2019-09-01' ", "limit 200 OFFSET 0")
    new_attr_list = to_a1(attr_list)
    to_file(new_attr_list, "test_data")

if __name__ == '__main__':
    # ge_train()
    ge_test()