123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185 |
- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import sys
- import os
- sys.path.append(os.path.abspath('..'))
- from util import pgsql_util
- import json
- query_articles_sql = '''
- select a.aid,a.title,a.cities,a.cid,a.other_info,"array_agg"(t.tag_value),days,a.recom,a."rank",read_count
- from articles a
- LEFT JOIN article_tags t on a.aid = t.aid
- where %s and a.atype = '0' and cid <> 20 AND position('复制' in a.title) = 0 and read_count > 3
- and a.stock_aid is NULL
- GROUP BY a.aid
- %s
- '''
- query_sell_main_sql = '''
- select count(*)
- FROM zhima.order
- where aid in (select aid from articles where stock_aid = '%s')
- and pay_status = 7
- '''
- query_stock_main_sql = '''
- select sum(maxt::int)
- FROM zm_alipay
- where aid in (select aid from articles where stock_aid = '%s')
- and status in (1,2)
- '''
- query_sell_main_sql_2 = '''
- select count(*)
- FROM zhima.order
- where aid = '%s'
- and pay_status = 7
- '''
- query_stock_main_sql_2 = '''
- select sum(maxt::int)
- FROM zm_alipay
- where aid = '%s'
- and status in (1,2)
- '''
- def get_articles(filter, limit):
- rows = pgsql_util.get_rows(query_articles_sql % (filter, limit))
- attr_list = []
- for row in rows:
- attr = {}
- attr['aid'] = row['aid']
- attr['city'] = row['cities']
- attr['cid'] = row['cid']
- other_info = json.loads(row['other_info'])
- attr['dtype'] = other_info.get('dtype')
- attr['recruit'] = other_info.get('recruit')
- attr['country'] = other_info.get('country')
- attr['price'] = other_info.get('price').replace('元', '')
- if attr['price'] == '':
- continue
- attr['days'] = row['days']
- attr['recom'] = row['recom']
- attr['rank'] = row['rank']
- attr['read_count'] = row['read_count']
- if row['cities'] == '株洲':
- sell_count = pgsql_util.get_rows(query_sell_main_sql % (row['aid']))[0]['count']
- stock_count = pgsql_util.get_rows(query_stock_main_sql % (row['aid']))[0]['sum']
- else:
- sell_count = pgsql_util.get_rows(query_sell_main_sql_2 % (row['aid']))[0]['count']
- stock_count = pgsql_util.get_rows(query_stock_main_sql_2 % (row['aid']))[0]['sum']
- attr['sell_count'] = sell_count
- attr['stock_count'] = stock_count
- attr_list.append(attr)
- print(attr)
- return attr_list
- def to_a1(attr_list):
- for attr in attr_list:
- if attr['recruit'] is None:
- attr['recruit'] = 0
- if attr['stock_count'] is None:
- attr['stock_count'] = 0
- if attr['read_count'] is None:
- attr['read_count'] = 0
- if attr['rank'] is None:
- attr['rank'] = 0
- new_attr_list = []
- for attr in attr_list:
- print(attr['aid'])
- new_attr = []
- city = attr['city']
- cid = attr['cid']
- dtype = attr['dtype']
- recruit = attr['recruit']
- country = attr['country']
- price = attr['price'].replace('起', '')
- days = attr['days']
- recom = attr['recom']
- rank = attr['rank']
- read_count = attr['read_count']
- sell_count = attr['sell_count']
- stock_count = attr['stock_count']
- new_attr.extend(to_list_attr(city, open_city_list))
- new_attr.extend(to_list_attr(cid, open_cid_list))
- new_attr.extend(to_list_attr(dtype, open_dtype_list))
- new_attr.extend(to_list_attr(recruit, open_recruit_list))
- new_attr.extend(to_list_attr(country, open_country_list))
- new_attr.append(float(price))
- new_attr.append(float(days))
- new_attr.extend(to_list_attr(recom, open_recom_list))
- new_attr.append(rank)
- new_attr.append(read_count)
- new_attr.append(stock_count)
- new_attr.append(sell_count)
- print(new_attr)
- new_attr_list.append(new_attr)
- return new_attr_list
- open_city_list = ['北京', '长沙', '成都', '重庆', '佛山', '福州', '广州', '杭州', '合肥', '济南',
- '南京', '宁波', '青岛', '上海', '深圳', '苏州', '天津', '武汉', '厦门', '西安',
- '郑州', '株洲']
- open_cid_list = [23,24,25,26,27,28,29]
- open_dtype_list = [0,1,2,3,4,5]
- open_recruit_list = [0,1,2,3,4,5,6,7,8,9]
- open_country_list = ['港澳台', '国内', '新加坡', '英国', '日本', '泰国', '美国']
- open_recom_list = ['yes', 'no']
- def city_attr(city):
- return to_list_attr(city, open_city_list)
- def cid_attr(cid):
- return to_list_attr(cid, open_cid_list)
- def to_list_attr(item, a_list):
- c_list = [0 for i in a_list]
- try:
- idx = a_list.index(item)
- except:
- idx = 0
- c_list[idx] = 1
- return c_list
- def to_file(data_list, file_name):
- with open(file_name, "w") as f:
- for line in data_list:
- line = [line[:-2], [line[-1]]]
- f.write(str(line) + "\n")
- def ge_train():
- attr_list = get_articles("a.crt_time > '2018-01-01' ", "limit 5000")
- new_attr_list = to_a1(attr_list)
- to_file(new_attr_list, "train_data")
- def ge_test():
- attr_list = get_articles("a.crt_time > '2019-09-01' ", "limit 200 OFFSET 0")
- new_attr_list = to_a1(attr_list)
- to_file(new_attr_list, "test_data")
- if __name__ == '__main__':
- # ge_train()
- ge_test()
|