#!/usr/bin/python # -*- coding:utf-8 -*- import sys import os sys.path.append(os.path.abspath('..')) from util import pgsql_util import json query_articles_sql = ''' select a.aid,a.title,a.cities,a.cid,a.other_info,"array_agg"(t.tag_value),days,a.recom,a."rank",read_count from articles a LEFT JOIN article_tags t on a.aid = t.aid where %s and a.atype = '0' and cid <> 20 AND position('复制' in a.title) = 0 and read_count > 3 and a.stock_aid is NULL GROUP BY a.aid %s ''' query_sell_main_sql = ''' select count(*) FROM zhima.order where aid in (select aid from articles where stock_aid = '%s') and pay_status = 7 ''' query_stock_main_sql = ''' select sum(maxt::int) FROM zm_alipay where aid in (select aid from articles where stock_aid = '%s') and status in (1,2) ''' query_sell_main_sql_2 = ''' select count(*) FROM zhima.order where aid = '%s' and pay_status = 7 ''' query_stock_main_sql_2 = ''' select sum(maxt::int) FROM zm_alipay where aid = '%s' and status in (1,2) ''' def get_articles(filter, limit): rows = pgsql_util.get_rows(query_articles_sql % (filter, limit)) attr_list = [] for row in rows: attr = {} attr['aid'] = row['aid'] attr['city'] = row['cities'] attr['cid'] = row['cid'] other_info = json.loads(row['other_info']) attr['dtype'] = other_info.get('dtype') attr['recruit'] = other_info.get('recruit') attr['country'] = other_info.get('country') attr['price'] = other_info.get('price').replace('元', '') if attr['price'] == '': continue attr['days'] = row['days'] attr['recom'] = row['recom'] attr['rank'] = row['rank'] attr['read_count'] = row['read_count'] if row['cities'] == '株洲': sell_count = pgsql_util.get_rows(query_sell_main_sql % (row['aid']))[0]['count'] stock_count = pgsql_util.get_rows(query_stock_main_sql % (row['aid']))[0]['sum'] else: sell_count = pgsql_util.get_rows(query_sell_main_sql_2 % (row['aid']))[0]['count'] stock_count = pgsql_util.get_rows(query_stock_main_sql_2 % (row['aid']))[0]['sum'] attr['sell_count'] = sell_count attr['stock_count'] = stock_count attr_list.append(attr) print(attr) return attr_list def to_a1(attr_list): for attr in attr_list: if attr['recruit'] is None: attr['recruit'] = 0 if attr['stock_count'] is None: attr['stock_count'] = 0 if attr['read_count'] is None: attr['read_count'] = 0 if attr['rank'] is None: attr['rank'] = 0 new_attr_list = [] for attr in attr_list: print(attr['aid']) new_attr = [] city = attr['city'] cid = attr['cid'] dtype = attr['dtype'] recruit = attr['recruit'] country = attr['country'] price = attr['price'].replace('起', '') days = attr['days'] recom = attr['recom'] rank = attr['rank'] read_count = attr['read_count'] sell_count = attr['sell_count'] stock_count = attr['stock_count'] new_attr.extend(to_list_attr(city, open_city_list)) new_attr.extend(to_list_attr(cid, open_cid_list)) new_attr.extend(to_list_attr(dtype, open_dtype_list)) new_attr.extend(to_list_attr(recruit, open_recruit_list)) new_attr.extend(to_list_attr(country, open_country_list)) new_attr.append(float(price)) new_attr.append(float(days)) new_attr.extend(to_list_attr(recom, open_recom_list)) new_attr.append(rank) new_attr.append(read_count) new_attr.append(stock_count) new_attr.append(sell_count) print(new_attr) new_attr_list.append(new_attr) return new_attr_list open_city_list = ['北京', '长沙', '成都', '重庆', '佛山', '福州', '广州', '杭州', '合肥', '济南', '南京', '宁波', '青岛', '上海', '深圳', '苏州', '天津', '武汉', '厦门', '西安', '郑州', '株洲'] open_cid_list = [23,24,25,26,27,28,29] open_dtype_list = [0,1,2,3,4,5] open_recruit_list = [0,1,2,3,4,5,6,7,8,9] open_country_list = ['港澳台', '国内', '新加坡', '英国', '日本', '泰国', '美国'] open_recom_list = ['yes', 'no'] def city_attr(city): return to_list_attr(city, open_city_list) def cid_attr(cid): return to_list_attr(cid, open_cid_list) def to_list_attr(item, a_list): c_list = [0 for i in a_list] try: idx = a_list.index(item) except: idx = 0 c_list[idx] = 1 return c_list def to_file(data_list, file_name): with open(file_name, "w") as f: for line in data_list: line = [line[:-2], [line[-1]]] f.write(str(line) + "\n") def ge_train(): attr_list = get_articles("a.crt_time > '2018-01-01' ", "limit 5000") new_attr_list = to_a1(attr_list) to_file(new_attr_list, "train_data") def ge_test(): attr_list = get_articles("a.crt_time > '2019-09-01' ", "limit 200 OFFSET 0") new_attr_list = to_a1(attr_list) to_file(new_attr_list, "test_data") if __name__ == '__main__': # ge_train() ge_test()