Browse Source

产生测试数据和训练数据

yufeng0528 4 years ago
parent
commit
1af77291d0
1 changed file with 21 additions and 10 deletions
  1. 21 10
      bbztx/get_data.py

+ 21 - 10
bbztx/get_data.py

@@ -7,10 +7,10 @@ query_articles_sql = '''
7 7
 select a.aid,a.title,a.cities,a.cid,a.other_info,"array_agg"(t.tag_value),days,a.recom,a."rank",read_count
8 8
 from articles a
9 9
 LEFT JOIN article_tags t on a.aid = t.aid
10
-where a.crt_time > '2018-01-01' and a.atype = '0'
10
+where %s and a.atype = '0'
11 11
 and a.stock_aid is NULL
12 12
 GROUP BY a.aid
13
-limit 100 OFFSET 0
13
+limit 200 OFFSET 0
14 14
 '''
15 15
 
16 16
 query_sell_main_sql = '''
@@ -42,8 +42,8 @@ and status in (1,2)
42 42
 '''
43 43
 
44 44
 
45
-def get_articles():
46
-    rows = pgsql_util.get_rows(query_articles_sql)
45
+def get_articles(filter):
46
+    rows = pgsql_util.get_rows(query_articles_sql % (filter))
47 47
 
48 48
     attr_list = []
49 49
     for row in rows:
@@ -93,7 +93,7 @@ def to_a1(attr_list):
93 93
         dtype = attr['dtype']
94 94
         recruit = attr['recruit']
95 95
         country = attr['country']
96
-        price = attr['price']
96
+        price = attr['price'].replace('起', '')
97 97
         days = attr['days']
98 98
         recom = attr['recom']
99 99
         rank = attr['rank']
@@ -146,14 +146,25 @@ def to_list_attr(item, a_list):
146 146
     return c_list
147 147
 
148 148
 
149
-def to_file(data_list):
150
-    with open("train_data", "w") as f:
149
+def to_file(data_list, file_name):
150
+    with open(file_name, "w") as f:
151 151
         for line in data_list:
152 152
             line = [line[:-2], [line[-1]]]
153 153
             f.write(str(line) + "\n")
154 154
 
155 155
 
156
-if __name__ == '__main__':
157
-    attr_list = get_articles()
156
+def ge_train():
157
+    attr_list = get_articles("a.crt_time > '2018-01-01' ")
158
+    new_attr_list = to_a1(attr_list)
159
+    to_file(new_attr_list, "train_data")
160
+
161
+
162
+def ge_test():
163
+    attr_list = get_articles("a.crt_time > '2019-09-01' ")
158 164
     new_attr_list = to_a1(attr_list)
159
-    to_file(new_attr_list)
165
+    to_file(new_attr_list, "test_data")
166
+
167
+
168
+if __name__ == '__main__':
169
+    ge_train()
170
+    ge_test()