Browse Source

Run k-means clustering before training

yufeng0528 4 years ago
parent
commit
9c75846a69
3 changed files with 78 additions and 24 deletions
  1. 14 3
      stock/dnn_predict.py
  2. 47 14
      stock/dnn_train.py
  3. 17 7
      stock/kmeans.py

+ 14 - 3
stock/dnn_predict.py

@@ -56,10 +56,21 @@ def predict(file_path='', model_path='15min_dnn_seq.h5'):
56 56
                 up_num = up_num + 1
57 57
 
58 58
             i = i + 1
59
+    if up_num == 0:
60
+        up_num = 1
59 61
     print('DNN', up_right, up_num, up_right/up_num)
60
-    return win_dnn
62
+    return win_dnn,up_right/up_num
63
+
64
+
65
+def multi_predict():
66
+    r = 0
67
+    for x in [7,9,12,14,15]:
68
+        win_dnn, ratio = predict(file_path='D:\\data\\quantization\\kmeans\\stock2_10_' + str(x) + '_test.log', model_path='5d_dnn_seq_' + str(x) + '.h5')
69
+        r = r + ratio
70
+    print(r)
61 71
 
62 72
 
63 73
 if __name__ == '__main__':
64
-    predict(file_path='D:\\data\\quantization\\stock6_5_test.log', model_path='5d_dnn_seq.h5')
65
-    # predict(file_path='D:\\data\\quantization\\stock6_test.log', model_path='15m_dnn_seq.h5')
74
+    # predict(file_path='D:\\data\\quantization\\stock6_5_test.log', model_path='5d_dnn_seq.h5')
75
+    # predict(file_path='D:\\data\\quantization\\stock6_test.log', model_path='15m_dnn_seq.h5')
76
+    multi_predict()

+ 47 - 14
stock/dnn_train.py

@@ -7,13 +7,16 @@ import random
7 7
 from keras import regularizers
8 8
 from keras.models import load_model
9 9
 from imblearn.over_sampling import RandomOverSampler
10
+import joblib
10 11
 
11 12
 
12 13
 def read_data(path):
13 14
     lines = []
14 15
     with open(path) as f:
15
-        for x in range(30000):
16
-            lines.append(eval(f.readline().strip()))
16
+        # for x in range(30000):
17
+        # lines.append(eval(f.readline().strip()))
18
+        for line in f.readlines()[:]:
19
+            lines.append(eval(line.strip()))
17 20
 
18 21
     random.shuffle(lines)
19 22
     print('读取数据完毕')
@@ -36,25 +39,53 @@ def read_data(path):
36 39
     return X_resampled,y_resampled,np.array(test_x),np.array(test_y)
37 40
 
38 41
 
42
+def resample(path):
43
+    lines = []
44
+    with open(path) as f:
45
+        for x in range(160000):
46
+            lines.append(eval(f.readline().strip()))
47
+    estimator = joblib.load('km.pkl')
48
+
49
+    x = 17
50
+    for line in lines:
51
+        v = line[1:x*10 + 1]
52
+        v = np.array(v)
53
+        v = v.reshape(10, x)
54
+        v = v[:,0:4]
55
+        v = v.reshape(1, 40)
56
+        # print(v)
57
+        r = estimator.predict(v)
58
+        with open('D:\\data\\quantization\\kmeans\\stock2_10_' + str(r[0]) + '.log', 'a') as f:
59
+            f.write(str(line) + '\n')
60
+
61
+
62
+def mul_train():
63
+    # for x in range(0, 16):
64
+    for x in [7,9,12,14,15]:
65
+        train(input_dim=176, result_class=5, file_path="D:\\data\\quantization\\kmeans\\stock2_10_" + str(x) + ".log",
66
+              model_name='5d_dnn_seq_' + str(x) + '.h5')
67
+
68
+
39 69
 def train(input_dim=400, result_class=3, file_path="D:\\data\\quantization\\stock6.log", model_name=''):
40 70
     train_x,train_y,test_x,test_y=read_data(file_path)
41 71
 
42 72
     model = Sequential()
43 73
     model.add(Dense(units=120+input_dim, input_dim=input_dim,  activation='relu'))
44
-    model.add(Dense(units=120+input_dim, activation='relu',kernel_regularizer=regularizers.l2(0.001)))
74
+    model.add(Dense(units=120+input_dim, activation='relu',kernel_regularizer=regularizers.l1(0.001)))
45 75
     model.add(Dense(units=120+input_dim, activation='relu'))
46
-    model.add(Dropout(0.2))
47
-    model.add(Dense(units=60+input_dim, activation='selu'))
48
-    model.add(Dropout(0.2))
76
+    model.add(Dense(units=120 + input_dim, activation='relu'))
77
+    model.add(Dropout(0.1))
49 78
     model.add(Dense(units=60+input_dim, activation='selu'))
50 79
     # model.add(Dropout(0.2))
80
+    # model.add(Dense(units=60+input_dim, activation='selu'))
81
+    # model.add(Dropout(0.2))
51 82
     model.add(Dense(units=512, activation='relu'))
52 83
 
53 84
     model.add(Dense(units=result_class, activation='softmax'))
54 85
     model.compile(loss='categorical_crossentropy', optimizer="adam",metrics=['accuracy'])
55 86
 
56 87
     print("Starting training ")
57
-    model.fit(train_x, train_y, batch_size=32, epochs=56, shuffle=True)
88
+    model.fit(train_x, train_y, batch_size=32, epochs=117 + int(len(train_x)/900), shuffle=True)
58 89
     score = model.evaluate(test_x, test_y)
59 90
     print(score)
60 91
     print('Test score:', score[0])
@@ -62,13 +93,15 @@ def train(input_dim=400, result_class=3, file_path="D:\\data\\quantization\\stoc
62 93
 
63 94
     model.save(model_name)
64 95
 
65
-    model=None
66
-    model=load_model(model_name)
67
-    result=model.predict(test_x)
68
-    print(result)
69
-    print(test_y)
96
+    # model=None
97
+    # model=load_model(model_name)
98
+    # result=model.predict(test_x)
99
+    # print(result)
100
+    # print(test_y)
70 101
 
71 102
 
72 103
 if __name__ == '__main__':
73
-    train(input_dim=86, result_class=5, file_path="D:\\data\\quantization\\stock6_5.log", model_name='5d_dnn_seq.h5')
74
-    # train(input_dim=400, result_class=3, file_path="D:\\data\\quantization\\stock6.log", model_name='15m_dnn_seq.h5')
104
+    # train(input_dim=86, result_class=5, file_path="D:\\data\\quantization\\stock6_5.log", model_name='5d_dnn_seq.h5')
105
+    # train(input_dim=400, result_class=3, file_path="D:\\data\\quantization\\stock6.log", model_name='15m_dnn_seq.h5')
106
+    # resample('D:\\data\\quantization\\stock6_5.log')
107
+    mul_train()

+ 17 - 7
stock/kmeans.py

@@ -2,20 +2,20 @@
2 2
 from sklearn.cluster import KMeans
3 3
 import numpy as np
4 4
 from annoy import AnnoyIndex
5
-
5
+import joblib
6 6
 
7 7
 def read_data(path):
8 8
     lines = []
9 9
     with open(path) as f:
10
-        for x in range(160000):
10
+        for x in range(100000):
11 11
             line = eval(f.readline().strip())
12 12
             # if line[-1][0] == 1 or line[-1][1] == 1:
13 13
             lines.append(line)
14 14
 
15 15
     return lines
16 16
 
17
-length = 20
18
-def class_fic(file_path=''):
17
+length = 10
18
+def  class_fic(file_path=''):
19 19
     lines = read_data(file_path)
20 20
     print('读取数据完毕')
21 21
     size = len(lines[0])
@@ -24,9 +24,19 @@ def class_fic(file_path=''):
24 24
     v_x = train_x.reshape(train_x.shape[0], 4*length)
25 25
     stock_list = [s[size - 2] for s in lines]
26 26
 
27
+    estimator = KMeans(n_clusters=16, random_state=9)
28
+    estimator.fit(v_x)
29
+    label_pred = estimator.labels_  # 获取聚类标签
30
+    centroids = estimator.cluster_centers_
31
+    joblib.dump(estimator , 'km.pkl')
32
+
33
+    print(estimator.predict(v_x[:10]))
34
+
35
+    estimator = joblib.load('km.pkl')
36
+    print(estimator.predict(v_x[10:20]))
27 37
     # annoy_sim(v_x)
28
-    print('save数据完毕')
29
-    return find_annoy(train_y, stock_list)
38
+    # print('save数据完毕')
39
+    # return find_annoy(train_y, stock_list)
30 40
 
31 41
 def annoy_sim(lines):
32 42
     tree = 30
@@ -93,4 +103,4 @@ def find_annoy(lines, stock_list):
93 103
 
94 104
 
95 105
 if __name__ == '__main__':
96
-    class_fic(file_path="D:\\data\\quantization\\stock2_20.log")
106
+    class_fic(file_path="D:\\data\\quantization\\stock2_10.log")