123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- # -*- encoding:utf-8 -*-
from itertools import islice

import joblib
import numpy as np
from sklearn.cluster import KMeans
# from annoy import AnnoyIndex
def read_data(path, max_lines=20000):
    """Parse records from a text file that holds one Python expression per line.

    At most ``max_lines`` physical lines are read. Reading stops cleanly at
    end of file, and blank lines are skipped, so a file shorter than
    ``max_lines`` no longer fails on an empty string.

    Args:
        path: Path of the file to read.
        max_lines: Upper bound on the number of lines consumed from the file.

    Returns:
        A list with one evaluated object per non-blank line, in file order.
    """
    records = []
    with open(path) as f:
        for raw in islice(f, max_lines):
            text = raw.strip()
            if not text:
                continue
            # SECURITY: eval() executes whatever expression the line contains.
            # Only use this on files from a trusted source; if the records are
            # plain literals, ast.literal_eval is the safer choice.
            records.append(eval(text))
    return records
# Number of periods concatenated into one feature vector; class_fic takes
# four values per period, so vectors have 4 * length entries.
length = 18  # number of periods
# Stride between the slots of consecutive periods inside one flattened record
# (class_fic slices offsets 5..8 of each j-wide slot).
j = 20
def class_fic(file_path=''):
    """Fit a 12-cluster KMeans model on per-period features and persist it.

    Records are loaded with ``read_data``. For each of the ``length`` periods,
    the four values at offsets 5..8 of that period's ``j``-wide slot are
    appended to the record's feature row, giving ``4 * length`` values per
    row. The fitted model is dumped to ``km_dmi_18.pkl``, reloaded from that
    file, and the predicted cluster ids of rows 0-9 (in-memory model) and
    rows 10-19 (reloaded model) are printed as a round-trip check.

    Args:
        file_path: Path of the record file passed to ``read_data``.

    Raises:
        ValueError: If the file yields no records, or if a record is too
            short to provide ``4 * length`` feature values (raised by the
            reshape).
    """
    lines = read_data(file_path)
    print('读取数据完毕')
    if not lines:
        raise ValueError('no records were read from %r' % (file_path,))

    feature_rows = []
    for record in lines:
        row = []
        for period in range(length):
            start = period * j + 5
            # extend() grows the row in place instead of copying it each time.
            row.extend(record[start:start + 4])
        feature_rows.append(row)

    # The reshape doubles as a shape check: it fails if any row is short.
    v_x = np.array(feature_rows).reshape(len(feature_rows), 4 * length)

    estimator = KMeans(n_clusters=12, random_state=129)
    estimator.fit(v_x)
    joblib.dump(estimator, 'km_dmi_18.pkl')
    print(estimator.predict(v_x[:10]))

    # Reload from disk to confirm the persisted model is usable.
    estimator = joblib.load('km_dmi_18.pkl')
    print(estimator.predict(v_x[10:20]))
def annoy_sim(lines):
    """Build an angular Annoy index over ``lines`` and save it to disk.

    Each vector is added under its position in ``lines`` as the item id. The
    index is built with 30 trees and written to ``stock_20d.ann``.

    NOTE(review): ``AnnoyIndex`` must be in scope for this to run; the
    ``from annoy import AnnoyIndex`` line at the top of the file is
    currently commented out.

    Args:
        lines: Iterable of vectors, each with ``length * 4`` components.
    """
    n_trees = 30
    # Vector dimension is four values for each of the `length` periods.
    ann_index = AnnoyIndex(length * 4, metric="angular")
    for item_id, vector in enumerate(lines):
        ann_index.add_item(item_id, vector)
    ann_index.build(n_trees)
    ann_index.save('stock_20d.ann')
def find_annoy(lines, stock_list):
    """Score every item by the labels of its nearest neighbours in the saved index.

    Loads ``stock_20d.ann`` and, for each item ``i``, fetches its 10 nearest
    neighbours. Neighbours closer than 0.4 are counted; each contributes 1 if
    its label has a 1 in position 0 or 1, or 0.5 if it has a 1 in position 2.
    An item is a hit when more than one neighbour was counted and the
    score-to-count ratio exceeds 0.38. Hits whose ``stock_list[i][1]`` is
    greater than 20181101 are printed and collected.

    Args:
        lines: Per-item label sequences, indexed by the same ids used when
            the index was built (presumably the last field of each record;
            verify against the caller).
        stock_list: Per-item identifying records; element 1 is compared with
            20181101 (presumably a yyyymmdd date; TODO confirm).

    Returns:
        A list of ``[stock_list[i], lines[i]]`` pairs for the collected hits.
    """
    t = AnnoyIndex(length * 4, metric="angular")
    t.load('stock_20d.ann')
    # `num` is only printed in the summary; nothing in this function
    # increments it. Kept so the printed output format is unchanged.
    num = 0
    right = 0
    win_dnn = []
    for i, own_label in enumerate(lines):
        neighbor_ids, distances = t.get_nns_by_item(i, 10, include_distances=True)
        total = 0
        g = 0
        # Position 0 is skipped, as in the original loop (normally the query
        # item itself).
        for neighbor_id, dist in zip(neighbor_ids[1:], distances[1:]):
            if dist >= 0.4:
                continue
            total += 1
            # Look up the neighbour's own label via its item id. The previous
            # code indexed `lines` with the position in the result list, so
            # every query was scored against the same first few records.
            label = lines[neighbor_id]
            if label[0] == 1 or label[1] == 1:
                g += 1
            elif label[2] == 1:
                g += 0.5
        if total > 1 and g / total > 0.38:
            right += 1
            if stock_list[i][1] > 20181101:
                print(stock_list[i])
                win_dnn.append([stock_list[i], own_label])
    print(right, num)
    print('find数据完毕')
    return win_dnn
if __name__ == '__main__':
    # Script entry point: cluster the records of one hard-coded log file.
    # Alternative input file, currently disabled:
    # class_fic(file_path="D:\\data\\quantization\\stock2_10.log")
    class_fic(file_path="D:\\data\\quantization\\stock9_18.log")
|