123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- # -*- encoding:utf-8 -*-
- from sklearn.cluster import KMeans
- import numpy as np
- # from annoy import AnnoyIndex
- import joblib
def read_data(path, max_lines=600000):
    """Load up to *max_lines* records from a text file, one record per line.

    Each non-blank line is evaluated as a Python expression and the results
    are collected in file order.

    Args:
        path: Path of the text file to read.
        max_lines: Maximum number of physical lines to consume from the file.
            Blank lines count towards this limit but produce no record.

    Returns:
        A list with one evaluated object per non-blank line. The list is
        empty when the file is empty.

    Raises:
        OSError: If the file cannot be opened.
        SyntaxError: If a non-blank line is not a valid Python expression.
    """
    lines = []
    with open(path) as f:
        for _ in range(max_lines):
            raw = f.readline()
            if not raw:
                # readline() returns '' only at EOF; stop instead of
                # evaluating an empty string (which raises SyntaxError).
                break
            text = raw.strip()
            if not text:
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            # NOTE(security): eval() executes arbitrary code found in the
            # file. Only use this on trusted logs; if every record is a plain
            # literal, ast.literal_eval would be the safe replacement.
            lines.append(eval(text))
    return lines
length = 18  # number of periods taken from each sample (loop bound during feature extraction)
j = 24  # stride between consecutive periods inside a sample: period x starts at index x * j
def class_fic_dmi(file_path='', n_clusters=20, random_state=5179,
                  model_path='km_dmi_18_20200225.pkl'):
    """Fit a KMeans model on fields 5..8 of every period and persist it.

    For each sample returned by ``read_data`` the four values at offsets
    5, 6, 7 and 8 of each of the ``length`` periods (period ``x`` starts at
    index ``x * j``) are concatenated into one feature vector of size
    ``4 * length``. Judging by the function name these are presumably DMI
    indicator values -- confirm against the log schema.

    The fitted model is written to *model_path* with joblib, reloaded, and
    used to predict a few samples as a round-trip sanity check.

    Args:
        file_path: Path of the training log passed to ``read_data``.
        n_clusters: Number of KMeans clusters.
        random_state: Seed passed to KMeans for reproducible results.
        model_path: Destination file for the serialized estimator.

    Returns:
        The estimator reloaded from *model_path*.

    Raises:
        ValueError: If the input file yields no samples.
    """
    lines = read_data(file_path)
    print('读取数据完毕')
    if not lines:
        raise ValueError('no samples read from %r' % (file_path,))

    x_list = []
    for s in lines:
        tmp_list = []
        for x in range(length):
            start = x * j + 5
            # extend() appends in place instead of rebuilding the list on
            # every period as ``tmp_list = tmp_list + ...`` did.
            tmp_list.extend(s[start:start + 4])
        x_list.append(tmp_list)

    train_x = np.array(x_list)
    # The reshape doubles as a shape check: it fails if any sample did not
    # contribute exactly 4 * length values.
    v_x = train_x.reshape(train_x.shape[0], 4 * length)

    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(v_x)
    print(estimator.labels_)  # cluster label assigned to each sample
    print(estimator.cluster_centers_)

    joblib.dump(estimator, model_path)
    print(estimator.predict(v_x[:10]))
    # Reload from disk to verify the persisted model is usable.
    estimator = joblib.load(model_path)
    print(estimator.predict(v_x[10:20]))
    # The Annoy-based similarity search that used to follow
    # (annoy_sim / find_annoy) is disabled.
    return estimator
- # length = 18 # 周期是多少
- # j = 24
def class_fic_k(file_path='', n_clusters=16, random_state=311,
                model_path='km_k_18.pkl'):
    """Fit a KMeans model on fields 1..4 of every period and persist it.

    For each sample returned by ``read_data`` the four values at offsets
    1, 2, 3 and 4 of each of the ``length`` periods (period ``x`` starts at
    index ``x * j``) are concatenated into one feature vector of size
    ``4 * length``. Judging by the function name these are presumably
    candlestick ("K-line") values -- confirm against the log schema.

    The fitted model is written to *model_path* with joblib, reloaded, and
    used to predict a few samples as a round-trip sanity check.

    Args:
        file_path: Path of the training log passed to ``read_data``.
        n_clusters: Number of KMeans clusters.
        random_state: Seed passed to KMeans for reproducible results.
        model_path: Destination file for the serialized estimator.

    Returns:
        The estimator reloaded from *model_path*.

    Raises:
        ValueError: If the input file yields no samples.
    """
    lines = read_data(file_path)
    print('读取数据完毕')
    if not lines:
        raise ValueError('no samples read from %r' % (file_path,))

    x_list = []
    for s in lines:
        tmp_list = []
        for x in range(length):
            start = x * j + 1
            # extend() appends in place instead of rebuilding the list on
            # every period as ``tmp_list = tmp_list + ...`` did.
            tmp_list.extend(s[start:start + 4])
        x_list.append(tmp_list)

    train_x = np.array(x_list)
    # The reshape doubles as a shape check: it fails if any sample did not
    # contribute exactly 4 * length values.
    v_x = train_x.reshape(train_x.shape[0], 4 * length)

    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(v_x)

    joblib.dump(estimator, model_path)
    print(estimator.predict(v_x[:10]))
    # Reload from disk to verify the persisted model is usable.
    estimator = joblib.load(model_path)
    print(estimator.predict(v_x[10:20]))
    return estimator
if __name__ == '__main__':
    # Script entry point: train the model on the selected training log.
    # Alternative runs, currently disabled:
    # class_fic(file_path="D:\\data\\quantization\\stock2_10.log")
    # class_fic_k(file_path="D:\\data\\quantization\\stock10_18_train.log")
    train_log = "D:\\data\\quantization\\stock12_18d_train.log"
    class_fic_dmi(file_path=train_log)
|