# -*- encoding:utf-8 -*-
"""K-means clustering of stock indicator time-series windows.

Each line of the input file is a Python literal (a flat list of values)
holding ``length`` periods of per-period indicator groups.  Each
``class_fic_*`` entry point slices one indicator out of every period,
fits a KMeans model on the resulting feature matrix, persists it with
joblib, and prints a small predict round trip as a sanity check.
"""
import ast

import numpy as np

length = 18  # number of periods concatenated into one sample window
j = 24       # stride between consecutive periods inside one record


def read_data(path, max_lines=400000):
    """Parse up to ``max_lines`` literal-encoded records from ``path``.

    Uses ``ast.literal_eval`` instead of ``eval``: the records are plain
    literals, and ``literal_eval`` cannot execute arbitrary code from a
    tampered data file.  Stops early at EOF — the original crashed on
    files shorter than 400000 lines because ``eval('')`` raises
    ``SyntaxError``.

    Returns a list of parsed records (lists).
    """
    lines = []
    with open(path) as f:
        for _ in range(max_lines):
            raw = f.readline().strip()
            if not raw:  # EOF (or a blank line, which the format does not contain)
                break
            lines.append(ast.literal_eval(raw))
    return lines


def _extract_features(lines, start, width, stride, periods):
    """Build the (n_samples, width*periods) feature matrix.

    For every record ``s``, concatenates
    ``s[p*stride+start : p*stride+start+width]`` for each period ``p``.
    """
    rows = []
    for s in lines:
        row = []
        for p in range(periods):
            base = p * stride + start
            row.extend(s[base:base + width])
        rows.append(row)
    return np.array(rows).reshape(len(rows), width * periods)


def _cluster_and_save(v_x, n_clusters, random_state, model_path, show_details=False):
    """Fit KMeans on ``v_x``, dump it to ``model_path``, verify the round trip.

    The predict prints mirror the original script's manual sanity check:
    the reloaded model must produce labels for held-out rows.
    """
    # Deferred imports: sklearn/joblib are heavy and only needed here.
    from sklearn.cluster import KMeans
    import joblib

    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(v_x)
    if show_details:
        print(estimator.labels_)           # cluster label of every training sample
        print(estimator.cluster_centers_)
    joblib.dump(estimator, model_path)
    print(estimator.predict(v_x[:10]))
    estimator = joblib.load(model_path)    # reload to prove the dump is usable
    print(estimator.predict(v_x[10:20]))
    return estimator


def class_fic_dmi(file_path=''):
    """Cluster the DMI slice (4 values per period, offset 5) into 20 groups."""
    lines = read_data(file_path)
    print('读取数据完毕')  # "data loading finished"
    v_x = _extract_features(lines, start=5, width=4, stride=j, periods=length)
    _cluster_and_save(v_x, n_clusters=20, random_state=5179,
                      model_path='km_dmi_18_20200225.pkl', show_details=True)


def class_fic_k(file_path=''):
    """Cluster the K-line slice (4 values per period, offset 1) into 16 groups."""
    lines = read_data(file_path)
    print('读取数据完毕')
    v_x = _extract_features(lines, start=1, width=4, stride=j, periods=length)
    _cluster_and_save(v_x, n_clusters=16, random_state=311,
                      model_path='km_k_18.pkl')


def class_fic_roc(file_path=''):
    """Cluster the ROC slice (2 values per period, stride 10) into 8 groups."""
    lines = read_data(file_path)
    print('读取数据完毕')
    # NOTE: this data layout uses a per-period stride of 10, not the
    # module-level ``j`` — matches the original local override.
    v_x = _extract_features(lines, start=8, width=2, stride=10, periods=length)
    _cluster_and_save(v_x, n_clusters=8, random_state=73011,
                      model_path='km_k_roc_8.pkl')


def class_fic_dapan(file_path=''):
    """Cluster the market-index slice (1 value per period, offset 16) into 8 groups."""
    lines = read_data(file_path)
    print('读取数据完毕')
    v_x = _extract_features(lines, start=16, width=1, stride=j, periods=length)
    _cluster_and_save(v_x, n_clusters=8, random_state=82430,
                      model_path='km_k_dapan_8.pkl')


if __name__ == '__main__':
    # class_fic_k(file_path="D:\\data\\quantization\\stock10_18_train.log")
    # class_fic_dmi(file_path="D:\\data\\quantization\\stock12_18d_train.log")
    class_fic_dapan(file_path="D:\\data\\quantization\\stock160_18d_train.log")