# yufeng / machine_learn
# -*- encoding:utf-8 -*-
from sklearn.cluster import KMeans
import numpy as np
# from annoy import AnnoyIndex
import joblib


def read_data(path, max_lines=600000):
    """Read rows from a text file that holds one Python expression per line.

    Reading stops at end of file or after ``max_lines`` physical lines,
    whichever comes first, so files shorter than the cap no longer fail on
    an empty ``readline()`` result.  Blank lines are skipped (they still
    count towards ``max_lines``).

    Args:
        path: Path of the file to read.
        max_lines: Maximum number of physical lines to consume from the
            file; ``None`` reads the whole file.

    Returns:
        A list with one evaluated object per non-blank line, in file order.
    """
    lines = []
    with open(path) as f:
        for count, raw in enumerate(f):
            if max_lines is not None and count >= max_lines:
                break
            text = raw.strip()
            if not text:
                continue
            # SECURITY: eval() executes whatever the line contains.  Only use
            # this on files you produced yourself; if every row is a plain
            # literal, ast.literal_eval is the safe replacement.
            lines.append(eval(text))

    return lines


length = 18  # number of periods sliced out of each row
j = 24  # multiplier for the period index when slicing a row (x*j + offset), i.e. the stride between periods
def class_fic_dmi(file_path='', n_clusters=20, random_state=5179,
                  model_path='km_dmi_18_20200225.pkl'):
    """Fit a KMeans model on values 5..8 of every period and save it.

    Each row returned by ``read_data`` is sliced ``length`` times: for period
    ``x`` the four values ``row[x*j+5 : x*j+9]`` are taken, which yields a
    feature vector of ``4 * length`` values per row.  (Presumably these are
    the DMI indicator columns -- confirm against the code that writes the
    log.)

    The fitted estimator is dumped to ``model_path`` with joblib, then loaded
    back and used for a small prediction as a round-trip check.

    Args:
        file_path: Path of the training log passed to ``read_data``.
        n_clusters: Number of clusters for KMeans.
        random_state: Seed passed to KMeans.
        model_path: File the fitted estimator is written to.

    Raises:
        ValueError: If no rows were read from ``file_path``.
    """
    lines = read_data(file_path)
    print('读取数据完毕')
    if not lines:
        raise ValueError('no rows were read from %r' % (file_path,))

    x_list = []
    for s in lines:
        row = []
        for period in range(length):
            start = period * j + 5
            row.extend(s[start:start + 4])
        x_list.append(row)
    train_x = np.array(x_list)
    # No-op for well-formed rows; fails loudly when a row was too short to
    # supply 4 * length values.
    v_x = train_x.reshape(train_x.shape[0], 4 * length)

    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(v_x)

    print(estimator.labels_)  # cluster label assigned to every training row
    print(estimator.cluster_centers_)

    joblib.dump(estimator, model_path)

    print(estimator.predict(v_x[:10]))

    # Round-trip check: the reloaded model must still be able to predict.
    estimator = joblib.load(model_path)
    check_rows = v_x[10:20]
    if len(check_rows):
        print(estimator.predict(check_rows))
    # NOTE: an Annoy-based similarity step (annoy_sim / find_annoy) used to
    # follow here and is currently disabled.

# length = 18  # number of periods sliced out of each row
# j = 24
def class_fic_k(file_path='', n_clusters=16, random_state=311,
                model_path='km_k_18.pkl'):
    """Fit a KMeans model on values 1..4 of every period and save it.

    Each row returned by ``read_data`` is sliced ``length`` times: for period
    ``x`` the four values ``row[x*j+1 : x*j+5]`` are taken, which yields a
    feature vector of ``4 * length`` values per row.  (Presumably these are
    the candlestick/K-line columns -- confirm against the code that writes
    the log.)

    The fitted estimator is dumped to ``model_path`` with joblib, then loaded
    back and used for a small prediction as a round-trip check.

    Args:
        file_path: Path of the training log passed to ``read_data``.
        n_clusters: Number of clusters for KMeans.
        random_state: Seed passed to KMeans.
        model_path: File the fitted estimator is written to.

    Raises:
        ValueError: If no rows were read from ``file_path``.
    """
    lines = read_data(file_path)
    print('读取数据完毕')
    if not lines:
        raise ValueError('no rows were read from %r' % (file_path,))

    x_list = []
    for s in lines:
        row = []
        for period in range(length):
            start = period * j + 1
            row.extend(s[start:start + 4])
        x_list.append(row)
    train_x = np.array(x_list)
    # No-op for well-formed rows; fails loudly when a row was too short to
    # supply 4 * length values.
    v_x = train_x.reshape(train_x.shape[0], 4 * length)

    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(v_x)
    joblib.dump(estimator, model_path)

    print(estimator.predict(v_x[:10]))

    # Round-trip check: the reloaded model must still be able to predict.
    estimator = joblib.load(model_path)
    check_rows = v_x[10:20]
    if len(check_rows):
        print(estimator.predict(check_rows))


if __name__ == '__main__':
    # Script entry point: train the DMI clustering model on the training log.
    # Earlier runs, kept for reference:
    #   class_fic(file_path="D:\\data\\quantization\\stock2_10.log")
    #   class_fic_k(file_path="D:\\data\\quantization\\stock10_18_train.log")
    dmi_train_log = "D:\\data\\quantization\\stock12_18d_train.log"
    class_fic_dmi(file_path=dmi_train_log)