# -*- encoding:utf-8 -*-
from sklearn.cluster import KMeans
import numpy as np
# from annoy import AnnoyIndex
import joblib
  6. def read_data(path):
  7. lines = []
  8. with open(path) as f:
  9. for x in range(400000):
  10. line = eval(f.readline().strip())
  11. # if line[-1][0] == 1 or line[-1][1] == 1:
  12. lines.append(line)
  13. return lines
length = 18  # number of periods (time steps) concatenated into one sample
j = 24  # stride between consecutive periods in a raw record -- presumably fields per period; TODO confirm against data layout
  16. def class_fic_dmi(file_path=''):
  17. lines = read_data(file_path)
  18. print('读取数据完毕')
  19. size = len(lines[0])
  20. x_list = []
  21. for s in lines:
  22. tmp_list = []
  23. for x in range(0, length):
  24. tmp_list = tmp_list + s[x*j+5:x*j+9]
  25. x_list.append(tmp_list)
  26. train_x = np.array(x_list)
  27. # train_y = [s[size - 1] for s in lines]
  28. v_x = train_x.reshape(train_x.shape[0], 4*length)
  29. stock_list = [s[size - 2] for s in lines]
  30. estimator = KMeans(n_clusters=20, random_state=5179)
  31. estimator.fit(v_x)
  32. label_pred = estimator.labels_ # 获取聚类标签
  33. centroids = estimator.cluster_centers_
  34. print(label_pred)
  35. print(centroids)
  36. joblib.dump(estimator , 'km_dmi_18_20200225.pkl')
  37. print(estimator.predict(v_x[:10]))
  38. estimator = joblib.load('km_dmi_18_20200225.pkl')
  39. print(estimator.predict(v_x[10:20]))
  40. # annoy_sim(v_x)
  41. # print('save数据完毕')
  42. # return find_annoy(train_y, stock_list)
  43. # length = 18 # 周期是多少
  44. # j = 24
  45. def class_fic_k(file_path=''):
  46. lines = read_data(file_path)
  47. print('读取数据完毕')
  48. size = len(lines[0])
  49. x_list = []
  50. for s in lines:
  51. tmp_list = []
  52. for x in range(0, length):
  53. tmp_list = tmp_list + s[x*j+1:x*j+5]
  54. x_list.append(tmp_list)
  55. train_x = np.array(x_list)
  56. # train_x = np.array([s[5:9] + s[j+5:j+9] + s[j*2+5:j*2+9] + s[j*3+5:j*3+9] + s[j*4+5:j*4+9] + s[j*5+5:j*5+9] + s[j*6+5:j*6+9]
  57. # + s[j*7+5:j*7+9] + s[j*8+5:j*8+9] + s[j*9+5:j*9+9] + s[j*10+5:j*10+9] + s[j*11+5:j*11+9] + s[j*12+5:j*12+9] + s[j*13+5:j*13+9]
  58. # + s[j*14+5:j*14+9] + s[j*15+5:j*15+9] + s[j*9+5:j*9+9] + s[j*10+5:j*10+9] + s[j*11+5:j*11+9] + s[j*12+5:j*12+9] + s[j*13+5:j*13+9]
  59. # for s in lines])
  60. # train_y = [s[size - 1] for s in lines]
  61. v_x = train_x.reshape(train_x.shape[0], 4*length)
  62. stock_list = [s[size - 2] for s in lines]
  63. estimator = KMeans(n_clusters=16, random_state=311)
  64. estimator.fit(v_x)
  65. label_pred = estimator.labels_ # 获取聚类标签
  66. centroids = estimator.cluster_centers_
  67. joblib.dump(estimator , 'km_k_18.pkl')
  68. print(estimator.predict(v_x[:10]))
  69. estimator = joblib.load('km_k_18.pkl')
  70. print(estimator.predict(v_x[10:20]))
  71. def class_fic_roc(file_path=''):
  72. lines = read_data(file_path)
  73. print('读取数据完毕')
  74. size = len(lines[0])
  75. x_list = []
  76. length = 18 # 周期是多少
  77. j = 10
  78. for s in lines:
  79. tmp_list = []
  80. for x in range(0, length):
  81. tmp_list = tmp_list + s[x*j+8:x*j+10]
  82. x_list.append(tmp_list)
  83. train_x = np.array(x_list)
  84. v_x = train_x.reshape(train_x.shape[0], 2*length)
  85. estimator = KMeans(n_clusters=8, random_state=73011)
  86. estimator.fit(v_x)
  87. label_pred = estimator.labels_ # 获取聚类标签
  88. centroids = estimator.cluster_centers_
  89. joblib.dump(estimator , 'km_k_roc_8.pkl')
  90. print(estimator.predict(v_x[:10]))
  91. estimator = joblib.load('km_k_roc_8.pkl')
  92. print(estimator.predict(v_x[10:20]))
  93. def class_fic_dapan(file_path=''):
  94. lines = read_data(file_path)
  95. print('读取数据完毕')
  96. size = len(lines[0])
  97. x_list = []
  98. length = 18 # 周期是多少
  99. j = 24
  100. for s in lines:
  101. tmp_list = []
  102. for x in range(0, length):
  103. tmp_list = tmp_list + s[x*j+16:x*j+17]
  104. x_list.append(tmp_list)
  105. train_x = np.array(x_list)
  106. v_x = train_x.reshape(train_x.shape[0], length)
  107. estimator = KMeans(n_clusters=8, random_state=82430)
  108. estimator.fit(v_x)
  109. label_pred = estimator.labels_ # 获取聚类标签
  110. centroids = estimator.cluster_centers_
  111. joblib.dump(estimator , 'km_k_dapan_8.pkl')
  112. print(estimator.predict(v_x[:10]))
  113. estimator = joblib.load('km_k_dapan_8.pkl')
  114. print(estimator.predict(v_x[10:20]))
if __name__ == '__main__':
    # Earlier training runs, kept for reference:
    # class_fic(file_path="D:\\data\\quantization\\stock2_10.log")
    # class_fic_k(file_path="D:\\data\\quantization\\stock10_18_train.log")
    # class_fic_dmi(file_path="D:\\data\\quantization\\stock12_18d_train.log")
    class_fic_dapan(file_path="D:\\data\\quantization\\stock160_18d_train.log")