|
@@ -0,0 +1,127 @@
|
|
1
|
+# -*- encoding:utf-8 -*-
|
|
2
|
+from sklearn.linear_model import LogisticRegression
|
|
3
|
+from math import exp
|
|
4
|
+from math import log2
|
|
5
|
+import numpy as np
|
|
6
|
+import matplotlib.pyplot as plt
|
|
7
|
+
|
|
8
|
def read_data(path):
    """Load samples from a text file holding one Python expression per line.

    Every non-blank line is evaluated and must yield a two-item sequence
    ``(features, label)``; the first items are stacked into ``X`` and the
    second items into ``y``.

    Args:
        path: Location of the data file.

    Returns:
        A tuple ``(X, y)`` of numpy arrays built from the per-line pairs.

    Raises:
        ValueError: If the file contains no data lines.
    """
    records = []
    with open(path) as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no sample and would make eval() fail with a SyntaxError.
                continue
            # NOTE(security): eval() executes arbitrary code found in the
            # file. Only use this loader on trusted data; if every line is a
            # plain literal, ast.literal_eval is the safe replacement.
            records.append(eval(text))

    if not records:
        raise ValueError("no samples found in {!r}".format(path))

    X, y = zip(*records)
    return np.array(X), np.array(y)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
def curve(x_train, w, w0):
    """Format the training points plus 100 points of a separating line as text.

    The line satisfies ``w[0] * x1 + w[1] * x2 + w0 == 0`` and is sampled at
    ``x1 = 0.0, 0.1, ..., 9.9``.

    Args:
        x_train: Array whose ``tolist()`` gives two-element rows.
        w: Indexable holding the two line coefficients.
        w0: Line intercept term.

    Returns:
        A list of ``"x1,x2"`` strings: the training rows first, then the
        sampled line points.
    """
    rows = x_train.tolist()
    rows.extend(
        [x1, -1 * (w[0] * x1 + w0) / w[1]]
        for x1 in (1.0 * step / 10 for step in range(0, 100))
    )
    return ["{},{}".format(first, second) for first, second in rows]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
def drawScatterAndLine(p, q):
    """Scatter-plot the rows of ``p`` in two groups selected by ``q`` and show it.

    Rows whose entry in ``q`` compares equal to 0 form the first scatter
    series; all remaining rows form the second. Only the first two
    components of each row are plotted.

    Args:
        p: Indexable of points; ``p[i][0]`` and ``p[i][1]`` are the coordinates.
        q: Iterable of group markers, one per point.
    """
    zero_x, zero_y = [], []
    other_x, other_y = [], []

    for idx, label in enumerate(q):
        point = p[idx]
        if label == 0:
            xs, ys = zero_x, zero_y
        else:
            xs, ys = other_x, other_y
        xs.append(point[0])
        ys.append(point[1])

    plt.scatter(zero_x, zero_y)
    plt.scatter(other_x, other_y)
    plt.xlabel('p')
    plt.ylabel('q')
    plt.title('line regesion')
    plt.show()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar.

    The computation is split by sign so that ``exp`` is only ever called
    with a non-positive argument; the naive formula raises OverflowError
    once ``-x`` exceeds roughly 709.

    Args:
        x: A real number.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    # Algebraically identical form for negative x; exp(x) can underflow to
    # 0.0 here but can never overflow.
    scaled = exp(x)
    return scaled / (1 + scaled)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
def data_matrix(X):
    """Return a list of rows, each being ``1.0`` followed by a row of ``X``.

    Args:
        X: Iterable of iterables.

    Returns:
        A new list of lists; the input is not modified.
    """
    return [[1.0, *row] for row in X]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
# Iteration count used by fit_1's training loop.
max_iter = 100
# Module-level list; nothing in the visible code reads it (score() takes a
# parameter with the same name instead).
last_weights = list()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
def fit_1(X_train, y_train, n_iter=None, learning_rate=0.8):
    """Fit logistic-regression weights with batch gradient descent.

    A constant 1.0 column is prepended to the features, so the returned
    vector is ``[bias, w_1, ..., w_d]`` for ``d`` input features. All weights
    start at 1. After every pass the summed base-2 cross-entropy loss and
    the updated weights are printed.

    Args:
        X_train: 2-D array-like of shape (n_samples, n_features).
        y_train: Labels in {0, 1}; either 1-D of length n_samples or 2-D
            with the label in column 0.
        n_iter: Number of gradient steps. Defaults to the module-level
            ``max_iter``.
        learning_rate: Step size applied to the mean gradient.

    Returns:
        numpy array of length ``n_features + 1`` holding the fitted weights.

    Raises:
        ValueError: If ``X_train`` is not a non-empty 2-D array, or the
            number of labels does not match the number of samples.
    """
    features = np.asarray(X_train, dtype=float)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError("X_train must be a non-empty 2-D array of samples")
    n_samples = features.shape[0]
    labels = np.asarray(y_train, dtype=float).reshape(n_samples, -1)[:, 0]

    if n_iter is None:
        n_iter = max_iter

    design = np.hstack([np.ones((n_samples, 1)), features])
    weights = np.ones(design.shape[1])
    # Keeps log2() away from 0 when a prediction saturates to exactly 0 or 1.
    eps = 1e-15

    for _ in range(n_iter):
        # Clipping the logits keeps exp() inside the float range.
        logits = np.clip(np.dot(design, weights), -700.0, 700.0)
        prob = 1.0 / (1.0 + np.exp(-logits))

        safe = np.clip(prob, eps, 1.0 - eps)
        loss = -np.sum(labels * np.log2(safe)
                       + (1.0 - labels) * np.log2(1.0 - safe))
        print("loss: ", loss)

        # Mean gradient of the cross-entropy with respect to the weights.
        gradient = np.dot(design.T, prob - labels) / n_samples
        weights = weights - learning_rate * gradient
        print("weight:", weights)

    return weights
|
|
89
|
+
|
|
90
|
+
|
|
91
|
def score(X_test, y_test, last_weights):
    """Print and return the summed base-2 cross-entropy loss on a test set.

    A constant 1.0 column is prepended to the features before they are
    combined with ``last_weights``, so the weight vector is expected to be
    laid out as ``[bias, w_1, ..., w_d]``.

    Args:
        X_test: 2-D array-like of shape (n_samples, n_features).
        y_test: Labels in {0, 1}; either 1-D of length n_samples or 2-D
            with the label in column 0.
        last_weights: Weight vector of length ``n_features + 1``.

    Returns:
        The loss as a float (the same value that is printed).

    Raises:
        ValueError: If ``X_test`` is not a non-empty 2-D array, or the
            shapes of the labels or weights do not match it.
    """
    features = np.asarray(X_test, dtype=float)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError("X_test must be a non-empty 2-D array of samples")
    n_samples = features.shape[0]
    labels = np.asarray(y_test, dtype=float).reshape(n_samples, -1)[:, 0]

    design = np.hstack([np.ones((n_samples, 1)), features])
    # Clipping the logits keeps exp() inside the float range.
    logits = np.clip(
        np.dot(design, np.asarray(last_weights, dtype=float)), -700.0, 700.0
    )
    prob = 1.0 / (1.0 + np.exp(-logits))

    # Keeps log2() away from 0 when a prediction saturates to exactly 0 or 1.
    eps = 1e-15
    safe = np.clip(prob, eps, 1.0 - eps)
    loss = float(-np.sum(labels * np.log2(safe)
                         + (1.0 - labels) * np.log2(1.0 - safe)))
    print("y_test loss ", loss)
    return loss
|
|
103
|
+
|
|
104
|
+
|
|
105
|
def main():
    """Plot the training set, fit weights with fit_1 and report the test loss.

    Reads the files "train_data" and "test_data" from the current working
    directory.
    """
    train_features, train_labels = read_data("train_data")
    drawScatterAndLine(train_features, train_labels)
    test_features, test_labels = read_data("test_data")

    learned_weights = fit_1(train_features, train_labels)

    score(test_features, test_labels, learned_weights)
    # Disabled leftovers from an earlier variant: a predict_proba/log_loss
    # evaluation of a fitted `model`, and an export of curve() output to the
    # file "train_with_splitline".
|
|
124
|
+
|
|
125
|
+
|
|
126
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|