print("解析数据")
# Parse every raw record in `lines`, collecting the text used for training in
# `train` and per-article metadata in `article_info`.
i = 0  # kept so `i` is defined even when `lines` is empty
for i, line in enumerate(lines, start=1):
    # Records are split on the two-character sequence `",`.
    data = line.split('",')
    # Skip malformed records (e.g. a blank trailing line) that do not have the
    # five fields indexed below; otherwise data[2]/data[4] raises IndexError
    # and aborts the whole parse.
    if len(data) < 5:
        continue
    # Drop records whose field 2 or field 4 is too short to be useful
    # (presumably title and body text; verify against the input format).
    if len(data[2]) < 5 or len(data[4]) < 40:
        continue
    # Title words, abstract words, body words.
    # In a real-world scenario: apply weighting and remove stop words.
    train.append(data[3])
    # `article_id` is the 1-based position of the record in `lines`, so ids of
    # skipped records are simply absent.
    article_info.append({"article_id": i, "__url": data[1]})
# Drop the reference to the raw input now that it has been parsed.
lines = []
print("解析结束")
# NOTE(review): `[train]` passes the whole list as a single document, so each
# data[3] string becomes one dictionary token. Confirm this is intended rather
# than one tokenized document per article.
dictionary = corpora.Dictionary([train])