Commit f61a70b000 by yufeng, 5 years ago
76 changed files with 11277 additions and 60 deletions
  1. .gitignore (+61 -60)
  2. ad_tv_recom_score_matrix.txt (+2820 -0)
  3. bash_near_real_job.sh (+34 -0)
  4. config.cfg (+20 -0)
  5. dags/config.py (+4 -0)
  6. dags/daily_dag.py (+43 -0)
  7. dags/fty_operator.py (+205 -0)
  8. dags/once_dag.py (+65 -0)
  9. dags/realtime_dag.py (+35 -0)
  10. dags/recent_one_year_stat_dag.py (+48 -0)
  11. dags/subdags/idl_subdag.py (+70 -0)
  12. fty_util/__init__.py (+0 -0)
  13. fty_util/common.py (+370 -0)
  14. fty_util/config.py (+54 -0)
  15. idl_ad_pub_station_stats.py (+141 -0)
  16. idl_tv_sr_denoise.py (+131 -0)
  17. odl_near_realtime_calc.py (+98 -0)
  18. online_ad_tv_sr_pre.py (+179 -0)
  19. setup.py (+9 -0)
  20. shell/bash_daily.sh (+191 -0)
  21. shell/bash_daily_import.sh (+66 -0)
  22. shell/bash_job.sh (+55 -0)
  23. shell/bash_scrapy.sh (+43 -0)
  24. task_clean/odl_ad_tv_record_distribution_update_company_field.py (+39 -0)
  25. task_clean/odl_ad_tv_record_distribution_update_theme_field.py (+41 -0)
  26. task_clean/scrapy_category_clean.py (+47 -0)
  27. task_clean/scrapy_category_update.py (+63 -0)
  28. task_clean/scrapy_dianshiju_clean.py (+55 -0)
  29. task_clean/tv_category_relation.py (+39 -0)
  30. task_clean/update_date.py (+297 -0)
  31. task_clean/update_first_type.py (+53 -0)
  32. task_idl/idl_ad_tv_record_distribution.py (+32 -0)
  33. task_idl/idl_tv_article_marketing_count.py (+31 -0)
  34. task_idl/idl_tv_article_marketing_detail.py (+36 -0)
  35. task_idl/idl_tv_avg_ratings_stat.py (+50 -0)
  36. task_idl/idl_tv_category_stat.py (+52 -0)
  37. task_idl/idl_year_channel_avg_ratings_stat.py (+51 -0)
  38. task_odl/odl_ad_audience_cps_time.py (+56 -0)
  39. task_odl/odl_ad_audience_cps_time_incr_update.py (+57 -0)
  40. task_odl/odl_ad_television.py (+68 -0)
  41. task_odl/odl_ad_television_incr_update.py (+67 -0)
  42. task_odl/odl_ad_tv_lib.py (+65 -0)
  43. task_odl/odl_ad_tv_lib_insert.py (+35 -0)
  44. task_odl/odl_ad_tv_record_distribution.py (+279 -0)
  45. task_odl/odl_ad_tv_record_distribution_insert.py (+279 -0)
  46. task_odl/odl_area_ad_television.py (+70 -0)
  47. task_odl/odl_area_ad_television_incr_update.py (+67 -0)
  48. task_other/idl_rank_update.py (+157 -0)
  49. task_other/transform_categories.py (+96 -0)
  50. task_scrapy/i_t_dsj_all.py (+286 -0)
  51. task_scrapy/i_t_dsj_all_without_browser.py (+291 -0)
  52. task_scrapy/i_t_dsj_categories.py (+198 -0)
  53. task_scrapy/i_t_dsj_categories_without_browser.py (+203 -0)
  54. task_scrapy/scrapy_all.py (+100 -0)
  55. task_scrapy/scrapy_gongzhonghao_count.py (+143 -0)
  56. task_scrapy/scrapy_huashutv.py (+113 -0)
  57. task_scrapy/scrapy_iqiyi.py (+294 -0)
  58. task_scrapy/scrapy_kankan.py (+59 -0)
  59. task_scrapy/scrapy_leshi.py (+186 -0)
  60. task_scrapy/scrapy_pptv.py (+146 -0)
  61. task_scrapy/scrapy_sohu.py (+139 -0)
  62. task_scrapy/scrapy_tengxun.py (+231 -0)
  63. task_scrapy/scrapy_tianyancha.py (+97 -0)
  64. task_scrapy/scrapy_tv_unhandle.py (+83 -0)
  65. task_scrapy/scrapy_website_count.py (+146 -0)
  66. task_scrapy/scrapy_website_count_new.py (+206 -0)
  67. task_scrapy/scrapy_youku.py (+222 -0)
  68. task_tmp/tmp_data_month.py (+43 -0)
  69. task_tmp/tmp_tv_avg_ratings_fatt0.py (+116 -0)
  70. task_tmp/tmp_tv_avg_ratings_stat.py (+146 -0)
  71. task_tmp/tmp_tv_category_stat.py (+93 -0)
  72. task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py (+85 -0)
  73. task_yxb/ad_tv_lib_clean.py (+92 -0)
  74. tmp_ad_tv_sr_stat.py (+181 -0)
  75. tv_outline_recom.py (+226 -0)
  76. tv_real_recom_fix.py (+228 -0)

+ 61 - 60
.gitignore

@@ -1,60 +1,61 @@
1
-# ---> Python
2
-# Byte-compiled / optimized / DLL files
3
-__pycache__/
4
-*.py[cod]
5
-*$py.class
6
-
7
-# C extensions
8
-*.so
9
-
10
-# Distribution / packaging
11
-.Python
12
-env/
13
-build/
14
-develop-eggs/
15
-dist/
16
-downloads/
17
-eggs/
18
-.eggs/
19
-lib/
20
-lib64/
21
-parts/
22
-sdist/
23
-var/
24
-*.egg-info/
25
-.installed.cfg
26
-*.egg
27
-
28
-# PyInstaller
29
-#  Usually these files are written by a python script from a template
30
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
31
-*.manifest
32
-*.spec
33
-
34
-# Installer logs
35
-pip-log.txt
36
-pip-delete-this-directory.txt
37
-
38
-# Unit test / coverage reports
39
-htmlcov/
40
-.tox/
41
-.coverage
42
-.coverage.*
43
-.cache
44
-nosetests.xml
45
-coverage.xml
46
-*,cover
47
-
48
-# Translations
49
-*.mo
50
-*.pot
51
-
52
-# Django stuff:
53
-*.log
54
-
55
-# Sphinx documentation
56
-docs/_build/
57
-
58
-# PyBuilder
59
-target/
60
-
1
+# ---> Python
2
+# Byte-compiled / optimized / DLL files
3
+__pycache__/
4
+*.py[cod]
5
+*$py.class
6
+
7
+# C extensions
8
+*.so
9
+
10
+# Distribution / packaging
11
+.Python
12
+env/
13
+build/
14
+develop-eggs/
15
+dist/
16
+downloads/
17
+eggs/
18
+.eggs/
19
+lib/
20
+lib64/
21
+parts/
22
+sdist/
23
+var/
24
+*.egg-info/
25
+.installed.cfg
26
+*.egg
27
+
28
+# PyInstaller
29
+#  Usually these files are written by a python script from a template
30
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+*.manifest
32
+*.spec
33
+
34
+# Installer logs
35
+pip-log.txt
36
+pip-delete-this-directory.txt
37
+
38
+# Unit test / coverage reports
39
+htmlcov/
40
+.tox/
41
+.coverage
42
+.coverage.*
43
+.cache
44
+nosetests.xml
45
+coverage.xml
46
+*,cover
47
+
48
+# Translations
49
+*.mo
50
+*.pot
51
+
52
+# Django stuff:
53
+*.log
54
+
55
+# Sphinx documentation
56
+docs/_build/
57
+
58
+# PyBuilder
59
+target/
60
+
61
+.DS_Store

File diff suppressed because it is too large
+ 2820 - 0
ad_tv_recom_score_matrix.txt


+ 34 - 0
bash_near_real_job.sh

@@ -0,0 +1,34 @@
1
+#!/bin/bash
2
+
3
+# Check whether the script path is set; fall back to the default if not
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+############################### Tasks ################################
14
+# Similar-drama computation
15
+echo "tv_real_recom_fix"
16
+python ${HUOJU_FTY_PATH}tv_real_recom_fix.py $1
17
+if [ $? -ne 0 ];
18
+    then
19
+        content="tv_real_recom_fix"
20
+        echo $content
21
+    exit 1
22
+fi
23
+
24
+# TV drama ratings-index prediction
25
+echo "online_ad_tv_sr_pre"
26
+python ${HUOJU_FTY_PATH}online_ad_tv_sr_pre.py $1
27
+if [ $? -ne 0 ];
28
+    then
29
+        content="online_ad_tv_sr_pre"
30
+        echo $content
31
+    exit 1
32
+fi
33
+
34
+echo "Script finished"
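For reference, a minimal Python sketch of the same stop-on-first-failure behaviour this script implements (the script names and default path are the ones used above; the helper name and the use of subprocess are illustrative assumptions, not part of this commit):

import os
import subprocess
import sys

def run_near_real_job(tv_id, base_path='/root/py_script/'):
    # Mirror bash_near_real_job.sh: run both prediction scripts, stop at the first failure.
    for script in ('tv_real_recom_fix.py', 'online_ad_tv_sr_pre.py'):
        code = subprocess.call(['python', os.path.join(base_path, script), str(tv_id)])
        if code != 0:
            print('%s failed with exit code %d' % (script, code))
            return False
    return True

if __name__ == '__main__':
    sys.exit(0 if run_near_real_job(sys.argv[1]) else 1)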

+ 20 - 0
config.cfg

@@ -0,0 +1,20 @@
1
+[basic]
2
+tmp_path = /Users/wudancheng/huoju_fty_home/tmp_data/
3
+
4
+[online_config]
5
+host = 121.41.17.212
6
+user = root
7
+password = huojutech_yaozhi!23
8
+port = 3306
9
+
10
+[offline_config]
11
+host = 121.41.17.212
12
+user = root
13
+password = huojutech_yaozhi!23
14
+port = 3306
15
+
16
+[scrapy_config]
17
+host = 121.41.17.212
18
+user = root
19
+password = huojutech_yaozhi!23
20
+port = 3306
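All three sections share the same host and credentials and are read by fty_util/config.py later in this commit. A minimal sketch of reading one section with ConfigParser (the file location is the one hard-coded in fty_util/config.py; the variable names are illustrative):

import os
import ConfigParser  # configparser on Python 3

config = ConfigParser.ConfigParser()
config.read(os.path.expanduser('~') + '/huoju_fty_home/config.cfg')

# get() returns strings; getint() converts the port to an int.
host = config.get('online_config', 'host')
port = config.getint('online_config', 'port')
print('%s:%d' % (host, port))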

+ 4 - 0
dags/config.py

@@ -0,0 +1,4 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+SCRIPT_PATH = '/Users/wudancheng/huoju/code/huoju_fty/py_script'

+ 43 - 0
dags/daily_dag.py

@@ -0,0 +1,43 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'daily_dag'
23
+
24
+# Runs every day at 00:30
25
+dag = DAG(
26
+    dag_id=DAG_NAME,
27
+    default_args=default_args,
28
+    schedule_interval='30 0 * * *',
29
+)
30
+
31
+# Daily incremental update
32
+odl_ad_television_incr_update = BashOperator(
33
+    task_id='odl_ad_television_incr_update',
34
+    bash_command='cd ' + SCRIPT_PATH + '; python odl_ad_television_incr_update.py',
35
+    dag=dag,
36
+)
37
+
38
+# Daily rank update
39
+idl_rank_update = BashOperator(
40
+    task_id='idl_rank_update',
41
+    bash_command='cd ' + SCRIPT_PATH + '; python idl_rank_update.py',
42
+    dag=dag,
43
+)
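schedule_interval above is a plain cron expression: '30 0 * * *' fires at 00:30 every day. A quick way to sanity-check such expressions, assuming the third-party croniter package is available (it is not a dependency of this commit):

import datetime
from croniter import croniter

it = croniter('30 0 * * *', datetime.datetime(2017, 1, 1))
print(it.get_next(datetime.datetime))  # 2017-01-01 00:30:00
print(it.get_next(datetime.datetime))  # 2017-01-02 00:30:00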

+ 205 - 0
dags/fty_operator.py

@@ -0,0 +1,205 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'fty_operator'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='0 1 * * *',
28
+)
29
+
30
+# TV station ratings statistics
31
+tmp_ad_television_stat_task = BashOperator(
32
+    task_id='tmp_ad_television_stat',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_television_stat.py',
34
+    dag=dag,
35
+)
36
+
37
+# Monthly statistics
38
+tmp_ad_tv_station_mid_month_stat_task = BashOperator(
39
+    task_id='tmp_ad_tv_station_mid_month_stat',
40
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_month_stat.py',
41
+    dag=dag,
42
+)
43
+
44
+# Monthly counts, grouped by station, theater and type
45
+tmp_ad_tv_station_mid_quantity_stat_task = BashOperator(
46
+    task_id='tmp_ad_tv_station_mid_quantity_stat',
47
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_quantity_stat.py',
48
+    dag=dag,
49
+)
50
+
51
+# Per-platform average ratings over the last year
52
+tmp_ad_tv_station_mid_avg_ratings_stat_task = BashOperator(
53
+    task_id='tmp_ad_tv_station_mid_avg_ratings_stat',
54
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_avg_ratings_stat.py',
55
+    dag=dag,
56
+)
57
+
58
+# Per-platform denoised average ratings over the last year
59
+tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task = BashOperator(
60
+    task_id='tmp_ad_tv_station_mid_avg_ratings_denoising_stat',
61
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_avg_ratings_denoising_stat.py',
62
+    dag=dag,
63
+)
64
+
65
+# Monthly average ratings statistics
66
+tmp_ad_tv_station_mid_ratings_stat_task = BashOperator(
67
+    task_id='tmp_ad_tv_station_mid_ratings_stat',
68
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_stat.py',
69
+    dag=dag,
70
+)
71
+
72
+# Monthly average ratings-index statistics
73
+tmp_ad_tv_station_mid_ratings_index_stat_task = BashOperator(
74
+    task_id='tmp_ad_tv_station_mid_ratings_index_stat',
75
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_index_stat.py',
76
+    dag=dag,
77
+)
78
+tmp_ad_tv_station_mid_ratings_index_stat_task.set_upstream(tmp_ad_tv_station_mid_avg_ratings_stat_task)
79
+
80
+# Monthly denoised ratings statistics
81
+tmp_ad_tv_station_mid_ratings_denoising_stat_task = BashOperator(
82
+    task_id='tmp_ad_tv_station_mid_ratings_denoising_stat',
83
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_denoising_stat.py',
84
+    dag=dag,
85
+)
86
+
87
+# Monthly denoised ratings-index statistics
88
+tmp_ad_tv_station_mid_ratings_index_denoising_stat_task = BashOperator(
89
+    task_id='tmp_ad_tv_station_mid_ratings_index_denoising_stat',
90
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_index_denoising_stat.py',
91
+    dag=dag,
92
+)
93
+tmp_ad_tv_station_mid_ratings_index_denoising_stat_task.set_upstream(tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task)
94
+
95
+# Per-platform counts over the last year
96
+tmp_ad_tv_station_mid_type_stat_task = BashOperator(
97
+    task_id='tmp_ad_tv_station_mid_type_stat',
98
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_type_stat.py',
99
+    dag=dag,
100
+)
101
+tmp_ad_tv_station_mid_type_stat_task.set_upstream(tmp_ad_tv_station_mid_quantity_stat_task)
102
+
103
+# Type hotness statistics
104
+tmp_tv_station_type_hot_task = BashOperator(
105
+    task_id='tmp_tv_station_type_hot',
106
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_hot.py',
107
+    dag=dag,
108
+)
109
+tmp_tv_station_type_hot_task.set_upstream(tmp_ad_tv_station_mid_type_stat_task)
110
+
111
+# Type trend statistics
112
+tmp_tv_station_type_trend_task = BashOperator(
113
+    task_id='tmp_tv_station_type_trend',
114
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_trend.py',
115
+    dag=dag,
116
+)
117
+tmp_tv_station_type_trend_task.set_upstream(tmp_ad_tv_station_mid_type_stat_task)
118
+
119
+# Type preference statistics
120
+tmp_tv_station_type_preference_task = BashOperator(
121
+    task_id='tmp_tv_station_type_preference',
122
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_preference.py',
123
+    dag=dag,
124
+)
125
+tmp_tv_station_type_preference_task.set_upstream(tmp_tv_station_type_hot_task)
126
+tmp_tv_station_type_preference_task.set_upstream(tmp_tv_station_type_trend_task)
127
+
128
+# Ratings-index preference statistics
129
+tmp_tv_station_ratings_index_preference_task = BashOperator(
130
+    task_id='tmp_tv_station_ratings_index_preference',
131
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_ratings_index_preference.py',
132
+    dag=dag,
133
+)
134
+tmp_tv_station_ratings_index_preference_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
135
+
136
+# Denoised ratings-index preference statistics
137
+tmp_tv_station_ratings_index_denoising_preference_task = BashOperator(
138
+    task_id='tmp_tv_station_ratings_index_denoising_preference',
139
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_ratings_index_denoising_preference.py',
140
+    dag=dag,
141
+)
142
+tmp_tv_station_ratings_index_denoising_preference_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
143
+
144
+# Ratings trend
145
+tmp_ratings_current_trending_task = BashOperator(
146
+    task_id='tmp_ratings_current_trending',
147
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ratings_current_trending.py',
148
+    dag=dag,
149
+)
150
+tmp_ratings_current_trending_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
151
+
152
+# Rank trend
153
+tmp_rank_trending_task = BashOperator(
154
+    task_id='tmp_rank_trending',
155
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_rank_trending.py',
156
+    dag=dag,
157
+)
158
+tmp_rank_trending_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
159
+
160
+# TV station recommendation
161
+tmp_tv_station_recommend_task = BashOperator(
162
+    task_id='tmp_tv_station_recommend',
163
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_recommend.py',
164
+    dag=dag,
165
+)
166
+tmp_tv_station_recommend_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
167
+tmp_tv_station_recommend_task.set_upstream(tmp_tv_station_type_preference_task)
168
+tmp_tv_station_recommend_task.set_upstream(tmp_tv_station_ratings_index_preference_task)
169
+
170
+# TV station recommendation (denoised)
171
+tmp_tv_station_recommend_denoising_task = BashOperator(
172
+    task_id='tmp_tv_station_recommend_denoising',
173
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_recommend_denoising.py',
174
+    dag=dag,
175
+)
176
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
177
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_tv_station_type_preference_task)
178
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_tv_station_ratings_index_denoising_preference_task)
179
+
180
+idl_task = SubDagOperator(
181
+    task_id='idl_task',
182
+    subdag=idl_subdag(DAG_NAME, 'idl_task', default_args),
183
+    default_args=default_args,
184
+    dag=dag,
185
+)
186
+
187
+# Set downstream dependencies on the monthly stat task
188
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_quantity_stat_task)
189
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_avg_ratings_stat_task)
190
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task)
191
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_stat_task)
192
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
193
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_denoising_stat_task)
194
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
195
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_type_stat_task)
196
+
197
+
198
+idl_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
199
+idl_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
200
+
201
+tmp_tv_station_rank_task = BashOperator(
202
+    task_id='tmp_tv_station_rank',
203
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_rank.py',
204
+    dag=dag,
205
+)
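The task graph above is declared edge by edge with set_upstream/set_downstream. In recent Airflow releases the same edges can also be written with the bit-shift operators, which reads closer to the graph; a sketch using tasks defined above (equivalent wiring, not an additional dependency):

# upstream >> downstream is the same as downstream.set_upstream(upstream)
tmp_ad_tv_station_mid_month_stat_task >> tmp_ad_tv_station_mid_quantity_stat_task
tmp_ad_tv_station_mid_quantity_stat_task >> tmp_ad_tv_station_mid_type_stat_task
tmp_ad_tv_station_mid_type_stat_task >> tmp_tv_station_type_hot_task
tmp_ad_tv_station_mid_type_stat_task >> tmp_tv_station_type_trend_task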

+ 65 - 0
dags/once_dag.py

@@ -0,0 +1,65 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'once_dag'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='@once',
28
+)
29
+
30
+# Only needs to run once, during initial setup
31
+once_history_ad_tv_station_mid_avg_ratings_denoising_stat = BashOperator(
32
+    task_id='once_history_ad_tv_station_mid_avg_ratings_denoising_stat',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_avg_ratings_denoising_stat.py',
34
+    dag=dag,
35
+)
36
+
37
+once_history_ad_tv_station_mid_avg_ratings_stat = BashOperator(
38
+    task_id='once_history_ad_tv_station_mid_avg_ratings_stat',
39
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_avg_ratings_stat.py',
40
+    dag=dag,
41
+)
42
+
43
+once_history_ad_tv_station_mid_ratings_denoising_stat = BashOperator(
44
+    task_id='once_history_ad_tv_station_mid_ratings_denoising_stat',
45
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_denoising_stat.py',
46
+    dag=dag,
47
+)
48
+
49
+once_history_ad_tv_station_mid_ratings_index_denoising_stat = BashOperator(
50
+    task_id='once_history_ad_tv_station_mid_ratings_index_denoising_stat',
51
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_index_denoising_stat.py',
52
+    dag=dag,
53
+)
54
+
55
+once_history_ad_tv_station_mid_ratings_index_stat = BashOperator(
56
+    task_id='once_history_ad_tv_station_mid_ratings_index_stat',
57
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_index_stat.py',
58
+    dag=dag,
59
+)
60
+
61
+once_history_ad_tv_station_mid_ratings_stat = BashOperator(
62
+    task_id='once_history_ad_tv_station_mid_ratings_stat',
63
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_stat.py',
64
+    dag=dag,
65
+)

+ 35 - 0
dags/realtime_dag.py

@@ -0,0 +1,35 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'realtime_dag'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='@once',
28
+)
29
+
30
+# Near-real-time computation (long-running listener)
31
+odl_near_realtime_calc = BashOperator(
32
+    task_id='odl_near_realtime_calc',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python odl_near_realtime_calc.py > /root/py_script_logs/realtime.log 2>&1 &',
34
+    dag=dag,
35
+)

+ 48 - 0
dags/recent_one_year_stat_dag.py

@@ -0,0 +1,48 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'recent_one_year_stat_dags'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='40 0 * * *',
28
+)
29
+
30
+# Split out the last year of data
31
+tmp_recent_year_ad_television_data = BashOperator(
32
+    task_id='tmp_recent_year_ad_television_data',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_recent_year_ad_television_data.py',
34
+    dag=dag,
35
+)
36
+
37
+# Average ratings and ratings-index statistics for the last year
38
+tmp_ad_tv_station_stat = BashOperator(
39
+    task_id='tmp_ad_tv_station_stat',
40
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_stat.py',
41
+    dag=dag,
42
+)
43
+
44
+idl_ad_tv_station_stat = BashOperator(
45
+    task_id='idl_ad_tv_station_stat',
46
+    bash_command='cd ' + SCRIPT_PATH + '; python idl_ad_tv_station_stat.py',
47
+    dag=dag,
48
+)

+ 70 - 0
dags/subdags/idl_subdag.py

@@ -0,0 +1,70 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+
7
+def idl_subdag(parent_dag_name, child_dag_name, args):
8
+    idl_subdag = DAG(
9
+        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
10
+        default_args=args,
11
+        schedule_interval='@daily',
12
+    )
13
+
14
+    # Ratings trend
15
+    BashOperator(
16
+        task_id='idl_trending-task',
17
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_trending.py',
18
+        default_args=args,
19
+        dag=idl_subdag,
20
+    )
21
+
22
+    # Type statistics
23
+    BashOperator(
24
+        task_id='idl_tv_station_type_stat-task',
25
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_type_stat.py',
26
+        default_args=args,
27
+        dag=idl_subdag,
28
+    )
29
+
30
+    # Ratings-index statistics
31
+    BashOperator(
32
+        task_id='idl_tv_station_ratings_index_stat-task',
33
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_ratings_index_stat.py',
34
+        default_args=args,
35
+        dag=idl_subdag,
36
+    )
37
+
38
+    # TV station recommendation
39
+    BashOperator(
40
+        task_id='idl_tv_station_recommend-task',
41
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_recommend.py',
42
+        default_args=args,
43
+        dag=idl_subdag,
44
+    )
45
+
46
+    # TV station ratings statistics
47
+    BashOperator(
48
+        task_id='idl_ad_television_stat-task',
49
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_ad_television_stat.py',
50
+        default_args=args,
51
+        dag=idl_subdag,
52
+    )
53
+
54
+    # Registration and distribution data
55
+    BashOperator(
56
+        task_id='idl_ad_tv_record_distribution-task',
57
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_ad_tv_record_distribution.py',
58
+        default_args=args,
59
+        dag=idl_subdag,
60
+    )
61
+
62
+    # idl_tv_station_rank
63
+    BashOperator(
64
+        task_id='idl_tv_station_rank-task',
65
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_rank.py',
66
+        default_args=args,
67
+        dag=idl_subdag,
68
+    )
69
+    
70
+    return idl_subdag
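The returned DAG's dag_id has to follow the '<parent_dag_name>.<child_dag_name>' pattern so that SubDagOperator can associate it with its task. fty_operator.py above consumes the factory like this (usage sketch only, reusing its dag and default_args):

from airflow.operators.subdag_operator import SubDagOperator
from subdags.idl_subdag import idl_subdag

idl_task = SubDagOperator(
    task_id='idl_task',  # must match the child_dag_name passed to the factory
    subdag=idl_subdag('fty_operator', 'idl_task', default_args),
    default_args=default_args,
    dag=dag,
)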

+ 0 - 0
fty_util/__init__.py


+ 370 - 0
fty_util/common.py

@@ -0,0 +1,370 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from mysql.connector.connection import MySQLConnection
5
+import commands
6
+import datetime
7
+import calendar
8
+from fty_util.config import APP_CFG
9
+
10
+class Mysql(object):
11
+
12
+    def __init__(self):
13
+        pass
14
+    
15
+    @staticmethod
16
+    def createOnlineConn():
17
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
18
+        config = {
19
+            'user': APP_CFG.ONLINE_CONFIG_USER,
20
+            'password': APP_CFG.ONLINE_CONFIG_PASSWORD,
21
+            'host': APP_CFG.ONLINE_CONFIG_HOST,
22
+            'port': APP_CFG.ONLINE_CONFIG_PORT
23
+        }
24
+        # config = {
25
+        #     'user': 'root',
26
+        #     'password': 'huojutech_yaozhi!23',
27
+        #     'host': '121.41.17.212',
28
+        #     # 'database': 'yxb',
29
+        #     'port': 3306
30
+        # }
31
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
32
+        cnx = MySQLConnection()
33
+        try:
34
+            cnx.connect(**config)
35
+        except Exception, e:
36
+            print e
37
+            cnx.reconnect(attempts=3, delay=0)
38
+        return cnx
39
+
40
+    @staticmethod
41
+    def createOfflineConn():
42
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
43
+        config = {
44
+            'user': APP_CFG.OFFLINE_CONFIG_USER,
45
+            'password': APP_CFG.OFFLINE_CONFIG_PASSWORD,
46
+            'host': APP_CFG.OFFLINE_CONFIG_HOST,
47
+            'port': APP_CFG.OFFLINE_CONFIG_PORT
48
+        }
49
+        # config = {
50
+        #     'user': 'root',
51
+        #     'password': 'huojutech_yaozhi!23',
52
+        #     'host': '121.41.17.212',
53
+        #     # 'database': 'yxb',
54
+        #     'port': 3306
55
+        # }
56
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
57
+        cnx = MySQLConnection()
58
+        try:
59
+            cnx.connect(**config)
60
+        except Exception, e:
61
+            print e
62
+            cnx.reconnect(attempts=3, delay=0)
63
+        return cnx
64
+
65
+    @staticmethod
66
+    def createScrapyConn():
67
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
68
+        config = {
69
+            'user': APP_CFG.SCRAPY_CONFIG_USER,
70
+            'password': APP_CFG.SCRAPY_CONFIG_PASSWORD,
71
+            'host': APP_CFG.SCRAPY_CONFIG_HOST,
72
+            'port': APP_CFG.SCRAPY_CONFIG_PORT
73
+        }
74
+        # config = {
75
+        #     'user': 'root',
76
+        #     'password': 'huojutech_yaozhi!23',
77
+        #     'host': '121.41.17.212',
78
+        #     'port': 3306
79
+        # }
80
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
81
+        cnx = MySQLConnection()
82
+        try:
83
+            cnx.connect(**config)
84
+        except Exception, e:
85
+            print e
86
+            cnx.reconnect(attempts=3, delay=0)
87
+        return cnx
88
+
89
+    @staticmethod
90
+    def getCursor(conn=None, buffered=None):
91
+        if not conn.is_connected():
92
+            if conn is not None:
93
+                conn.close()
94
+            conn.reconnect(attempts=5)
95
+        if buffered is not None:
96
+            cursor = conn.cursor(buffered=True)
97
+        else:
98
+            cursor = conn.cursor()
99
+        return cursor
100
+
101
+        # if Mysql.__pool is None:
102
+        #     __pool = PooledDB(creator=mysql.connector, mincached=1, maxcached=20,
103
+        #         host=MYSQL_HOST,
104
+        #         port=MYSQL_PORT,
105
+        #         db=MYSQL_DBNAME,
106
+        #         user=MYSQL_USER,
107
+        #         passwd=MYSQL_PASSWD,
108
+        #         charset='utf8')
109
+        # return __pool.connection()
110
+
111
+    @staticmethod
112
+    def getAll(sql, param=None, conn=None):
113
+        # conn = self.getConn()
114
+        cursor = Mysql.getCursor(conn=conn)
115
+        """
116
+        @summary: run the query and fetch the full result set
117
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
118
+        @param param: optional condition values (tuple/list)
119
+        @return: result list (of dicts) / boolean - the fetched result set
120
+        """
121
+        if param is None:
122
+            cursor.execute(sql)
123
+        else:
124
+            cursor.execute(sql, param)
125
+        cols = [t[0] for t in cursor.description]
126
+        result = cursor.fetchall()
127
+        if result:
128
+            cursor.close()
129
+            return [dict(zip(cols, row)) for row in result]
130
+        else:
131
+            cursor.close()
132
+            return result
133
+
134
+    @staticmethod
135
+    def selectAll(sql, param=None, conn=None):
136
+        # conn = self.getConn()
137
+        cursor = Mysql.getCursor(conn=conn)
138
+        """
139
+        @summary: run the query and fetch the full result set
140
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
141
+        @param param: optional condition values (tuple/list)
142
+        @return: result list - the fetched result set
143
+        """
144
+        if param is None:
145
+            cursor.execute(sql)
146
+        else:
147
+            cursor.execute(sql, param)
148
+        cols = [t[0] for t in cursor.description]
149
+        result = cursor.fetchall()
150
+        cursor.close()
151
+        return result
152
+
153
+    @staticmethod
154
+    def getOne(sql, param=None, conn=None):
155
+        # conn = self.getConn()
156
+        cursor = Mysql.getCursor(conn=conn, buffered=True)
157
+        """
158
+        @summary: run the query and fetch the first row
159
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
160
+        @param param: optional condition values (tuple/list)
161
+        @return: result list/boolean - the fetched row
162
+        """
163
+        if param is None:
164
+            count = cursor.execute(sql)
165
+        else:
166
+            count = cursor.execute(sql, param)
167
+        result = cursor.fetchone()
168
+        cursor.close()
169
+        return result
170
+
171
+    @staticmethod
172
+    def getMany(sql, num, param=None, conn=None):
173
+        # conn = self.getConn()
174
+        cursor = Mysql.getCursor(conn=conn)
175
+        """
176
+        @summary: run the query and fetch num rows
177
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
178
+        @param num: number of rows to fetch
179
+        @param param: optional condition values (tuple/list)
180
+        @return: result list/boolean - the fetched result set
181
+        """
182
+        if param is None:
183
+            count = cursor.execute(sql)
184
+        else:
185
+            count = cursor.execute(sql, param)
186
+        result = cursor.fetchmany(num)
187
+        cursor.close()
188
+        return result
189
+
190
+    @staticmethod
191
+    def insertOne(sql, value=None, conn=None):
192
+        # conn = self.getConn()
193
+        cursor = Mysql.getCursor(conn=conn)
194
+        """
195
+        @summary: insert one record into the table
196
+        @param sql: the INSERT SQL template
197
+        @param value: the record to insert (tuple/list)
198
+        @return: insertId - the id generated by the insert
199
+        """
200
+        if value is None:
201
+            cursor.execute(sql)
202
+        else:
203
+            cursor.execute(sql, value)
204
+        Mysql.dispose(cursor, conn)
205
+        return Mysql.__getInsertId(conn)
206
+
207
+    @staticmethod
208
+    def insertMany(sql, values, conn):
209
+        # conn = self.getConn()
210
+        cursor = Mysql.getCursor(conn=conn)
211
+        """
212
+        @summary: insert multiple records into the table
213
+        @param sql: the INSERT SQL template
214
+        @param values: the records to insert, tuple(tuple)/list[list]
215
+        @return: count - number of affected rows
216
+        """
217
+        count = cursor.executemany(sql, values)
218
+        Mysql.dispose(cursor, conn)
219
+        return count
220
+
221
+    @staticmethod
222
+    def __getInsertId(conn):
223
+        # conn = self.getConn()
224
+        cursor = Mysql.getCursor(conn=conn)
225
+        """
226
+        Get the id generated by the last insert on this connection, or 0 if there is none
227
+        """
228
+        cursor.execute("select @@identity as id")
229
+        result = cursor.fetchall()
230
+        cursor.close()
231
+        return result[0][0]
232
+
233
+    @staticmethod
234
+    def __query(sql, param=None, conn=None):
235
+        # conn = Mysql.getConn()
236
+        cursor = Mysql.getCursor(conn=conn)
237
+        if param is None:
238
+            count = cursor.execute(sql)
239
+        else:
240
+            count = cursor.execute(sql, param)
241
+        Mysql.dispose(cursor, conn)
242
+        return count
243
+
244
+    @staticmethod
245
+    def execute(sql, param=None, conn=None):
246
+        # conn = self.getConn()
247
+        cursor = Mysql.getCursor(conn=conn)
248
+        if param is None:
249
+            count = cursor.execute(sql)
250
+        else:
251
+            count = cursor.execute(sql, param)
252
+        Mysql.dispose(cursor, conn)
253
+        return count
254
+
255
+    @staticmethod
256
+    def updateMany(sql, param=None, conn=None):
257
+        # conn = Mysql.getConn()
258
+        cursor = Mysql.getCursor(conn=conn)
259
+        count = cursor.executemany(sql, param)
260
+        return count
261
+
262
+    @staticmethod
263
+    def update(sql, param=None, conn=None):
264
+        """
265
+        @summary: update records in the table
266
+        @param sql: SQL template and conditions, using (%s, %s) placeholders
267
+        @param param: the values to update (tuple/list)
268
+        @return: count - number of affected rows
269
+        """
270
+        return Mysql.__query(sql, param=param, conn=conn)
271
+
272
+    @staticmethod
273
+    def delete(sql, param=None, conn=None):
274
+        """
275
+        @summary: delete records from the table
276
+        @param sql: SQL template and conditions, using (%s, %s) placeholders
277
+        @param param: the condition values for the delete (tuple/list)
278
+        @return: count - number of affected rows
279
+        """
280
+        return Mysql.__query(sql, param=param, conn=conn)
281
+
282
+    @staticmethod
283
+    def dispose(cursor, conn):
284
+        """
285
+            @summary: commit the transaction and release the cursor
286
+        """
287
+        conn.commit()
288
+        cursor.close()
289
+
290
+    @staticmethod
291
+    def close(conn):
292
+        if conn:
293
+            conn.close()
294
+
295
+    @staticmethod
296
+    def cmd(cmd):
297
+        status, output = commands.getstatusoutput(cmd)
298
+        if status != 0:
299
+            print 'sync to online failed'
300
+        else:
301
+            print 'sync to online succeeded'
302
+
303
+class Util(object):
304
+    
305
+    @staticmethod
306
+    def insert_by_chunk(sql, data_list, conn):
307
+        start = 0
308
+        while True:
309
+            end = start + 10000
310
+            if end >= len(data_list):
311
+                end = len(data_list)
312
+            if start >= len(data_list):
313
+                break
314
+            Mysql.insertMany(sql, data_list[start:end], conn)
315
+            start = end
316
+
317
+    @staticmethod
318
+    def calc_ratings_index(num1, num2):
319
+        """Compute the ratings index
320
+
321
+        ratings index = rating / (average rating over the last year * 0.2)
322
+        an index > 10 is capped at 10
323
+        an index < 1 is floored at 1
324
+        Args: num1 - rating
325
+        Args: num2 - average rating over the last year
326
+        """
327
+        # If num1 or num2 is None, return an index of 1
328
+        if num1 is None or num2 is None:
329
+            return 1.0
330
+        ratings_index = float(num1) / (float(num2) * 0.2)
331
+        if ratings_index > 10.0:
332
+            ratings_index = 10.0
333
+        if ratings_index < 1.0:
334
+            ratings_index = 1.0
335
+        return ratings_index
336
+
337
+    @staticmethod
338
+    def get_max_date_of_month(field):
339
+        """Get the last date of the given month
340
+
341
+        """
342
+        if isinstance(field, datetime.date):
343
+            month_str = field.strftime('%Y-%m-%d')
344
+            _year = str(month_str.split('-')[0])
345
+            _month = str(month_str.split('-')[1])
346
+            max_date = calendar.monthrange(int(_year), int(_month))
347
+            date_str = _year + '-' + _month + '-' + str(max_date[1])
348
+            return date_str
349
+
350
+    @staticmethod
351
+    def get_first_date_of_yesterday():
352
+        # today's date
353
+        now = datetime.date.today()
354
+        # yesterday's date
355
+        yesterday = now - datetime.timedelta(days=1)
356
+        # the first day of yesterday's month
357
+        first_day = datetime.date(yesterday.year, yesterday.month, 1)
358
+        return first_day
359
+
360
+    @staticmethod
361
+    def get_max_date_of_one_year_ago(field):
362
+        """Get the last date of the same month one year before the given month
363
+        """
364
+        if isinstance(field, datetime.date):
365
+            month_str = field.strftime('%Y-%m-%d')
366
+            _year = str(month_str.split('-')[0])
367
+            _month = str(month_str.split('-')[1])
368
+            max_date = calendar.monthrange(int(_year)-1, int(_month))
369
+            date_str = str(int(_year) - 1) + '-' + _month + '-' + str(max_date[1])
370
+            return date_str
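A worked example of Util.calc_ratings_index as documented above: with a rating of 0.8 and a one-year average of 0.5 the index is 0.8 / (0.5 * 0.2) = 8.0, and results are clamped to [1, 10]. The numbers below are made up for illustration:

from fty_util.common import Util

print(Util.calc_ratings_index(0.8, 0.5))   # 8.0
print(Util.calc_ratings_index(3.0, 0.5))   # 30.0, capped to 10.0
print(Util.calc_ratings_index(0.01, 0.5))  # 0.1, floored to 1.0
print(Util.calc_ratings_index(None, 0.5))  # missing input, returns 1.0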

+ 54 - 0
fty_util/config.py

@@ -0,0 +1,54 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+
5
+from __future__ import with_statement
6
+import sys
7
+import os
8
+import datetime
9
+import time
10
+import ConfigParser
11
+
12
+# Read the configuration file
13
+try:
14
+    config = ConfigParser.ConfigParser()
15
+    with open(os.path.expanduser('~') + '/huoju_fty_home/config.cfg', 'r') as f:
16
+        config.readfp(f)
17
+except IOError, e:
18
+    print e
19
+    sys.exit(1)
20
+
21
+class APP_CFG(object):
22
+    
23
+    OFFLINE_CONFIG_HOST = config.get('offline_config', 'host')
24
+    OFFLINE_CONFIG_USER = config.get('offline_config', 'user')
25
+    OFFLINE_CONFIG_PASSWORD = config.get('offline_config', 'password')
26
+    OFFLINE_CONFIG_PORT = config.get('offline_config', 'port')
27
+
28
+    ONLINE_CONFIG_HOST = config.get('online_config', 'host')
29
+    ONLINE_CONFIG_USER = config.get('online_config', 'user')
30
+    ONLINE_CONFIG_PASSWORD = config.get('online_config', 'password')
31
+    ONLINE_CONFIG_PORT = config.get('online_config', 'port')
32
+
33
+    SCRAPY_CONFIG_HOST = config.get('scrapy_config', 'host')
34
+    SCRAPY_CONFIG_USER = config.get('scrapy_config', 'user')
35
+    SCRAPY_CONFIG_PASSWORD = config.get('scrapy_config', 'password')
36
+    SCRAPY_CONFIG_PORT = config.get('scrapy_config', 'port')
37
+
38
+    # # Offline database configuration
39
+    # OFFLINE_CONFIG = config.get('info', 'offline_config')
40
+    # # Online database configuration
41
+    # ONLINE_CONFIG = config.get('info', 'online_config')
42
+
43
+    # # TV station ratings statistics path
44
+    # AD_TELEVISION_STAT_PATH = config.get('info', 'tmp_path') + 'ad_television_stat.txt'
45
+    # # Registration and distribution data path
46
+    # AD_TV_RECORD_DISTRIBUTION_PATH = config.get('info', 'tmp_path') + 'ad_tv_record_distribution.txt'
47
+    # # Trending data path
48
+    # TRENDING_PATH = config.get('info', 'tmp_path') + 'trending.txt'
49
+    # # Ratings prediction path
50
+    # TV_STATION_RATINGS_STAT_PATH = config.get('info', 'tmp_path') + 'tv_station_ratings_stat.txt'
51
+    # # Recommendation results path
52
+    # TV_STATION_RECOMMEND_PATH = config.get('info', 'tmp_path') + 'tv_station_recommend.txt'
53
+    # # Type prediction path
54
+    # TV_STATION_TYPE_STAT_PATH = config.get('info', 'tmp_path') + 'tv_station_type_stat.txt'
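Note that ConfigParser.get() returns strings, so the *_PORT attributes above hold '3306' rather than 3306; whether mysql-connector accepts a string port depends on the version, and config.getint() is the usual way to force an int. A tiny usage sketch of the class (illustrative only):

from fty_util.config import APP_CFG

print(APP_CFG.ONLINE_CONFIG_HOST)        # host string from config.cfg
print(type(APP_CFG.ONLINE_CONFIG_PORT))  # <type 'str'> under Python 2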

+ 141 - 0
idl_ad_pub_station_stats.py

@@ -0,0 +1,141 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: per-station statistics of counts by type and of ratings
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import time
9
+import datetime
10
+import numpy as np
11
+from fty_util.common import Mysql
12
+
13
+tv_data = {}
14
+tv_data2 = {}
15
+tv_station = {}
16
+tv_station_type = {}
17
+channel_type = {}
18
+result_rate = []
19
+result_type = []
20
+result_channel = []
21
+
22
+conn = Mysql.createOfflineConn()
23
+
24
+sql = "select tv_name,channel,audience_rating,tv_date from odl.ad_television where theater_attribute='黄金剧场'"
25
+data = Mysql.selectAll(sql, conn=conn)
26
+
27
+sql_tv = "select tv_id,tv_name,theme,second_type,decade,first_type from odl.ad_tv_lib where is_use=1"
28
+tmp_data = Mysql.selectAll(sql_tv, conn=conn)
29
+for i in range(len(tmp_data)):
30
+	tv_id = tmp_data[i][0]
31
+	tv_name = tmp_data[i][1]
32
+	theme = tmp_data[i][2]
33
+	type2 = tmp_data[i][3]
34
+	decade = tmp_data[i][4]
35
+	type1 = tmp_data[i][5]
36
+	if type1 and type2:
37
+		tv_data[tv_name] = [tv_id,theme,type2,decade]
38
+		tv_data2[tv_name] = [tv_id,type1,type2]
39
+
40
+#Aggregate each station's ratings by month
41
+for i in range(len(data)):
42
+	tv_name = data[i][0]
43
+	channel = data[i][1]
44
+	aud_rating = data[i][2]
45
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
46
+
47
+	tv_station.setdefault(channel,{})
48
+	tv_station[channel].setdefault(tv_date,[])
49
+	tv_station[channel][tv_date].append(aud_rating)
50
+
51
+	channel_type.setdefault(channel,{})
52
+	channel_type[channel].setdefault(tv_date,{})
53
+
54
+	if tv_data2.get(tv_name):
55
+		tv_arr = tv_data2[tv_name][1:-1]
56
+		for level in range(len(tv_arr)):
57
+			channel_type[channel][tv_date].setdefault(level,{})
58
+			ty = tv_arr[level]
59
+			if ty:
60
+				type_arr = ty.split(u' ')
61
+				for tt in type_arr:
62
+					if len(tt):
63
+						channel_type[channel][tv_date][level].setdefault(tt,[])
64
+						channel_type[channel][tv_date][level][tt].append(aud_rating)
65
+
66
+for channel,value in channel_type.items():
67
+	for tv_date in value:
68
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
69
+		val = value[tv_date]
70
+		for level,v_obj in val.items():
71
+			for k,v in v_obj.items():
72
+				avg = sum(v)/len(v)
73
+				result_channel.append((channel,k,avg,level+1,date))
74
+
75
+for channel,value in tv_station.items():
76
+	for tv_date in value:
77
+		tmp_arr = value[tv_date]
78
+		avg_rating = sum(tmp_arr)/len(tmp_arr)
79
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
80
+		result_rate.append((channel,avg_rating,date))
81
+
82
+#Count the dramas aired per station, by type
83
+for i in range(len(data)):
84
+	tv_name = data[i][0]
85
+	channel = data[i][1]
86
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
87
+
88
+	if tv_data.get(tv_name):
89
+		tv_id = tv_data[tv_name][0]
90
+		type1 = tv_data[tv_name][1]
91
+		type2 = tv_data[tv_name][2]
92
+		decade = tv_data[tv_name][3]
93
+		type_arr = type2.split(u' ') if type2 else []
94
+		tv_station_type.setdefault(channel,{})
95
+		tv_station_type[channel].setdefault(tv_date,{"type1":{},"type2":{}})
96
+		tv_station_type[channel][tv_date]['type1'].setdefault(type1,{})
97
+		tv_station_type[channel][tv_date]['type1'][type1][tv_id] = decade
98
+		for t2 in type_arr:
99
+			if len(t2):
100
+				tv_station_type[channel][tv_date]['type2'].setdefault(t2,{})
101
+				tv_station_type[channel][tv_date]['type2'][t2][tv_id] = decade
102
+
103
+for channel,value in tv_station_type.items():
104
+	for tv_date in value:
105
+		type1_obj = value[tv_date]['type1']
106
+		type2_obj = value[tv_date]['type2']
107
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
108
+		for t1,v1 in type1_obj.items():
109
+			for con in v1:
110
+				t1_arr = [channel,t1,'1',con,date,v1[con]]
111
+				result_type.append(t1_arr)
112
+		for t2,v2 in type2_obj.items():
113
+			for con in v2:
114
+				t2_arr = [channel,t2,'2',con,date,v2[con]]
115
+				result_type.append(t2_arr)
116
+
117
+delete = 'truncate table idl.ad_pub_station_rate_stats'
118
+Mysql.execute(delete, conn=conn)
119
+
120
+sql_rate = 'insert into idl.ad_pub_station_rate_stats(channel,avg_rating,date) values(%s,%s,%s)'
121
+for i in range(int(len(result_rate)/1000)+1):
122
+	tmp = result_rate[i*1000:(i+1)*1000]
123
+	Mysql.insertMany(sql_rate, tmp, conn=conn)
124
+
125
+delete = 'truncate table idl.ad_pub_station_type_stats'
126
+Mysql.execute(delete, conn=conn)
127
+
128
+sql_type = 'insert into idl.ad_pub_station_type_stats(channel,type,level,tv_id,date,decade) values(%s,%s,%s,%s,%s,%s)'
129
+for i in range(int(len(result_type)/1000)+1):
130
+	tmp = result_type[i*1000:(i+1)*1000]
131
+	Mysql.insertMany(sql_type, tmp, conn=conn)
132
+
133
+delete = 'truncate table idl.ad_pub_station_type_rate'
134
+Mysql.execute(delete, conn=conn)
135
+
136
+sql_channel = 'insert into idl.ad_pub_station_type_rate(channel,type,avg_rating,level,date) values(%s,%s,%s,%s,%s)'
137
+for i in range(int(len(result_channel)/1000)+1):
138
+	tmp = result_channel[i*1000:(i+1)*1000]
139
+	Mysql.insertMany(sql_channel, tmp, conn=conn)
140
+
141
+Mysql.close(conn)
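The three insert loops above share one pattern: slice the result list into blocks of 1000 rows and pass each block to Mysql.insertMany, keeping any single executemany() call small. Util.insert_by_chunk in fty_util/common.py does the same with 10000-row blocks; a generic sketch of the pattern (the helper name and block size are illustrative):

def insert_in_chunks(sql, rows, conn, chunk_size=1000):
    # Insert rows in fixed-size blocks instead of one huge statement.
    for start in range(0, len(rows), chunk_size):
        Mysql.insertMany(sql, rows[start:start + chunk_size], conn=conn)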

+ 131 - 0
idl_tv_sr_denoise.py

@@ -0,0 +1,131 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: denoise TV drama ratings data for satellite and local channels
4
+
5
+from __future__ import division
6
+import math
7
+import copy
8
+import mysql.connector
9
+import time
10
+import numpy as np
11
+from fty_util.common import Mysql
12
+
13
+tbs = ['ad_television','area_ad_television']
14
+choose = ['ad','area']
15
+
16
+def Stat(Number): #Number: 0 or 1
17
+	tv_data = []
18
+	tv_play = {}
19
+	tv_station = {}
20
+	dateline = str(time.localtime().tm_year-1)
21
+
22
+	conn = Mysql.createOfflineConn()
23
+
24
+	sql = "select id,tv_name,channel,theater_attribute,epi_num,tv_date,start_time,end_time,audience_rating,avg_rating,market_rating from odl.%s where year(tv_date)>=%s" % (tbs[Number], dateline)
25
+	data = Mysql.selectAll(sql, conn=conn)
26
+
27
+	for i in range(len(data)):
28
+		dd = list(data[i])
29
+		tv_id = data[i][0]
30
+		tv_name = data[i][1]
31
+		channel = data[i][2]
32
+		theater = data[i][3]
33
+		year = data[i][5].year
34
+		if dd[8]>=0 and dd[9]>=0 and dd[10]>=0 and dd[3]: 
35
+			key = (tv_name, channel, theater, year)
36
+			tv_play[tv_id] = dd[1:]
37
+			tv_station.setdefault(key,[[],[]])
38
+			tv_station[key][0].append(tv_id)
39
+			tmp_data = map(float,data[i][8:11])
40
+			tv_station[key][1].append(tmp_data)
41
+
42
+	def fivenum(arr):
43
+		"""Tukey's five number"""
44
+		arr = np.sort(arr)
45
+		res = []
46
+		n = len(arr)
47
+		if n == 0:
48
+			print 'array must not be empty'
49
+			return res
50
+		else:
51
+			n4 = math.floor((n+3)/2)/2
52
+			d = [0,n4-1,(n-1)/2,n-n4,n-1]
53
+			d_floor = [int(math.floor(i)) for i in d]
54
+			d_ceil = [int(math.ceil(i)) for i in d]
55
+			res = list(0.5 * (arr[d_floor]+arr[d_ceil]))
56
+			return res
57
+
58
+	def denoise(arr):
59
+		"""Outlier detection: points outside the interval [Q1-1.5*(Q3-Q1), Q3+1.5*(Q3-Q1)]
60
+		are treated as outliers, where Q1 is the first quartile and Q3 the third quartile"""
61
+		five_arr = fivenum(arr)
62
+		Q1 = five_arr[1]
63
+		Q3 = five_arr[3]
64
+		L1 = Q1-1.5*(Q3-Q1) #lower bound of the interval
65
+		L2 = Q3+1.5*(Q3-Q1) #upper bound of the interval
66
+		res = []
67
+
68
+		for i in range(len(arr)):
69
+			if arr[i]<=L2:
70
+				res.append(arr[i])
71
+			else:
72
+				res.append(-1)
73
+		return res
74
+
75
+	result = []
76
+	for key in tv_station:
77
+		id_arr = tv_station[key][0]
78
+		vv = np.array(tv_station[key][1])
79
+		tmp_arr = np.array(map(denoise,vv.transpose())).transpose()
80
+		for i in range(len(id_arr)):
81
+			tv_id = id_arr[i]
82
+			tmp = [tv_id] + tv_play[tv_id] + map(float,list(tmp_arr[i]))
83
+			result.append(tmp)
84
+
85
+
86
+	#Write into yxb.ad_tv_rating_denoise
87
+	delete = 'delete from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], dateline)
88
+	try:
89
+		Mysql.execute(delete, conn=conn)
90
+	except mysql.connector.errors.ProgrammingError as e:
91
+		pass
92
+
93
+	ind1 = 'DROP INDEX id ON yxb.%s_tv_rating_denoise' % choose[Number]
94
+	ind2 = 'DROP INDEX tv_date ON yxb.%s_tv_rating_denoise' % choose[Number] 
95
+	try:
96
+		Mysql.execute(ind1, conn=conn)
97
+	except mysql.connector.errors.ProgrammingError as e:
98
+		pass
99
+	try:
100
+		Mysql.execute(ind2, conn=conn)
101
+	except mysql.connector.errors.ProgrammingError as e:
102
+		pass
103
+
104
+	sql = 'insert into yxb.%s_tv_rating_denoise' % choose[Number] + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
105
+	for i in range(int(len(result)/1000)+1):
106
+		tmp = result[i*1000:(i+1)*1000]
107
+		Mysql.insertMany(sql, tmp, conn=conn)
108
+
109
+	sql1 = 'CREATE INDEX id ON yxb.%s_tv_rating_denoise(id)' % choose[Number]
110
+	sql2 = 'CREATE INDEX tv_date ON yxb.%s_tv_rating_denoise (tv_date,theater_attribute)' % choose[Number]
111
+	Mysql.execute(sql1, conn=conn)
112
+	Mysql.execute(sql2, conn=conn)
113
+
114
+	#Write into odl.ad_tv_rating_denoise
115
+	delete = 'delete from odl.%s_tv_rating_denoise where year(tv_date)>=%s' % (choose[Number],dateline)
116
+	try:
117
+		Mysql.execute(delete, conn=conn)
118
+	except mysql.connector.errors.ProgrammingError as e:
119
+		pass
120
+
121
+	sql = 'insert into odl.%s_tv_rating_denoise select * from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], choose[Number], dateline)
122
+
123
+	try:
124
+		Mysql.execute(sql, conn=conn)
125
+	except mysql.connector.errors.ProgrammingError as e:
126
+		pass
127
+
128
+	Mysql.close(conn)
129
+
130
+for i in range(0,2):
131
+	Stat(i)
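fivenum() above computes Tukey's five-number summary (minimum, lower hinge, median, upper hinge, maximum) and denoise() replaces values above Q3 + 1.5*(Q3-Q1) with -1. A small self-contained check of the idea on made-up numbers (np.percentile is used here for brevity and interpolates slightly differently than the hinge formula in the script):

import numpy as np

ratings = np.array([0.5, 0.55, 0.58, 0.6, 0.62, 3.0])  # 3.0 is an obvious outlier
q1, q3 = np.percentile(ratings, [25, 75])
upper = q3 + 1.5 * (q3 - q1)
cleaned = [r if r <= upper else -1 for r in ratings]
print(upper, cleaned)  # the 3.0 entry becomes -1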

+ 98 - 0
odl_near_realtime_calc.py

@@ -0,0 +1,98 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""Listener on odl.ad_tv_id_pre data
5
+
6
+Polls the table at a fixed interval
7
+"""
8
+
9
+from fty_util.common import Mysql
10
+import commands
11
+import time
12
+import sys
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+c_list = [',', '、', ',', ';', ';', '/']
18
+def replace_other_character(field):
19
+    if field is None:
20
+        return ''
21
+    if field == u'暂无信息':
22
+        field = ''
23
+    for c in c_list:
24
+        field = field.replace(c, ' ')
25
+    return field
26
+
27
+while True:
28
+    conn = Mysql.createOfflineConn()
29
+    print 'start heartbeat'
30
+    # Pick up unprocessed rows (is_run = 0) from yxb.ad_tv_id_pre
31
+    sql = """
32
+        select tv_id, is_run from yxb.ad_tv_id_pre where is_run = 0
33
+    """
34
+    rows = Mysql.getAll(sql, conn=conn)
35
+
36
+    for row in rows:
37
+        tv_id = row['tv_id']
38
+        is_run = row['is_run']
39
+        if is_run == 0:
40
+            print tv_id
41
+            sql = """
42
+                update yxb.ad_tv_id_pre set is_run = 1 where tv_id = '%s'
43
+            """
44
+            sql = sql % (tv_id)
45
+            Mysql.update(sql, conn=conn)
46
+
47
+            sql = """
48
+                select tv_name, director, scriptwritter, main_actors, types, concat(decade, first_type) as first_type, second_type, \
49
+                description, pub_comp, pub_date, production, \
50
+                cehua, jianzhi, chupin_comp, chupin_date, show_time, decade, first_type, categories from yxb.ad_tv_lib where id = '%s'
51
+            """
52
+            sql = sql % (tv_id)
53
+            row = Mysql.getOne(sql, conn=conn)
54
+            tv_name = row[0]
55
+            director = replace_other_character(row[1])
56
+            scriptwritter = replace_other_character(row[2])
57
+            main_actors = replace_other_character(row[3])
58
+            types = replace_other_character(row[4])
59
+            first_type = replace_other_character(row[5])
60
+            second_type = replace_other_character(row[6])
61
+            description = row[7]
62
+            pub_comp = replace_other_character(row[8])
63
+            pub_date = row[9]
64
+            production = replace_other_character(row[10])
65
+            cehua = replace_other_character(row[11])
66
+            jianzhi = replace_other_character(row[12])
67
+            chupin_comp = replace_other_character(row[13])
68
+            chupin_date = row[14]
69
+            show_time = row[15]
70
+            decade = replace_other_character(row[16])
71
+            theme = replace_other_character(row[17])
72
+
73
+            sql = """
74
+                replace into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
75
+                pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, decade, theme) \
76
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
77
+            """
78
+            value = (tv_id, tv_name, director, scriptwritter, main_actors, types, first_type, second_type, description, pub_comp, pub_date, production, cehua, jianzhi, chupin_comp, chupin_date, show_time, '1', decade, theme)
79
+            Mysql.execute(sql, param=value, conn=conn)
80
+        # TODO: invoke the prediction script
81
+        status, output = commands.getstatusoutput('sh bash_near_real_job.sh ' + str(tv_id))
82
+        if status != 0:
83
+            sql = """
84
+                update yxb.ad_tv_id_pre set is_run = 0 where tv_id = '%s'
85
+            """
86
+            sql = sql % (tv_id)
87
+            Mysql.update(sql, conn=conn)
88
+            print 'near_real_job.sh failed'
89
+            print output
90
+            break
91
+        else:
92
+            sql = """
93
+                delete from yxb.ad_tv_id_pre where tv_id = '%s' and is_run = 1
94
+            """
95
+            sql = sql % (tv_id)
96
+            Mysql.execute(sql, conn=conn)
97
+    Mysql.close(conn)
98
+    time.sleep(60)
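The loop above is a simple polling worker: claim a pending row (is_run 0 -> 1), run the shell job, delete the row on success or release it on failure, then sleep 60 seconds and poll again. A stripped-down sketch of that claim/ack cycle, reusing the Mysql helper and commands module imported above (the function name is illustrative):

def process_pending(conn):
    rows = Mysql.getAll("select tv_id from yxb.ad_tv_id_pre where is_run = 0", conn=conn)
    for row in rows or []:
        tv_id = row['tv_id']
        Mysql.update("update yxb.ad_tv_id_pre set is_run = 1 where tv_id = %s", param=(tv_id,), conn=conn)
        status, output = commands.getstatusoutput('sh bash_near_real_job.sh %s' % tv_id)
        if status != 0:
            # release the claim so the next poll retries this tv_id
            Mysql.update("update yxb.ad_tv_id_pre set is_run = 0 where tv_id = %s", param=(tv_id,), conn=conn)
            break
        Mysql.execute("delete from yxb.ad_tv_id_pre where tv_id = %s and is_run = 1", param=(tv_id,), conn=conn)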

+ 179 - 0
online_ad_tv_sr_pre.py

@@ -0,0 +1,179 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: predict a drama's ratings index and its per-station ratings
4
+
5
+from __future__ import division
6
+import re
7
+import sys
8
+import time
9
+import copy
10
+import datetime
11
+import numpy as np
12
+from fty_util.common import Mysql
13
+
14
+tv_pre = [] #predicted ratings values for the drama
15
+tv_data_linear = {}
16
+tv_data_sim = {}
17
+tv_stage = {}
18
+channel_rate = {}
19
+type_rate = {}
20
+model_var = [{},{},{},{},{}]
21
+rules = u' '
22
+if len(sys.argv) > 1:
23
+	tv_id = int(sys.argv[1])
24
+else:
25
+	print 'please provide a TV drama id'
26
+	sys.exit()
27
+
28
+conn = Mysql.createOfflineConn()
29
+
30
+bound = [0,2]  #fixed minimum and maximum of the ratings index
31
+
32
+sql = 'select * from tmp.ad_tv_sr_pre_var'
33
+tmp_data = Mysql.selectAll(sql, conn=conn)
34
+for i in range(len(tmp_data)):
35
+	var_name = tmp_data[i][1]
36
+	tv_sr_arr = map(float,tmp_data[i][3].strip('[|]').split(','))
37
+	year = tmp_data[i][4]
38
+	var_loc = tmp_data[i][5]-1
39
+
40
+	model_var[var_loc].setdefault(var_name,{})
41
+	model_var[var_loc][var_name][year] = sum(tv_sr_arr)/len(tv_sr_arr)
42
+
43
+sql = 'select tv_id,tv_name,director,scriptwriter,main_actors,filmer,categories from odl.ad_tv_lib where tv_id=%d' % tv_id
44
+tmp_data = Mysql.selectAll(sql, conn=conn)
45
+
46
+
47
+if tmp_data:
48
+	tv_id = int(tmp_data[0][0])
49
+	tv_name = tmp_data[0][1]
50
+	year = datetime.datetime.now().year
51
+	director = tmp_data[0][2] if tmp_data[0][2] else ''
52
+	scriptwriter = tmp_data[0][3] if tmp_data[0][3] else ''
53
+	actors = tmp_data[0][4] if tmp_data[0][4] else ''
54
+	filmer = tmp_data[0][5] if tmp_data[0][5] else ''
55
+	type1 = tmp_data[0][6] if tmp_data[0][6] else ''
56
+	tv_data_linear[tv_id] = [tv_name,director,scriptwriter,actors,filmer,type1,year]
57
+else:
58
+	print 'tv_id:%d is not in ad_tv_lib!' % tv_id
59
+	sys.exit()
60
+
61
+def trans(val):
62
+	res = (val - bound[0])/(bound[1]-bound[0])*10.0
63
+	if res < 1.0:
64
+		res = 1.0
65
+	elif res > 10.0:
66
+		res = 10.0
67
+	return round(res,2)
68
+
69
+def tv_sr_pre(var_arr,year): #array of variable values, one entry per model variable
70
+	'''Linear regression model'''
71
+	coef = np.array([0.2103148,0.5182419,0.7822451,0.4921597,0.3865043,-1.3566513])
72
+	model_avg = [] #last-year average for each variable
73
+	model_val = np.ones(len(var_arr)+1)
74
+	for i in range(len(model_var)):
75
+		tmp_obj = model_var[i]
76
+		sum1,num = 0,0
77
+		for var_name in tmp_obj:
78
+			if year in tmp_obj[var_name]:
79
+				sum1 += tmp_obj[var_name][year]
80
+				num += 1
81
+			elif year-1 in tmp_obj[var_name]:
82
+				sum1 += tmp_obj[var_name][year-1]
83
+				num += 1
84
+		tmp_avg = sum1/num if num else 0
85
+		model_avg.append(tmp_avg)
86
+
87
+	for i in range(len(var_arr)):
88
+		p_arr = var_arr[i].split(u' ')
89
+		for peo in p_arr:
90
+			if peo:
91
+				if model_var[i].has_key(peo):
92
+					if year in model_var[i][peo]:
93
+						model_val[i] = model_var[i][peo][year]
94
+					else:
95
+						max_year = max(model_var[i][peo].keys())
96
+						model_val[i] = model_var[i][peo][max_year]
97
+				else:
98
+					model_val[i] = model_avg[i] #fall back to the average when the variable is not in the database
99
+	result = np.dot(coef,model_val)
100
+	return result
101
+
102
+now = datetime.datetime.now()
103
+aDay = datetime.timedelta(days=-365)
104
+date_line = (now + aDay).date()
105
+#Default window: the year preceding the current date
106
+
107
+
108
+sql = "select * from idl.ad_pub_station_rate_stats"
109
+station_data = Mysql.selectAll(sql, conn=conn)
110
+
111
+
112
+sql = "select * from tmp.ad_pub_station_type_rate"
113
+type_data = Mysql.selectAll(sql, conn=conn)
114
+
115
+for i in range(len(station_data)):
116
+	channel = station_data[i][1]
117
+	aud_rating = station_data[i][2]
118
+	tv_date = station_data[i][3]
119
+	
120
+	channel_rate.setdefault(channel,[])
121
+	if tv_date >= date_line:
122
+		channel_rate[channel].append(aud_rating)
123
+
124
+for i in range(len(type_data)):
125
+	channel = type_data[i][1]
126
+	Type = type_data[i][2]
127
+	aud_rating = type_data[i][3]
128
+	tv_date = type_data[i][4]
129
+	
130
+	type_rate.setdefault(channel,{})
131
+	type_rate[channel].setdefault(Type,[])
132
+	if tv_date >= date_line:
133
+		type_rate[channel][Type].append(aud_rating)
134
+
135
+tv2type = copy.deepcopy(type_rate)
136
+for channel,value in type_rate.items():
137
+	for ty,v_arr in value.items():
138
+		tv2type[channel][ty] = sum(v_arr)/len(v_arr) if len(v_arr) else 0.0
139
+
140
+for tv_id in tv_data_linear:
141
+	tv_name = tv_data_linear[tv_id][0]
142
+	var_arr = tv_data_linear[tv_id][1:-1]
143
+	type1 = tv_data_linear[tv_id][5]
144
+	year = tv_data_linear[tv_id][-1]
145
+
146
+	tv_station = {}
147
+	ty_arr = type1.split(u' ')
148
+	for channel in channel_rate:
149
+		tmp,n = 0,0
150
+		value = tv2type.get(channel,{})
151
+		for tt in ty_arr:
152
+			if tt and value.has_key(tt):
153
+				tmp += value[tt]
154
+				n += 1
155
+		tv_station[channel] = tmp/n if n else 0
156
+
157
+	tv_sr = tv_sr_pre(var_arr,year)
158
+	for channel,vv in tv_station.items():
159
+		channel_avg = sum(channel_rate[channel])/len(channel_rate[channel]) if channel_rate[channel] else 0.0
160
+		#判断是否有该类型的平均收视率
161
+		if vv:
162
+			station_rate = 0.6 * tv_sr * channel_avg + 0.4 * vv
163
+		else:
164
+			station_rate = tv_sr * channel_avg
165
+		tv_pre.append((tv_id,tv_name,str(tv_sr),str(trans(tv_sr)),channel,str(station_rate)))
166
+
167
+delete = 'delete from idl.ad_tv_sr_pre where tv_id=%d' % tv_id
168
+Mysql.execute(delete, conn=conn)
169
+
170
+
171
+sql = 'insert into idl.ad_tv_sr_pre(tv_id,tv_name,tv_sr_pre,tv_sr_pre_trans,channel,station_tv_pre) values(%s,%s,%s,%s,%s,%s)'
172
+Mysql.insertMany(sql, tv_pre, conn=conn)
173
+
174
+
175
+sql = 'update tv_lib.gc_tv_series ts set ts.point = %s where ts.tv_id = %s' %(str(tv_pre[0][3]),tv_id)
176
+Mysql.update(sql, conn=conn)
177
+
178
+Mysql.close(conn)
179
+
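Editor's note: the numeric core of online_ad_tv_sr_pre.py is small once the database lookups are stripped away: a fixed-coefficient linear model over five averaged variables plus an intercept, followed by the trans() rescaling from the raw [0, 2] range onto the 1-10 index. A minimal standalone sketch is below; the per-variable averages are made up for illustration (in the script they come from tmp.ad_tv_sr_pre_var), while the coefficients and the rescaling rule are copied from the diff above.

import numpy as np

BOUND = (0, 2)  # same fixed range the script uses

def trans(val, bound=BOUND):
    # rescale a raw prediction in [0, 2] onto [1, 10], clamping at both ends
    res = (val - bound[0]) / float(bound[1] - bound[0]) * 10.0
    return round(min(max(res, 1.0), 10.0), 2)

# coefficients copied from tv_sr_pre(): director, scriptwriter, actors,
# filmer, category, plus a trailing intercept term (model_val ends in 1.0)
coef = np.array([0.2103148, 0.5182419, 0.7822451, 0.4921597, 0.3865043, -1.3566513])
model_val = np.array([0.9, 1.1, 1.3, 1.0, 1.2, 1.0])  # hypothetical per-variable averages

tv_sr = float(np.dot(coef, model_val))
print(tv_sr, trans(tv_sr))  # raw score and its 1-10 transform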

+ 9 - 0
setup.py

@@ -0,0 +1,9 @@
1
+
2
+from setuptools import setup, find_packages
3
+
4
+setup(
5
+    name="fty_util",
6
+    version='1.0',
7
+    packages=find_packages(where='.'),
8
+    include_package_data=True,
9
+)
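Editor's note: setup.py does nothing beyond packaging fty_util, but that is what lets every task script run "from fty_util.common import Mysql" without path hacks. On a job host the package would typically be installed once from the repo root with "pip install ." (or "python setup.py install" on older environments); the actual deployment step is not shown in this commit, so treat that command as an assumption.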

+ 191 - 0
shell/bash_daily.sh

@@ -0,0 +1,191 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################daily操作################################
14
+
15
+# 电视台近一年平均收视率
16
+echo "执行tmp_year_channel_avg_ratings_stat_by_tv province任务"
17
+python ${HUOJU_FTY_PATH}task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py province
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="任务tmp_year_channel_avg_ratings_stat_by_tv province失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+echo "执行tmp_year_channel_avg_ratings_stat_by_tv area任务"
26
+python ${HUOJU_FTY_PATH}task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py area
27
+if [ $? -ne 0 ];
28
+    then
29
+        content="任务tmp_year_channel_avg_ratings_stat_by_tv area失败"
30
+        echo $content
31
+    exit 1
32
+fi
33
+
34
+# 按月统计电视剧的收视情况
35
+echo "执行tmp_tv_avg_ratings_fatt0 province任务"
36
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_fatt0.py province
37
+if [ $? -ne 0 ];
38
+    then
39
+        content="任务tmp_tv_avg_ratings_fatt0 province失败"
40
+        echo $content
41
+    exit 1
42
+fi
43
+echo "执行tmp_tv_avg_ratings_stat province任务"
44
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_stat.py province
45
+if [ $? -ne 0 ];
46
+    then
47
+        content="任务tmp_tv_avg_ratings_stat province失败"
48
+        echo $content
49
+    exit 1
50
+fi
51
+
52
+echo "执行tmp_tv_avg_ratings_fatt0 area任务"
53
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_fatt0.py area
54
+if [ $? -ne 0 ];
55
+    then
56
+        content="任务tmp_tv_avg_ratings_fatt0 area失败"
57
+        echo $content
58
+    exit 1
59
+fi
60
+echo "执行tmp_tv_avg_ratings_stat area任务"
61
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_stat.py area
62
+if [ $? -ne 0 ];
63
+    then
64
+        content="任务tmp_tv_avg_ratings_stat area失败"
65
+        echo $content
66
+    exit 1
67
+fi
68
+
69
+# 电视台对应电视剧及类型关系数据
70
+echo "执行tmp_tv_category_stat province任务"
71
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_category_stat.py province
72
+if [ $? -ne 0 ];
73
+    then
74
+        content="任务tmp_tv_category_stat province失败"
75
+        echo $content
76
+    exit 1
77
+fi
78
+echo "执行tmp_tv_category_stat area任务"
79
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_category_stat.py area
80
+if [ $? -ne 0 ];
81
+    then
82
+        content="任务tmp_tv_category_stat area失败"
83
+        echo $content
84
+    exit 1
85
+fi
86
+
87
+# 同步tv_avg_ratings_stat
88
+echo "执行idl_tv_avg_ratings_stat province任务"
89
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_avg_ratings_stat.py province
90
+if [ $? -ne 0 ];
91
+    then
92
+        content="任务idl_tv_avg_ratings_stat province失败"
93
+        echo $content
94
+    exit 1
95
+fi
96
+echo "执行idl_tv_avg_ratings_stat area任务"
97
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_avg_ratings_stat.py area
98
+if [ $? -ne 0 ];
99
+    then
100
+        content="任务idl_tv_avg_ratings_stat area失败"
101
+        echo $content
102
+    exit 1
103
+fi
104
+
105
+# 同步tv_category_stat
106
+echo "执行idl_tv_category_stat province任务"
107
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_category_stat.py province
108
+if [ $? -ne 0 ];
109
+    then
110
+        content="任务idl_tv_category_stat province失败"
111
+        echo $content
112
+    exit 1
113
+fi
114
+echo "执行idl_tv_category_stat area任务"
115
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_category_stat.py area
116
+if [ $? -ne 0 ];
117
+    then
118
+        content="任务idl_tv_category_stat area失败"
119
+        echo $content
120
+    exit 1
121
+fi
122
+
123
+# 同步year_channel_avg_ratings_stat
124
+echo "执行idl_year_channel_avg_ratings_stat province任务"
125
+python ${HUOJU_FTY_PATH}task_idl/idl_year_channel_avg_ratings_stat.py province
126
+if [ $? -ne 0 ];
127
+    then
128
+        content="任务idl_year_channel_avg_ratings_stat province失败"
129
+        echo $content
130
+    exit 1
131
+fi
132
+echo "执行idl_year_channel_avg_ratings_stat area任务"
133
+python ${HUOJU_FTY_PATH}task_idl/idl_year_channel_avg_ratings_stat.py area
134
+if [ $? -ne 0 ];
135
+    then
136
+        content="任务idl_year_channel_avg_ratings_stat area失败"
137
+        echo $content
138
+    exit 1
139
+fi
140
+
141
+# 同步营销文章爬取数量
142
+echo "执行idl_tv_article_marketing_count任务"
143
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_article_marketing_count.py
144
+if [ $? -ne 0 ];
145
+    then
146
+        content="任务idl_tv_article_marketing_count失败"
147
+        echo $content
148
+    exit 1
149
+fi
150
+
151
+# 同步营销文章爬取链接
152
+echo "执行idl_tv_article_marketing_detail任务"
153
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_article_marketing_detail.py
154
+if [ $? -ne 0 ];
155
+    then
156
+        content="任务idl_tv_article_marketing_detail失败"
157
+        echo $content
158
+    exit 1
159
+fi
160
+
161
+# 类型转换
162
+echo "执行transform_categories任务"
163
+python ${HUOJU_FTY_PATH}task_other/transform_categories.py
164
+if [ $? -ne 0 ];
165
+    then
166
+        content="任务transform_categories失败"
167
+        echo $content
168
+    exit 1
169
+fi
170
+
171
+# yxb字段清理
172
+echo "执行ad_tv_lib_clean任务"
173
+python ${HUOJU_FTY_PATH}task_yxb/ad_tv_lib_clean.py
174
+if [ $? -ne 0 ];
175
+    then
176
+        content="任务ad_tv_lib_clean失败"
177
+        echo $content
178
+    exit 1
179
+fi
180
+
181
+# 排名
182
+echo "执行idl_rank_update任务"
183
+python ${HUOJU_FTY_PATH}task_other/idl_rank_update.py
184
+if [ $? -ne 0 ];
185
+    then
186
+        content="idl_rank_update失败"
187
+        echo $content
188
+    exit 1
189
+fi
190
+
191
+echo "每天脚本执行完毕"

+ 66 - 0
shell/bash_daily_import.sh

@@ -0,0 +1,66 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################import操作################################
14
+
15
+# odl.ad_tv_lib 增量更新操作
16
+echo "执行odl_ad_tv_lib_insert任务"
17
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_tv_lib_insert.py
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="odl_ad_tv_lib_insert失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+# odl.ad_television 增量更新操作
26
+echo "执行odl_ad_television_incr_update任务"
27
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_television_incr_update.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="odl_ad_television_incr_update失败"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# odl.area_ad_television 增量更新操作
36
+echo "执行odl_area_ad_television_incr_update任务"
37
+python ${HUOJU_FTY_PATH}task_odl/odl_area_ad_television_incr_update.py
38
+if [ $? -ne 0 ];
39
+    then
40
+        content="odl_area_ad_television_incr_update失败"
41
+        echo $content
42
+    exit 1
43
+fi
44
+
45
+# odl.ad_audience_cps_time_incr_update 导入操作
46
+echo "执行odl_ad_audience_cps_time_incr_update任务"
47
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_audience_cps_time_incr_update.py
48
+if [ $? -ne 0 ];
49
+    then
50
+        content="odl_ad_audience_cps_time_incr_update失败"
51
+        echo $content
52
+    exit 1
53
+fi
54
+echo "导入脚本执行完毕"
55
+
56
+# 月份提取
57
+echo "执行tmp_data_month任务"
58
+python ${HUOJU_FTY_PATH}task_tmp/tmp_data_month.py
59
+if [ $? -ne 0 ];
60
+    then
61
+        content="任务tmp_data_month失败"
62
+        echo $content
63
+    exit 1
64
+fi
65
+
66
+echo "月份提取完毕"

+ 55 - 0
shell/bash_job.sh

@@ -0,0 +1,55 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+#########################基础数据###########################
14
+
15
+# 收视率数据去噪
16
+echo "idl_tv_sr_denoise"
17
+python ${HUOJU_FTY_PATH}idl_tv_sr_denoise.py
18
+if [ $? -ne 0 ];
19
+   then
20
+       content="idl_tv_sr_denoise"
21
+       echo $content
22
+   exit 1
23
+fi
24
+
25
+# 相似剧离线计算
26
+echo "tv_outline_recom"
27
+python ${HUOJU_FTY_PATH}tv_outline_recom.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="tv_outline_recom"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# 收视指数预测基础数据
36
+echo "tmp_ad_tv_sr_stat"
37
+python ${HUOJU_FTY_PATH}tmp_ad_tv_sr_stat.py
38
+if [ $? -ne 0 ];
39
+   then
40
+       content="tmp_ad_tv_sr_stat"
41
+       echo $content
42
+   exit 1
43
+fi
44
+
45
+# 发行平台数据统计
46
+echo "idl_ad_pub_station_stats"
47
+python ${HUOJU_FTY_PATH}idl_ad_pub_station_stats.py
48
+if [ $? -ne 0 ];
49
+    then
50
+        content="idl_ad_pub_station_stats"
51
+        echo $content
52
+    exit 1
53
+fi
54
+
55
+echo "脚本执行完毕"

+ 43 - 0
shell/bash_scrapy.sh

@@ -0,0 +1,43 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################scrapy操作################################
14
+
15
+# 新剧营销文章链接爬取
16
+echo "执行scrapy_website_count_new任务"
17
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_website_count_new.py
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="任务scrapy_website_count_new失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+# 营销文章数量爬取
26
+echo "执行scrapy_website_count任务"
27
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_website_count.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="任务scrapy_website_count失败"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# 从爱奇艺上爬取百科上未被爬到的电视剧
36
+echo "执行scrapy_tv_unhandle任务"
37
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_tv_unhandle.py
38
+if [ $? -ne 0 ];
39
+    then
40
+        content="任务scrapy_tv_unhandle失败"
41
+        echo $content
42
+    exit 1
43
+fi

+ 39 - 0
task_clean/odl_ad_tv_record_distribution_update_company_field.py

@@ -0,0 +1,39 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""更新表odl.ad_tv_record_distribution表的theme, first_type, second_type字段,去除空白符
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 清空走势数据
20
+
21
+sql = """
22
+    select id, company, `desc` from odl.ad_tv_record_distribution where id > 5000
23
+"""
24
+
25
+rows = Mysql.getAll(sql, conn=conn)
26
+for row in rows:
27
+    _id = row['id']
28
+    company = row['company']
29
+    desc = row['desc']
30
+    company = company.replace(u'报备机构:', '').replace(u'报备机构:', '').replace('\r', '').replace('\n', '').replace('\t', '').replace('\'', '\\\'').replace('\"', '\\\"')
31
+    desc = desc.replace(u'内容提要:','').replace(u'内容提要:','').replace('\r', '').replace('\n', '').replace('\t', '').replace('\'', '\\\'').replace('\"', '\\\"')
32
+
33
+    sql = """
34
+        update odl.ad_tv_record_distribution set company = '%s', `desc` = '%s' where id = '%s'
35
+    """
36
+    sql = sql % (company, desc, _id)
37
+    Mysql.execute(sql, conn=conn)
38
+
39
+Mysql.close(conn)
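Editor's note: the clean-up above escapes quotes by hand before interpolating values into the UPDATE statement. Whether the fty_util Mysql helper accepts query parameters is not visible in this commit, but with a plain driver connection the same update needs no manual escaping. A hedged sketch using pymysql, with placeholder connection settings and sample values:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='user', password='pass', charset='utf8')
row_id = 5001
company = u'某影视文化有限公司'
desc = u"内容提要里的引号 \" 和 ' 不再需要手工转义"

try:
    with conn.cursor() as cur:
        cur.execute(
            "update odl.ad_tv_record_distribution set company = %s, `desc` = %s where id = %s",
            (company, desc, row_id),  # the driver quotes and escapes the values
        )
    conn.commit()
finally:
    conn.close()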

+ 41 - 0
task_clean/odl_ad_tv_record_distribution_update_theme_field.py

@@ -0,0 +1,41 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""更新表odl.ad_tv_record_distribution表的theme, first_type, second_type字段,去除空白符
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 查询需要清洗的theme、first_type、second_type字段
20
+
21
+sql = """
22
+    select id, theme, first_type, second_type from odl.ad_tv_record_distribution where LENGTH(theme) > 12
23
+"""
24
+
25
+rows = Mysql.getAll(sql, conn=conn)
26
+for row in rows:
27
+    _id = row['id']
28
+    theme = row['theme']
29
+    first_type = row['first_type']
30
+    second_type = row['second_type']
31
+    theme = theme.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
32
+    first_type = first_type.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
33
+    second_type = second_type.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
34
+
35
+    sql = """
36
+        update odl.ad_tv_record_distribution set theme = '%s', first_type = '%s', second_type = '%s' where id = '%s'
37
+    """
38
+    sql = sql % (theme, first_type, second_type, _id)
39
+    Mysql.execute(sql, conn=conn)
40
+
41
+Mysql.close(conn)

+ 47 - 0
task_clean/scrapy_category_clean.py

@@ -0,0 +1,47 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+流程:爬取爱奇艺数据,爬取腾讯数据 -> 将两个分类去重合并 -> 将分类处理到关联表中,每个tv_id对应一个分类
7
+"""
8
+
9
+import datetime
10
+import os
11
+import sys
12
+import time
13
+
14
+from fty_util.common import Mysql
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+# 查询爬取到的爱奇艺分类和腾讯视频分类
22
+sql = """
23
+    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
24
+"""
25
+rows = Mysql.getAll(sql, conn=conn)
26
+
27
+for row in rows:
28
+    _id = row['id']
29
+    tv_name = row['tv_name']
30
+    iqiyi_types = row['iqiyi_types']
31
+    tengxun_types = row['tengxun_types']
32
+    all_types = set()
33
+    if iqiyi_types is not None and len(iqiyi_types) > 0:
34
+        for iqiyi_type in iqiyi_types.split(' '):
35
+            all_types.add(iqiyi_type)
36
+
37
+    if tengxun_types is not None and len(tengxun_types) > 0:
38
+        for tengxun_type in tengxun_types.split(' '):
39
+            all_types.add(tengxun_type)
40
+
41
+    sql = """
42
+        update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
43
+    """
44
+    sql = sql % (' '.join(all_types), _id)
45
+    Mysql.execute(sql, conn=conn)
46
+
47
+Mysql.close(conn)

+ 63 - 0
task_clean/scrapy_category_update.py

@@ -0,0 +1,63 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+
20
+# 查询爬取到的爱奇艺分类和腾讯视频分类
21
+sql = """
22
+    select id, types from scrapy.tv_category_scrapy
23
+"""
24
+rows = Mysql.getAll(sql, conn=conn)
25
+
26
+for row in rows:
27
+    _id = row['id']
28
+    types = row['types']
29
+
30
+    if types is not None and len(types) > 0:
31
+        types_set = set()
32
+        for _type in types.split(' '):
33
+            if _type == '生活':
34
+                continue
35
+            elif _type == '军旅' or _type == '军事':
36
+                types_set.add('军旅')
37
+            elif _type == '惊悚' or _type == '恐怖':
38
+                types_set.add('恐怖')
39
+            elif _type == '魔幻' or _type == '奇幻':
40
+                types_set.add('奇幻')
41
+            elif _type == '偶像' or _type == '时装':
42
+                types_set.add('偶像')
43
+            elif _type == '喜剧' or _type == '搞笑':
44
+                types_set.add('喜剧')
45
+            elif _type == '悬疑' or _type == '冒险' or _type == '侦探':
46
+                types_set.add('悬疑')
47
+            elif _type == '言情' or _type == '情感' or _type == '爱情':
48
+                types_set.add('情感')
49
+            elif _type == '战争' or _type == '抗日' or _type == '革命':
50
+                types_set.add('战争')
51
+            elif _type == '警匪' or _type == '犯罪' or _type == '刑侦':
52
+                types_set.add('罪案')
53
+            elif _type == '传记' or _type == '人物' or _type == '传奇' or _type == '纪实':
54
+                types_set.add('传记')
55
+            else:
56
+                types_set.add(_type)
57
+        sql = """
58
+            update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
59
+        """
60
+        sql = sql % (' '.join(types_set), _id)
61
+        Mysql.execute(sql, conn=conn)
62
+
63
+Mysql.close(conn)
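Editor's note: the elif chain above encodes a fixed synonym table: each raw type maps to a canonical category, '生活' is dropped, and unknown types pass through unchanged. The same rules written as a lookup dict (copied from the script) make the groups easier to extend; this is an equivalent sketch, not a replacement committed here.

# -*- coding: utf-8 -*-
SYNONYMS = {
    u'军事': u'军旅', u'惊悚': u'恐怖', u'魔幻': u'奇幻', u'时装': u'偶像',
    u'搞笑': u'喜剧', u'冒险': u'悬疑', u'侦探': u'悬疑',
    u'言情': u'情感', u'爱情': u'情感',
    u'抗日': u'战争', u'革命': u'战争',
    u'警匪': u'罪案', u'犯罪': u'罪案', u'刑侦': u'罪案',
    u'人物': u'传记', u'传奇': u'传记', u'纪实': u'传记',
}

def normalize(types_str):
    out = set()
    for t in types_str.split(u' '):
        if not t or t == u'生活':
            continue
        out.add(SYNONYMS.get(t, t))  # identity mappings and unknown types pass through
    return u' '.join(out)

print(normalize(u'军事 爱情 生活 都市'))  # -> 军旅 情感 都市 (set order may vary)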

+ 55 - 0
task_clean/scrapy_dianshiju_clean.py

@@ -0,0 +1,55 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+流程:爬取爱奇艺数据,爬取腾讯数据 -> 将两个分类去重合并 -> 将分类处理到关联表中,每个tv_id对应一个分类
7
+"""
8
+
9
+import datetime
10
+import os
11
+import sys
12
+import time
13
+
14
+from fty_util.common import Mysql
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+# 爱奇艺数据清洗
20
+def iqiyi_content_clean():
21
+    pass
22
+
23
+# 腾讯数据清洗
24
+def tengxun_content_clean():
25
+    pass
26
+
27
+conn = Mysql.createOfflineConn()
28
+
29
+# 查询爬取到的爱奇艺分类和腾讯视频分类
30
+sql = """
31
+    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
32
+"""
33
+rows = Mysql.getAll(sql, conn=conn)
34
+
35
+for row in rows:
36
+    _id = row['id']
37
+    tv_name = row['tv_name']
38
+    iqiyi_types = row['iqiyi_types']
39
+    tengxun_types = row['tengxun_types']
40
+    all_types = set()
41
+    if iqiyi_types is not None and len(iqiyi_types) > 0:
42
+        for iqiyi_type in iqiyi_types.split(' '):
43
+            all_types.add(iqiyi_type)
44
+
45
+    if tengxun_types is not None and len(tengxun_types) > 0:
46
+        for tengxun_type in tengxun_types.split(' '):
47
+            all_types.add(tengxun_type)
48
+
49
+    sql = """
50
+        update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
51
+    """
52
+    sql = sql % (' '.join(all_types), _id)
53
+    Mysql.execute(sql, conn=conn)
54
+
55
+Mysql.close(conn)

+ 39 - 0
task_clean/tv_category_relation.py

@@ -0,0 +1,39 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""将tv_category_scrapy表中的分类数据(多个)分割存到分类关联表中,记录为tv_id - category
5
+
6
+"""
7
+
8
+import sys
9
+import os
10
+import datetime
11
+from fty_util.common import Mysql
12
+import time
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+conn = Mysql.createOfflineConn()
18
+# 查询types字段非空的分类数据,按id升序处理
19
+
20
+sql = """
21
+    select id, types from scrapy.tv_category_scrapy where types is not null and types != '' order by id asc
22
+"""
23
+
24
+rows = Mysql.getAll(sql, conn=conn)
25
+for row in rows:
26
+    _id = row['id']
27
+    types = row['types']
28
+
29
+    type_list = types.split(' ')
30
+    
31
+    sql_insert = """
32
+        insert into odl.tv_category_relation (tv_id, category) values (%s, %s)
33
+    """
34
+    data_list = []
35
+    for _type in type_list:
36
+        data_list.append((_id, _type))
37
+    Mysql.insertMany(sql_insert, data_list, conn)
38
+
39
+Mysql.close(conn)

+ 297 - 0
task_clean/update_date.py

@@ -0,0 +1,297 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台收视率统计
5
+
6
+"""
7
+import datetime
8
+import re
9
+import sys
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+def parse_date(field, date_format):
15
+    """
16
+    日期转换
17
+    """
18
+    time_format = datetime.datetime.strptime(field, date_format)
19
+    time_format = time_format.strftime(u'%Y-%m-%d')
20
+    return time_format
21
+
22
+def parse_field(field):
23
+    """
24
+    处理字段,除了p_detail字段
25
+    """
26
+    if field is None or len(field) == 0:
27
+        return ""
28
+    else:
29
+        field = strQ2B(field)
30
+        return field.replace(' ', '')
31
+
32
+def strQ2B(ustring):
33
+    """
34
+    全角转半角
35
+    """
36
+    tstring = ""
37
+    for uchar in ustring:
38
+        inside_code = ord(uchar)
39
+        # 全角空格直接转换
40
+        if inside_code == 12288:
41
+            inside_code = 32
42
+        if inside_code == 160:
43
+            inside_code = 32
44
+        # 全角字符(除空格)根据关系转化
45
+        elif (inside_code >= 65281 and inside_code <= 65374):
46
+            inside_code -= 65248
47
+        
48
+        tstring += unichr(inside_code)
49
+    return tstring
50
+
51
+
52
+
53
+from dev_mysql_conn import Mysql
54
+
55
+def update_show_time():
56
+
57
+    conn = Mysql.createOfflineConn()
58
+
59
+    sql = """
60
+        select id, show_time from yxb.ad_tv_lib
61
+    """
62
+    rows = Mysql.getAll(sql, conn=conn)
63
+
64
+    for row in rows:
65
+        _id = row['id']
66
+        show_time = row['show_time']
67
+        if show_time is not None and len(show_time) > 0:
68
+            show_time = parse_field(show_time)
69
+            _str = show_time.decode('utf8')
70
+
71
+            # 格式xxxx年y月d日
72
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
73
+            # p = re.compile(xx)
74
+            # date_list = p.findall(_str)
75
+            # if date_list and len(date_list) > 0:
76
+            #     date = date_list[0]
77
+            #     show_date = parse_date(date, '%Y年%m月%d日')
78
+            #     print show_date
79
+            #     sql = """
80
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
81
+            #     """
82
+            #     sql = sql % (show_date, _id)
83
+            #     Mysql.update(sql, conn=conn)
84
+
85
+            # # 格式xxxx年y月d号
86
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u53f7]"
87
+            # p = re.compile(xx)
88
+            # date_list = p.findall(_str)
89
+            # if date_list and len(date_list) > 0:
90
+            #     date = date_list[0]
91
+            #     show_date = parse_date(date, '%Y年%m月%d号')
92
+            #     print _str
93
+            #     sql = """
94
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
95
+            #     """
96
+            #     sql = sql % (show_date, _id)
97
+            #     # Mysql.update(sql, conn=conn)
98
+
99
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
100
+            # p = re.compile(xx)
101
+            # date_list = p.findall(_str)
102
+            # if date_list and len(date_list) == 1:
103
+            #     print _str
104
+            #     date = date_list[0]
105
+            #     show_date = parse_date(date, '%Y年%m月')
106
+            #     sql = """
107
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
108
+            #     """
109
+            #     sql = sql % (show_date, _id)
110
+            #     # Mysql.update(sql, conn=conn)
111
+
112
+            # # 年月
113
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
114
+            # p = re.compile(xx)
115
+            # date_list = p.findall(_str)
116
+            # if date_list and len(date_list) > 0:
117
+            #     date = date_list[0]
118
+            #     show_time_date = parse_date(date, '%Y年%m月')
119
+            #     print _str
120
+            #     sql = """
121
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
122
+            #     """
123
+            #     sql = sql % (show_time_date, _id)
124
+            #     Mysql.update(sql, conn=conn)
125
+
126
+            # 年
127
+            xx=ur"\d+[\u5e74]"
128
+            p = re.compile(xx)
129
+            date_list = p.findall(_str)
130
+            if date_list and len(date_list) > 0:
131
+                date = date_list[0]
132
+                show_time_date = parse_date(date, '%Y年')
133
+                print _str
134
+                sql = """
135
+                    update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
136
+                """
137
+                sql = sql % (show_time_date, _id)
138
+                Mysql.update(sql, conn=conn)
139
+
140
+    Mysql.close(conn)
141
+
142
+# 更新dates字段
143
+def update_dates():
144
+    conn = Mysql.createOnlineConn()
145
+    sql = """
146
+        select id, dates from yxb.ad_tv_lib
147
+    """
148
+    rows = Mysql.getAll(sql, conn=conn)
149
+    for row in rows:
150
+        _id = row['id']
151
+        dates = row['dates']
152
+        if dates is not None and len(dates) > 0:
153
+            dates = parse_field(dates)
154
+            _str = dates.decode('utf8')
155
+
156
+            # # 年月日
157
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
158
+            # p = re.compile(xx)
159
+            # date_list = p.findall(_str)
160
+            # if date_list and len(date_list) > 0:
161
+            #     date = date_list[0]
162
+            #     dates_date = parse_date(date, '%Y年%m月%d日')
163
+            #     print _str
164
+            #     sql = """
165
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
166
+            #     """
167
+            #     sql = sql % (dates_date, _id)
168
+            #     Mysql.update(sql, conn=conn)
169
+
170
+            # # - -
171
+            # xx=ur"\d+[-]\d+[-]\d+"
172
+            # p = re.compile(xx)
173
+            # date_list = p.findall(_str)
174
+            # if date_list and len(date_list) > 0:
175
+            #     date = date_list[0]
176
+            #     dates_date = parse_date(date, '%Y-%m-%d')
177
+            #     print _str
178
+            #     sql = """
179
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
180
+            #     """
181
+            #     sql = sql % (dates_date, _id)
182
+            #     Mysql.update(sql, conn=conn)
183
+
184
+            # # 年月
185
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
186
+            # p = re.compile(xx)
187
+            # date_list = p.findall(_str)
188
+            # if date_list and len(date_list) > 0:
189
+            #     date = date_list[0]
190
+            #     dates_date = parse_date(date, '%Y年%m月')
191
+            #     print _str
192
+            #     sql = """
193
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
194
+            #     """
195
+            #     sql = sql % (dates_date, _id)
196
+            #     Mysql.update(sql, conn=conn)
197
+
198
+            # 年
199
+            xx=ur"\d+[\u5e74]"
200
+            p = re.compile(xx)
201
+            date_list = p.findall(_str)
202
+            if date_list and len(date_list) > 0:
203
+                date = date_list[0]
204
+                dates_date = parse_date(date, '%Y年')
205
+                print _str
206
+                sql = """
207
+                    update yxb.ad_tv_lib set dates = '%s' where id = '%s'
208
+                """
209
+                sql = sql % (dates_date, _id)
210
+                Mysql.update(sql, conn=conn)
211
+        else:
212
+            sql = """
213
+                update yxb.ad_tv_lib set dates = null where id = '%s'
214
+            """
215
+            sql = sql % (_id)
216
+            Mysql.update(sql, conn=conn)
217
+    Mysql.close(conn)
218
+
219
+def update_chupin_date():
220
+    conn = Mysql.createOnlineConn()
221
+    sql = """
222
+        select id, chupin_date from yxb.ad_tv_lib
223
+    """
224
+    rows = Mysql.getAll(sql, conn=conn)
225
+    for row in rows:
226
+        _id = row['id']
227
+        chupin_date = row['chupin_date']
228
+        if chupin_date is not None and len(chupin_date) > 0:
229
+            chupin_date = parse_field(chupin_date)
230
+            _str = chupin_date.decode('utf8')
231
+
232
+            # # 年月日
233
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
234
+            # p = re.compile(xx)
235
+            # date_list = p.findall(_str)
236
+            # if date_list and len(date_list) > 0:
237
+            #     date = date_list[0]
238
+            #     chupin_date_date = parse_date(date, '%Y年%m月%d日')
239
+            #     print _str
240
+            #     sql = """
241
+            #         update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
242
+            #     """
243
+            #     sql = sql % (chupin_date_date, _id)
244
+            #     Mysql.update(sql, conn=conn)
245
+
246
+            # # - -
247
+            # xx=ur"\d+[-]\d+[-]\d+"
248
+            # p = re.compile(xx)
249
+            # date_list = p.findall(_str)
250
+            # if date_list and len(date_list) > 0:
251
+            #     date = date_list[0]
252
+            #     dates_date = parse_date(date, '%Y-%m-%d')
253
+            #     print _str
254
+            #     sql = """
255
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
256
+            #     """
257
+            #     sql = sql % (dates_date, _id)
258
+            #     Mysql.update(sql, conn=conn)
259
+
260
+            # # 年月
261
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
262
+            # p = re.compile(xx)
263
+            # date_list = p.findall(_str)
264
+            # if date_list and len(date_list) > 0:
265
+            #     date = date_list[0]
266
+            #     chupin_date_date = parse_date(date, '%Y年%m月')
267
+            #     print _str
268
+            #     sql = """
269
+            #         update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
270
+            #     """
271
+            #     sql = sql % (chupin_date_date, _id)
272
+            #     Mysql.update(sql, conn=conn)
273
+
274
+            # 年
275
+            xx=ur"\d+[\u5e74]"
276
+            p = re.compile(xx)
277
+            date_list = p.findall(_str)
278
+            if date_list and len(date_list) > 0:
279
+                date = date_list[0]
280
+                chupin_date_date = parse_date(date, '%Y年')
281
+                print _str
282
+                sql = """
283
+                    update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
284
+                """
285
+                sql = sql % (chupin_date_date, _id)
286
+                Mysql.update(sql, conn=conn)
287
+        else:
288
+            sql = """
289
+                update yxb.ad_tv_lib set chupin_date = null where id = '%s'
290
+            """
291
+            sql = sql % (_id)
292
+            # Mysql.update(sql, conn=conn)
293
+    Mysql.close(conn)
294
+
295
+if __name__ == '__main__':
296
+    # update_show_time()
297
+    pass
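Editor's note: most of the regex branches in update_date.py are commented out; the live path keeps only the year ('年') pattern. End to end, a raw value goes through full-width-to-half-width conversion, a regex match, and strptime/strftime. A small self-contained check of that path (written in Python 3 syntax, whereas the repo itself is Python 2):

# -*- coding: utf-8 -*-
import datetime
import re

def str_q2b(ustring):
    # full-width to half-width, same rule as strQ2B() above
    out = []
    for ch in ustring:
        code = ord(ch)
        if code in (12288, 160):          # full-width / non-breaking space
            out.append(u' ')
        elif 65281 <= code <= 65374:      # full-width ASCII block
            out.append(chr(code - 65248))
        else:
            out.append(ch)
    return u''.join(out)

raw = u'2015年播出'                      # full-width digits
clean = str_q2b(raw).replace(u' ', u'')
year = re.findall(u'\\d+年', clean)[0]    # '2015年'
print(datetime.datetime.strptime(year, u'%Y年').strftime(u'%Y-%m-%d'))  # 2015-01-01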

+ 53 - 0
task_clean/update_first_type.py

@@ -0,0 +1,53 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台收视率统计
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+conn = Mysql.createOfflineConn()
15
+
16
+sql = """
17
+    select tv_id, tv_name, first_type from odl.ad_tv_lib where is_use = 1 and decade is null
18
+"""
19
+rows = Mysql.getAll(sql, conn=conn)
20
+
21
+for row in rows:
22
+    tv_id = row['tv_id']
23
+    tv_name = row['tv_name']
24
+    first_type = row['first_type']
25
+
26
+    sql = """
27
+        select name, theme from odl.ad_tv_record_distribution where name = '%s'
28
+    """
29
+    sql = sql % (tv_name)
30
+    row = Mysql.getOne(sql, conn=conn)
31
+    if row is not None:
32
+        theme = row[1]
33
+        if theme is not None and len(theme) > 0:
34
+            if first_type is None or len(first_type) == 0:
35
+                first_type = theme
36
+            else:
37
+                decade = theme[:2]
38
+                update_sql =  """
39
+                    update odl.ad_tv_lib set decade = '%s' where tv_id = '%s'
40
+                """
41
+                update_sql = update_sql % (decade, tv_id)
42
+                Mysql.update(update_sql, conn=conn)
43
+                # first_type = theme[:2] + first_type
44
+    # if first_type is not None:
45
+    #     # update_sql =  """
46
+    #     #     update odl.ad_tv_lib set first_type = '%s' where tv_id = '%s'
47
+    #     # """
48
+    #     update_sql =  """
49
+    #         update odl.ad_tv_lib set decade = '%s' where tv_id = '%s'
50
+    #     """
51
+    #     update_sql = update_sql % (first_type, tv_id)
52
+    #     Mysql.update(update_sql, conn=conn)
53
+Mysql.close(conn)

+ 32 - 0
task_idl/idl_ad_tv_record_distribution.py

@@ -0,0 +1,32 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+备案发行数据同步(odl.ad_tv_record_distribution -> idl.ad_tv_record_distribution)
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+sql_comment = """
18
+    truncate table idl.ad_tv_record_distribution
19
+"""
20
+sql = """
21
+    truncate table idl.ad_tv_record_distribution
22
+"""
23
+
24
+Mysql.execute(sql, conn=conn)
25
+
26
+sql = """
27
+    insert into idl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute)
28
+    select tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute from odl.ad_tv_record_distribution
29
+"""
30
+Mysql.execute(sql, conn=conn)
31
+
32
+Mysql.close(conn)

+ 31 - 0
task_idl/idl_tv_article_marketing_count.py

@@ -0,0 +1,31 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""新剧营销文章数量爬取
5
+
6
+"""
7
+
8
+import os
9
+import sys
10
+
11
+from fty_util.common import Mysql, Util
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+first_day = Util.get_first_date_of_yesterday()
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 清空数据
20
+sql = """
21
+    truncate table idl.tv_article_marketing_count
22
+"""
23
+Mysql.execute(sql, conn=conn)
24
+
25
+sql = """
26
+    insert into idl.tv_article_marketing_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count)
27
+    select tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count from scrapy.scrapy_article_count order by tv_id asc
28
+"""
29
+Mysql.execute(sql, conn=conn)
30
+
31
+Mysql.close(conn)

+ 36 - 0
task_idl/idl_tv_article_marketing_detail.py

@@ -0,0 +1,36 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""新剧营销文章链接爬取
5
+
6
+"""
7
+
8
+import os
9
+import sys
10
+
11
+from fty_util.common import Mysql, Util
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+first_day = Util.get_first_date_of_yesterday()
17
+
18
+conn = Mysql.createOfflineConn()
19
+sql = """
20
+    select count(*) as num from scrapy.scrapy_article
21
+"""
22
+row = Mysql.getOne(sql, conn=conn)
23
+if row is not None and row[0] is not None and row[0] > 0:
24
+    # 清空数据
25
+    sql = """
26
+        truncate table idl.tv_article_marketing_detail
27
+    """
28
+    Mysql.execute(sql, conn=conn)
29
+
30
+    sql = """
31
+        insert into idl.tv_article_marketing_detail (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date)
32
+        select tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, date_format(scrapy_date, '%Y-%m-%d') from scrapy.scrapy_article order by id asc
33
+    """
34
+    Mysql.execute(sql, conn=conn)
35
+
36
+Mysql.close(conn)

+ 50 - 0
task_idl/idl_tv_avg_ratings_stat.py

@@ -0,0 +1,50 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""计算每个电视剧的收视率
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+class tv_avg_ratings_stat():
14
+    
15
+    def province(self):
16
+        conn = Mysql.createOfflineConn()
17
+        sql = """
18
+            truncate table idl.tv_avg_ratings
19
+        """
20
+        Mysql.execute(sql, conn=conn)
21
+        sql = """
22
+            insert into idl.tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value)
23
+            select channel, theater_attribute, tv_name, tv_id, tv_date, value from tmp.tv_avg_ratings
24
+        """
25
+        Mysql.execute(sql, conn=conn)
26
+        Mysql.close(conn)
27
+
28
+    def area(self):
29
+        conn = Mysql.createOfflineConn()
30
+        sql = """
31
+            truncate table idl.area_tv_avg_ratings
32
+        """
33
+        Mysql.execute(sql, conn=conn)
34
+        sql = """
35
+            insert into idl.area_tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value)
36
+            select channel, theater_attribute, tv_name, tv_id, tv_date, value from tmp.area_tv_avg_ratings
37
+        """
38
+        Mysql.execute(sql, conn=conn)
39
+        Mysql.close(conn)
40
+
41
+if __name__ == '__main__':
42
+    if len(sys.argv) != 2:
43
+        print '没有输入参数,退出'
44
+        sys.exit(1)
45
+    print 'method name is ' + sys.argv[1]
46
+    obj = tv_avg_ratings_stat()
47
+    try:
48
+        getattr(obj, sys.argv[1])()
49
+    except Exception, e:
50
+        print e
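Editor's note: idl_tv_avg_ratings_stat.py, idl_tv_category_stat.py and idl_year_channel_avg_ratings_stat.py all share this dispatch pattern: the first command-line argument names the method to run, for example "python task_idl/idl_tv_avg_ratings_stat.py province", and getattr() invokes it. Worth noting for the bash wrappers above: the try/except only prints the exception and the process still exits 0, so a failed sync will not trip the "$? -ne 0" checks in bash_daily.sh.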

+ 52 - 0
task_idl/idl_tv_category_stat.py

@@ -0,0 +1,52 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""近一年电视台平均收视率
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql, Util
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+class tv_category_stat():
15
+    def province(self):
16
+        conn = Mysql.createOfflineConn()
17
+        sql = """
18
+            truncate table idl.tv_category_stat
19
+        """
20
+        Mysql.execute(sql, conn=conn)
21
+        # 将数据从tmp库插到idl库
22
+        sql = """
23
+            insert into idl.tv_category_stat (tv_id, category, channel, theater_attribute)
24
+            select tv_id, category, channel, theater_attribute from tmp.tv_category_stat
25
+        """
26
+        Mysql.execute(sql, conn=conn)
27
+        Mysql.close(conn)
28
+
29
+    def area(self):
30
+        conn = Mysql.createOfflineConn()
31
+        sql = """
32
+            truncate table idl.area_tv_category_stat
33
+        """
34
+        Mysql.execute(sql, conn=conn)
35
+        # 将数据从tmp库插到idl库
36
+        sql = """
37
+            insert into idl.area_tv_category_stat (tv_id, category, channel, theater_attribute)
38
+            select tv_id, category, channel, theater_attribute from tmp.area_tv_category_stat
39
+        """
40
+        Mysql.execute(sql, conn=conn)
41
+        Mysql.close(conn)
42
+
43
+if __name__ == '__main__':
44
+    if len(sys.argv) != 2:
45
+        print '没有输入参数,退出'
46
+        sys.exit(1)
47
+    print 'method name is ' + sys.argv[1]
48
+    obj = tv_category_stat()
49
+    try:
50
+        getattr(obj, sys.argv[1])()
51
+    except Exception, e:
52
+        print e

+ 51 - 0
task_idl/idl_year_channel_avg_ratings_stat.py

@@ -0,0 +1,51 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""近一年电视台平均收视率
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql, Util
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+class channel_avg_ratings():
15
+    
16
+    def province(self):
17
+        conn = Mysql.createOfflineConn()
18
+        sql = """
19
+            truncate table idl.tv_channel_avg_ratings
20
+        """
21
+        Mysql.execute(sql, conn=conn)
22
+        sql = """
23
+            insert into idl.tv_channel_avg_ratings (channel, theater_attribute, value)
24
+            select channel, theater_attribute, value from tmp.channel_avg_ratings
25
+        """
26
+        Mysql.execute(sql, conn=conn)
27
+        Mysql.close(conn)
28
+
29
+    def area(self):
30
+        conn = Mysql.createOfflineConn()
31
+        sql = """
32
+            truncate table idl.area_tv_channel_avg_ratings
33
+        """
34
+        Mysql.execute(sql, conn=conn)
35
+        sql = """
36
+            insert into idl.area_tv_channel_avg_ratings (channel, theater_attribute, value)
37
+            select channel, theater_attribute, value from tmp.area_channel_avg_ratings
38
+        """
39
+        Mysql.execute(sql, conn=conn)
40
+        Mysql.close(conn)
41
+
42
+if __name__ == '__main__':
43
+    if len(sys.argv) != 2:
44
+        print '没有输入参数,退出'
45
+        sys.exit(1)
46
+    print 'method name is ' + sys.argv[1]
47
+    obj = channel_avg_ratings()
48
+    try:
49
+        getattr(obj, sys.argv[1])()
50
+    except Exception, e:
51
+        print e

+ 56 - 0
task_odl/odl_ad_audience_cps_time.py

@@ -0,0 +1,56 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+
17
+"""
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+sql = """
22
+    truncate table odl.ad_audience_cps_time
23
+"""
24
+Mysql.execute(sql, conn=conn)
25
+
26
+m = 0
27
+n = 50000
28
+
29
+sql_count = """
30
+    select count(id) from yxb.ad_audience_cps_time
31
+"""
32
+count = 0
33
+try:
34
+    count = Mysql.getOne(sql_count, conn=conn)[0]
35
+except Exception, e:
36
+    print e
37
+    pass
38
+# 每年数据循环导入
39
+while m <= count + n:
40
+    sql = """
41
+        insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
42
+            age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
43
+            edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none) 
44
+        select id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
45
+            age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
46
+            edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none
47
+        from yxb.ad_audience_cps_time
48
+        where timebucket = '全天' and area like 'CSM5%%'
49
+        limit %s, %s
50
+    """
51
+    sql = sql % (m, n)
52
+    print sql
53
+    Mysql.execute(sql, conn=conn)
54
+    m += n
55
+
56
+Mysql.close(conn)
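Editor's note: the copy loop above pages with "limit m, n" but no ORDER BY, so MySQL may return rows in a different order for each chunk and rows can be skipped or duplicated; the same pattern appears again in task_odl/odl_ad_television.py below. Paging on the primary key is the usual fix. A hedged sketch using the same fty_util helpers as the script, with the column list shortened for readability:

from fty_util.common import Mysql

conn = Mysql.createOfflineConn()
last_id = 0
batch = 50000
while True:
    sql = (
        "insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num) "
        "select id, tv_date, type, area, channel, timebucket, total_num "
        "from yxb.ad_audience_cps_time "
        "where id > %d and timebucket = '全天' and area like 'CSM5%%' "
        "order by id limit %d"
    ) % (last_id, batch)
    Mysql.execute(sql, conn=conn)
    row = Mysql.getOne("select max(id) from odl.ad_audience_cps_time", conn=conn)
    if row is None or row[0] is None or row[0] == last_id:
        break  # nothing new copied in this pass
    last_id = row[0]
Mysql.close(conn)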

+ 57 - 0
task_odl/odl_ad_audience_cps_time_incr_update.py

@@ -0,0 +1,57 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+
17
+"""
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+# sql = """
22
+#     truncate table odl.ad_audience_cps_time
23
+# """
24
+# Mysql.execute(sql, conn=conn)
25
+
26
+# m = 0
27
+# n = 50000
28
+
29
+# sql_count = """
30
+#     select count(id) from yxb.ad_audience_cps_time
31
+# """
32
+# count = 0
33
+# try:
34
+#     count = Mysql.getOne(sql_count, conn=conn)[0]
35
+# except Exception, e:
36
+#     print e
37
+#     pass
38
+
39
+sql = """
40
+    select max(tv_date) as max_date from odl.ad_audience_cps_time
41
+"""
42
+row = Mysql.getOne(sql, conn=conn)
43
+max_date = row[0]
44
+
45
+sql = """
46
+    insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
47
+        age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
48
+        edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none) 
49
+    select id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
50
+        age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
51
+        edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none
52
+    from yxb.ad_audience_cps_time
53
+    where tv_date > '%s' and timebucket = '全天' and area like 'CSM5%%'
54
+"""
55
+sql = sql % (max_date)
56
+Mysql.execute(sql, conn=conn)
57
+Mysql.close(conn)

+ 68 - 0
task_odl/odl_ad_television.py

@@ -0,0 +1,68 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+从yxb.ad_television_(2010,2011,2012,2013,2014,2015,2016)
17
+yxb.ad_rating_(2010,2011,2012,2013,2014,2015,2016)
18
+提取数据插入到odl.ad_television表中,作为数据分析来源数据
19
+"""
20
+
21
+conn = Mysql.createOfflineConn()
22
+
23
+sql_comment = """
24
+    truncate table odl.ad_television
25
+"""
26
+sql = """
27
+    truncate table odl.ad_television
28
+"""
29
+try:
30
+    Mysql.execute(sql, conn=conn)
31
+    print '清空odl.ad_television表成功'
32
+except Exception, e:
33
+    print '清空odl.ad_television表出错'
34
+
35
+for year in range(2010, 2017):
36
+    m = 0
37
+    n = 50000
38
+
39
+    sql_count = """
40
+        select count(id) from yxb.ad_television_%s
41
+    """
42
+    sql_count = sql_count % (year)
43
+    count = 0
44
+    try:
45
+        count = Mysql.getOne(sql_count, conn=conn)[0]
46
+    except Exception, e:
47
+        print e
48
+        pass
49
+    # 每年数据循环导入
50
+    while m <= count + n:
51
+        sql = """
52
+            insert into odl.ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
53
+                theater_attribute, property, is_repeat, city, year, area, audience_num, audience_rating, avg_num, avg_rating, \
54
+                market_rating, avg_fans, avg_view_time) \
55
+            select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
56
+            aty.property, aty.is_repeat, aty.city, %s, \
57
+            ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
58
+            from yxb.ad_television_%s aty \
59
+            left join yxb.ad_rating_%s ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
60
+            left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
61
+            limit %s, %s
62
+        """
63
+        sql = sql % (year, year, year, m, n)
64
+        print sql
65
+        Mysql.execute(sql, conn=conn)
66
+        m += n
67
+
68
+Mysql.close(conn)

+ 67 - 0
task_odl/odl_ad_television_incr_update.py

@@ -0,0 +1,67 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据增量更新
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+sql = """
19
+    select max(tv_date) as max_date from odl.ad_television
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_date = row[0]
23
+
24
+# year = datetime.datetime.strptime(max_date, '%Y-%m-%d').year
25
+sql = """
26
+    insert into odl.ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
27
+        theater_attribute, property, is_repeat, city, year, area, audience_num, audience_rating, avg_num, avg_rating, \
28
+        market_rating, avg_fans, avg_view_time) \
29
+    select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
30
+    aty.property, aty.is_repeat, aty.city, substring_index(aty.tv_date, '-', 1), \
31
+    ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
32
+    from yxb.ad_television_2016 aty \
33
+    left join yxb.ad_rating_2016 ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
34
+    left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
35
+    where aty.tv_date > '%s'
36
+"""
37
+sql = sql % (max_date)
38
+Mysql.execute(sql, conn=conn)
39
+
40
+"""
41
+凌晨剧场 0:00 - 6:00
42
+早间剧场:7:00-9:00
43
+上午剧场 9:00-12:00
44
+下午剧场 14:00-18:00
45
+晚间剧场 18:00-24:00
46
+黄金剧场:19:30-21:30
47
+"""
48
+sql = """
49
+    update odl.ad_television
50
+    set theater_attribute = 
51
+    (
52
+    case 
53
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
54
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
55
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
56
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
57
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
58
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
59
+    end
60
+    )
61
+    where tv_date > '%s' and (theater_attribute is null or theater_attribute = '')
62
+"""
63
+sql = sql % (max_date)
64
+
65
+Mysql.execute(sql, conn=conn)
66
+
67
+Mysql.close(conn)
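Editor's note: for reference, the theater_attribute CASE expression above rewritten as a plain function, with the time strings parsed so the comparisons are on datetime.time values rather than unpadded strings (the '24:00:00' upper bound is dropped because every valid time already satisfies it). Illustrative only; the authoritative logic is the SQL in the diff.

import datetime

def theater(start, end):
    t = lambda s: datetime.datetime.strptime(s, '%H:%M:%S').time()
    start, end = t(start), t(end)
    if start >= t('00:00:00') and end <= t('06:00:00'):
        return u'凌晨剧场'
    if start >= t('07:00:00') and end < t('09:00:00'):
        return u'早间剧场'
    if start >= t('09:00:00') and end <= t('12:00:00'):
        return u'上午剧场'
    if start >= t('14:00:00') and end < t('18:00:00'):
        return u'下午剧场'
    if start >= t('19:30:00') and end <= t('21:30:00'):
        return u'黄金剧场'
    if (start >= t('18:00:00') and end < t('19:30:00')) or start > t('21:30:00'):
        return u'晚间剧场'
    return None

print(theater('19:35:00', '21:20:00'))  # 黄金剧场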

+ 65 - 0
task_odl/odl_ad_tv_lib.py

@@ -0,0 +1,65 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""odl.ad_tv_lib表数据导入
5
+
6
+从yxb.ad_tv_lib提取数据插入到odl.ad_tv_lib表中,作为数据分析来源数据
7
+"""
8
+
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+# 清空odl.ad_tv_lib表数据
19
+# sql = """
20
+#     truncate table odl.ad_tv_lib
21
+# """
22
+# Mysql.execute(sql, conn=conn)
23
+
24
+# 电视剧信息表
25
+sql = """
26
+    replace into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
27
+        pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, decade, theme) \
28
+    select id, tv_name, director, scriptwritter, main_actors, types, \
29
+    case when decade is not null and first_type is not null then concat(decade, first_type) else null end as first_type, second_type, \
30
+    description, pub_comp, pub_date, production, \
31
+    cehua, jianzhi, chupin_comp, chupin_date, show_time, is_use, decade, first_type from yxb.ad_tv_lib
32
+"""
33
+Mysql.execute(sql, conn=conn)
34
+
35
+# # 更新字段
36
+# sql = """
37
+#     update odl.ad_tv_lib set first_type = substring_index(replace(first_type, ' ', ','), ',', 1) where first_type is not null
38
+# """
39
+# Mysql.execute(sql, conn=conn)
40
+
41
+
42
+# # 清空odl.ad_tv_lib_filter表数据
43
+# sql = """
44
+#     truncate table odl.ad_tv_lib_filter
45
+# """
46
+# Mysql.execute(sql, conn=conn)
47
+
48
+# # 插入不需要过滤的电视剧
49
+# sql = """
50
+#     insert into odl.ad_tv_lib_filter (tv_id, tv_name) \
51
+#     select distinct tv_id, tv_name from odl.ad_television group by tv_id, tv_name
52
+# """
53
+# Mysql.execute(sql, conn=conn)
54
+
55
+# # 更新odl.ad_tv_lib的is_use字段
56
+# sql = """
57
+#     update odl.ad_tv_lib atl inner join odl.ad_tv_lib_filter atlf \
58
+#     on atlf.tv_id = atl.id or atlf.tv_name = atl.tv_name
59
+#     set atl.is_use = 1
60
+#     where atlf.tv_id is not null or atlf.tv_name is not null
61
+# """
62
+
63
+# Mysql.execute(sql, conn=conn)
64
+
65
+Mysql.close(conn)
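Editor's note on the import above: "replace into" is MySQL's delete-then-insert on key collision, so for any tv_id that already exists in odl.ad_tv_lib the old row is removed and a fresh one written, and columns not named in the column list fall back to their defaults. If the intent is to preserve manually maintained columns on existing rows, "insert ... on duplicate key update" would be the gentler alternative.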

+ 35 - 0
task_odl/odl_ad_tv_lib_insert.py

@@ -0,0 +1,35 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""odl.ad_tv_lib表更新数据
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+# 查询odl.ad_tv_lib表当前最大tv_id,用于增量插入
18
+sql = """
19
+    select max(tv_id) as tv_id from odl.ad_tv_lib
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_tv_id = row[0]
23
+
24
+# 电视剧信息表
25
+sql = """
26
+    insert into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
27
+        pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, categories) \
28
+    select id, tv_name, director, scriptwritter, main_actors, types, first_type, second_type, \
29
+    description, pub_comp, pub_date, production, \
30
+    cehua, jianzhi, chupin_comp, chupin_date, show_time, '0', categories from yxb.ad_tv_lib where id > '%s'
31
+"""
32
+sql = sql % (max_tv_id)
33
+Mysql.execute(sql, conn=conn)
34
+
35
+Mysql.close(conn)

+ 279 - 0
task_odl/odl_ad_tv_record_distribution.py

@@ -0,0 +1,279 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""备案、发行表数据处理
5
+
6
+"""
7
+
8
+import datetime
9
+import re
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+def parse_date(field, date_format):
19
+    """
20
+    日期转换
21
+    """
22
+    time_format = datetime.datetime.strptime(field, date_format)
23
+    time_format = time_format.strftime(u'%Y-%m-%d')
24
+    return time_format
25
+
26
+def parse_field(field):
27
+    """
28
+    处理字段,除了p_detail字段
29
+    """
30
+    if field is None or len(field) == 0:
31
+        return ""
32
+    else:
33
+        field = strQ2B(field)
34
+        return field.replace(' ', '')
35
+
36
+def strQ2B(ustring):
37
+    """
38
+    全角转半角
39
+    """
40
+    tstring = ""
41
+    for uchar in ustring:
42
+        inside_code = ord(uchar)
43
+        # 全角空格直接转换
44
+        if inside_code == 12288:
45
+            inside_code = 32
46
+        if inside_code == 160:
47
+            inside_code = 32
48
+        # 全角字符(除空格)根据关系转化
49
+        elif (inside_code >= 65281 and inside_code <= 65374):
50
+            inside_code -= 65248
51
+        
52
+        tstring += unichr(inside_code)
53
+    return tstring
54
+
55
+
56
+"""
57
+从 odl.dsj_gongshi(电视剧备案数据)和odl.faxing(电视剧发行数据)表中提取数据到odl.ad_tv_record_distribution表中,作为数据分析来源数据
58
+"""
59
+
60
+conn = Mysql.createOfflineConn()
61
+
62
+# 备案、发行表
63
+sql = """
64
+    select id, name, area, theme, company, commit_company, show_date, license_id, form, num, film_date, film_period, `desc`, \
65
+    province_advice, relative_depart_advice, remark, scrapy_url, scrapy_date, scrapy_title, p_detail, scrapy_detail_url, \
66
+    union_company, scriptwriter, director from odl.dsj_gongshi order by id asc
67
+"""
68
+
69
+rows = Mysql.getAll(sql, conn=conn)
70
+
71
+# conn_max = dev_mysql_conn.Mysql()
72
+sql_max = """
73
+    select max(tv_id) tv_id from odl.ad_tv_record_distribution
74
+"""
75
+row_max = Mysql.getOne(sql_max, conn=conn)
76
+max_id = 0
77
+if row_max is not None and len(row_max) > 0:
78
+    max_id = row_max[0]
79
+
80
+for row in rows:
81
+    _id = row['id']
82
+    if _id <= max_id:
83
+        continue
84
+    # 将p_detail字段转换为commit_company、show_date、license_id三个字段
85
+    p_detail = parse_field(row['p_detail'])
86
+    commit_company = ''
87
+    show_date = ''
88
+    license_id = ''
89
+    # 如果p_detail字段为空,则直接处理
90
+    if p_detail is None or len(p_detail) == 0:
91
+        commit_company = parse_field(row['commit_company'])
92
+        show_date = parse_field(row['show_date'])
93
+        show_date = show_date.replace(u'年', '-').replace(u'月', '-')
94
+        show_date = parse_date(show_date, u'%Y-%m-')
95
+        license_id = parse_field(row['license_id'])
96
+    else:
97
+        try:
98
+            str = p_detail.decode('utf8')
99
+            xx=ur"\d+[\u5e74]\d+[\u6708]"
100
+            p = re.compile(xx)
101
+            date = p.findall(str)[0]
102
+            company_license = str.split(date)
103
+
104
+            commit_company = company_license[0]
105
+            show_date = date
106
+            show_date = parse_date(show_date, '%Y年%m月')
107
+            license_id = company_license[-1]
108
+        except Exception, e:
109
+
110
+            pass
111
+    # 原始格式 2016.11
112
+    film_date = parse_field(row['film_date'])
113
+    try:
114
+        film_date_pattern = re.compile('\d+')
115
+        year_month = film_date_pattern.findall(film_date)
116
+        year = year_month[0]
117
+        month = year_month[1]
118
+        film_date = parse_date(str(year) + '.' + str(month), '%Y.%m')
119
+    except Exception, e:
120
+        film_date = ''
121
+
122
+    # film_date = parse_date(film_date, '%Y.%m')
123
+
124
+    film_period = parse_field(row['film_period'])
125
+
126
+    name = parse_field(row['name'])
127
+    area = parse_field(row['area'])
128
+    theme = parse_field(row['theme'])
129
+    company = parse_field(row['company'])
130
+    if company is None or len(company) == 0:
131
+        company = commit_company
132
+    form = parse_field(row['form'])
133
+    num = parse_field(row['num'])
134
+    desc = parse_field(row['desc'])
135
+    province_advice = parse_field(row['province_advice'])
136
+    relative_depart_advice = parse_field(row['relative_depart_advice'])
137
+    remark = parse_field(row['remark'])
138
+    scrapy_url = parse_field(row['scrapy_url'])
139
+    scrapy_date = row['scrapy_date']
140
+    scrapy_title = parse_field(row['scrapy_title'])
141
+    scrapy_detail_url = parse_field(row['scrapy_detail_url'])
142
+    union_company = parse_field(row['union_company'])
143
+    scriptwriter = parse_field(row['scriptwriter'])
144
+    director = parse_field(row['director'])
145
+    # 变更后的名称
146
+    current_name = ''
147
+    # 变更后的公司
148
+    current_company = ''
149
+    # 变更后的集数
150
+    current_num = 0
151
+
152
+    # conn_change_type1 = dev_mysql_conn.Mysql()
153
+    # 电视剧名称变更
154
+    sql_change_type1 = """
155
+        select original_name, current_name from odl.dsj_change where original_name = '%s' and change_type = 1
156
+    """
157
+    sql_change_type1 = sql_change_type1 % (name)
158
+    sql_change_type1_rows = Mysql.getAll(sql_change_type1, conn=conn)
159
+    
160
+    if len(sql_change_type1_rows) == 1:
161
+        current_name = sql_change_type1_rows[0]['current_name']
162
+    elif len(sql_change_type1_rows) > 1:
163
+        # 如果多于一条记录,则存放在dict中
164
+        name_dict = {}
165
+        for row in sql_change_type1_rows:
166
+            original_name = row['original_name']
167
+            current_name = row['current_name']
168
+            name_dict[original_name] = current_name
169
+        while True:
170
+            if len(name_dict) > 1:
171
+                current_name = name_dict.get(name)
172
+                if current_name is None or len(current_name) == 0:
173
+                    current_name = ''
174
+                    break
175
+                del name_dict[name]
176
+            else:
177
+                current_name = name_dict.get(current_name)
178
+                break
179
+
180
+    # conn_change_type2 = dev_mysql_conn.Mysql()
181
+    # 类型2变更
182
+    sql_change_type2 = """
183
+        select name, original_company, current_company from odl.dsj_change where name = '%s' and change_type = 2
184
+    """
185
+    sql_change_type2 = sql_change_type2 % (name)
186
+    sql_change_type2_rows = Mysql.getAll(sql_change_type2, conn=conn)
187
+    
188
+    if len(sql_change_type2_rows) == 1:
189
+        current_company = sql_change_type2_rows[0]['current_company']
190
+    elif len(sql_change_type2_rows) > 1:
191
+        company_dict = {}
192
+        for row in sql_change_type2_rows:
193
+            original_company = row['original_company']
194
+            current_company = row['current_company']
195
+            company_dict[original_company] = current_company
196
+        while True:
197
+            if len(company_dict) > 1:
198
+                current_company = company_dict.get(company)
199
+                if current_company is None or len(current_company) == 0:
200
+                    current_company = ''
201
+                    break
202
+                del company_dict[company]
203
+            else:
204
+                current_company = company_dict.get(current_company)
205
+                break
206
+
207
+
208
+    # conn_change_type3 = dev_mysql_conn.Mysql()
209
+    # 类型3变更
210
+    sql_change_type3 = """
211
+        select name, original_num, current_num from odl.dsj_change where name = '%s' and change_type = 3
212
+    """
213
+    sql_change_type3 = sql_change_type3 % (name)
214
+    sql_change_type3_rows = Mysql.getAll(sql_change_type3, conn=conn)
215
+
216
+    if len(sql_change_type3_rows) == 1:
217
+        current_num = sql_change_type3_rows[0]['current_num']
218
+    elif len(sql_change_type3_rows) > 1:
219
+        num_dict = {}
220
+        for row in sql_change_type3_rows:
221
+            original_num = row['original_num']
222
+            current_num = row['current_num']
223
+            num_dict[original_num] = current_num
224
+        while True:
225
+            if len(num_dict) > 1:
226
+                current_num = num_dict.get(num)
227
+                if current_num is None or len(current_num) == 0:
228
+                    current_num = 0
229
+                    break
230
+                del num_dict[num]
231
+            else:
232
+                current_num = num_dict.get(current_num)
233
+                break
234
+
235
+    if current_name is None or len(current_name) == 0:
236
+        current_name = name
237
+
238
+    # 发行数据查询
239
+    sql_distribution = """
240
+        select name, company, num, pub_date from odl.faxing where name = '%s'
241
+    """
242
+    sql_distribution = sql_distribution % (current_name)
243
+    try:
244
+        # conn_distribution = dev_mysql_conn.Mysql()
245
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
246
+    except Exception, e:
247
+        # conn_distribution = dev_mysql_conn.Mysql()
248
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
249
+    is_distribute = 0
250
+    pub_date = ''
251
+    if len(sql_distribution_rows) >= 1:
252
+        pub_date = sql_distribution_rows[0]['pub_date']
253
+        is_distribute = 1
254
+
255
+    sql_insert = """
256
+        insert into odl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, \
257
+        film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute) values ('%s', '%s', '%s', '%s', '%s', \
258
+        '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
259
+    """
260
+    sql_insert = sql_insert % (_id, name, current_name, area, theme, theme, theme, company, show_date, form, num, film_date, film_period, desc.replace("'", "\\'"),
261
+        scriptwriter, director, pub_date, is_distribute)
262
+
263
+    print sql_insert
264
+    try_times = 0
265
+    while True:
266
+        if try_times > 3:
267
+            break
268
+        try:
269
+            # conn1 = dev_mysql_conn.Mysql()
270
+            Mysql.insertOne(sql_insert, conn=conn)
271
+            try_times = 0
272
+            break
273
+        except Exception, e:
274
+            try_times += 1
275
+            print e
276
+    if try_times > 3:
277
+        break
278
+
279
+Mysql.close(conn)
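
The loop above folds three kinds of changes from odl.dsj_change into each registration row (renamed title, changed production company, changed episode count) before checking odl.faxing for a distribution date. Because each change query filters on the original value, the dict built in the multi-row branch can only ever hold one key, so a chain of renames (A to B, then B to C) is never actually followed to the end. The intent is easier to express as a plain chain walk over a preloaded mapping; a database-free Python 3 sketch, in which resolve_current_name and the sample titles are illustrative:

# -*- coding: utf-8 -*-

def resolve_current_name(name, renames):
    """Follow original -> current rename mappings until the chain ends.

    renames: dict such as {"old title": "new title", ...}, e.g. built once from
    all odl.dsj_change rows with change_type = 1 (illustrative, not the repo API).
    """
    seen = set()
    current = name
    while current in renames and current not in seen:
        seen.add(current)              # protects against accidental rename cycles
        current = renames[current]
    return current

if __name__ == "__main__":
    chain = {u"旧剧名A": u"旧剧名B", u"旧剧名B": u"最终剧名"}
    print(resolve_current_name(u"旧剧名A", chain))   # -> 最终剧名
    print(resolve_current_name(u"未改名剧", chain))  # unchanged

The same walk applies to the company (change_type = 2) and episode-count (change_type = 3) mappings.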

+ 279 - 0
task_odl/odl_ad_tv_record_distribution_insert.py

@@ -0,0 +1,279 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""备案、发行表数据处理
5
+
6
+"""
7
+
8
+import datetime
9
+import re
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+def parse_date(field, date_format):
19
+    """
20
+    日期转换
21
+    """
22
+    time_format = datetime.datetime.strptime(field, date_format)
23
+    time_format = time_format.strftime(u'%Y-%m-%d')
24
+    return time_format
25
+
26
+def parse_field(field):
27
+    """
28
+    处理字段,除了p_detail字段
29
+    """
30
+    if field is None or len(field) == 0:
31
+        return ""
32
+    else:
33
+        field = strQ2B(field)
34
+        return field.replace(' ', '')
35
+
36
+def strQ2B(ustring):
37
+    """
38
+    全角转半角
39
+    """
40
+    tstring = ""
41
+    for uchar in ustring:
42
+        inside_code = ord(uchar)
43
+        # 全角空格直接转换
44
+        if inside_code == 12288:
45
+            inside_code = 32
46
+        if inside_code == 160:
47
+            inside_code = 32
48
+        # 全角字符(除空格)根据关系转化
49
+        elif (inside_code >= 65281 and inside_code <= 65374):
50
+            inside_code -= 65248
51
+        
52
+        tstring += unichr(inside_code)
53
+    return tstring
54
+
55
+
56
+"""
57
+从 odl.dsj_gongshi(电视剧备案数据)和odl.faxing(电视剧发行数据)表中提取数据到odl.ad_tv_record_distribution表中,作为数据分析来源数据
58
+"""
59
+
60
+conn = Mysql.createOfflineConn()
61
+
62
+# 备案、发行表
63
+sql = """
64
+    select id, name, area, theme, company, commit_company, show_date, license_id, form, num, film_date, film_period, `desc`, \
65
+    province_advice, relative_depart_advice, remark, scrapy_url, scrapy_date, scrapy_title, p_detail, scrapy_detail_url, \
66
+    union_company, scriptwriter, director from odl.dsj_gongshi order by id asc
67
+"""
68
+
69
+rows = Mysql.getAll(sql, conn=conn)
70
+
71
+# conn_max = dev_mysql_conn.Mysql()
72
+sql_max = """
73
+    select max(tv_id) tv_id from odl.ad_tv_record_distribution
74
+"""
75
+row_max = Mysql.getOne(sql_max, conn=conn)
76
+max_id = 0
77
+if row_max is not None and len(row_max) > 0:
78
+    max_id = row_max[0]
79
+
80
+for row in rows:
81
+    _id = row['id']
82
+    if _id <= max_id:
83
+        continue
84
+    # 将p_detail字段转换为commit_company、show_date、license_id三个字段
85
+    p_detail = parse_field(row['p_detail'])
86
+    commit_company = ''
87
+    show_date = ''
88
+    license_id = ''
89
+    # 如果p_detail字段为空,则直接处理
90
+    if p_detail is None or len(p_detail) == 0:
91
+        commit_company = parse_field(row['commit_company'])
92
+        show_date = parse_field(row['show_date'])
93
+        show_date = show_date.replace(u'年', '-').replace(u'月', '-')
94
+        show_date = parse_date(show_date, u'%Y-%m-')
95
+        license_id = parse_field(row['license_id'])
96
+    else:
97
+        try:
98
+            detail_str = p_detail.decode('utf8')
99
+            xx = ur"\d+[\u5e74]\d+[\u6708]"
100
+            p = re.compile(xx)
101
+            date = p.findall(detail_str)[0]
102
+            company_license = detail_str.split(date)
103
+
104
+            commit_company = company_license[0]
105
+            show_date = date
106
+            show_date = parse_date(show_date, '%Y年%m月')
107
+            license_id = company_license[-1]
108
+        except Exception, e:
109
+
110
+            pass
111
+    # 原始格式 2016.11
112
+    film_date = parse_field(row['film_date'])
113
+    try:
114
+        film_date_pattern = re.compile('\d+')
115
+        year_month = film_date_pattern.findall(film_date)
116
+        year = year_month[0]
117
+        month = year_month[1]
118
+        film_date = parse_date(str(year) + '.' + str(month), '%Y.%m')
119
+    except Exception, e:
120
+        film_date = ''
121
+
122
+    # film_date = parse_date(film_date, '%Y.%m')
123
+
124
+    film_period = parse_field(row['film_period'])
125
+
126
+    name = parse_field(row['name'])
127
+    area = parse_field(row['area'])
128
+    theme = parse_field(row['theme'])
129
+    company = parse_field(row['company'])
130
+    if company is None or len(company) == 0:
131
+        company = commit_company
132
+    form = parse_field(row['form'])
133
+    num = parse_field(row['num'])
134
+    desc = parse_field(row['desc'])
135
+    province_advice = parse_field(row['province_advice'])
136
+    relative_depart_advice = parse_field(row['relative_depart_advice'])
137
+    remark = parse_field(row['remark'])
138
+    scrapy_url = parse_field(row['scrapy_url'])
139
+    scrapy_date = row['scrapy_date']
140
+    scrapy_title = parse_field(row['scrapy_title'])
141
+    scrapy_detail_url = parse_field(row['scrapy_detail_url'])
142
+    union_company = parse_field(row['union_company'])
143
+    scriptwriter = parse_field(row['scriptwriter'])
144
+    director = parse_field(row['director'])
145
+    # 变更后的名称
146
+    current_name = ''
147
+    # 变更后的公司
148
+    current_company = ''
149
+    # 变更后的集数
150
+    current_num = 0
151
+
152
+    # conn_change_type1 = dev_mysql_conn.Mysql()
153
+    # 电视剧名称变更
154
+    sql_change_type1 = """
155
+        select original_name, current_name from odl.dsj_change where original_name = '%s' and change_type = 1
156
+    """
157
+    sql_change_type1 = sql_change_type1 % (name)
158
+    sql_change_type1_rows = Mysql.getAll(sql_change_type1, conn=conn)
159
+    
160
+    if len(sql_change_type1_rows) == 1:
161
+        current_name = sql_change_type1_rows[0]['current_name']
162
+    elif len(sql_change_type1_rows) > 1:
163
+        # 如果多余一条记录,则存放在dict中
164
+        name_dict = {}
165
+        for row in sql_change_type1_rows:
166
+            original_name = row['original_name']
167
+            current_name = row['current_name']
168
+            name_dict[original_name] = current_name
169
+        while True:
170
+            if len(name_dict) > 1:
171
+                current_name = name_dict.get(name)
172
+                if current_name is None or len(current_name) == 0:
173
+                    current_name = ''
174
+                    break
175
+                del name_dict[name]
176
+            else:
177
+                current_name = name_dict.get(current_name)
178
+                break
179
+
180
+    # conn_change_type2 = dev_mysql_conn.Mysql()
181
+    # 类型2变更
182
+    sql_change_type2 = """
183
+        select name, original_company, current_company from odl.dsj_change where name = '%s' and change_type = 2
184
+    """
185
+    sql_change_type2 = sql_change_type2 % (name)
186
+    sql_change_type2_rows = Mysql.getAll(sql_change_type2, conn=conn)
187
+    
188
+    if len(sql_change_type2_rows) == 1:
189
+        current_company = sql_change_type2_rows[0]['current_company']
190
+    elif len(sql_change_type2_rows) > 1:
191
+        company_dict = {}
192
+        for row in sql_change_type2_rows:
193
+            original_company = row['original_company']
194
+            current_company = row['current_company']
195
+            company_dict[original_company] = current_company
196
+        while True:
197
+            if len(company_dict) > 1:
198
+                current_company = company_dict.get(company)
199
+                if current_company is None or len(current_company) == 0:
200
+                    current_company = ''
201
+                    break
202
+                del company_dict[company]
203
+            else:
204
+                current_company = company_dict.get(current_company)
205
+                break
206
+
207
+
208
+    # conn_change_type3 = dev_mysql_conn.Mysql()
209
+    # 类型3变更
210
+    sql_change_type3 = """
211
+        select name, original_num, current_num from odl.dsj_change where name = '%s' and change_type = 3
212
+    """
213
+    sql_change_type3 = sql_change_type3 % (name)
214
+    sql_change_type3_rows = Mysql.getAll(sql_change_type3, conn=conn)
215
+
216
+    if len(sql_change_type3_rows) == 1:
217
+        current_num = sql_change_type3_rows[0]['current_num']
218
+    elif len(sql_change_type3_rows) > 1:
219
+        num_dict = {}
220
+        for row in sql_change_type3_rows:
221
+            original_num = row['original_num']
222
+            current_num = row['current_num']
223
+            num_dict[original_num] = current_num
224
+        while True:
225
+            if len(num_dict) > 1:
226
+                current_num = num_dict.get(num)
227
+                if current_num is None or len(current_num) == 0:
228
+                    current_num = 0
229
+                    break
230
+                del num_dict[num]
231
+            else:
232
+                current_num = num_dict.get(current_num)
233
+                break
234
+
235
+    if current_name is None or len(current_name) == 0:
236
+        current_name = name
237
+
238
+    # 发行数据查询
239
+    sql_distribution = """
240
+        select name, company, num, pub_date from odl.faxing where name = '%s'
241
+    """
242
+    sql_distribution = sql_distribution % (current_name)
243
+    try:
244
+        # conn_distribution = dev_mysql_conn.Mysql()
245
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
246
+    except Exception, e:
247
+        # conn_distribution = dev_mysql_conn.Mysql()
248
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
249
+    is_distribute = 0
250
+    pub_date = ''
251
+    if len(sql_distribution_rows) >= 1:
252
+        pub_date = sql_distribution_rows[0]['pub_date']
253
+        is_distribute = 1
254
+
255
+    sql_insert = """
256
+        insert into odl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, \
257
+        film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute) values ('%s', '%s', '%s', '%s', '%s', \
258
+        '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
259
+    """
260
+    sql_insert = sql_insert % (_id, name, current_name, area, theme, theme, theme, company, show_date, form, num, film_date, film_period, desc.replace("'", "\\'"),
261
+        scriptwriter, director, pub_date, is_distribute)
262
+
263
+    print sql_insert
264
+    try_times = 0
265
+    while True:
266
+        if try_times > 3:
267
+            break
268
+        try:
269
+            # conn1 = dev_mysql_conn.Mysql()
270
+            Mysql.insertOne(sql_insert, conn=conn)
271
+            try_times = 0
272
+            break
273
+        except Exception, e:
274
+            try_times += 1
275
+            print e
276
+    if try_times > 3:
277
+        break
278
+
279
+Mysql.close(conn)
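
This file appears to be an exact copy of odl_ad_tv_record_distribution.py above, so only one extra note here: the two helpers most worth reusing are strQ2B (full-width to half-width normalisation) and the 年/月 date handling. A self-contained Python 3 sketch of both; str_q2b, parse_year_month and the sample strings are illustrative stand-ins, not the repo's API:

# -*- coding: utf-8 -*-
import datetime
import re

def str_q2b(text):
    """Full-width to half-width conversion, mirroring strQ2B above."""
    out = []
    for ch in text:
        code = ord(ch)
        if code in (12288, 160):            # ideographic / non-breaking space
            code = 32
        elif 65281 <= code <= 65374:        # full-width ASCII block
            code -= 65248
        out.append(chr(code))
    return "".join(out)

def parse_year_month(text):
    """Pull a '2016年11月' style date out of text and normalise it to YYYY-MM-DD."""
    match = re.search(r"\d+年\d+月", text)
    if not match:
        return ""
    return datetime.datetime.strptime(match.group(0), "%Y年%m月").strftime("%Y-%m-%d")

if __name__ == "__main__":
    print(str_q2b(u"ＡＢＣ　１２３"))                              # -> ABC 123
    print(parse_year_month(u"某公司2016年11月(京)剧审字第001号"))  # -> 2016-11-01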

+ 70 - 0
task_odl/odl_area_ad_television.py

@@ -0,0 +1,70 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+从yxb.ad_television_tetv
17
+提取数据插入到odl.area_ad_television 表中,作为数据分析来源数据
18
+"""
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+sql = """
23
+    truncate table odl.area_ad_television
24
+"""
25
+Mysql.execute(sql, conn=conn)
26
+
27
+m = 0
28
+n = 50000
29
+
30
+sql_count = """
31
+    select count(id) from yxb.ad_television_tetv
32
+"""
33
+count = Mysql.getOne(sql_count, conn=conn)[0]
34
+while m <= count + n:
35
+    sql = """
36
+        insert into odl.area_ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
37
+            theater_attribute, property, is_repeat, city, area, audience_num, audience_rating, avg_num, avg_rating, \
38
+            market_rating, avg_fans, avg_view_time) \
39
+        select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
40
+        aty.property, aty.is_repeat, aty.city, \
41
+        ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
42
+        from yxb.ad_television_tetv aty \
43
+        left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
44
+        left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
45
+        limit %s, %s
46
+    """
47
+    sql = sql % (m, n)
48
+    print sql
49
+    Mysql.execute(sql, conn=conn)
50
+    m += n
51
+
52
+sql = """
53
+    update odl.area_ad_television
54
+    set theater_attribute = 
55
+    (
56
+    case 
57
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
58
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
59
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
60
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
61
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
62
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
63
+    end
64
+    )
65
+    where theater_attribute is null or theater_attribute = ''
66
+"""
67
+
68
+Mysql.execute(sql, conn=conn)
69
+
70
+Mysql.close(conn)
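
The copy above deliberately avoids one giant INSERT ... SELECT by re-running it in LIMIT offset, size windows of 50,000 rows; the doubled %% in 'CSM5%%' is needed because the string is later pushed through the % operator. A hedged sketch of the same chunked copy with driver placeholders; chunk size, copy_in_chunks and the abridged column lists are illustrative:

def copy_in_chunks(conn, total_rows, chunk=50000):
    """Re-run the INSERT ... SELECT in LIMIT windows so no single statement is huge."""
    insert_select = (
        "insert into odl.area_ad_television (television_id, tv_id, tv_name /* ... */) "
        "select aty.id, atl.id, aty.tv_name /* ... */ "
        "from yxb.ad_television_tetv aty "
        "left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%' "
        "left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name "
        "order by aty.id "
        "limit %s, %s"
    )
    offset = 0
    with conn.cursor() as cur:
        while offset < total_rows:
            # literal %% survives because the driver formats the %s parameters itself
            cur.execute(insert_select, (offset, chunk))
            offset += chunk
    conn.commit()

Offset pagination without an ORDER BY is not guaranteed to be stable between statements; ordering by aty.id, or chunking on an id range, keeps the windows deterministic.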

+ 67 - 0
task_odl/odl_area_ad_television_incr_update.py

@@ -0,0 +1,67 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据增量更新
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+sql = """
19
+    select max(tv_date) as max_date from odl.area_ad_television
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_date = row[0]
23
+
24
+# year = datetime.datetime.strptime(max_date, '%Y-%m-%d').year
25
+sql = """
26
+    insert into odl.area_ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time,
27
+        theater_attribute, property, is_repeat, city, area, audience_num, audience_rating, avg_num, avg_rating,
28
+        market_rating, avg_fans, avg_view_time)
29
+    select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute,
30
+    aty.property, aty.is_repeat, aty.city, 
31
+    ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time
32
+    from yxb.ad_television_tetv aty
33
+    left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
34
+    left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
35
+    where aty.tv_date > '%s'
36
+"""
37
+sql = sql % (max_date)
38
+Mysql.execute(sql, conn=conn)
39
+
40
+"""
41
+凌晨剧场 0:00 - 6:00
42
+早间剧场:7:00-9:00
43
+上午剧场 9:00-12:00
44
+下午剧场 14:00-18:00
45
+晚间剧场 18:00-24:00
46
+黄金剧场:19:30-21:30
47
+"""
48
+sql = """
49
+    update odl.area_ad_television
50
+    set theater_attribute = 
51
+    (
52
+    case 
53
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
54
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
55
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
56
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
57
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
58
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
59
+    end
60
+    )
61
+    where tv_date > '%s' and (theater_attribute is null or theater_attribute = '')
62
+"""
63
+sql = sql % (max_date)
64
+
65
+Mysql.execute(sql, conn=conn)
66
+
67
+Mysql.close(conn)
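
One edge case in this incremental variant: if odl.area_ad_television is empty, max(tv_date) comes back NULL, and the % formatting turns the WHERE clause into a comparison against the literal string 'None'. A small guard is enough; high_water_mark and the default back-fill date below are illustrative:

import datetime

def high_water_mark(row, default=datetime.date(2000, 1, 1)):
    """Turn the result of `select max(tv_date) ...` into a usable lower bound;
    falls back to a back-fill date when the target table is still empty."""
    if row is None or row[0] is None:
        return default
    return row[0]

# usage sketch, matching the calls above:
#   row = Mysql.getOne("select max(tv_date) from odl.area_ad_television", conn=conn)
#   max_date = high_water_mark(row)
#   ... "where aty.tv_date > %s" bound to max_date instead of "> '%s'" % max_date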

+ 157 - 0
task_other/idl_rank_update.py

@@ -0,0 +1,157 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""排名更新
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+sql = """
18
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
19
+        select channel,tv_name,tv_date,avg_rate,rank,'黄金剧场' from (
20
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
21
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,
22
+		  @pdept:=heyf_tmp.tv_date 
23
+		from(
24
+
25
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
26
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
27
+		from odl.ad_tv_rating_denoise t
28
+		where audience_rating >= 0 
29
+		and (t.start_time >= '19:30:00' and t.end_time <= '21:30:00' and t.start_time <= '21:30:00' and t.end_time >= '19:30:00')
30
+		)a
31
+		GROUP BY channel,tv_name,tv_date
32
+		order by  tv_date,avg_rate DESC
33
+
34
+        )  heyf_tmp , 
35
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
36
+		 ) result
37
+"""
38
+Mysql.execute(sql, conn=conn)
39
+
40
+
41
+sql = """
42
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
43
+        select channel,tv_name,tv_date,avg_rate,rank,'凌晨剧场' from (  
44
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
45
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
46
+		  @pdept:=heyf_tmp.tv_date 
47
+		from(
48
+
49
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
50
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
51
+		from odl.ad_tv_rating_denoise t
52
+		where audience_rating >= 0 
53
+		and (t.start_time >= '00:00:00' and t.end_time <= '06:00:00' and t.start_time <= '06:00:00' and t.end_time >= '00:00:00')
54
+		)a
55
+		GROUP BY channel,tv_name,tv_date
56
+		order by  tv_date,avg_rate DESC
57
+
58
+        )  heyf_tmp , 
59
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
60
+		 ) result
61
+"""
62
+Mysql.execute(sql, conn=conn)
63
+
64
+sql = """
65
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
66
+        select channel,tv_name,tv_date,avg_rate,rank,'早间剧场' from (  
67
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
68
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
69
+		  @pdept:=heyf_tmp.tv_date 
70
+		from(
71
+
72
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
73
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
74
+		from odl.ad_tv_rating_denoise t
75
+		where audience_rating >= 0 
76
+		and (t.start_time >= '07:00:00' and t.end_time <= '09:00:00' and t.start_time <= '09:00:00' and t.end_time >= '07:00:00')
77
+		)a
78
+		GROUP BY channel,tv_name,tv_date
79
+		order by  tv_date,avg_rate DESC
80
+
81
+        )  heyf_tmp , 
82
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
83
+		 ) result
84
+"""
85
+Mysql.execute(sql, conn=conn)
86
+
87
+
88
+sql = """
89
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
90
+        select channel,tv_name,tv_date,avg_rate,rank,'上午剧场' from (  
91
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
92
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
93
+		  @pdept:=heyf_tmp.tv_date 
94
+		from(
95
+
96
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
97
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
98
+		from odl.ad_tv_rating_denoise t
99
+		where audience_rating >= 0 
100
+		and (t.start_time >= '09:00:00' and t.end_time <= '12:00:00' and t.start_time <= '12:00:00' and t.end_time >= '09:00:00')
101
+		)a
102
+		GROUP BY channel,tv_name,tv_date
103
+		order by  tv_date,avg_rate DESC
104
+
105
+        )  heyf_tmp , 
106
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
107
+		 ) result
108
+"""
109
+Mysql.execute(sql, conn=conn)
110
+
111
+sql = """
112
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
113
+        select channel,tv_name,tv_date,avg_rate,rank,'下午剧场' from (  
114
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
115
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
116
+		  @pdept:=heyf_tmp.tv_date 
117
+		from(
118
+
119
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
120
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
121
+		from odl.ad_tv_rating_denoise t
122
+		where audience_rating >= 0 
123
+		and (t.start_time >= '14:00:00' and t.end_time <= '18:00:00' and t.start_time <= '18:00:00' and t.end_time >= '14:00:00')
124
+		)a
125
+		GROUP BY channel,tv_name,tv_date
126
+		order by  tv_date,avg_rate DESC
127
+
128
+        )  heyf_tmp , 
129
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
130
+		 ) result
131
+"""
132
+Mysql.execute(sql, conn=conn)
133
+
134
+sql = """
135
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
136
+        select channel,tv_name,tv_date,avg_rate,rank,'晚间剧场' from (  
137
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
138
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
139
+		  @pdept:=heyf_tmp.tv_date 
140
+		from(
141
+
142
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
143
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
144
+		from odl.ad_tv_rating_denoise t
145
+		where audience_rating >= 0 
146
+		and (t.start_time >= '18:00:00' and t.end_time <= '24:00:00' and t.start_time <= '24:00:00' and t.end_time >= '18:00:00')
147
+		)a
148
+		GROUP BY channel,tv_name,tv_date
149
+		order by  tv_date,avg_rate DESC
150
+
151
+        )  heyf_tmp , 
152
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
153
+		 ) result
154
+"""
155
+Mysql.execute(sql, conn=conn)
156
+
157
+Mysql.close(conn)
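
All six statements above compute the same thing for different time slots: a per-day rating rank built with MySQL user variables (@rownum counts rows, @rank restarts whenever @pdept sees a new tv_date). The logic is easy to sanity-check off-database; a standalone Python 3 sketch with the slot filter reduced to a tiny in-memory sample:

# -*- coding: utf-8 -*-
from itertools import groupby
from operator import itemgetter

def rank_by_date(rows):
    """rows: (channel, tv_name, tv_date, avg_rate) tuples.
    Returns the rows with a rank appended, restarting at 1 for each tv_date,
    mirroring if(@pdept = tv_date, @rank := @rank + 1, @rank := 1) above."""
    ranked = []
    ordered = sorted(rows, key=lambda r: (r[2], -r[3]))   # by date, then rating desc
    for _, group in groupby(ordered, key=itemgetter(2)):
        for rank, row in enumerate(group, start=1):
            ranked.append(row + (rank,))
    return ranked

sample = [
    ("CCTV1",   u"剧A", "2017-05-01", 1.23),
    ("湖南卫视", u"剧B", "2017-05-01", 1.50),
    ("CCTV1",   u"剧C", "2017-05-02", 0.80),
]
for line in rank_by_date(sample):
    print(line)   # 剧B is rank 1 on 05-01, 剧A rank 2; 剧C rank 1 on 05-02

On MySQL 8+ the same result can be written with ROW_NUMBER() OVER (PARTITION BY tv_date ORDER BY avg_rate DESC), which avoids the evaluation-order caveats of user variables.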

+ 96 - 0
task_other/transform_categories.py

@@ -0,0 +1,96 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import datetime
5
+import sys
6
+
7
+from fty_util.common import Mysql
8
+
9
+reload(sys)
10
+sys.setdefaultencoding('utf8')
11
+
12
+conn = Mysql.createOfflineConn()
13
+
14
+cat_dict = {}
15
+# 获取所有标准分类和对应的映射分类
16
+sql = """
17
+    select standard_category, reflect_category from odl.basic_categories
18
+"""
19
+categories = Mysql.getAll(sql, conn=conn)
20
+
21
+for category in categories:
22
+    standard_category =  category['standard_category']
23
+    reflect_category = category['reflect_category']
24
+    cat_dict[reflect_category] = standard_category
25
+
26
+sql = """
27
+    select tv_id, iqiyi_types, iqiyi_types_new, tengxun_types, tengxun_types_new, baike_types, baike_types_new, manual_types, manual_types_new from scrapy.types_analyse where iqiyi_tengxun_after_baike_after_manual is null or iqiyi_tengxun_after_baike_after_manual = '' order by tv_id asc
28
+"""
29
+
30
+rows = Mysql.getAll(sql, conn=conn)
31
+for row in rows:
32
+    tv_id = row['tv_id']
33
+    iqiyi_types = row['iqiyi_types']
34
+    iqiyi_types_new = row['iqiyi_types_new']
35
+    tengxun_types = row['tengxun_types']
36
+    tengxun_types_new = row['tengxun_types_new']
37
+    baike_types = row['baike_types']
38
+    baike_types_new = row['baike_types_new']
39
+    manual_types = row['manual_types']
40
+    manual_types_new = row['manual_types_new']
41
+
42
+    iqiyi_types_set = set()
43
+    if (iqiyi_types_new is None or len(iqiyi_types_new) == 0) and iqiyi_types is not None and len(iqiyi_types) > 0:
44
+        for _type in iqiyi_types.split(' '):
45
+            cate = cat_dict.get(_type)
46
+            if cate is not None:
47
+                iqiyi_types_set.add(cate)
48
+
49
+    tengxun_types_set = set()
50
+    if (tengxun_types_new is None or len(tengxun_types_new) == 0) and tengxun_types is not None and len(tengxun_types) > 0:
51
+        for _type in tengxun_types.split(' '):
52
+            cate = cat_dict.get(_type)
53
+            if cate is not None:
54
+                tengxun_types_set.add(cate)
55
+
56
+    baike_types_set = set()
57
+    if (baike_types_new is None or len(baike_types_new) == 0) and baike_types is not None and len(baike_types) > 0:
58
+        for _type in baike_types.split(' '):
59
+            cate = cat_dict.get(_type)
60
+            if cate is not None:
61
+                baike_types_set.add(cate)
62
+
63
+    manual_types_set = set()
64
+    if (manual_types_new is None or len(manual_types_new) == 0) and manual_types is not None and len(manual_types) > 0:
65
+        for _type in manual_types.split(' '):
66
+            cate = cat_dict.get(_type)
67
+            if cate is not None:
68
+                manual_types_set.add(cate)
69
+
70
+    all_types = set()
71
+    if len(iqiyi_types_set | tengxun_types_set) > 2:
72
+        all_types = iqiyi_types_set | tengxun_types_set
73
+    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set) > 2:
74
+        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set
75
+    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set) > 2:
76
+        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set
77
+
78
+    sql = """
79
+        update scrapy.types_analyse set iqiyi_types_new = %s, tengxun_types_new = %s, baike_types_new = %s, manual_types_new = %s, iqiyi_tengxun_after_baike_after_manual = %s where tv_id = %s
80
+    """
81
+    value = (' '.join(iqiyi_types_set), ' '.join(tengxun_types_set), ' '.join(baike_types_set), ' '.join(manual_types_set), ' '.join(all_types), tv_id)
82
+    Mysql.update(sql, param=value, conn=conn)
83
+
84
+    # 更新 yxb.ad_tv_lib 表
85
+    sql = """
86
+        update yxb.ad_tv_lib set categories = %s where id = %s
87
+    """
88
+    value = (' '.join(all_types), tv_id)
89
+    Mysql.update(sql, param=value, conn=conn)
90
+
91
+    # 更新 odl.ad_tv_lib 表
92
+    sql = """
93
+        update odl.ad_tv_lib set categories = %s where tv_id = %s
94
+    """
95
+    value = (' '.join(all_types), tv_id)
96
+    Mysql.update(sql, param=value, conn=conn)
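
The script above first normalises each source's space-separated type strings through odl.basic_categories, then keeps the smallest cumulative union of sources that yields more than two categories: iqiyi+tengxun, otherwise +baike, otherwise +manual, otherwise empty. A database-free sketch of that selection rule; normalise, merge_categories and the sample mappings are illustrative:

# -*- coding: utf-8 -*-

def normalise(raw_types, cat_dict):
    """Map a space-separated raw type string onto the standard categories."""
    if not raw_types:
        return set()
    return {cat_dict[t] for t in raw_types.split(" ") if t in cat_dict}

def merge_categories(iqiyi, tengxun, baike, manual, threshold=2):
    """Return the first cumulative union holding more than `threshold` categories,
    mirroring the if/elif chain above; empty set if even the full union is too small."""
    for union in (iqiyi | tengxun,
                  iqiyi | tengxun | baike,
                  iqiyi | tengxun | baike | manual):
        if len(union) > threshold:
            return union
    return set()

cat_dict = {u"言情": u"都市情感", u"偶像": u"都市情感", u"谍战": u"谍战", u"战争": u"战争", u"历史": u"历史"}
iqiyi   = normalise(u"言情 偶像", cat_dict)
tengxun = normalise(u"谍战", cat_dict)
baike   = normalise(u"战争 历史", cat_dict)
print(merge_categories(iqiyi, tengxun, baike, set()))   # needs baike to pass the threshold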

+ 286 - 0
task_scrapy/i_t_dsj_all.py

@@ -0,0 +1,286 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_All(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        
25
+        # 需要爬取的列表页面
26
+        start_urls = [
27
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
28
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
29
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
30
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
31
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
32
+            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
33
+            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
34
+            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
35
+        ]
36
+        # 打开Firefox浏览器
37
+        driver = webdriver.Firefox()
38
+        driver.set_page_load_timeout(20)
39
+
40
+        # 数据库连接
41
+        conn = Mysql.createScrapyConn()
42
+        for url in start_urls:
43
+            # 打开主页
44
+            try:
45
+                driver.get(url)
46
+            except:
47
+                driver.execute_script('window.stop()')
48
+            
49
+
50
+            is_next = True
51
+            while is_next:
52
+                
53
+                try:
54
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
55
+                except:
56
+                    continue
57
+                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
58
+                sql_insert = """
59
+                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
60
+                """
61
+                data_list = []
62
+                for li in lis:
63
+                    try:
64
+                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
65
+                        print tv_url
66
+                        data_list.append((tv_url,))                    
67
+                    except Exception, e:
68
+                        print '没有'
69
+                        continue
70
+                    time.sleep(random.uniform(0, 2))
71
+                Mysql.insertMany(sql_insert, data_list, conn)
72
+                try:
73
+                    next_page_text = next_page.find_element_by_xpath('.').text
74
+                    if next_page_text == '下一页':
75
+                        next_page.click()
76
+                    else:
77
+                        is_next = False;
78
+                except:
79
+                    is_next = False;
80
+                time.sleep(10)
81
+        driver.quit()
82
+
83
+    # 爬取具体页面
84
+    def get_iqiyi_detail(self):
85
+        driver = webdriver.Firefox()
86
+        driver.set_page_load_timeout(10)
87
+
88
+        # 数据库连接
89
+        conn = Mysql.createScrapyConn()
90
+
91
+        sql = """
92
+            select max(id) from scrapy.iqiyi_dianshiju_detail
93
+        """
94
+        max_id = Mysql.getOne(sql, conn=conn)
95
+        max_id = max_id[0]
96
+        if max_id is None:
97
+            max_id = 0
98
+        # 获取所有url
99
+        sql = """
100
+            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
101
+        """
102
+        sql = sql % (max_id)
103
+        rows = Mysql.getAll(sql, conn=conn)
104
+        for row in rows:
105
+            _id = row['id']
106
+            url = row['url']
107
+            print url
108
+            try:
109
+                driver.get(url)
110
+            except:
111
+                driver.execute_script('window.stop()')
112
+            
113
+            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
114
+            # 详情html内容
115
+            detail_info_html = detail_info.get_attribute('innerHTML')
116
+            # 详情文本内容
117
+            detail_info_text = detail_info.find_element_by_xpath('.').text
118
+            # 电视剧名称
119
+            tv_name = detail_info.find_element_by_xpath('h1/a').text
120
+
121
+            #存入数据库
122
+            sql = """
123
+                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
124
+            """
125
+            value = (_id, tv_name, detail_info_text, detail_info_html, url)
126
+            Mysql.insertOne(sql, value=value, conn=conn)
127
+            time.sleep(random.uniform(1, 5))
128
+        driver.quit()
129
+
130
+    # 爬取电视剧链接地址
131
+    def get_tengxun_url(self):
132
+        start_urls = [
133
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
134
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
135
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
136
+            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
137
+            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
138
+            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
139
+            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
140
+            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
141
+            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
142
+        ]
143
+        # 打开Firefox浏览器
144
+        driver = webdriver.Firefox()
145
+        driver.set_page_load_timeout(20)
146
+
147
+        # 数据库连接
148
+        conn = Mysql.createScrapyConn()
149
+        for url in start_urls:
150
+            # 打开主页
151
+            try:
152
+                driver.get(url)
153
+            except:
154
+                driver.execute_script('window.stop()')
155
+            
156
+            is_next = True
157
+            while is_next:
158
+                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
159
+                print lis
160
+                sql_insert = """
161
+                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
162
+                """
163
+                data_list = []
164
+                for li in lis:
165
+                    try:
166
+                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
167
+                        print tv_url
168
+                        data_list.append((tv_url,))                    
169
+                    except Exception, e:
170
+                        print '没有'
171
+                        continue
172
+                    time.sleep(1)
173
+                Mysql.insertMany(sql_insert, data_list, conn)
174
+                try:
175
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
176
+                except:
177
+                    is_next = False
178
+                    continue
179
+                try:
180
+                    next_page_text = next_page.find_element_by_xpath('.').text
181
+                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
182
+                    if next_page_url == 'javascript:;':
183
+                        is_next = False
184
+                        continue
185
+                    if next_page_text == '下一页':
186
+                        next_page.click()
187
+                    else:
188
+                        is_next = False;
189
+                except:
190
+                    is_next = False;
191
+                time.sleep(10)
192
+        driver.quit()
193
+
194
+    def get_tengxun_detail_url(self):
195
+        # 打开Firefox浏览器
196
+        driver = webdriver.Firefox()
197
+        driver.set_page_load_timeout(20)
198
+
199
+        # 数据库连接
200
+        conn = Mysql.createScrapyConn()
201
+
202
+        sql = """
203
+            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
204
+        """
205
+        rows = Mysql.getAll(sql, conn=conn)
206
+        for row in rows:
207
+            _id = row['id']
208
+            url = row['url']
209
+
210
+            # 打开主页
211
+            try:
212
+                driver.get(url)
213
+            except:
214
+                driver.execute_script('window.stop()')
215
+            if re.match(r'(.*)detail(.*)', driver.current_url):
216
+                print driver.current_url
217
+                sql = """
218
+                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
219
+                """
220
+                sql = sql % (driver.current_url, _id)
221
+                Mysql.update(sql, conn=conn)
222
+                continue
223
+            try:
224
+                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
225
+                print a_list
226
+                for a in a_list:
227
+                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
228
+                    if re.match(r'(.*)detail(.*)', detail_href):
229
+                        print detail_href
230
+                        sql = """
231
+                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
232
+                        """
233
+                        sql = sql % (detail_href, _id)
234
+                        Mysql.update(sql, conn=conn)
235
+                        break
236
+            except Exception, e:
237
+                print e
238
+            time.sleep(random.uniform(0, 3))
239
+            
240
+        driver.quit()
241
+
242
+    # 爬取具体页面
243
+    def get_tengxun_detail(self):
244
+        driver = webdriver.Firefox()
245
+        driver.set_page_load_timeout(10)
246
+
247
+        # 数据库连接
248
+        conn = Mysql.createScrapyConn()
249
+        # 获取所有需要爬取的电视剧
250
+        sql = """
251
+            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
252
+        """
253
+        rows = Mysql.getAll(sql, conn=conn)
254
+        for row in rows:
255
+            url = row['url']
256
+            detail_url = row['detail_url']
257
+            try:
258
+                driver.get(detail_url)
259
+            except:
260
+                driver.execute_script('window.stop()')
261
+            
262
+            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
263
+            # 详情html内容
264
+            detail_info_html = detail_info.get_attribute('innerHTML')
265
+            # 详情文本内容
266
+            detail_info_text = detail_info.find_element_by_xpath('.').text
267
+            # 电视剧名称
268
+            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text
269
+            sql = """
270
+                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values ('%s', '%s', '%s', '%s', '%s')
271
+            """
272
+            sql = sql % (tv_name, detail_info_text, detail_info_html, url, detail_url)
273
+            Mysql.insertOne(sql, conn=conn)
274
+        driver.quit()
275
+
276
+
277
+if __name__ == '__main__':
278
+    if len(sys.argv) != 2:
279
+        print '没有输入参数,退出'
280
+        sys.exit(0)
281
+    print 'method name is ' + sys.argv[1]
282
+    obj = DSJ_All()
283
+    try:
284
+        getattr(obj, sys.argv[1])()
285
+    except Exception, e:
286
+        print e
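
Every method in the class above repeats the same two moves: a driver.get() wrapped so a page-load timeout is downgraded to window.stop(), and a pagination loop that clicks 下一页 until the link disappears. A skeleton of just those two pieces, assuming selenium with a local Firefox/geckodriver and using the same Selenium 3 find_elements_by_xpath API as the script (safe_get and iterate_pages are illustrative names):

# -*- coding: utf-8 -*-
import time
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def safe_get(driver, url):
    """Open url but tolerate slow pages: stop loading instead of raising."""
    try:
        driver.get(url)
    except TimeoutException:
        driver.execute_script("window.stop()")

def iterate_pages(driver, start_url, handle_page, pause=10):
    """Run handle_page(driver) on every result page, following 下一页 links."""
    safe_get(driver, start_url)
    while True:
        handle_page(driver)
        try:
            next_link = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
        except (IndexError, NoSuchElementException):
            break
        if next_link.text != u"下一页":
            break
        next_link.click()
        time.sleep(pause)           # same throttle the script uses between pages

Typical wiring: driver = webdriver.Firefox(); driver.set_page_load_timeout(20); iterate_pages(driver, start_url, collect_urls). On Selenium 4 the locator call becomes driver.find_elements(By.XPATH, ...).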

+ 291 - 0
task_scrapy/i_t_dsj_all_without_browser.py

@@ -0,0 +1,291 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_All(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        
25
+        # 需要爬取的列表页面
26
+        start_urls = [
27
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
28
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
29
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
30
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
31
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
32
+            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
33
+            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
34
+            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
35
+        ]
36
+        # 打开Firefox浏览器
37
+        # driver = webdriver.Firefox()
38
+        driver = webdriver.PhantomJS()
39
+        driver.set_page_load_timeout(20)
40
+
41
+        # 数据库连接
42
+        conn = Mysql.createScrapyConn()
43
+        for url in start_urls:
44
+            # 打开主页
45
+            try:
46
+                driver.get(url)
47
+            except:
48
+                driver.execute_script('window.stop()')
49
+            
50
+
51
+            is_next = True
52
+            while is_next:
53
+                
54
+                try:
55
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
56
+                except:
57
+                    continue
58
+                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
59
+                sql_insert = """
60
+                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
61
+                """
62
+                data_list = []
63
+                for li in lis:
64
+                    try:
65
+                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
66
+                        print tv_url
67
+                        data_list.append((tv_url,))                    
68
+                    except Exception, e:
69
+                        print '没有'
70
+                        continue
71
+                    time.sleep(random.uniform(0, 2))
72
+                Mysql.insertMany(sql_insert, data_list, conn)
73
+                try:
74
+                    next_page_text = next_page.find_element_by_xpath('.').text
75
+                    if next_page_text == '下一页':
76
+                        next_page.click()
77
+                    else:
78
+                        is_next = False;
79
+                except:
80
+                    is_next = False;
81
+                time.sleep(10)
82
+        driver.quit()
83
+
84
+    # 爬取具体页面
85
+    def get_iqiyi_detail(self):
86
+        # driver = webdriver.Firefox()
87
+        driver = webdriver.PhantomJS()
88
+        driver.set_page_load_timeout(10)
89
+
90
+        # 数据库连接
91
+        conn = Mysql.createScrapyConn()
92
+
93
+        sql = """
94
+            select max(id) from scrapy.iqiyi_dianshiju_detail
95
+        """
96
+        max_id = Mysql.getOne(sql, conn=conn)
97
+        max_id = max_id[0]
98
+        if max_id is None:
99
+            max_id = 0
100
+        # 获取所有url
101
+        sql = """
102
+            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
103
+        """
104
+        sql = sql % (max_id)
105
+        rows = Mysql.getAll(sql, conn=conn)
106
+        for row in rows:
107
+            _id = row['id']
108
+            url = row['url']
109
+            print url
110
+            try:
111
+                driver.get(url)
112
+            except:
113
+                driver.execute_script('window.stop()')
114
+            
115
+            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
116
+            # 详情html内容
117
+            detail_info_html = detail_info.get_attribute('innerHTML')
118
+            # 详情文本内容
119
+            detail_info_text = detail_info.find_element_by_xpath('.').text
120
+            # 电视剧名称
121
+            tv_name = detail_info.find_element_by_xpath('h1/a').text
122
+
123
+            #存入数据库
124
+            sql = """
125
+                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
126
+            """
127
+            value = (_id, tv_name, detail_info_text, detail_info_html, url)
128
+            Mysql.insertOne(sql, value=value, conn=conn)
129
+            time.sleep(random.uniform(1, 5))
130
+        driver.quit()
131
+
132
+    # 爬取电视剧链接地址
133
+    def get_tengxun_url(self):
134
+        start_urls = [
135
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
136
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
137
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
138
+            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
139
+            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
140
+            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
141
+            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
142
+            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
143
+            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
144
+        ]
145
+        # 打开Firefox浏览器
146
+        # driver = webdriver.Firefox()
147
+        driver = webdriver.PhantomJS()
148
+        driver.set_page_load_timeout(20)
149
+
150
+        # 数据库连接
151
+        conn = Mysql.createScrapyConn()
152
+        for url in start_urls:
153
+            # 打开主页
154
+            try:
155
+                driver.get(url)
156
+            except:
157
+                driver.execute_script('window.stop()')
158
+            
159
+            is_next = True
160
+            while is_next:
161
+                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
162
+                print lis
163
+                sql_insert = """
164
+                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
165
+                """
166
+                data_list = []
167
+                for li in lis:
168
+                    try:
169
+                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
170
+                        print tv_url
171
+                        data_list.append((tv_url,))                    
172
+                    except Exception, e:
173
+                        print '没有'
174
+                        continue
175
+                    time.sleep(1)
176
+                Mysql.insertMany(sql_insert, data_list, conn)
177
+                try:
178
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
179
+                except:
180
+                    is_next = False
181
+                    continue
182
+                try:
183
+                    next_page_text = next_page.find_element_by_xpath('.').text
184
+                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
185
+                    if next_page_url == 'javascript:;':
186
+                        is_next = False
187
+                        continue
188
+                    if next_page_text == '下一页':
189
+                        next_page.click()
190
+                    else:
191
+                        is_next = False;
192
+                except:
193
+                    is_next = False;
194
+                time.sleep(10)
195
+        driver.quit()
196
+
197
+    def get_tengxun_detail_url(self):
198
+        # 打开Firefox浏览器
199
+        # driver = webdriver.Firefox()
200
+        driver = webdriver.PhantomJS()
201
+        driver.set_page_load_timeout(20)
202
+
203
+        # 数据库连接
204
+        conn = Mysql.createScrapyConn()
205
+
206
+        sql = """
207
+            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
208
+        """
209
+        rows = Mysql.getAll(sql, conn=conn)
210
+        for row in rows:
211
+            _id = row['id']
212
+            url = row['url']
213
+
214
+            # 打开主页
215
+            try:
216
+                driver.get(url)
217
+            except:
218
+                driver.execute_script('window.stop()')
219
+            if re.match(r'(.*)detail(.*)', driver.current_url):
220
+                print driver.current_url
221
+                sql = """
222
+                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
223
+                """
224
+                sql = sql % (driver.current_url, _id)
225
+                Mysql.update(sql, conn=conn)
226
+                continue
227
+            try:
228
+                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
229
+                print a_list
230
+                for a in a_list:
231
+                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
232
+                    if re.match(r'(.*)detail(.*)', detail_href):
233
+                        print detail_href
234
+                        sql = """
235
+                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
236
+                        """
237
+                        sql = sql % (detail_href, _id)
238
+                        Mysql.update(sql, conn=conn)
239
+                        break
240
+            except Exception, e:
241
+                print e
242
+            time.sleep(random.uniform(0, 3))
243
+            
244
+        driver.quit()
245
+
246
+    # 爬取具体页面
247
+    def get_tengxun_detail(self):
248
+        # driver = webdriver.Firefox()
249
+        driver = webdriver.PhantomJS()
250
+        driver.set_page_load_timeout(10)
251
+
252
+        # 数据库连接
253
+        conn = Mysql.createScrapyConn()
254
+        # 获取所有需要爬取的电视剧
255
+        sql = """
256
+            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
257
+        """
258
+        rows = Mysql.getAll(sql, conn=conn)
259
+        for row in rows:
260
+            url = row['url']
261
+            detail_url = row['detail_url']
262
+            try:
263
+                driver.get(detail_url)
264
+            except:
265
+                driver.execute_script('window.stop()')
266
+            
267
+            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
268
+            # 详情html内容
269
+            detail_info_html = detail_info.get_attribute('innerHTML')
270
+            # 详情文本内容
271
+            detail_info_text = detail_info.find_element_by_xpath('.').text
272
+            # 电视剧名称
273
+            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text
274
+            sql = """
275
+                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values ('%s', '%s', '%s', '%s', '%s')
276
+            """
277
+            sql = sql % (tv_name, detail_info_text, detail_info_html, url, detail_url)
278
+            Mysql.insertOne(sql, conn=conn)
279
+        driver.quit()
280
+
281
+
282
+if __name__ == '__main__':
283
+    if len(sys.argv) != 2:
284
+        print '没有输入参数,退出'
285
+        sys.exit(0)
286
+    print 'method name is ' + sys.argv[1]
287
+    obj = DSJ_All()
288
+    try:
289
+        getattr(obj, sys.argv[1])()
290
+    except Exception, e:
291
+        print e
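+
+# Usage sketch (method names taken from this class): the __main__ block dispatches on
+# sys.argv[1] via getattr, so each crawling step is invoked by name from the shell, e.g.
+#
+#   python i_t_dsj_all_without_browser.py get_tengxun_detail_url   # resolve the detail-page URLs first
+#   python i_t_dsj_all_without_browser.py get_tengxun_detail       # then crawl the detail pages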

+ 198 - 0
task_scrapy/i_t_dsj_categories.py

@@ -0,0 +1,198 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_Categories(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        # 打开Firefox浏览器
25
+        driver = webdriver.Firefox()
26
+        driver.set_page_load_timeout(10)
27
+
28
+        # 数据库连接
29
+        conn = Mysql.createScrapyConn()
30
+        # 获取所有需要爬取的电视剧
31
+        sql = """
32
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
33
+        """
34
+        # rows = conn.getAll(sql)
35
+        rows = Mysql.getAll(sql, conn=conn)
36
+        for row in rows:
37
+            _id = row['id']
38
+            tv_name = row['tv_name']
39
+            print tv_name
40
+            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
41
+            # 打开主页
42
+            try:
43
+                driver.get(start_url)
44
+            except:
45
+                driver.execute_script('window.stop()')
46
+            
47
+            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
48
+            for li in lis:
49
+                try:
50
+                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
51
+                    if '1' == first_num.strip():
52
+                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
53
+                        print href
54
+                        sql = """
55
+                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
56
+                        """
57
+                        sql = sql % (href, _id)
58
+                        # conn.update(sql)
59
+                        Mysql.update(sql, conn=conn)
60
+                        
61
+                        break
62
+                except Exception, e:
63
+                    print '没有'
64
+                    continue
65
+        driver.quit()
66
+
67
+    # 爬取具体页面
68
+    def get_iqiyi_detail(self):
69
+        driver = webdriver.Firefox()
70
+        driver.set_page_load_timeout(10)
71
+
72
+        # 数据库连接
73
+        conn = Mysql.createScrapyConn()
74
+        # 获取所有需要爬取的电视剧
75
+        sql = """
76
+            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
77
+        """
78
+        # rows = conn.getAll(sql)
79
+        rows = Mysql.getAll(sql, conn=conn)
80
+        for row in rows:
81
+            _id = row['id']
82
+            tv_name = row['tv_name']
83
+            url = row['iqiyi_url']
84
+            print url
85
+            try:
86
+                driver.get(url)
87
+            except:
88
+                driver.execute_script('window.stop()')
89
+            
90
+            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
91
+            cats_set = set()
92
+            for cat in cats:
93
+                cats_set.add(cat.find_element_by_xpath('.').text.strip())
94
+
95
+            #存入数据库
96
+            sql = """
97
+                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
98
+            """
99
+            sql = sql % (' '.join(cats_set), _id)
100
+            # conn.update(sql)
101
+            Mysql.update(sql, conn=conn)
102
+        driver.quit()
103
+
104
+
105
+    # 爬取电视剧链接地址
106
+    def get_tengxun_url(self):
107
+        # 打开Firefox浏览器
108
+        driver = webdriver.Firefox()
109
+        driver.set_page_load_timeout(10)
110
+
111
+        # 数据库连接
112
+        conn = Mysql.createScrapyConn()
113
+        # 获取所有需要爬取的电视剧
114
+        sql = """
115
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
116
+        """
117
+        # rows = conn.getAll(sql)
118
+        rows = Mysql.getAll(sql, conn=conn)
119
+        for row in rows:
120
+            _id = row['id']
121
+            tv_name = row['tv_name']
122
+            print tv_name
123
+            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
124
+            # 打开主页
125
+            try:
126
+                driver.get(start_url)
127
+            except:
128
+                driver.execute_script('window.stop()')
129
+            
130
+            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
131
+            for div in divs:
132
+                try:
133
+                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
134
+                    print href
135
+                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
136
+                    if matchObj:
137
+                        sql = """
138
+                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
139
+                        """
140
+                        # sql = sql % (href, _id)
141
+                        value = (href, _id)
142
+                        # conn.update(sql)
143
+                        Mysql.update(sql, param=value, conn=conn)
144
+                        break
145
+                except Exception, e:
146
+                    print '没有'
147
+                    print e
148
+                    continue
149
+        driver.quit()
150
+
151
+    # 爬取具体页面
152
+    def get_tengxun_detail(self):
153
+        driver = webdriver.Firefox()
154
+        driver.set_page_load_timeout(10)
155
+
156
+        # 数据库连接
157
+        conn = Mysql.createScrapyConn()
158
+        # 获取所有需要爬取的电视剧
159
+        sql = """
160
+            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
161
+        """
162
+        # rows = Mysql.getAll(sql)
163
+        rows = Mysql.getAll(sql, conn=conn)
164
+        for row in rows:
165
+            _id = row['id']
166
+            tv_name = row['tv_name']
167
+            tengxun_url = row['tengxun_url']
168
+            print tengxun_url
169
+            # 打开主页
170
+            try:
171
+                driver.get(tengxun_url)
172
+            except:
173
+                driver.execute_script('window.stop()')
174
+            
175
+            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
176
+            cats_set = set()
177
+            for cat in cats:
178
+                cat_name = cat.find_element_by_xpath('.').text
179
+                cats_set.add(cat_name)
180
+            #存入数据库
181
+            sql = """
182
+                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
183
+            """
184
+            sql = sql % (' '.join(cats_set), _id)
185
+            # conn.update(sql)
186
+            Mysql.update(sql, conn=conn)
187
+        driver.quit()
188
+
189
+if __name__ == '__main__':
190
+    if len(sys.argv) != 2:
191
+        print '没有输入参数,退出'
192
+        sys.exit(0)
193
+    print 'method name is ' + sys.argv[1]
194
+    obj = DSJ_Categories()
195
+    try:
196
+        getattr(obj, sys.argv[1])()
197
+    except Exception, e:
198
+        print e
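+
+# A minimal sketch of the parameterized form, assuming the param= keyword already used
+# above for the tengxun_url update accepts a value tuple for this statement as well; it
+# avoids the '%s'-interpolation breaking on quotes inside the joined category names:
+#
+#   sql = """
+#       update scrapy.tv_category_scrapy set iqiyi_types = %s where id = %s
+#   """
+#   Mysql.update(sql, param=(' '.join(cats_set), _id), conn=conn)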

+ 203 - 0
task_scrapy/i_t_dsj_categories_without_browser.py

@@ -0,0 +1,203 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+from urllib import quote
16
+
17
+from fty_util.common import Mysql
18
+
19
+reload(sys)
20
+sys.setdefaultencoding('utf8')
21
+
22
+class DSJ_Categories(object):
23
+    # 爬取电视剧链接地址
24
+    def get_iqiyi_url(self):
25
+        # 打开Firefox浏览器
26
+        # driver = webdriver.Firefox()
27
+        driver = webdriver.PhantomJS()
28
+        driver.set_page_load_timeout(10)
29
+
30
+        # 数据库连接
31
+        conn = Mysql.createScrapyConn()
32
+        # 获取所有需要爬取的电视剧
33
+        # sql = """
34
+        #     select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
35
+        # """
36
+        sql = """
37
+            select id, tv_name from scrapy.tv_category_scrapy where id > 5598 order by id asc
38
+        """
39
+        # rows = conn.getAll(sql)
40
+        rows = Mysql.getAll(sql, conn=conn)
41
+        for row in rows:
42
+            _id = row['id']
43
+            tv_name = row['tv_name']
44
+            print tv_name
45
+            start_url = "http://so.iqiyi.com/so/q_" + quote(str(tv_name)) + "?source=input"
46
+            # 打开主页
47
+            try:
48
+                driver.get(start_url)
49
+            except:
50
+                driver.execute_script('window.stop()')
51
+            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
52
+            for li in lis:
53
+                try:
54
+                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
55
+                    if '1' == first_num.strip():
56
+                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
57
+                        print href
58
+                        sql = """
59
+                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
60
+                        """
61
+                        sql = sql % (href, _id)
62
+                        # conn.update(sql)
63
+                        Mysql.update(sql, conn=conn)
64
+                        
65
+                        break
66
+                except Exception, e:
67
+                    print '没有'
68
+                    continue
69
+            break
70
+        driver.quit()
71
+
72
+    # 爬取具体页面
73
+    def get_iqiyi_detail(self):
74
+        driver = webdriver.PhantomJS()
75
+        driver.set_page_load_timeout(10)
76
+
77
+        # 数据库连接
78
+        conn = Mysql.createScrapyConn()
79
+        # 获取所有需要爬取的电视剧
80
+        sql = """
81
+            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
82
+        """
83
+        # rows = conn.getAll(sql)
84
+        rows = Mysql.getAll(sql, conn=conn)
85
+        for row in rows:
86
+            _id = row['id']
87
+            tv_name = row['tv_name']
88
+            url = row['iqiyi_url']
89
+            print url
90
+            try:
91
+                driver.get(url)
92
+            except:
93
+                driver.execute_script('window.stop()')
94
+            
95
+            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
96
+            cats_set = set()
97
+            for cat in cats:
98
+                cats_set.add(cat.find_element_by_xpath('.').text.strip())
99
+
100
+            #存入数据库
101
+            sql = """
102
+                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
103
+            """
104
+            sql = sql % (' '.join(cats_set), _id)
105
+            # conn.update(sql)
106
+            Mysql.update(sql, conn=conn)
107
+        driver.quit()
108
+
109
+
110
+    # 爬取电视剧链接地址
111
+    def get_tengxun_url(self):
112
+        # 打开Firefox浏览器
113
+        driver = webdriver.PhantomJS()
114
+        driver.set_page_load_timeout(10)
115
+
116
+        # 数据库连接
117
+        conn = Mysql.createScrapyConn()
118
+        # 获取所有需要爬取的电视剧
119
+        sql = """
120
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
121
+        """
122
+        # rows = conn.getAll(sql)
123
+        rows = Mysql.getAll(sql, conn=conn)
124
+        for row in rows:
125
+            _id = row['id']
126
+            tv_name = row['tv_name']
127
+            print tv_name
128
+            start_url = "http://v.qq.com/x/search/?q=" + quote(str(tv_name)) + "&stag=0"
129
+            # 打开主页
130
+            try:
131
+                driver.get(start_url)
132
+            except:
133
+                driver.execute_script('window.stop()')
134
+            
135
+            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
136
+            for div in divs:
137
+                try:
138
+                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
139
+                    print href
140
+                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
141
+                    if matchObj:
142
+                        sql = """
143
+                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
144
+                        """
145
+                        # sql = sql % (href, _id)
146
+                        value = (href, _id)
147
+                        # conn.update(sql)
148
+                        Mysql.update(sql, param=value, conn=conn)
149
+                        break
150
+                except Exception, e:
151
+                    print '没有'
152
+                    print e
153
+                    continue
154
+        driver.quit()
155
+
156
+    # 爬取具体页面
157
+    def get_tengxun_detail(self):
158
+        driver = webdriver.PhantomJS()
159
+        driver.set_page_load_timeout(10)
160
+
161
+        # 数据库连接
162
+        conn = Mysql.createScrapyConn()
163
+        # 获取所有需要爬取的电视剧
164
+        sql = """
165
+            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
166
+        """
167
+        # rows = Mysql.getAll(sql)
168
+        rows = Mysql.getAll(sql, conn=conn)
169
+        for row in rows:
170
+            _id = row['id']
171
+            tv_name = row['tv_name']
172
+            tengxun_url = row['tengxun_url']
173
+            print tengxun_url
174
+            # 打开主页
175
+            try:
176
+                driver.get(tengxun_url)
177
+            except:
178
+                driver.execute_script('window.stop()')
179
+            
180
+            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
181
+            cats_set = set()
182
+            for cat in cats:
183
+                cat_name = cat.find_element_by_xpath('.').text
184
+                cats_set.add(cat_name)
185
+            #存入数据库
186
+            sql = """
187
+                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
188
+            """
189
+            sql = sql % (' '.join(cats_set), _id)
190
+            # conn.update(sql)
191
+            Mysql.update(sql, conn=conn)
192
+        driver.quit()
193
+
194
+if __name__ == '__main__':
195
+    if len(sys.argv) != 2:
196
+        print '没有输入参数,退出'
197
+        sys.exit(0)
198
+    print 'method name is ' + sys.argv[1]
199
+    obj = DSJ_Categories()
200
+    try:
201
+        getattr(obj, sys.argv[1])()
202
+    except Exception, e:
203
+        print e

+ 100 - 0
task_scrapy/scrapy_all.py

@@ -0,0 +1,100 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def parse_playtimes():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name, url, playtimes from scrapy.wangju_all_url order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    for row in rows:
26
+        _id = row['id']
27
+        tv_name = row['tv_name']
28
+        url = row['url']
29
+        playtimes = row['playtimes']
30
+
31
+        if playtimes is not None and len(playtimes.split('*')) == 2:
32
+            first_num, second_num = playtimes.split('*')
33
+            first_num = float(first_num)
34
+            second_num = int(second_num)
35
+
36
+            playtimes_new = first_num * second_num
37
+            sql = """
38
+                update scrapy.wangju_all_url set playtimes = '%s' where url = '%s'
39
+            """
40
+            sql = sql % (str(int(playtimes_new)), url)
41
+            Mysql.execute(sql, conn=conn)
42
+            
43
+def update_fields():
44
+    conn = Mysql.createOfflineConn()
45
+
46
+    sql = """
47
+        select id, tv_name, score, playtimes, source from scrapy.wangju_all_url order by id asc
48
+    """
49
+
50
+    rows = Mysql.getAll(sql, conn=conn)
51
+
52
+    for row in rows:
53
+        _id = row['id']
54
+        tv_name = row['tv_name']
55
+        score = row['score']
56
+        playtimes = row['playtimes']
57
+
58
+        source = row['source']
+        sql = None
59
+        if 'pptv' == source:
60
+            sql = """
61
+                update scrapy.wangju_url set pptv_score = '%s', pptv_playtimes = '%s' where id = %s
62
+            """
63
+
64
+        if 'youku' == source:
65
+            sql = """
66
+                update scrapy.wangju_url set youku_score = '%s', youku_playtimes = '%s' where id = %s
67
+            """
68
+
69
+        if 'sohu' == source:
70
+            sql = """
71
+                update scrapy.wangju_url set sohu_score = '%s', sohu_playtimes = '%s' where id = %s
72
+            """
73
+
74
+        if 'leshi' == source:
75
+            sql = """
76
+                update scrapy.wangju_url set leshi_score = '%s', leshi_playtimes = '%s' where id = %s
77
+            """
78
+
79
+        if 'huashutv' == source:
80
+            sql = """
81
+                update scrapy.wangju_url set huashutv_score = '%s', huashutv_playtimes = '%s' where id = %s
82
+            """
83
+
84
+        if 'iqiyi' == source:
85
+            sql = """
86
+                update scrapy.wangju_url set iqiyi_score = '%s', iqiyi_playtimes = '%s' where id = %s
87
+            """
88
+
89
+        if 'tengxun' == source:
90
+            sql = """
91
+                update scrapy.wangju_url set tengxun_score = '%s', tengxun_playtimes = '%s' where id = %s
92
+            """
93
+        # skip rows whose source has no matching update statement
+        if sql is None:
+            continue
+        sql = sql % (score, playtimes, _id)
94
+        Mysql.execute(sql, conn=conn)
95
+        
96
+
97
+        
98
+if __name__ == '__main__':
99
+    # parse_playtimes()
100
+    update_fields()
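+
+# parse_playtimes() expects playtimes stored as '<number>*<multiplier>'; for a
+# hypothetical stored value '1.2*10000' it computes int(1.2 * 10000) = 12000 and
+# writes the string '12000' back to scrapy.wangju_all_url for that url.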

+ 143 - 0
task_scrapy/scrapy_gongzhonghao_count.py

@@ -0,0 +1,143 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+import collections
8
+
9
+from selenium import webdriver
10
+from urllib import quote
11
+
12
+from fty_util.common import Mysql
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+def scrapy_website():
18
+    conn = Mysql.createOfflineConn()
19
+
20
+    # 将网站url和名称 放入有序字典中
21
+    websites_dict = collections.OrderedDict()
22
+    sql = """
23
+        select name, account from odl.basic_weixin_subscribe where is_delete != 1 order by id asc
24
+    """
25
+    websites = Mysql.getAll(sql, conn=conn)
26
+    for website in websites:
27
+        name = website['name']
28
+        account = website['account']
29
+        websites_dict[account] = name
30
+    driver = webdriver.Firefox()
31
+    driver.set_page_load_timeout(10)
32
+
33
+    start_url = 'http://weixin.sogou.com/'
34
+
35
+    sql = """
36
+        select tv_id, tv_name from odl.ad_tv_lib where tv_id order by id asc
37
+    """
38
+
39
+    tvs = Mysql.getAll(sql, conn=conn)
40
+
41
+    for tv in tvs:
42
+        tv_id = tv['tv_id']
43
+        tv_name = tv['tv_name']
44
+        try:
45
+            driver.get(start_url)
46
+        except Exception, e:
47
+            pass
48
+        try:
49
+            input_box = driver.find_element_by_id('upquery')
50
+            submit_button = driver.find_element_by_class_name('swz')
51
+        except Exception, e:
52
+            driver.refresh()
53
+        # 搜索条件
54
+        try:
55
+            input_box.clear()
56
+            input_box.send_keys(tv_name)
57
+            submit_button.click()
58
+        except Exception, e:
59
+            print '点击请求失败'
60
+
61
+        for account in websites_dict:
62
+            name = websites_dict.get(account)
63
+            input_box = None
64
+            submit_button = None
65
+
66
+            time.sleep(5)
67
+
68
+            js = 'document.getElementsByClassName("time-box float")[2].style.display="block"'
69
+            driver.execute_script(js)
70
+            js = 'document.getElementsByClassName("s-sea")[0].value = "' + account + '"'
71
+            driver.execute_script(js)
72
+            js = 'document.getElementById("search_enter").click()'
73
+            driver.execute_script(js)
74
+                # s_sea = driver.find_element_by_class_name('s-sea')
75
+                # search_enter = driver.find_element_by_id('search_enter')
76
+                # s_sea.clear()
77
+                # s_sea.send_keys(account)
78
+                # search_enter.click()
79
+            
80
+            time.sleep(10)
81
+            driver.execute_script('window.stop()')
82
+            # driver.refresh()
83
+            # 分页块
84
+            page = None
85
+            try:
86
+                page = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
87
+            except:
88
+                pass
89
+            count = 0
90
+            # 如果分页不存在,说明记录在十条以内或没有记录
91
+            if page is None or len(page) == 0:
92
+                try:
93
+                    divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
94
+                    if divs is not None and len(divs) > 0:
95
+                        count = len(divs)
96
+                except Exception, e:
97
+                    count = 0
98
+            #  如果分页存在,判断最后一页是不是10
99
+            else:
100
+                try:
101
+                    page_nums = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
102
+                    max_page_num = 1
103
+                    max_page_href= ''
104
+                    for page_num in page_nums:
105
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
106
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
107
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
108
+
109
+                        # 如果只是数字
110
+                        if page_num_text.isdigit():
111
+                            page_num_text = int(page_num_text)
112
+                            if page_num_text > max_page_num:
113
+                                max_page_num = page_num_text
114
+                                max_page_href = href
115
+                        # 如果是下一页字符串
116
+                        elif page_num_text == '下一页':
117
+                            break
118
+
119
+                    try:
120
+                        driver.get(max_page_href)
121
+                    except Exception, e:
122
+                        pass
123
+                    try:
124
+                        divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
125
+                        if divs is not None and len(divs) > 0:
126
+                            count = len(divs)
127
+                    except Exception, e:
128
+                        count = 0
129
+                    
130
+                    count = (max_page_num - 1) * 10 + count
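+                    # each full result page holds 10 items, so e.g. max_page_num = 5
+                    # with 7 items left on the last page gives (5 - 1) * 10 + 7 = 47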
131
+                except:
132
+                    continue
133
+
134
+            if count != 0:
135
+                sql = """
136
+                    insert into scrapy.scrapy_subscribe_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
137
+                """
138
+                value = (tv_id, tv_name, 2, name, '', count)
139
+                Mysql.insertOne(sql, value=value, conn=conn)
140
+    driver.quit()
141
+
142
+if __name__ == '__main__':
143
+    scrapy_website()

+ 113 - 0
task_scrapy/scrapy_huashutv.py

@@ -0,0 +1,113 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_huashutv is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://www.wasu.cn/Search/show/k/' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@id="agg_list"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div[1]/a[1]').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_huashutv = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_huashutv = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def scrapy_data():
64
+    conn = Mysql.createOfflineConn()
65
+
66
+    sql = """
67
+        select id, tv_name, url_huashutv from scrapy.wangju_url where url_huashutv is not null and url_huashutv != '' order by id asc
68
+    """
69
+
70
+    rows = Mysql.getAll(sql, conn=conn)
71
+
72
+    driver = webdriver.Firefox()
73
+    driver.set_page_load_timeout(10)
74
+
75
+    for row in rows:
76
+        _id = row['id']
77
+        tv_name = row['tv_name']
78
+        url_huashutv = row['url_huashutv']
79
+
80
+        urls = url_huashutv.split(',')
81
+        for url in urls:
82
+            if 'www.wasu.cn' not in url:
83
+                continue
84
+            try:
85
+                driver.get(url)
86
+            except Exception, e:
87
+                driver.execute_script('window.stop()')
88
+            
89
+            try:
90
+                href = driver.find_element_by_xpath('//div[@id="con_telelist_1"]/ul/li[1]/a').get_attribute('href')
91
+            except Exception, e:
92
+                href = None
93
+            
94
+            if href is not None and 'www.wasu.cn' in href:
95
+                print href
96
+                try:
97
+                    driver.get(href)
98
+                except Exception, e:
99
+                    driver.execute_script('window.stop()')
100
+                try:
101
+                    content = driver.find_element_by_xpath('//div[@id="play_vod_hits"]').get_attribute('textContent')
102
+                except Exception, e:
103
+                    continue
104
+                
105
+                sql = """
106
+                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
107
+                """
108
+                value = (_id, tv_name, url, '', content, 'huashutv')
109
+                Mysql.insertOne(sql, value=value, conn=conn)
110
+    driver.quit()
111
+if __name__ == '__main__':
112
+    scrapy_data()
113
+    # scrapy_url()
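+    # scrapy_url() must have been run at least once before scrapy_data(), since
+    # scrapy_data() only selects rows whose url_huashutv column scrapy_url() populated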

+ 294 - 0
task_scrapy/scrapy_iqiyi.py

@@ -0,0 +1,294 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+iQIYI crawling rules
+1. scrapy_url: crawl the search pages and collect the URLs of the TV-series pages that were found
+2. scrapy_data: open each detail page found by the search and crawl its description and the per-episode URLs (play counts are only shown on the episode pages)
+3. scrapy_play_page: open the playback page of episode 1 and crawl the play count
+4. todo: crawl every page daily
+
+The detail page reached from the search shows neither play counts nor comment counts, so every page has to be parsed one by one:
+search page --> search detail page --> playback page (episode 1 is enough) --> real detail page (crawl the play count; comment counts cannot be crawled yet)
+So it is enough to pick up the play count on the playback page.
26
+"""
27
+
28
+# 爬取搜索页面
29
+def scrapy_url():
30
+    conn = Mysql.createOfflineConn()
31
+    sql = """
32
+        select id, tv_name from scrapy.wangju_url order by id asc
33
+    """
34
+    rows = Mysql.getAll(sql, conn=conn)
35
+    for row in rows:
36
+        driver = webdriver.PhantomJS()
37
+        driver.set_page_load_timeout(10)
38
+        _id = row['id']
39
+        tv_name = row['tv_name']
40
+        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
41
+        try:
42
+            driver.get(url)
43
+        except Exception, e:
44
+            driver.execute_script('window.stop()')
45
+        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
46
+        for li in lis:
47
+            try:
48
+                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
49
+                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
50
+                if 'www.iqiyi.com/lib' in href:
51
+                    print href
52
+                    sql = """
53
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
54
+                    """
55
+                    value = (_id, tv_name, href, title, '', 'iqiyi')
56
+                    Mysql.insertOne(sql, value=value, conn=conn)
57
+                    time.sleep(1)
58
+            except Exception, e:
59
+                print e
60
+                continue
61
+        driver.quit()
62
+
63
+# 爬取搜索到的详情页面
64
+def scrapy_data():
65
+    
66
+    conn = Mysql.createOfflineConn()
67
+    # sql = """
68
+    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
69
+    # """
70
+    sql = """
71
+        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
72
+    """
73
+    rows = Mysql.getAll(sql, conn=conn)
74
+
75
+    for row in rows:
76
+        driver = webdriver.PhantomJS()
77
+        driver.set_page_load_timeout(10)
78
+        _id = row['id']
79
+        tv_name = row['tv_name']
80
+        url = row['url']
81
+        title = row['title']
82
+        
83
+        try:
84
+            driver.get(url)
85
+        except Exception, e:
86
+            driver.execute_script('window.stop()')
87
+        
88
+        # 爬取内容
89
+        try:
90
+            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
91
+        except Exception, e:
92
+            content = ''
93
+        
94
+        # 爬取分集
95
+        try:
96
+            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
97
+        except Exception, e:
98
+            # 如果没有隐藏的集数,则用显示的集数
99
+            try:
100
+                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
101
+            except Exception, e:
102
+                pagelist = None
103
+                pass
104
+
105
+        if pagelist is not None:
106
+            # 如果集数存在,则爬取每集url,用于爬取播放量和评论量
107
+            data_list = []
108
+            for page in pagelist:
109
+                num = page.find_element_by_xpath('./a').get_attribute('title')
110
+                num = num.replace(' ', '').replace('\n', '')
111
+                href = page.find_element_by_xpath('./a').get_attribute('href')
112
+                if 'www.iqiyi.com' in href:
113
+                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
114
+            # 插入分集数据
115
+            if data_list is not None and len(data_list) > 0:
116
+                sql = """
117
+                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
118
+                """
119
+                Mysql.insertMany(sql, data_list, conn)
120
+        
121
+        # 更新内容
122
+        sql = """
123
+            update scrapy.wangju_all_url set content = %s where url = %s
124
+        """
125
+        value = (content, url)
126
+        Mysql.execute(sql, param=value, conn=conn)
127
+        driver.quit()
128
+
129
+# 爬取播放页面
130
+def scrapy_play_page():
131
+    conn = Mysql.createOfflineConn()
132
+    sql = """
133
+        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
134
+    """
135
+    rows = Mysql.getAll(sql, conn=conn)
136
+
137
+    for row in rows:
138
+        driver = webdriver.Firefox()
139
+        driver.set_page_load_timeout(10)
140
+        _id = row['id']
141
+        tv_name = row['tv_name']
142
+        url = row['url']
143
+        if 'www.iqiyi.com' not in url:
144
+            driver.quit()
145
+            continue
146
+        else:
147
+            try:
148
+                driver.get(url)
149
+            except Exception, e:
150
+                print e
151
+                driver.execute_script('window.stop()')
152
+            try:
153
+                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
154
+            except Exception, e:
155
+                print e
156
+                count = 0
157
+            
158
+            print count
159
+
160
+            sql = """
161
+                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
162
+            """
163
+            sql = sql % (count, _id)
164
+            Mysql.execute(sql, conn=conn)
165
+
166
+        driver.quit()
167
+
168
+# 每天爬取播放页面(爱奇艺只有每集的评论数量,没有每集播放数量)
169
+def scrapy_play_page_everyday():
170
+    conn = Mysql.createOfflineConn()
171
+    sql = """
172
+        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
173
+    """
174
+    rows = Mysql.getAll(sql, conn=conn)
175
+
176
+    for row in rows:
177
+        driver = webdriver.Firefox()
178
+        driver.set_page_load_timeout(20)
179
+        _id = row['id']
180
+        tv_name = row['tv_name']
181
+        num = row['num']
182
+        url = row['url']
183
+        if 'www.iqiyi.com' not in url:
184
+            driver.quit()
185
+            sql = """
186
+                delete from scrapy.wangju_fenji_url where url = '%s'
187
+            """
188
+            sql = sql % (url,)
189
+            Mysql.execute(sql, conn=conn)
190
+            continue
191
+        else:
192
+            try:
193
+                driver.get(url)
194
+            except Exception, e:
195
+                print e
196
+                driver.execute_script('window.stop()')
197
+            try:
198
+                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
199
+            except Exception, e:
200
+                print e
201
+                commenttimes = ''
202
+            
203
+            print url
204
+            print commenttimes
205
+        
206
+        # sql = """
207
+        #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
208
+        # """
209
+        # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
210
+
211
+def parse_wangju_all_url_data():
212
+    conn = Mysql.createOfflineConn()
213
+
214
+    sql = """
215
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
216
+    """
217
+
218
+    rows = Mysql.getAll(sql, conn=conn)
219
+
220
+    driver = webdriver.Firefox()
221
+    driver.set_page_load_timeout(10)
222
+
223
+    driver2 = webdriver.Firefox()
224
+    driver2.set_page_load_timeout(10)
225
+    for row in rows:
226
+        _id = row['id']
227
+        tv_name = row['tv_name']
228
+        url = row['url']
229
+        
230
+        try:
231
+            driver.get(url)
232
+        except Exception, e:
233
+            print e
234
+            driver.execute_script('window.stop()')
235
+        try:
236
+            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
237
+            score = score.replace(' ', '').replace('\n', '')
238
+        except:
239
+            score = ''
240
+
241
+        try:
242
+            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
243
+        except Exception, e:
244
+            pass
245
+            pagelist = None
246
+        try:
247
+            if pagelist is not None:
248
+                page_dict = dict()
249
+                for page in pagelist:
250
+                    try:
251
+                        episode = page.find_element_by_xpath('./a').get_attribute('href')
252
+                        episode_text = page.find_element_by_xpath('./a').text
253
+                        page_dict[episode_text] = episode
254
+                    except:
255
+                        continue
256
+                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
257
+                    try:
258
+                        driver2.get(page_dict.get('1'))
259
+                        time.sleep(5)
260
+                    except Exception, e:
261
+                        print e
262
+                        driver2.execute_script('window.stop()')
263
+                    try:
264
+                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
265
+                    except Exception, e:
266
+                        print e
267
+                        count = '0'
268
+                    print count
269
+                    sql = """
270
+                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
271
+                    """
272
+                    sql = sql % (score, count, url)
273
+                    Mysql.execute(sql, conn=conn)
274
+                else:
275
+                    sql = """
276
+                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
277
+                    """
278
+                    sql = sql % (url, 'iqiyi')
279
+                    Mysql.execute(sql, conn=conn)
280
+            else:
281
+                sql = """
282
+                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
283
+                """
284
+                sql = sql % (url, 'iqiyi')
285
+                Mysql.execute(sql, conn=conn)
286
+        except Exception, e:
287
+            continue
288
+
289
+if __name__ == '__main__':
290
+    # scrapy_url()
291
+    # scrapy_data()
292
+    # scrapy_play_page()
293
+    # scrapy_play_page_everyday()
294
+    parse_wangju_all_url_data()

+ 59 - 0
task_scrapy/scrapy_kankan.py

@@ -0,0 +1,59 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+from selenium import webdriver
8
+from urllib import quote
9
+from fty_util.common import Mysql
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+conn = Mysql.createOfflineConn()
15
+
16
+sql = """
17
+    select id, tv_name from scrapy.wangju_url where url_kankan is null order by id asc
18
+"""
19
+
20
+rows = Mysql.getAll(sql, conn=conn)
21
+
22
+driver = webdriver.Firefox()
23
+driver.set_page_load_timeout(10)
24
+for row in rows:
25
+    
26
+    _id = row['id']
27
+    tv_name = row['tv_name']
28
+
29
+    url = 'http://search.kankan.com/search.php?keyword=' + quote(str(tv_name))
30
+    need_blank = True
31
+    try:
32
+        driver.get(url)
33
+    except Exception, e:
34
+        driver.execute_script('window.stop()')
35
+
36
+    # 解析第一页
37
+    divs = driver.find_elements_by_xpath('//div[@class="searchmain"]/div')
38
+    for div in divs:
39
+        try:
40
+            title = div.find_element_by_xpath('//div[@class="reuslt_tt"]/h2/a').get_attribute('title')
41
+            href = div.find_element_by_xpath('./div/a').get_attribute('href')
42
+            _type = div.find_element_by_xpath('./div/div[2]').get_attribute('textContent')
43
+            sources = div.find_element_by_xpath('//ul[@class="sitelist"]').get_attribute('textContent')
44
+            if tv_name == title and u'电视剧' in _type and u'响巢看看' in sources:
45
+                sql = """
46
+                    update scrapy.wangju_url set url_kankan = '%s' where id = %s
47
+                """
48
+                sql = sql % (href, _id)
49
+                Mysql.execute(sql, conn=conn)
50
+                need_blank = False
51
+        except Exception, e:
52
+            continue
53
+    if need_blank:
54
+        sql = """
55
+            update scrapy.wangju_url set url_kankan = '%s' where id = %s
56
+        """
57
+        sql = sql % ('', _id)
58
+        Mysql.execute(sql, conn=conn)
59
+driver.quit()
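+
+# Note: unlike the sibling task_scrapy scripts, this one has no __main__ guard, so the
+# search crawl above runs as soon as the module is executed or imported.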

+ 186 - 0
task_scrapy/scrapy_leshi.py

@@ -0,0 +1,186 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+LeTV (le.com) crawling rules
18
+
19
+
20
+"""
21
+def scrapy_url():
22
+    conn = Mysql.createOfflineConn()
23
+
24
+    sql = """
25
+        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
26
+    """
27
+
28
+    rows = Mysql.getAll(sql, conn=conn)
29
+
30
+    driver = webdriver.Firefox()
31
+    driver.set_page_load_timeout(10)
32
+
33
+    for row in rows:
34
+        _id = row['id']
35
+        tv_name = row['tv_name']
36
+
37
+        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
+        need_blank = True
38
+
39
+        try:
40
+            driver.get(url)
41
+        except Exception, e:
42
+            driver.execute_script('window.stop()')
43
+
44
+        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
45
+        href_list = []
46
+        for div in divs:
47
+            try:
48
+                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
49
+                href_list.append(href)
50
+            except Exception, e:
51
+                pass
52
+        if len(href_list) > 0:
53
+            sql = """
54
+                update scrapy.wangju_url set url_leshi = '%s' where id = %s
55
+            """
56
+            sql = sql % (','.join(href_list), _id)
57
+            Mysql.execute(sql, conn=conn)
58
+            need_blank = False
59
+        if need_blank:
60
+            sql = """
61
+                update scrapy.wangju_url set url_leshi = '%s' where id = %s
62
+            """
63
+            sql = sql % ('', _id)
64
+            Mysql.execute(sql, conn=conn)
65
+    driver.quit()
66
+
67
+def scrapy_data():
68
+    conn = Mysql.createOfflineConn()
69
+
70
+    sql = """
71
+        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
72
+    """
73
+
74
+    rows = Mysql.getAll(sql, conn=conn)
75
+
76
+    driver = webdriver.Firefox()
77
+    driver.set_page_load_timeout(10)
78
+
79
+    for row in rows:
80
+        _id = row['id']
81
+        tv_name = row['tv_name']
82
+        url_leshi = row['url_leshi']
83
+
84
+        urls = url_leshi.split(',')
85
+        for url in urls:
86
+            if 'www.le.com' not in url:
87
+                continue
88
+            try:
89
+                driver.get(url)
90
+            except Exception, e:
91
+                driver.execute_script('window.stop()')
92
+            
93
+            try:
94
+                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
95
+            except Exception, e:
96
+                href = None
97
+            
98
+
99
+            if href is not None and 'www.le.com' in href:
100
+                print href
101
+                try:
102
+                    driver.get(href)
103
+                except Exception, e:
104
+                    driver.execute_script('window.stop()')
105
+                try:
106
+                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
107
+                except Exception, e:
108
+                    continue
109
+                
110
+                sql = """
111
+                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
112
+                """
113
+                value = (_id, tv_name, url, '', content, 'leshi')
114
+                Mysql.insertOne(sql, value=value, conn=conn)
115
+    driver.quit()
116
+
117
+def parse_wangju_all_url_title():
118
+    conn = Mysql.createOfflineConn()
119
+
120
+    sql = """
121
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
122
+    """
123
+
124
+    rows = Mysql.getAll(sql, conn=conn)
125
+
126
+    driver = webdriver.Firefox()
127
+    driver.set_page_load_timeout(10)
128
+    for row in rows:
129
+        _id = row['id']
130
+        tv_name = row['tv_name']
131
+        url = row['url']
132
+
133
+        try:
134
+            driver.get(url)
135
+        except Exception, e:
136
+            print e
137
+            driver.execute_script('window.stop()')
138
+
139
+        try:
140
+            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
141
+        except Exception, e:
142
+            title = ''
143
+
144
+        sql = """
145
+            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
146
+        """
147
+        sql = sql % (title, 'leshi', url)
148
+        Mysql.execute(sql, conn=conn)
149
+
150
+def parse_content():
151
+    conn = Mysql.createOfflineConn()
152
+
153
+    sql = """
154
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
155
+    """
156
+    rows = Mysql.getAll(sql, conn=conn)
157
+
158
+    for row in rows:
159
+        _id = row['id']
160
+        tv_name = row['tv_name']
161
+        url = row['url']
162
+        content = row['content']
163
+
164
+        import re
165
+        m = re.search(ur'([0-9]+[.]?)+', content)
166
+        score = '0'
167
+        if m is not None:
168
+            score = m.group(0)
169
+
170
+        play = '0'
171
+        m = re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content)
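+        # group(0) keeps the whole match including the label and unit (e.g. the string
+        # 播放数:1.2亿 for a hypothetical content value), and [(亿)(万)] is a character
+        # class matching any one of ( ) 亿 万, presumably intended as just 亿 or 万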
172
+        if m is not None:
173
+            play = m.group(0)
174
+
175
+        sql = """
176
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
177
+        """
178
+        sql = sql % (score, play, url)
179
+        Mysql.execute(sql, conn=conn)
180
+
181
+if __name__ == '__main__':
182
+    # scrapy_data()
183
+    # scrapy_url()
184
+    
185
+    # parse_wangju_all_url_title()
186
+    parse_content()

+ 146 - 0
task_scrapy/scrapy_pptv.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_pptv is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://search.pptv.com/s_video?kw=' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@id="search-result"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div[2]/dl/dd/p/a').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_pptv = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_pptv = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def parse_unique_url():
64
+    conn = Mysql.createOfflineConn()
65
+    sql = """
66
+        select id, tv_name, url_pptv from scrapy.wangju_url where url_pptv is not null and url_pptv != '' and pptv_finished is null order by id asc
67
+    """
68
+    rows = Mysql.getAll(sql, conn=conn)
69
+
70
+    driver = webdriver.Firefox()
71
+    driver.set_page_load_timeout(10)
72
+    for row in rows:
73
+        _id = row['id']
74
+        tv_name = row['tv_name']
75
+        url_pptv = row['url_pptv']
76
+
77
+        urls = url_pptv.split(',')
78
+        for url in urls:
79
+            try:
80
+                driver.get(url)
81
+            except Exception, e:
82
+                try:
83
+                    driver.execute_script('window.stop()')
84
+                except:
85
+                    continue
86
+            try:
87
+                nav_type = driver.find_element_by_xpath('//div[@class="module module-bread-nav cf"]/p/a').text
88
+                if nav_type != u'电视剧':
89
+                    continue
90
+                else:
91
+                    title = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[1]/h3').text
92
+                    content = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[2]').get_attribute('textContent')
93
+                    
94
+                    sql = """
95
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
96
+                    """
97
+                    value = (_id, tv_name, url, title, content, 'pptv')
98
+                    Mysql.insertOne(sql, value=value, conn=conn)
99
+            except Exception, e:
100
+                pass
101
+        sql = """
102
+            update scrapy.wangju_url set pptv_finished = '%s' where id = %s
103
+        """
104
+        sql = sql % ('1', _id)
105
+        Mysql.execute(sql, conn=conn)
106
+    
107
+    driver.quit()
108
+
109
+def scrapy_fenji():
110
+    pass
111
+
112
+def parse_content():
113
+    conn = Mysql.createOfflineConn()
114
+
115
+    sql = """
116
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'pptv' order by id asc
117
+    """
118
+    rows = Mysql.getAll(sql, conn=conn)
119
+
120
+    for row in rows:
121
+        _id = row['id']
122
+        tv_name = row['tv_name']
123
+        url = row['url']
124
+        content = row['content']
125
+
126
+        import re
127
+        m = re.search(ur'评分:\d+(.)\d+', content)
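+        # the unescaped (.) matches any single character here; it is presumably meant
+        # as a literal decimal point, e.g. matching a hypothetical 评分:8.5 substring
+        # (group(0) keeps the 评分: label as part of the stored score)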
128
+        score = '0'
129
+        if m is not None:
130
+            score = m.group(0)
131
+
132
+        play = '0'
133
+        m = re.search(ur'播放:\d+(.)\d+[(亿)(万)]', content)
134
+        if m is not None:
135
+            play = m.group(0)
136
+
137
+        sql = """
138
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'pptv'
139
+        """
140
+        sql = sql % (score, play, url)
141
+        Mysql.execute(sql, conn=conn)
142
+
143
+if __name__ == '__main__':
144
+    # scrapy_url()
145
+    # parse_unique_url()
146
+    parse_content()

+ 139 - 0
task_scrapy/scrapy_sohu.py

@@ -0,0 +1,139 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_sohu is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://so.tv.sohu.com/mts?box=1&wd=' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@class="wrap cfix"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div/div[2]/div[1]/h2/a').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_sohu = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_sohu = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def scrapy_data():
64
+    conn = Mysql.createOfflineConn()
65
+
66
+    sql = """
67
+        select id, tv_name, url_sohu from scrapy.wangju_url where url_sohu is not null and url_sohu != '' order by id asc
68
+    """
69
+
70
+    rows = Mysql.getAll(sql, conn=conn)
71
+
72
+    driver = webdriver.Firefox()
73
+    driver.set_page_load_timeout(10)
74
+
75
+    for row in rows:
76
+        _id = row['id']
77
+        tv_name = row['tv_name']
78
+        url_sohu = row['url_sohu']
79
+
80
+        urls = url_sohu.split(',')
81
+        for url in urls:
82
+            try:
83
+                driver.get(url)
84
+            except Exception, e:
85
+                driver.execute_script('window.stop()')
86
+            
87
+            try:
88
+                title = ''
89
+                content = driver.find_element_by_xpath('//div[@class="infoR r"]').get_attribute('textContent')
90
+            except Exception, e:
91
+                try:
92
+                    title = driver.find_element_by_xpath('//div[@class="drama-name area rel cfix "]').get_attribute('textContent')
93
+                    content = driver.find_element_by_xpath('//div[@class="drama-infoR"]').get_attribute('textContent')
94
+                except Exception, e:
95
+                    continue
96
+            
97
+            sql = """
98
+                insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
99
+            """
100
+            value = (_id, tv_name, url, title, content, 'sohu')
101
+            Mysql.insertOne(sql, value=value, conn=conn)
102
+
103
+    driver.quit()
104
+
105
+def parse_wangju_all_url_data():
106
+    conn = Mysql.createOfflineConn()
107
+
108
+    sql = """
109
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'sohu' order by id asc
110
+    """
111
+
112
+    rows = Mysql.getAll(sql, conn=conn)
113
+    for row in rows:
114
+        _id = row['id']
115
+        tv_name = row['tv_name']
116
+        url = row['url']
117
+        content = row['content']
118
+        
119
+        import re
120
+        m = re.search(ur'评分:\d+(.)\d+', content)
121
+        score = '0'
122
+        if m is not None:
123
+            score = m.group(0)
124
+        
125
+        play = '0'
126
+        m = re.search(ur'总播放:\d+(.)\d+[(亿)(万)]', content)
127
+        if m is not None:
128
+            play = m.group(0)
129
+        
130
+        sql = """
131
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'sohu'
132
+        """
133
+        sql = sql % (score, play, url)
134
+        Mysql.execute(sql, conn=conn)
135
+
136
+if __name__ == '__main__':
137
+    # scrapy_data()
138
+    # scrapy_url()
139
+    parse_wangju_all_url_data()
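
A quick standalone check of the two regexes used in parse_wangju_all_url_data above (Python 2, like the rest of these scripts; the sample content string is hypothetical, the real one comes from the Sohu detail page). Note that, as in the script, the stored value keeps the "评分:" / "总播放:" prefix:

# -*- coding: utf-8 -*-
import re

content = u'... 评分:8.2 ... 总播放:1.3亿 ...'          # hypothetical page text
m = re.search(ur'评分:\d+(.)\d+', content)
score = m.group(0) if m is not None else '0'             # -> u'评分:8.2'
m = re.search(ur'总播放:\d+(.)\d+[(亿)(万)]', content)
play = m.group(0) if m is not None else '0'              # -> u'总播放:1.3亿'
print score, play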

+ 231 - 0
task_scrapy/scrapy_tengxun.py

@@ -0,0 +1,231 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+"""
18
+Tencent Video (v.qq.com) scraping rules
19
+1. scrapy_url: crawl the search result page and keep the URL most likely to be the drama's detail page
20
+2. scrapy_data: open that detail page and crawl the score plus each episode's URL (play counts are only shown on the episode pages)
21
+3. todo: crawl every detail page
22
+
23
+The detail pages reached from search expose neither play counts nor comment counts, so the pages have to be parsed one by one:
24
+search page --> search detail page --> play page (only episode 1's play page is needed),
25
+and the play count is read from that play page. A condensed sketch of this flow follows this file's diff.
26
+"""
27
+
28
+def scrapy_url():
29
+    conn = Mysql.createOfflineConn()
30
+    sql = """
31
+        select id, tv_name from scrapy.wangju_url order by id asc
32
+    """
33
+    rows = Mysql.getAll(sql, conn=conn)
34
+    for row in rows:
35
+        driver = webdriver.PhantomJS()
36
+        driver.set_page_load_timeout(10)
37
+        _id = row['id']
38
+        tv_name = row['tv_name']
39
+        url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
40
+        try:
41
+            driver.get(url)
42
+        except Exception, e:
43
+            driver.execute_script('window.stop()')
44
+
45
+        divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
46
+        for div in divs:
47
+            try:
48
+                title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
49
+                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
50
+                if 'v.qq.com/detail' in href:
51
+                    print href
52
+                    sql = """
53
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
54
+                    """
55
+                    value = (_id, tv_name, href, title, '', 'tengxun')
56
+                    Mysql.insertOne(sql, value=value, conn=conn)
57
+                    time.sleep(1)
58
+            except Exception, e:
59
+                print e
60
+                continue
61
+        driver.quit()
62
+
63
+# Crawl the detail pages found by the search step
64
+def scrapy_data():
65
+    conn = Mysql.createOfflineConn()
66
+
67
+    # sql = """
68
+    #     select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
69
+    # """
70
+    sql = """
71
+        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
72
+    """
73
+    rows = Mysql.getAll(sql, conn=conn)
74
+    for row in rows:
75
+        driver = webdriver.PhantomJS()
76
+        driver.set_page_load_timeout(10)
77
+        _id = row['id']
78
+        tv_name = row['tv_name']
79
+        url = row['url']
80
+
81
+        try:
82
+            driver.get(url)
83
+        except Exception, e:
84
+            driver.execute_script('window.stop()')
85
+
86
+        # 爬取内容
87
+        try:
88
+            content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
89
+        except Exception, e:
90
+            content = ''
91
+
92
+        try:
93
+            pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
94
+            if pagelist is not None:
95
+                data_list = []
96
+                for page in pagelist:
97
+                    num = page.find_element_by_xpath('./a/span').text
98
+                    num = num.replace(' ', '').replace('\n', '')
99
+                    href = page.find_element_by_xpath('./a').get_attribute('href')
100
+                    if 'v.qq.com' in href:
101
+                        data_list.append((_id, tv_name, num, href, 'tengxun'))
102
+                # 插入分集数据
103
+                if data_list is not None and len(data_list) > 0:
104
+                    sql = """
105
+                        insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
106
+                    """
107
+                    Mysql.insertMany(sql, data_list, conn)
108
+        except Exception, e:
109
+            pass
110
+        
111
+        # 更新内容
112
+        sql = """
113
+            update scrapy.wangju_all_url set content = %s where url = %s
114
+        """
115
+        value = (content, url)
116
+        Mysql.execute(sql, param=value, conn=conn)
117
+        driver.quit()
118
+
119
+# Crawl the episode-1 play page for the play count
120
+def scrapy_play_page():
121
+    conn = Mysql.createOfflineConn()
122
+    sql = """
123
+        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
124
+    """
125
+    rows = Mysql.getAll(sql, conn=conn)
126
+    for row in rows:
127
+        driver = webdriver.Firefox()
128
+        driver.set_page_load_timeout(10)
129
+        _id = row['id']
130
+        tv_name = row['tv_name']
131
+        url = row['url']
132
+        if 'v.qq.com' not in url:
133
+            driver.quit()
134
+            continue
135
+        else:
136
+            try:
137
+                driver.get(url)
138
+            except Exception, e:
139
+                print e
140
+                driver.execute_script('window.stop()')
141
+            try:
142
+                count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
143
+            except Exception, e:
144
+                print e
145
+                count = 0
146
+            
147
+            print count
148
+
149
+            sql = """
150
+                update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
151
+            """
152
+            sql = sql % (count, _id)
153
+            Mysql.execute(sql, conn=conn)
154
+
155
+        driver.quit()
156
+
157
+def parse_wangju_all_url_data():
158
+    conn = Mysql.createOfflineConn()
159
+
160
+    sql = """
161
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
162
+    """
163
+
164
+    rows = Mysql.getAll(sql, conn=conn)
165
+
166
+    driver = webdriver.Firefox()
167
+    driver.set_page_load_timeout(10)
168
+
169
+    driver2 = webdriver.Firefox()
170
+    driver2.set_page_load_timeout(10)
171
+    for row in rows:
172
+        _id = row['id']
173
+        tv_name = row['tv_name']
174
+        url = row['url']
175
+        
176
+        try:
177
+            driver.get(url)
178
+        except Exception, e:
179
+            print e
180
+            driver.execute_script('window.stop()')
181
+        try:
182
+            score = driver.find_element_by_xpath('//div[@class="video_score"]').text
183
+            score = score.replace(' ', '').replace('\n', '')
184
+        except:
185
+            score = ''
186
+        
187
+        try:
188
+            pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
189
+        except:
190
+            pagelist = None
191
+        
192
+        try:
193
+            page_dict = dict()
194
+            if pagelist is not None:
195
+                for page in pagelist:
196
+                    episode = page.find_element_by_xpath('./a').get_attribute('href')
197
+                    episode_text = page.find_element_by_xpath('./a/span').text
198
+                    page_dict[episode_text] = episode
199
+            if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
200
+                try:
201
+                    driver2.get(page_dict.get('1'))
202
+                except Exception, e:
203
+                    print e
204
+                    driver2.execute_script('window.stop()')
205
+                try:
206
+                    count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
207
+                except Exception, e:
208
+                    print e
209
+                    count = 0
210
+                sql = """
211
+                    update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
212
+                """
213
+                sql = sql % (score, count, url)
214
+                Mysql.execute(sql, conn=conn)
215
+            else:
216
+                sql = """
217
+                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
218
+                """
219
+                sql = sql % (url, 'tengxun')
220
+                Mysql.execute(sql, conn=conn)
221
+        except Exception, e:
222
+            continue
223
+
224
+            
225
+        
226
+
227
+if __name__ == '__main__':
228
+    # scrapy_url()
229
+    # scrapy_data()
230
+    # scrapy_play_page()
231
+    parse_wangju_all_url_data()
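
A condensed sketch of the search --> detail --> episode-1 play page flow described in the module docstring above (Python 2; the Selenium XPaths are copied from the functions in this file, while PhantomJS and the live v.qq.com markup are assumptions, so treat this as illustrative rather than as the repo's actual entry point):

# -*- coding: utf-8 -*-
from urllib import quote
from selenium import webdriver

def fetch_playcount(tv_name):
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    try:
        # 1. search page: keep the first result that links to a v.qq.com/detail page
        driver.get('https://v.qq.com/x/search/?q=' + quote(str(tv_name)))
        detail_url = None
        for div in driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div'):
            try:
                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
            except Exception:
                continue
            if 'v.qq.com/detail' in href:
                detail_url = href
                break
        if detail_url is None:
            return None
        # 2. detail page: episode list; only episode 1 is needed for the play count
        driver.get(detail_url)
        first_episode = None
        for span in driver.find_elements_by_xpath('//div[@class="mod_episode"]/span'):
            try:
                if span.find_element_by_xpath('./a/span').text.strip() == '1':
                    first_episode = span.find_element_by_xpath('./a').get_attribute('href')
                    break
            except Exception:
                continue
        if first_episode is None or 'v.qq.com' not in first_episode:
            return None
        # 3. play page: the play count lives in the #mod_cover_playnum element
        driver.get(first_episode)
        return driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
    finally:
        driver.quit()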

+ 97 - 0
task_scrapy/scrapy_tianyancha.py

@@ -0,0 +1,97 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_tianyancha():
17
+    
18
+    conn = Mysql.createOfflineConn()
19
+    urls = []
20
+    for i in range(1, 33):
21
+        urls.append(str('http://hangzhou.tianyancha.com/search/p' + str(i) + '?key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92'))
22
+
23
+    driver = webdriver.Firefox()
24
+    driver.set_page_load_timeout(10)
25
+
26
+    for url in urls:
27
+        try:
28
+            driver.get(url)
29
+        except Exception, e:
30
+            print url
31
+            try:
32
+                driver.execute_script('window.stop()')
33
+            except Exception, e:
34
+                pass
35
+        time.sleep(10)
36
+        try:
37
+            divs = driver.find_elements_by_xpath('//div[@id="ng-view"]/div[2]/div/div/div[1]/div[3]/div')
38
+        except Exception, e:
39
+            continue
40
+        for div in divs:
41
+            try:
42
+                title = div.find_element_by_xpath('./div[2]/div/div[1]/a').get_attribute('textContent')
43
+                href = div.find_element_by_xpath('./div[2]/div/div[1]/a').get_attribute('href')
44
+
45
+                sql = """
46
+                    insert into scrapy.scrapy_tianyancha (name, url) values (%s, %s)
47
+                """
48
+                value = (title, href)
49
+                Mysql.insertOne(sql, value=value, conn=conn)
50
+            except Exception, e:
51
+                pass
52
+    driver.quit()
53
+
54
+def parse_detail():
55
+    
56
+    conn = Mysql.createOfflineConn()
57
+
58
+    sql = """
59
+        select id, url from scrapy.scrapy_tianyancha where content1 = '' or content1 is null order by id asc
60
+    """
61
+    rows = Mysql.getAll(sql, conn=conn)
62
+    driver = webdriver.Firefox()
63
+    driver.set_page_load_timeout(10)
64
+
65
+    for row in rows:
66
+        _id = row['id']
67
+        url = row['url']
68
+
69
+        try:
70
+            driver.get(url)
71
+        except Exception, e:
72
+            print url
73
+            try:
74
+                driver.execute_script('window.stop()')
75
+            except Exception, e:
76
+                pass
77
+        
78
+        time.sleep(5)
79
+        try:
80
+            content1 = driver.find_element_by_xpath('//div[@class="company_info_text"]').get_attribute('textContent')
81
+            content2_list = driver.find_elements_by_xpath('//div[@class="baseinfo-module-item"]')
82
+            content2 = ''
83
+            for content in content2_list:
84
+                content2 = content2 + content.find_element_by_xpath('.').get_attribute('textContent')
85
+        except Exception, e:
86
+            content1 = ''
87
+            content2 = ''
88
+        sql = """
89
+            update scrapy.scrapy_tianyancha set content1 = '%s', content2 = '%s' where id = %s
90
+        """
91
+        sql = sql % (content1, content2, _id)
92
+        Mysql.execute(sql, conn=conn)
93
+    driver.quit()
94
+
95
+if __name__ == '__main__':
96
+    # scrapy_tianyancha()
97
+    parse_detail()
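
The hard-coded key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92 in the search URLs above is just the percent-encoded UTF-8 form of the keyword 文化传媒 ("culture & media"); the same urllib.quote call used elsewhere in these scripts reproduces it (Python 2):

# -*- coding: utf-8 -*-
from urllib import quote

keyword = '文化传媒'           # a UTF-8 byte string under Python 2
print quote(keyword)           # -> %E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92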

+ 83 - 0
task_scrapy/scrapy_tv_unhandle.py

@@ -0,0 +1,83 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+Crawl from iQIYI the drama details that the Baidu Baike scraper failed to fetch
18
+"""
19
+
20
+# 爬取搜索页面
21
+def scrapy_url():
22
+    conn = Mysql.createOfflineConn()
23
+
24
+    sql = """
25
+        select max(tv_id) as tv_id from scrapy.iqiyi_dianshiju_detail
26
+    """
27
+
28
+    max_id = Mysql.getOne(sql, conn=conn)
29
+    if max_id is None or max_id[0] == 0:
30
+        max_tv_id = 0
31
+    else:
32
+        max_tv_id = max_id[0]
33
+
34
+    sql = """
35
+        select id, name from tv_lib.yxb_tv_series where id > %s and status = 12 order by id asc
36
+    """
37
+    sql = sql % (max_tv_id,)
38
+    rows = Mysql.getAll(sql, conn=conn)
39
+    driver = webdriver.PhantomJS()
40
+    driver.set_page_load_timeout(10)
41
+
42
+    driver2 = webdriver.PhantomJS()
43
+    driver2.set_page_load_timeout(10)
44
+    for row in rows:
45
+        _id = row['id']
46
+        name = row['name']
47
+        url = 'http://so.iqiyi.com/so/q_' + quote(str(name))
48
+        try:
49
+            driver.get(url)
50
+        except Exception, e:
51
+            driver.execute_script('window.stop()')
52
+        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
53
+        for li in lis:
54
+            try:
55
+                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
56
+                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
57
+                if 'www.iqiyi.com/lib' in href:
58
+                    print href
59
+                    try:
60
+                        driver2.get(href)
61
+                    except:
62
+                        pass
63
+                    content = driver2.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
64
+                    if content is None:
65
+                        content = ''
66
+                    desc = driver2.find_element_by_xpath('//div[@class="mod-body introduce-info"]').get_attribute('textContent')
67
+                    if desc is None:
68
+                        desc = ''
69
+                    
70
+                    content = content + '\n' + '概述:' + desc
71
+                    sql = """
72
+                        insert into scrapy.iqiyi_dianshiju_detail (tv_id, tv_name, title, detail_info_text, url) values (%s, %s, %s, %s, %s)
73
+                    """
74
+                    value = (_id, name, title, content, href)
75
+                    Mysql.insertOne(sql, value=value, conn=conn)
76
+            except Exception, e:
77
+                print e
78
+                continue
79
+    driver.quit()
80
+    driver2.quit()
81
+
82
+if __name__ == '__main__':
83
+    scrapy_url()

+ 146 - 0
task_scrapy/scrapy_website_count.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+Count marketing articles per drama and media site via Baidu site searches
6
+"""
7
+import random
8
+import sys
9
+import time
10
+import collections
11
+
12
+from selenium import webdriver
13
+from urllib import quote
14
+
15
+from fty_util.common import Mysql
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+def scrapy_website():
21
+    conn = Mysql.createOfflineConn()
22
+
23
+    # 将网站url和名称 放入有序字典中
24
+    websites_dict = collections.OrderedDict()
25
+    sql = """
26
+        select name, update_url from odl.basic_websites order by id asc
27
+    """
28
+    websites = Mysql.getAll(sql, conn=conn)
29
+    for website in websites:
30
+        name = website['name']
31
+        update_url = website['update_url']
32
+        websites_dict[update_url] = name
33
+    driver = webdriver.PhantomJS()
34
+    driver.set_page_load_timeout(10)
35
+
36
+    sql = """
37
+        select max(tv_id) as tv_id from scrapy.scrapy_article_count
38
+    """
39
+
40
+    max_tv_id = Mysql.getOne(sql, conn=conn)
41
+    if max_tv_id is None or max_tv_id[0] == 0:
42
+        max_id = 0
43
+    else:
44
+        max_id = max_tv_id[0]
45
+
46
+    start_url = 'http://www.baidu.com/'
47
+
48
+    sql = """
49
+        select tv_id, tv_name from odl.ad_tv_lib where tv_id > %s order by id asc
50
+    """
51
+    sql = sql % (max_id, )
52
+
53
+    tvs = Mysql.getAll(sql, conn=conn)
54
+
55
+    for tv in tvs:
56
+        tv_id = tv['tv_id']
57
+        tv_name = tv['tv_name']
58
+
59
+        for update_url in websites_dict:
60
+            name = websites_dict.get(update_url)
61
+            try:
62
+                driver.get(start_url)
63
+            except Exception, e:
64
+                pass
65
+            # input_box = None
66
+            # submit_button = None
67
+            # try:
68
+            #     input_box = driver.find_element_by_id('kw')
69
+            #     submit_button = driver.find_element_by_id('su')
70
+            # except Exception, e:
71
+            #     driver.refresh()
72
+            # 搜索条件
73
+            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
74
+            try:
75
+                js = 'document.getElementById("kw").value = "' + line + '"'
76
+                driver.execute_script(js)
77
+                js = 'document.getElementById("su").click()'
78
+                driver.execute_script(js)
79
+                # input_box.clear()
80
+                # input_box.send_keys(line)
81
+                # submit_button.click()
82
+            except Exception, e:
83
+                print '点击请求失败'
84
+
85
+            time.sleep(1)
86
+            # 分页块
87
+            page = None
88
+            try:
89
+                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
90
+            except:
91
+                pass
92
+            count = 0
93
+            # No pagination block: there are at most ten hits (or none) on the single result page
94
+            if page is None or len(page) == 0:
95
+                try:
96
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
97
+                    if divs is not None and len(divs) > 0:
98
+                        count = len(divs)
99
+                except Exception, e:
100
+                    count = 0
101
+            # Pagination exists: open the highest-numbered page and count its hits (the count arithmetic is sketched after this file's diff)
102
+            else:
103
+                try:
104
+                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
105
+                    max_page_num = 1
106
+                    max_page_href= ''
107
+                    for page_num in page_nums:
108
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
109
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
110
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
111
+
112
+                        # 如果只是数字
113
+                        if page_num_text.isdigit():
114
+                            page_num_text = int(page_num_text)
115
+                            if page_num_text > max_page_num:
116
+                                max_page_num = page_num_text
117
+                                max_page_href = href
118
+                        # 如果是下一页字符串
119
+                        elif page_num_text == '下一页>':
120
+                            break
121
+
122
+                    try:
123
+                        driver.get(max_page_href)
124
+                    except Exception, e:
125
+                        pass
126
+                    try:
127
+                        divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
128
+                        if divs is not None and len(divs) > 0:
129
+                            count = len(divs)
130
+                    except Exception, e:
131
+                        count = 0
132
+                    
133
+                    count = (max_page_num - 1) * 10 + count
134
+                except:
135
+                    continue
136
+
137
+            if count != 0:
138
+                sql = """
139
+                    insert into scrapy.scrapy_article_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
140
+                """
141
+                value = (tv_id, tv_name, 1, name, line, count)
142
+                Mysql.insertOne(sql, value=value, conn=conn)
143
+    driver.quit()
144
+
145
+if __name__ == '__main__':
146
+    scrapy_website()
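
The hit counting above boils down to: with no pagination block, the count is simply the number of result divs on the single page; with pagination, the highest-numbered page is opened and the total is estimated as (max_page_num - 1) * 10 + hits on that last page, since Baidu shows ten organic results per page. A tiny self-contained check of that arithmetic (the numbers are made up):

def estimate_hits(max_page_num, hits_on_last_page):
    # every page before the last contributes 10 results, the last contributes what it shows
    return (max_page_num - 1) * 10 + hits_on_last_page

print estimate_hits(1, 7)    # single page with 7 results -> 7
print estimate_hits(4, 3)    # pages 1-3 full, 3 hits on page 4 -> 33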

+ 206 - 0
task_scrapy/scrapy_website_count_new.py

@@ -0,0 +1,206 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+Scrape marketing articles for newly released dramas
6
+"""
7
+import random
8
+import sys
9
+import time
10
+import collections
11
+
12
+from selenium import webdriver
13
+from urllib import quote
14
+
15
+from fty_util.common import Mysql
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+def scrapy_website():
21
+    conn = Mysql.createOfflineConn()
22
+
23
+    # 清空scrapy.scrapy_article表
24
+    sql = """
25
+        truncate table scrapy.scrapy_article
26
+    """
27
+    Mysql.execute(sql, conn=conn)
28
+
29
+    # Put each site's URL and name into an ordered dict
30
+    websites_dict = collections.OrderedDict()
31
+    sql = """
32
+        select name, update_url from odl.basic_websites order by id asc
33
+    """
34
+    websites = Mysql.getAll(sql, conn=conn)
35
+    for website in websites:
36
+        name = website['name']
37
+        update_url = website['update_url']
38
+        websites_dict[update_url] = name
39
+    driver = webdriver.PhantomJS()
40
+    driver.set_page_load_timeout(10)
41
+
42
+    driver2 = webdriver.PhantomJS()
43
+    driver2.set_page_load_timeout(10)
44
+
45
+    start_url = 'http://www.baidu.com/'
46
+
47
+    sql = """
48
+        select id, tv_name from yxb.ad_tv_lib where source = 1 order by id asc
49
+    """
50
+
51
+    tvs = Mysql.getAll(sql, conn=conn)
52
+
53
+    for tv in tvs:
54
+        tv_id = tv['id']
55
+        tv_name = tv['tv_name']
56
+
57
+        for update_url in websites_dict:
58
+            name = websites_dict.get(update_url)
59
+            try:
60
+                driver.get(start_url)
61
+            except Exception, e:
62
+                pass
63
+            # input_box = None
64
+            # submit_button = None
65
+            # try:
66
+            #     input_box = driver.find_element_by_id('kw')
67
+            #     submit_button = driver.find_element_by_id('su')
68
+            # except Exception, e:
69
+            #     driver.refresh()
70
+            # 搜索条件
71
+            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
72
+            print line
73
+            try:
74
+                # input_box.clear()
75
+                # input_box.send_keys(line)
76
+                # submit_button.click()
77
+                js = 'document.getElementById("kw").value = "' + line + '"'
78
+                driver.execute_script(js)
79
+                js = 'document.getElementById("su").click()'
80
+                driver.execute_script(js)
81
+            except Exception, e:
82
+                print '点击请求失败'
83
+
84
+            time.sleep(1)
85
+            # 分页块
86
+            page = None
87
+            try:
88
+                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
89
+            except:
90
+                pass
91
+            count = 0
92
+            # 如果分页不存在,说明记录在十条以内或没有记录
93
+            if page is None or len(page) == 0:
94
+                try:
95
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
96
+                    if divs is not None and len(divs) > 0:
97
+                        count = len(divs)
98
+                        for div in divs:
99
+                            try:
100
+                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
101
+                                div_title = div_title.replace(' ', '').replace('\n', '')
102
+                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
103
+                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
104
+                                import re
105
+                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
106
+                                if m is not None:
107
+                                    div_date = m.group(0)
108
+                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
109
+                                    sql = """
110
+                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
111
+                                    """
112
+                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
113
+                                    Mysql.insertOne(sql, value=value, conn=conn)
114
+                            except:
115
+                                pass
116
+                except Exception, e:
117
+                    print e
118
+                    count = 0
119
+            #  如果分页存在,判断最后一页是不是10
120
+            else:
121
+                
122
+                try:
123
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
124
+                except:
125
+                    divs = None
126
+                if divs is not None and len(divs) > 0:
127
+                    # count = len(divs)
128
+                    for div in divs:
129
+                        try:
130
+                            try:
131
+                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
132
+                                div_title = div_title.replace(' ', '').replace('\n', '')
133
+                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
134
+                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
135
+                                import re
136
+                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
137
+                                if m is not None:
138
+                                    div_date = m.group(0)
139
+                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
140
+                                    sql = """
141
+                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
142
+                                    """
143
+                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
144
+                                    Mysql.insertOne(sql, value=value, conn=conn)
145
+                            except:
146
+                                pass
147
+                        except Exception, e:
148
+                            pass
149
+
150
+                try:
151
+                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
152
+                    max_page_num = 1
153
+                    max_page_href= ''
154
+                    for page_num in page_nums:
155
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
156
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
157
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
158
+
159
+                        # 如果只是数字
160
+                        if page_num_text.isdigit():
161
+                            page_num_text = int(page_num_text)
162
+                            if page_num_text > max_page_num:
163
+                                max_page_num = page_num_text
164
+                                max_page_href = href
165
+
166
+                                try:
167
+                                    driver2.get(max_page_href)
168
+                                except Exception, e:
169
+                                    print e
170
+                                    pass
171
+
172
+                                divs = driver2.find_elements_by_xpath('//div[@id="content_left"]/div')
173
+                                if divs is not None and len(divs) > 0:
174
+                                    # count = len(divs)
175
+                                    for div in divs:
176
+                                        try:
177
+                                            div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
178
+                                            div_title = div_title.replace(' ', '').replace('\n', '')
179
+                                            div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
180
+                                            div_content = div.find_element_by_xpath('.').get_attribute('textContent')
181
+                                            import re
182
+                                            m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
183
+                                            if m is not None:
184
+                                                div_date = m.group(0)
185
+                                                div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
186
+                                                sql = """
187
+                                                    insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
188
+                                                """
189
+                                                value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
190
+                                                Mysql.insertOne(sql, value=value, conn=conn)
191
+                                        except:
192
+                                            pass
193
+
194
+                        # 如果是下一页字符串
195
+                        elif page_num_text == '下一页>':
196
+                            break
197
+                except Exception, e:
198
+                    print e
199
+                    continue
200
+
201
+    driver.quit()
202
+    driver2.quit()
203
+    Mysql.close(conn=conn)
204
+
205
+if __name__ == '__main__':
206
+    scrapy_website()
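
The date handling repeated in the three branches above is one regex plus three replaces; a standalone check (Python 2, the snippet text is hypothetical):

# -*- coding: utf-8 -*-
import re

div_content = u'某剧开播发布会 2017年3月15日 - 来源页面摘要'   # hypothetical Baidu result snippet
m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
if m is not None:
    div_date = m.group(0).replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
    print div_date    # -> 2017-3-15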

+ 222 - 0
task_scrapy/scrapy_youku.py

@@ -0,0 +1,222 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_youku is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://www.soku.com/search_video/q_' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@class="sk-express"]/div/div')
41
+        for div in divs:
42
+            try:
43
+                title = div.find_element_by_xpath('./div/div[2]/div[1]/div/h2/a[1]').get_attribute('textContent')
44
+                title = title.replace(' ', '').replace('\n', '')
45
+                href = div.find_element_by_xpath('//div[@class="info_cont"]/p/a').get_attribute('href')
46
+
47
+                jishu = None
48
+                try:
49
+                    jishu = div.find_elements_by_xpath('//div[@class="s_items all site14 "]/ul/li')
50
+                except Exception, e:
51
+                    pass
52
+                if jishu is None or len(jishu) == 0:
53
+                    try:
54
+                        # jishu = div.find_elements_by_xpath('//div[@class="s_items site14 "]/ul/li')
55
+                        jishu = div.find_elements_by_xpath('//div[@class="s_detail"]/div[4]/ul/li')
56
+                    except Exception, e:
57
+                        pass
58
+                if tv_name in title and jishu is not None and len(jishu) > 0:
59
+                    sql = """
60
+                        update scrapy.wangju_url set url_youku = '%s' where id = %s
61
+                    """
62
+                    sql = sql % (href, _id)
63
+                    Mysql.execute(sql, conn=conn)
64
+                    need_blank = False
65
+            except Exception, e:
66
+                pass
67
+        if need_blank:
68
+            sql = """
69
+                update scrapy.wangju_url set url_youku = '%s' where id = %s
70
+            """
71
+            sql = sql % ('', _id)
72
+            Mysql.execute(sql, conn=conn)
73
+    driver.quit()
74
+
75
+def scrapy_data():
76
+    conn = Mysql.createOfflineConn()
77
+
78
+    sql = """
79
+        select id, tv_name, url_youku from scrapy.wangju_url where url_youku is not null and url_youku != '' order by id asc
80
+    """
81
+
82
+    rows = Mysql.getAll(sql, conn=conn)
83
+
84
+    driver = webdriver.Firefox()
85
+    driver.set_page_load_timeout(10)
86
+
87
+    for row in rows:
88
+        _id = row['id']
89
+        tv_name = row['tv_name']
90
+        url_youku = row['url_youku']
91
+
92
+        need_blank = True
93
+        try:
94
+            driver.get(url_youku)
95
+        except Exception, e:
96
+            driver.execute_script('window.stop()')
97
+
98
+        try:
99
+            content = driver.find_element_by_xpath('//div[@class="detailinfo"]').get_attribute('textContent')
100
+        except Exception, e:
101
+            try:
102
+                content = driver.find_element_by_xpath('//div[@class="p-base"]').get_attribute('textContent')
103
+            except Exception, e:
104
+                continue
105
+
106
+        sql = """
107
+            insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
108
+        """
109
+        value = (_id, tv_name, url_youku, '', content, 'youku')
110
+        Mysql.insertOne(sql, value=value, conn=conn)
111
+
112
+    driver.quit()
113
+
114
+def parse_content():
115
+    conn = Mysql.createOfflineConn()
116
+
117
+    sql = """
118
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'youku' order by id asc
119
+    """
120
+    rows = Mysql.getAll(sql, conn=conn)
121
+
122
+    for row in rows:
123
+        _id = row['id']
124
+        tv_name = row['tv_name']
125
+        url = row['url']
126
+        content = row['content']
127
+
128
+        import re
129
+        m = re.search(ur'评分: ([0-9]+[.]?)+', content)
130
+        score = '0'
131
+        if m is not None:
132
+            score = m.group(0)
133
+
134
+        play = '0'
135
+        m = re.search(ur'播放数:([0-9]+[,]?)+', content)
136
+        if m is not None:
137
+            play = m.group(0)
138
+
139
+        sql = """
140
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'youku'
141
+        """
142
+        sql = sql % (score, play, url)
143
+        Mysql.execute(sql, conn=conn)
144
+
145
+# def parse_detail_content():
146
+#     conn = Mysql.createOfflineConn()
147
+
148
+#     sql = """
149
+#         select id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
150
+#     """
151
+#     rows = Mysql.getAll(sql, conn=conn)
152
+
153
+#     for row in rows:
154
+#         _id = row['id']
155
+#         detail_info_text = row['detail_info_text']
156
+#         # sql = """
157
+#         #     update scrapy.iqiyi_dianshiju_detail aa inner join scrapy.iqiyi_dianshiju_detail_copy bb on aa.id = bb.id set aa.detail_info_text = bb.detail_info_text
158
+#         # """
159
+#         # Mysql.update(sql, conn=conn)
160
+#         if detail_info_text is not None:
161
+#             # content = ''
162
+#             # (line0, line1) = tuple(detail_info_text.split(u'评分'))
163
+#             # line0 = line0.replace('\n', '')
164
+#             # content = line0 + '\n' + line1
165
+#             for line in detail_info_text.split('\n'):
166
+                
167
+#             sql = """
168
+#                 update scrapy.iqiyi_dianshiju_detail set detail_info_text = %s where id = %s
169
+#             """
170
+#             value = (content, _id)
171
+#             Mysql.update(sql, param=value, conn=conn)
172
+#     Mysql.close(conn=conn)
173
+
174
+def update_tv_lib():
175
+    conn = Mysql.createOfflineConn()
176
+    sql = """
177
+        select tv_id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
178
+    """
179
+    rows = Mysql.getAll(sql, conn=conn)
180
+
181
+    for row in rows:
182
+        tv_id = row['tv_id']
183
+        detail_info_text = row['detail_info_text']
184
+
185
+        lines = []
186
+        for line in detail_info_text.split('\n'):
187
+            lines.append(line)
188
+        director = ''
189
+        actors = ''
190
+        product_area = ''
191
+        premiere_time = ''
192
+        _type = ''
193
+        for i in range(len(lines)):
194
+            line = lines[i]
195
+            if u'导演' in line:
196
+                director = line.replace(u'导演:', '')
197
+            if u'主演' in line:
198
+                actors = line.replace(u'主演:', '')
199
+            if u'地区' in line:
200
+                product_area = line.replace(u'地区:', '')
201
+            if u'首播时间' in line:
202
+                premiere_time = line.replace(u'首播时间:', '')
203
+            if u'看点' in line:
204
+                # print line[i+1]
205
+                print lines[i+1]
206
+                _type = lines[i+1]
207
+            
208
+            # if u'更新时间' in line:
209
+            #     gengxin = lines[i+1]
210
+    
211
+        sql = """
212
+            update tv_lib.yxb_tv_series set level = %s, type = %s, script_form = %s, director = %s, product_area = %s, actors = %s, premiere_time = %s where id = %s
213
+        """
214
+        value = (5, _type, 1, director, product_area, actors, premiere_time, tv_id)
215
+        Mysql.update(sql, param=value, conn=conn)
216
+    Mysql.close(conn=conn)
217
+if __name__ == '__main__':
218
+    # scrapy_data()
219
+    # scrapy_url()
220
+    # parse_content()
221
+    # parse_detail_content()
222
+    update_tv_lib()
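
update_tv_lib above walks the crawled detail text line by line, strips the 导演/主演/地区/首播时间 labels, and takes the 看点 (highlights) value from the following line. A compact sketch of the same parsing on a hypothetical detail_info_text (Python 2):

# -*- coding: utf-8 -*-
detail_info_text = u'导演:张三\n主演:李四 王五\n地区:内地\n首播时间:2016\n看点\n都市 爱情'

lines = detail_info_text.split('\n')
fields = {'director': '', 'actors': '', 'product_area': '', 'premiere_time': '', 'type': ''}
for i, line in enumerate(lines):
    if u'导演' in line:
        fields['director'] = line.replace(u'导演:', '')
    if u'主演' in line:
        fields['actors'] = line.replace(u'主演:', '')
    if u'地区' in line:
        fields['product_area'] = line.replace(u'地区:', '')
    if u'首播时间' in line:
        fields['premiere_time'] = line.replace(u'首播时间:', '')
    if u'看点' in line and i + 1 < len(lines):
        fields['type'] = lines[i + 1]          # the value sits on the next line
print fields['director'], fields['type']       # -> 张三 都市 爱情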

+ 43 - 0
task_tmp/tmp_data_month.py

@@ -0,0 +1,43 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""月份提取
5
+
6
+将odl.ad_television的时间按月份进行统计
7
+"""
8
+
9
+import datetime
10
+import sys
11
+
12
+from fty_util.common import Mysql
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+conn = Mysql.createOfflineConn()
18
+
19
+sql = """
20
+    truncate table tmp.ad_television_month
21
+"""
22
+Mysql.execute(sql, conn=conn)
23
+
24
+# 月份提取
25
+sql = """
26
+    select date_format(t.tv_date, '%Y-%m') as month, year(t.tv_date) as year from (
27
+        select distinct tv_date from odl.ad_television group by tv_date
28
+    ) t
29
+    group by month
30
+"""
31
+rows = Mysql.getAll(sql, conn=conn)
32
+sql_insert = """
33
+    insert into tmp.ad_television_month (year, month) values (%s, %s)
34
+"""
35
+data_list = []
36
+for row in rows:
37
+    month = row['month']
38
+    year = row['year']
39
+    month_value = datetime.datetime.strptime(month, '%Y-%m')
40
+    data_list.append((year, month_value))
41
+if len(data_list) > 0:
42
+    Mysql.insertMany(sql_insert, data_list, conn)
43
+Mysql.close(conn)
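
For reference, the month value stored above is pinned to the first day of each month: date_format(tv_date, '%Y-%m') yields strings like '2016-07', and strptime turns them into a datetime on day 1.

import datetime

month = '2016-07'                                          # what date_format(tv_date, '%Y-%m') returns
month_value = datetime.datetime.strptime(month, '%Y-%m')
print month_value                                           # -> 2016-07-01 00:00:00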

+ 116 - 0
task_tmp/tmp_tv_avg_ratings_fatt0.py

@@ -0,0 +1,116 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""按月统计电视剧的收视情况
5
+
6
+province_input: odl.ad_television
7
+province_output: tmp.month_channel_stat
8
+
9
+area_input: odl.area_ad_television
10
+area_output: tmp.area_month_channel_stat
11
+"""
12
+import sys
13
+
14
+from fty_util.common import Mysql, Util
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+class channel_history_month_stat():
20
+    
21
+    def province(self):
22
+        # Empty the monthly channel statistics table
23
+        conn = Mysql.createOfflineConn()
24
+        sql = """
25
+            truncate table tmp.month_channel_stat
26
+        """
27
+        Mysql.execute(sql, conn=conn)
28
+        sql = """
29
+            select distinct month from tmp.ad_television_month order by month asc
30
+        """
31
+        rows = Mysql.getAll(sql, conn=conn)
32
+        for row in rows:
33
+            month = row['month']
34
+            print month
35
+            month_max_date = Util.get_max_date_of_month(month)
36
+            
37
+            # 统计电视台当月电视剧收视数据
38
+            sql = """
39
+                select adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id,
40
+                min(adt.tv_date) as tv_date, sum(adt.audience_rating) as sum_value, count(adt.id) as count_value from odl.ad_television adt 
41
+                where adt.tv_date >= '%s' and adt.tv_date <= '%s' and adt.audience_rating > 0
42
+                group by adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id
43
+            """
44
+            sql = sql % (month, month_max_date)
45
+            rows_all = Mysql.getAll(sql, conn=conn)
46
+            data_list = []
47
+            sql_insert = """
48
+                insert into tmp.month_channel_stat (channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month)
49
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
50
+            """
51
+            year = month.year
52
+            for row_all in rows_all:
53
+                channel = row_all['channel']
54
+                theater_attribute = row_all['theater_attribute']
55
+                tv_name = row_all['tv_name']
56
+                tv_id = row_all['tv_id']
57
+                tv_date = row_all['tv_date']
58
+                sum_value = row_all['sum_value']
59
+                count_value = row_all['count_value']
60
+                data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month))
61
+            Mysql.insertMany(sql_insert, data_list, conn)
62
+        Mysql.close(conn)
63
+
64
+    def area(self):
65
+        # Empty the area monthly channel statistics table
67
+        conn = Mysql.createOfflineConn()
68
+        sql = """
69
+            truncate table tmp.area_month_channel_stat
69
+        """
70
+        Mysql.execute(sql, conn=conn)
71
+        sql = """
72
+            select distinct month from tmp.ad_television_month order by month asc
73
+        """
74
+        rows = Mysql.getAll(sql, conn=conn)
75
+        for row in rows:
76
+            month = row['month']
77
+            print month
78
+            month_max_date = Util.get_max_date_of_month(month)
79
+            
80
+            # 统计电视台当月电视剧收视数据
81
+            sql = """
82
+                select adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id,
83
+                min(adt.tv_date) as tv_date, sum(adt.audience_rating) as sum_value, count(adt.id) as count_value from odl.area_ad_television adt 
84
+                where adt.tv_date >= '%s' and adt.tv_date <= '%s' and adt.audience_rating > 0
85
+                group by adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id
86
+            """
87
+            sql = sql % (month, month_max_date)
88
+            rows_all = Mysql.getAll(sql, conn=conn)
89
+            data_list = []
90
+            sql_insert = """
91
+                insert into tmp.area_month_channel_stat (channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month)
92
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
93
+            """
94
+            year = month.year
95
+            for row_all in rows_all:
96
+                channel = row_all['channel']
97
+                theater_attribute = row_all['theater_attribute']
98
+                tv_name = row_all['tv_name']
99
+                tv_id = row_all['tv_id']
100
+                tv_date = row_all['tv_date']
101
+                sum_value = row_all['sum_value']
102
+                count_value = row_all['count_value']
103
+                data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month))
104
+            Mysql.insertMany(sql_insert, data_list, conn=conn)
105
+        Mysql.close(conn)
106
+
107
+if __name__ == '__main__':
108
+    if len(sys.argv) != 2:
109
+        print '没有输入参数,退出'
110
+        sys.exit(0)
111
+    print 'method name is ' + sys.argv[1]
112
+    obj = channel_history_month_stat()
113
+    try:
114
+        getattr(obj, sys.argv[1])()
115
+    except Exception, e:
116
+        print e
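
Util.get_max_date_of_month above turns the month's first day into that month's last day so the tv_date filter covers the whole month. Its real implementation lives in fty_util/common.py; one equivalent way to compute it, shown purely for illustration (this is not the repo's code):

import calendar
import datetime

def last_day_of_month(first_day):
    # number of days in that month -> last calendar day
    days = calendar.monthrange(first_day.year, first_day.month)[1]
    return datetime.date(first_day.year, first_day.month, days)

print last_day_of_month(datetime.date(2016, 2, 1))   # -> 2016-02-29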

+ 146 - 0
task_tmp/tmp_tv_avg_ratings_stat.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""计算每个电视剧的收视率
5
+
6
+province_input: tmp.month_channel_stat
7
+province_output: tmp.tv_avg_ratings
8
+
9
+area_input: tmp.area_month_channel_stat
10
+area_output: tmp.area_tv_avg_ratings
11
+"""
12
+import datetime
13
+import sys
14
+
15
+from fty_util.common import Mysql, Util
16
+
17
+class tv_avg_ratings_clac():
18
+    
19
+    def province(self):
20
+        conn = Mysql.createOfflineConn()
21
+        sql = """
22
+            truncate table tmp.tv_avg_ratings
23
+        """
24
+        Mysql.execute(sql, conn=conn)
25
+        # Read the monthly aggregates from tmp.month_channel_stat
26
+        sql = """
27
+            select channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value from tmp.month_channel_stat order by tv_date asc
28
+        """
29
+        rows = Mysql.getAll(sql, conn=conn)
30
+        channel_dict = {}
31
+        # key中有日期
32
+        # variant of the dict whose key also carries the run start date
33
+        for row in rows:
34
+            channel = row['channel']
35
+            theater_attribute = row['theater_attribute']
36
+            tv_name = row['tv_name']
37
+            tv_id = row['tv_id']
38
+            tv_date = row['tv_date']
39
+            sum_value = row['sum_value']
40
+            count_value = row['count_value']
41
+            key = (channel, theater_attribute, tv_name, tv_id)
42
+            # 如果两部电视剧在同一台同一剧场播放两次,则保存两条记录
43
+            # 如果字典中不存在数据,则添加
44
+            if channel_dict.get(key) is None:
45
+                channel_dict[key] = (str(tv_date), sum_value, count_value)
46
+                channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
47
+            else:
48
+                # Otherwise compare the dates
50
+                value = channel_dict.get(key)
51
+                # date of the current row
52
+                date1 = datetime.datetime.strptime(str(tv_date), '%Y-%m-%d')
53
+                # date already stored for this key
54
+                date2 = datetime.datetime.strptime(str(value[0]), '%Y-%m-%d')
55
+                sub_value = (date2 - date1).days
56
+                # A gap of at most 25 days either way means the same broadcast run (see the merge sketch after this file's diff)
56
+                if sub_value <= 25 and sub_value >= -25:
57
+                    value2 = channel_date_dict.get((channel, theater_attribute, tv_name, tv_id, str(value[0])))
58
+                    channel_dict[key] = (str(value[0]), sum_value, count_value)
59
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(value[0]))] = tuple(x + y for x, y in zip((sum_value, count_value), tuple(value2)))
60
+                # A larger gap means the drama was re-broadcast, so a new run is started
61
+                else:
62
+                    channel_dict[key] = (str(tv_date), sum_value, count_value)
63
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value) 
64
+
65
+        sql = """
66
+            insert into tmp.tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value) values (%s, %s, %s, %s, %s, %s)
67
+        """
68
+        data_list = []
69
+        for key in channel_date_dict.keys():
70
+            (channel, theater_attribute, tv_name, tv_id, tv_date) = key
71
+            (sum_value, count_value) = channel_date_dict.get(key)
72
+            # value = (channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value)
73
+            data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value))
74
+            # Mysql.insertOne(sql, value=value, conn=conn)
75
+        # Mysql.insertMany(sql, data_list, conn)
76
+        Util.insert_by_chunk(sql, data_list, conn)
77
+        Mysql.close(conn)
78
+
79
+    def area(self):
80
+        conn = Mysql.createOfflineConn()
81
+        sql = """
82
+            truncate table tmp.area_tv_avg_ratings
83
+        """
84
+        Mysql.execute(sql, conn=conn)
85
+        # 从tmp.month_channel_stat表中取出每月的统计数据
86
+        sql = """
87
+            select channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value from tmp.area_month_channel_stat order by tv_date asc
88
+        """
89
+        rows = Mysql.getAll(sql, conn=conn)
90
+        channel_dict = {}
91
+        # key中有日期
92
+        channel_date_dict = {}
93
+        for row in rows:
94
+            channel = row['channel']
95
+            theater_attribute = row['theater_attribute']
96
+            tv_name = row['tv_name']
97
+            tv_id = row['tv_id']
98
+            tv_date = row['tv_date']
99
+            sum_value = row['sum_value']
100
+            count_value = row['count_value']
101
+            key = (channel, theater_attribute, tv_name, tv_id)
102
+            # 如果两部电视剧在同一台同一剧场播放两次,则保存两条记录
103
+            # 如果字典中不存在数据,则添加
104
+            if channel_dict.get(key) is None:
105
+                channel_dict[key] = (str(tv_date), sum_value, count_value)
106
+                channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
107
+            else:
108
+                # 否则进行时间判断
109
+                value = channel_dict.get(key)
110
+                # 当前的日期
111
+                date1 = datetime.datetime.strptime(str(tv_date), '%Y-%m-%d')
112
+                # 保存的日期
113
+                date2 = datetime.datetime.strptime(str(value[0]), '%Y-%m-%d')
114
+                sub_value = (date2 - date1).days
115
+                # 如果相减天数在这之间,表示同一部电视剧
116
+                if sub_value <= 25 and sub_value >= -25:
117
+                    value2 = channel_date_dict.get((channel, theater_attribute, tv_name, tv_id, str(value[0])))
118
+                    channel_dict[key] = (str(value[0]), sum_value, count_value)
119
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(value[0]))] = tuple(x + y for x, y in zip((sum_value, count_value), tuple(value2)))
120
+                # 日期超出范围,则表示同一部电视剧播出了多次
121
+                else:
122
+                    channel_dict[key] = (str(tv_date), sum_value, count_value)
123
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
124
+                
125
+        sql = """
126
+            insert into tmp.area_tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value) values (%s, %s, %s, %s, %s, %s)
127
+        """
128
+        data_list = []
129
+        for key in channel_date_dict.keys():
130
+            (channel, theater_attribute, tv_name, tv_id, tv_date) = key
131
+            (sum_value, count_value) = channel_date_dict.get(key)
132
+            data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value))
133
+        
134
+        Util.insert_by_chunk(sql, data_list, conn)
135
+        Mysql.close(conn)
136
+
137
+if __name__ == '__main__':
138
+    if len(sys.argv) != 2:
139
+        print '没有输入参数,退出'
140
+        sys.exit(0)
141
+    print 'method name is ' + sys.argv[1]
142
+    obj = tv_avg_ratings_clac()
143
+    try:
144
+        getattr(obj, sys.argv[1])()
145
+    except Exception, e:
146
+        print e
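
The dictionary juggling in province()/area() above implements one rule: monthly rows for the same (channel, theater_attribute, tv_name, tv_id) whose dates lie within 25 days of each other belong to the same broadcast run, so their sum_value/count_value pairs are added before the average is taken; a larger gap starts a new run (a re-broadcast). A reduced sketch of that merge on two hypothetical monthly rows:

import datetime

# two monthly aggregates for the same drama on the same channel (made-up numbers)
rows = [('2016-05-28', 12.4, 20),    # (first tv_date in the month, sum of ratings, number of airings)
        ('2016-06-01', 6.0, 10)]

runs = {}           # run start date -> (sum of ratings, number of airings)
last_start = None
for tv_date, sum_value, count_value in rows:
    d = datetime.datetime.strptime(tv_date, '%Y-%m-%d')
    if last_start is not None and abs((d - datetime.datetime.strptime(last_start, '%Y-%m-%d')).days) <= 25:
        s, c = runs[last_start]
        runs[last_start] = (s + sum_value, c + count_value)   # same run: accumulate
    else:
        last_start = tv_date
        runs[tv_date] = (sum_value, count_value)              # new run
for start, (s, c) in runs.items():
    print start, float(s) / c    # -> 2016-05-28 0.613... (one merged run, average rating)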

+ 93 - 0
task_tmp/tmp_tv_category_stat.py

@@ -0,0 +1,93 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台对应电视剧及类型关系数据
5
+
6
+电视台播放的电视剧根据类型分别保存记录
7
+province_input: odl.ad_television odl.ad_tv_lib
8
+province_output: tmp.tv_category_stat
9
+
10
+area_input: odl.area_ad_television odl.ad_tv_lib
11
+area_output: tmp.area_tv_category_stat
12
+"""
13
+import sys
14
+
15
+from fty_util.common import Mysql, Util
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+class tv_category_stat():
21
+    
22
+    def province(self):
23
+        conn = Mysql.createOfflineConn()
24
+        sql = """
25
+            truncate table tmp.tv_category_stat
26
+        """
27
+        Mysql.execute(sql, conn=conn)
28
+        # 电视台播放电视剧分类数据
29
+        station_dict = {}
30
+        sql = """
31
+            select oat.tv_id, oat.channel, oat.theater_attribute, oatl.categories from odl.ad_television oat
32
+            left join odl.ad_tv_lib oatl on oat.tv_id = oatl.tv_id
33
+            where oat.tv_id is not null and oat.theater_attribute != '' and oat.theater_attribute is not null
34
+            group by tv_id, channel, theater_attribute
35
+        """
36
+        rows = Mysql.getAll(sql, conn=conn)
37
+        sql = """
38
+            insert into tmp.tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)
39
+        """
40
+        for row in rows:
41
+            tv_id = row['tv_id']
42
+            channel = row['channel']
43
+            theater_attribute = row['theater_attribute']
44
+            categories = row['categories']
45
+            if categories is not None and len(categories) > 0:
46
+                cate_list = categories.split(' ')
47
+                data_list = []
48
+                for cat in cate_list:
49
+                    data_list.append((tv_id, cat, channel, theater_attribute))
50
+                Mysql.insertMany(sql, data_list, conn)
51
+        Mysql.close(conn)
52
+
53
+    def area(self):
54
+        conn = Mysql.createOfflineConn()
55
+        sql = """
56
+            truncate table tmp.area_tv_category_stat
57
+        """
58
+        Mysql.execute(sql, conn=conn)
59
+        # 电视台播放电视剧分类数据
60
+        station_dict = {}
61
+        sql = """
62
+            select oat.tv_id, oat.channel, oat.theater_attribute, oatl.categories from odl.area_ad_television oat
63
+            left join odl.ad_tv_lib oatl on oat.tv_id = oatl.tv_id
64
+            where oat.tv_id is not null
65
+            group by tv_id, channel, theater_attribute
66
+        """
67
+        rows = Mysql.getAll(sql, conn=conn)
68
+        sql = """
69
+            insert into tmp.area_tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)
70
+        """
71
+        for row in rows:
72
+            tv_id = row['tv_id']
73
+            channel = row['channel']
74
+            theater_attribute = row['theater_attribute']
75
+            categories = row['categories']
76
+            if categories is not None and len(categories) > 0:
77
+                cate_list = categories.split(' ')
78
+                data_list = []
79
+                for cat in cate_list:
80
+                    data_list.append((tv_id, cat, channel, theater_attribute))
81
+                Mysql.insertMany(sql, data_list, conn)
82
+        Mysql.close(conn)
83
+
84
+if __name__ == '__main__':
85
+    if len(sys.argv) != 2:
86
+        print 'no argument supplied, exiting'
87
+        sys.exit(0)
88
+    print 'method name is ' + sys.argv[1]
89
+    obj = tv_category_stat()
90
+    try:
91
+        getattr(obj, sys.argv[1])()
92
+    except Exception, e:
93
+        print e

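Both methods in tmp_tv_category_stat.py split the space-delimited categories string into one row per category and call Mysql.insertMany once per source row. A sketch of collecting the whole fan-out first and issuing a single batched insert (row keys follow the SELECT above; explode_categories is a hypothetical helper, not part of fty_util):

def explode_categories(rows):
    # one output tuple per (drama, category) pair; empty categories are skipped
    data_list = []
    for row in rows:
        categories = row['categories']
        if not categories:
            continue
        for cat in categories.split(' '):
            if cat:
                data_list.append((row['tv_id'], cat, row['channel'], row['theater_attribute']))
    return data_list

# usage, with conn and rows as in province() above:
# sql = "insert into tmp.tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)"
# Mysql.insertMany(sql, explode_categories(rows), conn)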
+ 85 - 0
task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py

@@ -0,0 +1,85 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台近一年平均收视率
5
+
6
+计算方法:取出电视台的一年收视数据,求平均数
7
+avg(audience_rating) group by channel, theater_attribute
8
+
9
+province_input: odl.ad_television
10
+province_output: tmp.channel_avg_ratings
11
+
12
+area_input: odl.area_ad_television
13
+area_output: tmp.area_channel_avg_ratings
14
+"""
15
+import sys
16
+
17
+from fty_util.common import Mysql, Util
18
+
19
+reload(sys)
20
+sys.setdefaultencoding('utf8')
21
+
22
+class channel_avg_ratings():
23
+    
24
+    # national (CCTV) and satellite channels
25
+    def province(self):
26
+        conn = Mysql.createOfflineConn()
27
+        first_day = Util.get_first_date_of_yesterday()
28
+        one_year_ago = Util.get_max_date_of_one_year_ago(first_day)
29
+        # average rating of each station over the most recent year
30
+        station_dict = {}
31
+        sql = """
32
+            select channel, theater_attribute, avg(audience_rating) as avg_rate from odl.ad_television
33
+            where tv_date > '%s' and tv_date <= '%s'
34
+            group by channel, theater_attribute
35
+        """
36
+        sql = sql % (one_year_ago, first_day)
37
+        rows = Mysql.getAll(sql, conn=conn)
38
+        data_list = []
39
+        sql = """
40
+            insert into tmp.channel_avg_ratings (channel, theater_attribute, value) values (%s, %s, %s)
41
+        """
42
+        for row in rows:
43
+            channel = row['channel']
44
+            theater_attribute = row['theater_attribute']
45
+            avg_rate = row['avg_rate']
46
+            data_list.append((channel, theater_attribute, avg_rate))
47
+        Mysql.insertMany(sql, data_list, conn)
48
+        Mysql.close(conn)
49
+
50
+    # provincial terrestrial channels
51
+    def area(self):
52
+        conn = Mysql.createOfflineConn()
53
+        first_day = Util.get_first_date_of_yesterday()
54
+        one_year_ago = Util.get_max_date_of_one_year_ago(first_day)
55
+        # average rating of each station over the most recent year
56
+        station_dict = {}
57
+        sql = """
58
+            select channel, theater_attribute, avg(audience_rating) as avg_rate from odl.area_ad_television
59
+            where tv_date > '%s' and tv_date <= '%s'
60
+            group by channel, theater_attribute
61
+        """
62
+        sql = sql % (one_year_ago, first_day)
63
+        rows = Mysql.getAll(sql, conn=conn)
64
+        data_list = []
65
+        sql = """
66
+            insert into tmp.area_channel_avg_ratings (channel, theater_attribute, value) values (%s, %s, %s)
67
+        """
68
+        for row in rows:
69
+            channel = row['channel']
70
+            theater_attribute = row['theater_attribute']
71
+            avg_rate = row['avg_rate']
72
+            data_list.append((channel, theater_attribute, avg_rate))
73
+        Mysql.insertMany(sql, data_list, conn)
74
+        Mysql.close(conn)
75
+
76
+if __name__ == '__main__':
77
+    if len(sys.argv) != 2:
78
+        print 'no argument supplied, exiting'
79
+        sys.exit(0)
80
+    print 'method name is ' + sys.argv[1]
81
+    obj = channel_avg_ratings()
82
+    try:
83
+        getattr(obj, sys.argv[1])()
84
+    except Exception, e:
85
+        print e

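The window in tmp_year_channel_avg_ratings_stat_by_tv.py is bounded by Util.get_first_date_of_yesterday() and Util.get_max_date_of_one_year_ago() from fty_util.common, whose implementations are not part of this listing. A rough stand-in, under the assumption that they return "yesterday" and the date roughly one year before it as YYYY-MM-DD strings:

import datetime

def window_bounds(today=None):
    # assumption: the bounds are plain date strings usable in the tv_date filter above
    today = today or datetime.date.today()
    first_day = today - datetime.timedelta(days=1)            # yesterday
    one_year_ago = first_day - datetime.timedelta(days=365)   # roughly one year earlier
    return one_year_ago.strftime('%Y-%m-%d'), first_day.strftime('%Y-%m-%d')

# usage:
# one_year_ago, first_day = window_bounds()
# sql = sql % (one_year_ago, first_day)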
+ 92 - 0
task_yxb/ad_tv_lib_clean.py

@@ -0,0 +1,92 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+spec_char = ['栏目剧', '题材', '电视剧', '连续剧', '剧情', '。', ';', ';', ',', ',', '、', '/', ':', ':', '\\', '[1]', '[2]', '[3]', '[4]', '[5]', '[6]', '[7]', '[8]', '[9]', '[10]']
5
+
6
+import sys
7
+from fty_util.common import Mysql, Util
8
+
9
+reload(sys)
10
+sys.setdefaultencoding('utf8')
11
+
12
+conn = Mysql.createOfflineConn()
13
+# fetch rows whose categories field is empty or null
14
+sql = """
15
+    select id, director, scriptwritter, main_actors, types, areas, plat_form, pub_comp, online_form, production from yxb.ad_tv_lib where categories = '' or categories is null order by id asc
16
+"""
17
+rows = Mysql.getAll(sql, conn=conn)
18
+for row in rows:
19
+    _id = row['id']
20
+    director = row['director']
21
+    scriptwritter = row['scriptwritter']
22
+    main_actors = row['main_actors']
23
+    types = row['types']
24
+    areas = row['areas']
25
+    plat_form = row['plat_form']
26
+    pub_comp = row['pub_comp']
27
+    online_form = row['online_form']
28
+    production = row['production']
29
+
30
+    if director is not None and len(director) > 0:
31
+        for char in spec_char:
32
+            director = director.replace(char, " ")
33
+
34
+    if scriptwritter is not None and len(scriptwritter) > 0:
35
+        for char in spec_char:
36
+            scriptwritter = scriptwritter.replace(char, " ")
37
+
38
+    if main_actors is not None and len(main_actors) > 0:
39
+        for char in spec_char:
40
+            main_actors = main_actors.replace(char, " ")
41
+
42
+    if areas is not None and len(areas) > 0:
43
+        for char in spec_char:
44
+            areas = areas.replace(char, " ")
45
+
46
+    if plat_form is not None and len(plat_form) > 0:
47
+        for char in spec_char:
48
+            plat_form = plat_form.replace(char, " ")
49
+        
50
+    if pub_comp is not None and len(pub_comp) > 0:
51
+        for char in spec_char:
52
+            pub_comp = pub_comp.replace(char, " ")
53
+
54
+    if online_form is not None and len(online_form) > 0:
55
+        for char in spec_char:
56
+            online_form = online_form.replace(char, " ")
57
+
58
+    if production is not None and len(production) > 0:
59
+        for char in spec_char:
60
+            production = production.replace(char, " ")
61
+
62
+    if types is not None and len(types) > 0:
63
+        for char in spec_char:
64
+            types = types.replace(char, " ")
65
+            types = types.replace("  ", " ")
66
+
67
+    sql = """
68
+        update yxb.ad_tv_lib set director = '%s', scriptwritter = '%s', main_actors = '%s', types = '%s', areas = '%s', plat_form = '%s', pub_comp = '%s', online_form = '%s', production = '%s' where id = '%s'
69
+    """
70
+    sql = sql % (director, scriptwritter, main_actors, types, areas, plat_form, pub_comp, online_form, production, _id)
71
+    Mysql.execute(sql, conn=conn)
72
+
73
+
74
+# update the type field of scrapy.types_analyse
75
+
76
+
77
+Mysql.close(conn)
78
+# type_set = set()
79
+# for row in rows:
80
+#     _id = row['id']
81
+#     types = row['types']
82
+#     if types is not None and len(types) > 0:
83
+#         for char in spec_char:
84
+#             types = types.replace(char, " ")
85
+        
86
+#         for _type in types.split(" "):
87
+#             type_set.add(_type.strip())
88
+
89
+# for _type in type_set:
90
+#     print _type
91
+
92
+# Mysql.close(conn)

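The per-row UPDATE in ad_tv_lib_clean.py interpolates the cleaned values straight into the SQL string, so any field that still contains a single quote breaks the statement. A sketch of the same update using driver-side placeholders instead; it assumes Mysql.insertMany forwards its parameter tuples to the driver's executemany, which is how it is used elsewhere in these scripts:

from fty_util.common import Mysql

def update_row(conn, row_values, _id):
    # row_values: the nine cleaned fields, in the same order as the UPDATE below
    sql = """
        update yxb.ad_tv_lib
        set director = %s, scriptwritter = %s, main_actors = %s, types = %s, areas = %s,
            plat_form = %s, pub_comp = %s, online_form = %s, production = %s
        where id = %s
    """
    # one-element batch so the MySQL driver escapes every value (quotes included)
    Mysql.insertMany(sql, [tuple(row_values) + (_id,)], conn)

# usage inside the cleaning loop above:
# update_row(conn, (director, scriptwritter, main_actors, types, areas,
#                   plat_form, pub_comp, online_form, production), _id)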
+ 181 - 0
tmp_ad_tv_sr_stat.py

@@ -0,0 +1,181 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#purpose: compute the base statistics for TV drama rating indices
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import time
9
+import copy
10
+import datetime
11
+import numpy as np
12
+from fty_util.common import Mysql
13
+
14
+tv_data = {}
15
+tv_rate = {}
16
+tv_avg_sr = {}
17
+tv_station = {}
18
+tv_play = {}
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+sql = "select tv_name,channel,audience_rating,tv_date from odl.ad_television where theater_attribute='黄金剧场'"
23
+data = Mysql.selectAll(sql, conn=conn)
24
+
25
+sql_tv = "select tv_id,tv_name,director,scriptwriter,main_actors,filmer,first_type,show_time from odl.ad_tv_lib where is_use=1"
26
+tmp_data = Mysql.selectAll(sql_tv, conn=conn)
27
+for i in range(len(tmp_data)):
28
+	tv_id = int(tmp_data[i][0])
29
+	tv_name = tmp_data[i][1]
30
+	director = tmp_data[i][2] if tmp_data[i][2] else ''
31
+	scriptwriter = tmp_data[i][3] if tmp_data[i][3] else ''
32
+	actors = tmp_data[i][4] if tmp_data[i][4] else ''
33
+	filmer = tmp_data[i][5] if tmp_data[i][5] else ''
34
+	type1 = tmp_data[i][6] if tmp_data[i][6] else ''
35
+	tv_data[(tv_id,tv_name)] = [director,scriptwriter,actors,filmer,type1]
36
+	tv_play[tv_name] = tmp_data[i][7]
37
+
38
+
39
+for i in range(len(data)):
40
+	tv_name = data[i][0]
41
+	channel = data[i][1]
42
+	aud_rating = float(data[i][2])
43
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
44
+	year = data[i][3].year
45
+	if aud_rating and tv_play.has_key(tv_name): #only keep dramas that exist in ad_tv_lib
46
+		show_time = tv_play[tv_name] if tv_play[tv_name] else str(year)
47
+
48
+		if str(year) in show_time:
49
+			tv_station.setdefault(channel,{})
50
+			tv_station[channel].setdefault(tv_date,[])
51
+			tv_station[channel][tv_date].append(aud_rating)
52
+
53
+			tv_rate.setdefault(tv_name,{})
54
+			if not tv_rate.get(tv_name):
55
+				tv_rate[tv_name].setdefault(year,{})
56
+			else:
57
+				yy = tv_rate[tv_name].keys()[0]
58
+				if year < yy:
59
+					del tv_rate[tv_name][yy]
60
+					tv_rate[tv_name][year] = {}
61
+			if tv_rate[tv_name].has_key(year):
62
+				tv_rate[tv_name][year].setdefault(channel,['9999',[]])
63
+				dd = tv_rate[tv_name][year][channel][0]
64
+				if tv_date < dd:
65
+					tv_rate[tv_name][year][channel][0] = tv_date
66
+				tv_rate[tv_name][year][channel][1].append(aud_rating)
67
+
68
+for channel,value in tv_station.items():
69
+	for tv_date in value:
70
+		tmp_arr = value[tv_date]
71
+		avg_rating = sum(tmp_arr)/len(tmp_arr)
72
+		tv_station[channel][tv_date] = avg_rating
73
+
74
+
75
+def avg_rate(Date,obj,channel): #Date:'2014-05', obj:tv_station, channel: station name
76
+	'''Average rating of the station over the year ending at Date'''
77
+	array = []
78
+	tmp = Date.split('-')
79
+	if int(tmp[1])==1:
80
+		month = '12'
81
+	elif 1<int(tmp[1])<=10:
82
+		month = '0'+str(int(tmp[1])-1)
83
+	else:
84
+		month = str(int(tmp[1])-1)
85
+	Date2 = str(int(tmp[0])-1)+'-'+month
86
+	tmp_obj = obj[channel]
87
+	for tv_date in tmp_obj:
88
+		if Date2<=tv_date<=Date:
89
+			array.append(tmp_obj[tv_date])
90
+	if not len(array):
91
+		print Date,channel,obj[channel]
92
+	res = sum(array)/len(array)
93
+	return res
94
+
95
+#tv_rate_new = copy.deepcopy(tv_rate)
96
+for tv_name in tv_rate:
97
+	year = tv_rate[tv_name].keys()[0]
98
+	tv_sr = []
99
+	for channel,value in tv_rate[tv_name][year].items():
100
+		tv_date = value[0]
101
+		tv_avg_rating = sum(value[1])/len(value[1])
102
+		tv_sr.append(tv_avg_rating/avg_rate(tv_date,tv_station,channel))
103
+	tv_avg_sr[tv_name] = (year,sum(tv_sr)/len(tv_sr))
104
+
105
+people_sr = [{},{},{},{},{}] #values of every variable in each dimension
106
+for id_name in tv_data:
107
+	tv_id = id_name[0]
108
+	tv_name = id_name[1]
109
+	people_arr = tv_data[id_name]
110
+	if tv_avg_sr.get(tv_name):
111
+		year,avg_sr = tv_avg_sr[tv_name]
112
+		for i in range(len(people_arr)):
113
+			if people_arr[i]:
114
+				p_arr = people_arr[i].split(u' ')
115
+				for peo in p_arr:
116
+					if peo:
117
+						people_sr[i].setdefault(peo,{})
118
+						people_sr[i][peo].setdefault(year,[])
119
+						people_sr[i][peo][year].append((tv_id,avg_sr))
120
+
121
+people_sr_new = copy.deepcopy(people_sr)
122
+people_sr_new2 = copy.deepcopy(people_sr)
123
+for i in range(len(people_sr)):
124
+	for peo in people_sr[i]:
125
+		peo_obj = people_sr[i][peo]
126
+		for year,value in people_sr[i][peo].items():
127
+			people_sr_new[i][peo][year] = []
128
+			people_sr_new2[i][peo][year] = []
129
+			for j in range(2010,year+1):
130
+				if peo_obj.has_key(j):
131
+					tmp_arr = [jj[1] for jj in peo_obj[j]]
132
+					people_sr_new[i][peo][year].extend(tmp_arr)
133
+					people_sr_new2[i][peo][year].extend(peo_obj[j])
134
+
135
+
136
+result_sr = []  #rating-index info for each drama
137
+for id_name in tv_data:
138
+	tv_id = id_name[0]
139
+	tv_name = id_name[1]
140
+	people_arr = tv_data[id_name]
141
+	if tv_avg_sr.get(tv_name):
142
+		year,avg_sr = tv_avg_sr[tv_name]
143
+		peo_arr = [tv_id,tv_name,avg_sr]
144
+		for i in range(len(people_arr)):
145
+			tmp_str = ''
146
+			p_arr = people_arr[i].split(u' ')
147
+			for peo in p_arr:
148
+				if people_sr_new[i].has_key(peo):
149
+					var = sum(people_sr_new[i][peo][year])/len(people_sr_new[i][peo][year])
150
+					tmp_str += peo + ":" + str(var) + ';'
151
+			peo_arr.append(tmp_str)
152
+		peo_arr.append(year)
153
+		result_sr.append(tuple(peo_arr))
154
+					
155
+result = []
156
+for i in range(len(people_sr_new2)):
157
+	people_obj = people_sr_new2[i]
158
+	for peo in people_obj:
159
+		for year,value in people_obj[peo].items():
160
+			str1 = str([arr[0] for arr in value])
161
+			str2 = str([arr[1] for arr in value])
162
+			result.append((peo,str1,str2,year,i+1))
163
+
164
+
165
+delete = 'delete from tmp.ad_tv_sr_pre_var'
166
+Mysql.execute(delete, conn=conn)
167
+
168
+sql = 'insert into tmp.ad_tv_sr_pre_var values(%s,%s,%s,%s,%s)'
169
+for i in range(int(len(result)/1000)+1):
170
+	tmp = result[i*1000:(i+1)*1000]
171
+	Mysql.insertMany(sql, tmp, conn=conn)
172
+
173
+delete = 'delete from tmp.ad_tv_sr'
174
+Mysql.execute(delete, conn=conn)
175
+
176
+sql_sr = 'insert into tmp.ad_tv_sr values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
177
+for i in range(int(len(result_sr)/1000)+1):
178
+	tmp = result_sr[i*1000:(i+1)*1000]
179
+	Mysql.insertMany(sql_sr, tmp, conn=conn)
180
+
181
+Mysql.close(conn)

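The quantity built in tmp_ad_tv_sr_stat.py is a per-drama rating index: for each channel that aired the drama in its first-run year, the drama's average rating is divided by that channel's average rating over the year before its first air month, and the ratios are averaged across channels. A compact sketch under the same dictionary shapes as tv_station and tv_rate above; the empty-window guard is an addition, since the original avg_rate prints a warning and then divides by zero:

def trailing_avg(tv_station, channel, month):
    # month like '2014-05'; window covers roughly the preceding year
    year, mm = month.split('-')
    start = '%d-%s' % (int(year) - 1, mm)
    window = [v for d, v in tv_station.get(channel, {}).items() if start <= d <= month]
    if not window:
        return None          # guard added; avg_rate above would raise ZeroDivisionError here
    return sum(window) / len(window)

def rating_index(channels, tv_station):
    # channels: {channel: [first_month, [ratings...]]} for one drama/year, as in tv_rate
    ratios = []
    for channel, (first_month, ratings) in channels.items():
        base = trailing_avg(tv_station, channel, first_month)
        if base:
            ratios.append((sum(ratings) / len(ratings)) / base)
    return sum(ratios) / len(ratios) if ratios else None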
+ 226 - 0
tv_outline_recom.py

@@ -0,0 +1,226 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#purpose: offline computation of similar dramas for each TV drama
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import jieba
9
+import numpy as np
10
+from fty_util.common import Mysql
11
+
12
+tv_tf = {}  #{id:[[{},{},..],..],...}
13
+idf = {}
14
+idf_aft = {}
15
+var_stat = [[],[],[],[],[],[],[]] #term statistics per dimension
16
+seq2id = {}
17
+weight = [5,2,1,1,1,1,2]
18
+tags = {} #tag vocabulary
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
23
+
24
+sql = "select %s from odl.ad_tv_lib where is_use=1" %(', '.join(dims))
25
+tv_data = Mysql.selectAll(sql, conn=conn)
26
+
27
+sql2 = 'select tag from odl.ad_type_lib'
28
+tmp = Mysql.selectAll(sql2, conn=conn)
29
+for word in tmp:
30
+	tags[word[0]] = 1
31
+
32
+def find_tag(sentence): #sentence is the drama's description text
33
+	seg = jieba.cut(sentence)
34
+	res = {}
35
+	for word in seg:
36
+		if tags.get(word):
37
+			res.setdefault(word,1)
38
+	return u' '.join(res.keys())
39
+
40
+
41
+for i in range(len(tv_data)):
42
+	tv_id = int(tv_data[i][0])
43
+	tv_data[i] = list(tv_data[i])
44
+	arr = tv_data[i][1:]
45
+	tmp = []  #all keywords of this drama
47
+	word_count = {} #keyword counts for this drama
48
+	dim_tmp = [] #per-dimension keyword stats for this drama [[{},{}..],..]
48
+	tv_tf.setdefault(tv_id,[])
49
+	seq2id[i] = tv_id
50
+	if not arr[1]: 
51
+		arr[1] = ''
52
+		tv_data[i][2] = ''
53
+	else: 
54
+		arr[1] = find_tag(arr[1])
55
+		tv_data[i][2] = arr[1]
56
+	for j in range(len(arr)):
57
+		obj = {}
58
+		if not arr[j]:
59
+			wd = u''
60
+		else:
61
+			wd = arr[j]
62
+		words = wd.split(u' ')
63
+		#print words
64
+		for word in words:
65
+			if word:
66
+				obj[word] = 1
67
+				word_count.setdefault(word,0)
68
+				word_count[word] += 1
69
+		dim_tmp.append(obj)
70
+		var_stat[j].extend(obj.keys())
71
+		tmp.extend(obj.keys())
72
+	n = len(tmp) #total number of keywords for this drama
73
+	for obj_j in dim_tmp:
74
+		for k in obj_j:
75
+			obj_j[k] = word_count[k]/n
76
+	tv_tf[tv_id] = dim_tmp
77
+	for word in list(set(tmp)):
78
+		idf.setdefault(word,0)
79
+		idf[word] += 1
80
+
81
+N = len(tv_tf)  #total number of dramas
82
+for key in idf:
83
+	idf_aft[key] = math.log10(N/idf[key])
84
+
85
+for i in range(len(var_stat)):
86
+	var_stat[i] = list(set(var_stat[i])) #deduplicate
87
+
88
+
89
+#compute the drama score matrix
90
+def tv_score(weight, tf, idf):
91
+	col = len(tf)
92
+	row = sum([len(v) for v in var_stat])
93
+	res = np.zeros((col, row))
94
+	score_arr = {}
95
+	for i in range(col):
96
+		tv_arr = tf[seq2id[i]]
97
+		mm = 0 #position of each word in the flattened vector
98
+		score_arr.setdefault(i,[])
99
+		for j in range(len(tv_arr)):
100
+			tmp2 = np.zeros(len(var_stat[j])) #vector for this dimension
101
+			if j>0: mm += len(var_stat[j-1]) 
102
+			for word,value in tv_arr[j].items():
103
+				score = weight[j]*value*idf[word]
104
+				ll = var_stat[j].index(word)
105
+				nn = ll + mm
106
+				res[i,nn] = score
107
+				tmp2[ll] = score
108
+			score_arr[i].append(tmp2)
109
+	return res,score_arr
110
+
111
+def cos_distance(vec1, vec2):
112
+	v11 = vec1*vec1
113
+	v12 = vec1*vec2
114
+	v22 = vec2*vec2
115
+	mer = sum(v12[v12>0])
116
+	denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
117
+	if not denominator:
118
+		return 0
119
+	return mer/denominator
120
+
121
+def tv_sim(data): #data: drama score matrix, one row per drama
122
+	n,m = data.shape
123
+	res = np.zeros((n,n))
124
+	result = []
125
+	x = range(1,n+1)
126
+	for i in range(n):
127
+		res[i,i] = 1
128
+		for j in range(i+1,n):
129
+			res[i,j] = cos_distance(data[i,],data[j,])
130
+			res[j,i] =  res[i,j]
131
+		index_arr = np.argsort(-res[i,])
132
+		sort_arr = res[i,][index_arr]
133
+		id_arr = np.array([seq2id[i] for i in index_arr])
134
+		tmp = zip(id_arr,sort_arr)
135
+		result.append(dict(enumerate(tmp[0:100])))
136
+	return result
137
+
138
+dat,score_mat = tv_score(weight,tv_tf,idf_aft)
139
+res_sim = tv_sim(dat)
140
+
141
+
142
+#save the results and intermediate data to the database
143
+'''
144
+sql = 'delete from idl.ad_tv_cos'
145
+cursor.execute(sql)
146
+db.commit()
147
+
148
+vv = []
149
+for i in range(len(res_sim)):
150
+	sim_arr = []
151
+	for key,value in res_sim[i].items():
152
+		sim_arr.append(value[0])
153
+	vv.append((seq2id[i],str(res_sim[i]),str(sim_arr)))
154
+
155
+sql = 'insert into idl.ad_tv_cos values (%s,%s,%s)'
156
+
157
+for i in range(int(len(vv)/1000)+1):
158
+	tmp = vv[i*1000:(i+1)*1000]
159
+	cursor.executemany(sql,tmp)
160
+db.commit()
161
+'''
162
+
163
+delete = 'delete from tmp.ad_tv_recom_idf'
164
+Mysql.execute(delete, conn=conn)
165
+
166
+tmp_ll = list(idf.items())
167
+vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],N) for i in range(len(tmp_ll))]
168
+sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
169
+Mysql.insertMany(sql, vv, conn=conn)
170
+
171
+
172
+delete = 'delete from tmp.ad_tv_recom_tf'
173
+Mysql.execute(delete, conn=conn)
174
+
175
+vv = []
176
+for key,tv_arr in tv_tf.items():
177
+	tmp = []
178
+	tmp.append(int(key))
179
+	for tv_obj in tv_arr:
180
+		ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
181
+		tmp.append(ss)
182
+	vv.append(tuple(tmp))
183
+sql = 'insert into tmp.ad_tv_recom_tf values(%s,%s,%s,%s,%s,%s,%s,%s)'
184
+Mysql.insertMany(sql, vv, conn=conn)
185
+
186
+delete = 'delete from tmp.ad_tv_recom_var_stat'
187
+Mysql.execute(delete, conn=conn)
188
+
189
+dim_arr = [tuple(','.join(tmp_arr) for tmp_arr in var_stat)]
190
+sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
191
+Mysql.insertMany(sql, dim_arr, conn=conn)
192
+
193
+Mysql.close(conn)
194
+
195
+
196
+#save the results to a local file
197
+f1 = open('ad_tv_recom_score_matrix.txt','w')
198
+f1.write('id\ttypes\ttags\tdirector\tmain_actors\tscriptwriter\tfilmer\tdecade\n')
199
+for i in range(dat.shape[0]):
200
+	ss = str(seq2id[i])
201
+	for tt in score_mat[i]:
202
+		ss += '\t'+','.join([str(i)+':'+str(tt[i]) for i in np.nonzero(tt)[0]])
203
+	f1.write(ss+'\n')
204
+
205
+f1.close()
206
+'''
207
+def en2str(word):
208
+	return word.encode('utf-8')
209
+
210
+f2 = open('ad_tv_recom_var_stat.txt','w')
211
+f2.write('type\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
212
+ss = '\t'.join([','.join(map(en2str,tmp_arr)) for tmp_arr in var_stat])
213
+f2.write(ss+'\n')
214
+
215
+f2.close()
216
+
217
+f3 = open('data/tv_outline_cos1.txt','w')
218
+for i in range(len(res_sim)):
219
+	sim_arr = []
220
+	for key,value in res_sim[i].items():
221
+		sim_arr.append(value[0])
222
+	f3.write(str(seq2id[i])+'\t'+str(res_sim[i])+'\t'+str(sim_arr)+'\n')
223
+
224
+f1.close()
225
+f3.close()
226
+'''

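tv_outline_recom.py builds a weighted TF-IDF vector per drama and compares rows pairwise. Note that its cos_distance divides by the sum of the two vector norms rather than their product, so it is a variant of cosine similarity rather than the textbook formula. A short sketch of the standard vectorised cosine over the same score matrix dat, offered as a point of comparison only:

import numpy as np

def cosine_sim_matrix(dat):
    # dat: n_dramas x n_terms score matrix, as returned by tv_score above
    norms = np.sqrt((dat * dat).sum(axis=1))
    norms[norms == 0] = 1.0                 # guard empty rows against division by zero
    unit = dat / norms[:, None]
    return unit.dot(unit.T)                 # n x n matrix of cosine similarities

# usage:
# sim = cosine_sim_matrix(dat)
# top100_ids = [seq2id[j] for j in np.argsort(-sim[i])[:100]]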
+ 228 - 0
tv_real_recom_fix.py

@@ -0,0 +1,228 @@
1
+#encoding=utf-8
2
+
3
+from __future__ import division
4
+import re
5
+import sys
6
+import time
7
+import math
8
+import jieba
9
+import datetime
10
+import numpy as np 
11
+from fty_util.common import Mysql
12
+
13
+start = time.time()
14
+tf = {} #{id:[{},{},..],...}
15
+idf_bre = {}
16
+idf_aft = {}
17
+tv_data = []  #variable data of the new drama
18
+score_mat = {} #drama score matrix {id:[{},{}...],...}
19
+tags = {} #tag vocabulary
20
+weight = [5,2,1,1,1,1,2]
21
+
22
+if len(sys.argv) > 1:
23
+	tv_id = int(sys.argv[1])
24
+else:
25
+	print 'please supply a tv drama id'
26
+	sys.exit()
27
+
28
+conn = Mysql.createOfflineConn()
29
+
30
+sql1 = 'select * from tmp.ad_tv_recom_idf'
31
+tmp = Mysql.selectAll(sql1, conn=conn)
32
+tv_sum = tmp[0][3] #total number of historical dramas
33
+for i in range(len(tmp)):
34
+	arr = tmp[i]
35
+	idf_bre[arr[1]] = arr[2]
36
+
37
+sql2 = 'select * from tmp.ad_tv_recom_var_stat'
38
+tmp = Mysql.selectAll(sql2, conn=conn)
39
+var_stat = [word.split(',') for word in tmp[0]] #term statistics per dimension
40
+
41
+ff = open('ad_tv_recom_score_matrix.txt','r')
42
+title = '' #column header line of the file
43
+for line in ff.readlines():
44
+	arr = line.strip('\n').split('\t')
45
+	if arr[0] == 'id': 
46
+		title = line
47
+	else:
48
+		k = int(arr[0])
49
+		score_mat.setdefault(k,[])
50
+		for j in arr[1:]:
51
+			obj = {}
52
+			if len(j):
53
+				for ss in j.split(','):
54
+					tmp_arr = ss.split(':')
55
+					obj[int(tmp_arr[0])] = tmp_arr[1] 
56
+	 		score_mat[k].append(obj)
57
+ff.close()
58
+
59
+sql4 = 'select tag from odl.ad_type_lib'
60
+tmp = Mysql.selectAll(sql4, conn=conn)
61
+for word in tmp:
62
+	tags[word[0]] = 1
63
+
64
+dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
65
+
66
+sql = "select %s from odl.ad_tv_lib where tv_id=%d" %(', '.join(dims),tv_id)
67
+tv_data = Mysql.selectAll(sql, conn=conn)
68
+
69
+def find_tag(sentence): #sentence is the drama's description text
70
+	seg = jieba.cut(sentence)
71
+	res = {}
72
+	for word in seg:
73
+		if tags.get(word):
74
+			res.setdefault(word,1)
75
+	return u' '.join(res.keys())
76
+
77
+for i in range(len(tv_data)):
78
+	tv_data[i] = list(tv_data[i])
79
+	tv_data[i][0] = int(tv_data[i][0])
80
+	key = tv_data[i][0]
81
+	arr = tv_data[i][1:]
82
+	tmp = []  #all keywords of this drama
83
+	dim_tmp = [] #per-dimension keyword stats for this drama [[{},{}..],..]
84
+	if key not in score_mat:
85
+		tv_sum += 1 #total drama count including the new one
86
+	if not arr[1]: 
87
+		arr[1] = ''
88
+		tv_data[i][2] = ''
89
+	else: 
90
+		arr[1] = find_tag(arr[1])
91
+		tv_data[i][2] = arr[1]
92
+	for j in range(len(arr)):
93
+		obj = {}
94
+		if not arr[j]: 
95
+			wd = u''
96
+		else:
97
+			wd = arr[j]
98
+		words = wd.split(u' ')
99
+		words = list(set(words))
100
+		if u'' in words:
101
+			words.remove(u'')
102
+		tmp.extend(words) 
103
+		for word in words:
104
+			obj.setdefault(word, 0)
105
+			obj[word] += 1
106
+		dim_tmp.append(obj)
107
+	n = len(tmp) #total number of keywords for this drama
108
+	for l in range(len(dim_tmp)):
109
+		obj_j = dim_tmp[l]
110
+		for k in obj_j:
111
+			if n: obj_j[k] /= n
112
+			else: obj_j[k] = 0
113
+			if k not in var_stat[l]: #check whether the new drama's keyword already exists in the historical vocabulary
114
+				var_stat[l].append(k)
115
+	tf[key] = dim_tmp
116
+	for ww in list(set(tmp)):
117
+		if not idf_bre.has_key(ww):
118
+			idf_bre[ww] = 1
119
+		else:
120
+			if key not in score_mat:
121
+				idf_bre[ww] += 1 
122
+
123
+for key in idf_bre:
124
+	idf_aft[key] = math.log10(tv_sum/idf_bre[key])
125
+
126
+
127
+#rebuild the score matrix for the historical dramas
128
+length = sum([len(v) for v in var_stat])
129
+for key in score_mat:
130
+	tmp_arr = score_mat[key]
131
+	tmp = np.zeros(length)
132
+	ll = 0
133
+	for i in range(len(var_stat)):
134
+		if i > 0: ll += len(var_stat[i-1])
135
+		mat = tmp_arr[i]
136
+		for k,v in mat.items():
137
+			tmp[ll+k] = v
138
+	score_mat[key] = tmp
139
+
140
+
141
+#compute the drama score matrix
142
+def tv_score(weight, tf, idf):
143
+	res = {}
144
+	row = sum([len(v) for v in var_stat])
145
+	for i in tf:
146
+		tv_arr = tf[i]
147
+		mm = 0 #position of each word in the flattened vector
148
+		res.setdefault(i,np.zeros(row))
149
+		for j in range(len(tv_arr)):
150
+			if j>0: mm += len(var_stat[j-1]) 
151
+			for word,value in tv_arr[j].items():
152
+				score = weight[j]*value*idf[word]
153
+				nn = var_stat[j].index(word) + mm
154
+				res[i][nn] = score
155
+	return res
156
+
157
+def cos_distance(vec1, vec2):
158
+	v11 = vec1*vec1
159
+	v12 = vec1*vec2
160
+	v22 = vec2*vec2
161
+	mer = sum(v12[v12>0])
162
+	denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
163
+	if not denominator:
164
+		return 0
165
+	return mer/denominator
166
+
167
+def tv_sim(tv_id,data): #tv_id:要计算的电视剧(1,2,3...),data:电视剧得分矩阵({1:[],2:[]})
168
+	res = []
169
+	vec1 = data[tv_id]
170
+	for key,tv_arr in data.items():
171
+		cos = cos_distance(vec1,tv_arr)
172
+		res.append([key,cos])
173
+	return dict(enumerate(sorted(res,key=lambda x:x[1],reverse=True)[0:400]))
174
+
175
+dat = tv_score(weight,tf,idf_aft)
176
+score_mat_new = dict(score_mat,**dat) #将新剧和老剧的得分合并
177
+
178
+
179
+#将结果和中间数据保存到数据库中
180
+for key in dat:
181
+	res = tv_sim(key, score_mat_new)
182
+	sim_arr = ','.join([str(i[0]) for i in res.values()])
183
+	sql = 'replace into idl.ad_tv_4sim_wmd values ("%d","%s")' %(key,sim_arr)
184
+	Mysql.execute(sql, conn=conn)
185
+
186
+
187
+vv = []
188
+for key,tv_arr in tf.items():
189
+	tmp = []
190
+	tmp.append(int(key))
191
+	for tv_obj in tv_arr:
192
+		ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
193
+		tmp.append(ss)
194
+	if key not in score_mat:
195
+		sql = 'replace into tmp.ad_tv_recom_tf values("%s","%s","%s","%s","%s","%s","%s","%s")' % tuple(tmp)
196
+		Mysql.execute(sql, conn=conn)
197
+
198
+
199
+delete = 'delete from tmp.ad_tv_recom_idf'
200
+Mysql.execute(delete, conn=conn)
201
+
202
+tmp_ll = list(idf_bre.items())
203
+vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],tv_sum) for i in range(len(tmp_ll))]
204
+sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
205
+Mysql.insertMany(sql, vv, conn=conn)
206
+
207
+
208
+delete = 'delete from tmp.ad_tv_recom_var_stat'
209
+Mysql.execute(delete, conn=conn)
210
+
211
+dim_arr = [(','.join(tmp_arr) for tmp_arr in var_stat)]
212
+sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
213
+Mysql.insertMany(sql, dim_arr, conn=conn)
214
+
215
+
216
+f1 = open('ad_tv_recom_score_matrix.txt','a')
217
+for tv_id,np_arr in dat.items():
218
+	nn = 0
219
+	if tv_id not in score_mat:
220
+		res = str(tv_id)
221
+		for arr in var_stat:
222
+			tmp = np_arr[nn:(nn+len(arr))]
223
+			nn +=  len(arr)
224
+			res += '\t' + ','.join([str(i)+':'+str(tmp[i]) for i in np.nonzero(tmp)[0]])
225
+		f1.write(res+'\n')	
226
+f1.close()
227
+
228
+Mysql.close(conn)
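Several scripts above persist their results in manual 1000-row slices (result[i*1000:(i+1)*1000]), while the earlier task files call Util.insert_by_chunk from fty_util.common for the same job. A generic sketch of that chunking pattern; insert_in_chunks is an illustrative helper, not the actual Util implementation:

from fty_util.common import Mysql

def insert_in_chunks(sql, rows, conn, chunk_size=1000):
    # issue one Mysql.insertMany call per slice of at most chunk_size rows
    for start in range(0, len(rows), chunk_size):
        batch = rows[start:start + chunk_size]
        if batch:
            Mysql.insertMany(sql, batch, conn=conn)

# usage, matching the result_sr insert in tmp_ad_tv_sr_stat.py:
# insert_in_chunks('insert into tmp.ad_tv_sr values(%s,%s,%s,%s,%s,%s,%s,%s,%s)', result_sr, conn)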