Commit f61a70b000 by yufeng, 5 years ago
76 changed files with 11277 additions and 60 deletions
  1. .gitignore (+61 -60)
  2. ad_tv_recom_score_matrix.txt (+2820 -0)
  3. bash_near_real_job.sh (+34 -0)
  4. config.cfg (+20 -0)
  5. dags/config.py (+4 -0)
  6. dags/daily_dag.py (+43 -0)
  7. dags/fty_operator.py (+205 -0)
  8. dags/once_dag.py (+65 -0)
  9. dags/realtime_dag.py (+35 -0)
  10. dags/recent_one_year_stat_dag.py (+48 -0)
  11. dags/subdags/idl_subdag.py (+70 -0)
  12. fty_util/__init__.py (+0 -0)
  13. fty_util/common.py (+370 -0)
  14. fty_util/config.py (+54 -0)
  15. idl_ad_pub_station_stats.py (+141 -0)
  16. idl_tv_sr_denoise.py (+131 -0)
  17. odl_near_realtime_calc.py (+98 -0)
  18. online_ad_tv_sr_pre.py (+179 -0)
  19. setup.py (+9 -0)
  20. shell/bash_daily.sh (+191 -0)
  21. shell/bash_daily_import.sh (+66 -0)
  22. shell/bash_job.sh (+55 -0)
  23. shell/bash_scrapy.sh (+43 -0)
  24. task_clean/odl_ad_tv_record_distribution_update_company_field.py (+39 -0)
  25. task_clean/odl_ad_tv_record_distribution_update_theme_field.py (+41 -0)
  26. task_clean/scrapy_category_clean.py (+47 -0)
  27. task_clean/scrapy_category_update.py (+63 -0)
  28. task_clean/scrapy_dianshiju_clean.py (+55 -0)
  29. task_clean/tv_category_relation.py (+39 -0)
  30. task_clean/update_date.py (+297 -0)
  31. task_clean/update_first_type.py (+53 -0)
  32. task_idl/idl_ad_tv_record_distribution.py (+32 -0)
  33. task_idl/idl_tv_article_marketing_count.py (+31 -0)
  34. task_idl/idl_tv_article_marketing_detail.py (+36 -0)
  35. task_idl/idl_tv_avg_ratings_stat.py (+50 -0)
  36. task_idl/idl_tv_category_stat.py (+52 -0)
  37. task_idl/idl_year_channel_avg_ratings_stat.py (+51 -0)
  38. task_odl/odl_ad_audience_cps_time.py (+56 -0)
  39. task_odl/odl_ad_audience_cps_time_incr_update.py (+57 -0)
  40. task_odl/odl_ad_television.py (+68 -0)
  41. task_odl/odl_ad_television_incr_update.py (+67 -0)
  42. task_odl/odl_ad_tv_lib.py (+65 -0)
  43. task_odl/odl_ad_tv_lib_insert.py (+35 -0)
  44. task_odl/odl_ad_tv_record_distribution.py (+279 -0)
  45. task_odl/odl_ad_tv_record_distribution_insert.py (+279 -0)
  46. task_odl/odl_area_ad_television.py (+70 -0)
  47. task_odl/odl_area_ad_television_incr_update.py (+67 -0)
  48. task_other/idl_rank_update.py (+157 -0)
  49. task_other/transform_categories.py (+96 -0)
  50. task_scrapy/i_t_dsj_all.py (+286 -0)
  51. task_scrapy/i_t_dsj_all_without_browser.py (+291 -0)
  52. task_scrapy/i_t_dsj_categories.py (+198 -0)
  53. task_scrapy/i_t_dsj_categories_without_browser.py (+203 -0)
  54. task_scrapy/scrapy_all.py (+100 -0)
  55. task_scrapy/scrapy_gongzhonghao_count.py (+143 -0)
  56. task_scrapy/scrapy_huashutv.py (+113 -0)
  57. task_scrapy/scrapy_iqiyi.py (+294 -0)
  58. task_scrapy/scrapy_kankan.py (+59 -0)
  59. task_scrapy/scrapy_leshi.py (+186 -0)
  60. task_scrapy/scrapy_pptv.py (+146 -0)
  61. task_scrapy/scrapy_sohu.py (+139 -0)
  62. task_scrapy/scrapy_tengxun.py (+231 -0)
  63. task_scrapy/scrapy_tianyancha.py (+97 -0)
  64. task_scrapy/scrapy_tv_unhandle.py (+83 -0)
  65. task_scrapy/scrapy_website_count.py (+146 -0)
  66. task_scrapy/scrapy_website_count_new.py (+206 -0)
  67. task_scrapy/scrapy_youku.py (+222 -0)
  68. task_tmp/tmp_data_month.py (+43 -0)
  69. task_tmp/tmp_tv_avg_ratings_fatt0.py (+116 -0)
  70. task_tmp/tmp_tv_avg_ratings_stat.py (+146 -0)
  71. task_tmp/tmp_tv_category_stat.py (+93 -0)
  72. task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py (+85 -0)
  73. task_yxb/ad_tv_lib_clean.py (+92 -0)
  74. tmp_ad_tv_sr_stat.py (+181 -0)
  75. tv_outline_recom.py (+226 -0)
  76. tv_real_recom_fix.py (+228 -0)

+ 61 - 60
.gitignore

@@ -1,60 +1,61 @@
1
-# ---> Python
2
-# Byte-compiled / optimized / DLL files
3
-__pycache__/
4
-*.py[cod]
5
-*$py.class
6
-
7
-# C extensions
8
-*.so
9
-
10
-# Distribution / packaging
11
-.Python
12
-env/
13
-build/
14
-develop-eggs/
15
-dist/
16
-downloads/
17
-eggs/
18
-.eggs/
19
-lib/
20
-lib64/
21
-parts/
22
-sdist/
23
-var/
24
-*.egg-info/
25
-.installed.cfg
26
-*.egg
27
-
28
-# PyInstaller
29
-#  Usually these files are written by a python script from a template
30
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
31
-*.manifest
32
-*.spec
33
-
34
-# Installer logs
35
-pip-log.txt
36
-pip-delete-this-directory.txt
37
-
38
-# Unit test / coverage reports
39
-htmlcov/
40
-.tox/
41
-.coverage
42
-.coverage.*
43
-.cache
44
-nosetests.xml
45
-coverage.xml
46
-*,cover
47
-
48
-# Translations
49
-*.mo
50
-*.pot
51
-
52
-# Django stuff:
53
-*.log
54
-
55
-# Sphinx documentation
56
-docs/_build/
57
-
58
-# PyBuilder
59
-target/
60
-
1
+# ---> Python
2
+# Byte-compiled / optimized / DLL files
3
+__pycache__/
4
+*.py[cod]
5
+*$py.class
6
+
7
+# C extensions
8
+*.so
9
+
10
+# Distribution / packaging
11
+.Python
12
+env/
13
+build/
14
+develop-eggs/
15
+dist/
16
+downloads/
17
+eggs/
18
+.eggs/
19
+lib/
20
+lib64/
21
+parts/
22
+sdist/
23
+var/
24
+*.egg-info/
25
+.installed.cfg
26
+*.egg
27
+
28
+# PyInstaller
29
+#  Usually these files are written by a python script from a template
30
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+*.manifest
32
+*.spec
33
+
34
+# Installer logs
35
+pip-log.txt
36
+pip-delete-this-directory.txt
37
+
38
+# Unit test / coverage reports
39
+htmlcov/
40
+.tox/
41
+.coverage
42
+.coverage.*
43
+.cache
44
+nosetests.xml
45
+coverage.xml
46
+*,cover
47
+
48
+# Translations
49
+*.mo
50
+*.pot
51
+
52
+# Django stuff:
53
+*.log
54
+
55
+# Sphinx documentation
56
+docs/_build/
57
+
58
+# PyBuilder
59
+target/
60
+
61
+.DS_Store

File diff suppressed because it is too large
+ 2820 - 0
ad_tv_recom_score_matrix.txt


+ 34 - 0
bash_near_real_job.sh

@@ -0,0 +1,34 @@
1
+#!/bin/bash
2
+
3
+# Check whether the script path is set; fall back to the default if not
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+############################### Tasks ################################
14
+# Similar-drama computation
15
+echo "tv_real_recom_fix"
16
+python ${HUOJU_FTY_PATH}tv_real_recom_fix.py $1
17
+if [ $? -ne 0 ];
18
+    then
19
+        content="tv_real_recom_fix"
20
+        echo $content
21
+    exit 1
22
+fi
23
+
24
+# TV drama ratings-index prediction
25
+echo "online_ad_tv_sr_pre"
26
+python ${HUOJU_FTY_PATH}online_ad_tv_sr_pre.py $1
27
+if [ $? -ne 0 ];
28
+    then
29
+        content="online_ad_tv_sr_pre"
30
+        echo $content
31
+    exit 1
32
+fi
33
+
34
+echo "Script finished"
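For reference, a minimal Python sketch of the same stop-on-first-failure behaviour this script implements (the script names and default path are the ones used above; the helper name and the use of subprocess are illustrative assumptions, not part of this commit):

import os
import subprocess
import sys

def run_near_real_job(tv_id, base_path='/root/py_script/'):
    # Mirror bash_near_real_job.sh: run both prediction scripts, stop at the first failure.
    for script in ('tv_real_recom_fix.py', 'online_ad_tv_sr_pre.py'):
        code = subprocess.call(['python', os.path.join(base_path, script), str(tv_id)])
        if code != 0:
            print('%s failed with exit code %d' % (script, code))
            return False
    return True

if __name__ == '__main__':
    sys.exit(0 if run_near_real_job(sys.argv[1]) else 1)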

+ 20 - 0
config.cfg

@@ -0,0 +1,20 @@
1
+[basic]
2
+tmp_path = /Users/wudancheng/huoju_fty_home/tmp_data/
3
+
4
+[online_config]
5
+host = 121.41.17.212
6
+user = root
7
+password = huojutech_yaozhi!23
8
+port = 3306
9
+
10
+[offline_config]
11
+host = 121.41.17.212
12
+user = root
13
+password = huojutech_yaozhi!23
14
+port = 3306
15
+
16
+[scrapy_config]
17
+host = 121.41.17.212
18
+user = root
19
+password = huojutech_yaozhi!23
20
+port = 3306
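All three sections share the same host and credentials and are read by fty_util/config.py later in this commit. A minimal sketch of reading one section with ConfigParser (the file location is the one hard-coded in fty_util/config.py; the variable names are illustrative):

import os
import ConfigParser  # configparser on Python 3

config = ConfigParser.ConfigParser()
config.read(os.path.expanduser('~') + '/huoju_fty_home/config.cfg')

# get() returns strings; getint() converts the port to an int.
host = config.get('online_config', 'host')
port = config.getint('online_config', 'port')
print('%s:%d' % (host, port))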

+ 4 - 0
dags/config.py

@@ -0,0 +1,4 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+SCRIPT_PATH = '/Users/wudancheng/huoju/code/huoju_fty/py_script'

+ 43 - 0
dags/daily_dag.py

@@ -0,0 +1,43 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'daily_dag'
23
+
24
+# Runs every day at 00:30
25
+dag = DAG(
26
+    dag_id=DAG_NAME,
27
+    default_args=default_args,
28
+    schedule_interval='30 0 * * *',
29
+)
30
+
31
+# Daily incremental update
32
+odl_ad_television_incr_update = BashOperator(
33
+    task_id='odl_ad_television_incr_update',
34
+    bash_command='cd ' + SCRIPT_PATH + '; python odl_ad_television_incr_update.py',
35
+    dag=dag,
36
+)
37
+
38
+# Daily rank update
39
+idl_rank_update = BashOperator(
40
+    task_id='idl_rank_update',
41
+    bash_command='cd ' + SCRIPT_PATH + '; python idl_rank_update.py',
42
+    dag=dag,
43
+)
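schedule_interval above is a plain cron expression: '30 0 * * *' fires at 00:30 every day. A quick way to sanity-check such expressions, assuming the third-party croniter package is available (it is not a dependency of this commit):

import datetime
from croniter import croniter

it = croniter('30 0 * * *', datetime.datetime(2017, 1, 1))
print(it.get_next(datetime.datetime))  # 2017-01-01 00:30:00
print(it.get_next(datetime.datetime))  # 2017-01-02 00:30:00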

+ 205 - 0
dags/fty_operator.py

@@ -0,0 +1,205 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'fty_operator'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='0 1 * * *',
28
+)
29
+
30
+# TV station ratings statistics
31
+tmp_ad_television_stat_task = BashOperator(
32
+    task_id='tmp_ad_television_stat',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_television_stat.py',
34
+    dag=dag,
35
+)
36
+
37
+# Monthly statistics
38
+tmp_ad_tv_station_mid_month_stat_task = BashOperator(
39
+    task_id='tmp_ad_tv_station_mid_month_stat',
40
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_month_stat.py',
41
+    dag=dag,
42
+)
43
+
44
+# Monthly counts, grouped by station, theater and type
45
+tmp_ad_tv_station_mid_quantity_stat_task = BashOperator(
46
+    task_id='tmp_ad_tv_station_mid_quantity_stat',
47
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_quantity_stat.py',
48
+    dag=dag,
49
+)
50
+
51
+# Per-platform average ratings over the last year
52
+tmp_ad_tv_station_mid_avg_ratings_stat_task = BashOperator(
53
+    task_id='tmp_ad_tv_station_mid_avg_ratings_stat',
54
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_avg_ratings_stat.py',
55
+    dag=dag,
56
+)
57
+
58
+# Per-platform denoised average ratings over the last year
59
+tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task = BashOperator(
60
+    task_id='tmp_ad_tv_station_mid_avg_ratings_denoising_stat',
61
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_avg_ratings_denoising_stat.py',
62
+    dag=dag,
63
+)
64
+
65
+# Monthly average ratings statistics
66
+tmp_ad_tv_station_mid_ratings_stat_task = BashOperator(
67
+    task_id='tmp_ad_tv_station_mid_ratings_stat',
68
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_stat.py',
69
+    dag=dag,
70
+)
71
+
72
+# Monthly average ratings-index statistics
73
+tmp_ad_tv_station_mid_ratings_index_stat_task = BashOperator(
74
+    task_id='tmp_ad_tv_station_mid_ratings_index_stat',
75
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_index_stat.py',
76
+    dag=dag,
77
+)
78
+tmp_ad_tv_station_mid_ratings_index_stat_task.set_upstream(tmp_ad_tv_station_mid_avg_ratings_stat_task)
79
+
80
+# Monthly denoised ratings statistics
81
+tmp_ad_tv_station_mid_ratings_denoising_stat_task = BashOperator(
82
+    task_id='tmp_ad_tv_station_mid_ratings_denoising_stat',
83
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_denoising_stat.py',
84
+    dag=dag,
85
+)
86
+
87
+# Monthly denoised ratings-index statistics
88
+tmp_ad_tv_station_mid_ratings_index_denoising_stat_task = BashOperator(
89
+    task_id='tmp_ad_tv_station_mid_ratings_index_denoising_stat',
90
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_ratings_index_denoising_stat.py',
91
+    dag=dag,
92
+)
93
+tmp_ad_tv_station_mid_ratings_index_denoising_stat_task.set_upstream(tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task)
94
+
95
+# Per-platform counts over the last year
96
+tmp_ad_tv_station_mid_type_stat_task = BashOperator(
97
+    task_id='tmp_ad_tv_station_mid_type_stat',
98
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_mid_type_stat.py',
99
+    dag=dag,
100
+)
101
+tmp_ad_tv_station_mid_type_stat_task.set_upstream(tmp_ad_tv_station_mid_quantity_stat_task)
102
+
103
+# Type hotness statistics
104
+tmp_tv_station_type_hot_task = BashOperator(
105
+    task_id='tmp_tv_station_type_hot',
106
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_hot.py',
107
+    dag=dag,
108
+)
109
+tmp_tv_station_type_hot_task.set_upstream(tmp_ad_tv_station_mid_type_stat_task)
110
+
111
+# Type trend statistics
112
+tmp_tv_station_type_trend_task = BashOperator(
113
+    task_id='tmp_tv_station_type_trend',
114
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_trend.py',
115
+    dag=dag,
116
+)
117
+tmp_tv_station_type_trend_task.set_upstream(tmp_ad_tv_station_mid_type_stat_task)
118
+
119
+# Type preference statistics
120
+tmp_tv_station_type_preference_task = BashOperator(
121
+    task_id='tmp_tv_station_type_preference',
122
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_type_preference.py',
123
+    dag=dag,
124
+)
125
+tmp_tv_station_type_preference_task.set_upstream(tmp_tv_station_type_hot_task)
126
+tmp_tv_station_type_preference_task.set_upstream(tmp_tv_station_type_trend_task)
127
+
128
+# Ratings-index preference statistics
129
+tmp_tv_station_ratings_index_preference_task = BashOperator(
130
+    task_id='tmp_tv_station_ratings_index_preference',
131
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_ratings_index_preference.py',
132
+    dag=dag,
133
+)
134
+tmp_tv_station_ratings_index_preference_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
135
+
136
+# Denoised ratings-index preference statistics
137
+tmp_tv_station_ratings_index_denoising_preference_task = BashOperator(
138
+    task_id='tmp_tv_station_ratings_index_denoising_preference',
139
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_ratings_index_denoising_preference.py',
140
+    dag=dag,
141
+)
142
+tmp_tv_station_ratings_index_denoising_preference_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
143
+
144
+# Ratings trend
145
+tmp_ratings_current_trending_task = BashOperator(
146
+    task_id='tmp_ratings_current_trending',
147
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ratings_current_trending.py',
148
+    dag=dag,
149
+)
150
+tmp_ratings_current_trending_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
151
+
152
+# Rank trend
153
+tmp_rank_trending_task = BashOperator(
154
+    task_id='tmp_rank_trending',
155
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_rank_trending.py',
156
+    dag=dag,
157
+)
158
+tmp_rank_trending_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
159
+
160
+# TV station recommendation
161
+tmp_tv_station_recommend_task = BashOperator(
162
+    task_id='tmp_tv_station_recommend',
163
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_recommend.py',
164
+    dag=dag,
165
+)
166
+tmp_tv_station_recommend_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
167
+tmp_tv_station_recommend_task.set_upstream(tmp_tv_station_type_preference_task)
168
+tmp_tv_station_recommend_task.set_upstream(tmp_tv_station_ratings_index_preference_task)
169
+
170
+# TV station recommendation (denoised)
171
+tmp_tv_station_recommend_denoising_task = BashOperator(
172
+    task_id='tmp_tv_station_recommend_denoising',
173
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_recommend_denoising.py',
174
+    dag=dag,
175
+)
176
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
177
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_tv_station_type_preference_task)
178
+tmp_tv_station_recommend_denoising_task.set_upstream(tmp_tv_station_ratings_index_denoising_preference_task)
179
+
180
+idl_task = SubDagOperator(
181
+    task_id='idl_task',
182
+    subdag=idl_subdag(DAG_NAME, 'idl_task', default_args),
183
+    default_args=default_args,
184
+    dag=dag,
185
+)
186
+
187
+# Set downstream dependencies on the monthly stat task
188
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_quantity_stat_task)
189
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_avg_ratings_stat_task)
190
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_avg_ratings_denoising_stat_task)
191
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_stat_task)
192
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
193
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_denoising_stat_task)
194
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_ratings_index_denoising_stat_task)
195
+tmp_ad_tv_station_mid_month_stat_task.set_downstream(tmp_ad_tv_station_mid_type_stat_task)
196
+
197
+
198
+idl_task.set_upstream(tmp_ad_tv_station_mid_ratings_stat_task)
199
+idl_task.set_upstream(tmp_ad_tv_station_mid_ratings_index_stat_task)
200
+
201
+tmp_tv_station_rank_task = BashOperator(
202
+    task_id='tmp_tv_station_rank',
203
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_tv_station_rank.py',
204
+    dag=dag,
205
+)
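The task graph above is declared edge by edge with set_upstream/set_downstream. In recent Airflow releases the same edges can also be written with the bit-shift operators, which reads closer to the graph; a sketch using tasks defined above (equivalent wiring, not an additional dependency):

# upstream >> downstream is the same as downstream.set_upstream(upstream)
tmp_ad_tv_station_mid_month_stat_task >> tmp_ad_tv_station_mid_quantity_stat_task
tmp_ad_tv_station_mid_quantity_stat_task >> tmp_ad_tv_station_mid_type_stat_task
tmp_ad_tv_station_mid_type_stat_task >> tmp_tv_station_type_hot_task
tmp_ad_tv_station_mid_type_stat_task >> tmp_tv_station_type_trend_task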

+ 65 - 0
dags/once_dag.py

@@ -0,0 +1,65 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'once_dag'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='@once',
28
+)
29
+
30
+# Only needs to run once, during initial setup
31
+once_history_ad_tv_station_mid_avg_ratings_denoising_stat = BashOperator(
32
+    task_id='once_history_ad_tv_station_mid_avg_ratings_denoising_stat',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_avg_ratings_denoising_stat.py',
34
+    dag=dag,
35
+)
36
+
37
+once_history_ad_tv_station_mid_avg_ratings_stat = BashOperator(
38
+    task_id='once_history_ad_tv_station_mid_avg_ratings_stat',
39
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_avg_ratings_stat.py',
40
+    dag=dag,
41
+)
42
+
43
+once_history_ad_tv_station_mid_ratings_denoising_stat = BashOperator(
44
+    task_id='once_history_ad_tv_station_mid_ratings_denoising_stat',
45
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_denoising_stat.py',
46
+    dag=dag,
47
+)
48
+
49
+once_history_ad_tv_station_mid_ratings_index_denoising_stat = BashOperator(
50
+    task_id='once_history_ad_tv_station_mid_ratings_index_denoising_stat',
51
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_index_denoising_stat.py',
52
+    dag=dag,
53
+)
54
+
55
+once_history_ad_tv_station_mid_ratings_index_stat = BashOperator(
56
+    task_id='once_history_ad_tv_station_mid_ratings_index_stat',
57
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_index_stat.py',
58
+    dag=dag,
59
+)
60
+
61
+once_history_ad_tv_station_mid_ratings_stat = BashOperator(
62
+    task_id='once_history_ad_tv_station_mid_ratings_stat',
63
+    bash_command='cd ' + SCRIPT_PATH + '; python once_history_ad_tv_station_mid_ratings_stat.py',
64
+    dag=dag,
65
+)

+ 35 - 0
dags/realtime_dag.py

@@ -0,0 +1,35 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'realtime_dag'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='@once',
28
+)
29
+
30
+# Near-real-time computation (long-running listener)
31
+odl_near_realtime_calc = BashOperator(
32
+    task_id='odl_near_realtime_calc',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python odl_near_realtime_calc.py > /root/py_script_logs/realtime.log 2>&1 &',
34
+    dag=dag,
35
+)

+ 48 - 0
dags/recent_one_year_stat_dag.py

@@ -0,0 +1,48 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+from airflow.operators.subdag_operator import SubDagOperator
7
+from subdags.idl_subdag import idl_subdag
8
+import datetime
9
+from config import *
10
+
11
+default_args = {
12
+    'owner' : 'wdc',
13
+    'depends_on_past' : False,
14
+    'start_date' : datetime.datetime(2017, 01, 01),
15
+    'email' : ['wdc@huojutech.com'],
16
+    'email_on_failure' : False,
17
+    'email_on_retry': False,
18
+    # 'retries' : 3,
19
+    # 'retry_delay': datetime.timedelta(minutes=1),
20
+}
21
+
22
+DAG_NAME = 'recent_one_year_stat_dags'
23
+
24
+dag = DAG(
25
+    dag_id=DAG_NAME,
26
+    default_args=default_args,
27
+    schedule_interval='40 0 * * *',
28
+)
29
+
30
+# Split out the last year of data
31
+tmp_recent_year_ad_television_data = BashOperator(
32
+    task_id='tmp_recent_year_ad_television_data',
33
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_recent_year_ad_television_data.py',
34
+    dag=dag,
35
+)
36
+
37
+# Average ratings and ratings-index statistics for the last year
38
+tmp_ad_tv_station_stat = BashOperator(
39
+    task_id='tmp_ad_tv_station_stat',
40
+    bash_command='cd ' + SCRIPT_PATH + '; python tmp_ad_tv_station_stat.py',
41
+    dag=dag,
42
+)
43
+
44
+idl_ad_tv_station_stat = BashOperator(
45
+    task_id='idl_ad_tv_station_stat',
46
+    bash_command='cd ' + SCRIPT_PATH + '; python idl_ad_tv_station_stat.py',
47
+    dag=dag,
48
+)

+ 70 - 0
dags/subdags/idl_subdag.py

@@ -0,0 +1,70 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from airflow.models import DAG
5
+from airflow.operators.bash_operator import BashOperator
6
+
7
+def idl_subdag(parent_dag_name, child_dag_name, args):
8
+    idl_subdag = DAG(
9
+        dag_id='%s.%s' % (parent_dag_name, child_dag_name),
10
+        default_args=args,
11
+        schedule_interval='@daily',
12
+    )
13
+
14
+    # Ratings trend
15
+    BashOperator(
16
+        task_id='idl_trending-task',
17
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_trending.py',
18
+        default_args=args,
19
+        dag=idl_subdag,
20
+    )
21
+
22
+    # Type statistics
23
+    BashOperator(
24
+        task_id='idl_tv_station_type_stat-task',
25
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_type_stat.py',
26
+        default_args=args,
27
+        dag=idl_subdag,
28
+    )
29
+
30
+    # Ratings-index statistics
31
+    BashOperator(
32
+        task_id='idl_tv_station_ratings_index_stat-task',
33
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_ratings_index_stat.py',
34
+        default_args=args,
35
+        dag=idl_subdag,
36
+    )
37
+
38
+    # TV station recommendation
39
+    BashOperator(
40
+        task_id='idl_tv_station_recommend-task',
41
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_recommend.py',
42
+        default_args=args,
43
+        dag=idl_subdag,
44
+    )
45
+
46
+    # TV station ratings statistics
47
+    BashOperator(
48
+        task_id='idl_ad_television_stat-task',
49
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_ad_television_stat.py',
50
+        default_args=args,
51
+        dag=idl_subdag,
52
+    )
53
+
54
+    # Registration and distribution data
55
+    BashOperator(
56
+        task_id='idl_ad_tv_record_distribution-task',
57
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_ad_tv_record_distribution.py',
58
+        default_args=args,
59
+        dag=idl_subdag,
60
+    )
61
+
62
+    # idl_tv_station_rank
63
+    BashOperator(
64
+        task_id='idl_tv_station_rank-task',
65
+        bash_command='cd /Users/wudancheng/huoju/code/huoju_fty/py_script; python idl_tv_station_rank.py',
66
+        default_args=args,
67
+        dag=idl_subdag,
68
+    )
69
+    
70
+    return idl_subdag
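The returned DAG's dag_id has to follow the '<parent_dag_name>.<child_dag_name>' pattern so that SubDagOperator can associate it with its task. fty_operator.py above consumes the factory like this (usage sketch only, reusing its dag and default_args):

from airflow.operators.subdag_operator import SubDagOperator
from subdags.idl_subdag import idl_subdag

idl_task = SubDagOperator(
    task_id='idl_task',  # must match the child_dag_name passed to the factory
    subdag=idl_subdag('fty_operator', 'idl_task', default_args),
    default_args=default_args,
    dag=dag,
)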

+ 0 - 0
fty_util/__init__.py


+ 370 - 0
fty_util/common.py

@@ -0,0 +1,370 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+from mysql.connector.connection import MySQLConnection
5
+import commands
6
+import datetime
7
+import calendar
8
+from fty_util.config import APP_CFG
9
+
10
+class Mysql(object):
11
+
12
+    def __init__(self):
13
+        pass
14
+    
15
+    @staticmethod
16
+    def createOnlineConn():
17
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
18
+        config = {
19
+            'user': APP_CFG.ONLINE_CONFIG_USER,
20
+            'password': APP_CFG.ONLINE_CONFIG_PASSWORD,
21
+            'host': APP_CFG.ONLINE_CONFIG_HOST,
22
+            'port': APP_CFG.ONLINE_CONFIG_PORT
23
+        }
24
+        # config = {
25
+        #     'user': 'root',
26
+        #     'password': 'huojutech_yaozhi!23',
27
+        #     'host': '121.41.17.212',
28
+        #     # 'database': 'yxb',
29
+        #     'port': 3306
30
+        # }
31
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
32
+        cnx = MySQLConnection()
33
+        try:
34
+            cnx.connect(**config)
35
+        except Exception, e:
36
+            print e
37
+            cnx.reconnect(attempts=3, delay=0)
38
+        return cnx
39
+
40
+    @staticmethod
41
+    def createOfflineConn():
42
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
43
+        config = {
44
+            'user': APP_CFG.OFFLINE_CONFIG_USER,
45
+            'password': APP_CFG.OFFLINE_CONFIG_PASSWORD,
46
+            'host': APP_CFG.OFFLINE_CONFIG_HOST,
47
+            'port': APP_CFG.OFFLINE_CONFIG_PORT
48
+        }
49
+        # config = {
50
+        #     'user': 'root',
51
+        #     'password': 'huojutech_yaozhi!23',
52
+        #     'host': '121.41.17.212',
53
+        #     # 'database': 'yxb',
54
+        #     'port': 3306
55
+        # }
56
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
57
+        cnx = MySQLConnection()
58
+        try:
59
+            cnx.connect(**config)
60
+        except Exception, e:
61
+            print e
62
+            cnx.reconnect(attempts=3, delay=0)
63
+        return cnx
64
+
65
+    @staticmethod
66
+    def createScrapyConn():
67
+        # To use connection pooling, pass one extra argument, e.g. pool_size=10 or pool_name
68
+        config = {
69
+            'user': APP_CFG.SCRAPY_CONFIG_USER,
70
+            'password': APP_CFG.SCRAPY_CONFIG_PASSWORD,
71
+            'host': APP_CFG.SCRAPY_CONFIG_HOST,
72
+            'port': APP_CFG.SCRAPY_CONFIG_PORT
73
+        }
74
+        # config = {
75
+        #     'user': 'root',
76
+        #     'password': 'huojutech_yaozhi!23',
77
+        #     'host': '121.41.17.212',
78
+        #     'port': 3306
79
+        # }
80
+        # If conn was created directly it will be closed; if it came from a pool it is returned to the pool
81
+        cnx = MySQLConnection()
82
+        try:
83
+            cnx.connect(**config)
84
+        except Exception, e:
85
+            print e
86
+            cnx.reconnect(attempts=3, delay=0)
87
+        return cnx
88
+
89
+    @staticmethod
90
+    def getCursor(conn=None, buffered=None):
91
+        if not conn.is_connected():
92
+            if conn is not None:
93
+                conn.close()
94
+            conn.reconnect(attempts=5)
95
+        if buffered is not None:
96
+            cursor = conn.cursor(buffered=True)
97
+        else:
98
+            cursor = conn.cursor()
99
+        return cursor
100
+
101
+        # if Mysql.__pool is None:
102
+        #     __pool = PooledDB(creator=mysql.connector, mincached=1, maxcached=20,
103
+        #         host=MYSQL_HOST,
104
+        #         port=MYSQL_PORT,
105
+        #         db=MYSQL_DBNAME,
106
+        #         user=MYSQL_USER,
107
+        #         passwd=MYSQL_PASSWD,
108
+        #         charset='utf8')
109
+        # return __pool.connection()
110
+
111
+    @staticmethod
112
+    def getAll(sql, param=None, conn=None):
113
+        # conn = self.getConn()
114
+        cursor = Mysql.getCursor(conn=conn)
115
+        """
116
+        @summary: run the query and fetch the full result set
117
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
118
+        @param param: optional condition values (tuple/list)
119
+        @return: result list (of dicts) / boolean - the fetched result set
120
+        """
121
+        if param is None:
122
+            cursor.execute(sql)
123
+        else:
124
+            cursor.execute(sql, param)
125
+        cols = [t[0] for t in cursor.description]
126
+        result = cursor.fetchall()
127
+        if result:
128
+            cursor.close()
129
+            return [dict(zip(cols, row)) for row in result]
130
+        else:
131
+            cursor.close()
132
+            return result
133
+
134
+    @staticmethod
135
+    def selectAll(sql, param=None, conn=None):
136
+        # conn = self.getConn()
137
+        cursor = Mysql.getCursor(conn=conn)
138
+        """
139
+        @summary: run the query and fetch the full result set
140
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
141
+        @param param: optional condition values (tuple/list)
142
+        @return: result list - the fetched result set
143
+        """
144
+        if param is None:
145
+            cursor.execute(sql)
146
+        else:
147
+            cursor.execute(sql, param)
148
+        cols = [t[0] for t in cursor.description]
149
+        result = cursor.fetchall()
150
+        cursor.close()
151
+        return result
152
+
153
+    @staticmethod
154
+    def getOne(sql, param=None, conn=None):
155
+        # conn = self.getConn()
156
+        cursor = Mysql.getCursor(conn=conn, buffered=True)
157
+        """
158
+        @summary: run the query and fetch the first row
159
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
160
+        @param param: optional condition values (tuple/list)
161
+        @return: result list/boolean - the fetched row
162
+        """
163
+        if param is None:
164
+            count = cursor.execute(sql)
165
+        else:
166
+            count = cursor.execute(sql, param)
167
+        result = cursor.fetchone()
168
+        cursor.close()
169
+        return result
170
+
171
+    @staticmethod
172
+    def getMany(sql, num, param=None, conn=None):
173
+        # conn = self.getConn()
174
+        cursor = Mysql.getCursor(conn=conn)
175
+        """
176
+        @summary: run the query and fetch num rows
177
+        @param sql: query SQL; put only the condition columns in the SQL and pass the condition values via [param]
178
+        @param num: number of rows to fetch
179
+        @param param: optional condition values (tuple/list)
180
+        @return: result list/boolean - the fetched result set
181
+        """
182
+        if param is None:
183
+            count = cursor.execute(sql)
184
+        else:
185
+            count = cursor.execute(sql, param)
186
+        result = cursor.fetchmany(num)
187
+        cursor.close()
188
+        return result
189
+
190
+    @staticmethod
191
+    def insertOne(sql, value=None, conn=None):
192
+        # conn = self.getConn()
193
+        cursor = Mysql.getCursor(conn=conn)
194
+        """
195
+        @summary: insert one record into the table
196
+        @param sql: the INSERT SQL template
197
+        @param value: the record to insert (tuple/list)
198
+        @return: insertId - the id generated by the insert
199
+        """
200
+        if value is None:
201
+            cursor.execute(sql)
202
+        else:
203
+            cursor.execute(sql, value)
204
+        Mysql.dispose(cursor, conn)
205
+        return Mysql.__getInsertId(conn)
206
+
207
+    @staticmethod
208
+    def insertMany(sql, values, conn):
209
+        # conn = self.getConn()
210
+        cursor = Mysql.getCursor(conn=conn)
211
+        """
212
+        @summary: insert multiple records into the table
213
+        @param sql: the INSERT SQL template
214
+        @param values: the records to insert, tuple(tuple)/list[list]
215
+        @return: count - number of affected rows
216
+        """
217
+        count = cursor.executemany(sql, values)
218
+        Mysql.dispose(cursor, conn)
219
+        return count
220
+
221
+    @staticmethod
222
+    def __getInsertId(conn):
223
+        # conn = self.getConn()
224
+        cursor = Mysql.getCursor(conn=conn)
225
+        """
226
+        Get the id generated by the last insert on this connection, or 0 if there is none
227
+        """
228
+        cursor.execute("select @@identity as id")
229
+        result = cursor.fetchall()
230
+        cursor.close()
231
+        return result[0][0]
232
+
233
+    @staticmethod
234
+    def __query(sql, param=None, conn=None):
235
+        # conn = Mysql.getConn()
236
+        cursor = Mysql.getCursor(conn=conn)
237
+        if param is None:
238
+            count = cursor.execute(sql)
239
+        else:
240
+            count = cursor.execute(sql, param)
241
+        Mysql.dispose(cursor, conn)
242
+        return count
243
+
244
+    @staticmethod
245
+    def execute(sql, param=None, conn=None):
246
+        # conn = self.getConn()
247
+        cursor = Mysql.getCursor(conn=conn)
248
+        if param is None:
249
+            count = cursor.execute(sql)
250
+        else:
251
+            count = cursor.execute(sql, param)
252
+        Mysql.dispose(cursor, conn)
253
+        return count
254
+
255
+    @staticmethod
256
+    def updateMany(sql, param=None, conn=None):
257
+        # conn = Mysql.getConn()
258
+        cursor = Mysql.getCursor(conn=conn)
259
+        count = cursor.executemany(sql, param)
260
+        return count
261
+
262
+    @staticmethod
263
+    def update(sql, param=None, conn=None):
264
+        """
265
+        @summary: update records in the table
266
+        @param sql: SQL template and conditions, using (%s, %s) placeholders
267
+        @param param: the values to update (tuple/list)
268
+        @return: count - number of affected rows
269
+        """
270
+        return Mysql.__query(sql, param=param, conn=conn)
271
+
272
+    @staticmethod
273
+    def delete(sql, param=None, conn=None):
274
+        """
275
+        @summary: delete records from the table
276
+        @param sql: SQL template and conditions, using (%s, %s) placeholders
277
+        @param param: the condition values for the delete (tuple/list)
278
+        @return: count - number of affected rows
279
+        """
280
+        return Mysql.__query(sql, param=param, conn=conn)
281
+
282
+    @staticmethod
283
+    def dispose(cursor, conn):
284
+        """
285
+            @summary: commit the transaction and release the cursor
286
+        """
287
+        conn.commit()
288
+        cursor.close()
289
+
290
+    @staticmethod
291
+    def close(conn):
292
+        if conn:
293
+            conn.close()
294
+
295
+    @staticmethod
296
+    def cmd(cmd):
297
+        status, output = commands.getstatusoutput(cmd)
298
+        if status != 0:
299
+            print 'sync to online failed'
300
+        else:
301
+            print 'sync to online succeeded'
302
+
303
+class Util(object):
304
+    
305
+    @staticmethod
306
+    def insert_by_chunk(sql, data_list, conn):
307
+        start = 0
308
+        while True:
309
+            end = start + 10000
310
+            if end >= len(data_list):
311
+                end = len(data_list)
312
+            if start >= len(data_list):
313
+                break
314
+            Mysql.insertMany(sql, data_list[start:end], conn)
315
+            start = end
316
+
317
+    @staticmethod
318
+    def calc_ratings_index(num1, num2):
319
+        """Compute the ratings index
320
+
321
+        ratings index = rating / (average rating over the last year * 0.2)
322
+        an index > 10 is capped at 10
323
+        an index < 1 is floored at 1
324
+        Args: num1 - rating
325
+        Args: num2 - average rating over the last year
326
+        """
327
+        # If num1 or num2 is None, return an index of 1
328
+        if num1 is None or num2 is None:
329
+            return 1.0
330
+        ratings_index = float(num1) / (float(num2) * 0.2)
331
+        if ratings_index > 10.0:
332
+            ratings_index = 10.0
333
+        if ratings_index < 1.0:
334
+            ratings_index = 1.0
335
+        return ratings_index
336
+
337
+    @staticmethod
338
+    def get_max_date_of_month(field):
339
+        """Get the last date of the given month
340
+
341
+        """
342
+        if isinstance(field, datetime.date):
343
+            month_str = field.strftime('%Y-%m-%d')
344
+            _year = str(month_str.split('-')[0])
345
+            _month = str(month_str.split('-')[1])
346
+            max_date = calendar.monthrange(int(_year), int(_month))
347
+            date_str = _year + '-' + _month + '-' + str(max_date[1])
348
+            return date_str
349
+
350
+    @staticmethod
351
+    def get_first_date_of_yesterday():
352
+        # today's date
353
+        now = datetime.date.today()
354
+        # yesterday's date
355
+        yesterday = now - datetime.timedelta(days=1)
356
+        # the first day of yesterday's month
357
+        first_day = datetime.date(yesterday.year, yesterday.month, 1)
358
+        return first_day
359
+
360
+    @staticmethod
361
+    def get_max_date_of_one_year_ago(field):
362
+        """Get the last date of the same month one year before the given month
363
+        """
364
+        if isinstance(field, datetime.date):
365
+            month_str = field.strftime('%Y-%m-%d')
366
+            _year = str(month_str.split('-')[0])
367
+            _month = str(month_str.split('-')[1])
368
+            max_date = calendar.monthrange(int(_year)-1, int(_month))
369
+            date_str = str(int(_year) - 1) + '-' + _month + '-' + str(max_date[1])
370
+            return date_str
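A worked example of Util.calc_ratings_index as documented above: with a rating of 0.8 and a one-year average of 0.5 the index is 0.8 / (0.5 * 0.2) = 8.0, and results are clamped to [1, 10]. The numbers below are made up for illustration:

from fty_util.common import Util

print(Util.calc_ratings_index(0.8, 0.5))   # 8.0
print(Util.calc_ratings_index(3.0, 0.5))   # 30.0, capped to 10.0
print(Util.calc_ratings_index(0.01, 0.5))  # 0.1, floored to 1.0
print(Util.calc_ratings_index(None, 0.5))  # missing input, returns 1.0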

+ 54 - 0
fty_util/config.py

@@ -0,0 +1,54 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+
5
+from __future__ import with_statement
6
+import sys
7
+import os
8
+import datetime
9
+import time
10
+import ConfigParser
11
+
12
+# Read the configuration file
13
+try:
14
+    config = ConfigParser.ConfigParser()
15
+    with open(os.path.expanduser('~') + '/huoju_fty_home/config.cfg', 'r') as f:
16
+        config.readfp(f)
17
+except IOError, e:
18
+    print e
19
+    sys.exit(1)
20
+
21
+class APP_CFG(object):
22
+    
23
+    OFFLINE_CONFIG_HOST = config.get('offline_config', 'host')
24
+    OFFLINE_CONFIG_USER = config.get('offline_config', 'user')
25
+    OFFLINE_CONFIG_PASSWORD = config.get('offline_config', 'password')
26
+    OFFLINE_CONFIG_PORT = config.get('offline_config', 'port')
27
+
28
+    ONLINE_CONFIG_HOST = config.get('online_config', 'host')
29
+    ONLINE_CONFIG_USER = config.get('online_config', 'user')
30
+    ONLINE_CONFIG_PASSWORD = config.get('online_config', 'password')
31
+    ONLINE_CONFIG_PORT = config.get('online_config', 'port')
32
+
33
+    SCRAPY_CONFIG_HOST = config.get('scrapy_config', 'host')
34
+    SCRAPY_CONFIG_USER = config.get('scrapy_config', 'user')
35
+    SCRAPY_CONFIG_PASSWORD = config.get('scrapy_config', 'password')
36
+    SCRAPY_CONFIG_PORT = config.get('scrapy_config', 'port')
37
+
38
+    # # Offline database configuration
39
+    # OFFLINE_CONFIG = config.get('info', 'offline_config')
40
+    # # Online database configuration
41
+    # ONLINE_CONFIG = config.get('info', 'online_config')
42
+
43
+    # # TV station ratings statistics path
44
+    # AD_TELEVISION_STAT_PATH = config.get('info', 'tmp_path') + 'ad_television_stat.txt'
45
+    # # Registration and distribution data path
46
+    # AD_TV_RECORD_DISTRIBUTION_PATH = config.get('info', 'tmp_path') + 'ad_tv_record_distribution.txt'
47
+    # # Trending data path
48
+    # TRENDING_PATH = config.get('info', 'tmp_path') + 'trending.txt'
49
+    # # Ratings prediction path
50
+    # TV_STATION_RATINGS_STAT_PATH = config.get('info', 'tmp_path') + 'tv_station_ratings_stat.txt'
51
+    # # Recommendation results path
52
+    # TV_STATION_RECOMMEND_PATH = config.get('info', 'tmp_path') + 'tv_station_recommend.txt'
53
+    # # Type prediction path
54
+    # TV_STATION_TYPE_STAT_PATH = config.get('info', 'tmp_path') + 'tv_station_type_stat.txt'
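Note that ConfigParser.get() returns strings, so the *_PORT attributes above hold '3306' rather than 3306; whether mysql-connector accepts a string port depends on the version, and config.getint() is the usual way to force an int. A tiny usage sketch of the class (illustrative only):

from fty_util.config import APP_CFG

print(APP_CFG.ONLINE_CONFIG_HOST)        # host string from config.cfg
print(type(APP_CFG.ONLINE_CONFIG_PORT))  # <type 'str'> under Python 2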

+ 141 - 0
idl_ad_pub_station_stats.py

@@ -0,0 +1,141 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: per-station statistics of counts by type and of ratings
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import time
9
+import datetime
10
+import numpy as np
11
+from fty_util.common import Mysql
12
+
13
+tv_data = {}
14
+tv_data2 = {}
15
+tv_station = {}
16
+tv_station_type = {}
17
+channel_type = {}
18
+result_rate = []
19
+result_type = []
20
+result_channel = []
21
+
22
+conn = Mysql.createOfflineConn()
23
+
24
+sql = "select tv_name,channel,audience_rating,tv_date from odl.ad_television where theater_attribute='黄金剧场'"
25
+data = Mysql.selectAll(sql, conn=conn)
26
+
27
+sql_tv = "select tv_id,tv_name,theme,second_type,decade,first_type from odl.ad_tv_lib where is_use=1"
28
+tmp_data = Mysql.selectAll(sql_tv, conn=conn)
29
+for i in range(len(tmp_data)):
30
+	tv_id = tmp_data[i][0]
31
+	tv_name = tmp_data[i][1]
32
+	theme = tmp_data[i][2]
33
+	type2 = tmp_data[i][3]
34
+	decade = tmp_data[i][4]
35
+	type1 = tmp_data[i][5]
36
+	if type1 and type2:
37
+		tv_data[tv_name] = [tv_id,theme,type2,decade]
38
+		tv_data2[tv_name] = [tv_id,type1,type2]
39
+
40
+#Aggregate each station's ratings by month
41
+for i in range(len(data)):
42
+	tv_name = data[i][0]
43
+	channel = data[i][1]
44
+	aud_rating = data[i][2]
45
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
46
+
47
+	tv_station.setdefault(channel,{})
48
+	tv_station[channel].setdefault(tv_date,[])
49
+	tv_station[channel][tv_date].append(aud_rating)
50
+
51
+	channel_type.setdefault(channel,{})
52
+	channel_type[channel].setdefault(tv_date,{})
53
+
54
+	if tv_data2.get(tv_name):
55
+		tv_arr = tv_data2[tv_name][1:-1]
56
+		for level in range(len(tv_arr)):
57
+			channel_type[channel][tv_date].setdefault(level,{})
58
+			ty = tv_arr[level]
59
+			if ty:
60
+				type_arr = ty.split(u' ')
61
+				for tt in type_arr:
62
+					if len(tt):
63
+						channel_type[channel][tv_date][level].setdefault(tt,[])
64
+						channel_type[channel][tv_date][level][tt].append(aud_rating)
65
+
66
+for channel,value in channel_type.items():
67
+	for tv_date in value:
68
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
69
+		val = value[tv_date]
70
+		for level,v_obj in val.items():
71
+			for k,v in v_obj.items():
72
+				avg = sum(v)/len(v)
73
+				result_channel.append((channel,k,avg,level+1,date))
74
+
75
+for channel,value in tv_station.items():
76
+	for tv_date in value:
77
+		tmp_arr = value[tv_date]
78
+		avg_rating = sum(tmp_arr)/len(tmp_arr)
79
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
80
+		result_rate.append((channel,avg_rating,date))
81
+
82
+#Count the dramas aired per station, by type
83
+for i in range(len(data)):
84
+	tv_name = data[i][0]
85
+	channel = data[i][1]
86
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
87
+
88
+	if tv_data.get(tv_name):
89
+		tv_id = tv_data[tv_name][0]
90
+		type1 = tv_data[tv_name][1]
91
+		type2 = tv_data[tv_name][2]
92
+		decade = tv_data[tv_name][3]
93
+		type_arr = type2.split(u' ') if type2 else []
94
+		tv_station_type.setdefault(channel,{})
95
+		tv_station_type[channel].setdefault(tv_date,{"type1":{},"type2":{}})
96
+		tv_station_type[channel][tv_date]['type1'].setdefault(type1,{})
97
+		tv_station_type[channel][tv_date]['type1'][type1][tv_id] = decade
98
+		for t2 in type_arr:
99
+			if len(t2):
100
+				tv_station_type[channel][tv_date]['type2'].setdefault(t2,{})
101
+				tv_station_type[channel][tv_date]['type2'][t2][tv_id] = decade
102
+
103
+for channel,value in tv_station_type.items():
104
+	for tv_date in value:
105
+		type1_obj = value[tv_date]['type1']
106
+		type2_obj = value[tv_date]['type2']
107
+		date = datetime.datetime.strptime(tv_date,'%Y-%m').date()
108
+		for t1,v1 in type1_obj.items():
109
+			for con in v1:
110
+				t1_arr = [channel,t1,'1',con,date,v1[con]]
111
+				result_type.append(t1_arr)
112
+		for t2,v2 in type2_obj.items():
113
+			for con in v2:
114
+				t2_arr = [channel,t2,'2',con,date,v2[con]]
115
+				result_type.append(t2_arr)
116
+
117
+delete = 'truncate table idl.ad_pub_station_rate_stats'
118
+Mysql.execute(delete, conn=conn)
119
+
120
+sql_rate = 'insert into idl.ad_pub_station_rate_stats(channel,avg_rating,date) values(%s,%s,%s)'
121
+for i in range(int(len(result_rate)/1000)+1):
122
+	tmp = result_rate[i*1000:(i+1)*1000]
123
+	Mysql.insertMany(sql_rate, tmp, conn=conn)
124
+
125
+delete = 'truncate table idl.ad_pub_station_type_stats'
126
+Mysql.execute(delete, conn=conn)
127
+
128
+sql_type = 'insert into idl.ad_pub_station_type_stats(channel,type,level,tv_id,date,decade) values(%s,%s,%s,%s,%s,%s)'
129
+for i in range(int(len(result_type)/1000)+1):
130
+	tmp = result_type[i*1000:(i+1)*1000]
131
+	Mysql.insertMany(sql_type, tmp, conn=conn)
132
+
133
+delete = 'truncate table idl.ad_pub_station_type_rate'
134
+Mysql.execute(delete, conn=conn)
135
+
136
+sql_channel = 'insert into idl.ad_pub_station_type_rate(channel,type,avg_rating,level,date) values(%s,%s,%s,%s,%s)'
137
+for i in range(int(len(result_channel)/1000)+1):
138
+	tmp = result_channel[i*1000:(i+1)*1000]
139
+	Mysql.insertMany(sql_channel, tmp, conn=conn)
140
+
141
+Mysql.close(conn)
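The three insert loops above share one pattern: slice the result list into blocks of 1000 rows and pass each block to Mysql.insertMany, keeping any single executemany() call small. Util.insert_by_chunk in fty_util/common.py does the same with 10000-row blocks; a generic sketch of the pattern (the helper name and block size are illustrative):

def insert_in_chunks(sql, rows, conn, chunk_size=1000):
    # Insert rows in fixed-size blocks instead of one huge statement.
    for start in range(0, len(rows), chunk_size):
        Mysql.insertMany(sql, rows[start:start + chunk_size], conn=conn)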

+ 131 - 0
idl_tv_sr_denoise.py

@@ -0,0 +1,131 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: denoise TV drama ratings data for satellite and local channels
4
+
5
+from __future__ import division
6
+import math
7
+import copy
8
+import mysql.connector
9
+import time
10
+import numpy as np
11
+from fty_util.common import Mysql
12
+
13
+tbs = ['ad_television','area_ad_television']
14
+choose = ['ad','area']
15
+
16
+def Stat(Number): #Number: 0 or 1
17
+	tv_data = []
18
+	tv_play = {}
19
+	tv_station = {}
20
+	dateline = str(time.localtime().tm_year-1)
21
+
22
+	conn = Mysql.createOfflineConn()
23
+
24
+	sql = "select id,tv_name,channel,theater_attribute,epi_num,tv_date,start_time,end_time,audience_rating,avg_rating,market_rating from odl.%s where year(tv_date)>=%s" % (tbs[Number], dateline)
25
+	data = Mysql.selectAll(sql, conn=conn)
26
+
27
+	for i in range(len(data)):
28
+		dd = list(data[i])
29
+		tv_id = data[i][0]
30
+		tv_name = data[i][1]
31
+		channel = data[i][2]
32
+		theater = data[i][3]
33
+		year = data[i][5].year
34
+		if dd[8]>=0 and dd[9]>=0 and dd[10]>=0 and dd[3]: 
35
+			key = (tv_name, channel, theater, year)
36
+			tv_play[tv_id] = dd[1:]
37
+			tv_station.setdefault(key,[[],[]])
38
+			tv_station[key][0].append(tv_id)
39
+			tmp_data = map(float,data[i][8:11])
40
+			tv_station[key][1].append(tmp_data)
41
+
42
+	def fivenum(arr):
43
+		"""Tukey's five number"""
44
+		arr = np.sort(arr)
45
+		res = []
46
+		n = len(arr)
47
+		if n == 0:
48
+			print 'array must not be empty'
49
+			return res
50
+		else:
51
+			n4 = math.floor((n+3)/2)/2
52
+			d = [0,n4-1,(n-1)/2,n-n4,n-1]
53
+			d_floor = [int(math.floor(i)) for i in d]
54
+			d_ceil = [int(math.ceil(i)) for i in d]
55
+			res = list(0.5 * (arr[d_floor]+arr[d_ceil]))
56
+			return res
57
+
58
+	def denoise(arr):
59
+		"""Outlier detection: points outside the interval [Q1-1.5*(Q3-Q1), Q3+1.5*(Q3-Q1)]
60
+		are treated as outliers, where Q1 is the first quartile and Q3 the third quartile"""
61
+		five_arr = fivenum(arr)
62
+		Q1 = five_arr[1]
63
+		Q3 = five_arr[3]
64
+		L1 = Q1-1.5*(Q3-Q1) #lower bound of the interval
65
+		L2 = Q3+1.5*(Q3-Q1) #upper bound of the interval
66
+		res = []
67
+
68
+		for i in range(len(arr)):
69
+			if arr[i]<=L2:
70
+				res.append(arr[i])
71
+			else:
72
+				res.append(-1)
73
+		return res
74
+
75
+	result = []
76
+	for key in tv_station:
77
+		id_arr = tv_station[key][0]
78
+		vv = np.array(tv_station[key][1])
79
+		tmp_arr = np.array(map(denoise,vv.transpose())).transpose()
80
+		for i in range(len(id_arr)):
81
+			tv_id = id_arr[i]
82
+			tmp = [tv_id] + tv_play[tv_id] + map(float,list(tmp_arr[i]))
83
+			result.append(tmp)
84
+
85
+
86
+	#Write into yxb.ad_tv_rating_denoise
87
+	delete = 'delete from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], dateline)
88
+	try:
89
+		Mysql.execute(delete, conn=conn)
90
+	except mysql.connector.errors.ProgrammingError as e:
91
+		pass
92
+
93
+	ind1 = 'DROP INDEX id ON yxb.%s_tv_rating_denoise' % choose[Number]
94
+	ind2 = 'DROP INDEX tv_date ON yxb.%s_tv_rating_denoise' % choose[Number] 
95
+	try:
96
+		Mysql.execute(ind1, conn=conn)
97
+	except mysql.connector.errors.ProgrammingError as e:
98
+		pass
99
+	try:
100
+		Mysql.execute(ind2, conn=conn)
101
+	except mysql.connector.errors.ProgrammingError as e:
102
+		pass
103
+
104
+	sql = 'insert into yxb.%s_tv_rating_denoise' % choose[Number] + ' values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
105
+	for i in range(int(len(result)/1000)+1):
106
+		tmp = result[i*1000:(i+1)*1000]
107
+		Mysql.insertMany(sql, tmp, conn=conn)
108
+
109
+	sql1 = 'CREATE INDEX id ON yxb.%s_tv_rating_denoise(id)' % choose[Number]
110
+	sql2 = 'CREATE INDEX tv_date ON yxb.%s_tv_rating_denoise (tv_date,theater_attribute)' % choose[Number]
111
+	Mysql.execute(sql1, conn=conn)
112
+	Mysql.execute(sql2, conn=conn)
113
+
114
+	#Write into odl.ad_tv_rating_denoise
115
+	delete = 'delete from odl.%s_tv_rating_denoise where year(tv_date)>=%s' % (choose[Number],dateline)
116
+	try:
117
+		Mysql.execute(delete, conn=conn)
118
+	except mysql.connector.errors.ProgrammingError as e:
119
+		pass
120
+
121
+	sql = 'insert into odl.%s_tv_rating_denoise select * from yxb.%s_tv_rating_denoise where year(tv_date)>=%s' %(choose[Number], choose[Number], dateline)
122
+
123
+	try:
124
+		Mysql.execute(sql, conn=conn)
125
+	except mysql.connector.errors.ProgrammingError as e:
126
+		pass
127
+
128
+	Mysql.close(conn)
129
+
130
+for i in range(0,2):
131
+	Stat(i)
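fivenum() above computes Tukey's five-number summary (minimum, lower hinge, median, upper hinge, maximum) and denoise() replaces values above Q3 + 1.5*(Q3-Q1) with -1. A small self-contained check of the idea on made-up numbers (np.percentile is used here for brevity and interpolates slightly differently than the hinge formula in the script):

import numpy as np

ratings = np.array([0.5, 0.55, 0.58, 0.6, 0.62, 3.0])  # 3.0 is an obvious outlier
q1, q3 = np.percentile(ratings, [25, 75])
upper = q3 + 1.5 * (q3 - q1)
cleaned = [r if r <= upper else -1 for r in ratings]
print(upper, cleaned)  # the 3.0 entry becomes -1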

+ 98 - 0
odl_near_realtime_calc.py

@@ -0,0 +1,98 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""Listener on odl.ad_tv_id_pre data
5
+
6
+Polls the table at a fixed interval
7
+"""
8
+
9
+from fty_util.common import Mysql
10
+import commands
11
+import time
12
+import sys
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+c_list = [',', '、', ',', ';', ';', '/']
18
+def replace_other_character(field):
19
+    if field is None:
20
+        return ''
21
+    if field == u'暂无信息':
22
+        field = ''
23
+    for c in c_list:
24
+        field = field.replace(c, ' ')
25
+    return field
26
+
27
+while True:
28
+    conn = Mysql.createOfflineConn()
29
+    print 'start heartbeat'
30
+    # Pick up unprocessed rows (is_run = 0) from yxb.ad_tv_id_pre
31
+    sql = """
32
+        select tv_id, is_run from yxb.ad_tv_id_pre where is_run = 0
33
+    """
34
+    rows = Mysql.getAll(sql, conn=conn)
35
+
36
+    for row in rows:
37
+        tv_id = row['tv_id']
38
+        is_run = row['is_run']
39
+        if is_run == 0:
40
+            print tv_id
41
+            sql = """
42
+                update yxb.ad_tv_id_pre set is_run = 1 where tv_id = '%s'
43
+            """
44
+            sql = sql % (tv_id)
45
+            Mysql.update(sql, conn=conn)
46
+
47
+            sql = """
48
+                select tv_name, director, scriptwritter, main_actors, types, concat(decade, first_type) as first_type, second_type, \
49
+                description, pub_comp, pub_date, production, \
50
+                cehua, jianzhi, chupin_comp, chupin_date, show_time, decade, first_type, categories from yxb.ad_tv_lib where id = '%s'
51
+            """
52
+            sql = sql % (tv_id)
53
+            row = Mysql.getOne(sql, conn=conn)
54
+            tv_name = row[0]
55
+            director = replace_other_character(row[1])
56
+            scriptwritter = replace_other_character(row[2])
57
+            main_actors = replace_other_character(row[3])
58
+            types = replace_other_character(row[4])
59
+            first_type = replace_other_character(row[5])
60
+            second_type = replace_other_character(row[6])
61
+            description = row[7]
62
+            pub_comp = replace_other_character(row[8])
63
+            pub_date = row[9]
64
+            production = replace_other_character(row[10])
65
+            cehua = replace_other_character(row[11])
66
+            jianzhi = replace_other_character(row[12])
67
+            chupin_comp = replace_other_character(row[13])
68
+            chupin_date = row[14]
69
+            show_time = row[15]
70
+            decade = replace_other_character(row[16])
71
+            theme = replace_other_character(row[17])
72
+
73
+            sql = """
74
+                replace into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
75
+                pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, decade, theme) \
76
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
77
+            """
78
+            value = (tv_id, tv_name, director, scriptwritter, main_actors, types, first_type, second_type, description, pub_comp, pub_date, production, cehua, jianzhi, chupin_comp, chupin_date, show_time, '1', decade, theme)
79
+            Mysql.execute(sql, param=value, conn=conn)
80
+        # TODO: invoke the prediction script
81
+        status, output = commands.getstatusoutput('sh bash_near_real_job.sh ' + str(tv_id))
82
+        if status != 0:
83
+            sql = """
84
+                update yxb.ad_tv_id_pre set is_run = 0 where tv_id = '%s'
85
+            """
86
+            sql = sql % (tv_id)
87
+            Mysql.update(sql, conn=conn)
88
+            print 'near_real_job.sh failed'
89
+            print output
90
+            break
91
+        else:
92
+            sql = """
93
+                delete from yxb.ad_tv_id_pre where tv_id = '%s' and is_run = 1
94
+            """
95
+            sql = sql % (tv_id)
96
+            Mysql.execute(sql, conn=conn)
97
+    Mysql.close(conn)
98
+    time.sleep(60)
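The loop above is a simple polling worker: claim a pending row (is_run 0 -> 1), run the shell job, delete the row on success or release it on failure, then sleep 60 seconds and poll again. A stripped-down sketch of that claim/ack cycle, reusing the Mysql helper and commands module imported above (the function name is illustrative):

def process_pending(conn):
    rows = Mysql.getAll("select tv_id from yxb.ad_tv_id_pre where is_run = 0", conn=conn)
    for row in rows or []:
        tv_id = row['tv_id']
        Mysql.update("update yxb.ad_tv_id_pre set is_run = 1 where tv_id = %s", param=(tv_id,), conn=conn)
        status, output = commands.getstatusoutput('sh bash_near_real_job.sh %s' % tv_id)
        if status != 0:
            # release the claim so the next poll retries this tv_id
            Mysql.update("update yxb.ad_tv_id_pre set is_run = 0 where tv_id = %s", param=(tv_id,), conn=conn)
            break
        Mysql.execute("delete from yxb.ad_tv_id_pre where tv_id = %s and is_run = 1", param=(tv_id,), conn=conn)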

+ 179 - 0
online_ad_tv_sr_pre.py

@@ -0,0 +1,179 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#Purpose: predict a drama's ratings index and its per-station ratings
4
+
5
+from __future__ import division
6
+import re
7
+import sys
8
+import time
9
+import copy
10
+import datetime
11
+import numpy as np
12
+from fty_util.common import Mysql
13
+
14
+tv_pre = [] #predicted ratings values for the drama
15
+tv_data_linear = {}
16
+tv_data_sim = {}
17
+tv_stage = {}
18
+channel_rate = {}
19
+type_rate = {}
20
+model_var = [{},{},{},{},{}]
21
+rules = u' '
22
+if len(sys.argv) > 1:
23
+	tv_id = int(sys.argv[1])
24
+else:
25
+	print 'please provide a TV drama id'
26
+	sys.exit()
27
+
28
+conn = Mysql.createOfflineConn()
29
+
30
+bound = [0,2]  #fixed minimum and maximum of the ratings index
31
+
32
+sql = 'select * from tmp.ad_tv_sr_pre_var'
33
+tmp_data = Mysql.selectAll(sql, conn=conn)
34
+for i in range(len(tmp_data)):
35
+	var_name = tmp_data[i][1]
36
+	tv_sr_arr = map(float,tmp_data[i][3].strip('[|]').split(','))
37
+	year = tmp_data[i][4]
38
+	var_loc = tmp_data[i][5]-1
39
+
40
+	model_var[var_loc].setdefault(var_name,{})
41
+	model_var[var_loc][var_name][year] = sum(tv_sr_arr)/len(tv_sr_arr)
42
+
43
+sql = 'select tv_id,tv_name,director,scriptwriter,main_actors,filmer,categories from odl.ad_tv_lib where tv_id=%d' % tv_id
44
+tmp_data = Mysql.selectAll(sql, conn=conn)
45
+
46
+
47
+if tmp_data:
48
+	tv_id = int(tmp_data[0][0])
49
+	tv_name = tmp_data[0][1]
50
+	year = datetime.datetime.now().year
51
+	director = tmp_data[0][2] if tmp_data[0][2] else ''
52
+	scriptwriter = tmp_data[0][3] if tmp_data[0][3] else ''
53
+	actors = tmp_data[0][4] if tmp_data[0][4] else ''
54
+	filmer = tmp_data[0][5] if tmp_data[0][5] else ''
55
+	type1 = tmp_data[0][6] if tmp_data[0][6] else ''
56
+	tv_data_linear[tv_id] = [tv_name,director,scriptwriter,actors,filmer,type1,year]
57
+else:
58
+	print 'tv_id:%d is not in ad_tv_lib!' % tv_id
59
+	sys.exit()
60
+
61
+def trans(val):
62
+	res = (val - bound[0])/(bound[1]-bound[0])*10.0
63
+	if res < 1.0:
64
+		res = 1.0
65
+	elif res > 10.0:
66
+		res = 10.0
67
+	return round(res,2)
68
+
69
+def tv_sr_pre(var_arr,year): #array of variable values, one entry per model variable
70
+	'''Linear regression model'''
71
+	coef = np.array([0.2103148,0.5182419,0.7822451,0.4921597,0.3865043,-1.3566513])
72
+	model_avg = [] #last-year average for each variable
73
+	model_val = np.ones(len(var_arr)+1)
74
+	for i in range(len(model_var)):
75
+		tmp_obj = model_var[i]
76
+		sum1,num = 0,0
77
+		for var_name in tmp_obj:
78
+			if year in tmp_obj[var_name]:
79
+				sum1 += tmp_obj[var_name][year]
80
+				num += 1
81
+			elif year-1 in tmp_obj[var_name]:
82
+				sum1 += tmp_obj[var_name][year-1]
83
+				num += 1
84
+		tmp_avg = sum1/num if num else 0
85
+		model_avg.append(tmp_avg)
86
+
87
+	for i in range(len(var_arr)):
88
+		p_arr = var_arr[i].split(u' ')
89
+		for peo in p_arr:
90
+			if peo:
91
+				if model_var[i].has_key(peo):
92
+					if year in model_var[i][peo]:
93
+						model_val[i] = model_var[i][peo][year]
94
+					else:
95
+						max_year = max(model_var[i][peo].keys())
96
+						model_val[i] = model_var[i][peo][max_year]
97
+				else:
98
+					model_val[i] = model_avg[i] #fall back to the average when the variable is not in the database
99
+	result = np.dot(coef,model_val)
100
+	return result
101
+
102
+now = datetime.datetime.now()
103
+aDay = datetime.timedelta(days=-365)
104
+date_line = (now + aDay).date()
105
+#Default window: the year preceding the current date
106
+
107
+
108
+sql = "select * from idl.ad_pub_station_rate_stats"
109
+station_data = Mysql.selectAll(sql, conn=conn)
110
+
111
+
112
+sql = "select * from tmp.ad_pub_station_type_rate"
113
+type_data = Mysql.selectAll(sql, conn=conn)
114
+
115
+for i in range(len(station_data)):
116
+	channel = station_data[i][1]
117
+	aud_rating = station_data[i][2]
118
+	tv_date = station_data[i][3]
119
+	
120
+	channel_rate.setdefault(channel,[])
121
+	if tv_date >= date_line:
122
+		channel_rate[channel].append(aud_rating)
123
+
124
+for i in range(len(type_data)):
125
+	channel = type_data[i][1]
126
+	Type = type_data[i][2]
127
+	aud_rating = type_data[i][3]
128
+	tv_date = type_data[i][4]
129
+	
130
+	type_rate.setdefault(channel,{})
131
+	type_rate[channel].setdefault(Type,[])
132
+	if tv_date >= date_line:
133
+		type_rate[channel][Type].append(aud_rating)
134
+
135
+tv2type = copy.deepcopy(type_rate)
136
+for channel,value in type_rate.items():
137
+	for ty,v_arr in value.items():
138
+		tv2type[channel][ty] = sum(v_arr)/len(v_arr) if len(v_arr) else 0.0
139
+
140
+for tv_id in tv_data_linear:
141
+	tv_name = tv_data_linear[tv_id][0]
142
+	var_arr = tv_data_linear[tv_id][1:-1]
143
+	type1 = tv_data_linear[tv_id][5]
144
+	year = tv_data_linear[tv_id][-1]
145
+
146
+	tv_station = {}
147
+	ty_arr = type1.split(u' ')
148
+	for channel in channel_rate:
149
+		tmp,n = 0,0
150
+		value = tv2type.get(channel,{})
151
+		for tt in ty_arr:
152
+			if tt and value.has_key(tt):
153
+				tmp += value[tt]
154
+				n += 1
155
+		tv_station[channel] = tmp/n if n else 0
156
+
157
+	tv_sr = tv_sr_pre(var_arr,year)
158
+	for channel,vv in tv_station.items():
159
+		channel_avg = sum(channel_rate[channel])/len(channel_rate[channel]) if channel_rate[channel] else 0.0
160
+		#判断是否有该类型的平均收视率
161
+		if vv:
162
+			station_rate = 0.6 * tv_sr * channel_avg + 0.4 * vv
163
+		else:
164
+			station_rate = tv_sr * channel_avg
165
+		tv_pre.append((tv_id,tv_name,str(tv_sr),str(trans(tv_sr)),channel,str(station_rate)))
166
+
167
+delete = 'delete from idl.ad_tv_sr_pre where tv_id=%d' % tv_id
168
+Mysql.execute(delete, conn=conn)
169
+
170
+
171
+sql = 'insert into idl.ad_tv_sr_pre(tv_id,tv_name,tv_sr_pre,tv_sr_pre_trans,channel,station_tv_pre) values(%s,%s,%s,%s,%s,%s)'
172
+Mysql.insertMany(sql, tv_pre, conn=conn)
173
+
174
+
175
+sql = 'update tv_lib.gc_tv_series ts set ts.point = %s where ts.tv_id = %s' %(str(tv_pre[0][3]),tv_id)
176
+Mysql.update(sql, conn=conn)
177
+
178
+Mysql.close(conn)
179
+
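Editor's note: the numeric core of online_ad_tv_sr_pre.py is small once the database lookups are stripped away: a fixed-coefficient linear model over five averaged variables plus an intercept, followed by the trans() rescaling from the raw [0, 2] range onto the 1-10 index. A minimal standalone sketch is below; the per-variable averages are made up for illustration (in the script they come from tmp.ad_tv_sr_pre_var), while the coefficients and the rescaling rule are copied from the diff above.

import numpy as np

BOUND = (0, 2)  # same fixed range the script uses

def trans(val, bound=BOUND):
    # rescale a raw prediction in [0, 2] onto [1, 10], clamping at both ends
    res = (val - bound[0]) / float(bound[1] - bound[0]) * 10.0
    return round(min(max(res, 1.0), 10.0), 2)

# coefficients copied from tv_sr_pre(): director, scriptwriter, actors,
# filmer, category, plus a trailing intercept term (model_val ends in 1.0)
coef = np.array([0.2103148, 0.5182419, 0.7822451, 0.4921597, 0.3865043, -1.3566513])
model_val = np.array([0.9, 1.1, 1.3, 1.0, 1.2, 1.0])  # hypothetical per-variable averages

tv_sr = float(np.dot(coef, model_val))
print(tv_sr, trans(tv_sr))  # raw score and its 1-10 transform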

+ 9 - 0
setup.py

@@ -0,0 +1,9 @@
1
+
2
+from setuptools import setup, find_packages
3
+
4
+setup(
5
+    name="fty_util",
6
+    version='1.0',
7
+    packages=find_packages(where='.'),
8
+    include_package_data=True,
9
+)
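Editor's note: setup.py does nothing beyond packaging fty_util, but that is what lets every task script run "from fty_util.common import Mysql" without path hacks. On a job host the package would typically be installed once from the repo root with "pip install ." (or "python setup.py install" on older environments); the actual deployment step is not shown in this commit, so treat that command as an assumption.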

+ 191 - 0
shell/bash_daily.sh

@@ -0,0 +1,191 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################daily操作################################
14
+
15
+# 电视台近一年平均收视率
16
+echo "执行tmp_year_channel_avg_ratings_stat_by_tv province任务"
17
+python ${HUOJU_FTY_PATH}task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py province
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="任务tmp_year_channel_avg_ratings_stat_by_tv province失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+echo "执行tmp_year_channel_avg_ratings_stat_by_tv area任务"
26
+python ${HUOJU_FTY_PATH}task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py area
27
+if [ $? -ne 0 ];
28
+    then
29
+        content="任务tmp_year_channel_avg_ratings_stat_by_tv area失败"
30
+        echo $content
31
+    exit 1
32
+fi
33
+
34
+# 按月统计电视剧的收视情况
35
+echo "执行tmp_tv_avg_ratings_fatt0 province任务"
36
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_fatt0.py province
37
+if [ $? -ne 0 ];
38
+    then
39
+        content="任务tmp_tv_avg_ratings_fatt0 province失败"
40
+        echo $content
41
+    exit 1
42
+fi
43
+echo "执行tmp_tv_avg_ratings_stat province任务"
44
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_stat.py province
45
+if [ $? -ne 0 ];
46
+    then
47
+        content="任务tmp_tv_avg_ratings_stat province失败"
48
+        echo $content
49
+    exit 1
50
+fi
51
+
52
+echo "执行tmp_tv_avg_ratings_fatt0 area任务"
53
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_fatt0.py area
54
+if [ $? -ne 0 ];
55
+    then
56
+        content="任务tmp_tv_avg_ratings_fatt0 area失败"
57
+        echo $content
58
+    exit 1
59
+fi
60
+echo "执行tmp_tv_avg_ratings_stat area任务"
61
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_avg_ratings_stat.py area
62
+if [ $? -ne 0 ];
63
+    then
64
+        content="任务tmp_tv_avg_ratings_stat area失败"
65
+        echo $content
66
+    exit 1
67
+fi
68
+
69
+# 电视台对应电视剧及类型关系数据
70
+echo "执行tmp_tv_category_stat province任务"
71
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_category_stat.py province
72
+if [ $? -ne 0 ];
73
+    then
74
+        content="任务tmp_tv_category_stat province失败"
75
+        echo $content
76
+    exit 1
77
+fi
78
+echo "执行tmp_tv_category_stat area任务"
79
+python ${HUOJU_FTY_PATH}task_tmp/tmp_tv_category_stat.py area
80
+if [ $? -ne 0 ];
81
+    then
82
+        content="任务tmp_tv_category_stat area失败"
83
+        echo $content
84
+    exit 1
85
+fi
86
+
87
+# 同步tv_avg_ratings_stat
88
+echo "执行idl_tv_avg_ratings_stat province任务"
89
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_avg_ratings_stat.py province
90
+if [ $? -ne 0 ];
91
+    then
92
+        content="任务idl_tv_avg_ratings_stat province失败"
93
+        echo $content
94
+    exit 1
95
+fi
96
+echo "执行idl_tv_avg_ratings_stat area任务"
97
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_avg_ratings_stat.py area
98
+if [ $? -ne 0 ];
99
+    then
100
+        content="任务idl_tv_avg_ratings_stat area失败"
101
+        echo $content
102
+    exit 1
103
+fi
104
+
105
+# 同步tv_category_stat
106
+echo "执行idl_tv_category_stat province任务"
107
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_category_stat.py province
108
+if [ $? -ne 0 ];
109
+    then
110
+        content="任务idl_tv_category_stat province失败"
111
+        echo $content
112
+    exit 1
113
+fi
114
+echo "执行idl_tv_category_stat area任务"
115
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_category_stat.py area
116
+if [ $? -ne 0 ];
117
+    then
118
+        content="任务idl_tv_category_stat area失败"
119
+        echo $content
120
+    exit 1
121
+fi
122
+
123
+# 同步year_channel_avg_ratings_stat
124
+echo "执行idl_year_channel_avg_ratings_stat province任务"
125
+python ${HUOJU_FTY_PATH}task_idl/idl_year_channel_avg_ratings_stat.py province
126
+if [ $? -ne 0 ];
127
+    then
128
+        content="任务idl_year_channel_avg_ratings_stat province失败"
129
+        echo $content
130
+    exit 1
131
+fi
132
+echo "执行idl_year_channel_avg_ratings_stat area任务"
133
+python ${HUOJU_FTY_PATH}task_idl/idl_year_channel_avg_ratings_stat.py area
134
+if [ $? -ne 0 ];
135
+    then
136
+        content="任务idl_year_channel_avg_ratings_stat area失败"
137
+        echo $content
138
+    exit 1
139
+fi
140
+
141
+# 同步营销文章爬取数量
142
+echo "执行idl_tv_article_marketing_count任务"
143
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_article_marketing_count.py
144
+if [ $? -ne 0 ];
145
+    then
146
+        content="任务idl_tv_article_marketing_count失败"
147
+        echo $content
148
+    exit 1
149
+fi
150
+
151
+# 同步营销文章爬取链接
152
+echo "执行idl_tv_article_marketing_detail任务"
153
+python ${HUOJU_FTY_PATH}task_idl/idl_tv_article_marketing_detail.py
154
+if [ $? -ne 0 ];
155
+    then
156
+        content="任务idl_tv_article_marketing_detail失败"
157
+        echo $content
158
+    exit 1
159
+fi
160
+
161
+# 类型转换
162
+echo "执行transform_categories任务"
163
+python ${HUOJU_FTY_PATH}task_other/transform_categories.py
164
+if [ $? -ne 0 ];
165
+    then
166
+        content="任务transform_categories失败"
167
+        echo $content
168
+    exit 1
169
+fi
170
+
171
+# yxb字段清理
172
+echo "执行ad_tv_lib_clean任务"
173
+python ${HUOJU_FTY_PATH}task_yxb/ad_tv_lib_clean.py
174
+if [ $? -ne 0 ];
175
+    then
176
+        content="任务ad_tv_lib_clean失败"
177
+        echo $content
178
+    exit 1
179
+fi
180
+
181
+# 排名
182
+echo "执行idl_rank_update任务"
183
+python ${HUOJU_FTY_PATH}task_other/idl_rank_update.py
184
+if [ $? -ne 0 ];
185
+    then
186
+        content="idl_rank_update失败"
187
+        echo $content
188
+    exit 1
189
+fi
190
+
191
+echo "每天脚本执行完毕"

+ 66 - 0
shell/bash_daily_import.sh

@@ -0,0 +1,66 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################import操作################################
14
+
15
+# odl.ad_tv_lib 增量更新操作
16
+echo "执行odl_ad_tv_lib_insert任务"
17
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_tv_lib_insert.py
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="odl_ad_tv_lib_insert失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+# odl.ad_television 增量更新操作
26
+echo "执行odl_ad_television_incr_update任务"
27
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_television_incr_update.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="odl_ad_television_incr_update失败"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# odl.area_ad_television 增量更新操作
36
+echo "执行odl_area_ad_television_incr_update任务"
37
+python ${HUOJU_FTY_PATH}task_odl/odl_area_ad_television_incr_update.py
38
+if [ $? -ne 0 ];
39
+    then
40
+        content="odl_area_ad_television_incr_update失败"
41
+        echo $content
42
+    exit 1
43
+fi
44
+
45
+# odl.ad_audience_cps_time_incr_update 导入操作
46
+echo "执行odl_ad_audience_cps_time_incr_update任务"
47
+python ${HUOJU_FTY_PATH}task_odl/odl_ad_audience_cps_time_incr_update.py
48
+if [ $? -ne 0 ];
49
+    then
50
+        content="odl_ad_audience_cps_time_incr_update失败"
51
+        echo $content
52
+    exit 1
53
+fi
54
+echo "导入脚本执行完毕"
55
+
56
+# 月份提取
57
+echo "执行tmp_data_month任务"
58
+python ${HUOJU_FTY_PATH}task_tmp/tmp_data_month.py
59
+if [ $? -ne 0 ];
60
+    then
61
+        content="任务tmp_data_month失败"
62
+        echo $content
63
+    exit 1
64
+fi
65
+
66
+echo "月份提取完毕"

+ 55 - 0
shell/bash_job.sh

@@ -0,0 +1,55 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+#########################基础数据###########################
14
+
15
+# 收视率数据去噪
16
+echo "idl_tv_sr_denoise"
17
+python ${HUOJU_FTY_PATH}idl_tv_sr_denoise.py
18
+if [ $? -ne 0 ];
19
+   then
20
+       content="idl_tv_sr_denoise"
21
+       echo $content
22
+   exit 1
23
+fi
24
+
25
+# 相似剧离线计算
26
+echo "tv_outline_recom"
27
+python ${HUOJU_FTY_PATH}tv_outline_recom.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="tv_outline_recom"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# 收视指数预测基础数据
36
+echo "tmp_ad_tv_sr_stat"
37
+python ${HUOJU_FTY_PATH}tmp_ad_tv_sr_stat.py
38
+if [ $? -ne 0 ];
39
+   then
40
+       content="tmp_ad_tv_sr_stat"
41
+       echo $content
42
+   exit 1
43
+fi
44
+
45
+# 发行平台数据统计
46
+echo "idl_ad_pub_station_stats"
47
+python ${HUOJU_FTY_PATH}idl_ad_pub_station_stats.py
48
+if [ $? -ne 0 ];
49
+    then
50
+        content="idl_ad_pub_station_stats"
51
+        echo $content
52
+    exit 1
53
+fi
54
+
55
+echo "脚本执行完毕"

+ 43 - 0
shell/bash_scrapy.sh

@@ -0,0 +1,43 @@
1
+#!/bin/bash
2
+
3
+# 判断脚本执行路径是否存在,如果不存在则需要设置
4
+if [ -z $HUOJU_FTY_PATH ];
5
+then 
6
+    echo "not found"
7
+    export HUOJU_FTY_PATH=/root/py_script/
8
+else
9
+    echo "found"
10
+fi
11
+echo $HUOJU_FTY_PATH
12
+
13
+###############################scrapy操作################################
14
+
15
+# 新剧营销文章链接爬取
16
+echo "执行scrapy_website_count_new任务"
17
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_website_count_new.py
18
+if [ $? -ne 0 ];
19
+    then
20
+        content="任务scrapy_website_count_new失败"
21
+        echo $content
22
+    exit 1
23
+fi
24
+
25
+# 营销文章数量爬取
26
+echo "执行scrapy_website_count任务"
27
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_website_count.py
28
+if [ $? -ne 0 ];
29
+    then
30
+        content="任务scrapy_website_count失败"
31
+        echo $content
32
+    exit 1
33
+fi
34
+
35
+# 从爱奇艺上爬取百科上未被爬到的电视剧
36
+echo "执行scrapy_tv_unhandle任务"
37
+python ${HUOJU_FTY_PATH}task_scrapy/scrapy_tv_unhandle.py
38
+if [ $? -ne 0 ];
39
+    then
40
+        content="任务scrapy_tv_unhandle失败"
41
+        echo $content
42
+    exit 1
43
+fi

+ 39 - 0
task_clean/odl_ad_tv_record_distribution_update_company_field.py

@@ -0,0 +1,39 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""更新表odl.ad_tv_record_distribution表的theme, first_type, second_type字段,去除空白符
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 清空走势数据
20
+
21
+sql = """
22
+    select id, company, `desc` from odl.ad_tv_record_distribution where id > 5000
23
+"""
24
+
25
+rows = Mysql.getAll(sql, conn=conn)
26
+for row in rows:
27
+    _id = row['id']
28
+    company = row['company']
29
+    desc = row['desc']
30
+    company = company.replace(u'报备机构:', '').replace(u'报备机构:', '').replace('\r', '').replace('\n', '').replace('\t', '').replace('\'', '\\\'').replace('\"', '\\\"')
31
+    desc = desc.replace(u'内容提要:','').replace(u'内容提要:','').replace('\r', '').replace('\n', '').replace('\t', '').replace('\'', '\\\'').replace('\"', '\\\"')
32
+
33
+    sql = """
34
+        update odl.ad_tv_record_distribution set company = '%s', `desc` = '%s' where id = '%s'
35
+    """
36
+    sql = sql % (company, desc, _id)
37
+    Mysql.execute(sql, conn=conn)
38
+
39
+Mysql.close(conn)
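Editor's note: the clean-up above escapes quotes by hand before interpolating values into the UPDATE statement. Whether the fty_util Mysql helper accepts query parameters is not visible in this commit, but with a plain driver connection the same update needs no manual escaping. A hedged sketch using pymysql, with placeholder connection settings and sample values:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='user', password='pass', charset='utf8')
row_id = 5001
company = u'某影视文化有限公司'
desc = u"内容提要里的引号 \" 和 ' 不再需要手工转义"

try:
    with conn.cursor() as cur:
        cur.execute(
            "update odl.ad_tv_record_distribution set company = %s, `desc` = %s where id = %s",
            (company, desc, row_id),  # the driver quotes and escapes the values
        )
    conn.commit()
finally:
    conn.close()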

+ 41 - 0
task_clean/odl_ad_tv_record_distribution_update_theme_field.py

@@ -0,0 +1,41 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""更新表odl.ad_tv_record_distribution表的theme, first_type, second_type字段,去除空白符
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 查询需要清洗的theme、first_type、second_type字段
20
+
21
+sql = """
22
+    select id, theme, first_type, second_type from odl.ad_tv_record_distribution where LENGTH(theme) > 12
23
+"""
24
+
25
+rows = Mysql.getAll(sql, conn=conn)
26
+for row in rows:
27
+    _id = row['id']
28
+    theme = row['theme']
29
+    first_type = row['first_type']
30
+    second_type = row['second_type']
31
+    theme = theme.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
32
+    first_type = first_type.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
33
+    second_type = second_type.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
34
+
35
+    sql = """
36
+        update odl.ad_tv_record_distribution set theme = '%s', first_type = '%s', second_type = '%s' where id = '%s'
37
+    """
38
+    sql = sql % (theme, first_type, second_type, _id)
39
+    Mysql.execute(sql, conn=conn)
40
+
41
+Mysql.close(conn)

+ 47 - 0
task_clean/scrapy_category_clean.py

@@ -0,0 +1,47 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+流程:爬取爱奇艺数据,爬取腾讯数据 -> 将两个分类去重合并 -> 将分类处理到关联表中,每个tv_id对应一个分类
7
+"""
8
+
9
+import datetime
10
+import os
11
+import sys
12
+import time
13
+
14
+from fty_util.common import Mysql
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+# 查询爬取到的爱奇艺分类和腾讯视频分类
22
+sql = """
23
+    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
24
+"""
25
+rows = Mysql.getAll(sql, conn=conn)
26
+
27
+for row in rows:
28
+    _id = row['id']
29
+    tv_name = row['tv_name']
30
+    iqiyi_types = row['iqiyi_types']
31
+    tengxun_types = row['tengxun_types']
32
+    all_types = set()
33
+    if iqiyi_types is not None and len(iqiyi_types) > 0:
34
+        for iqiyi_type in iqiyi_types.split(' '):
35
+            all_types.add(iqiyi_type)
36
+
37
+    if tengxun_types is not None and len(tengxun_types) > 0:
38
+        for tengxun_type in tengxun_types.split(' '):
39
+            all_types.add(tengxun_type)
40
+
41
+    sql = """
42
+        update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
43
+    """
44
+    sql = sql % (' '.join(all_types), _id)
45
+    Mysql.execute(sql, conn=conn)
46
+
47
+Mysql.close(conn)

+ 63 - 0
task_clean/scrapy_category_update.py

@@ -0,0 +1,63 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+"""
7
+
8
+import datetime
9
+import os
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+conn = Mysql.createOfflineConn()
19
+
20
+# 查询爬取到的爱奇艺分类和腾讯视频分类
21
+sql = """
22
+    select id, types from scrapy.tv_category_scrapy
23
+"""
24
+rows = Mysql.getAll(sql, conn=conn)
25
+
26
+for row in rows:
27
+    _id = row['id']
28
+    types = row['types']
29
+
30
+    if types is not None and len(types) > 0:
31
+        types_set = set()
32
+        for _type in types.split(' '):
33
+            if _type == '生活':
34
+                continue
35
+            elif _type == '军旅' or _type == '军事':
36
+                types_set.add('军旅')
37
+            elif _type == '惊悚' or _type == '恐怖':
38
+                types_set.add('恐怖')
39
+            elif _type == '魔幻' or _type == '奇幻':
40
+                types_set.add('奇幻')
41
+            elif _type == '偶像' or _type == '时装':
42
+                types_set.add('偶像')
43
+            elif _type == '喜剧' or _type == '搞笑':
44
+                types_set.add('喜剧')
45
+            elif _type == '悬疑' or _type == '冒险' or _type == '侦探':
46
+                types_set.add('悬疑')
47
+            elif _type == '言情' or _type == '情感' or _type == '爱情':
48
+                types_set.add('情感')
49
+            elif _type == '战争' or _type == '抗日' or _type == '革命':
50
+                types_set.add('战争')
51
+            elif _type == '警匪' or _type == '犯罪' or _type == '刑侦':
52
+                types_set.add('罪案')
53
+            elif _type == '传记' or _type == '人物' or _type == '传奇' or _type == '纪实':
54
+                types_set.add('传记')
55
+            else:
56
+                types_set.add(_type)
57
+        sql = """
58
+            update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
59
+        """
60
+        sql = sql % (' '.join(types_set), _id)
61
+        Mysql.execute(sql, conn=conn)
62
+
63
+Mysql.close(conn)
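Editor's note: the elif chain above encodes a fixed synonym table: each raw type maps to a canonical category, '生活' is dropped, and unknown types pass through unchanged. The same rules written as a lookup dict (copied from the script) make the groups easier to extend; this is an equivalent sketch, not a replacement committed here.

# -*- coding: utf-8 -*-
SYNONYMS = {
    u'军事': u'军旅', u'惊悚': u'恐怖', u'魔幻': u'奇幻', u'时装': u'偶像',
    u'搞笑': u'喜剧', u'冒险': u'悬疑', u'侦探': u'悬疑',
    u'言情': u'情感', u'爱情': u'情感',
    u'抗日': u'战争', u'革命': u'战争',
    u'警匪': u'罪案', u'犯罪': u'罪案', u'刑侦': u'罪案',
    u'人物': u'传记', u'传奇': u'传记', u'纪实': u'传记',
}

def normalize(types_str):
    out = set()
    for t in types_str.split(u' '):
        if not t or t == u'生活':
            continue
        out.add(SYNONYMS.get(t, t))  # identity mappings and unknown types pass through
    return u' '.join(out)

print(normalize(u'军事 爱情 生活 都市'))  # -> 军旅 情感 都市 (set order may vary)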

+ 55 - 0
task_clean/scrapy_dianshiju_clean.py

@@ -0,0 +1,55 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""清洗爬取到的分类数据
5
+
6
+流程:爬取爱奇艺数据,爬取腾讯数据 -> 将两个分类去重合并 -> 将分类处理到关联表中,每个tv_id对应一个分类
7
+"""
8
+
9
+import datetime
10
+import os
11
+import sys
12
+import time
13
+
14
+from fty_util.common import Mysql
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+# 爱奇艺数据清洗
20
+def iqiyi_content_clean():
21
+    pass
22
+
23
+# 腾讯数据清洗
24
+def tengxun_content_clean():
25
+    pass
26
+
27
+conn = Mysql.createOfflineConn()
28
+
29
+# 查询爬取到的爱奇艺分类和腾讯视频分类
30
+sql = """
31
+    select id, tv_name, iqiyi_types, tengxun_types from scrapy.tv_category_scrapy
32
+"""
33
+rows = Mysql.getAll(sql, conn=conn)
34
+
35
+for row in rows:
36
+    _id = row['id']
37
+    tv_name = row['tv_name']
38
+    iqiyi_types = row['iqiyi_types']
39
+    tengxun_types = row['tengxun_types']
40
+    all_types = set()
41
+    if iqiyi_types is not None and len(iqiyi_types) > 0:
42
+        for iqiyi_type in iqiyi_types.split(' '):
43
+            all_types.add(iqiyi_type)
44
+
45
+    if tengxun_types is not None and len(tengxun_types) > 0:
46
+        for tengxun_type in tengxun_types.split(' '):
47
+            all_types.add(tengxun_type)
48
+
49
+    sql = """
50
+        update scrapy.tv_category_scrapy set types = '%s' where id = '%s'
51
+    """
52
+    sql = sql % (' '.join(all_types), _id)
53
+    Mysql.execute(sql, conn=conn)
54
+
55
+Mysql.close(conn)

+ 39 - 0
task_clean/tv_category_relation.py

@@ -0,0 +1,39 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""将tv_category_scrapy表中的分类数据(多个)分割存到分类关联表中,记录为tv_id - category
5
+
6
+"""
7
+
8
+import sys
9
+import os
10
+import datetime
11
+from fty_util.common import Mysql
12
+import time
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+conn = Mysql.createOfflineConn()
18
+# 查询types字段非空的分类数据,按id升序处理
19
+
20
+sql = """
21
+    select id, types from scrapy.tv_category_scrapy where types is not null and types != '' order by id asc
22
+"""
23
+
24
+rows = Mysql.getAll(sql, conn=conn)
25
+for row in rows:
26
+    _id = row['id']
27
+    types = row['types']
28
+
29
+    type_list = types.split(' ')
30
+    
31
+    sql_insert = """
32
+        insert into odl.tv_category_relation (tv_id, category) values (%s, %s)
33
+    """
34
+    data_list = []
35
+    for _type in type_list:
36
+        data_list.append((_id, _type))
37
+    Mysql.insertMany(sql_insert, data_list, conn)
38
+
39
+Mysql.close(conn)

+ 297 - 0
task_clean/update_date.py

@@ -0,0 +1,297 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台收视率统计
5
+
6
+"""
7
+import datetime
8
+import re
9
+import sys
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+def parse_date(field, date_format):
15
+    """
16
+    日期转换
17
+    """
18
+    time_format = datetime.datetime.strptime(field, date_format)
19
+    time_format = time_format.strftime(u'%Y-%m-%d')
20
+    return time_format
21
+
22
+def parse_field(field):
23
+    """
24
+    处理字段,除了p_detail字段
25
+    """
26
+    if field is None or len(field) == 0:
27
+        return ""
28
+    else:
29
+        field = strQ2B(field)
30
+        return field.replace(' ', '')
31
+
32
+def strQ2B(ustring):
33
+    """
34
+    全角转半角
35
+    """
36
+    tstring = ""
37
+    for uchar in ustring:
38
+        inside_code = ord(uchar)
39
+        # 全角空格直接转换
40
+        if inside_code == 12288:
41
+            inside_code = 32
42
+        if inside_code == 160:
43
+            inside_code = 32
44
+        # 全角字符(除空格)根据关系转化
45
+        elif (inside_code >= 65281 and inside_code <= 65374):
46
+            inside_code -= 65248
47
+        
48
+        tstring += unichr(inside_code)
49
+    return tstring
50
+
51
+
52
+
53
+from dev_mysql_conn import Mysql
54
+
55
+def update_show_time():
56
+
57
+    conn = Mysql.createOfflineConn()
58
+
59
+    sql = """
60
+        select id, show_time from yxb.ad_tv_lib
61
+    """
62
+    rows = Mysql.getAll(sql, conn=conn)
63
+
64
+    for row in rows:
65
+        _id = row['id']
66
+        show_time = row['show_time']
67
+        if show_time is not None and len(show_time) > 0:
68
+            show_time = parse_field(show_time)
69
+            _str = show_time.decode('utf8')
70
+
71
+            # 格式xxxx年y月d日
72
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
73
+            # p = re.compile(xx)
74
+            # date_list = p.findall(_str)
75
+            # if date_list and len(date_list) > 0:
76
+            #     date = date_list[0]
77
+            #     show_date = parse_date(date, '%Y年%m月%d日')
78
+            #     print show_date
79
+            #     sql = """
80
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
81
+            #     """
82
+            #     sql = sql % (show_date, _id)
83
+            #     Mysql.update(sql, conn=conn)
84
+
85
+            # # 格式xxxx年y月d号
86
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u53f7]"
87
+            # p = re.compile(xx)
88
+            # date_list = p.findall(_str)
89
+            # if date_list and len(date_list) > 0:
90
+            #     date = date_list[0]
91
+            #     show_date = parse_date(date, '%Y年%m月%d号')
92
+            #     print _str
93
+            #     sql = """
94
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
95
+            #     """
96
+            #     sql = sql % (show_date, _id)
97
+            #     # Mysql.update(sql, conn=conn)
98
+
99
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
100
+            # p = re.compile(xx)
101
+            # date_list = p.findall(_str)
102
+            # if date_list and len(date_list) == 1:
103
+            #     print _str
104
+            #     date = date_list[0]
105
+            #     show_date = parse_date(date, '%Y年%m月')
106
+            #     sql = """
107
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
108
+            #     """
109
+            #     sql = sql % (show_date, _id)
110
+            #     # Mysql.update(sql, conn=conn)
111
+
112
+            # # 年月
113
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
114
+            # p = re.compile(xx)
115
+            # date_list = p.findall(_str)
116
+            # if date_list and len(date_list) > 0:
117
+            #     date = date_list[0]
118
+            #     show_time_date = parse_date(date, '%Y年%m月')
119
+            #     print _str
120
+            #     sql = """
121
+            #         update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
122
+            #     """
123
+            #     sql = sql % (show_time_date, _id)
124
+            #     Mysql.update(sql, conn=conn)
125
+
126
+            # 年
127
+            xx=ur"\d+[\u5e74]"
128
+            p = re.compile(xx)
129
+            date_list = p.findall(_str)
130
+            if date_list and len(date_list) > 0:
131
+                date = date_list[0]
132
+                show_time_date = parse_date(date, '%Y年')
133
+                print _str
134
+                sql = """
135
+                    update yxb.ad_tv_lib set show_time = '%s' where id = '%s'
136
+                """
137
+                sql = sql % (show_time_date, _id)
138
+                Mysql.update(sql, conn=conn)
139
+
140
+    Mysql.close(conn)
141
+
142
+# 更新dates字段
143
+def update_dates():
144
+    conn = Mysql.createOnlineConn()
145
+    sql = """
146
+        select id, dates from yxb.ad_tv_lib
147
+    """
148
+    rows = Mysql.getAll(sql, conn=conn)
149
+    for row in rows:
150
+        _id = row['id']
151
+        dates = row['dates']
152
+        if dates is not None and len(dates) > 0:
153
+            dates = parse_field(dates)
154
+            _str = dates.decode('utf8')
155
+
156
+            # # 年月日
157
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
158
+            # p = re.compile(xx)
159
+            # date_list = p.findall(_str)
160
+            # if date_list and len(date_list) > 0:
161
+            #     date = date_list[0]
162
+            #     dates_date = parse_date(date, '%Y年%m月%d日')
163
+            #     print _str
164
+            #     sql = """
165
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
166
+            #     """
167
+            #     sql = sql % (dates_date, _id)
168
+            #     Mysql.update(sql, conn=conn)
169
+
170
+            # # - -
171
+            # xx=ur"\d+[-]\d+[-]\d+"
172
+            # p = re.compile(xx)
173
+            # date_list = p.findall(_str)
174
+            # if date_list and len(date_list) > 0:
175
+            #     date = date_list[0]
176
+            #     dates_date = parse_date(date, '%Y-%m-%d')
177
+            #     print _str
178
+            #     sql = """
179
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
180
+            #     """
181
+            #     sql = sql % (dates_date, _id)
182
+            #     Mysql.update(sql, conn=conn)
183
+
184
+            # # 年月
185
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
186
+            # p = re.compile(xx)
187
+            # date_list = p.findall(_str)
188
+            # if date_list and len(date_list) > 0:
189
+            #     date = date_list[0]
190
+            #     dates_date = parse_date(date, '%Y年%m月')
191
+            #     print _str
192
+            #     sql = """
193
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
194
+            #     """
195
+            #     sql = sql % (dates_date, _id)
196
+            #     Mysql.update(sql, conn=conn)
197
+
198
+            # 年
199
+            xx=ur"\d+[\u5e74]"
200
+            p = re.compile(xx)
201
+            date_list = p.findall(_str)
202
+            if date_list and len(date_list) > 0:
203
+                date = date_list[0]
204
+                dates_date = parse_date(date, '%Y年')
205
+                print _str
206
+                sql = """
207
+                    update yxb.ad_tv_lib set dates = '%s' where id = '%s'
208
+                """
209
+                sql = sql % (dates_date, _id)
210
+                Mysql.update(sql, conn=conn)
211
+        else:
212
+            sql = """
213
+                update yxb.ad_tv_lib set dates = null where id = '%s'
214
+            """
215
+            sql = sql % (_id)
216
+            Mysql.update(sql, conn=conn)
217
+    Mysql.close(conn)
218
+
219
+def update_chupin_date():
220
+    conn = Mysql.createOnlineConn()
221
+    sql = """
222
+        select id, chupin_date from yxb.ad_tv_lib
223
+    """
224
+    rows = Mysql.getAll(sql, conn=conn)
225
+    for row in rows:
226
+        _id = row['id']
227
+        chupin_date = row['chupin_date']
228
+        if chupin_date is not None and len(chupin_date) > 0:
229
+            chupin_date = parse_field(chupin_date)
230
+            _str = chupin_date.decode('utf8')
231
+
232
+            # # 年月日
233
+            # xx=ur"\d+[\u5e74]\d+[\u6708]\d+[\u65e5]"
234
+            # p = re.compile(xx)
235
+            # date_list = p.findall(_str)
236
+            # if date_list and len(date_list) > 0:
237
+            #     date = date_list[0]
238
+            #     chupin_date_date = parse_date(date, '%Y年%m月%d日')
239
+            #     print _str
240
+            #     sql = """
241
+            #         update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
242
+            #     """
243
+            #     sql = sql % (chupin_date_date, _id)
244
+            #     Mysql.update(sql, conn=conn)
245
+
246
+            # # - -
247
+            # xx=ur"\d+[-]\d+[-]\d+"
248
+            # p = re.compile(xx)
249
+            # date_list = p.findall(_str)
250
+            # if date_list and len(date_list) > 0:
251
+            #     date = date_list[0]
252
+            #     dates_date = parse_date(date, '%Y-%m-%d')
253
+            #     print _str
254
+            #     sql = """
255
+            #         update yxb.ad_tv_lib set dates = '%s' where id = '%s'
256
+            #     """
257
+            #     sql = sql % (dates_date, _id)
258
+            #     Mysql.update(sql, conn=conn)
259
+
260
+            # # 年月
261
+            # xx=ur"\d+[\u5e74]\d+[\u6708]"
262
+            # p = re.compile(xx)
263
+            # date_list = p.findall(_str)
264
+            # if date_list and len(date_list) > 0:
265
+            #     date = date_list[0]
266
+            #     chupin_date_date = parse_date(date, '%Y年%m月')
267
+            #     print _str
268
+            #     sql = """
269
+            #         update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
270
+            #     """
271
+            #     sql = sql % (chupin_date_date, _id)
272
+            #     Mysql.update(sql, conn=conn)
273
+
274
+            # 年
275
+            xx=ur"\d+[\u5e74]"
276
+            p = re.compile(xx)
277
+            date_list = p.findall(_str)
278
+            if date_list and len(date_list) > 0:
279
+                date = date_list[0]
280
+                chupin_date_date = parse_date(date, '%Y年')
281
+                print _str
282
+                sql = """
283
+                    update yxb.ad_tv_lib set chupin_date = '%s' where id = '%s'
284
+                """
285
+                sql = sql % (chupin_date_date, _id)
286
+                Mysql.update(sql, conn=conn)
287
+        else:
288
+            sql = """
289
+                update yxb.ad_tv_lib set chupin_date = null where id = '%s'
290
+            """
291
+            sql = sql % (_id)
292
+            # Mysql.update(sql, conn=conn)
293
+    Mysql.close(conn)
294
+
295
+if __name__ == '__main__':
296
+    # update_show_time()
297
+    pass
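Editor's note: most of the regex branches in update_date.py are commented out; the live path keeps only the year ('年') pattern. End to end, a raw value goes through full-width-to-half-width conversion, a regex match, and strptime/strftime. A small self-contained check of that path (written in Python 3 syntax, whereas the repo itself is Python 2):

# -*- coding: utf-8 -*-
import datetime
import re

def str_q2b(ustring):
    # full-width to half-width, same rule as strQ2B() above
    out = []
    for ch in ustring:
        code = ord(ch)
        if code in (12288, 160):          # full-width / non-breaking space
            out.append(u' ')
        elif 65281 <= code <= 65374:      # full-width ASCII block
            out.append(chr(code - 65248))
        else:
            out.append(ch)
    return u''.join(out)

raw = u'2015年播出'                      # full-width digits
clean = str_q2b(raw).replace(u' ', u'')
year = re.findall(u'\\d+年', clean)[0]    # '2015年'
print(datetime.datetime.strptime(year, u'%Y年').strftime(u'%Y-%m-%d'))  # 2015-01-01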

+ 53 - 0
task_clean/update_first_type.py

@@ -0,0 +1,53 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台收视率统计
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+conn = Mysql.createOfflineConn()
15
+
16
+sql = """
17
+    select tv_id, tv_name, first_type from odl.ad_tv_lib where is_use = 1 and decade is null
18
+"""
19
+rows = Mysql.getAll(sql, conn=conn)
20
+
21
+for row in rows:
22
+    tv_id = row['tv_id']
23
+    tv_name = row['tv_name']
24
+    first_type = row['first_type']
25
+
26
+    sql = """
27
+        select name, theme from odl.ad_tv_record_distribution where name = '%s'
28
+    """
29
+    sql = sql % (tv_name)
30
+    row = Mysql.getOne(sql, conn=conn)
31
+    if row is not None:
32
+        theme = row[1]
33
+        if theme is not None and len(theme) > 0:
34
+            if first_type is None or len(first_type) == 0:
35
+                first_type = theme
36
+            else:
37
+                decade = theme[:2]
38
+                update_sql =  """
39
+                    update odl.ad_tv_lib set decade = '%s' where tv_id = '%s'
40
+                """
41
+                update_sql = update_sql % (decade, tv_id)
42
+                Mysql.update(update_sql, conn=conn)
43
+                # first_type = theme[:2] + first_type
44
+    # if first_type is not None:
45
+    #     # update_sql =  """
46
+    #     #     update odl.ad_tv_lib set first_type = '%s' where tv_id = '%s'
47
+    #     # """
48
+    #     update_sql =  """
49
+    #         update odl.ad_tv_lib set decade = '%s' where tv_id = '%s'
50
+    #     """
51
+    #     update_sql = update_sql % (first_type, tv_id)
52
+    #     Mysql.update(update_sql, conn=conn)
53
+Mysql.close(conn)

+ 32 - 0
task_idl/idl_ad_tv_record_distribution.py

@@ -0,0 +1,32 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+备案发行数据同步(odl.ad_tv_record_distribution -> idl.ad_tv_record_distribution)
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+sql_comment = """
18
+    truncate table idl.ad_tv_record_distribution
19
+"""
20
+sql = """
21
+    truncate table idl.ad_tv_record_distribution
22
+"""
23
+
24
+Mysql.execute(sql, conn=conn)
25
+
26
+sql = """
27
+    insert into idl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute)
28
+    select tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute from odl.ad_tv_record_distribution
29
+"""
30
+Mysql.execute(sql, conn=conn)
31
+
32
+Mysql.close(conn)

+ 31 - 0
task_idl/idl_tv_article_marketing_count.py

@@ -0,0 +1,31 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""新剧营销文章数量爬取
5
+
6
+"""
7
+
8
+import os
9
+import sys
10
+
11
+from fty_util.common import Mysql, Util
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+first_day = Util.get_first_date_of_yesterday()
17
+
18
+conn = Mysql.createOfflineConn()
19
+# 清空数据
20
+sql = """
21
+    truncate table idl.tv_article_marketing_count
22
+"""
23
+Mysql.execute(sql, conn=conn)
24
+
25
+sql = """
26
+    insert into idl.tv_article_marketing_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count)
27
+    select tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count from scrapy.scrapy_article_count order by tv_id asc
28
+"""
29
+Mysql.execute(sql, conn=conn)
30
+
31
+Mysql.close(conn)

+ 36 - 0
task_idl/idl_tv_article_marketing_detail.py

@@ -0,0 +1,36 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""新剧营销文章链接爬取
5
+
6
+"""
7
+
8
+import os
9
+import sys
10
+
11
+from fty_util.common import Mysql, Util
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+first_day = Util.get_first_date_of_yesterday()
17
+
18
+conn = Mysql.createOfflineConn()
19
+sql = """
20
+    select count(*) as num from scrapy.scrapy_article
21
+"""
22
+row = Mysql.getOne(sql, conn=conn)
23
+if row is not None and row[0] is not None and row[0] > 0:
24
+    # 清空数据
25
+    sql = """
26
+        truncate table idl.tv_article_marketing_detail
27
+    """
28
+    Mysql.execute(sql, conn=conn)
29
+
30
+    sql = """
31
+        insert into idl.tv_article_marketing_detail (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date)
32
+        select tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, date_format(scrapy_date, '%Y-%m-%d') from scrapy.scrapy_article order by id asc
33
+    """
34
+    Mysql.execute(sql, conn=conn)
35
+
36
+Mysql.close(conn)

+ 50 - 0
task_idl/idl_tv_avg_ratings_stat.py

@@ -0,0 +1,50 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""计算每个电视剧的收视率
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+class tv_avg_ratings_stat():
14
+    
15
+    def province(self):
16
+        conn = Mysql.createOfflineConn()
17
+        sql = """
18
+            truncate table idl.tv_avg_ratings
19
+        """
20
+        Mysql.execute(sql, conn=conn)
21
+        sql = """
22
+            insert into idl.tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value)
23
+            select channel, theater_attribute, tv_name, tv_id, tv_date, value from tmp.tv_avg_ratings
24
+        """
25
+        Mysql.execute(sql, conn=conn)
26
+        Mysql.close(conn)
27
+
28
+    def area(self):
29
+        conn = Mysql.createOfflineConn()
30
+        sql = """
31
+            truncate table idl.area_tv_avg_ratings
32
+        """
33
+        Mysql.execute(sql, conn=conn)
34
+        sql = """
35
+            insert into idl.area_tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value)
36
+            select channel, theater_attribute, tv_name, tv_id, tv_date, value from tmp.area_tv_avg_ratings
37
+        """
38
+        Mysql.execute(sql, conn=conn)
39
+        Mysql.close(conn)
40
+
41
+if __name__ == '__main__':
42
+    if len(sys.argv) != 2:
43
+        print '没有输入参数,退出'
44
+        sys.exit(1)
45
+    print 'method name is ' + sys.argv[1]
46
+    obj = tv_avg_ratings_stat()
47
+    try:
48
+        getattr(obj, sys.argv[1])()
49
+    except Exception, e:
50
+        print e
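Editor's note: idl_tv_avg_ratings_stat.py, idl_tv_category_stat.py and idl_year_channel_avg_ratings_stat.py all share this dispatch pattern: the first command-line argument names the method to run, for example "python task_idl/idl_tv_avg_ratings_stat.py province", and getattr() invokes it. Worth noting for the bash wrappers above: the try/except only prints the exception and the process still exits 0, so a failed sync will not trip the "$? -ne 0" checks in bash_daily.sh.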

+ 52 - 0
task_idl/idl_tv_category_stat.py

@@ -0,0 +1,52 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""近一年电视台平均收视率
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql, Util
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+class tv_category_stat():
15
+    def province(self):
16
+        conn = Mysql.createOfflineConn()
17
+        sql = """
18
+            truncate table idl.tv_category_stat
19
+        """
20
+        Mysql.execute(sql, conn=conn)
21
+        # 将数据从tmp库插到idl库
22
+        sql = """
23
+            insert into idl.tv_category_stat (tv_id, category, channel, theater_attribute)
24
+            select tv_id, category, channel, theater_attribute from tmp.tv_category_stat
25
+        """
26
+        Mysql.execute(sql, conn=conn)
27
+        Mysql.close(conn)
28
+
29
+    def area(self):
30
+        conn = Mysql.createOfflineConn()
31
+        sql = """
32
+            truncate table idl.area_tv_category_stat
33
+        """
34
+        Mysql.execute(sql, conn=conn)
35
+        # 将数据从tmp库插到idl库
36
+        sql = """
37
+            insert into idl.area_tv_category_stat (tv_id, category, channel, theater_attribute)
38
+            select tv_id, category, channel, theater_attribute from tmp.area_tv_category_stat
39
+        """
40
+        Mysql.execute(sql, conn=conn)
41
+        Mysql.close(conn)
42
+
43
+if __name__ == '__main__':
44
+    if len(sys.argv) != 2:
45
+        print '没有输入参数,退出'
46
+        sys.exit(1)
47
+    print 'method name is ' + sys.argv[1]
48
+    obj = tv_category_stat()
49
+    try:
50
+        getattr(obj, sys.argv[1])()
51
+    except Exception, e:
52
+        print e

+ 51 - 0
task_idl/idl_year_channel_avg_ratings_stat.py

@@ -0,0 +1,51 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""近一年电视台平均收视率
5
+
6
+"""
7
+import sys
8
+
9
+from fty_util.common import Mysql, Util
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+class channel_avg_ratings():
15
+    
16
+    def province(self):
17
+        conn = Mysql.createOfflineConn()
18
+        sql = """
19
+            truncate table idl.tv_channel_avg_ratings
20
+        """
21
+        Mysql.execute(sql, conn=conn)
22
+        sql = """
23
+            insert into idl.tv_channel_avg_ratings (channel, theater_attribute, value)
24
+            select channel, theater_attribute, value from tmp.channel_avg_ratings
25
+        """
26
+        Mysql.execute(sql, conn=conn)
27
+        Mysql.close(conn)
28
+
29
+    def area(self):
30
+        conn = Mysql.createOfflineConn()
31
+        sql = """
32
+            truncate table idl.area_tv_channel_avg_ratings
33
+        """
34
+        Mysql.execute(sql, conn=conn)
35
+        sql = """
36
+            insert into idl.area_tv_channel_avg_ratings (channel, theater_attribute, value)
37
+            select channel, theater_attribute, value from tmp.area_channel_avg_ratings
38
+        """
39
+        Mysql.execute(sql, conn=conn)
40
+        Mysql.close(conn)
41
+
42
+if __name__ == '__main__':
43
+    if len(sys.argv) != 2:
44
+        print '没有输入参数,退出'
45
+        sys.exit(1)
46
+    print 'method name is ' + sys.argv[1]
47
+    obj = channel_avg_ratings()
48
+    try:
49
+        getattr(obj, sys.argv[1])()
50
+    except Exception, e:
51
+        print e

+ 56 - 0
task_odl/odl_ad_audience_cps_time.py

@@ -0,0 +1,56 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+
17
+"""
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+sql = """
22
+    truncate table odl.ad_audience_cps_time
23
+"""
24
+Mysql.execute(sql, conn=conn)
25
+
26
+m = 0
27
+n = 50000
28
+
29
+sql_count = """
30
+    select count(id) from yxb.ad_audience_cps_time
31
+"""
32
+count = 0
33
+try:
34
+    count = Mysql.getOne(sql_count, conn=conn)[0]
35
+except Exception, e:
36
+    print e
37
+    pass
38
+# 每年数据循环导入
39
+while m <= count + n:
40
+    sql = """
41
+        insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
42
+            age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
43
+            edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none) 
44
+        select id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
45
+            age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
46
+            edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none
47
+        from yxb.ad_audience_cps_time
48
+        where timebucket = '全天' and area like 'CSM5%%'
49
+        limit %s, %s
50
+    """
51
+    sql = sql % (m, n)
52
+    print sql
53
+    Mysql.execute(sql, conn=conn)
54
+    m += n
55
+
56
+Mysql.close(conn)
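Editor's note: the copy loop above pages with "limit m, n" but no ORDER BY, so MySQL may return rows in a different order for each chunk and rows can be skipped or duplicated; the same pattern appears again in task_odl/odl_ad_television.py below. Paging on the primary key is the usual fix. A hedged sketch using the same fty_util helpers as the script, with the column list shortened for readability:

from fty_util.common import Mysql

conn = Mysql.createOfflineConn()
last_id = 0
batch = 50000
while True:
    sql = (
        "insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num) "
        "select id, tv_date, type, area, channel, timebucket, total_num "
        "from yxb.ad_audience_cps_time "
        "where id > %d and timebucket = '全天' and area like 'CSM5%%' "
        "order by id limit %d"
    ) % (last_id, batch)
    Mysql.execute(sql, conn=conn)
    row = Mysql.getOne("select max(id) from odl.ad_audience_cps_time", conn=conn)
    if row is None or row[0] is None or row[0] == last_id:
        break  # nothing new copied in this pass
    last_id = row[0]
Mysql.close(conn)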

+ 57 - 0
task_odl/odl_ad_audience_cps_time_incr_update.py

@@ -0,0 +1,57 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+
17
+"""
18
+
19
+conn = Mysql.createOfflineConn()
20
+
21
+# sql = """
22
+#     truncate table odl.ad_audience_cps_time
23
+# """
24
+# Mysql.execute(sql, conn=conn)
25
+
26
+# m = 0
27
+# n = 50000
28
+
29
+# sql_count = """
30
+#     select count(id) from yxb.ad_audience_cps_time
31
+# """
32
+# count = 0
33
+# try:
34
+#     count = Mysql.getOne(sql_count, conn=conn)[0]
35
+# except Exception, e:
36
+#     print e
37
+#     pass
38
+
39
+sql = """
40
+    select max(tv_date) as max_date from odl.ad_audience_cps_time
41
+"""
42
+row = Mysql.getOne(sql, conn=conn)
43
+max_date = row[0]
44
+
45
+sql = """
46
+    insert into odl.ad_audience_cps_time (id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
47
+        age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
48
+        edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none) 
49
+    select id, tv_date, type, area, channel, timebucket, total_num, sex_man, sex_woman, age_414, 
50
+        age_1524, age_2534, age_3544, age_4554, age_5565, age_65, edu_none, edu_primary, edu_middle, edu_high, 
51
+        edu_college, job_manager, job_single, job_civilian, job_worker, job_student, job_none, job_other, inc_2000, inc_2035, inc_3550, inc_5059, inc_5901, inc_none
52
+    from yxb.ad_audience_cps_time
53
+    where tv_date > '%s' and timebucket = '全天' and area like 'CSM5%%'
54
+"""
55
+sql = sql % (max_date)
56
+Mysql.execute(sql, conn=conn)
57
+Mysql.close(conn)

+ 68 - 0
task_odl/odl_ad_television.py

@@ -0,0 +1,68 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+从yxb.ad_television_(2010,2011,2012,2013,2014,2015,2016)
17
+yxb.ad_rating_(2010,2011,2012,2013,2014,2015,2016)
18
+提取数据插入到odl.ad_television表中,作为数据分析来源数据
19
+"""
20
+
21
+conn = Mysql.createOfflineConn()
22
+
23
+sql_comment = """
24
+    truncate table odl.ad_television
25
+"""
26
+sql = """
27
+    truncate table odl.ad_television
28
+"""
29
+try:
30
+    Mysql.execute(sql, conn=conn)
31
+    print '清空odl.ad_television表成功'
32
+except Exception, e:
33
+    print '清空odl.ad_television表出错'
34
+
35
+for year in range(2010, 2017):
36
+    m = 0
37
+    n = 50000
38
+
39
+    sql_count = """
40
+        select count(id) from yxb.ad_television_%s
41
+    """
42
+    sql_count = sql_count % (year)
43
+    count = 0
44
+    try:
45
+        count = Mysql.getOne(sql_count, conn=conn)[0]
46
+    except Exception, e:
47
+        print e
48
+        pass
49
+    # 每年数据循环导入
50
+    while m <= count + n:
51
+        sql = """
52
+            insert into odl.ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
53
+                theater_attribute, property, is_repeat, city, year, area, audience_num, audience_rating, avg_num, avg_rating, \
54
+                market_rating, avg_fans, avg_view_time) \
55
+            select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
56
+            aty.property, aty.is_repeat, aty.city, %s, \
57
+            ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
58
+            from yxb.ad_television_%s aty \
59
+            left join yxb.ad_rating_%s ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
60
+            left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
61
+            limit %s, %s
62
+        """
63
+        sql = sql % (year, year, year, m, n)
64
+        print sql
65
+        Mysql.execute(sql, conn=conn)
66
+        m += n
67
+
68
+Mysql.close(conn)

+ 67 - 0
task_odl/odl_ad_television_incr_update.py

@@ -0,0 +1,67 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据增量更新
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+sql = """
19
+    select max(tv_date) as max_date from odl.ad_television
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_date = row[0]
23
+
24
+# year = datetime.datetime.strptime(max_date, '%Y-%m-%d').year
25
+sql = """
26
+    insert into odl.ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
27
+        theater_attribute, property, is_repeat, city, year, area, audience_num, audience_rating, avg_num, avg_rating, \
28
+        market_rating, avg_fans, avg_view_time) \
29
+    select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
30
+    aty.property, aty.is_repeat, aty.city, substring_index(aty.tv_date, '-', 1), \
31
+    ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
32
+    from yxb.ad_television_2016 aty \
33
+    left join yxb.ad_rating_2016 ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
34
+    left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
35
+    where aty.tv_date > '%s'
36
+"""
37
+sql = sql % (max_date)
38
+Mysql.execute(sql, conn=conn)
39
+
40
+"""
41
+凌晨剧场 0:00 - 6:00
42
+早间剧场:7:00-9:00
43
+上午剧场 9:00-12:00
44
+下午剧场 14:00-18:00
45
+晚间剧场 18:00-24:00
46
+黄金剧场:19:30-21:30
47
+"""
48
+sql = """
49
+    update odl.ad_television
50
+    set theater_attribute = 
51
+    (
52
+    case 
53
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
54
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
55
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
56
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
57
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
58
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
59
+    end
60
+    )
61
+    where tv_date > '%s' and (theater_attribute is null or theater_attribute = '')
62
+"""
63
+sql = sql % (max_date)
64
+
65
+Mysql.execute(sql, conn=conn)
66
+
67
+Mysql.close(conn)
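Editor's note: for reference, the theater_attribute CASE expression above rewritten as a plain function, with the time strings parsed so the comparisons are on datetime.time values rather than unpadded strings (the '24:00:00' upper bound is dropped because every valid time already satisfies it). Illustrative only; the authoritative logic is the SQL in the diff.

import datetime

def theater(start, end):
    t = lambda s: datetime.datetime.strptime(s, '%H:%M:%S').time()
    start, end = t(start), t(end)
    if start >= t('00:00:00') and end <= t('06:00:00'):
        return u'凌晨剧场'
    if start >= t('07:00:00') and end < t('09:00:00'):
        return u'早间剧场'
    if start >= t('09:00:00') and end <= t('12:00:00'):
        return u'上午剧场'
    if start >= t('14:00:00') and end < t('18:00:00'):
        return u'下午剧场'
    if start >= t('19:30:00') and end <= t('21:30:00'):
        return u'黄金剧场'
    if (start >= t('18:00:00') and end < t('19:30:00')) or start > t('21:30:00'):
        return u'晚间剧场'
    return None

print(theater('19:35:00', '21:20:00'))  # 黄金剧场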

+ 65 - 0
task_odl/odl_ad_tv_lib.py

@@ -0,0 +1,65 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""odl.ad_tv_lib表数据导入
5
+
6
+从yxb.ad_tv_lib提取数据插入到odl.ad_tv_lib表中,作为数据分析来源数据
7
+"""
8
+
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+# 清空odl.ad_tv_lib表数据
19
+# sql = """
20
+#     truncate table odl.ad_tv_lib
21
+# """
22
+# Mysql.execute(sql, conn=conn)
23
+
24
+# 电视剧信息表
25
+sql = """
26
+    replace into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
27
+        pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, decade, theme) \
28
+    select id, tv_name, director, scriptwritter, main_actors, types, \
29
+    case when decade is not null and first_type is not null then concat(decade, first_type) else null end as first_type, second_type, \
30
+    description, pub_comp, pub_date, production, \
31
+    cehua, jianzhi, chupin_comp, chupin_date, show_time, is_use, decade, first_type from yxb.ad_tv_lib
32
+"""
33
+Mysql.execute(sql, conn=conn)
34
+
35
+# # 更新字段
36
+# sql = """
37
+#     update odl.ad_tv_lib set first_type = substring_index(replace(first_type, ' ', ','), ',', 1) where first_type is not null
38
+# """
39
+# Mysql.execute(sql, conn=conn)
40
+
41
+
42
+# # 清空odl.ad_tv_lib_filter表数据
43
+# sql = """
44
+#     truncate table odl.ad_tv_lib_filter
45
+# """
46
+# Mysql.execute(sql, conn=conn)
47
+
48
+# # 插入不需要过滤的电视剧
49
+# sql = """
50
+#     insert into odl.ad_tv_lib_filter (tv_id, tv_name) \
51
+#     select distinct tv_id, tv_name from odl.ad_television group by tv_id, tv_name
52
+# """
53
+# Mysql.execute(sql, conn=conn)
54
+
55
+# # 更新odl.ad_tv_lib的is_use字段
56
+# sql = """
57
+#     update odl.ad_tv_lib atl inner join odl.ad_tv_lib_filter atlf \
58
+#     on atlf.tv_id = atl.id or atlf.tv_name = atl.tv_name
59
+#     set atl.is_use = 1
60
+#     where atlf.tv_id is not null or atlf.tv_name is not null
61
+# """
62
+
63
+# Mysql.execute(sql, conn=conn)
64
+
65
+Mysql.close(conn)
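Editor's note on the import above: "replace into" is MySQL's delete-then-insert on key collision, so for any tv_id that already exists in odl.ad_tv_lib the old row is removed and a fresh one written, and columns not named in the column list fall back to their defaults. If the intent is to preserve manually maintained columns on existing rows, "insert ... on duplicate key update" would be the gentler alternative.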

+ 35 - 0
task_odl/odl_ad_tv_lib_insert.py

@@ -0,0 +1,35 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""odl.ad_tv_lib表更新数据
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+# 查询odl.ad_tv_lib表当前最大tv_id,用于增量插入
18
+sql = """
19
+    select max(tv_id) as tv_id from odl.ad_tv_lib
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_tv_id = row[0]
23
+
24
+# 电视剧信息表
25
+sql = """
26
+    insert into odl.ad_tv_lib (tv_id, tv_name, director, scriptwriter, main_actors, types, first_type, second_type, description, \
27
+        pub_comp, pub_date, filmer, scheming, producer, produce_comp, produce_date, show_time, is_use, categories) \
28
+    select id, tv_name, director, scriptwritter, main_actors, types, first_type, second_type, \
29
+    description, pub_comp, pub_date, production, \
30
+    cehua, jianzhi, chupin_comp, chupin_date, show_time, '0', categories from yxb.ad_tv_lib where id > '%s'
31
+"""
32
+sql = sql % (max_tv_id)
33
+Mysql.execute(sql, conn=conn)
34
+
35
+Mysql.close(conn)

+ 279 - 0
task_odl/odl_ad_tv_record_distribution.py

@@ -0,0 +1,279 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""备案、发行表数据处理
5
+
6
+"""
7
+
8
+import datetime
9
+import re
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+def parse_date(field, date_format):
19
+    """
20
+    日期转换
21
+    """
22
+    time_format = datetime.datetime.strptime(field, date_format)
23
+    time_format = time_format.strftime(u'%Y-%m-%d')
24
+    return time_format
25
+
26
+def parse_field(field):
27
+    """
28
+    处理字段,除了p_detail字段
29
+    """
30
+    if field is None or len(field) == 0:
31
+        return ""
32
+    else:
33
+        field = strQ2B(field)
34
+        return field.replace(' ', '')
35
+
36
+def strQ2B(ustring):
37
+    """
38
+    全角转半角
39
+    """
40
+    tstring = ""
41
+    for uchar in ustring:
42
+        inside_code = ord(uchar)
43
+        # 全角空格直接转换
44
+        if inside_code == 12288:
45
+            inside_code = 32
46
+        if inside_code == 160:
47
+            inside_code = 32
48
+        # 全角字符(除空格)根据关系转化
49
+        elif (inside_code >= 65281 and inside_code <= 65374):
50
+            inside_code -= 65248
51
+        
52
+        tstring += unichr(inside_code)
53
+    return tstring
54
+
55
+
56
+"""
57
+从 odl.dsj_gongshi(电视剧备案数据)和odl.faxing(电视剧发行数据)表中提取数据到odl.ad_tv_record_distribution表中,作为数据分析来源数据
58
+"""
59
+
60
+conn = Mysql.createOfflineConn()
61
+
62
+# 备案、发行表
63
+sql = """
64
+    select id, name, area, theme, company, commit_company, show_date, license_id, form, num, film_date, film_period, `desc`, \
65
+    province_advice, relative_depart_advice, remark, scrapy_url, scrapy_date, scrapy_title, p_detail, scrapy_detail_url, \
66
+    union_company, scriptwriter, director from odl.dsj_gongshi order by id asc
67
+"""
68
+
69
+rows = Mysql.getAll(sql, conn=conn)
70
+
71
+# conn_max = dev_mysql_conn.Mysql()
72
+sql_max = """
73
+    select max(tv_id) tv_id from odl.ad_tv_record_distribution
74
+"""
75
+row_max = Mysql.getOne(sql_max, conn=conn)
76
+max_id = 0
77
+if row_max is not None and len(row_max) > 0:
78
+    max_id = row_max[0]
79
+
80
+for row in rows:
81
+    _id = row['id']
82
+    if _id <= max_id:
83
+        continue
84
+    # 将p_detail字段转换为commit_company、show_date、license_id三个字段
85
+    p_detail = parse_field(row['p_detail'])
86
+    commit_company = ''
87
+    show_date = ''
88
+    license_id = ''
89
+    # 如果p_detail字段为空,则直接处理
90
+    if p_detail is None or len(p_detail) == 0:
91
+        commit_company = parse_field(row['commit_company'])
92
+        show_date = parse_field(row['show_date'])
93
+        show_date = show_date.replace(u'年', '-').replace(u'月', '-')
94
+        show_date = parse_date(show_date, u'%Y-%m-')
95
+        license_id = parse_field(row['license_id'])
96
+    else:
97
+        try:
98
+            str = p_detail.decode('utf8')
99
+            xx=ur"\d+[\u5e74]\d+[\u6708]"
100
+            p = re.compile(xx)
101
+            date = p.findall(str)[0]
102
+            company_license = str.split(date)
103
+
104
+            commit_company = company_license[0]
105
+            show_date = date
106
+            show_date = parse_date(show_date, '%Y年%m月')
107
+            license_id = company_license[-1]
108
+        except Exception, e:
109
+
110
+            pass
111
+    # 原始格式 2016.11
112
+    film_date = parse_field(row['film_date'])
113
+    try:
114
+        film_date_pattern = re.compile('\d+')
115
+        year_month = film_date_pattern.findall(film_date)
116
+        year = year_month[0]
117
+        month = year_month[1]
118
+        film_date = parse_date(str(year) + '.' + str(month), '%Y.%m')
119
+    except Exception, e:
120
+        film_date = ''
121
+
122
+    # film_date = parse_date(film_date, '%Y.%m')
123
+
124
+    film_period = parse_field(row['film_period'])
125
+
126
+    name = parse_field(row['name'])
127
+    area = parse_field(row['area'])
128
+    theme = parse_field(row['theme'])
129
+    company = parse_field(row['company'])
130
+    if company is None or len(company) == 0:
131
+        company = commit_company
132
+    form = parse_field(row['form'])
133
+    num = parse_field(row['num'])
134
+    desc = parse_field(row['desc'])
135
+    province_advice = parse_field(row['province_advice'])
136
+    relative_depart_advice = parse_field(row['relative_depart_advice'])
137
+    remark = parse_field(row['remark'])
138
+    scrapy_url = parse_field(row['scrapy_url'])
139
+    scrapy_date = row['scrapy_date']
140
+    scrapy_title = parse_field(row['scrapy_title'])
141
+    scrapy_detail_url = parse_field(row['scrapy_detail_url'])
142
+    union_company = parse_field(row['union_company'])
143
+    scriptwriter = parse_field(row['scriptwriter'])
144
+    director = parse_field(row['director'])
145
+    # 变更后的名称
146
+    current_name = ''
147
+    # 变更后的公司
148
+    current_company = ''
149
+    # 变更后的集数
150
+    current_num = 0
151
+
152
+    # conn_change_type1 = dev_mysql_conn.Mysql()
153
+    # 电视剧名称变更
154
+    sql_change_type1 = """
155
+        select original_name, current_name from odl.dsj_change where original_name = '%s' and change_type = 1
156
+    """
157
+    sql_change_type1 = sql_change_type1 % (name)
158
+    sql_change_type1_rows = Mysql.getAll(sql_change_type1, conn=conn)
159
+    
160
+    if len(sql_change_type1_rows) == 1:
161
+        current_name = sql_change_type1_rows[0]['current_name']
162
+    elif len(sql_change_type1_rows) > 1:
163
+        # 如果多于一条记录,则存放在dict中
164
+        name_dict = {}
165
+        for row in sql_change_type1_rows:
166
+            original_name = row['original_name']
167
+            current_name = row['current_name']
168
+            name_dict[original_name] = current_name
169
+        while True:
170
+            if len(name_dict) > 1:
171
+                current_name = name_dict.get(name)
172
+                if current_name is None or len(current_name) == 0:
173
+                    current_name = ''
174
+                    break
175
+                del name_dict[name]
176
+            else:
177
+                current_name = name_dict.get(current_name)
178
+                break
179
+
180
+    # conn_change_type2 = dev_mysql_conn.Mysql()
181
+    # 类型2变更
182
+    sql_change_type2 = """
183
+        select name, original_company, current_company from odl.dsj_change where name = '%s' and change_type = 2
184
+    """
185
+    sql_change_type2 = sql_change_type2 % (name)
186
+    sql_change_type2_rows = Mysql.getAll(sql_change_type2, conn=conn)
187
+    
188
+    if len(sql_change_type2_rows) == 1:
189
+        current_company = sql_change_type2_rows[0]['current_company']
190
+    elif len(sql_change_type2_rows) > 1:
191
+        company_dict = {}
192
+        for row in sql_change_type2_rows:
193
+            original_company = row['original_company']
194
+            current_company = row['current_company']
195
+            company_dict[original_company] = current_company
196
+        while True:
197
+            if len(company_dict) > 1:
198
+                current_company = company_dict.get(company)
199
+                if current_company is None or len(current_company) == 0:
200
+                    current_company = ''
201
+                    break
202
+                del company_dict[company]
203
+            else:
204
+                current_company = company_dict.get(current_company)
205
+                break
206
+
207
+
208
+    # conn_change_type3 = dev_mysql_conn.Mysql()
209
+    # 类型3变更
210
+    sql_change_type3 = """
211
+        select name, original_num, current_num from odl.dsj_change where name = '%s' and change_type = 3
212
+    """
213
+    sql_change_type3 = sql_change_type3 % (name)
214
+    sql_change_type3_rows = Mysql.getAll(sql_change_type3, conn=conn)
215
+
216
+    if len(sql_change_type3_rows) == 1:
217
+        current_num = sql_change_type3_rows[0]['current_num']
218
+    elif len(sql_change_type3_rows) > 1:
219
+        num_dict = {}
220
+        for row in sql_change_type3_rows:
221
+            original_num = row['original_num']
222
+            current_num = row['current_num']
223
+            num_dict[original_num] = current_num
224
+        while True:
225
+            if len(num_dict) > 1:
226
+                current_num = num_dict.get(num)
227
+                if current_num is None or len(current_num) == 0:
228
+                    current_num = 0
229
+                    break
230
+                del num_dict[num]
231
+            else:
232
+                current_num = num_dict.get(current_num)
233
+                break
234
+
235
+    if current_name is None or len(current_name) == 0:
236
+        current_name = name
237
+
238
+    # 发行数据查询
239
+    sql_distribution = """
240
+        select name, company, num, pub_date from odl.faxing where name = '%s'
241
+    """
242
+    sql_distribution = sql_distribution % (current_name)
243
+    try:
244
+        # conn_distribution = dev_mysql_conn.Mysql()
245
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
246
+    except Exception, e:
247
+        # conn_distribution = dev_mysql_conn.Mysql()
248
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
249
+    is_distribute = 0
250
+    pub_date = ''
251
+    if len(sql_distribution_rows) >= 1:
252
+        pub_date = sql_distribution_rows[0]['pub_date']
253
+        is_distribute = 1
254
+
255
+    sql_insert = """
256
+        insert into odl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, \
257
+        film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute) values ('%s', '%s', '%s', '%s', '%s', \
258
+        '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
259
+    """
260
+    sql_insert = sql_insert % (_id, name, current_name, area, theme, theme, theme, company, show_date, form, num, film_date, film_period, desc.replace("'", "\\'"),
261
+        scriptwriter, director, pub_date, is_distribute)
262
+
263
+    print sql_insert
264
+    try_times = 0
265
+    while True:
266
+        if try_times > 3:
267
+            break
268
+        try:
269
+            # conn1 = dev_mysql_conn.Mysql()
270
+            Mysql.insertOne(sql_insert, conn=conn)
271
+            try_times = 0
272
+            break
273
+        except Exception, e:
274
+            try_times += 1
275
+            print e
276
+    if try_times > 3:
277
+        break
278
+
279
+Mysql.close(conn)
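
The loop above folds three kinds of changes from odl.dsj_change into each registration row (renamed title, changed production company, changed episode count) before checking odl.faxing for a distribution date. Because each change query filters on the original value, the dict built in the multi-row branch can only ever hold one key, so a chain of renames (A to B, then B to C) is never actually followed to the end. The intent is easier to express as a plain chain walk over a preloaded mapping; a database-free Python 3 sketch, in which resolve_current_name and the sample titles are illustrative:

# -*- coding: utf-8 -*-

def resolve_current_name(name, renames):
    """Follow original -> current rename mappings until the chain ends.

    renames: dict such as {"old title": "new title", ...}, e.g. built once from
    all odl.dsj_change rows with change_type = 1 (illustrative, not the repo API).
    """
    seen = set()
    current = name
    while current in renames and current not in seen:
        seen.add(current)              # protects against accidental rename cycles
        current = renames[current]
    return current

if __name__ == "__main__":
    chain = {u"旧剧名A": u"旧剧名B", u"旧剧名B": u"最终剧名"}
    print(resolve_current_name(u"旧剧名A", chain))   # -> 最终剧名
    print(resolve_current_name(u"未改名剧", chain))  # unchanged

The same walk applies to the company (change_type = 2) and episode-count (change_type = 3) mappings.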

+ 279 - 0
task_odl/odl_ad_tv_record_distribution_insert.py

@@ -0,0 +1,279 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""备案、发行表数据处理
5
+
6
+"""
7
+
8
+import datetime
9
+import re
10
+import sys
11
+import time
12
+
13
+from fty_util.common import Mysql
14
+
15
+reload(sys)
16
+sys.setdefaultencoding('utf8')
17
+
18
+def parse_date(field, date_format):
19
+    """
20
+    日期转换
21
+    """
22
+    time_format = datetime.datetime.strptime(field, date_format)
23
+    time_format = time_format.strftime(u'%Y-%m-%d')
24
+    return time_format
25
+
26
+def parse_field(field):
27
+    """
28
+    处理字段,除了p_detail字段
29
+    """
30
+    if field is None or len(field) == 0:
31
+        return ""
32
+    else:
33
+        field = strQ2B(field)
34
+        return field.replace(' ', '')
35
+
36
+def strQ2B(ustring):
37
+    """
38
+    全角转半角
39
+    """
40
+    tstring = ""
41
+    for uchar in ustring:
42
+        inside_code = ord(uchar)
43
+        # 全角空格直接转换
44
+        if inside_code == 12288:
45
+            inside_code = 32
46
+        if inside_code == 160:
47
+            inside_code = 32
48
+        # 全角字符(除空格)根据关系转化
49
+        elif (inside_code >= 65281 and inside_code <= 65374):
50
+            inside_code -= 65248
51
+        
52
+        tstring += unichr(inside_code)
53
+    return tstring
54
+
55
+
56
+"""
57
+从 odl.dsj_gongshi(电视剧备案数据)和odl.faxing(电视剧发行数据)表中提取数据到odl.ad_tv_record_distribution表中,作为数据分析来源数据
58
+"""
59
+
60
+conn = Mysql.createOfflineConn()
61
+
62
+# 备案、发行表
63
+sql = """
64
+    select id, name, area, theme, company, commit_company, show_date, license_id, form, num, film_date, film_period, `desc`, \
65
+    province_advice, relative_depart_advice, remark, scrapy_url, scrapy_date, scrapy_title, p_detail, scrapy_detail_url, \
66
+    union_company, scriptwriter, director from odl.dsj_gongshi order by id asc
67
+"""
68
+
69
+rows = Mysql.getAll(sql, conn=conn)
70
+
71
+# conn_max = dev_mysql_conn.Mysql()
72
+sql_max = """
73
+    select max(tv_id) tv_id from odl.ad_tv_record_distribution
74
+"""
75
+row_max = Mysql.getOne(sql_max, conn=conn)
76
+max_id = 0
77
+if row_max is not None and len(row_max) > 0:
78
+    max_id = row_max[0]
79
+
80
+for row in rows:
81
+    _id = row['id']
82
+    if _id <= max_id:
83
+        continue
84
+    # 将p_detail字段转换为commit_company、show_date、license_id三个字段
85
+    p_detail = parse_field(row['p_detail'])
86
+    commit_company = ''
87
+    show_date = ''
88
+    license_id = ''
89
+    # 如果p_detail字段为空,则直接处理
90
+    if p_detail is None or len(p_detail) == 0:
91
+        commit_company = parse_field(row['commit_company'])
92
+        show_date = parse_field(row['show_date'])
93
+        show_date = show_date.replace(u'年', '-').replace(u'月', '-')
94
+        show_date = parse_date(show_date, u'%Y-%m-')
95
+        license_id = parse_field(row['license_id'])
96
+    else:
97
+        try:
98
+            detail_str = p_detail.decode('utf8')
99
+            xx = ur"\d+[\u5e74]\d+[\u6708]"
100
+            p = re.compile(xx)
101
+            date = p.findall(detail_str)[0]
102
+            company_license = detail_str.split(date)
103
+
104
+            commit_company = company_license[0]
105
+            show_date = date
106
+            show_date = parse_date(show_date, '%Y年%m月')
107
+            license_id = company_license[-1]
108
+        except Exception, e:
109
+
110
+            pass
111
+    # 原始格式 2016.11
112
+    film_date = parse_field(row['film_date'])
113
+    try:
114
+        film_date_pattern = re.compile('\d+')
115
+        year_month = film_date_pattern.findall(film_date)
116
+        year = year_month[0]
117
+        month = year_month[1]
118
+        film_date = parse_date(str(year) + '.' + str(month), '%Y.%m')
119
+    except Exception, e:
120
+        film_date = ''
121
+
122
+    # film_date = parse_date(film_date, '%Y.%m')
123
+
124
+    film_period = parse_field(row['film_period'])
125
+
126
+    name = parse_field(row['name'])
127
+    area = parse_field(row['area'])
128
+    theme = parse_field(row['theme'])
129
+    company = parse_field(row['company'])
130
+    if company is None or len(company) == 0:
131
+        company = commit_company
132
+    form = parse_field(row['form'])
133
+    num = parse_field(row['num'])
134
+    desc = parse_field(row['desc'])
135
+    province_advice = parse_field(row['province_advice'])
136
+    relative_depart_advice = parse_field(row['relative_depart_advice'])
137
+    remark = parse_field(row['remark'])
138
+    scrapy_url = parse_field(row['scrapy_url'])
139
+    scrapy_date = row['scrapy_date']
140
+    scrapy_title = parse_field(row['scrapy_title'])
141
+    scrapy_detail_url = parse_field(row['scrapy_detail_url'])
142
+    union_company = parse_field(row['union_company'])
143
+    scriptwriter = parse_field(row['scriptwriter'])
144
+    director = parse_field(row['director'])
145
+    # 变更后的名称
146
+    current_name = ''
147
+    # 变更后的公司
148
+    current_company = ''
149
+    # 变更后的集数
150
+    current_num = 0
151
+
152
+    # conn_change_type1 = dev_mysql_conn.Mysql()
153
+    # 电视剧名称变更
154
+    sql_change_type1 = """
155
+        select original_name, current_name from odl.dsj_change where original_name = '%s' and change_type = 1
156
+    """
157
+    sql_change_type1 = sql_change_type1 % (name)
158
+    sql_change_type1_rows = Mysql.getAll(sql_change_type1, conn=conn)
159
+    
160
+    if len(sql_change_type1_rows) == 1:
161
+        current_name = sql_change_type1_rows[0]['current_name']
162
+    elif len(sql_change_type1_rows) > 1:
163
+        # 如果多余一条记录,则存放在dict中
164
+        name_dict = {}
165
+        for row in sql_change_type1_rows:
166
+            original_name = row['original_name']
167
+            current_name = row['current_name']
168
+            name_dict[original_name] = current_name
169
+        while True:
170
+            if len(name_dict) > 1:
171
+                current_name = name_dict.get(name)
172
+                if current_name is None or len(current_name) == 0:
173
+                    current_name = ''
174
+                    break
175
+                del name_dict[name]
176
+            else:
177
+                current_name = name_dict.get(current_name)
178
+                break
179
+
180
+    # conn_change_type2 = dev_mysql_conn.Mysql()
181
+    # 类型2变更
182
+    sql_change_type2 = """
183
+        select name, original_company, current_company from odl.dsj_change where name = '%s' and change_type = 2
184
+    """
185
+    sql_change_type2 = sql_change_type2 % (name)
186
+    sql_change_type2_rows = Mysql.getAll(sql_change_type2, conn=conn)
187
+    
188
+    if len(sql_change_type2_rows) == 1:
189
+        current_company = sql_change_type2_rows[0]['current_company']
190
+    elif len(sql_change_type2_rows) > 1:
191
+        company_dict = {}
192
+        for row in sql_change_type2_rows:
193
+            original_company = row['original_company']
194
+            current_company = row['current_company']
195
+            company_dict[original_company] = current_company
196
+        while True:
197
+            if len(company_dict) > 1:
198
+                current_company = company_dict.get(company)
199
+                if current_company is None or len(current_company) == 0:
200
+                    current_company = ''
201
+                    break
202
+                del company_dict[company]
203
+            else:
204
+                current_company = company_dict.get(current_company)
205
+                break
206
+
207
+
208
+    # conn_change_type3 = dev_mysql_conn.Mysql()
209
+    # 类型3变更
210
+    sql_change_type3 = """
211
+        select name, original_num, current_num from odl.dsj_change where name = '%s' and change_type = 3
212
+    """
213
+    sql_change_type3 = sql_change_type3 % (name)
214
+    sql_change_type3_rows = Mysql.getAll(sql_change_type3, conn=conn)
215
+
216
+    if len(sql_change_type3_rows) == 1:
217
+        current_num = sql_change_type3_rows[0]['current_num']
218
+    elif len(sql_change_type3_rows) > 1:
219
+        num_dict = {}
220
+        for row in sql_change_type3_rows:
221
+            original_num = row['original_num']
222
+            current_num = row['current_num']
223
+            num_dict[original_num] = current_num
224
+        while True:
225
+            if len(num_dict) > 1:
226
+                current_num = num_dict.get(num)
227
+                if current_num is None or len(current_num) == 0:
228
+                    current_num = 0
229
+                    break
230
+                del num_dict[num]
231
+            else:
232
+                current_num = num_dict.get(current_num)
233
+                break
234
+
235
+    if current_name is None or len(current_name) == 0:
236
+        current_name = name
237
+
238
+    # 发行数据查询
239
+    sql_distribution = """
240
+        select name, company, num, pub_date from odl.faxing where name = '%s'
241
+    """
242
+    sql_distribution = sql_distribution % (current_name)
243
+    try:
244
+        # conn_distribution = dev_mysql_conn.Mysql()
245
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
246
+    except Exception, e:
247
+        # conn_distribution = dev_mysql_conn.Mysql()
248
+        sql_distribution_rows = Mysql.getAll(sql_distribution, conn=conn)
249
+    is_distribute = 0
250
+    pub_date = ''
251
+    if len(sql_distribution_rows) >= 1:
252
+        pub_date = sql_distribution_rows[0]['pub_date']
253
+        is_distribute = 1
254
+
255
+    sql_insert = """
256
+        insert into odl.ad_tv_record_distribution (tv_id, name, current_name, area, theme, first_type, second_type, company, record_date, form, num, \
257
+        film_date, film_period, `desc`, scriptwriter, director, distribution_date, is_distribute) values ('%s', '%s', '%s', '%s', '%s', \
258
+        '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')
259
+    """
260
+    sql_insert = sql_insert % (_id, name, current_name, area, theme, theme, theme, company, show_date, form, num, film_date, film_period, desc.replace("'", "\\'"),
261
+        scriptwriter, director, pub_date, is_distribute)
262
+
263
+    print sql_insert
264
+    try_times = 0
265
+    while True:
266
+        if try_times > 3:
267
+            break
268
+        try:
269
+            # conn1 = dev_mysql_conn.Mysql()
270
+            Mysql.insertOne(sql_insert, conn=conn)
271
+            try_times = 0
272
+            break
273
+        except Exception, e:
274
+            try_times += 1
275
+            print e
276
+    if try_times > 3:
277
+        break
278
+
279
+Mysql.close(conn)
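
This file appears to be an exact copy of odl_ad_tv_record_distribution.py above, so only one extra note here: the two helpers most worth reusing are strQ2B (full-width to half-width normalisation) and the 年/月 date handling. A self-contained Python 3 sketch of both; str_q2b, parse_year_month and the sample strings are illustrative stand-ins, not the repo's API:

# -*- coding: utf-8 -*-
import datetime
import re

def str_q2b(text):
    """Full-width to half-width conversion, mirroring strQ2B above."""
    out = []
    for ch in text:
        code = ord(ch)
        if code in (12288, 160):            # ideographic / non-breaking space
            code = 32
        elif 65281 <= code <= 65374:        # full-width ASCII block
            code -= 65248
        out.append(chr(code))
    return "".join(out)

def parse_year_month(text):
    """Pull a '2016年11月' style date out of text and normalise it to YYYY-MM-DD."""
    match = re.search(r"\d+年\d+月", text)
    if not match:
        return ""
    return datetime.datetime.strptime(match.group(0), "%Y年%m月").strftime("%Y-%m-%d")

if __name__ == "__main__":
    print(str_q2b(u"ＡＢＣ　１２３"))                              # -> ABC 123
    print(parse_year_month(u"某公司2016年11月(京)剧审字第001号"))  # -> 2016-11-01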

+ 70 - 0
task_odl/odl_area_ad_television.py

@@ -0,0 +1,70 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据处理
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+"""
16
+从yxb.ad_television_tetv
17
+提取数据插入到odl.area_ad_television 表中,作为数据分析来源数据
18
+"""
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+sql = """
23
+    truncate table odl.area_ad_television
24
+"""
25
+Mysql.execute(sql, conn=conn)
26
+
27
+m = 0
28
+n = 50000
29
+
30
+sql_count = """
31
+    select count(id) from yxb.ad_television_tetv
32
+"""
33
+count = Mysql.getOne(sql_count, conn=conn)[0]
34
+while m <= count + n:
35
+    sql = """
36
+        insert into odl.area_ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time, \
37
+            theater_attribute, property, is_repeat, city, area, audience_num, audience_rating, avg_num, avg_rating, \
38
+            market_rating, avg_fans, avg_view_time) \
39
+        select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute, \
40
+        aty.property, aty.is_repeat, aty.city, \
41
+        ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time \
42
+        from yxb.ad_television_tetv aty \
43
+        left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
44
+        left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
45
+        limit %s, %s
46
+    """
47
+    sql = sql % (m, n)
48
+    print sql
49
+    Mysql.execute(sql, conn=conn)
50
+    m += n
51
+
52
+sql = """
53
+    update odl.area_ad_television
54
+    set theater_attribute = 
55
+    (
56
+    case 
57
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
58
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
59
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
60
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
61
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
62
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
63
+    end
64
+    )
65
+    where theater_attribute is null or theater_attribute = ''
66
+"""
67
+
68
+Mysql.execute(sql, conn=conn)
69
+
70
+Mysql.close(conn)
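
The copy above deliberately avoids one giant INSERT ... SELECT by re-running it in LIMIT offset, size windows of 50,000 rows; the doubled %% in 'CSM5%%' is needed because the string is later pushed through the % operator. A hedged sketch of the same chunked copy with driver placeholders; chunk size, copy_in_chunks and the abridged column lists are illustrative:

def copy_in_chunks(conn, total_rows, chunk=50000):
    """Re-run the INSERT ... SELECT in LIMIT windows so no single statement is huge."""
    insert_select = (
        "insert into odl.area_ad_television (television_id, tv_id, tv_name /* ... */) "
        "select aty.id, atl.id, aty.tv_name /* ... */ "
        "from yxb.ad_television_tetv aty "
        "left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%' "
        "left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name "
        "order by aty.id "
        "limit %s, %s"
    )
    offset = 0
    with conn.cursor() as cur:
        while offset < total_rows:
            # literal %% survives because the driver formats the %s parameters itself
            cur.execute(insert_select, (offset, chunk))
            offset += chunk
    conn.commit()

Offset pagination without an ORDER BY is not guaranteed to be stable between statements; ordering by aty.id, or chunking on an id range, keeps the windows deterministic.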

+ 67 - 0
task_odl/odl_area_ad_television_incr_update.py

@@ -0,0 +1,67 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""ad_television表数据增量更新
5
+
6
+"""
7
+
8
+import datetime
9
+import sys
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+conn = Mysql.createOfflineConn()
17
+
18
+sql = """
19
+    select max(tv_date) as max_date from odl.area_ad_television
20
+"""
21
+row = Mysql.getOne(sql, conn=conn)
22
+max_date = row[0]
23
+
24
+# year = datetime.datetime.strptime(max_date, '%Y-%m-%d').year
25
+sql = """
26
+    insert into odl.area_ad_television (television_id, tv_id, tv_name, epi_num, host, channel, tv_date, weekday, start_time, end_time,
27
+        theater_attribute, property, is_repeat, city, area, audience_num, audience_rating, avg_num, avg_rating,
28
+        market_rating, avg_fans, avg_view_time)
29
+    select aty.id, atl.id, aty.tv_name, aty.epi_num, aty.host, aty.channel, aty.tv_date, aty.weekday, aty.start_time, aty.end_time, aty.theater_attribute,
30
+    aty.property, aty.is_repeat, aty.city, 
31
+    ary.area, ary.audience_num, ary.audience_rating, ary.avg_num, ary.avg_rating, ary.market_rating, ary.avg_fans, ary.avg_view_time
32
+    from yxb.ad_television_tetv aty
33
+    left join yxb.ad_rating_tetv ary on ary.tv_id = aty.id and ary.area like 'CSM5%%'
34
+    left join yxb.ad_tv_lib atl on atl.tv_name = aty.tv_name
35
+    where aty.tv_date > '%s'
36
+"""
37
+sql = sql % (max_date)
38
+Mysql.execute(sql, conn=conn)
39
+
40
+"""
41
+凌晨剧场 0:00 - 6:00
42
+早间剧场:7:00-9:00
43
+上午剧场 9:00-12:00
44
+下午剧场 14:00-18:00
45
+晚间剧场 18:00-24:00
46
+黄金剧场:19:30-21:30
47
+"""
48
+sql = """
49
+    update odl.area_ad_television
50
+    set theater_attribute = 
51
+    (
52
+    case 
53
+    when start_time >= '00:00:00' and end_time <= '6:00:00' then '凌晨剧场'
54
+    when start_time >= '7:00:00' and end_time < '9:00:00' then '早间剧场'
55
+    when start_time >= '9:00:00' and end_time <= '12:00:00' then '上午剧场'
56
+    when start_time >= '14:00:00' and end_time < '18:00:00' then '下午剧场'
57
+    when start_time >= '19:30:00' and end_time <= '21:30:00' then '黄金剧场'
58
+    when (start_time >= '18:00:00' and end_time < '19:30:00') or (start_time > '21:30:00' and end_time < '24:00:00') then '晚间剧场'
59
+    end
60
+    )
61
+    where tv_date > '%s' and (theater_attribute is null or theater_attribute = '')
62
+"""
63
+sql = sql % (max_date)
64
+
65
+Mysql.execute(sql, conn=conn)
66
+
67
+Mysql.close(conn)
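
One edge case in this incremental variant: if odl.area_ad_television is empty, max(tv_date) comes back NULL, and the % formatting turns the WHERE clause into a comparison against the literal string 'None'. A small guard is enough; high_water_mark and the default back-fill date below are illustrative:

import datetime

def high_water_mark(row, default=datetime.date(2000, 1, 1)):
    """Turn the result of `select max(tv_date) ...` into a usable lower bound;
    falls back to a back-fill date when the target table is still empty."""
    if row is None or row[0] is None:
        return default
    return row[0]

# usage sketch, matching the calls above:
#   row = Mysql.getOne("select max(tv_date) from odl.area_ad_television", conn=conn)
#   max_date = high_water_mark(row)
#   ... "where aty.tv_date > %s" bound to max_date instead of "> '%s'" % max_date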

+ 157 - 0
task_other/idl_rank_update.py

@@ -0,0 +1,157 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""排名更新
5
+
6
+"""
7
+
8
+import sys
9
+
10
+from fty_util.common import Mysql
11
+
12
+reload(sys)
13
+sys.setdefaultencoding('utf8')
14
+
15
+conn = Mysql.createOfflineConn()
16
+
17
+sql = """
18
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
19
+        select channel,tv_name,tv_date,avg_rate,rank,'黄金剧场' from (
20
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
21
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,
22
+		  @pdept:=heyf_tmp.tv_date 
23
+		from(
24
+
25
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
26
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
27
+		from odl.ad_tv_rating_denoise t
28
+		where audience_rating >= 0 
29
+		and (t.start_time >= '19:30:00' and t.end_time <= '21:30:00' and t.start_time <= '21:30:00' and t.end_time >= '19:30:00')
30
+		)a
31
+		GROUP BY channel,tv_name,tv_date
32
+		order by  tv_date,avg_rate DESC
33
+
34
+        )  heyf_tmp , 
35
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
36
+		 ) result
37
+"""
38
+Mysql.execute(sql, conn=conn)
39
+
40
+
41
+sql = """
42
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
43
+        select channel,tv_name,tv_date,avg_rate,rank,'凌晨剧场' from (  
44
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
45
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
46
+		  @pdept:=heyf_tmp.tv_date 
47
+		from(
48
+
49
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
50
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
51
+		from odl.ad_tv_rating_denoise t
52
+		where audience_rating >= 0 
53
+		and (t.start_time >= '00:00:00' and t.end_time <= '06:00:00' and t.start_time <= '06:00:00' and t.end_time >= '00:00:00')
54
+		)a
55
+		GROUP BY channel,tv_name,tv_date
56
+		order by  tv_date,avg_rate DESC
57
+
58
+        )  heyf_tmp , 
59
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
60
+		 ) result
61
+"""
62
+Mysql.execute(sql, conn=conn)
63
+
64
+sql = """
65
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
66
+        select channel,tv_name,tv_date,avg_rate,rank,'早间剧场' from (  
67
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
68
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
69
+		  @pdept:=heyf_tmp.tv_date 
70
+		from(
71
+
72
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
73
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
74
+		from odl.ad_tv_rating_denoise t
75
+		where audience_rating >= 0 
76
+		and (t.start_time >= '07:00:00' and t.end_time <= '09:00:00' and t.start_time <= '09:00:00' and t.end_time >= '07:00:00')
77
+		)a
78
+		GROUP BY channel,tv_name,tv_date
79
+		order by  tv_date,avg_rate DESC
80
+
81
+        )  heyf_tmp , 
82
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
83
+		 ) result
84
+"""
85
+Mysql.execute(sql, conn=conn)
86
+
87
+
88
+sql = """
89
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
90
+        select channel,tv_name,tv_date,avg_rate,rank,'上午剧场' from (  
91
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
92
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
93
+		  @pdept:=heyf_tmp.tv_date 
94
+		from(
95
+
96
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
97
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
98
+		from odl.ad_tv_rating_denoise t
99
+		where audience_rating >= 0 
100
+		and (t.start_time >= '09:00:00' and t.end_time <= '12:00:00' and t.start_time <= '12:00:00' and t.end_time >= '09:00:00')
101
+		)a
102
+		GROUP BY channel,tv_name,tv_date
103
+		order by  tv_date,avg_rate DESC
104
+
105
+        )  heyf_tmp , 
106
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
107
+		 ) result
108
+"""
109
+Mysql.execute(sql, conn=conn)
110
+
111
+sql = """
112
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
113
+        select channel,tv_name,tv_date,avg_rate,rank,'下午剧场' from (  
114
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
115
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
116
+		  @pdept:=heyf_tmp.tv_date 
117
+		from(
118
+
119
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
120
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
121
+		from odl.ad_tv_rating_denoise t
122
+		where audience_rating >= 0 
123
+		and (t.start_time >= '14:00:00' and t.end_time <= '18:00:00' and t.start_time <= '18:00:00' and t.end_time >= '14:00:00')
124
+		)a
125
+		GROUP BY channel,tv_name,tv_date
126
+		order by  tv_date,avg_rate DESC
127
+
128
+        )  heyf_tmp , 
129
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
130
+		 ) result
131
+"""
132
+Mysql.execute(sql, conn=conn)
133
+
134
+sql = """
135
+        REPLACE INTO idl.ad_tv_rate_rank(channel,tv_name,tv_date,audience_rate,rank,theater_attribute)
136
+        select channel,tv_name,tv_date,avg_rate,rank,'晚间剧场' from (  
137
+		select heyf_tmp.channel,heyf_tmp.tv_name,heyf_tmp.tv_date,heyf_tmp.avg_rate,@rownum:=@rownum+1 ,  
138
+		  if(@pdept=heyf_tmp.tv_date,@rank:=@rank+1,@rank:=1) as rank,  
139
+		  @pdept:=heyf_tmp.tv_date 
140
+		from(
141
+
142
+		select  channel,tv_name,round(avg(audience_rating),3) as avg_rate,tv_date   from (
143
+		select t.channel,t.tv_name, t.audience_rating ,  t.tv_date, month(t.tv_date) as  tv_month
144
+		from odl.ad_tv_rating_denoise t
145
+		where audience_rating >= 0 
146
+		and (t.start_time >= '18:00:00' and t.end_time <= '24:00:00' and t.start_time <= '24:00:00' and t.end_time >= '18:00:00')
147
+		)a
148
+		GROUP BY channel,tv_name,tv_date
149
+		order by  tv_date,avg_rate DESC
150
+
151
+        )  heyf_tmp , 
152
+		(select @rownum :=0 , @pdept := null ,@rank:=0) a  
153
+		 ) result
154
+"""
155
+Mysql.execute(sql, conn=conn)
156
+
157
+Mysql.close(conn)
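
All six statements above compute the same thing for different time slots: a per-day rating rank built with MySQL user variables (@rownum counts rows, @rank restarts whenever @pdept sees a new tv_date). The logic is easy to sanity-check off-database; a standalone Python 3 sketch with the slot filter reduced to a tiny in-memory sample:

# -*- coding: utf-8 -*-
from itertools import groupby
from operator import itemgetter

def rank_by_date(rows):
    """rows: (channel, tv_name, tv_date, avg_rate) tuples.
    Returns the rows with a rank appended, restarting at 1 for each tv_date,
    mirroring if(@pdept = tv_date, @rank := @rank + 1, @rank := 1) above."""
    ranked = []
    ordered = sorted(rows, key=lambda r: (r[2], -r[3]))   # by date, then rating desc
    for _, group in groupby(ordered, key=itemgetter(2)):
        for rank, row in enumerate(group, start=1):
            ranked.append(row + (rank,))
    return ranked

sample = [
    ("CCTV1",   u"剧A", "2017-05-01", 1.23),
    ("湖南卫视", u"剧B", "2017-05-01", 1.50),
    ("CCTV1",   u"剧C", "2017-05-02", 0.80),
]
for line in rank_by_date(sample):
    print(line)   # 剧B is rank 1 on 05-01, 剧A rank 2; 剧C rank 1 on 05-02

On MySQL 8+ the same result can be written with ROW_NUMBER() OVER (PARTITION BY tv_date ORDER BY avg_rate DESC), which avoids the evaluation-order caveats of user variables.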

+ 96 - 0
task_other/transform_categories.py

@@ -0,0 +1,96 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import datetime
5
+import sys
6
+
7
+from fty_util.common import Mysql
8
+
9
+reload(sys)
10
+sys.setdefaultencoding('utf8')
11
+
12
+conn = Mysql.createOfflineConn()
13
+
14
+cat_dict = {}
15
+# 获取所有标准分类和对应的映射分类
16
+sql = """
17
+    select standard_category, reflect_category from odl.basic_categories
18
+"""
19
+categories = Mysql.getAll(sql, conn=conn)
20
+
21
+for category in categories:
22
+    standard_category =  category['standard_category']
23
+    reflect_category = category['reflect_category']
24
+    cat_dict[reflect_category] = standard_category
25
+
26
+sql = """
27
+    select tv_id, iqiyi_types, iqiyi_types_new, tengxun_types, tengxun_types_new, baike_types, baike_types_new, manual_types, manual_types_new from scrapy.types_analyse where iqiyi_tengxun_after_baike_after_manual is null or iqiyi_tengxun_after_baike_after_manual = '' order by tv_id asc
28
+"""
29
+
30
+rows = Mysql.getAll(sql, conn=conn)
31
+for row in rows:
32
+    tv_id = row['tv_id']
33
+    iqiyi_types = row['iqiyi_types']
34
+    iqiyi_types_new = row['iqiyi_types_new']
35
+    tengxun_types = row['tengxun_types']
36
+    tengxun_types_new = row['tengxun_types_new']
37
+    baike_types = row['baike_types']
38
+    baike_types_new = row['baike_types_new']
39
+    manual_types = row['manual_types']
40
+    manual_types_new = row['manual_types_new']
41
+
42
+    iqiyi_types_set = set()
43
+    if (iqiyi_types_new is None or len(iqiyi_types_new) == 0) and iqiyi_types is not None and len(iqiyi_types) > 0:
44
+        for _type in iqiyi_types.split(' '):
45
+            cate = cat_dict.get(_type)
46
+            if cate is not None:
47
+                iqiyi_types_set.add(cate)
48
+
49
+    tengxun_types_set = set()
50
+    if (tengxun_types_new is None or len(tengxun_types_new) == 0) and tengxun_types is not None and len(tengxun_types) > 0:
51
+        for _type in tengxun_types.split(' '):
52
+            cate = cat_dict.get(_type)
53
+            if cate is not None:
54
+                tengxun_types_set.add(cate)
55
+
56
+    baike_types_set = set()
57
+    if (baike_types_new is None or len(baike_types_new) == 0) and baike_types is not None and len(baike_types) > 0:
58
+        for _type in baike_types.split(' '):
59
+            cate = cat_dict.get(_type)
60
+            if cate is not None:
61
+                baike_types_set.add(cate)
62
+
63
+    manual_types_set = set()
64
+    if (manual_types_new is None or len(manual_types_new) == 0) and manual_types is not None and len(manual_types) > 0:
65
+        for _type in manual_types.split(' '):
66
+            cate = cat_dict.get(_type)
67
+            if cate is not None:
68
+                manual_types_set.add(cate)
69
+
70
+    all_types = set()
71
+    if len(iqiyi_types_set | tengxun_types_set) > 2:
72
+        all_types = iqiyi_types_set | tengxun_types_set
73
+    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set) > 2:
74
+        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set
75
+    elif len(iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set) > 2:
76
+        all_types = iqiyi_types_set | tengxun_types_set | baike_types_set | manual_types_set
77
+
78
+    sql = """
79
+        update scrapy.types_analyse set iqiyi_types_new = %s, tengxun_types_new = %s, baike_types_new = %s, manual_types_new = %s, iqiyi_tengxun_after_baike_after_manual = %s where tv_id = %s
80
+    """
81
+    value = (' '.join(iqiyi_types_set), ' '.join(tengxun_types_set), ' '.join(baike_types_set), ' '.join(manual_types_set), ' '.join(all_types), tv_id)
82
+    Mysql.update(sql, param=value, conn=conn)
83
+
84
+    # 更新 yxb.ad_tv_lib 表
85
+    sql = """
86
+        update yxb.ad_tv_lib set categories = %s where id = %s
87
+    """
88
+    value = (' '.join(all_types), tv_id)
89
+    Mysql.update(sql, param=value, conn=conn)
90
+
91
+    # 更新 odl.ad_tv_lib 表
92
+    sql = """
93
+        update odl.ad_tv_lib set categories = %s where tv_id = %s
94
+    """
95
+    value = (' '.join(all_types), tv_id)
96
+    Mysql.update(sql, param=value, conn=conn)
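
The script above first normalises each source's space-separated type strings through odl.basic_categories, then keeps the smallest cumulative union of sources that yields more than two categories: iqiyi+tengxun, otherwise +baike, otherwise +manual, otherwise empty. A database-free sketch of that selection rule; normalise, merge_categories and the sample mappings are illustrative:

# -*- coding: utf-8 -*-

def normalise(raw_types, cat_dict):
    """Map a space-separated raw type string onto the standard categories."""
    if not raw_types:
        return set()
    return {cat_dict[t] for t in raw_types.split(" ") if t in cat_dict}

def merge_categories(iqiyi, tengxun, baike, manual, threshold=2):
    """Return the first cumulative union holding more than `threshold` categories,
    mirroring the if/elif chain above; empty set if even the full union is too small."""
    for union in (iqiyi | tengxun,
                  iqiyi | tengxun | baike,
                  iqiyi | tengxun | baike | manual):
        if len(union) > threshold:
            return union
    return set()

cat_dict = {u"言情": u"都市情感", u"偶像": u"都市情感", u"谍战": u"谍战", u"战争": u"战争", u"历史": u"历史"}
iqiyi   = normalise(u"言情 偶像", cat_dict)
tengxun = normalise(u"谍战", cat_dict)
baike   = normalise(u"战争 历史", cat_dict)
print(merge_categories(iqiyi, tengxun, baike, set()))   # needs baike to pass the threshold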

+ 286 - 0
task_scrapy/i_t_dsj_all.py

@@ -0,0 +1,286 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_All(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        
25
+        # 需要爬取的列表页面
26
+        start_urls = [
27
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
28
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
29
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
30
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
31
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
32
+            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
33
+            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
34
+            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
35
+        ]
36
+        # 打开Firefox浏览器
37
+        driver = webdriver.Firefox()
38
+        driver.set_page_load_timeout(20)
39
+
40
+        # 数据库连接
41
+        conn = Mysql.createScrapyConn()
42
+        for url in start_urls:
43
+            # 打开主页
44
+            try:
45
+                driver.get(url)
46
+            except:
47
+                driver.execute_script('window.stop()')
48
+            
49
+
50
+            is_next = True
51
+            while is_next:
52
+                
53
+                try:
54
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
55
+                except:
56
+                    continue
57
+                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
58
+                sql_insert = """
59
+                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
60
+                """
61
+                data_list = []
62
+                for li in lis:
63
+                    try:
64
+                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
65
+                        print tv_url
66
+                        data_list.append((tv_url,))                    
67
+                    except Exception, e:
68
+                        print '没有'
69
+                        continue
70
+                    time.sleep(random.uniform(0, 2))
71
+                Mysql.insertMany(sql_insert, data_list, conn)
72
+                try:
73
+                    next_page_text = next_page.find_element_by_xpath('.').text
74
+                    if next_page_text == '下一页':
75
+                        next_page.click()
76
+                    else:
77
+                        is_next = False;
78
+                except:
79
+                    is_next = False;
80
+                time.sleep(10)
81
+        driver.quit()
82
+
83
+    # 爬取具体页面
84
+    def get_iqiyi_detail(self):
85
+        driver = webdriver.Firefox()
86
+        driver.set_page_load_timeout(10)
87
+
88
+        # 数据库连接
89
+        conn = Mysql.createScrapyConn()
90
+
91
+        sql = """
92
+            select max(id) from scrapy.iqiyi_dianshiju_detail
93
+        """
94
+        max_id = Mysql.getOne(sql, conn=conn)
95
+        max_id = max_id[0]
96
+        if max_id is None:
97
+            max_id = 0
98
+        # 获取所有url
99
+        sql = """
100
+            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
101
+        """
102
+        sql = sql % (max_id)
103
+        rows = Mysql.getAll(sql, conn=conn)
104
+        for row in rows:
105
+            _id = row['id']
106
+            url = row['url']
107
+            print url
108
+            try:
109
+                driver.get(url)
110
+            except:
111
+                driver.execute_script('window.stop()')
112
+            
113
+            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
114
+            # 详情html内容
115
+            detail_info_html = detail_info.get_attribute('innerHTML')
116
+            # 详情文本内容
117
+            detail_info_text = detail_info.find_element_by_xpath('.').text
118
+            # 电视剧名称
119
+            tv_name = detail_info.find_element_by_xpath('h1/a').text
120
+
121
+            #存入数据库
122
+            sql = """
123
+                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
124
+            """
125
+            value = (_id, tv_name, detail_info_text, detail_info_html, url)
126
+            Mysql.insertOne(sql, value=value, conn=conn)
127
+            time.sleep(random.uniform(1, 5))
128
+        driver.quit()
129
+
130
+    # 爬取电视剧链接地址
131
+    def get_tengxun_url(self):
132
+        start_urls = [
133
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
134
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
135
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
136
+            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
137
+            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
138
+            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
139
+            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
140
+            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
141
+            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
142
+        ]
143
+        # 打开Firefox浏览器
144
+        driver = webdriver.Firefox()
145
+        driver.set_page_load_timeout(20)
146
+
147
+        # 数据库连接
148
+        conn = Mysql.createScrapyConn()
149
+        for url in start_urls:
150
+            # 打开主页
151
+            try:
152
+                driver.get(url)
153
+            except:
154
+                driver.execute_script('window.stop()')
155
+            
156
+            is_next = True
157
+            while is_next:
158
+                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
159
+                print lis
160
+                sql_insert = """
161
+                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
162
+                """
163
+                data_list = []
164
+                for li in lis:
165
+                    try:
166
+                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
167
+                        print tv_url
168
+                        data_list.append((tv_url,))                    
169
+                    except Exception, e:
170
+                        print '没有'
171
+                        continue
172
+                    time.sleep(1)
173
+                Mysql.insertMany(sql_insert, data_list, conn)
174
+                try:
175
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
176
+                except:
177
+                    is_next = False
178
+                    continue
179
+                try:
180
+                    next_page_text = next_page.find_element_by_xpath('.').text
181
+                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
182
+                    if next_page_url == 'javascript:;':
183
+                        is_next = False
184
+                        continue
185
+                    if next_page_text == '下一页':
186
+                        next_page.click()
187
+                    else:
188
+                        is_next = False;
189
+                except:
190
+                    is_next = False;
191
+                time.sleep(10)
192
+        driver.quit()
193
+
194
+    def get_tengxun_detail_url(self):
195
+        # 打开Firefox浏览器
196
+        driver = webdriver.Firefox()
197
+        driver.set_page_load_timeout(20)
198
+
199
+        # 数据库连接
200
+        conn = Mysql.createScrapyConn()
201
+
202
+        sql = """
203
+            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
204
+        """
205
+        rows = Mysql.getAll(sql, conn=conn)
206
+        for row in rows:
207
+            _id = row['id']
208
+            url = row['url']
209
+
210
+            # 打开主页
211
+            try:
212
+                driver.get(url)
213
+            except:
214
+                driver.execute_script('window.stop()')
215
+            if re.match(r'(.*)detail(.*)', driver.current_url):
216
+                print driver.current_url
217
+                sql = """
218
+                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
219
+                """
220
+                sql = sql % (driver.current_url, _id)
221
+                Mysql.update(sql, conn=conn)
222
+                continue
223
+            try:
224
+                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
225
+                print a_list
226
+                for a in a_list:
227
+                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
228
+                    if re.match(r'(.*)detail(.*)', detail_href):
229
+                        print detail_href
230
+                        sql = """
231
+                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
232
+                        """
233
+                        sql = sql % (detail_href, _id)
234
+                        Mysql.update(sql, conn=conn)
235
+                        break
236
+            except Exception, e:
237
+                print e
238
+            time.sleep(random.uniform(0, 3))
239
+            
240
+        driver.quit()
241
+
242
+    # 爬取具体页面
243
+    def get_tengxun_detail(self):
244
+        driver = webdriver.Firefox()
245
+        driver.set_page_load_timeout(10)
246
+
247
+        # 数据库连接
248
+        conn = Mysql.createScrapyConn()
249
+        # 获取所有需要爬取的电视剧
250
+        sql = """
251
+            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
252
+        """
253
+        rows = Mysql.getAll(sql, conn=conn)
254
+        for row in rows:
255
+            url = row['url']
256
+            detail_url = row['detail_url']
257
+            try:
258
+                driver.get(detail_url)
259
+            except:
260
+                driver.execute_script('window.stop()')
261
+            
262
+            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
263
+            # 详情html内容
264
+            detail_info_html = detail_info.get_attribute('innerHTML')
265
+            # 详情文本内容
266
+            detail_info_text = detail_info.find_element_by_xpath('.').text
267
+            # 电视剧名称
268
+            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text
269
+            sql = """
270
+                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values ('%s', '%s', '%s', '%s', '%s')
271
+            """
272
+            sql = sql % (tv_name, detail_info_text, detail_info_html, url, detail_url)
273
+            Mysql.insertOne(sql, conn=conn)
274
+        driver.quit()
275
+
276
+
277
+if __name__ == '__main__':
278
+    if len(sys.argv) != 2:
279
+        print '没有输入参数,退出'
280
+        sys.exit(0)
281
+    print 'method name is ' + sys.argv[1]
282
+    obj = DSJ_All()
283
+    try:
284
+        getattr(obj, sys.argv[1])()
285
+    except Exception, e:
286
+        print e
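
Every method in the class above repeats the same two moves: a driver.get() wrapped so a page-load timeout is downgraded to window.stop(), and a pagination loop that clicks 下一页 until the link disappears. A skeleton of just those two pieces, assuming selenium with a local Firefox/geckodriver and using the same Selenium 3 find_elements_by_xpath API as the script (safe_get and iterate_pages are illustrative names):

# -*- coding: utf-8 -*-
import time
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def safe_get(driver, url):
    """Open url but tolerate slow pages: stop loading instead of raising."""
    try:
        driver.get(url)
    except TimeoutException:
        driver.execute_script("window.stop()")

def iterate_pages(driver, start_url, handle_page, pause=10):
    """Run handle_page(driver) on every result page, following 下一页 links."""
    safe_get(driver, start_url)
    while True:
        handle_page(driver)
        try:
            next_link = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
        except (IndexError, NoSuchElementException):
            break
        if next_link.text != u"下一页":
            break
        next_link.click()
        time.sleep(pause)           # same throttle the script uses between pages

Typical wiring: driver = webdriver.Firefox(); driver.set_page_load_timeout(20); iterate_pages(driver, start_url, collect_urls). On Selenium 4 the locator call becomes driver.find_elements(By.XPATH, ...).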

+ 291 - 0
task_scrapy/i_t_dsj_all_without_browser.py

@@ -0,0 +1,291 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_All(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        
25
+        # 需要爬取的列表页面
26
+        start_urls = [
27
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
28
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
29
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
30
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
31
+            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
32
+            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
33
+            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
34
+            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
35
+        ]
36
+        # 打开Firefox浏览器
37
+        # driver = webdriver.Firefox()
38
+        driver = webdriver.PhantomJS()
39
+        driver.set_page_load_timeout(20)
40
+
41
+        # 数据库连接
42
+        conn = Mysql.createScrapyConn()
43
+        for url in start_urls:
44
+            # 打开主页
45
+            try:
46
+                driver.get(url)
47
+            except:
48
+                driver.execute_script('window.stop()')
49
+            
50
+
51
+            is_next = True
52
+            while is_next:
53
+                
54
+                try:
55
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
56
+                except:
57
+                    continue
58
+                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
59
+                sql_insert = """
60
+                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
61
+                """
62
+                data_list = []
63
+                for li in lis:
64
+                    try:
65
+                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
66
+                        print tv_url
67
+                        data_list.append((tv_url,))                    
68
+                    except Exception, e:
69
+                        print '没有'
70
+                        continue
71
+                    time.sleep(random.uniform(0, 2))
72
+                Mysql.insertMany(sql_insert, data_list, conn)
73
+                try:
74
+                    next_page_text = next_page.find_element_by_xpath('.').text
75
+                    if next_page_text == '下一页':
76
+                        next_page.click()
77
+                    else:
78
+                        is_next = False;
79
+                except:
80
+                    is_next = False;
81
+                time.sleep(10)
82
+        driver.quit()
83
+
84
+    # 爬取具体页面
85
+    def get_iqiyi_detail(self):
86
+        # driver = webdriver.Firefox()
87
+        driver = webdriver.PhantomJS()
88
+        driver.set_page_load_timeout(10)
89
+
90
+        # 数据库连接
91
+        conn = Mysql.createScrapyConn()
92
+
93
+        sql = """
94
+            select max(id) from scrapy.iqiyi_dianshiju_detail
95
+        """
96
+        max_id = Mysql.getOne(sql, conn=conn)
97
+        max_id = max_id[0]
98
+        if max_id is None:
99
+            max_id = 0
100
+        # 获取所有url
101
+        sql = """
102
+            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
103
+        """
104
+        sql = sql % (max_id)
105
+        rows = Mysql.getAll(sql, conn=conn)
106
+        for row in rows:
107
+            _id = row['id']
108
+            url = row['url']
109
+            print url
110
+            try:
111
+                driver.get(url)
112
+            except:
113
+                driver.execute_script('window.stop()')
114
+            
115
+            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
116
+            # 详情html内容
117
+            detail_info_html = detail_info.get_attribute('innerHTML')
118
+            # 详情文本内容
119
+            detail_info_text = detail_info.find_element_by_xpath('.').text
120
+            # 电视剧名称
121
+            tv_name = detail_info.find_element_by_xpath('h1/a').text
122
+
123
+            #存入数据库
124
+            sql = """
125
+                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
126
+            """
127
+            value = (_id, tv_name, detail_info_text, detail_info_html, url)
128
+            Mysql.insertOne(sql, value=value, conn=conn)
129
+            time.sleep(random.uniform(1, 5))
130
+        driver.quit()
131
+
132
+    # 爬取电视剧链接地址
133
+    def get_tengxun_url(self):
134
+        start_urls = [
135
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
136
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
137
+            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
138
+            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
139
+            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
140
+            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
141
+            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
142
+            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
143
+            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
144
+        ]
145
+        # 打开Firefox浏览器
146
+        # driver = webdriver.Firefox()
147
+        driver = webdriver.PhantomJS()
148
+        driver.set_page_load_timeout(20)
149
+
150
+        # 数据库连接
151
+        conn = Mysql.createScrapyConn()
152
+        for url in start_urls:
153
+            # 打开主页
154
+            try:
155
+                driver.get(url)
156
+            except:
157
+                driver.execute_script('window.stop()')
158
+            
159
+            is_next = True
160
+            while is_next:
161
+                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
162
+                print lis
163
+                sql_insert = """
164
+                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
165
+                """
166
+                data_list = []
167
+                for li in lis:
168
+                    try:
169
+                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
170
+                        print tv_url
171
+                        data_list.append((tv_url,))                    
172
+                    except Exception, e:
173
+                        print '没有'
174
+                        continue
175
+                    time.sleep(1)
176
+                Mysql.insertMany(sql_insert, data_list, conn)
177
+                try:
178
+                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
179
+                except:
180
+                    is_next = False
181
+                    continue
182
+                try:
183
+                    next_page_text = next_page.find_element_by_xpath('.').text
184
+                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
185
+                    if next_page_url == 'javascript:;':
186
+                        is_next = False
187
+                        continue
188
+                    if next_page_text == '下一页':
189
+                        next_page.click()
190
+                    else:
191
+                        is_next = False;
192
+                except:
193
+                    is_next = False;
194
+                time.sleep(10)
195
+        driver.quit()
196
+
197
+    def get_tengxun_detail_url(self):
198
+        # 打开Firefox浏览器
199
+        # driver = webdriver.Firefox()
200
+        driver = webdriver.PhantomJS()
201
+        driver.set_page_load_timeout(20)
202
+
203
+        # 数据库连接
204
+        conn = Mysql.createScrapyConn()
205
+
206
+        sql = """
207
+            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
208
+        """
209
+        rows = Mysql.getAll(sql, conn=conn)
210
+        for row in rows:
211
+            _id = row['id']
212
+            url = row['url']
213
+
214
+            # 打开主页
215
+            try:
216
+                driver.get(url)
217
+            except:
218
+                driver.execute_script('window.stop()')
219
+            if re.match(r'(.*)detail(.*)', driver.current_url):
220
+                print driver.current_url
221
+                sql = """
222
+                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
223
+                """
224
+                sql = sql % (driver.current_url, _id)
225
+                Mysql.update(sql, conn=conn)
226
+                continue
227
+            try:
228
+                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
229
+                print a_list
230
+                for a in a_list:
231
+                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
232
+                    if re.match(r'(.*)detail(.*)', detail_href):
233
+                        print detail_href
234
+                        sql = """
235
+                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
236
+                        """
237
+                        sql = sql % (detail_href, _id)
238
+                        Mysql.update(sql, conn=conn)
239
+                        break
240
+            except Exception, e:
241
+                print e
242
+            time.sleep(random.uniform(0, 3))
243
+            
244
+        driver.quit()
245
+
246
+    # 爬取具体页面
247
+    def get_tengxun_detail(self):
248
+        # driver = webdriver.Firefox()
249
+        driver = webdriver.PhantomJS()
250
+        driver.set_page_load_timeout(10)
251
+
252
+        # 数据库连接
253
+        conn = Mysql.createScrapyConn()
254
+        # 获取所有需要爬取的电视剧
255
+        sql = """
256
+            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
257
+        """
258
+        rows = Mysql.getAll(sql, conn=conn)
259
+        for row in rows:
260
+            url = row['url']
261
+            detail_url = row['detail_url']
262
+            try:
263
+                driver.get(detail_url)
264
+            except:
265
+                driver.execute_script('window.stop()')
266
+            
267
+            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
268
+            # 详情html内容
269
+            detail_info_html = detail_info.get_attribute('innerHTML')
270
+            # 详情文本内容
271
+            detail_info_text = detail_info.find_element_by_xpath('.').text
272
+            # 电视剧名称
273
+            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text
274
+            sql = """
275
+                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values ('%s', '%s', '%s', '%s', '%s')
276
+            """
277
+            sql = sql % (tv_name, detail_info_text, detail_info_html, url, detail_url)
278
+            Mysql.insertOne(sql, conn=conn)
279
+        driver.quit()
280
+
281
+
282
+if __name__ == '__main__':
283
+    if len(sys.argv) != 2:
284
+        print '没有输入参数,退出'
285
+        sys.exit(0)
286
+    print 'method name is ' + sys.argv[1]
287
+    obj = DSJ_All()
288
+    try:
289
+        getattr(obj, sys.argv[1])()
290
+    except Exception, e:
291
+        print e
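+
+# Usage sketch (method names taken from this class): the __main__ block dispatches on
+# sys.argv[1] via getattr, so each crawling step is invoked by name from the shell, e.g.
+#
+#   python i_t_dsj_all_without_browser.py get_tengxun_detail_url   # resolve the detail-page URLs first
+#   python i_t_dsj_all_without_browser.py get_tengxun_detail       # then crawl the detail pages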

+ 198 - 0
task_scrapy/i_t_dsj_categories.py

@@ -0,0 +1,198 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""爱奇艺电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+
16
+from fty_util.common import Mysql
17
+
18
+reload(sys)
19
+sys.setdefaultencoding('utf8')
20
+
21
+class DSJ_Categories(object):
22
+    # 爬取电视剧链接地址
23
+    def get_iqiyi_url(self):
24
+        # 打开Firefox浏览器
25
+        driver = webdriver.Firefox()
26
+        driver.set_page_load_timeout(10)
27
+
28
+        # 数据库连接
29
+        conn = Mysql.createScrapyConn()
30
+        # 获取所有需要爬取的电视剧
31
+        sql = """
32
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
33
+        """
34
+        # rows = conn.getAll(sql)
35
+        rows = Mysql.getAll(sql, conn=conn)
36
+        for row in rows:
37
+            _id = row['id']
38
+            tv_name = row['tv_name']
39
+            print tv_name
40
+            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
41
+            # 打开主页
42
+            try:
43
+                driver.get(start_url)
44
+            except:
45
+                driver.execute_script('window.stop()')
46
+            
47
+            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
48
+            for li in lis:
49
+                try:
50
+                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
51
+                    if '1' == first_num.strip():
52
+                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
53
+                        print href
54
+                        sql = """
55
+                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
56
+                        """
57
+                        sql = sql % (href, _id)
58
+                        # conn.update(sql)
59
+                        Mysql.update(sql, conn=conn)
60
+                        
61
+                        break
62
+                except Exception, e:
63
+                    print '没有'
64
+                    continue
65
+        driver.quit()
66
+
67
+    # 爬取具体页面
68
+    def get_iqiyi_detail(self):
69
+        driver = webdriver.Firefox()
70
+        driver.set_page_load_timeout(10)
71
+
72
+        # 数据库连接
73
+        conn = Mysql.createScrapyConn()
74
+        # 获取所有需要爬取的电视剧
75
+        sql = """
76
+            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
77
+        """
78
+        # rows = conn.getAll(sql)
79
+        rows = Mysql.getAll(sql, conn=conn)
80
+        for row in rows:
81
+            _id = row['id']
82
+            tv_name = row['tv_name']
83
+            url = row['iqiyi_url']
84
+            print url
85
+            try:
86
+                driver.get(url)
87
+            except:
88
+                driver.execute_script('window.stop()')
89
+            
90
+            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
91
+            cats_set = set()
92
+            for cat in cats:
93
+                cats_set.add(cat.find_element_by_xpath('.').text.strip())
94
+
95
+            #存入数据库
96
+            sql = """
97
+                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
98
+            """
99
+            sql = sql % (' '.join(cats_set), _id)
100
+            # conn.update(sql)
101
+            Mysql.update(sql, conn=conn)
102
+        driver.quit()
103
+
104
+
105
+    # 爬取电视剧链接地址
106
+    def get_tengxun_url(self):
107
+        # 打开Firefox浏览器
108
+        driver = webdriver.Firefox()
109
+        driver.set_page_load_timeout(10)
110
+
111
+        # 数据库连接
112
+        conn = Mysql.createScrapyConn()
113
+        # 获取所有需要爬取的电视剧
114
+        sql = """
115
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
116
+        """
117
+        # rows = conn.getAll(sql)
118
+        rows = Mysql.getAll(sql, conn=conn)
119
+        for row in rows:
120
+            _id = row['id']
121
+            tv_name = row['tv_name']
122
+            print tv_name
123
+            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
124
+            # 打开主页
125
+            try:
126
+                driver.get(start_url)
127
+            except:
128
+                driver.execute_script('window.stop()')
129
+            
130
+            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
131
+            for div in divs:
132
+                try:
133
+                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
134
+                    print href
135
+                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
136
+                    if matchObj:
137
+                        sql = """
138
+                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
139
+                        """
140
+                        # sql = sql % (href, _id)
141
+                        value = (href, _id)
142
+                        # conn.update(sql)
143
+                        Mysql.update(sql, param=value, conn=conn)
144
+                        break
145
+                except Exception, e:
146
+                    print '没有'
147
+                    print e
148
+                    continue
149
+        driver.quit()
150
+
151
+    # 爬取具体页面
152
+    def get_tengxun_detail(self):
153
+        driver = webdriver.Firefox()
154
+        driver.set_page_load_timeout(10)
155
+
156
+        # 数据库连接
157
+        conn = Mysql.createScrapyConn()
158
+        # 获取所有需要爬取的电视剧
159
+        sql = """
160
+            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
161
+        """
162
+        # rows = Mysql.getAll(sql)
163
+        rows = Mysql.getAll(sql, conn=conn)
164
+        for row in rows:
165
+            _id = row['id']
166
+            tv_name = row['tv_name']
167
+            tengxun_url = row['tengxun_url']
168
+            print tengxun_url
169
+            # 打开主页
170
+            try:
171
+                driver.get(tengxun_url)
172
+            except:
173
+                driver.execute_script('window.stop()')
174
+            
175
+            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
176
+            cats_set = set()
177
+            for cat in cats:
178
+                cat_name = cat.find_element_by_xpath('.').text
179
+                cats_set.add(cat_name)
180
+            #存入数据库
181
+            sql = """
182
+                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
183
+            """
184
+            sql = sql % (' '.join(cats_set), _id)
185
+            # conn.update(sql)
186
+            Mysql.update(sql, conn=conn)
187
+        driver.quit()
188
+
189
+if __name__ == '__main__':
190
+    if len(sys.argv) != 2:
191
+        print '没有输入参数,退出'
192
+        sys.exit(0)
193
+    print 'method name is ' + sys.argv[1]
194
+    obj = DSJ_Categories()
195
+    try:
196
+        getattr(obj, sys.argv[1])()
197
+    except Exception, e:
198
+        print e
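+
+# A minimal sketch of the parameterized form, assuming the param= keyword already used
+# above for the tengxun_url update accepts a value tuple for this statement as well; it
+# avoids the '%s'-interpolation breaking on quotes inside the joined category names:
+#
+#   sql = """
+#       update scrapy.tv_category_scrapy set iqiyi_types = %s where id = %s
+#   """
+#   Mysql.update(sql, param=(' '.join(cats_set), _id), conn=conn)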

+ 203 - 0
task_scrapy/i_t_dsj_categories_without_browser.py

@@ -0,0 +1,203 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+"""电视剧分类爬取
4
+
5
+分为两步
6
+第一步爬取搜索页面结果,找到符合条件的电视剧
7
+第二步根据保存的具体页面url爬取分类信息
8
+"""
9
+
10
+import random
+import re
11
+import sys
12
+import time
13
+
14
+from selenium import webdriver
15
+from urllib import quote
16
+
17
+from fty_util.common import Mysql
18
+
19
+reload(sys)
20
+sys.setdefaultencoding('utf8')
21
+
22
+class DSJ_Categories(object):
23
+    # 爬取电视剧链接地址
24
+    def get_iqiyi_url(self):
25
+        # 打开Firefox浏览器
26
+        # driver = webdriver.Firefox()
27
+        driver = webdriver.PhantomJS()
28
+        driver.set_page_load_timeout(10)
29
+
30
+        # 数据库连接
31
+        conn = Mysql.createScrapyConn()
32
+        # 获取所有需要爬取的电视剧
33
+        # sql = """
34
+        #     select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
35
+        # """
36
+        sql = """
37
+            select id, tv_name from scrapy.tv_category_scrapy where id > 5598 order by id asc
38
+        """
39
+        # rows = conn.getAll(sql)
40
+        rows = Mysql.getAll(sql, conn=conn)
41
+        for row in rows:
42
+            _id = row['id']
43
+            tv_name = row['tv_name']
44
+            print tv_name
45
+            start_url = "http://so.iqiyi.com/so/q_" + quote(str(tv_name)) + "?source=input"
46
+            # 打开主页
47
+            try:
48
+                driver.get(start_url)
49
+            except:
50
+                driver.execute_script('window.stop()')
51
+            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
52
+            for li in lis:
53
+                try:
54
+                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
55
+                    if '1' == first_num.strip():
56
+                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
57
+                        print href
58
+                        sql = """
59
+                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
60
+                        """
61
+                        sql = sql % (href, _id)
62
+                        # conn.update(sql)
63
+                        Mysql.update(sql, conn=conn)
64
+                        
65
+                        break
66
+                except Exception, e:
67
+                    print '没有'
68
+                    continue
69
+            break
70
+        driver.quit()
71
+
72
+    # 爬取具体页面
73
+    def get_iqiyi_detail(self):
74
+        driver = webdriver.PhantomJS()
75
+        driver.set_page_load_timeout(10)
76
+
77
+        # 数据库连接
78
+        conn = Mysql.createScrapyConn()
79
+        # 获取所有需要爬取的电视剧
80
+        sql = """
81
+            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
82
+        """
83
+        # rows = conn.getAll(sql)
84
+        rows = Mysql.getAll(sql, conn=conn)
85
+        for row in rows:
86
+            _id = row['id']
87
+            tv_name = row['tv_name']
88
+            url = row['iqiyi_url']
89
+            print url
90
+            try:
91
+                driver.get(url)
92
+            except:
93
+                driver.execute_script('window.stop()')
94
+            
95
+            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
96
+            cats_set = set()
97
+            for cat in cats:
98
+                cats_set.add(cat.find_element_by_xpath('.').text.strip())
99
+
100
+            #存入数据库
101
+            sql = """
102
+                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
103
+            """
104
+            sql = sql % (' '.join(cats_set), _id)
105
+            # conn.update(sql)
106
+            Mysql.update(sql, conn=conn)
107
+        driver.quit()
108
+
109
+
110
+    # 爬取电视剧链接地址
111
+    def get_tengxun_url(self):
112
+        # 打开Firefox浏览器
113
+        driver = webdriver.PhantomJS()
114
+        driver.set_page_load_timeout(10)
115
+
116
+        # 数据库连接
117
+        conn = Mysql.createScrapyConn()
118
+        # 获取所有需要爬取的电视剧
119
+        sql = """
120
+            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
121
+        """
122
+        # rows = conn.getAll(sql)
123
+        rows = Mysql.getAll(sql, conn=conn)
124
+        for row in rows:
125
+            _id = row['id']
126
+            tv_name = row['tv_name']
127
+            print tv_name
128
+            start_url = "http://v.qq.com/x/search/?q=" + quote(str(tv_name)) + "&stag=0"
129
+            # 打开主页
130
+            try:
131
+                driver.get(start_url)
132
+            except:
133
+                driver.execute_script('window.stop()')
134
+            
135
+            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
136
+            for div in divs:
137
+                try:
138
+                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
139
+                    print href
140
+                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
141
+                    if matchObj:
142
+                        sql = """
143
+                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
144
+                        """
145
+                        # sql = sql % (href, _id)
146
+                        value = (href, _id)
147
+                        # conn.update(sql)
148
+                        Mysql.update(sql, param=value, conn=conn)
149
+                        break
150
+                except Exception, e:
151
+                    print '没有'
152
+                    print e
153
+                    continue
154
+        driver.quit()
155
+
156
+    # 爬取具体页面
157
+    def get_tengxun_detail(self):
158
+        driver = webdriver.PhantomJS()
159
+        driver.set_page_load_timeout(10)
160
+
161
+        # 数据库连接
162
+        conn = Mysql.createScrapyConn()
163
+        # 获取所有需要爬取的电视剧
164
+        sql = """
165
+            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
166
+        """
167
+        # rows = Mysql.getAll(sql)
168
+        rows = Mysql.getAll(sql, conn=conn)
169
+        for row in rows:
170
+            _id = row['id']
171
+            tv_name = row['tv_name']
172
+            tengxun_url = row['tengxun_url']
173
+            print tengxun_url
174
+            # 打开主页
175
+            try:
176
+                driver.get(tengxun_url)
177
+            except:
178
+                driver.execute_script('window.stop()')
179
+            
180
+            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
181
+            cats_set = set()
182
+            for cat in cats:
183
+                cat_name = cat.find_element_by_xpath('.').text
184
+                cats_set.add(cat_name)
185
+            #存入数据库
186
+            sql = """
187
+                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
188
+            """
189
+            sql = sql % (' '.join(cats_set), _id)
190
+            # conn.update(sql)
191
+            Mysql.update(sql, conn=conn)
192
+        driver.quit()
193
+
194
+if __name__ == '__main__':
195
+    if len(sys.argv) != 2:
196
+        print '没有输入参数,退出'
197
+        sys.exit(0)
198
+    print 'method name is ' + sys.argv[1]
199
+    obj = DSJ_Categories()
200
+    try:
201
+        getattr(obj, sys.argv[1])()
202
+    except Exception, e:
203
+        print e

+ 100 - 0
task_scrapy/scrapy_all.py

@@ -0,0 +1,100 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def parse_playtimes():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name, url, playtimes from scrapy.wangju_all_url order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    for row in rows:
26
+        _id = row['id']
27
+        tv_name = row['tv_name']
28
+        url = row['url']
29
+        playtimes = row['playtimes']
30
+
31
+        if playtimes is not None and len(playtimes.split('*')) == 2:
32
+            first_num, second_num = playtimes.split('*')
33
+            first_num = float(first_num)
34
+            second_num = int(second_num)
35
+
36
+            playtimes_new = first_num * second_num
37
+            sql = """
38
+                update scrapy.wangju_all_url set playtimes = '%s' where url = '%s'
39
+            """
40
+            sql = sql % (str(int(playtimes_new)), url)
41
+            Mysql.execute(sql, conn=conn)
42
+            
43
+def update_fields():
44
+    conn = Mysql.createOfflineConn()
45
+
46
+    sql = """
47
+        select id, tv_name, score, playtimes, source from scrapy.wangju_all_url order by id asc
48
+    """
49
+
50
+    rows = Mysql.getAll(sql, conn=conn)
51
+
52
+    for row in rows:
53
+        _id = row['id']
54
+        tv_name = row['tv_name']
55
+        score = row['score']
56
+        playtimes = row['playtimes']
57
+
58
+        source = row['source']
+        sql = None
59
+        if 'pptv' == source:
60
+            sql = """
61
+                update scrapy.wangju_url set pptv_score = '%s', pptv_playtimes = '%s' where id = %s
62
+            """
63
+
64
+        if 'youku' == source:
65
+            sql = """
66
+                update scrapy.wangju_url set youku_score = '%s', youku_playtimes = '%s' where id = %s
67
+            """
68
+
69
+        if 'sohu' == source:
70
+            sql = """
71
+                update scrapy.wangju_url set sohu_score = '%s', sohu_playtimes = '%s' where id = %s
72
+            """
73
+
74
+        if 'leshi' == source:
75
+            sql = """
76
+                update scrapy.wangju_url set leshi_score = '%s', leshi_playtimes = '%s' where id = %s
77
+            """
78
+
79
+        if 'huashutv' == source:
80
+            sql = """
81
+                update scrapy.wangju_url set huashutv_score = '%s', huashutv_playtimes = '%s' where id = %s
82
+            """
83
+
84
+        if 'iqiyi' == source:
85
+            sql = """
86
+                update scrapy.wangju_url set iqiyi_score = '%s', iqiyi_playtimes = '%s' where id = %s
87
+            """
88
+
89
+        if 'tengxun' == source:
90
+            sql = """
91
+                update scrapy.wangju_url set tengxun_score = '%s', tengxun_playtimes = '%s' where id = %s
92
+            """
93
+        # skip rows whose source has no matching update statement
+        if sql is None:
+            continue
+        sql = sql % (score, playtimes, _id)
94
+        Mysql.execute(sql, conn=conn)
95
+        
96
+
97
+        
98
+if __name__ == '__main__':
99
+    # parse_playtimes()
100
+    update_fields()
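+
+# parse_playtimes() expects playtimes stored as '<number>*<multiplier>'; for a
+# hypothetical stored value '1.2*10000' it computes int(1.2 * 10000) = 12000 and
+# writes the string '12000' back to scrapy.wangju_all_url for that url.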

+ 143 - 0
task_scrapy/scrapy_gongzhonghao_count.py

@@ -0,0 +1,143 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+import collections
8
+
9
+from selenium import webdriver
10
+from urllib import quote
11
+
12
+from fty_util.common import Mysql
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+def scrapy_website():
18
+    conn = Mysql.createOfflineConn()
19
+
20
+    # 将网站url和名称 放入有序字典中
21
+    websites_dict = collections.OrderedDict()
22
+    sql = """
23
+        select name, account from odl.basic_weixin_subscribe where is_delete != 1 order by id asc
24
+    """
25
+    websites = Mysql.getAll(sql, conn=conn)
26
+    for website in websites:
27
+        name = website['name']
28
+        account = website['account']
29
+        websites_dict[account] = name
30
+    driver = webdriver.Firefox()
31
+    driver.set_page_load_timeout(10)
32
+
33
+    start_url = 'http://weixin.sogou.com/'
34
+
35
+    sql = """
36
+        select tv_id, tv_name from odl.ad_tv_lib where tv_id order by id asc
37
+    """
38
+
39
+    tvs = Mysql.getAll(sql, conn=conn)
40
+
41
+    for tv in tvs:
42
+        tv_id = tv['tv_id']
43
+        tv_name = tv['tv_name']
44
+        try:
45
+            driver.get(start_url)
46
+        except Exception, e:
47
+            pass
48
+        try:
49
+            input_box = driver.find_element_by_id('upquery')
50
+            submit_button = driver.find_element_by_class_name('swz')
51
+        except Exception, e:
52
+            driver.refresh()
53
+        # 搜索条件
54
+        try:
55
+            input_box.clear()
56
+            input_box.send_keys(tv_name)
57
+            submit_button.click()
58
+        except Exception, e:
59
+            print '点击请求失败'
60
+
61
+        for account in websites_dict:
62
+            name = websites_dict.get(account)
63
+            input_box = None
64
+            submit_button = None
65
+
66
+            time.sleep(5)
67
+
68
+            js = 'document.getElementsByClassName("time-box float")[2].style.display="block"'
69
+            driver.execute_script(js)
70
+            js = 'document.getElementsByClassName("s-sea")[0].value = "' + account + '"'
71
+            driver.execute_script(js)
72
+            js = 'document.getElementById("search_enter").click()'
73
+            driver.execute_script(js)
74
+                # s_sea = driver.find_element_by_class_name('s-sea')
75
+                # search_enter = driver.find_element_by_id('search_enter')
76
+                # s_sea.clear()
77
+                # s_sea.send_keys(account)
78
+                # search_enter.click()
79
+            
80
+            time.sleep(10)
81
+            driver.execute_script('window.stop()')
82
+            # driver.refresh()
83
+            # 分页块
84
+            page = None
85
+            try:
86
+                page = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
87
+            except:
88
+                pass
89
+            count = 0
90
+            # 如果分页不存在,说明记录在十条以内或没有记录
91
+            if page is None or len(page) == 0:
92
+                try:
93
+                    divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
94
+                    if divs is not None and len(divs) > 0:
95
+                        count = len(divs)
96
+                except Exception, e:
97
+                    count = 0
98
+            #  如果分页存在,判断最后一页是不是10
99
+            else:
100
+                try:
101
+                    page_nums = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
102
+                    max_page_num = 1
103
+                    max_page_href= ''
104
+                    for page_num in page_nums:
105
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
106
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
107
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
108
+
109
+                        # 如果只是数字
110
+                        if page_num_text.isdigit():
111
+                            page_num_text = int(page_num_text)
112
+                            if page_num_text > max_page_num:
113
+                                max_page_num = page_num_text
114
+                                max_page_href = href
115
+                        # 如果是下一页字符串
116
+                        elif page_num_text == '下一页':
117
+                            break
118
+
119
+                    try:
120
+                        driver.get(max_page_href)
121
+                    except Exception, e:
122
+                        pass
123
+                    try:
124
+                        divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
125
+                        if divs is not None and len(divs) > 0:
126
+                            count = len(divs)
127
+                    except Exception, e:
128
+                        count = 0
129
+                    
130
+                    count = (max_page_num - 1) * 10 + count
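+                    # each full result page holds 10 items, so e.g. max_page_num = 5
+                    # with 7 items left on the last page gives (5 - 1) * 10 + 7 = 47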
131
+                except:
132
+                    continue
133
+
134
+            if count != 0:
135
+                sql = """
136
+                    insert into scrapy.scrapy_subscribe_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
137
+                """
138
+                value = (tv_id, tv_name, 2, name, '', count)
139
+                Mysql.insertOne(sql, value=value, conn=conn)
140
+    driver.quit()
141
+
142
+if __name__ == '__main__':
143
+    scrapy_website()

+ 113 - 0
task_scrapy/scrapy_huashutv.py

@@ -0,0 +1,113 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_huashutv is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://www.wasu.cn/Search/show/k/' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@id="agg_list"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div[1]/a[1]').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_huashutv = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_huashutv = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def scrapy_data():
64
+    conn = Mysql.createOfflineConn()
65
+
66
+    sql = """
67
+        select id, tv_name, url_huashutv from scrapy.wangju_url where url_huashutv is not null and url_huashutv != '' order by id asc
68
+    """
69
+
70
+    rows = Mysql.getAll(sql, conn=conn)
71
+
72
+    driver = webdriver.Firefox()
73
+    driver.set_page_load_timeout(10)
74
+
75
+    for row in rows:
76
+        _id = row['id']
77
+        tv_name = row['tv_name']
78
+        url_huashutv = row['url_huashutv']
79
+
80
+        urls = url_huashutv.split(',')
81
+        for url in urls:
82
+            if 'www.wasu.cn' not in url:
83
+                continue
84
+            try:
85
+                driver.get(url)
86
+            except Exception, e:
87
+                driver.execute_script('window.stop()')
88
+            
89
+            try:
90
+                href = driver.find_element_by_xpath('//div[@id="con_telelist_1"]/ul/li[1]/a').get_attribute('href')
91
+            except Exception, e:
92
+                href = None
93
+            
94
+            if href is not None and 'www.wasu.cn' in href:
95
+                print href
96
+                try:
97
+                    driver.get(href)
98
+                except Exception, e:
99
+                    driver.execute_script('window.stop()')
100
+                try:
101
+                    content = driver.find_element_by_xpath('//div[@id="play_vod_hits"]').get_attribute('textContent')
102
+                except Exception, e:
103
+                    continue
104
+                
105
+                sql = """
106
+                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
107
+                """
108
+                value = (_id, tv_name, url, '', content, 'huashutv')
109
+                Mysql.insertOne(sql, value=value, conn=conn)
110
+    driver.quit()
111
+if __name__ == '__main__':
112
+    scrapy_data()
113
+    # scrapy_url()
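+    # scrapy_url() must have been run at least once before scrapy_data(), since
+    # scrapy_data() only selects rows whose url_huashutv column scrapy_url() populated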

+ 294 - 0
task_scrapy/scrapy_iqiyi.py

@@ -0,0 +1,294 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+iQIYI crawling rules
+1. scrapy_url: crawl the search pages and collect the URLs of the TV-series pages that were found
+2. scrapy_data: open each detail page found by the search and crawl its description and the per-episode URLs (play counts are only shown on the episode pages)
+3. scrapy_play_page: open the playback page of episode 1 and crawl the play count
+4. todo: crawl every page daily
+
+The detail page reached from the search shows neither play counts nor comment counts, so every page has to be parsed one by one:
+search page --> search detail page --> playback page (episode 1 is enough) --> real detail page (crawl the play count; comment counts cannot be crawled yet)
+So it is enough to pick up the play count on the playback page.
26
+"""
27
+
28
+# 爬取搜索页面
29
+def scrapy_url():
30
+    conn = Mysql.createOfflineConn()
31
+    sql = """
32
+        select id, tv_name from scrapy.wangju_url order by id asc
33
+    """
34
+    rows = Mysql.getAll(sql, conn=conn)
35
+    for row in rows:
36
+        driver = webdriver.PhantomJS()
37
+        driver.set_page_load_timeout(10)
38
+        _id = row['id']
39
+        tv_name = row['tv_name']
40
+        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
41
+        try:
42
+            driver.get(url)
43
+        except Exception, e:
44
+            driver.execute_script('window.stop()')
45
+        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
46
+        for li in lis:
47
+            try:
48
+                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
49
+                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
50
+                if 'www.iqiyi.com/lib' in href:
51
+                    print href
52
+                    sql = """
53
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
54
+                    """
55
+                    value = (_id, tv_name, href, title, '', 'iqiyi')
56
+                    Mysql.insertOne(sql, value=value, conn=conn)
57
+                    time.sleep(1)
58
+            except Exception, e:
59
+                print e
60
+                continue
61
+        driver.quit()
62
+
63
+# 爬取搜索到的详情页面
64
+def scrapy_data():
65
+    
66
+    conn = Mysql.createOfflineConn()
67
+    # sql = """
68
+    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
69
+    # """
70
+    sql = """
71
+        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
72
+    """
73
+    rows = Mysql.getAll(sql, conn=conn)
74
+
75
+    for row in rows:
76
+        driver = webdriver.PhantomJS()
77
+        driver.set_page_load_timeout(10)
78
+        _id = row['id']
79
+        tv_name = row['tv_name']
80
+        url = row['url']
81
+        title = row['title']
82
+        
83
+        try:
84
+            driver.get(url)
85
+        except Exception, e:
86
+            driver.execute_script('window.stop()')
87
+        
88
+        # 爬取内容
89
+        try:
90
+            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
91
+        except Exception, e:
92
+            content = ''
93
+        
94
+        # 爬取分集
95
+        try:
96
+            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
97
+        except Exception, e:
98
+            # 如果没有隐藏的集数,则用显示的集数
99
+            try:
100
+                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
101
+            except Exception, e:
102
+                pagelist = None
103
+                pass
104
+
105
+        if pagelist is not None:
106
+            # 如果集数存在,则爬取每集url,用于爬取播放量和评论量
107
+            data_list = []
108
+            for page in pagelist:
109
+                num = page.find_element_by_xpath('./a').get_attribute('title')
110
+                num = num.replace(' ', '').replace('\n', '')
111
+                href = page.find_element_by_xpath('./a').get_attribute('href')
112
+                if 'www.iqiyi.com' in href:
113
+                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
114
+            # 插入分集数据
115
+            if data_list is not None and len(data_list) > 0:
116
+                sql = """
117
+                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
118
+                """
119
+                Mysql.insertMany(sql, data_list, conn)
120
+        
121
+        # 更新内容
122
+        sql = """
123
+            update scrapy.wangju_all_url set content = %s where url = %s
124
+        """
125
+        value = (content, url)
126
+        Mysql.execute(sql, param=value, conn=conn)
127
+        driver.quit()
128
+
129
+# 爬取播放页面
130
+def scrapy_play_page():
131
+    conn = Mysql.createOfflineConn()
132
+    sql = """
133
+        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
134
+    """
135
+    rows = Mysql.getAll(sql, conn=conn)
136
+
137
+    for row in rows:
138
+        driver = webdriver.Firefox()
139
+        driver.set_page_load_timeout(10)
140
+        _id = row['id']
141
+        tv_name = row['tv_name']
142
+        url = row['url']
143
+        if 'www.iqiyi.com' not in url:
144
+            driver.quit()
145
+            continue
146
+        else:
147
+            try:
148
+                driver.get(url)
149
+            except Exception, e:
150
+                print e
151
+                driver.execute_script('window.stop()')
152
+            try:
153
+                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
154
+            except Exception, e:
155
+                print e
156
+                count = 0
157
+            
158
+            print count
159
+
160
+            sql = """
161
+                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
162
+            """
163
+            sql = sql % (count, _id)
164
+            Mysql.execute(sql, conn=conn)
165
+
166
+        driver.quit()
167
+
168
+# 每天爬取播放页面(爱奇艺只有每集的评论数量,没有每集播放数量)
169
+def scrapy_play_page_everyday():
170
+    conn = Mysql.createOfflineConn()
171
+    sql = """
172
+        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
173
+    """
174
+    rows = Mysql.getAll(sql, conn=conn)
175
+
176
+    for row in rows:
177
+        driver = webdriver.Firefox()
178
+        driver.set_page_load_timeout(20)
179
+        _id = row['id']
180
+        tv_name = row['tv_name']
181
+        num = row['num']
182
+        url = row['url']
183
+        if 'www.iqiyi.com' not in url:
184
+            driver.quit()
185
+            sql = """
186
+                delete from scrapy.wangju_fenji_url where url = '%s'
187
+            """
188
+            sql = sql % (url,)
189
+            Mysql.execute(sql, conn=conn)
190
+            continue
191
+        else:
192
+            try:
193
+                driver.get(url)
194
+            except Exception, e:
195
+                print e
196
+                driver.execute_script('window.stop()')
197
+            try:
198
+                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
199
+            except Exception, e:
200
+                print e
201
+                commenttimes = ''
202
+            
203
+            print url
204
+            print commenttimes
205
+        
206
+        # sql = """
207
+        #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
208
+        # """
209
+        # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
210
+
211
+def parse_wangju_all_url_data():
212
+    conn = Mysql.createOfflineConn()
213
+
214
+    sql = """
215
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
216
+    """
217
+
218
+    rows = Mysql.getAll(sql, conn=conn)
219
+
220
+    driver = webdriver.Firefox()
221
+    driver.set_page_load_timeout(10)
222
+
223
+    driver2 = webdriver.Firefox()
224
+    driver2.set_page_load_timeout(10)
225
+    for row in rows:
226
+        _id = row['id']
227
+        tv_name = row['tv_name']
228
+        url = row['url']
229
+        
230
+        try:
231
+            driver.get(url)
232
+        except Exception, e:
233
+            print e
234
+            driver.execute_script('window.stop()')
235
+        try:
236
+            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
237
+            score = score.replace(' ', '').replace('\n', '')
238
+        except:
239
+            score = ''
240
+
241
+        try:
242
+            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
243
+        except Exception, e:
244
+            pass
245
+            pagelist = None
246
+        try:
247
+            if pagelist is not None:
248
+                page_dict = dict()
249
+                for page in pagelist:
250
+                    try:
251
+                        episode = page.find_element_by_xpath('./a').get_attribute('href')
252
+                        episode_text = page.find_element_by_xpath('./a').text
253
+                        page_dict[episode_text] = episode
254
+                    except:
255
+                        continue
256
+                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
257
+                    try:
258
+                        driver2.get(page_dict.get('1'))
259
+                        time.sleep(5)
260
+                    except Exception, e:
261
+                        print e
262
+                        driver2.execute_script('window.stop()')
263
+                    try:
264
+                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
265
+                    except Exception, e:
266
+                        print e
267
+                        count = '0'
268
+                    print count
269
+                    sql = """
270
+                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
271
+                    """
272
+                    sql = sql % (score, count, url)
273
+                    Mysql.execute(sql, conn=conn)
274
+                else:
275
+                    sql = """
276
+                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
277
+                    """
278
+                    sql = sql % (url, 'iqiyi')
279
+                    Mysql.execute(sql, conn=conn)
280
+            else:
281
+                sql = """
282
+                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
283
+                """
284
+                sql = sql % (url, 'iqiyi')
285
+                Mysql.execute(sql, conn=conn)
286
+        except Exception, e:
287
+            continue
288
+
289
+if __name__ == '__main__':
290
+    # scrapy_url()
291
+    # scrapy_data()
292
+    # scrapy_play_page()
293
+    # scrapy_play_page_everyday()
294
+    parse_wangju_all_url_data()

+ 59 - 0
task_scrapy/scrapy_kankan.py

@@ -0,0 +1,59 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+from selenium import webdriver
8
+from urllib import quote
9
+from fty_util.common import Mysql
10
+
11
+reload(sys)
12
+sys.setdefaultencoding('utf8')
13
+
14
+conn = Mysql.createOfflineConn()
15
+
16
+sql = """
17
+    select id, tv_name from scrapy.wangju_url where url_kankan is null order by id asc
18
+"""
19
+
20
+rows = Mysql.getAll(sql, conn=conn)
21
+
22
+driver = webdriver.Firefox()
23
+driver.set_page_load_timeout(10)
24
+for row in rows:
25
+    
26
+    _id = row['id']
27
+    tv_name = row['tv_name']
28
+
29
+    url = 'http://search.kankan.com/search.php?keyword=' + quote(str(tv_name))
30
+    need_blank = True
31
+    try:
32
+        driver.get(url)
33
+    except Exception, e:
34
+        driver.execute_script('window.stop()')
35
+
36
+    # 解析第一页
37
+    divs = driver.find_elements_by_xpath('//div[@class="searchmain"]/div')
38
+    for div in divs:
39
+        try:
40
+            title = div.find_element_by_xpath('//div[@class="reuslt_tt"]/h2/a').get_attribute('title')
41
+            href = div.find_element_by_xpath('./div/a').get_attribute('href')
42
+            _type = div.find_element_by_xpath('./div/div[2]').get_attribute('textContent')
43
+            sources = div.find_element_by_xpath('//ul[@class="sitelist"]').get_attribute('textContent')
44
+            if tv_name == title and u'电视剧' in _type and u'响巢看看' in sources:
45
+                sql = """
46
+                    update scrapy.wangju_url set url_kankan = '%s' where id = %s
47
+                """
48
+                sql = sql % (href, _id)
49
+                Mysql.execute(sql, conn=conn)
50
+                need_blank = False
51
+        except Exception, e:
52
+            continue
53
+    if need_blank:
54
+        sql = """
55
+            update scrapy.wangju_url set url_kankan = '%s' where id = %s
56
+        """
57
+        sql = sql % ('', _id)
58
+        Mysql.execute(sql, conn=conn)
59
+driver.quit()
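+
+# Note: unlike the sibling task_scrapy scripts, this one has no __main__ guard, so the
+# search crawl above runs as soon as the module is executed or imported.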

+ 186 - 0
task_scrapy/scrapy_leshi.py

@@ -0,0 +1,186 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+LeTV (le.com) crawling rules
18
+
19
+
20
+"""
21
+def scrapy_url():
22
+    conn = Mysql.createOfflineConn()
23
+
24
+    sql = """
25
+        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
26
+    """
27
+
28
+    rows = Mysql.getAll(sql, conn=conn)
29
+
30
+    driver = webdriver.Firefox()
31
+    driver.set_page_load_timeout(10)
32
+
33
+    for row in rows:
34
+        _id = row['id']
35
+        tv_name = row['tv_name']
36
+
37
+        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
+        need_blank = True
38
+
39
+        try:
40
+            driver.get(url)
41
+        except Exception, e:
42
+            driver.execute_script('window.stop()')
43
+
44
+        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
45
+        href_list = []
46
+        for div in divs:
47
+            try:
48
+                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
49
+                href_list.append(href)
50
+            except Exception, e:
51
+                pass
52
+        if len(href_list) > 0:
53
+            sql = """
54
+                update scrapy.wangju_url set url_leshi = '%s' where id = %s
55
+            """
56
+            sql = sql % (','.join(href_list), _id)
57
+            Mysql.execute(sql, conn=conn)
58
+            need_blank = False
59
+        if need_blank:
60
+            sql = """
61
+                update scrapy.wangju_url set url_leshi = '%s' where id = %s
62
+            """
63
+            sql = sql % ('', _id)
64
+            Mysql.execute(sql, conn=conn)
65
+    driver.quit()
66
+
67
+def scrapy_data():
68
+    conn = Mysql.createOfflineConn()
69
+
70
+    sql = """
71
+        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
72
+    """
73
+
74
+    rows = Mysql.getAll(sql, conn=conn)
75
+
76
+    driver = webdriver.Firefox()
77
+    driver.set_page_load_timeout(10)
78
+
79
+    for row in rows:
80
+        _id = row['id']
81
+        tv_name = row['tv_name']
82
+        url_leshi = row['url_leshi']
83
+
84
+        urls = url_leshi.split(',')
85
+        for url in urls:
86
+            if 'www.le.com' not in url:
87
+                continue
88
+            try:
89
+                driver.get(url)
90
+            except Exception, e:
91
+                driver.execute_script('window.stop()')
92
+            
93
+            try:
94
+                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
95
+            except Exception, e:
96
+                href = None
97
+            
98
+
99
+            if href is not None and 'www.le.com' in href:
100
+                print href
101
+                try:
102
+                    driver.get(href)
103
+                except Exception, e:
104
+                    driver.execute_script('window.stop()')
105
+                try:
106
+                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
107
+                except Exception, e:
108
+                    continue
109
+                
110
+                sql = """
111
+                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
112
+                """
113
+                value = (_id, tv_name, url, '', content, 'leshi')
114
+                Mysql.insertOne(sql, value=value, conn=conn)
115
+    driver.quit()
116
+
117
+def parse_wangju_all_url_title():
118
+    conn = Mysql.createOfflineConn()
119
+
120
+    sql = """
121
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
122
+    """
123
+
124
+    rows = Mysql.getAll(sql, conn=conn)
125
+
126
+    driver = webdriver.Firefox()
127
+    driver.set_page_load_timeout(10)
128
+    for row in rows:
129
+        _id = row['id']
130
+        tv_name = row['tv_name']
131
+        url = row['url']
132
+
133
+        try:
134
+            driver.get(url)
135
+        except Exception, e:
136
+            print e
137
+            driver.execute_script('window.stop()')
138
+
139
+        try:
140
+            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
141
+        except Exception, e:
142
+            title = ''
143
+
144
+        sql = """
145
+            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
146
+        """
147
+        sql = sql % (title, 'leshi', url)
148
+        Mysql.execute(sql, conn=conn)
149
+
150
+def parse_content():
151
+    conn = Mysql.createOfflineConn()
152
+
153
+    sql = """
154
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
155
+    """
156
+    rows = Mysql.getAll(sql, conn=conn)
157
+
158
+    for row in rows:
159
+        _id = row['id']
160
+        tv_name = row['tv_name']
161
+        url = row['url']
162
+        content = row['content']
163
+
164
+        import re
165
+        m = re.search(ur'([0-9]+[.]?)+', content)
166
+        score = '0'
167
+        if m is not None:
168
+            score = m.group(0)
169
+
170
+        play = '0'
171
+        m = re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content)
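+        # group(0) keeps the whole match including the label and unit (e.g. the string
+        # 播放数:1.2亿 for a hypothetical content value), and [(亿)(万)] is a character
+        # class matching any one of ( ) 亿 万, presumably intended as just 亿 or 万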
172
+        if m is not None:
173
+            play = m.group(0)
174
+
175
+        sql = """
176
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
177
+        """
178
+        sql = sql % (score, play, url)
179
+        Mysql.execute(sql, conn=conn)
180
+
181
+if __name__ == '__main__':
182
+    # scrapy_data()
183
+    # scrapy_url()
184
+    
185
+    # parse_wangju_all_url_title()
186
+    parse_content()

+ 146 - 0
task_scrapy/scrapy_pptv.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_pptv is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://search.pptv.com/s_video?kw=' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@id="search-result"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div[2]/dl/dd/p/a').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_pptv = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_pptv = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def parse_unique_url():
64
+    conn = Mysql.createOfflineConn()
65
+    sql = """
66
+        select id, tv_name, url_pptv from scrapy.wangju_url where url_pptv is not null and url_pptv != '' and pptv_finished is null order by id asc
67
+    """
68
+    rows = Mysql.getAll(sql, conn=conn)
69
+
70
+    driver = webdriver.Firefox()
71
+    driver.set_page_load_timeout(10)
72
+    for row in rows:
73
+        _id = row['id']
74
+        tv_name = row['tv_name']
75
+        url_pptv = row['url_pptv']
76
+
77
+        urls = url_pptv.split(',')
78
+        for url in urls:
79
+            try:
80
+                driver.get(url)
81
+            except Exception, e:
82
+                try:
83
+                    driver.execute_script('window.stop()')
84
+                except:
85
+                    continue
86
+            try:
87
+                nav_type = driver.find_element_by_xpath('//div[@class="module module-bread-nav cf"]/p/a').text
88
+                if nav_type != u'电视剧':
89
+                    continue
90
+                else:
91
+                    title = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[1]/h3').text
92
+                    content = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[2]').get_attribute('textContent')
93
+                    
94
+                    sql = """
95
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
96
+                    """
97
+                    value = (_id, tv_name, url, title, content, 'pptv')
98
+                    Mysql.insertOne(sql, value=value, conn=conn)
99
+            except Exception, e:
100
+                pass
101
+        sql = """
102
+            update scrapy.wangju_url set pptv_finished = '%s' where id = %s
103
+        """
104
+        sql = sql % ('1', _id)
105
+        Mysql.execute(sql, conn=conn)
106
+    
107
+    driver.quit()
108
+
109
+def scrapy_fenji():
110
+    pass
111
+
112
+def parse_content():
113
+    conn = Mysql.createOfflineConn()
114
+
115
+    sql = """
116
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'pptv' order by id asc
117
+    """
118
+    rows = Mysql.getAll(sql, conn=conn)
119
+
120
+    for row in rows:
121
+        _id = row['id']
122
+        tv_name = row['tv_name']
123
+        url = row['url']
124
+        content = row['content']
125
+
126
+        import re
127
+        m = re.search(ur'评分:\d+(.)\d+', content)
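+        # the unescaped (.) matches any single character here; it is presumably meant
+        # as a literal decimal point, e.g. matching a hypothetical 评分:8.5 substring
+        # (group(0) keeps the 评分: label as part of the stored score)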
128
+        score = '0'
129
+        if m is not None:
130
+            score = m.group(0)
131
+
132
+        play = '0'
133
+        m = re.search(ur'播放:\d+(.)\d+[(亿)(万)]', content)
134
+        if m is not None:
135
+            play = m.group(0)
136
+
137
+        sql = """
138
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'pptv'
139
+        """
140
+        sql = sql % (score, play, url)
141
+        Mysql.execute(sql, conn=conn)
142
+
143
+if __name__ == '__main__':
144
+    # scrapy_url()
145
+    # parse_unique_url()
146
+    parse_content()

+ 139 - 0
task_scrapy/scrapy_sohu.py

@@ -0,0 +1,139 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_sohu is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://so.tv.sohu.com/mts?box=1&wd=' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@class="wrap cfix"]/div')
41
+        href_list = []
42
+        for div in divs:
43
+            try:
44
+                href = div.find_element_by_xpath('./div/div[2]/div[1]/h2/a').get_attribute('href')
45
+                href_list.append(href)
46
+            except Exception, e:
47
+                pass
48
+        if len(href_list) > 0:
49
+            sql = """
50
+                update scrapy.wangju_url set url_sohu = '%s' where id = %s
51
+            """
52
+            sql = sql % (','.join(href_list), _id)
53
+            Mysql.execute(sql, conn=conn)
54
+            need_blank = False
55
+        if need_blank:
56
+            sql = """
57
+                update scrapy.wangju_url set url_sohu = '%s' where id = %s
58
+            """
59
+            sql = sql % ('', _id)
60
+            Mysql.execute(sql, conn=conn)
61
+    driver.quit()
62
+
63
+def scrapy_data():
64
+    conn = Mysql.createOfflineConn()
65
+
66
+    sql = """
67
+        select id, tv_name, url_sohu from scrapy.wangju_url where url_sohu is not null and url_sohu != '' order by id asc
68
+    """
69
+
70
+    rows = Mysql.getAll(sql, conn=conn)
71
+
72
+    driver = webdriver.Firefox()
73
+    driver.set_page_load_timeout(10)
74
+
75
+    for row in rows:
76
+        _id = row['id']
77
+        tv_name = row['tv_name']
78
+        url_sohu = row['url_sohu']
79
+
80
+        urls = url_sohu.split(',')
81
+        for url in urls:
82
+            try:
83
+                driver.get(url)
84
+            except Exception, e:
85
+                driver.execute_script('window.stop()')
86
+            
87
+            try:
88
+                title = ''
89
+                content = driver.find_element_by_xpath('//div[@class="infoR r"]').get_attribute('textContent')
90
+            except Exception, e:
91
+                try:
92
+                    title = driver.find_element_by_xpath('//div[@class="drama-name area rel cfix "]').get_attribute('textContent')
93
+                    content = driver.find_element_by_xpath('//div[@class="drama-infoR"]').get_attribute('textContent')
94
+                except Exception, e:
95
+                    continue
96
+            
97
+            sql = """
98
+                insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
99
+            """
100
+            value = (_id, tv_name, url, title, content, 'sohu')
101
+            Mysql.insertOne(sql, value=value, conn=conn)
102
+
103
+    driver.quit()
104
+
105
+def parse_wangju_all_url_data():
106
+    conn = Mysql.createOfflineConn()
107
+
108
+    sql = """
109
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'sohu' order by id asc
110
+    """
111
+
112
+    rows = Mysql.getAll(sql, conn=conn)
113
+    for row in rows:
114
+        _id = row['id']
115
+        tv_name = row['tv_name']
116
+        url = row['url']
117
+        content = row['content']
118
+        
119
+        import re
120
+        m = re.search(ur'评分:\d+(.)\d+', content)
121
+        score = '0'
122
+        if m is not None:
123
+            score = m.group(0)
124
+        
125
+        play = '0'
126
+        m = re.search(ur'总播放:\d+(.)\d+[(亿)(万)]', content)
127
+        if m is not None:
128
+            play = m.group(0)
129
+        
130
+        sql = """
131
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'sohu'
132
+        """
133
+        sql = sql % (score, play, url)
134
+        Mysql.execute(sql, conn=conn)
135
+
136
+if __name__ == '__main__':
137
+    # scrapy_data()
138
+    # scrapy_url()
139
+    parse_wangju_all_url_data()
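
A quick standalone check of the two regexes used in parse_wangju_all_url_data above (Python 2, like the rest of these scripts; the sample content string is hypothetical, the real one comes from the Sohu detail page). Note that, as in the script, the stored value keeps the "评分:" / "总播放:" prefix:

# -*- coding: utf-8 -*-
import re

content = u'... 评分:8.2 ... 总播放:1.3亿 ...'          # hypothetical page text
m = re.search(ur'评分:\d+(.)\d+', content)
score = m.group(0) if m is not None else '0'             # -> u'评分:8.2'
m = re.search(ur'总播放:\d+(.)\d+[(亿)(万)]', content)
play = m.group(0) if m is not None else '0'              # -> u'总播放:1.3亿'
print score, play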

+ 231 - 0
task_scrapy/scrapy_tengxun.py

@@ -0,0 +1,231 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+"""
18
+Tencent Video (v.qq.com) scraping rules
19
+1. scrapy_url: crawl the search result page and keep the URL most likely to be the drama's detail page
20
+2. scrapy_data: open that detail page and crawl the score plus each episode's URL (play counts are only shown on the episode pages)
21
+3. todo: crawl every detail page
22
+
23
+The detail pages reached from search expose neither play counts nor comment counts, so the pages have to be parsed one by one:
24
+search page --> search detail page --> play page (only episode 1's play page is needed),
25
+and the play count is read from that play page. A condensed sketch of this flow follows this file's diff.
26
+"""
27
+
28
+def scrapy_url():
29
+    conn = Mysql.createOfflineConn()
30
+    sql = """
31
+        select id, tv_name from scrapy.wangju_url order by id asc
32
+    """
33
+    rows = Mysql.getAll(sql, conn=conn)
34
+    for row in rows:
35
+        driver = webdriver.PhantomJS()
36
+        driver.set_page_load_timeout(10)
37
+        _id = row['id']
38
+        tv_name = row['tv_name']
39
+        url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
40
+        try:
41
+            driver.get(url)
42
+        except Exception, e:
43
+            driver.execute_script('window.stop()')
44
+
45
+        divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
46
+        for div in divs:
47
+            try:
48
+                title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
49
+                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
50
+                if 'v.qq.com/detail' in href:
51
+                    print href
52
+                    sql = """
53
+                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
54
+                    """
55
+                    value = (_id, tv_name, href, title, '', 'tengxun')
56
+                    Mysql.insertOne(sql, value=value, conn=conn)
57
+                    time.sleep(1)
58
+            except Exception, e:
59
+                print e
60
+                continue
61
+        driver.quit()
62
+
63
+# Crawl the detail pages found by the search step
64
+def scrapy_data():
65
+    conn = Mysql.createOfflineConn()
66
+
67
+    # sql = """
68
+    #     select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
69
+    # """
70
+    sql = """
71
+        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
72
+    """
73
+    rows = Mysql.getAll(sql, conn=conn)
74
+    for row in rows:
75
+        driver = webdriver.PhantomJS()
76
+        driver.set_page_load_timeout(10)
77
+        _id = row['id']
78
+        tv_name = row['tv_name']
79
+        url = row['url']
80
+
81
+        try:
82
+            driver.get(url)
83
+        except Exception, e:
84
+            driver.execute_script('window.stop()')
85
+
86
+        # 爬取内容
87
+        try:
88
+            content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
89
+        except Exception, e:
90
+            content = ''
91
+
92
+        try:
93
+            pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
94
+            if pagelist is not None:
95
+                data_list = []
96
+                for page in pagelist:
97
+                    num = page.find_element_by_xpath('./a/span').text
98
+                    num = num.replace(' ', '').replace('\n', '')
99
+                    href = page.find_element_by_xpath('./a').get_attribute('href')
100
+                    if 'v.qq.com' in href:
101
+                        data_list.append((_id, tv_name, num, href, 'tengxun'))
102
+                # 插入分集数据
103
+                if data_list is not None and len(data_list) > 0:
104
+                    sql = """
105
+                        insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
106
+                    """
107
+                    Mysql.insertMany(sql, data_list, conn)
108
+        except Exception, e:
109
+            pass
110
+        
111
+        # 更新内容
112
+        sql = """
113
+            update scrapy.wangju_all_url set content = %s where url = %s
114
+        """
115
+        value = (content, url)
116
+        Mysql.execute(sql, param=value, conn=conn)
117
+        driver.quit()
118
+
119
+# Crawl the episode-1 play page for the play count
120
+def scrapy_play_page():
121
+    conn = Mysql.createOfflineConn()
122
+    sql = """
123
+        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
124
+    """
125
+    rows = Mysql.getAll(sql, conn=conn)
126
+    for row in rows:
127
+        driver = webdriver.Firefox()
128
+        driver.set_page_load_timeout(10)
129
+        _id = row['id']
130
+        tv_name = row['tv_name']
131
+        url = row['url']
132
+        if 'v.qq.com' not in url:
133
+            driver.quit()
134
+            continue
135
+        else:
136
+            try:
137
+                driver.get(url)
138
+            except Exception, e:
139
+                print e
140
+                driver.execute_script('window.stop()')
141
+            try:
142
+                count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
143
+            except Exception, e:
144
+                print e
145
+                count = 0
146
+            
147
+            print count
148
+
149
+            sql = """
150
+                update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
151
+            """
152
+            sql = sql % (count, _id)
153
+            Mysql.execute(sql, conn=conn)
154
+
155
+        driver.quit()
156
+
157
+def parse_wangju_all_url_data():
158
+    conn = Mysql.createOfflineConn()
159
+
160
+    sql = """
161
+        select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
162
+    """
163
+
164
+    rows = Mysql.getAll(sql, conn=conn)
165
+
166
+    driver = webdriver.Firefox()
167
+    driver.set_page_load_timeout(10)
168
+
169
+    driver2 = webdriver.Firefox()
170
+    driver2.set_page_load_timeout(10)
171
+    for row in rows:
172
+        _id = row['id']
173
+        tv_name = row['tv_name']
174
+        url = row['url']
175
+        
176
+        try:
177
+            driver.get(url)
178
+        except Exception, e:
179
+            print e
180
+            driver.execute_script('window.stop()')
181
+        try:
182
+            score = driver.find_element_by_xpath('//div[@class="video_score"]').text
183
+            score = score.replace(' ', '').replace('\n', '')
184
+        except:
185
+            score = ''
186
+        
187
+        try:
188
+            pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
189
+        except:
190
+            pagelist = None
191
+        
192
+        try:
193
+            page_dict = dict()
194
+            if pagelist is not None:
195
+                for page in pagelist:
196
+                    episode = page.find_element_by_xpath('./a').get_attribute('href')
197
+                    episode_text = page.find_element_by_xpath('./a/span').text
198
+                    page_dict[episode_text] = episode
199
+            if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
200
+                try:
201
+                    driver2.get(page_dict.get('1'))
202
+                except Exception, e:
203
+                    print e
204
+                    driver2.execute_script('window.stop()')
205
+                try:
206
+                    count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
207
+                except Exception, e:
208
+                    print e
209
+                    count = 0
210
+                sql = """
211
+                    update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
212
+                """
213
+                sql = sql % (score, count, url)
214
+                Mysql.execute(sql, conn=conn)
215
+            else:
216
+                sql = """
217
+                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
218
+                """
219
+                sql = sql % (url, 'tengxun')
220
+                Mysql.execute(sql, conn=conn)
221
+        except Exception, e:
222
+            continue
223
+
224
+            
225
+        
226
+
227
+if __name__ == '__main__':
228
+    # scrapy_url()
229
+    # scrapy_data()
230
+    # scrapy_play_page()
231
+    parse_wangju_all_url_data()
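
A condensed sketch of the search --> detail --> episode-1 play page flow described in the module docstring above (Python 2; the Selenium XPaths are copied from the functions in this file, while PhantomJS and the live v.qq.com markup are assumptions, so treat this as illustrative rather than as the repo's actual entry point):

# -*- coding: utf-8 -*-
from urllib import quote
from selenium import webdriver

def fetch_playcount(tv_name):
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    try:
        # 1. search page: keep the first result that links to a v.qq.com/detail page
        driver.get('https://v.qq.com/x/search/?q=' + quote(str(tv_name)))
        detail_url = None
        for div in driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div'):
            try:
                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
            except Exception:
                continue
            if 'v.qq.com/detail' in href:
                detail_url = href
                break
        if detail_url is None:
            return None
        # 2. detail page: episode list; only episode 1 is needed for the play count
        driver.get(detail_url)
        first_episode = None
        for span in driver.find_elements_by_xpath('//div[@class="mod_episode"]/span'):
            try:
                if span.find_element_by_xpath('./a/span').text.strip() == '1':
                    first_episode = span.find_element_by_xpath('./a').get_attribute('href')
                    break
            except Exception:
                continue
        if first_episode is None or 'v.qq.com' not in first_episode:
            return None
        # 3. play page: the play count lives in the #mod_cover_playnum element
        driver.get(first_episode)
        return driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
    finally:
        driver.quit()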

+ 97 - 0
task_scrapy/scrapy_tianyancha.py

@@ -0,0 +1,97 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_tianyancha():
17
+    
18
+    conn = Mysql.createOfflineConn()
19
+    urls = []
20
+    for i in range(1, 33):
21
+        urls.append(str('http://hangzhou.tianyancha.com/search/p' + str(i) + '?key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92'))
22
+
23
+    driver = webdriver.Firefox()
24
+    driver.set_page_load_timeout(10)
25
+
26
+    for url in urls:
27
+        try:
28
+            driver.get(url)
29
+        except Exception, e:
30
+            print url
31
+            try:
32
+                driver.execute_script('window.stop()')
33
+            except Exception, e:
34
+                pass
35
+        time.sleep(10)
36
+        try:
37
+            divs = driver.find_elements_by_xpath('//div[@id="ng-view"]/div[2]/div/div/div[1]/div[3]/div')
38
+        except Exception, e:
39
+            continue
40
+        for div in divs:
41
+            try:
42
+                title = div.find_element_by_xpath('./div[2]/div/div[1]/a').get_attribute('textContent')
43
+                href = div.find_element_by_xpath('./div[2]/div/div[1]/a').get_attribute('href')
44
+
45
+                sql = """
46
+                    insert into scrapy.scrapy_tianyancha (name, url) values (%s, %s)
47
+                """
48
+                value = (title, href)
49
+                Mysql.insertOne(sql, value=value, conn=conn)
50
+            except Exception, e:
51
+                pass
52
+    driver.quit()
53
+
54
+def parse_detail():
55
+    
56
+    conn = Mysql.createOfflineConn()
57
+
58
+    sql = """
59
+        select id, url from scrapy.scrapy_tianyancha where content1 = '' or content1 is null order by id asc
60
+    """
61
+    rows = Mysql.getAll(sql, conn=conn)
62
+    driver = webdriver.Firefox()
63
+    driver.set_page_load_timeout(10)
64
+
65
+    for row in rows:
66
+        _id = row['id']
67
+        url = row['url']
68
+
69
+        try:
70
+            driver.get(url)
71
+        except Exception, e:
72
+            print url
73
+            try:
74
+                driver.execute_script('window.stop()')
75
+            except Exception, e:
76
+                pass
77
+        
78
+        time.sleep(5)
79
+        try:
80
+            content1 = driver.find_element_by_xpath('//div[@class="company_info_text"]').get_attribute('textContent')
81
+            content2_list = driver.find_elements_by_xpath('//div[@class="baseinfo-module-item"]')
82
+            content2 = ''
83
+            for content in content2_list:
84
+                content2 = content2 + content.find_element_by_xpath('.').get_attribute('textContent')
85
+        except Exception, e:
86
+            content1 = ''
87
+            content2 = ''
88
+        sql = """
89
+            update scrapy.scrapy_tianyancha set content1 = '%s', content2 = '%s' where id = %s
90
+        """
91
+        sql = sql % (content1, content2, _id)
92
+        Mysql.execute(sql, conn=conn)
93
+    driver.quit()
94
+
95
+if __name__ == '__main__':
96
+    # scrapy_tianyancha()
97
+    parse_detail()
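
The hard-coded key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92 in the search URLs above is just the percent-encoded UTF-8 form of the keyword 文化传媒 ("culture & media"); the same urllib.quote call used elsewhere in these scripts reproduces it (Python 2):

# -*- coding: utf-8 -*-
from urllib import quote

keyword = '文化传媒'           # a UTF-8 byte string under Python 2
print quote(keyword)           # -> %E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92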

+ 83 - 0
task_scrapy/scrapy_tv_unhandle.py

@@ -0,0 +1,83 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+"""
17
+Crawl from iQIYI the drama details that the Baidu Baike scraper failed to fetch
18
+"""
19
+
20
+# 爬取搜索页面
21
+def scrapy_url():
22
+    conn = Mysql.createOfflineConn()
23
+
24
+    sql = """
25
+        select max(tv_id) as tv_id from scrapy.iqiyi_dianshiju_detail
26
+    """
27
+
28
+    max_id = Mysql.getOne(sql, conn=conn)
29
+    if max_id is None or max_id[0] == 0:
30
+        max_tv_id = 0
31
+    else:
32
+        max_tv_id = max_id[0]
33
+
34
+    sql = """
35
+        select id, name from tv_lib.yxb_tv_series where id > %s and status = 12 order by id asc
36
+    """
37
+    sql = sql % (max_tv_id,)
38
+    rows = Mysql.getAll(sql, conn=conn)
39
+    driver = webdriver.PhantomJS()
40
+    driver.set_page_load_timeout(10)
41
+
42
+    driver2 = webdriver.PhantomJS()
43
+    driver2.set_page_load_timeout(10)
44
+    for row in rows:
45
+        _id = row['id']
46
+        name = row['name']
47
+        url = 'http://so.iqiyi.com/so/q_' + quote(str(name))
48
+        try:
49
+            driver.get(url)
50
+        except Exception, e:
51
+            driver.execute_script('window.stop()')
52
+        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
53
+        for li in lis:
54
+            try:
55
+                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
56
+                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
57
+                if 'www.iqiyi.com/lib' in href:
58
+                    print href
59
+                    try:
60
+                        driver2.get(href)
61
+                    except:
62
+                        pass
63
+                    content = driver2.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
64
+                    if content is None:
65
+                        content = ''
66
+                    desc = driver2.find_element_by_xpath('//div[@class="mod-body introduce-info"]').get_attribute('textContent')
67
+                    if desc is None:
68
+                        desc = ''
69
+                    
70
+                    content = content + '\n' + '概述:' + desc
71
+                    sql = """
72
+                        insert into scrapy.iqiyi_dianshiju_detail (tv_id, tv_name, title, detail_info_text, url) values (%s, %s, %s, %s, %s)
73
+                    """
74
+                    value = (_id, name, title, content, href)
75
+                    Mysql.insertOne(sql, value=value, conn=conn)
76
+            except Exception, e:
77
+                print e
78
+                continue
79
+    driver.quit()
80
+    driver2.quit()
81
+
82
+if __name__ == '__main__':
83
+    scrapy_url()

+ 146 - 0
task_scrapy/scrapy_website_count.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+Count marketing articles per drama and media site via Baidu site searches
6
+"""
7
+import random
8
+import sys
9
+import time
10
+import collections
11
+
12
+from selenium import webdriver
13
+from urllib import quote
14
+
15
+from fty_util.common import Mysql
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+def scrapy_website():
21
+    conn = Mysql.createOfflineConn()
22
+
23
+    # 将网站url和名称 放入有序字典中
24
+    websites_dict = collections.OrderedDict()
25
+    sql = """
26
+        select name, update_url from odl.basic_websites order by id asc
27
+    """
28
+    websites = Mysql.getAll(sql, conn=conn)
29
+    for website in websites:
30
+        name = website['name']
31
+        update_url = website['update_url']
32
+        websites_dict[update_url] = name
33
+    driver = webdriver.PhantomJS()
34
+    driver.set_page_load_timeout(10)
35
+
36
+    sql = """
37
+        select max(tv_id) as tv_id from scrapy.scrapy_article_count
38
+    """
39
+
40
+    max_tv_id = Mysql.getOne(sql, conn=conn)
41
+    if max_tv_id is None or max_tv_id[0] == 0:
42
+        max_id = 0
43
+    else:
44
+        max_id = max_tv_id[0]
45
+
46
+    start_url = 'http://www.baidu.com/'
47
+
48
+    sql = """
49
+        select tv_id, tv_name from odl.ad_tv_lib where tv_id > %s order by id asc
50
+    """
51
+    sql = sql % (max_id, )
52
+
53
+    tvs = Mysql.getAll(sql, conn=conn)
54
+
55
+    for tv in tvs:
56
+        tv_id = tv['tv_id']
57
+        tv_name = tv['tv_name']
58
+
59
+        for update_url in websites_dict:
60
+            name = websites_dict.get(update_url)
61
+            try:
62
+                driver.get(start_url)
63
+            except Exception, e:
64
+                pass
65
+            # input_box = None
66
+            # submit_button = None
67
+            # try:
68
+            #     input_box = driver.find_element_by_id('kw')
69
+            #     submit_button = driver.find_element_by_id('su')
70
+            # except Exception, e:
71
+            #     driver.refresh()
72
+            # 搜索条件
73
+            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
74
+            try:
75
+                js = 'document.getElementById("kw").value = "' + line + '"'
76
+                driver.execute_script(js)
77
+                js = 'document.getElementById("su").click()'
78
+                driver.execute_script(js)
79
+                # input_box.clear()
80
+                # input_box.send_keys(line)
81
+                # submit_button.click()
82
+            except Exception, e:
83
+                print '点击请求失败'
84
+
85
+            time.sleep(1)
86
+            # 分页块
87
+            page = None
88
+            try:
89
+                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
90
+            except:
91
+                pass
92
+            count = 0
93
+            # No pagination block: there are at most ten hits (or none) on the single result page
94
+            if page is None or len(page) == 0:
95
+                try:
96
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
97
+                    if divs is not None and len(divs) > 0:
98
+                        count = len(divs)
99
+                except Exception, e:
100
+                    count = 0
101
+            # Pagination exists: open the highest-numbered page and count its hits (the count arithmetic is sketched after this file's diff)
102
+            else:
103
+                try:
104
+                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
105
+                    max_page_num = 1
106
+                    max_page_href= ''
107
+                    for page_num in page_nums:
108
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
109
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
110
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
111
+
112
+                        # 如果只是数字
113
+                        if page_num_text.isdigit():
114
+                            page_num_text = int(page_num_text)
115
+                            if page_num_text > max_page_num:
116
+                                max_page_num = page_num_text
117
+                                max_page_href = href
118
+                        # 如果是下一页字符串
119
+                        elif page_num_text == '下一页>':
120
+                            break
121
+
122
+                    try:
123
+                        driver.get(max_page_href)
124
+                    except Exception, e:
125
+                        pass
126
+                    try:
127
+                        divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
128
+                        if divs is not None and len(divs) > 0:
129
+                            count = len(divs)
130
+                    except Exception, e:
131
+                        count = 0
132
+                    
133
+                    count = (max_page_num - 1) * 10 + count
134
+                except:
135
+                    continue
136
+
137
+            if count != 0:
138
+                sql = """
139
+                    insert into scrapy.scrapy_article_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
140
+                """
141
+                value = (tv_id, tv_name, 1, name, line, count)
142
+                Mysql.insertOne(sql, value=value, conn=conn)
143
+    driver.quit()
144
+
145
+if __name__ == '__main__':
146
+    scrapy_website()
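
The hit counting above boils down to: with no pagination block, the count is simply the number of result divs on the single page; with pagination, the highest-numbered page is opened and the total is estimated as (max_page_num - 1) * 10 + hits on that last page, since Baidu shows ten organic results per page. A tiny self-contained check of that arithmetic (the numbers are made up):

def estimate_hits(max_page_num, hits_on_last_page):
    # every page before the last contributes 10 results, the last contributes what it shows
    return (max_page_num - 1) * 10 + hits_on_last_page

print estimate_hits(1, 7)    # single page with 7 results -> 7
print estimate_hits(4, 3)    # pages 1-3 full, 3 hits on page 4 -> 33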

+ 206 - 0
task_scrapy/scrapy_website_count_new.py

@@ -0,0 +1,206 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""
5
+Scrape marketing articles for newly released dramas
6
+"""
7
+import random
8
+import sys
9
+import time
10
+import collections
11
+
12
+from selenium import webdriver
13
+from urllib import quote
14
+
15
+from fty_util.common import Mysql
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+def scrapy_website():
21
+    conn = Mysql.createOfflineConn()
22
+
23
+    # 清空scrapy.scrapy_article表
24
+    sql = """
25
+        truncate table scrapy.scrapy_article
26
+    """
27
+    Mysql.execute(sql, conn=conn)
28
+
29
+    # Put each site's URL and name into an ordered dict
30
+    websites_dict = collections.OrderedDict()
31
+    sql = """
32
+        select name, update_url from odl.basic_websites order by id asc
33
+    """
34
+    websites = Mysql.getAll(sql, conn=conn)
35
+    for website in websites:
36
+        name = website['name']
37
+        update_url = website['update_url']
38
+        websites_dict[update_url] = name
39
+    driver = webdriver.PhantomJS()
40
+    driver.set_page_load_timeout(10)
41
+
42
+    driver2 = webdriver.PhantomJS()
43
+    driver2.set_page_load_timeout(10)
44
+
45
+    start_url = 'http://www.baidu.com/'
46
+
47
+    sql = """
48
+        select id, tv_name from yxb.ad_tv_lib where source = 1 order by id asc
49
+    """
50
+
51
+    tvs = Mysql.getAll(sql, conn=conn)
52
+
53
+    for tv in tvs:
54
+        tv_id = tv['id']
55
+        tv_name = tv['tv_name']
56
+
57
+        for update_url in websites_dict:
58
+            name = websites_dict.get(update_url)
59
+            try:
60
+                driver.get(start_url)
61
+            except Exception, e:
62
+                pass
63
+            # input_box = None
64
+            # submit_button = None
65
+            # try:
66
+            #     input_box = driver.find_element_by_id('kw')
67
+            #     submit_button = driver.find_element_by_id('su')
68
+            # except Exception, e:
69
+            #     driver.refresh()
70
+            # 搜索条件
71
+            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
72
+            print line
73
+            try:
74
+                # input_box.clear()
75
+                # input_box.send_keys(line)
76
+                # submit_button.click()
77
+                js = 'document.getElementById("kw").value = "' + line + '"'
78
+                driver.execute_script(js)
79
+                js = 'document.getElementById("su").click()'
80
+                driver.execute_script(js)
81
+            except Exception, e:
82
+                print '点击请求失败'
83
+
84
+            time.sleep(1)
85
+            # 分页块
86
+            page = None
87
+            try:
88
+                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
89
+            except:
90
+                pass
91
+            count = 0
92
+            # 如果分页不存在,说明记录在十条以内或没有记录
93
+            if page is None or len(page) == 0:
94
+                try:
95
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
96
+                    if divs is not None and len(divs) > 0:
97
+                        count = len(divs)
98
+                        for div in divs:
99
+                            try:
100
+                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
101
+                                div_title = div_title.replace(' ', '').replace('\n', '')
102
+                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
103
+                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
104
+                                import re
105
+                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
106
+                                if m is not None:
107
+                                    div_date = m.group(0)
108
+                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
109
+                                    sql = """
110
+                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
111
+                                    """
112
+                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
113
+                                    Mysql.insertOne(sql, value=value, conn=conn)
114
+                            except:
115
+                                pass
116
+                except Exception, e:
117
+                    print e
118
+                    count = 0
119
+            #  如果分页存在,判断最后一页是不是10
120
+            else:
121
+                
122
+                try:
123
+                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
124
+                except:
125
+                    divs = None
126
+                if divs is not None and len(divs) > 0:
127
+                    # count = len(divs)
128
+                    for div in divs:
129
+                        try:
130
+                            try:
131
+                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
132
+                                div_title = div_title.replace(' ', '').replace('\n', '')
133
+                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
134
+                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
135
+                                import re
136
+                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
137
+                                if m is not None:
138
+                                    div_date = m.group(0)
139
+                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
140
+                                    sql = """
141
+                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
142
+                                    """
143
+                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
144
+                                    Mysql.insertOne(sql, value=value, conn=conn)
145
+                            except:
146
+                                pass
147
+                        except Exception, e:
148
+                            pass
149
+
150
+                try:
151
+                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
152
+                    max_page_num = 1
153
+                    max_page_href= ''
154
+                    for page_num in page_nums:
155
+                        href = page_num.find_element_by_xpath('.').get_attribute('href')
156
+                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
157
+                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
158
+
159
+                        # 如果只是数字
160
+                        if page_num_text.isdigit():
161
+                            page_num_text = int(page_num_text)
162
+                            if page_num_text > max_page_num:
163
+                                max_page_num = page_num_text
164
+                                max_page_href = href
165
+
166
+                                try:
167
+                                    driver2.get(max_page_href)
168
+                                except Exception, e:
169
+                                    print e
170
+                                    pass
171
+
172
+                                divs = driver2.find_elements_by_xpath('//div[@id="content_left"]/div')
173
+                                if divs is not None and len(divs) > 0:
174
+                                    # count = len(divs)
175
+                                    for div in divs:
176
+                                        try:
177
+                                            div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
178
+                                            div_title = div_title.replace(' ', '').replace('\n', '')
179
+                                            div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
180
+                                            div_content = div.find_element_by_xpath('.').get_attribute('textContent')
181
+                                            import re
182
+                                            m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
183
+                                            if m is not None:
184
+                                                div_date = m.group(0)
185
+                                                div_date = div_date.replace('年', '-').replace('月', '-').replace('日','')
186
+                                                sql = """
187
+                                                    insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
188
+                                                """
189
+                                                value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
190
+                                                Mysql.insertOne(sql, value=value, conn=conn)
191
+                                        except:
192
+                                            pass
193
+
194
+                        # 如果是下一页字符串
195
+                        elif page_num_text == '下一页>':
196
+                            break
197
+                except Exception, e:
198
+                    print e
199
+                    continue
200
+
201
+    driver.quit()
202
+    driver2.quit()
203
+    Mysql.close(conn=conn)
204
+
205
+if __name__ == '__main__':
206
+    scrapy_website()
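
The date handling repeated in the three branches above is one regex plus three replaces; a standalone check (Python 2, the snippet text is hypothetical):

# -*- coding: utf-8 -*-
import re

div_content = u'某剧开播发布会 2017年3月15日 - 来源页面摘要'   # hypothetical Baidu result snippet
m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
if m is not None:
    div_date = m.group(0).replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
    print div_date    # -> 2017-3-15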

+ 222 - 0
task_scrapy/scrapy_youku.py

@@ -0,0 +1,222 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+import random
5
+import sys
6
+import time
7
+
8
+from selenium import webdriver
9
+from urllib import quote
10
+
11
+from fty_util.common import Mysql
12
+
13
+reload(sys)
14
+sys.setdefaultencoding('utf8')
15
+
16
+def scrapy_url():
17
+    conn = Mysql.createOfflineConn()
18
+
19
+    sql = """
20
+        select id, tv_name from scrapy.wangju_url where url_youku is null order by id asc
21
+    """
22
+
23
+    rows = Mysql.getAll(sql, conn=conn)
24
+
25
+    driver = webdriver.Firefox()
26
+    driver.set_page_load_timeout(10)
27
+
28
+    for row in rows:
29
+        _id = row['id']
30
+        tv_name = row['tv_name']
31
+
32
+        url = 'http://www.soku.com/search_video/q_' + quote(str(tv_name))
33
+
34
+        need_blank = True
35
+        try:
36
+            driver.get(url)
37
+        except Exception, e:
38
+            driver.execute_script('window.stop()')
39
+
40
+        divs = driver.find_elements_by_xpath('//div[@class="sk-express"]/div/div')
41
+        for div in divs:
42
+            try:
43
+                title = div.find_element_by_xpath('./div/div[2]/div[1]/div/h2/a[1]').get_attribute('textContent')
44
+                title = title.replace(' ', '').replace('\n', '')
45
+                href = div.find_element_by_xpath('//div[@class="info_cont"]/p/a').get_attribute('href')
46
+
47
+                jishu = None
48
+                try:
49
+                    jishu = div.find_elements_by_xpath('//div[@class="s_items all site14 "]/ul/li')
50
+                except Exception, e:
51
+                    pass
52
+                if jishu is None or len(jishu) == 0:
53
+                    try:
54
+                        # jishu = div.find_elements_by_xpath('//div[@class="s_items site14 "]/ul/li')
55
+                        jishu = div.find_elements_by_xpath('//div[@class="s_detail"]/div[4]/ul/li')
56
+                    except Exception, e:
57
+                        pass
58
+                if tv_name in title and jishu is not None and len(jishu) > 0:
59
+                    sql = """
60
+                        update scrapy.wangju_url set url_youku = '%s' where id = %s
61
+                    """
62
+                    sql = sql % (href, _id)
63
+                    Mysql.execute(sql, conn=conn)
64
+                    need_blank = False
65
+            except Exception, e:
66
+                pass
67
+        if need_blank:
68
+            sql = """
69
+                update scrapy.wangju_url set url_youku = '%s' where id = %s
70
+            """
71
+            sql = sql % ('', _id)
72
+            Mysql.execute(sql, conn=conn)
73
+    driver.quit()
74
+
75
+def scrapy_data():
76
+    conn = Mysql.createOfflineConn()
77
+
78
+    sql = """
79
+        select id, tv_name, url_youku from scrapy.wangju_url where url_youku is not null and url_youku != '' order by id asc
80
+    """
81
+
82
+    rows = Mysql.getAll(sql, conn=conn)
83
+
84
+    driver = webdriver.Firefox()
85
+    driver.set_page_load_timeout(10)
86
+
87
+    for row in rows:
88
+        _id = row['id']
89
+        tv_name = row['tv_name']
90
+        url_youku = row['url_youku']
91
+
92
+        need_blank = True
93
+        try:
94
+            driver.get(url_youku)
95
+        except Exception, e:
96
+            driver.execute_script('window.stop()')
97
+
98
+        try:
99
+            content = driver.find_element_by_xpath('//div[@class="detailinfo"]').get_attribute('textContent')
100
+        except Exception, e:
101
+            try:
102
+                content = driver.find_element_by_xpath('//div[@class="p-base"]').get_attribute('textContent')
103
+            except Exception, e:
104
+                continue
105
+
106
+        sql = """
107
+            insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
108
+        """
109
+        value = (_id, tv_name, url_youku, '', content, 'youku')
110
+        Mysql.insertOne(sql, value=value, conn=conn)
111
+
112
+    driver.quit()
113
+
114
+def parse_content():
115
+    conn = Mysql.createOfflineConn()
116
+
117
+    sql = """
118
+        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'youku' order by id asc
119
+    """
120
+    rows = Mysql.getAll(sql, conn=conn)
121
+
122
+    for row in rows:
123
+        _id = row['id']
124
+        tv_name = row['tv_name']
125
+        url = row['url']
126
+        content = row['content']
127
+
128
+        import re
129
+        m = re.search(ur'评分: ([0-9]+[.]?)+', content)
130
+        score = '0'
131
+        if m is not None:
132
+            score = m.group(0)
133
+
134
+        play = '0'
135
+        m = re.search(ur'播放数:([0-9]+[,]?)+', content)
136
+        if m is not None:
137
+            play = m.group(0)
138
+
139
+        sql = """
140
+            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'youku'
141
+        """
142
+        sql = sql % (score, play, url)
143
+        Mysql.execute(sql, conn=conn)
144
+
145
+# def parse_detail_content():
146
+#     conn = Mysql.createOfflineConn()
147
+
148
+#     sql = """
149
+#         select id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
150
+#     """
151
+#     rows = Mysql.getAll(sql, conn=conn)
152
+
153
+#     for row in rows:
154
+#         _id = row['id']
155
+#         detail_info_text = row['detail_info_text']
156
+#         # sql = """
157
+#         #     update scrapy.iqiyi_dianshiju_detail aa inner join scrapy.iqiyi_dianshiju_detail_copy bb on aa.id = bb.id set aa.detail_info_text = bb.detail_info_text
158
+#         # """
159
+#         # Mysql.update(sql, conn=conn)
160
+#         if detail_info_text is not None:
161
+#             # content = ''
162
+#             # (line0, line1) = tuple(detail_info_text.split(u'评分'))
163
+#             # line0 = line0.replace('\n', '')
164
+#             # content = line0 + '\n' + line1
165
+#             for line in detail_info_text.split('\n'):
166
+                
167
+#             sql = """
168
+#                 update scrapy.iqiyi_dianshiju_detail set detail_info_text = %s where id = %s
169
+#             """
170
+#             value = (content, _id)
171
+#             Mysql.update(sql, param=value, conn=conn)
172
+#     Mysql.close(conn=conn)
173
+
174
+def update_tv_lib():
175
+    conn = Mysql.createOfflineConn()
176
+    sql = """
177
+        select tv_id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
178
+    """
179
+    rows = Mysql.getAll(sql, conn=conn)
180
+
181
+    for row in rows:
182
+        tv_id = row['tv_id']
183
+        detail_info_text = row['detail_info_text']
184
+
185
+        lines = []
186
+        for line in detail_info_text.split('\n'):
187
+            lines.append(line)
188
+        director = ''
189
+        actors = ''
190
+        product_area = ''
191
+        premiere_time = ''
192
+        _type = ''
193
+        for i in range(len(lines)):
194
+            line = lines[i]
195
+            if u'导演' in line:
196
+                director = line.replace(u'导演:', '')
197
+            if u'主演' in line:
198
+                actors = line.replace(u'主演:', '')
199
+            if u'地区' in line:
200
+                product_area = line.replace(u'地区:', '')
201
+            if u'首播时间' in line:
202
+                premiere_time = line.replace(u'首播时间:', '')
203
+            if u'看点' in line:
204
+                # print line[i+1]
205
+                print lines[i+1]
206
+                _type = lines[i+1]
207
+            
208
+            # if u'更新时间' in line:
209
+            #     gengxin = lines[i+1]
210
+    
211
+        sql = """
212
+            update tv_lib.yxb_tv_series set level = %s, type = %s, script_form = %s, director = %s, product_area = %s, actors = %s, premiere_time = %s where id = %s
213
+        """
214
+        value = (5, _type, 1, director, product_area, actors, premiere_time, tv_id)
215
+        Mysql.update(sql, param=value, conn=conn)
216
+    Mysql.close(conn=conn)
217
+if __name__ == '__main__':
218
+    # scrapy_data()
219
+    # scrapy_url()
220
+    # parse_content()
221
+    # parse_detail_content()
222
+    update_tv_lib()
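
update_tv_lib above walks the crawled detail text line by line, strips the 导演/主演/地区/首播时间 labels, and takes the 看点 (highlights) value from the following line. A compact sketch of the same parsing on a hypothetical detail_info_text (Python 2):

# -*- coding: utf-8 -*-
detail_info_text = u'导演:张三\n主演:李四 王五\n地区:内地\n首播时间:2016\n看点\n都市 爱情'

lines = detail_info_text.split('\n')
fields = {'director': '', 'actors': '', 'product_area': '', 'premiere_time': '', 'type': ''}
for i, line in enumerate(lines):
    if u'导演' in line:
        fields['director'] = line.replace(u'导演:', '')
    if u'主演' in line:
        fields['actors'] = line.replace(u'主演:', '')
    if u'地区' in line:
        fields['product_area'] = line.replace(u'地区:', '')
    if u'首播时间' in line:
        fields['premiere_time'] = line.replace(u'首播时间:', '')
    if u'看点' in line and i + 1 < len(lines):
        fields['type'] = lines[i + 1]          # the value sits on the next line
print fields['director'], fields['type']       # -> 张三 都市 爱情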

+ 43 - 0
task_tmp/tmp_data_month.py

@@ -0,0 +1,43 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""月份提取
5
+
6
+将odl.ad_television的时间按月份进行统计
7
+"""
8
+
9
+import datetime
10
+import sys
11
+
12
+from fty_util.common import Mysql
13
+
14
+reload(sys)
15
+sys.setdefaultencoding('utf8')
16
+
17
+conn = Mysql.createOfflineConn()
18
+
19
+sql = """
20
+    truncate table tmp.ad_television_month
21
+"""
22
+Mysql.execute(sql, conn=conn)
23
+
24
+# 月份提取
25
+sql = """
26
+    select date_format(t.tv_date, '%Y-%m') as month, year(t.tv_date) as year from (
27
+        select distinct tv_date from odl.ad_television group by tv_date
28
+    ) t
29
+    group by month
30
+"""
31
+rows = Mysql.getAll(sql, conn=conn)
32
+sql_insert = """
33
+    insert into tmp.ad_television_month (year, month) values (%s, %s)
34
+"""
35
+data_list = []
36
+for row in rows:
37
+    month = row['month']
38
+    year = row['year']
39
+    month_value = datetime.datetime.strptime(month, '%Y-%m')
40
+    data_list.append((year, month_value))
41
+if len(data_list) > 0:
42
+    Mysql.insertMany(sql_insert, data_list, conn)
43
+Mysql.close(conn)
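
For reference, the month value stored above is pinned to the first day of each month: date_format(tv_date, '%Y-%m') yields strings like '2016-07', and strptime turns them into a datetime on day 1.

import datetime

month = '2016-07'                                          # what date_format(tv_date, '%Y-%m') returns
month_value = datetime.datetime.strptime(month, '%Y-%m')
print month_value                                           # -> 2016-07-01 00:00:00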

+ 116 - 0
task_tmp/tmp_tv_avg_ratings_fatt0.py

@@ -0,0 +1,116 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""按月统计电视剧的收视情况
5
+
6
+province_input: odl.ad_television
7
+province_output: tmp.month_channel_stat
8
+
9
+area_input: odl.area_ad_television
10
+area_output: tmp.area_month_channel_stat
11
+"""
12
+import sys
13
+
14
+from fty_util.common import Mysql, Util
15
+
16
+reload(sys)
17
+sys.setdefaultencoding('utf8')
18
+
19
+class channel_history_month_stat():
20
+    
21
+    def province(self):
22
+        # Empty the monthly channel statistics table
23
+        conn = Mysql.createOfflineConn()
24
+        sql = """
25
+            truncate table tmp.month_channel_stat
26
+        """
27
+        Mysql.execute(sql, conn=conn)
28
+        sql = """
29
+            select distinct month from tmp.ad_television_month order by month asc
30
+        """
31
+        rows = Mysql.getAll(sql, conn=conn)
32
+        for row in rows:
33
+            month = row['month']
34
+            print month
35
+            month_max_date = Util.get_max_date_of_month(month)
36
+            
37
+            # 统计电视台当月电视剧收视数据
38
+            sql = """
39
+                select adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id,
40
+                min(adt.tv_date) as tv_date, sum(adt.audience_rating) as sum_value, count(adt.id) as count_value from odl.ad_television adt 
41
+                where adt.tv_date >= '%s' and adt.tv_date <= '%s' and adt.audience_rating > 0
42
+                group by adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id
43
+            """
44
+            sql = sql % (month, month_max_date)
45
+            rows_all = Mysql.getAll(sql, conn=conn)
46
+            data_list = []
47
+            sql_insert = """
48
+                insert into tmp.month_channel_stat (channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month)
49
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
50
+            """
51
+            year = month.year
52
+            for row_all in rows_all:
53
+                channel = row_all['channel']
54
+                theater_attribute = row_all['theater_attribute']
55
+                tv_name = row_all['tv_name']
56
+                tv_id = row_all['tv_id']
57
+                tv_date = row_all['tv_date']
58
+                sum_value = row_all['sum_value']
59
+                count_value = row_all['count_value']
60
+                data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month))
61
+            Mysql.insertMany(sql_insert, data_list, conn)
62
+        Mysql.close(conn)
63
+
64
+    def area(self):
65
+        # Empty the area monthly channel statistics table
67
+        conn = Mysql.createOfflineConn()
68
+        sql = """
69
+            truncate table tmp.area_month_channel_stat
69
+        """
70
+        Mysql.execute(sql, conn=conn)
71
+        sql = """
72
+            select distinct month from tmp.ad_television_month order by month asc
73
+        """
74
+        rows = Mysql.getAll(sql, conn=conn)
75
+        for row in rows:
76
+            month = row['month']
77
+            print month
78
+            month_max_date = Util.get_max_date_of_month(month)
79
+            
80
+            # 统计电视台当月电视剧收视数据
81
+            sql = """
82
+                select adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id,
83
+                min(adt.tv_date) as tv_date, sum(adt.audience_rating) as sum_value, count(adt.id) as count_value from odl.area_ad_television adt 
84
+                where adt.tv_date >= '%s' and adt.tv_date <= '%s' and adt.audience_rating > 0
85
+                group by adt.channel, adt.theater_attribute, adt.tv_name, adt.tv_id
86
+            """
87
+            sql = sql % (month, month_max_date)
88
+            rows_all = Mysql.getAll(sql, conn=conn)
89
+            data_list = []
90
+            sql_insert = """
91
+                insert into tmp.area_month_channel_stat (channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month)
92
+                values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
93
+            """
94
+            year = month.year
95
+            for row_all in rows_all:
96
+                channel = row_all['channel']
97
+                theater_attribute = row_all['theater_attribute']
98
+                tv_name = row_all['tv_name']
99
+                tv_id = row_all['tv_id']
100
+                tv_date = row_all['tv_date']
101
+                sum_value = row_all['sum_value']
102
+                count_value = row_all['count_value']
103
+                data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value, year, month))
104
+            Mysql.insertMany(sql_insert, data_list, conn=conn)
105
+        Mysql.close(conn)
106
+
107
+if __name__ == '__main__':
108
+    if len(sys.argv) != 2:
109
+        print '没有输入参数,退出'
110
+        sys.exit(0)
111
+    print 'method name is ' + sys.argv[1]
112
+    obj = channel_history_month_stat()
113
+    try:
114
+        getattr(obj, sys.argv[1])()
115
+    except Exception, e:
116
+        print e
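
Util.get_max_date_of_month above turns the month's first day into that month's last day so the tv_date filter covers the whole month. Its real implementation lives in fty_util/common.py; one equivalent way to compute it, shown purely for illustration (this is not the repo's code):

import calendar
import datetime

def last_day_of_month(first_day):
    # number of days in that month -> last calendar day
    days = calendar.monthrange(first_day.year, first_day.month)[1]
    return datetime.date(first_day.year, first_day.month, days)

print last_day_of_month(datetime.date(2016, 2, 1))   # -> 2016-02-29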

+ 146 - 0
task_tmp/tmp_tv_avg_ratings_stat.py

@@ -0,0 +1,146 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""计算每个电视剧的收视率
5
+
6
+province_input: tmp.month_channel_stat
7
+province_output: tmp.tv_avg_ratings
8
+
9
+area_input: tmp.area_month_channel_stat
10
+area_output: tmp.area_tv_avg_ratings
11
+"""
12
+import datetime
13
+import sys
14
+
15
+from fty_util.common import Mysql, Util
16
+
17
+class tv_avg_ratings_clac():
18
+    
19
+    def province(self):
20
+        conn = Mysql.createOfflineConn()
21
+        sql = """
22
+            truncate table tmp.tv_avg_ratings
23
+        """
24
+        Mysql.execute(sql, conn=conn)
25
+        # Read the monthly aggregates from tmp.month_channel_stat
26
+        sql = """
27
+            select channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value from tmp.month_channel_stat order by tv_date asc
28
+        """
29
+        rows = Mysql.getAll(sql, conn=conn)
30
+        channel_dict = {}
31
+        # key中有日期
32
+        # variant of the dict whose key also carries the run start date
33
+        for row in rows:
34
+            channel = row['channel']
35
+            theater_attribute = row['theater_attribute']
36
+            tv_name = row['tv_name']
37
+            tv_id = row['tv_id']
38
+            tv_date = row['tv_date']
39
+            sum_value = row['sum_value']
40
+            count_value = row['count_value']
41
+            key = (channel, theater_attribute, tv_name, tv_id)
42
+            # 如果两部电视剧在同一台同一剧场播放两次,则保存两条记录
43
+            # 如果字典中不存在数据,则添加
44
+            if channel_dict.get(key) is None:
45
+                channel_dict[key] = (str(tv_date), sum_value, count_value)
46
+                channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
47
+            else:
48
+                # Otherwise compare the dates
50
+                value = channel_dict.get(key)
51
+                # date of the current row
52
+                date1 = datetime.datetime.strptime(str(tv_date), '%Y-%m-%d')
53
+                # date already stored for this key
54
+                date2 = datetime.datetime.strptime(str(value[0]), '%Y-%m-%d')
55
+                sub_value = (date2 - date1).days
56
+                # A gap of at most 25 days either way means the same broadcast run (see the merge sketch after this file's diff)
56
+                if sub_value <= 25 and sub_value >= -25:
57
+                    value2 = channel_date_dict.get((channel, theater_attribute, tv_name, tv_id, str(value[0])))
58
+                    channel_dict[key] = (str(value[0]), sum_value, count_value)
59
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(value[0]))] = tuple(x + y for x, y in zip((sum_value, count_value), tuple(value2)))
60
+                # A larger gap means the drama was re-broadcast, so a new run is started
61
+                else:
62
+                    channel_dict[key] = (str(tv_date), sum_value, count_value)
63
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value) 
64
+
65
+        sql = """
66
+            insert into tmp.tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value) values (%s, %s, %s, %s, %s, %s)
67
+        """
68
+        data_list = []
69
+        for key in channel_date_dict.keys():
70
+            (channel, theater_attribute, tv_name, tv_id, tv_date) = key
71
+            (sum_value, count_value) = channel_date_dict.get(key)
72
+            # value = (channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value)
73
+            data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value))
74
+            # Mysql.insertOne(sql, value=value, conn=conn)
75
+        # Mysql.insertMany(sql, data_list, conn)
76
+        Util.insert_by_chunk(sql, data_list, conn)
77
+        Mysql.close(conn)
78
+
79
+    def area(self):
80
+        conn = Mysql.createOfflineConn()
81
+        sql = """
82
+            truncate table tmp.area_tv_avg_ratings
83
+        """
84
+        Mysql.execute(sql, conn=conn)
85
+        # 从tmp.month_channel_stat表中取出每月的统计数据
86
+        sql = """
87
+            select channel, theater_attribute, tv_name, tv_id, tv_date, sum_value, count_value from tmp.area_month_channel_stat order by tv_date asc
88
+        """
89
+        rows = Mysql.getAll(sql, conn=conn)
90
+        channel_dict = {}
91
+        # key中有日期
92
+        channel_date_dict = {}
93
+        for row in rows:
94
+            channel = row['channel']
95
+            theater_attribute = row['theater_attribute']
96
+            tv_name = row['tv_name']
97
+            tv_id = row['tv_id']
98
+            tv_date = row['tv_date']
99
+            sum_value = row['sum_value']
100
+            count_value = row['count_value']
101
+            key = (channel, theater_attribute, tv_name, tv_id)
102
+            # 如果两部电视剧在同一台同一剧场播放两次,则保存两条记录
103
+            # 如果字典中不存在数据,则添加
104
+            if channel_dict.get(key) is None:
105
+                channel_dict[key] = (str(tv_date), sum_value, count_value)
106
+                channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
107
+            else:
108
+                # 否则进行时间判断
109
+                value = channel_dict.get(key)
110
+                # 当前的日期
111
+                date1 = datetime.datetime.strptime(str(tv_date), '%Y-%m-%d')
112
+                # 保存的日期
113
+                date2 = datetime.datetime.strptime(str(value[0]), '%Y-%m-%d')
114
+                sub_value = (date2 - date1).days
115
+                # 如果相减天数在这之间,表示同一部电视剧
116
+                if sub_value <= 25 and sub_value >= -25:
117
+                    value2 = channel_date_dict.get((channel, theater_attribute, tv_name, tv_id, str(value[0])))
118
+                    channel_dict[key] = (str(value[0]), sum_value, count_value)
119
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(value[0]))] = tuple(x + y for x, y in zip((sum_value, count_value), tuple(value2)))
120
+                # 日期超出范围,则表示同一部电视剧播出了多次
121
+                else:
122
+                    channel_dict[key] = (str(tv_date), sum_value, count_value)
123
+                    channel_date_dict[(channel, theater_attribute, tv_name, tv_id, str(tv_date))] = (sum_value, count_value)
124
+                
125
+        sql = """
126
+            insert into tmp.area_tv_avg_ratings (channel, theater_attribute, tv_name, tv_id, tv_date, value) values (%s, %s, %s, %s, %s, %s)
127
+        """
128
+        data_list = []
129
+        for key in channel_date_dict.keys():
130
+            (channel, theater_attribute, tv_name, tv_id, tv_date) = key
131
+            (sum_value, count_value) = channel_date_dict.get(key)
132
+            data_list.append((channel, theater_attribute, tv_name, tv_id, tv_date, float(sum_value) / count_value))
133
+        
134
+        Util.insert_by_chunk(sql, data_list, conn)
135
+        Mysql.close(conn)
136
+
137
+if __name__ == '__main__':
138
+    if len(sys.argv) != 2:
139
+        print '没有输入参数,退出'
140
+        sys.exit(0)
141
+    print 'method name is ' + sys.argv[1]
142
+    obj = tv_avg_ratings_clac()
143
+    try:
144
+        getattr(obj, sys.argv[1])()
145
+    except Exception, e:
146
+        print e
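
The dictionary juggling in province()/area() above implements one rule: monthly rows for the same (channel, theater_attribute, tv_name, tv_id) whose dates lie within 25 days of each other belong to the same broadcast run, so their sum_value/count_value pairs are added before the average is taken; a larger gap starts a new run (a re-broadcast). A reduced sketch of that merge on two hypothetical monthly rows:

import datetime

# two monthly aggregates for the same drama on the same channel (made-up numbers)
rows = [('2016-05-28', 12.4, 20),    # (first tv_date in the month, sum of ratings, number of airings)
        ('2016-06-01', 6.0, 10)]

runs = {}           # run start date -> (sum of ratings, number of airings)
last_start = None
for tv_date, sum_value, count_value in rows:
    d = datetime.datetime.strptime(tv_date, '%Y-%m-%d')
    if last_start is not None and abs((d - datetime.datetime.strptime(last_start, '%Y-%m-%d')).days) <= 25:
        s, c = runs[last_start]
        runs[last_start] = (s + sum_value, c + count_value)   # same run: accumulate
    else:
        last_start = tv_date
        runs[tv_date] = (sum_value, count_value)              # new run
for start, (s, c) in runs.items():
    print start, float(s) / c    # -> 2016-05-28 0.613... (one merged run, average rating)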

+ 93 - 0
task_tmp/tmp_tv_category_stat.py

@@ -0,0 +1,93 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台对应电视剧及类型关系数据
5
+
6
+电视台播放的电视剧根据类型分别保存记录
7
+province_input: odl.ad_television odl.ad_tv_lib
8
+province_output: tmp.tv_category_stat
9
+
10
+area_input: odl.area_ad_television odl.ad_tv_lib
11
+area_output: tmp.area_tv_category_stat
12
+"""
13
+import sys
14
+
15
+from fty_util.common import Mysql, Util
16
+
17
+reload(sys)
18
+sys.setdefaultencoding('utf8')
19
+
20
+class tv_category_stat():
21
+    
22
+    def province(self):
23
+        conn = Mysql.createOfflineConn()
24
+        sql = """
25
+            truncate table tmp.tv_category_stat
26
+        """
27
+        Mysql.execute(sql, conn=conn)
28
+        # 电视台播放电视剧分类数据
29
+        station_dict = {}
30
+        sql = """
31
+            select oat.tv_id, oat.channel, oat.theater_attribute, oatl.categories from odl.ad_television oat
32
+            left join odl.ad_tv_lib oatl on oat.tv_id = oatl.tv_id
33
+            where oat.tv_id is not null and oat.theater_attribute != '' and oat.theater_attribute is not null
34
+            group by tv_id, channel, theater_attribute
35
+        """
36
+        rows = Mysql.getAll(sql, conn=conn)
37
+        sql = """
38
+            insert into tmp.tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)
39
+        """
40
+        for row in rows:
41
+            tv_id = row['tv_id']
42
+            channel = row['channel']
43
+            theater_attribute = row['theater_attribute']
44
+            categories = row['categories']
45
+            if categories is not None and len(categories) > 0:
46
+                cate_list = categories.split(' ')
47
+                data_list = []
48
+                for cat in cate_list:
49
+                    data_list.append((tv_id, cat, channel, theater_attribute))
50
+                Mysql.insertMany(sql, data_list, conn)
51
+        Mysql.close(conn)
52
+
53
+    def area(self):
54
+        conn = Mysql.createOfflineConn()
55
+        sql = """
56
+            truncate table tmp.area_tv_category_stat
57
+        """
58
+        Mysql.execute(sql, conn=conn)
59
+        # 电视台播放电视剧分类数据
60
+        station_dict = {}
61
+        sql = """
62
+            select oat.tv_id, oat.channel, oat.theater_attribute, oatl.categories from odl.area_ad_television oat
63
+            left join odl.ad_tv_lib oatl on oat.tv_id = oatl.tv_id
64
+            where oat.tv_id is not null
65
+            group by tv_id, channel, theater_attribute
66
+        """
67
+        rows = Mysql.getAll(sql, conn=conn)
68
+        sql = """
69
+            insert into tmp.area_tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)
70
+        """
71
+        for row in rows:
72
+            tv_id = row['tv_id']
73
+            channel = row['channel']
74
+            theater_attribute = row['theater_attribute']
75
+            categories = row['categories']
76
+            if categories is not None and len(categories) > 0:
77
+                cate_list = categories.split(' ')
78
+                data_list = []
79
+                for cat in cate_list:
80
+                    data_list.append((tv_id, cat, channel, theater_attribute))
81
+                Mysql.insertMany(sql, data_list, conn)
82
+        Mysql.close(conn)
83
+
84
+if __name__ == '__main__':
85
+    if len(sys.argv) != 2:
86
+        print 'no argument supplied, exiting'
87
+        sys.exit(0)
88
+    print 'method name is ' + sys.argv[1]
89
+    obj = tv_category_stat()
90
+    try:
91
+        getattr(obj, sys.argv[1])()
92
+    except Exception, e:
93
+        print e

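Both methods in tmp_tv_category_stat.py split the space-delimited categories string into one row per category and call Mysql.insertMany once per source row. A sketch of collecting the whole fan-out first and issuing a single batched insert (row keys follow the SELECT above; explode_categories is a hypothetical helper, not part of fty_util):

def explode_categories(rows):
    # one output tuple per (drama, category) pair; empty categories are skipped
    data_list = []
    for row in rows:
        categories = row['categories']
        if not categories:
            continue
        for cat in categories.split(' '):
            if cat:
                data_list.append((row['tv_id'], cat, row['channel'], row['theater_attribute']))
    return data_list

# usage, with conn and rows as in province() above:
# sql = "insert into tmp.tv_category_stat (tv_id, category, channel, theater_attribute) values (%s, %s, %s, %s)"
# Mysql.insertMany(sql, explode_categories(rows), conn)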
+ 85 - 0
task_tmp/tmp_year_channel_avg_ratings_stat_by_tv.py

@@ -0,0 +1,85 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+"""电视台近一年平均收视率
5
+
6
+计算方法:取出电视台的一年收视数据,求平均数
7
+avg(audience_rating) group by channel, theater_attribute
8
+
9
+province_input: odl.ad_television
10
+province_output: tmp.channel_avg_ratings
11
+
12
+area_input: odl.area_ad_television
13
+area_output: tmp.area_channel_avg_ratings
14
+"""
15
+import sys
16
+
17
+from fty_util.common import Mysql, Util
18
+
19
+reload(sys)
20
+sys.setdefaultencoding('utf8')
21
+
22
+class channel_avg_ratings():
23
+    
24
+    # national (CCTV) and satellite channels
25
+    def province(self):
26
+        conn = Mysql.createOfflineConn()
27
+        first_day = Util.get_first_date_of_yesterday()
28
+        one_year_ago = Util.get_max_date_of_one_year_ago(first_day)
29
+        # average rating of each station over the most recent year
30
+        station_dict = {}
31
+        sql = """
32
+            select channel, theater_attribute, avg(audience_rating) as avg_rate from odl.ad_television
33
+            where tv_date > '%s' and tv_date <= '%s'
34
+            group by channel, theater_attribute
35
+        """
36
+        sql = sql % (one_year_ago, first_day)
37
+        rows = Mysql.getAll(sql, conn=conn)
38
+        data_list = []
39
+        sql = """
40
+            insert into tmp.channel_avg_ratings (channel, theater_attribute, value) values (%s, %s, %s)
41
+        """
42
+        for row in rows:
43
+            channel = row['channel']
44
+            theater_attribute = row['theater_attribute']
45
+            avg_rate = row['avg_rate']
46
+            data_list.append((channel, theater_attribute, avg_rate))
47
+        Mysql.insertMany(sql, data_list, conn)
48
+        Mysql.close(conn)
49
+
50
+    # provincial terrestrial channels
51
+    def area(self):
52
+        conn = Mysql.createOfflineConn()
53
+        first_day = Util.get_first_date_of_yesterday()
54
+        one_year_ago = Util.get_max_date_of_one_year_ago(first_day)
55
+        # average rating of each station over the most recent year
56
+        station_dict = {}
57
+        sql = """
58
+            select channel, theater_attribute, avg(audience_rating) as avg_rate from odl.area_ad_television
59
+            where tv_date > '%s' and tv_date <= '%s'
60
+            group by channel, theater_attribute
61
+        """
62
+        sql = sql % (one_year_ago, first_day)
63
+        rows = Mysql.getAll(sql, conn=conn)
64
+        data_list = []
65
+        sql = """
66
+            insert into tmp.area_channel_avg_ratings (channel, theater_attribute, value) values (%s, %s, %s)
67
+        """
68
+        for row in rows:
69
+            channel = row['channel']
70
+            theater_attribute = row['theater_attribute']
71
+            avg_rate = row['avg_rate']
72
+            data_list.append((channel, theater_attribute, avg_rate))
73
+        Mysql.insertMany(sql, data_list, conn)
74
+        Mysql.close(conn)
75
+
76
+if __name__ == '__main__':
77
+    if len(sys.argv) != 2:
78
+        print 'no argument supplied, exiting'
79
+        sys.exit(0)
80
+    print 'method name is ' + sys.argv[1]
81
+    obj = channel_avg_ratings()
82
+    try:
83
+        getattr(obj, sys.argv[1])()
84
+    except Exception, e:
85
+        print e

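The window in tmp_year_channel_avg_ratings_stat_by_tv.py is bounded by Util.get_first_date_of_yesterday() and Util.get_max_date_of_one_year_ago() from fty_util.common, whose implementations are not part of this listing. A rough stand-in, under the assumption that they return "yesterday" and the date roughly one year before it as YYYY-MM-DD strings:

import datetime

def window_bounds(today=None):
    # assumption: the bounds are plain date strings usable in the tv_date filter above
    today = today or datetime.date.today()
    first_day = today - datetime.timedelta(days=1)            # yesterday
    one_year_ago = first_day - datetime.timedelta(days=365)   # roughly one year earlier
    return one_year_ago.strftime('%Y-%m-%d'), first_day.strftime('%Y-%m-%d')

# usage:
# one_year_ago, first_day = window_bounds()
# sql = sql % (one_year_ago, first_day)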
+ 92 - 0
task_yxb/ad_tv_lib_clean.py

@@ -0,0 +1,92 @@
1
+#!/usr/bin/env python
2
+#coding=utf-8
3
+
4
+spec_char = ['栏目剧', '题材', '电视剧', '连续剧', '剧情', '。', ';', ';', ',', ',', '、', '/', ':', ':', '\\', '[1]', '[2]', '[3]', '[4]', '[5]', '[6]', '[7]', '[8]', '[9]', '[10]']
5
+
6
+import sys
7
+from fty_util.common import Mysql, Util
8
+
9
+reload(sys)
10
+sys.setdefaultencoding('utf8')
11
+
12
+conn = Mysql.createOfflineConn()
13
+# fetch rows whose categories field is empty or null
14
+sql = """
15
+    select id, director, scriptwritter, main_actors, types, areas, plat_form, pub_comp, online_form, production from yxb.ad_tv_lib where categories = '' or categories is null order by id asc
16
+"""
17
+rows = Mysql.getAll(sql, conn=conn)
18
+for row in rows:
19
+    _id = row['id']
20
+    director = row['director']
21
+    scriptwritter = row['scriptwritter']
22
+    main_actors = row['main_actors']
23
+    types = row['types']
24
+    areas = row['areas']
25
+    plat_form = row['plat_form']
26
+    pub_comp = row['pub_comp']
27
+    online_form = row['online_form']
28
+    production = row['production']
29
+
30
+    if director is not None and len(director) > 0:
31
+        for char in spec_char:
32
+            director = director.replace(char, " ")
33
+
34
+    if scriptwritter is not None and len(scriptwritter) > 0:
35
+        for char in spec_char:
36
+            scriptwritter = scriptwritter.replace(char, " ")
37
+
38
+    if main_actors is not None and len(main_actors) > 0:
39
+        for char in spec_char:
40
+            main_actors = main_actors.replace(char, " ")
41
+
42
+    if areas is not None and len(areas) > 0:
43
+        for char in spec_char:
44
+            areas = areas.replace(char, " ")
45
+
46
+    if plat_form is not None and len(plat_form) > 0:
47
+        for char in spec_char:
48
+            plat_form = plat_form.replace(char, " ")
49
+        
50
+    if pub_comp is not None and len(pub_comp) > 0:
51
+        for char in spec_char:
52
+            pub_comp = pub_comp.replace(char, " ")
53
+
54
+    if online_form is not None and len(online_form) > 0:
55
+        for char in spec_char:
56
+            online_form = online_form.replace(char, " ")
57
+
58
+    if production is not None and len(production) > 0:
59
+        for char in spec_char:
60
+            production = production.replace(char, " ")
61
+
62
+    if types is not None and len(types) > 0:
63
+        for char in spec_char:
64
+            types = types.replace(char, " ")
65
+            types = types.replace("  ", " ")
66
+
67
+    sql = """
68
+        update yxb.ad_tv_lib set director = '%s', scriptwritter = '%s', main_actors = '%s', types = '%s', areas = '%s', plat_form = '%s', pub_comp = '%s', online_form = '%s', production = '%s' where id = '%s'
69
+    """
70
+    sql = sql % (director, scriptwritter, main_actors, types, areas, plat_form, pub_comp, online_form, production, _id)
71
+    Mysql.execute(sql, conn=conn)
72
+
73
+
74
+# update the type field of scrapy.types_analyse
75
+
76
+
77
+Mysql.close(conn)
78
+# type_set = set()
79
+# for row in rows:
80
+#     _id = row['id']
81
+#     types = row['types']
82
+#     if types is not None and len(types) > 0:
83
+#         for char in spec_char:
84
+#             types = types.replace(char, " ")
85
+        
86
+#         for _type in types.split(" "):
87
+#             type_set.add(_type.strip())
88
+
89
+# for _type in type_set:
90
+#     print _type
91
+
92
+# Mysql.close(conn)

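The per-row UPDATE in ad_tv_lib_clean.py interpolates the cleaned values straight into the SQL string, so any field that still contains a single quote breaks the statement. A sketch of the same update using driver-side placeholders instead; it assumes Mysql.insertMany forwards its parameter tuples to the driver's executemany, which is how it is used elsewhere in these scripts:

from fty_util.common import Mysql

def update_row(conn, row_values, _id):
    # row_values: the nine cleaned fields, in the same order as the UPDATE below
    sql = """
        update yxb.ad_tv_lib
        set director = %s, scriptwritter = %s, main_actors = %s, types = %s, areas = %s,
            plat_form = %s, pub_comp = %s, online_form = %s, production = %s
        where id = %s
    """
    # one-element batch so the MySQL driver escapes every value (quotes included)
    Mysql.insertMany(sql, [tuple(row_values) + (_id,)], conn)

# usage inside the cleaning loop above:
# update_row(conn, (director, scriptwritter, main_actors, types, areas,
#                   plat_form, pub_comp, online_form, production), _id)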
+ 181 - 0
tmp_ad_tv_sr_stat.py

@@ -0,0 +1,181 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#purpose: compute the base statistics for TV drama rating indices
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import time
9
+import copy
10
+import datetime
11
+import numpy as np
12
+from fty_util.common import Mysql
13
+
14
+tv_data = {}
15
+tv_rate = {}
16
+tv_avg_sr = {}
17
+tv_station = {}
18
+tv_play = {}
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+sql = "select tv_name,channel,audience_rating,tv_date from odl.ad_television where theater_attribute='黄金剧场'"
23
+data = Mysql.selectAll(sql, conn=conn)
24
+
25
+sql_tv = "select tv_id,tv_name,director,scriptwriter,main_actors,filmer,first_type,show_time from odl.ad_tv_lib where is_use=1"
26
+tmp_data = Mysql.selectAll(sql_tv, conn=conn)
27
+for i in range(len(tmp_data)):
28
+	tv_id = int(tmp_data[i][0])
29
+	tv_name = tmp_data[i][1]
30
+	director = tmp_data[i][2] if tmp_data[i][2] else ''
31
+	scriptwriter = tmp_data[i][3] if tmp_data[i][3] else ''
32
+	actors = tmp_data[i][4] if tmp_data[i][4] else ''
33
+	filmer = tmp_data[i][5] if tmp_data[i][5] else ''
34
+	type1 = tmp_data[i][6] if tmp_data[i][6] else ''
35
+	tv_data[(tv_id,tv_name)] = [director,scriptwriter,actors,filmer,type1]
36
+	tv_play[tv_name] = tmp_data[i][7]
37
+
38
+
39
+for i in range(len(data)):
40
+	tv_name = data[i][0]
41
+	channel = data[i][1]
42
+	aud_rating = float(data[i][2])
43
+	tv_date = datetime.datetime.strftime(data[i][3],'%Y-%m')
44
+	year = data[i][3].year
45
+	if aud_rating and tv_play.has_key(tv_name): #only keep dramas that exist in ad_tv_lib
46
+		show_time = tv_play[tv_name] if tv_play[tv_name] else str(year)
47
+
48
+		if str(year) in show_time:
49
+			tv_station.setdefault(channel,{})
50
+			tv_station[channel].setdefault(tv_date,[])
51
+			tv_station[channel][tv_date].append(aud_rating)
52
+
53
+			tv_rate.setdefault(tv_name,{})
54
+			if not tv_rate.get(tv_name):
55
+				tv_rate[tv_name].setdefault(year,{})
56
+			else:
57
+				yy = tv_rate[tv_name].keys()[0]
58
+				if year < yy:
59
+					del tv_rate[tv_name][yy]
60
+					tv_rate[tv_name][year] = {}
61
+			if tv_rate[tv_name].has_key(year):
62
+				tv_rate[tv_name][year].setdefault(channel,['9999',[]])
63
+				dd = tv_rate[tv_name][year][channel][0]
64
+				if tv_date < dd:
65
+					tv_rate[tv_name][year][channel][0] = tv_date
66
+				tv_rate[tv_name][year][channel][1].append(aud_rating)
67
+
68
+for channel,value in tv_station.items():
69
+	for tv_date in value:
70
+		tmp_arr = value[tv_date]
71
+		avg_rating = sum(tmp_arr)/len(tmp_arr)
72
+		tv_station[channel][tv_date] = avg_rating
73
+
74
+
75
+def avg_rate(Date,obj,channel): #Date:'2014-05', obj:tv_station, channel: station name
76
+	'''Average rating of the station over the year ending at Date'''
77
+	array = []
78
+	tmp = Date.split('-')
79
+	if int(tmp[1])==1:
80
+		month = '12'
81
+	elif 1<int(tmp[1])<=10:
82
+		month = '0'+str(int(tmp[1])-1)
83
+	else:
84
+		month = str(int(tmp[1])-1)
85
+	Date2 = str(int(tmp[0])-1)+'-'+month
86
+	tmp_obj = obj[channel]
87
+	for tv_date in tmp_obj:
88
+		if Date2<=tv_date<=Date:
89
+			array.append(tmp_obj[tv_date])
90
+	if not len(array):
91
+		print Date,channel,obj[channel]
92
+	res = sum(array)/len(array)
93
+	return res
94
+
95
+#tv_rate_new = copy.deepcopy(tv_rate)
96
+for tv_name in tv_rate:
97
+	year = tv_rate[tv_name].keys()[0]
98
+	tv_sr = []
99
+	for channel,value in tv_rate[tv_name][year].items():
100
+		tv_date = value[0]
101
+		tv_avg_rating = sum(value[1])/len(value[1])
102
+		tv_sr.append(tv_avg_rating/avg_rate(tv_date,tv_station,channel))
103
+	tv_avg_sr[tv_name] = (year,sum(tv_sr)/len(tv_sr))
104
+
105
+people_sr = [{},{},{},{},{}] #values of every variable in each dimension
106
+for id_name in tv_data:
107
+	tv_id = id_name[0]
108
+	tv_name = id_name[1]
109
+	people_arr = tv_data[id_name]
110
+	if tv_avg_sr.get(tv_name):
111
+		year,avg_sr = tv_avg_sr[tv_name]
112
+		for i in range(len(people_arr)):
113
+			if people_arr[i]:
114
+				p_arr = people_arr[i].split(u' ')
115
+				for peo in p_arr:
116
+					if peo:
117
+						people_sr[i].setdefault(peo,{})
118
+						people_sr[i][peo].setdefault(year,[])
119
+						people_sr[i][peo][year].append((tv_id,avg_sr))
120
+
121
+people_sr_new = copy.deepcopy(people_sr)
122
+people_sr_new2 = copy.deepcopy(people_sr)
123
+for i in range(len(people_sr)):
124
+	for peo in people_sr[i]:
125
+		peo_obj = people_sr[i][peo]
126
+		for year,value in people_sr[i][peo].items():
127
+			people_sr_new[i][peo][year] = []
128
+			people_sr_new2[i][peo][year] = []
129
+			for j in range(2010,year+1):
130
+				if peo_obj.has_key(j):
131
+					tmp_arr = [jj[1] for jj in peo_obj[j]]
132
+					people_sr_new[i][peo][year].extend(tmp_arr)
133
+					people_sr_new2[i][peo][year].extend(peo_obj[j])
134
+
135
+
136
+result_sr = []  #rating-index info for each drama
137
+for id_name in tv_data:
138
+	tv_id = id_name[0]
139
+	tv_name = id_name[1]
140
+	people_arr = tv_data[id_name]
141
+	if tv_avg_sr.get(tv_name):
142
+		year,avg_sr = tv_avg_sr[tv_name]
143
+		peo_arr = [tv_id,tv_name,avg_sr]
144
+		for i in range(len(people_arr)):
145
+			tmp_str = ''
146
+			p_arr = people_arr[i].split(u' ')
147
+			for peo in p_arr:
148
+				if people_sr_new[i].has_key(peo):
149
+					var = sum(people_sr_new[i][peo][year])/len(people_sr_new[i][peo][year])
150
+					tmp_str += peo + ":" + str(var) + ';'
151
+			peo_arr.append(tmp_str)
152
+		peo_arr.append(year)
153
+		result_sr.append(tuple(peo_arr))
154
+					
155
+result = []
156
+for i in range(len(people_sr_new2)):
157
+	people_obj = people_sr_new2[i]
158
+	for peo in people_obj:
159
+		for year,value in people_obj[peo].items():
160
+			str1 = str([arr[0] for arr in value])
161
+			str2 = str([arr[1] for arr in value])
162
+			result.append((peo,str1,str2,year,i+1))
163
+
164
+
165
+delete = 'delete from tmp.ad_tv_sr_pre_var'
166
+Mysql.execute(delete, conn=conn)
167
+
168
+sql = 'insert into tmp.ad_tv_sr_pre_var values(%s,%s,%s,%s,%s)'
169
+for i in range(int(len(result)/1000)+1):
170
+	tmp = result[i*1000:(i+1)*1000]
171
+	Mysql.insertMany(sql, tmp, conn=conn)
172
+
173
+delete = 'delete from tmp.ad_tv_sr'
174
+Mysql.execute(delete, conn=conn)
175
+
176
+sql_sr = 'insert into tmp.ad_tv_sr values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
177
+for i in range(int(len(result_sr)/1000)+1):
178
+	tmp = result_sr[i*1000:(i+1)*1000]
179
+	Mysql.insertMany(sql_sr, tmp, conn=conn)
180
+
181
+Mysql.close(conn)

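The quantity built in tmp_ad_tv_sr_stat.py is a per-drama rating index: for each channel that aired the drama in its first-run year, the drama's average rating is divided by that channel's average rating over the year before its first air month, and the ratios are averaged across channels. A compact sketch under the same dictionary shapes as tv_station and tv_rate above; the empty-window guard is an addition, since the original avg_rate prints a warning and then divides by zero:

def trailing_avg(tv_station, channel, month):
    # month like '2014-05'; window covers roughly the preceding year
    year, mm = month.split('-')
    start = '%d-%s' % (int(year) - 1, mm)
    window = [v for d, v in tv_station.get(channel, {}).items() if start <= d <= month]
    if not window:
        return None          # guard added; avg_rate above would raise ZeroDivisionError here
    return sum(window) / len(window)

def rating_index(channels, tv_station):
    # channels: {channel: [first_month, [ratings...]]} for one drama/year, as in tv_rate
    ratios = []
    for channel, (first_month, ratings) in channels.items():
        base = trailing_avg(tv_station, channel, first_month)
        if base:
            ratios.append((sum(ratings) / len(ratings)) / base)
    return sum(ratios) / len(ratios) if ratios else None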
+ 226 - 0
tv_outline_recom.py

@@ -0,0 +1,226 @@
1
+#encoding=utf-8
2
+#author:wdw110
3
+#purpose: offline computation of similar dramas for each TV drama
4
+
5
+from __future__ import division
6
+import re
7
+import math
8
+import jieba
9
+import numpy as np
10
+from fty_util.common import Mysql
11
+
12
+tv_tf = {}  #{id:[[{},{},..],..],...}
13
+idf = {}
14
+idf_aft = {}
15
+var_stat = [[],[],[],[],[],[],[]] #term statistics per dimension
16
+seq2id = {}
17
+weight = [5,2,1,1,1,1,2]
18
+tags = {} #tag vocabulary
19
+
20
+conn = Mysql.createOfflineConn()
21
+
22
+dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
23
+
24
+sql = "select %s from odl.ad_tv_lib where is_use=1" %(', '.join(dims))
25
+tv_data = Mysql.selectAll(sql, conn=conn)
26
+
27
+sql2 = 'select tag from odl.ad_type_lib'
28
+tmp = Mysql.selectAll(sql2, conn=conn)
29
+for word in tmp:
30
+	tags[word[0]] = 1
31
+
32
+def find_tag(sentence): #sentence is the drama's description text
33
+	seg = jieba.cut(sentence)
34
+	res = {}
35
+	for word in seg:
36
+		if tags.get(word):
37
+			res.setdefault(word,1)
38
+	return u' '.join(res.keys())
39
+
40
+
41
+for i in range(len(tv_data)):
42
+	tv_id = int(tv_data[i][0])
43
+	tv_data[i] = list(tv_data[i])
44
+	arr = tv_data[i][1:]
45
+	tmp = []  #all keywords of this drama
47
+	word_count = {} #keyword counts for this drama
48
+	dim_tmp = [] #per-dimension keyword stats for this drama [[{},{}..],..]
48
+	tv_tf.setdefault(tv_id,[])
49
+	seq2id[i] = tv_id
50
+	if not arr[1]: 
51
+		arr[1] = ''
52
+		tv_data[i][2] = ''
53
+	else: 
54
+		arr[1] = find_tag(arr[1])
55
+		tv_data[i][2] = arr[1]
56
+	for j in range(len(arr)):
57
+		obj = {}
58
+		if not arr[j]:
59
+			wd = u''
60
+		else:
61
+			wd = arr[j]
62
+		words = wd.split(u' ')
63
+		#print words
64
+		for word in words:
65
+			if word:
66
+				obj[word] = 1
67
+				word_count.setdefault(word,0)
68
+				word_count[word] += 1
69
+		dim_tmp.append(obj)
70
+		var_stat[j].extend(obj.keys())
71
+		tmp.extend(obj.keys())
72
+	n = len(tmp) #total number of keywords for this drama
73
+	for obj_j in dim_tmp:
74
+		for k in obj_j:
75
+			obj_j[k] = word_count[k]/n
76
+	tv_tf[tv_id] = dim_tmp
77
+	for word in list(set(tmp)):
78
+		idf.setdefault(word,0)
79
+		idf[word] += 1
80
+
81
+N = len(tv_tf)  #total number of dramas
82
+for key in idf:
83
+	idf_aft[key] = math.log10(N/idf[key])
84
+
85
+for i in range(len(var_stat)):
86
+	var_stat[i] = list(set(var_stat[i])) #deduplicate
87
+
88
+
89
+#compute the drama score matrix
90
+def tv_score(weight, tf, idf):
91
+	col = len(tf)
92
+	row = sum([len(v) for v in var_stat])
93
+	res = np.zeros((col, row))
94
+	score_arr = {}
95
+	for i in range(col):
96
+		tv_arr = tf[seq2id[i]]
97
+		mm = 0 #position of each word in the flattened vector
98
+		score_arr.setdefault(i,[])
99
+		for j in range(len(tv_arr)):
100
+			tmp2 = np.zeros(len(var_stat[j])) #vector for this dimension
101
+			if j>0: mm += len(var_stat[j-1]) 
102
+			for word,value in tv_arr[j].items():
103
+				score = weight[j]*value*idf[word]
104
+				ll = var_stat[j].index(word)
105
+				nn = ll + mm
106
+				res[i,nn] = score
107
+				tmp2[ll] = score
108
+			score_arr[i].append(tmp2)
109
+	return res,score_arr
110
+
111
+def cos_distance(vec1, vec2):
112
+	v11 = vec1*vec1
113
+	v12 = vec1*vec2
114
+	v22 = vec2*vec2
115
+	mer = sum(v12[v12>0])
116
+	denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
117
+	if not denominator:
118
+		return 0
119
+	return mer/denominator
120
+
121
+def tv_sim(data): #data: drama score matrix, one row per drama
122
+	n,m = data.shape
123
+	res = np.zeros((n,n))
124
+	result = []
125
+	x = range(1,n+1)
126
+	for i in range(n):
127
+		res[i,i] = 1
128
+		for j in range(i+1,n):
129
+			res[i,j] = cos_distance(data[i,],data[j,])
130
+			res[j,i] =  res[i,j]
131
+		index_arr = np.argsort(-res[i,])
132
+		sort_arr = res[i,][index_arr]
133
+		id_arr = np.array([seq2id[i] for i in index_arr])
134
+		tmp = zip(id_arr,sort_arr)
135
+		result.append(dict(enumerate(tmp[0:100])))
136
+	return result
137
+
138
+dat,score_mat = tv_score(weight,tv_tf,idf_aft)
139
+res_sim = tv_sim(dat)
140
+
141
+
142
+#save the results and intermediate data to the database
143
+'''
144
+sql = 'delete from idl.ad_tv_cos'
145
+cursor.execute(sql)
146
+db.commit()
147
+
148
+vv = []
149
+for i in range(len(res_sim)):
150
+	sim_arr = []
151
+	for key,value in res_sim[i].items():
152
+		sim_arr.append(value[0])
153
+	vv.append((seq2id[i],str(res_sim[i]),str(sim_arr)))
154
+
155
+sql = 'insert into idl.ad_tv_cos values (%s,%s,%s)'
156
+
157
+for i in range(int(len(vv)/1000)+1):
158
+	tmp = vv[i*1000:(i+1)*1000]
159
+	cursor.executemany(sql,tmp)
160
+db.commit()
161
+'''
162
+
163
+delete = 'delete from tmp.ad_tv_recom_idf'
164
+Mysql.execute(delete, conn=conn)
165
+
166
+tmp_ll = list(idf.items())
167
+vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],N) for i in range(len(tmp_ll))]
168
+sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
169
+Mysql.insertMany(sql, vv, conn=conn)
170
+
171
+
172
+delete = 'delete from tmp.ad_tv_recom_tf'
173
+Mysql.execute(delete, conn=conn)
174
+
175
+vv = []
176
+for key,tv_arr in tv_tf.items():
177
+	tmp = []
178
+	tmp.append(int(key))
179
+	for tv_obj in tv_arr:
180
+		ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
181
+		tmp.append(ss)
182
+	vv.append(tuple(tmp))
183
+sql = 'insert into tmp.ad_tv_recom_tf values(%s,%s,%s,%s,%s,%s,%s,%s)'
184
+Mysql.insertMany(sql, vv, conn=conn)
185
+
186
+delete = 'delete from tmp.ad_tv_recom_var_stat'
187
+Mysql.execute(delete, conn=conn)
188
+
189
+dim_arr = [tuple(','.join(tmp_arr) for tmp_arr in var_stat)]
190
+sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
191
+Mysql.insertMany(sql, dim_arr, conn=conn)
192
+
193
+Mysql.close(conn)
194
+
195
+
196
+#save the results to a local file
197
+f1 = open('ad_tv_recom_score_matrix.txt','w')
198
+f1.write('id\ttypes\ttags\tdirector\tmain_actors\tscriptwriter\tfilmer\tdecade\n')
199
+for i in range(dat.shape[0]):
200
+	ss = str(seq2id[i])
201
+	for tt in score_mat[i]:
202
+		ss += '\t'+','.join([str(i)+':'+str(tt[i]) for i in np.nonzero(tt)[0]])
203
+	f1.write(ss+'\n')
204
+
205
+f1.close()
206
+'''
207
+def en2str(word):
208
+	return word.encode('utf-8')
209
+
210
+f2 = open('ad_tv_recom_var_stat.txt','w')
211
+f2.write('type\ttag\tdirector\tmain_actors\tscriptwritter\tproduction\n')
212
+ss = '\t'.join([','.join(map(en2str,tmp_arr)) for tmp_arr in var_stat])
213
+f2.write(ss+'\n')
214
+
215
+f2.close()
216
+
217
+f3 = open('data/tv_outline_cos1.txt','w')
218
+for i in range(len(res_sim)):
219
+	sim_arr = []
220
+	for key,value in res_sim[i].items():
221
+		sim_arr.append(value[0])
222
+	f3.write(str(seq2id[i])+'\t'+str(res_sim[i])+'\t'+str(sim_arr)+'\n')
223
+
224
+f1.close()
225
+f3.close()
226
+'''

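tv_outline_recom.py builds a weighted TF-IDF vector per drama and compares rows pairwise. Note that its cos_distance divides by the sum of the two vector norms rather than their product, so it is a variant of cosine similarity rather than the textbook formula. A short sketch of the standard vectorised cosine over the same score matrix dat, offered as a point of comparison only:

import numpy as np

def cosine_sim_matrix(dat):
    # dat: n_dramas x n_terms score matrix, as returned by tv_score above
    norms = np.sqrt((dat * dat).sum(axis=1))
    norms[norms == 0] = 1.0                 # guard empty rows against division by zero
    unit = dat / norms[:, None]
    return unit.dot(unit.T)                 # n x n matrix of cosine similarities

# usage:
# sim = cosine_sim_matrix(dat)
# top100_ids = [seq2id[j] for j in np.argsort(-sim[i])[:100]]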
+ 228 - 0
tv_real_recom_fix.py

@@ -0,0 +1,228 @@
1
+#encoding=utf-8
2
+
3
+from __future__ import division
4
+import re
5
+import sys
6
+import time
7
+import math
8
+import jieba
9
+import datetime
10
+import numpy as np 
11
+from fty_util.common import Mysql
12
+
13
+start = time.time()
14
+tf = {} #{id:[{},{},..],...}
15
+idf_bre = {}
16
+idf_aft = {}
17
+tv_data = []  #variable data of the new drama
18
+score_mat = {} #drama score matrix {id:[{},{}...],...}
19
+tags = {} #tag vocabulary
20
+weight = [5,2,1,1,1,1,2]
21
+
22
+if len(sys.argv) > 1:
23
+	tv_id = int(sys.argv[1])
24
+else:
25
+	print 'please supply a tv drama id'
26
+	sys.exit()
27
+
28
+conn = Mysql.createOfflineConn()
29
+
30
+sql1 = 'select * from tmp.ad_tv_recom_idf'
31
+tmp = Mysql.selectAll(sql1, conn=conn)
32
+tv_sum = tmp[0][3] #total number of historical dramas
33
+for i in range(len(tmp)):
34
+	arr = tmp[i]
35
+	idf_bre[arr[1]] = arr[2]
36
+
37
+sql2 = 'select * from tmp.ad_tv_recom_var_stat'
38
+tmp = Mysql.selectAll(sql2, conn=conn)
39
+var_stat = [word.split(',') for word in tmp[0]] #term statistics per dimension
40
+
41
+ff = open('ad_tv_recom_score_matrix.txt','r')
42
+title = '' #column header line of the file
43
+for line in ff.readlines():
44
+	arr = line.strip('\n').split('\t')
45
+	if arr[0] == 'id': 
46
+		title = line
47
+	else:
48
+		k = int(arr[0])
49
+		score_mat.setdefault(k,[])
50
+		for j in arr[1:]:
51
+			obj = {}
52
+			if len(j):
53
+				for ss in j.split(','):
54
+					tmp_arr = ss.split(':')
55
+					obj[int(tmp_arr[0])] = tmp_arr[1] 
56
+	 		score_mat[k].append(obj)
57
+ff.close()
58
+
59
+sql4 = 'select tag from odl.ad_type_lib'
60
+tmp = Mysql.selectAll(sql4, conn=conn)
61
+for word in tmp:
62
+	tags[word[0]] = 1
63
+
64
+dims = ['tv_id','types','description','director','main_actors','scriptwriter','filmer','decade']
65
+
66
+sql = "select %s from odl.ad_tv_lib where tv_id=%d" %(', '.join(dims),tv_id)
67
+tv_data = Mysql.selectAll(sql, conn=conn)
68
+
69
+def find_tag(sentence): #sentence is the drama's description text
70
+	seg = jieba.cut(sentence)
71
+	res = {}
72
+	for word in seg:
73
+		if tags.get(word):
74
+			res.setdefault(word,1)
75
+	return u' '.join(res.keys())
76
+
77
+for i in range(len(tv_data)):
78
+	tv_data[i] = list(tv_data[i])
79
+	tv_data[i][0] = int(tv_data[i][0])
80
+	key = tv_data[i][0]
81
+	arr = tv_data[i][1:]
82
+	tmp = []  #all keywords of this drama
83
+	dim_tmp = [] #per-dimension keyword stats for this drama [[{},{}..],..]
84
+	if key not in score_mat:
85
+		tv_sum += 1 #total drama count including the new one
86
+	if not arr[1]: 
87
+		arr[1] = ''
88
+		tv_data[i][2] = ''
89
+	else: 
90
+		arr[1] = find_tag(arr[1])
91
+		tv_data[i][2] = arr[1]
92
+	for j in range(len(arr)):
93
+		obj = {}
94
+		if not arr[j]: 
95
+			wd = u''
96
+		else:
97
+			wd = arr[j]
98
+		words = wd.split(u' ')
99
+		words = list(set(words))
100
+		if u'' in words:
101
+			words.remove(u'')
102
+		tmp.extend(words) 
103
+		for word in words:
104
+			obj.setdefault(word, 0)
105
+			obj[word] += 1
106
+		dim_tmp.append(obj)
107
+	n = len(tmp) #total number of keywords for this drama
108
+	for l in range(len(dim_tmp)):
109
+		obj_j = dim_tmp[l]
110
+		for k in obj_j:
111
+			if n: obj_j[k] /= n
112
+			else: obj_j[k] = 0
113
+			if k not in var_stat[l]: #check whether the new drama's keyword already exists in the historical vocabulary
114
+				var_stat[l].append(k)
115
+	tf[key] = dim_tmp
116
+	for ww in list(set(tmp)):
117
+		if not idf_bre.has_key(ww):
118
+			idf_bre[ww] = 1
119
+		else:
120
+			if key not in score_mat:
121
+				idf_bre[ww] += 1 
122
+
123
+for key in idf_bre:
124
+	idf_aft[key] = math.log10(tv_sum/idf_bre[key])
125
+
126
+
127
+#rebuild the score matrix for the historical dramas
128
+length = sum([len(v) for v in var_stat])
129
+for key in score_mat:
130
+	tmp_arr = score_mat[key]
131
+	tmp = np.zeros(length)
132
+	ll = 0
133
+	for i in range(len(var_stat)):
134
+		if i > 0: ll += len(var_stat[i-1])
135
+		mat = tmp_arr[i]
136
+		for k,v in mat.items():
137
+			tmp[ll+k] = v
138
+	score_mat[key] = tmp
139
+
140
+
141
+#compute the drama score matrix
142
+def tv_score(weight, tf, idf):
143
+	res = {}
144
+	row = sum([len(v) for v in var_stat])
145
+	for i in tf:
146
+		tv_arr = tf[i]
147
+		mm = 0 #position of each word in the flattened vector
148
+		res.setdefault(i,np.zeros(row))
149
+		for j in range(len(tv_arr)):
150
+			if j>0: mm += len(var_stat[j-1]) 
151
+			for word,value in tv_arr[j].items():
152
+				score = weight[j]*value*idf[word]
153
+				nn = var_stat[j].index(word) + mm
154
+				res[i][nn] = score
155
+	return res
156
+
157
+def cos_distance(vec1, vec2):
158
+	v11 = vec1*vec1
159
+	v12 = vec1*vec2
160
+	v22 = vec2*vec2
161
+	mer = sum(v12[v12>0])
162
+	denominator = math.sqrt(sum(v11[v11>0])) + math.sqrt(sum(v22[v22>0]))
163
+	if not denominator:
164
+		return 0
165
+	return mer/denominator
166
+
167
+def tv_sim(tv_id,data): #tv_id:要计算的电视剧(1,2,3...),data:电视剧得分矩阵({1:[],2:[]})
168
+	res = []
169
+	vec1 = data[tv_id]
170
+	for key,tv_arr in data.items():
171
+		cos = cos_distance(vec1,tv_arr)
172
+		res.append([key,cos])
173
+	return dict(enumerate(sorted(res,key=lambda x:x[1],reverse=True)[0:400]))
174
+
175
+dat = tv_score(weight,tf,idf_aft)
176
+score_mat_new = dict(score_mat,**dat) #将新剧和老剧的得分合并
177
+
178
+
179
+#将结果和中间数据保存到数据库中
180
+for key in dat:
181
+	res = tv_sim(key, score_mat_new)
182
+	sim_arr = ','.join([str(i[0]) for i in res.values()])
183
+	sql = 'replace into idl.ad_tv_4sim_wmd values ("%d","%s")' %(key,sim_arr)
184
+	Mysql.execute(sql, conn=conn)
185
+
186
+
187
+vv = []
188
+for key,tv_arr in tf.items():
189
+	tmp = []
190
+	tmp.append(int(key))
191
+	for tv_obj in tv_arr:
192
+		ss = ';'.join([k.encode('utf-8')+':'+str(v) for k,v in tv_obj.items()])
193
+		tmp.append(ss)
194
+	if key not in score_mat:
195
+		sql = 'replace into tmp.ad_tv_recom_tf values("%s","%s","%s","%s","%s","%s","%s","%s")' % tuple(tmp)
196
+		Mysql.execute(sql, conn=conn)
197
+
198
+
199
+delete = 'delete from tmp.ad_tv_recom_idf'
200
+Mysql.execute(delete, conn=conn)
201
+
202
+tmp_ll = list(idf_bre.items())
203
+vv = [(i+1,tmp_ll[i][0],tmp_ll[i][1],tv_sum) for i in range(len(tmp_ll))]
204
+sql = 'insert into tmp.ad_tv_recom_idf values(%s,%s,%s,%s)'
205
+Mysql.insertMany(sql, vv, conn=conn)
206
+
207
+
208
+delete = 'delete from tmp.ad_tv_recom_var_stat'
209
+Mysql.execute(delete, conn=conn)
210
+
211
+dim_arr = [(','.join(tmp_arr) for tmp_arr in var_stat)]
212
+sql = 'insert into tmp.ad_tv_recom_var_stat values(%s,%s,%s,%s,%s,%s,%s)'
213
+Mysql.insertMany(sql, dim_arr, conn=conn)
214
+
215
+
216
+f1 = open('ad_tv_recom_score_matrix.txt','a')
217
+for tv_id,np_arr in dat.items():
218
+	nn = 0
219
+	if tv_id not in score_mat:
220
+		res = str(tv_id)
221
+		for arr in var_stat:
222
+			tmp = np_arr[nn:(nn+len(arr))]
223
+			nn +=  len(arr)
224
+			res += '\t' + ','.join([str(i)+':'+str(tmp[i]) for i in np.nonzero(tmp)[0]])
225
+		f1.write(res+'\n')	
226
+f1.close()
227
+
228
+Mysql.close(conn)
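Several scripts above persist their results in manual 1000-row slices (result[i*1000:(i+1)*1000]), while the earlier task files call Util.insert_by_chunk from fty_util.common for the same job. A generic sketch of that chunking pattern; insert_in_chunks is an illustrative helper, not the actual Util implementation:

from fty_util.common import Mysql

def insert_in_chunks(sql, rows, conn, chunk_size=1000):
    # issue one Mysql.insertMany call per slice of at most chunk_size rows
    for start in range(0, len(rows), chunk_size):
        batch = rows[start:start + chunk_size]
        if batch:
            Mysql.insertMany(sql, batch, conn=conn)

# usage, matching the result_sr insert in tmp_ad_tv_sr_stat.py:
# insert_in_chunks('insert into tmp.ad_tv_sr values(%s,%s,%s,%s,%s,%s,%s,%s,%s)', result_sr, conn)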