scrapy_tengxun.py

#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
  11. """
  12. 腾讯视频爬取规则
  13. 1、scrapy_url 通过搜索页面,爬取搜索到的最有可能是电视剧页面的url
  14. 2、scrapy_data 进入搜索到的详情页面,爬取评分,每集url(播放数量在每集页面上显示)
  15. 3、todo 爬取每页详情页
  16. 腾讯视频通过搜索到的详情页面没有播放数量和评论数量,需要一个个页面解析
  17. 搜索页面-->搜索详情页面-->播放页面(只需取第一集播放页面即可)
  18. 所以只有在播放页面爬取到播放量即可。
  19. """
def scrapy_url():
    # Step 1: search each series name and store result URLs that look like detail pages.
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception as e:
            # Page load timed out: stop loading and parse whatever has rendered.
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
        for div in divs:
            try:
                title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
                if 'v.qq.com/detail' in href:
                    print(href)
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'tengxun')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception as e:
                print(e)
                continue
        driver.quit()
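# Every function in this file repeats the same load-with-timeout pattern: call
# driver.get(url) and, if the 10-second page-load timeout fires, stop the load
# so the partially rendered DOM can still be queried. A minimal sketch of that
# pattern as a reusable helper (the name safe_get is illustrative only, it is
# not part of the original script):
def safe_get(driver, url):
    try:
        driver.get(url)
    except Exception as e:
        # Timed out (or otherwise failed): stop loading and carry on with
        # whatever has rendered so far.
        print(e)
        driver.execute_script('window.stop()')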
# Scrape each detail page found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            driver.execute_script('window.stop()')
        # Scrape the page text of the detail page
        try:
            content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
        except Exception as e:
            content = ''
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
            if pagelist is not None:
                data_list = []
                for page in pagelist:
                    num = page.find_element_by_xpath('./a/span').text
                    num = num.replace(' ', '').replace('\n', '')
                    href = page.find_element_by_xpath('./a').get_attribute('href')
                    if 'v.qq.com' in href:
                        data_list.append((_id, tv_name, num, href, 'tengxun'))
                # Insert the per-episode URLs
                if data_list is not None and len(data_list) > 0:
                    sql = """
                        insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                    """
                    Mysql.insertMany(sql, data_list, conn)
        except Exception as e:
            pass
        # Update the scraped content on the detail-page record
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()
# Scrape the play page (only the first episode is needed for the play count)
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'v.qq.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception as e:
                print(e)
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
            except Exception as e:
                print(e)
                count = 0
            print(count)
            sql = """
                update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
            driver.quit()
# For each stored detail page, scrape the rating and the first episode's play count
def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print(e)
            driver.execute_script('window.stop()')
        # Rating shown on the detail page
        try:
            score = driver.find_element_by_xpath('//div[@class="video_score"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        # Episode list on the detail page
        try:
            pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
        except:
            pagelist = None
        try:
            page_dict = dict()
            if pagelist is not None:
                for page in pagelist:
                    episode = page.find_element_by_xpath('./a').get_attribute('href')
                    episode_text = page.find_element_by_xpath('./a/span').text
                    page_dict[episode_text] = episode
            if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
                # Open the first episode in the second browser to read the play count
                try:
                    driver2.get(page_dict.get('1'))
                except Exception as e:
                    print(e)
                    driver2.execute_script('window.stop()')
                try:
                    count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
                except Exception as e:
                    print(e)
                    count = 0
                sql = """
                    update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
                """
                sql = sql % (score, count, url)
                Mysql.execute(sql, conn=conn)
            else:
                # No usable first-episode link: drop the record
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'tengxun')
                Mysql.execute(sql, conn=conn)
        except Exception as e:
            continue
    driver.quit()
    driver2.quit()
if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    parse_wangju_all_url_data()