scrapy_iqiyi.py

#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql
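
# NOTE: fty_util.common.Mysql is an in-house database helper that is not part of this
# file. Judging only from how it is called below, its interface is assumed to look
# roughly like this (a sketch of the assumed API, not the real implementation):
#   conn = Mysql.createOfflineConn()               # open a connection to the offline MySQL instance
#   rows = Mysql.getAll(sql, conn=conn)            # run a SELECT, return a list of dict-like rows
#   Mysql.insertOne(sql, value=params, conn=conn)  # parameterized single-row INSERT
#   Mysql.insertMany(sql, param_rows, conn)        # parameterized multi-row INSERT
#   Mysql.execute(sql, param=params, conn=conn)    # run an UPDATE / DELETE (param is optional)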

reload(sys)
sys.setdefaultencoding('utf8')

"""
iQIYI crawling rules
1. scrapy_url: crawl the search pages and collect the URLs of the series pages they return
2. scrapy_data: open each detail page found by the search and crawl its description and the URL of every episode (the play count is shown on the episode page)
3. scrapy_play_page: open the play page of episode 1 and crawl the play count
4. todo: crawl every page once a day
The detail pages reached from the search carry neither a play count nor a comment count, so every page has to be parsed one by one:
search page --> search detail page --> play page (only episode 1 is needed) --> real detail page (crawl the play count and the comment count; the comment count cannot be crawled yet)
So it is enough to read the play count from the play page.
"""

# Crawl the search pages
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
        for li in lis:
            try:
                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                if 'www.iqiyi.com/lib' in href:
                    print href
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'iqiyi')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception, e:
                print e
                continue
        driver.quit()

# Crawl the detail pages found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        title = row['title']
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        # Crawl the description
        try:
            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
        except Exception, e:
            content = ''
        # Crawl the episode list
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
        except Exception, e:
            # If there are no hidden episodes, fall back to the visible ones
            try:
                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
            except Exception, e:
                pagelist = None
        if pagelist is not None:
            # If episodes exist, collect the URL of every episode so play and comment counts can be crawled later
            data_list = []
            for page in pagelist:
                num = page.find_element_by_xpath('./a').get_attribute('title')
                num = num.replace(' ', '').replace('\n', '')
                href = page.find_element_by_xpath('./a').get_attribute('href')
                if 'www.iqiyi.com' in href:
                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
            # Insert the episode rows
            if data_list is not None and len(data_list) > 0:
                sql = """
                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                """
                Mysql.insertMany(sql, data_list, conn)
        # Update the description
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()

# Crawl the play pages
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
            except Exception, e:
                print e
                count = 0
            print count
            sql = """
                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
        driver.quit()
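
# The UPDATE in scrapy_play_page above is built with string interpolation
# (sql % (count, _id)). The same statement could be run with placeholders instead,
# as the other queries in this file already do through Mysql.execute(sql, param=...,
# conn=...); a sketch, assuming the helper forwards `param` to the DB driver:
#
#   sql = """
#       update scrapy.wangju_url set iqiyi_playtimes = %s where id = %s
#   """
#   Mysql.execute(sql, param=(count, _id), conn=conn)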

# Crawl the play pages every day (iQIYI only exposes a per-episode comment count, not a per-episode play count)
def scrapy_play_page_everyday():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        _id = row['id']
        tv_name = row['tv_name']
        num = row['num']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            sql = """
                delete from scrapy.wangju_fenji_url where url = '%s'
            """
            sql = sql % (url,)
            Mysql.execute(sql, conn=conn)
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
            except Exception, e:
                print e
                commenttimes = ''
            print url
            print commenttimes
            # sql = """
            #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
            # """
            # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
        driver.quit()

def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception, e:
            print e
            driver.execute_script('window.stop()')
        try:
            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        try:
            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
        except Exception, e:
            pagelist = None
        try:
            if pagelist is not None:
                page_dict = dict()
                for page in pagelist:
                    try:
                        episode = page.find_element_by_xpath('./a').get_attribute('href')
                        episode_text = page.find_element_by_xpath('./a').text
                        page_dict[episode_text] = episode
                    except:
                        continue
                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
                    try:
                        driver2.get(page_dict.get('1'))
                        time.sleep(5)
                    except Exception, e:
                        print e
                        driver2.execute_script('window.stop()')
                    try:
                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
                    except Exception, e:
                        print e
                        count = '0'
                    print count
                    sql = """
                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
                    """
                    sql = sql % (score, count, url)
                    Mysql.execute(sql, conn=conn)
                else:
                    sql = """
                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                    """
                    sql = sql % (url, 'iqiyi')
                    Mysql.execute(sql, conn=conn)
            else:
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'iqiyi')
                Mysql.execute(sql, conn=conn)
        except Exception, e:
            continue
    # Close both browsers once all rows have been processed
    driver.quit()
    driver2.quit()

if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    # scrapy_play_page_everyday()
    parse_wangju_all_url_data()