#!/usr/bin/env python
# coding=utf-8
"""Crawl TV-series category tags from iQIYI and Tencent Video.

The crawl runs in two steps per site:
  1. Crawl the search results page and save the URL of each matching series.
  2. Open each saved detail-page URL and extract the category tags.
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

# Python 2 workaround so that non-ASCII titles can be concatenated and printed.
reload(sys)
sys.setdefaultencoding('utf8')

class DSJ_Categories(object):

    # Step 1 (iQIYI): crawl the search page and save each series' detail-page URL.
    def get_iqiyi_url(self):
        # Start a Firefox browser.
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that still need an iQIYI URL.
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and (iqiyi_url is null or iqiyi_url = '')
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
            # Open the search page; stop loading if it times out.
            try:
                driver.get(start_url)
            except Exception:
                driver.execute_script('window.stop()')

            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
            for li in lis:
                try:
                    # Keep the first result whose first episode link reads "1".
                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
                    if '1' == first_num.strip():
                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
                        print href
                        sql = """
                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
                        """
                        sql = sql % (href, _id)
                        Mysql.update(sql, conn=conn)
                        break
                except Exception as e:
                    print 'not found'
                    continue
        driver.quit()

    # Step 2 (iQIYI): open each saved detail page and extract the category tags.
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that have an iQIYI URL but no categories yet.
        sql = """
            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy
            where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = row['iqiyi_url']
            print url
            # Open the detail page; stop loading if it times out.
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')

            # Collect the category tags shown on the page.
            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())

            # Save the categories back to the database.
            sql = """
                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            Mysql.update(sql, conn=conn)
        driver.quit()

    # Step 1 (Tencent Video): crawl the search page and save each series' detail-page URL.
    def get_tengxun_url(self):
        # Start a Firefox browser.
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that still need a Tencent Video URL.
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
            # Open the search page; stop loading if it times out.
            try:
                driver.get(start_url)
            except Exception:
                driver.execute_script('window.stop()')

            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
            for div in divs:
                try:
                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
                    print href
                    # Only keep links that point to a "detail" page.
                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
                    if matchObj:
                        sql = """
                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
                        """
                        value = (href, _id)
                        Mysql.update(sql, param=value, conn=conn)
                        break
                except Exception as e:
                    print 'not found'
                    print e
                    continue
        driver.quit()

    # Step 2 (Tencent Video): open each saved detail page and extract the category tags.
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that have a Tencent Video URL but no categories yet.
        sql = """
            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            tengxun_url = row['tengxun_url']
            print tengxun_url
            # Open the detail page; stop loading if it times out.
            try:
                driver.get(tengxun_url)
            except Exception:
                driver.execute_script('window.stop()')

            # Collect the category tags shown on the page.
            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())

            # Save the categories back to the database.
            sql = """
                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            Mysql.update(sql, conn=conn)
        driver.quit()

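# ---------------------------------------------------------------------------
# The methods above rely on the project-local helper fty_util.common.Mysql,
# which is not shown in this file. The sketch below only illustrates the
# interface this script assumes (createScrapyConn / getAll / update); the
# connection parameters and the MySQLdb-based implementation are assumptions,
# not the real fty_util code.
# ---------------------------------------------------------------------------
class _MysqlSketch(object):
    """Hypothetical stand-in for fty_util.common.Mysql (illustration only)."""

    @staticmethod
    def createScrapyConn():
        # Connection parameters are placeholders.
        import MySQLdb
        return MySQLdb.connect(host='127.0.0.1', user='scrapy', passwd='***',
                               db='scrapy', charset='utf8')

    @staticmethod
    def getAll(sql, param=None, conn=None):
        # Return all rows as dicts, matching the row['id'] access used above.
        import MySQLdb.cursors
        cursor = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        cursor.execute(sql, param)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    @staticmethod
    def update(sql, param=None, conn=None):
        # Execute an UPDATE and commit immediately.
        cursor = conn.cursor()
        cursor.execute(sql, param)
        conn.commit()
        cursor.close()
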
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'no method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_Categories()
    try:
        # Dispatch to the method named on the command line.
        getattr(obj, sys.argv[1])()
    except Exception as e:
        print e
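
# Usage sketch (the file name below is a placeholder for however this module is saved):
#   python dsj_categories.py get_iqiyi_url        # step 1: collect iQIYI detail-page URLs
#   python dsj_categories.py get_iqiyi_detail     # step 2: crawl iQIYI category tags
#   python dsj_categories.py get_tengxun_url      # step 1: collect Tencent Video detail-page URLs
#   python dsj_categories.py get_tengxun_detail   # step 2: crawl Tencent Video category tags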