#!/usr/bin/env python
# coding=utf-8
"""iQIYI TV series category crawler.

The crawl is done in two steps:
1. Crawl the list/search result pages and save the URLs of the matching TV series.
2. Visit each saved detail-page URL and crawl its category/detail information.

The get_tengxun_* methods do the same two steps for Tencent Video (v.qq.com).
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


class DSJ_All(object):

    # Step 1 (iqiyi): collect TV series page URLs from the list pages
    def get_iqiyi_url(self):
        # List pages to crawl (other year ranges are commented out)
        start_urls = [
            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
        ]

        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                # Grab the last pager link ("next page"); retry until the pager is present
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
                except:
                    continue

                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
                sql_insert = """
                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'not found'
                        continue
                    time.sleep(random.uniform(0, 2))
                Mysql.insertMany(sql_insert, data_list, conn)

                # Follow the pager until there is no "next page" link
                try:
                    next_page_text = next_page.find_element_by_xpath('.').text
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()

    # Step 2 (iqiyi): crawl each saved detail page
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Resume from the largest id already stored in the detail table
        sql = """
            select max(id) from scrapy.iqiyi_dianshiju_detail
        """
        max_id = Mysql.getOne(sql, conn=conn)
        max_id = max_id[0]
        if max_id is None:
            max_id = 0

        # Fetch the URLs that still need to be crawled
        sql = """
            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
        """
        sql = sql % (max_id)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            print url
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
            # Detail section HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail section plain text
            detail_info_text = detail_info.find_element_by_xpath('.').text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('h1/a').text

            # Save to the database
            sql = """
                insert into scrapy.iqiyi_dianshiju_detail
                    (id, tv_name, detail_info_text, detail_info_html, url)
                values (%s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, detail_info_text, detail_info_html, url)
            Mysql.insertOne(sql, value=value, conn=conn)
            time.sleep(random.uniform(1, 5))
        driver.quit()
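    # A minimal sketch of the two tables the iqiyi steps above write to. Column
    # names come from the INSERT/SELECT statements in this file; the types and
    # keys are an assumption, not the original DDL:
    #
    #   CREATE TABLE scrapy.iqiyi_dianshiju_url (
    #       id  INT PRIMARY KEY AUTO_INCREMENT,
    #       url VARCHAR(512)
    #   );
    #
    #   CREATE TABLE scrapy.iqiyi_dianshiju_detail (
    #       id               INT PRIMARY KEY,
    #       tv_name          VARCHAR(255),
    #       detail_info_text TEXT,
    #       detail_info_html MEDIUMTEXT,
    #       url              VARCHAR(512)
    #   );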
    # Step 1 (Tencent Video): collect TV series page URLs from the list pages
    def get_tengxun_url(self):
        # List pages to crawl (other year ranges are commented out)
        start_urls = [
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
        ]

        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
                print lis
                sql_insert = """
                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'not found'
                        continue
                    time.sleep(1)
                Mysql.insertMany(sql_insert, data_list, conn)

                # Follow the pager until there is no "next page" link
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
                except:
                    is_next = False
                    continue
                try:
                    next_page_text = next_page.find_element_by_xpath('.').text
                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
                    # A 'javascript:;' href means the last page has been reached
                    if next_page_url == 'javascript:;':
                        is_next = False
                        continue
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()

    # Resolve the detail-page URL for each saved Tencent list URL
    def get_tengxun_detail_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Fetch the rows whose detail_url has not been resolved yet
        sql = """
            select id, url from scrapy.tengxun_dianshiju_url
            where detail_url is null or detail_url = '' order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            # Open the page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            # If the page already redirected to a detail URL, store it directly
            if re.match(r'(.*)detail(.*)', driver.current_url):
                print driver.current_url
                sql = """
                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                """
                sql = sql % (driver.current_url, _id)
                Mysql.update(sql, conn=conn)
                continue

            # Otherwise look for an album link that points to a detail page
            try:
                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
                print a_list
                for a in a_list:
                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
                    if re.match(r'(.*)detail(.*)', detail_href):
                        print detail_href
                        sql = """
                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                        """
                        sql = sql % (detail_href, _id)
                        Mysql.update(sql, conn=conn)
                        break
            except Exception as e:
                print e
            time.sleep(random.uniform(0, 3))
        driver.quit()

    # Step 2 (Tencent Video): crawl each resolved detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Fetch all TV series that need to be crawled
        sql = """
            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            url = row['url']
            detail_url = row['detail_url']
            try:
                driver.get(detail_url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
            # Detail section HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail section plain text
            detail_info_text = detail_info.find_element_by_xpath('.').text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text

            # Save to the database with a parameterized insert (as in get_iqiyi_detail)
            # so quotes in the scraped text/HTML cannot break the statement
            sql = """
                insert into scrapy.tengxun_dianshiju_detail
                    (tv_name, detail_info_text, detail_info_html, cover_url, detail_url)
                values (%s, %s, %s, %s, %s)
            """
            value = (tv_name, detail_info_text, detail_info_html, url, detail_url)
            Mysql.insertOne(sql, value=value, conn=conn)
        driver.quit()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'No method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_All()
    try:
        # Dispatch to the DSJ_All method named on the command line
        getattr(obj, sys.argv[1])()
    except Exception as e:
        print e
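# Usage sketch (the script file name below is hypothetical; pass one of the
# DSJ_All method names as the single command-line argument):
#
#   python dsj_all.py get_iqiyi_url            # step 1: collect iqiyi list-page links
#   python dsj_all.py get_iqiyi_detail         # step 2: crawl the saved iqiyi detail pages
#   python dsj_all.py get_tengxun_url          # step 1 for v.qq.com
#   python dsj_all.py get_tengxun_detail_url   # resolve v.qq.com detail-page URLs
#   python dsj_all.py get_tengxun_detail       # step 2 for v.qq.com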