scrapy_leshi.py

#!/usr/bin/env python
#coding=utf-8
import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
LeTV (乐视, le.com) video scraping rules.
"""

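# Pipeline (stages are run one at a time from the __main__ block at the bottom):
#   1. scrapy_url()                 - search so.le.com for each tv_name and store the result links in wangju_url.url_leshi
#   2. scrapy_data()                - open each stored link and save the detail page's Info text into wangju_all_url
#   3. parse_wangju_all_url_title() - fill in the title column for the saved pages
#   4. parse_content()              - extract score and play count from the saved Info text
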
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception as e:
            # page-load timeout: stop loading and work with whatever has rendered
            driver.execute_script('window.stop()')

        # collect the detail-page links from the search results
        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
                href_list.append(href)
            except Exception as e:
                pass

        need_blank = True
        if len(href_list) > 0:
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
            need_blank = False
        if need_blank:
            # no result found: write an empty string so the row is not picked up again
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()

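# Optional helper (not used above): a minimal sketch of waiting explicitly for the
# search-result container instead of relying on the page-load timeout plus
# window.stop(). The XPath matches the one used in scrapy_url(); the 10-second
# timeout is an assumption, not a value taken from the original flow.
def _wait_for_search_results(driver, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@class="So-detail Tv-so"]')))
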
def scrapy_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_leshi = row['url_leshi']
        urls = url_leshi.split(',')
        for url in urls:
            # only follow links on the main le.com site
            if 'www.le.com' not in url:
                continue
            try:
                driver.get(url)
            except Exception as e:
                driver.execute_script('window.stop()')

            # link from the episode page to the album (series) page
            try:
                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
            except Exception as e:
                href = None

            if href is not None and 'www.le.com' in href:
                print href
                try:
                    driver.get(href)
                except Exception as e:
                    driver.execute_script('window.stop()')
                # raw text of the Info box; score and play count are parsed out of it later
                try:
                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
                except Exception as e:
                    continue
                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, '', content, 'leshi')
                Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()

def parse_wangju_all_url_title():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print e
            driver.execute_script('window.stop()')
        try:
            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
        except Exception as e:
            title = ''
        sql = """
            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
        """
        sql = sql % (title, 'leshi', url)
        Mysql.execute(sql, conn=conn)
    driver.quit()

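# The title is interpolated straight into the update statement above, so a title
# containing a single quote would break the SQL. A minimal escaping sketch
# (assuming the Mysql helper offers no parameter-binding variant of execute()):
def _escape_sql_value(value):
    # escape backslashes first, then single quotes
    return value.replace("\\", "\\\\").replace("'", "\\'")
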
def parse_content():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']

        # the first number in the Info text is taken as the score
        m = re.search(ur'([0-9]+[.]?)+', content)
        score = '0'
        if m is not None:
            score = m.group(0)

        # play count, e.g. "播放数:3.2亿" or "播放数:5600万"
        play = '0'
        m = re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content)
        if m is not None:
            play = m.group(0)

        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)

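# Example of what the two patterns in parse_content() are expected to pull out of
# the Info text. The sample string below is an assumption about the page format,
# not text captured from le.com:
#   content = u'7.9 播放数:3.2亿 ...'
#   re.search(ur'([0-9]+[.]?)+', content).group(0)                   -> u'7.9'
#   re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content).group(0)   -> u'播放数:3.2亿'
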
if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    # parse_wangju_all_url_title()
    parse_content()