scrapy_pptv.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. #/usr/bin/env python
  2. #coding=utf-8
  3. import random
  4. import sys
  5. import time
  6. from selenium import webdriver
  7. from urllib import quote
  8. from fty_util.common import Mysql
  9. reload(sys)
  10. sys.setdefaultencoding('utf8')
  11. def scrapy_url():
  12. conn = Mysql.createOfflineConn()
  13. sql = """
  14. select id, tv_name from scrapy.wangju_url where url_pptv is null order by id asc
  15. """
  16. rows = Mysql.getAll(sql, conn=conn)
  17. driver = webdriver.Firefox()
  18. driver.set_page_load_timeout(10)
  19. for row in rows:
  20. _id = row['id']
  21. tv_name = row['tv_name']
  22. url = 'http://search.pptv.com/s_video?kw=' + quote(str(tv_name))
  23. need_blank = True
  24. try:
  25. driver.get(url)
  26. except Exception, e:
  27. driver.execute_script('window.stop()')
  28. divs = driver.find_elements_by_xpath('//div[@id="search-result"]/div')
  29. href_list = []
  30. for div in divs:
  31. try:
  32. href = div.find_element_by_xpath('./div[2]/dl/dd/p/a').get_attribute('href')
  33. href_list.append(href)
  34. except Exception, e:
  35. pass
  36. if len(href_list) > 0:
  37. sql = """
  38. update scrapy.wangju_url set url_pptv = '%s' where id = %s
  39. """
  40. sql = sql % (','.join(href_list), _id)
  41. Mysql.execute(sql, conn=conn)
  42. need_blank = False
  43. if need_blank:
  44. sql = """
  45. update scrapy.wangju_url set url_pptv = '%s' where id = %s
  46. """
  47. sql = sql % ('', _id)
  48. Mysql.execute(sql, conn=conn)
  49. driver.quit()
  50. def parse_unique_url():
  51. conn = Mysql.createOfflineConn()
  52. sql = """
  53. select id, tv_name, url_pptv from scrapy.wangju_url where url_pptv is not null and url_pptv != '' and pptv_finished is null order by id asc
  54. """
  55. rows = Mysql.getAll(sql, conn=conn)
  56. driver = webdriver.Firefox()
  57. driver.set_page_load_timeout(10)
  58. for row in rows:
  59. _id = row['id']
  60. tv_name = row['tv_name']
  61. url_pptv = row['url_pptv']
  62. urls = url_pptv.split(',')
  63. for url in urls:
  64. try:
  65. driver.get(url)
  66. except Exception, e:
  67. try:
  68. driver.execute_script('window.stop()')
  69. except:
  70. continue
  71. try:
  72. nav_type = driver.find_element_by_xpath('//div[@class="module module-bread-nav cf"]/p/a').text
  73. if nav_type != u'电视剧':
  74. continue
  75. else:
  76. title = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[1]/h3').text
  77. content = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[2]').get_attribute('textContent')
  78. sql = """
  79. insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
  80. """
  81. value = (_id, tv_name, url, title, content, 'pptv')
  82. Mysql.insertOne(sql, value=value, conn=conn)
  83. except Exception, e:
  84. pass
  85. sql = """
  86. update scrapy.wangju_url set pptv_finished = '%s' where id = %s
  87. """
  88. sql = sql % ('1', _id)
  89. Mysql.execute(sql, conn=conn)
  90. driver.quit()
def scrapy_fenji():
    """Placeholder: per-episode (分集) scraping is not implemented yet."""
    pass
  93. def parse_content():
  94. conn = Mysql.createOfflineConn()
  95. sql = """
  96. select id, tv_name, url, content from scrapy.wangju_all_url where source = 'pptv' order by id asc
  97. """
  98. rows = Mysql.getAll(sql, conn=conn)
  99. for row in rows:
  100. _id = row['id']
  101. tv_name = row['tv_name']
  102. url = row['url']
  103. content = row['content']
  104. import re
  105. m = re.search(ur'评分:\d+(.)\d+', content)
  106. score = '0'
  107. if m is not None:
  108. score = m.group(0)
  109. play = '0'
  110. m = re.search(ur'播放:\d+(.)\d+[(亿)(万)]', content)
  111. if m is not None:
  112. play = m.group(0)
  113. sql = """
  114. update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'pptv'
  115. """
  116. sql = sql % (score, play, url)
  117. Mysql.execute(sql, conn=conn)
if __name__ == '__main__':
    # Pipeline stages; uncomment/run one stage at a time, in order.
    # scrapy_url()
    # parse_unique_url()
    parse_content()