#!/usr/bin/env python
# coding=utf-8
"""Scrape PPTV search results and detail pages for the TV series listed in scrapy.wangju_url (Python 2)."""

import re
import sys

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_url():
    """Search PPTV for each TV series name and store the matching result URLs."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_pptv is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://search.pptv.com/s_video?kw=' + quote(str(tv_name))
        need_blank = True
        try:
            driver.get(url)
        except Exception as e:
            # Page load timed out; stop loading and work with whatever has rendered.
            driver.execute_script('window.stop()')

        # Collect the detail-page links from the search result list.
        divs = driver.find_elements_by_xpath('//div[@id="search-result"]/div')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div[2]/dl/dd/p/a').get_attribute('href')
                href_list.append(href)
            except Exception as e:
                pass

        if len(href_list) > 0:
            # Store all candidate URLs as a comma-separated list.
            sql = """
                update scrapy.wangju_url set url_pptv = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
            need_blank = False
        if need_blank:
            # No results: write an empty string so the row is not picked up again.
            sql = """
                update scrapy.wangju_url set url_pptv = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()


def parse_unique_url():
    """Visit each stored PPTV URL and save title/description for pages in the TV-series category."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_pptv from scrapy.wangju_url
        where url_pptv is not null and url_pptv != '' and pptv_finished is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_pptv = row['url_pptv']
        urls = url_pptv.split(',')
        for url in urls:
            try:
                driver.get(url)
            except Exception as e:
                try:
                    driver.execute_script('window.stop()')
                except Exception:
                    continue
            try:
                # Only keep pages whose breadcrumb category is "电视剧" (TV series).
                nav_type = driver.find_element_by_xpath('//div[@class="module module-bread-nav cf"]/p/a').text
                if nav_type != u'电视剧':
                    continue
                title = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[1]/h3').text
                content = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[2]').get_attribute('textContent')
                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source)
                    values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, title, content, 'pptv')
                Mysql.insertOne(sql, value=value, conn=conn)
            except Exception as e:
                pass

        # Mark this series as processed for PPTV.
        sql = """
            update scrapy.wangju_url set pptv_finished = '%s' where id = %s
        """
        sql = sql % ('1', _id)
        Mysql.execute(sql, conn=conn)
    driver.quit()


def scrapy_fenji():
    """Scrape per-episode (分集) data -- not implemented yet."""
    pass


def parse_content():
    """Extract the rating and play-count figures from the saved page text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'pptv' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']

        # Rating, e.g. "评分:8.5"; the full matched text is stored.
        score = '0'
        m = re.search(ur'评分:\d+\.\d+', content)
        if m is not None:
            score = m.group(0)

        # Play count with a 亿/万 unit, e.g. "播放:1.2亿"; the full matched text is stored.
        play = '0'
        m = re.search(ur'播放:\d+\.\d+[亿万]', content)
        if m is not None:
            play = m.group(0)

        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'pptv'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)


if __name__ == '__main__':
    # scrapy_url()
    # parse_unique_url()
    parse_content()