#!/usr/bin/env python
# coding=utf-8
"""Fill in scrapy.wangju_url.url_kankan from search.kankan.com results.

For every row whose url_kankan is NULL, the TV name is searched on
search.kankan.com with a Firefox WebDriver. The link of the first result
that has exactly the same title, is labelled as a TV series and lists
Kankan itself as a source is stored. When nothing matches, an empty
string is stored so the row is not selected again on the next run.

This is a Python 2 script (reload(sys), urllib.quote).
"""
import random
import sys
import time
from urllib import quote

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

from fty_util.common import Mysql

# Python 2 only: make implicit str <-> unicode conversions use UTF-8 so that
# str(tv_name) and the tv_name == title comparison work for Chinese names.
reload(sys)
sys.setdefaultencoding('utf8')

SEARCH_URL = 'http://search.kankan.com/search.php?keyword='
PAGE_LOAD_TIMEOUT = 10  # passed to driver.set_page_load_timeout (seconds)

SELECT_PENDING_SQL = """
    select id, tv_name from scrapy.wangju_url
    where url_kankan is null order by id asc
"""

UPDATE_URL_SQL = """
    update scrapy.wangju_url set url_kankan = '%s' where id = %s
"""


def _escape_sql_string(value):
    """Escape backslashes and single quotes for a quoted SQL string literal.

    NOTE(review): the scraped href is untrusted input. Parameter binding
    would be preferable, but the signature of Mysql.execute is not visible
    from this file. Assumes the server uses MySQL's default backslash-escape
    mode (NO_BACKSLASH_ESCAPES not set) -- confirm.
    """
    return value.replace('\\', '\\\\').replace("'", "''")


def _load_search_page(driver, tv_name):
    """Open the Kankan search page for tv_name, tolerating a failed load."""
    url = SEARCH_URL + quote(str(tv_name))
    try:
        driver.get(url)
    except Exception:
        # Best effort, as before: when the load times out or fails, stop
        # loading and parse whatever has been rendered so far.
        driver.execute_script('window.stop()')


def _find_kankan_url(driver, tv_name):
    """Return the href of the first matching search result, or None.

    Only the first result page is parsed. A result matches when its title
    equals tv_name, its type text contains u'电视剧' and its source list
    contains u'响巢看看'.
    """
    results = driver.find_elements_by_xpath('//div[@class="searchmain"]/div')
    for result in results:
        try:
            # The leading '.' keeps each lookup inside the current result;
            # a bare '//' would search the whole document and return the
            # first result's title/sources on every iteration.
            title = result.find_element_by_xpath(
                './/div[@class="reuslt_tt"]/h2/a').get_attribute('title')
            href = result.find_element_by_xpath(
                './div/a').get_attribute('href')
            kind = result.find_element_by_xpath(
                './div/div[2]').get_attribute('textContent')
            sources = result.find_element_by_xpath(
                './/ul[@class="sitelist"]').get_attribute('textContent')
        except WebDriverException:
            # This result lacks one of the expected elements (or went
            # stale); skip it and try the next one.
            continue

        # get_attribute may return None; treat that as "no text".
        if (href and tv_name == title
                and u'电视剧' in (kind or u'')
                and u'响巢看看' in (sources or u'')):
            return href
    return None


def _save_url(conn, row_id, url):
    """Store url (possibly the empty string) in url_kankan for row_id."""
    sql = UPDATE_URL_SQL % (_escape_sql_string(url), row_id)
    Mysql.execute(sql, conn=conn)


conn = Mysql.createOfflineConn()
rows = Mysql.getAll(SELECT_PENDING_SQL, conn=conn)

driver = webdriver.Firefox()
try:
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    # "rows or ()" guards against the helper returning a falsy value when
    # there is nothing to process.
    for row in rows or ():
        _id = row['id']
        tv_name = row['tv_name']

        _load_search_page(driver, tv_name)
        href = _find_kankan_url(driver, tv_name)

        # One write per row: the matched URL, or '' to mark the row as
        # checked with no match.
        _save_url(conn, _id, href or '')
finally:
    # Always close the browser, even when scraping or a DB call fails.
    driver.quit()