#!/usr/bin/env python
# coding=utf-8
"""Fill in ``scrapy.wangju_url.url_kankan`` from kankan.com search results.

For every row of ``scrapy.wangju_url`` whose ``url_kankan`` is NULL, the
row's ``tv_name`` is searched on search.kankan.com through a Firefox
WebDriver.  A result whose title equals ``tv_name``, whose type text contains
the "TV series" label and whose source list contains the kankan label has its
link stored in ``url_kankan``.  When no result qualifies, an empty string is
stored instead, so the row is no longer selected by the NULL filter.

This is a Python 2 script (``reload``, ``sys.setdefaultencoding`` and
``urllib.quote`` do not exist in this form on Python 3).
"""
import random
import sys
import time
from urllib import quote

from selenium import webdriver

from fty_util.common import Mysql

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.  The
# script relies on this for ``str(tv_name)`` and for comparing the database
# value with the unicode text returned by WebDriver.
reload(sys)
sys.setdefaultencoding('utf8')

SEARCH_URL = 'http://search.kankan.com/search.php?keyword='

SELECT_PENDING_SQL = """
select id, tv_name from scrapy.wangju_url where url_kankan is null order by id asc
"""

UPDATE_URL_SQL = """
update scrapy.wangju_url set url_kankan = '%s' where id = %s
"""


def _save_url(conn, row_id, url_kankan):
    """Write *url_kankan* (may be '') into the row with id *row_id*."""
    # NOTE(review): the value is interpolated straight into the SQL text and
    # may come from a scraped page.  If Mysql.execute supports bound
    # parameters, switch to them -- its signature is not visible from here.
    Mysql.execute(UPDATE_URL_SQL % (url_kankan, row_id), conn=conn)


def _is_match(div, tv_name):
    """Return ``(matched, href)`` for one search-result element.

    All XPaths are relative to *div* (leading ``.``).  An XPath that starts
    with ``//`` is evaluated against the whole document even when called on
    an element, which would make every result report the first title and
    site list on the page.
    """
    # NOTE(review): 'reuslt_tt' (sic) is kept exactly as in the original;
    # presumably it mirrors the site's own class name -- confirm.
    title = div.find_element_by_xpath(
        './/div[@class="reuslt_tt"]/h2/a').get_attribute('title')
    href = div.find_element_by_xpath('./div/a').get_attribute('href')
    _type = div.find_element_by_xpath(
        './div/div[2]').get_attribute('textContent')
    sources = div.find_element_by_xpath(
        './/ul[@class="sitelist"]').get_attribute('textContent')
    # u'电视剧' is the "TV series" type label; u'响巢看看' is the source
    # label looked for in the site list.
    matched = (tv_name == title
               and u'电视剧' in _type
               and u'响巢看看' in sources)
    return matched, href


conn = Mysql.createOfflineConn()
rows = Mysql.getAll(SELECT_PENDING_SQL, conn=conn)

driver = webdriver.Firefox()
try:
    driver.set_page_load_timeout(10)
    # ``or ()``: tolerate a falsy "no rows" result from the helper
    # (its return value for an empty result set is not visible from here).
    for row in rows or ():
        _id = row['id']
        tv_name = row['tv_name']
        url = SEARCH_URL + quote(str(tv_name))
        need_blank = True
        try:
            driver.get(url)
        except Exception:
            # Best effort: if loading fails or exceeds the page-load timeout,
            # stop the page and parse whatever has been loaded so far.
            driver.execute_script('window.stop()')

        # Parse the first page of results.
        divs = driver.find_elements_by_xpath('//div[@class="searchmain"]/div')
        for div in divs:
            try:
                matched, href = _is_match(div, tv_name)
            except Exception:
                # A child of "searchmain" that is not a complete result entry
                # (missing element, unexpected attribute value): skip it.
                continue
            # The update is outside the try so a database failure is not
            # silently swallowed together with DOM lookup errors.
            if matched:
                _save_url(conn, _id, href)
                need_blank = False

        if need_blank:
            # Nothing matched: store '' so the row is not picked up again.
            _save_url(conn, _id, '')
finally:
    # Always shut the browser down, even when the loop raises.
    driver.quit()