scrapy_kankan.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. #/usr/bin/env python
  2. #coding=utf-8
  3. import random
  4. import sys
  5. import time
  6. from selenium import webdriver
  7. from urllib import quote
  8. from fty_util.common import Mysql
  9. reload(sys)
  10. sys.setdefaultencoding('utf8')
  11. conn = Mysql.createOfflineConn()
  12. sql = """
  13. select id, tv_name from scrapy.wangju_url where url_kankan is null order by id asc
  14. """
  15. rows = Mysql.getAll(sql, conn=conn)
  16. driver = webdriver.Firefox()
  17. driver.set_page_load_timeout(10)
  18. for row in rows:
  19. _id = row['id']
  20. tv_name = row['tv_name']
  21. url = 'http://search.kankan.com/search.php?keyword=' + quote(str(tv_name))
  22. need_blank = True
  23. try:
  24. driver.get(url)
  25. except Exception, e:
  26. driver.execute_script('window.stop()')
  27. # 解析第一页
  28. divs = driver.find_elements_by_xpath('//div[@class="searchmain"]/div')
  29. for div in divs:
  30. try:
  31. title = div.find_element_by_xpath('//div[@class="reuslt_tt"]/h2/a').get_attribute('title')
  32. href = div.find_element_by_xpath('./div/a').get_attribute('href')
  33. _type = div.find_element_by_xpath('./div/div[2]').get_attribute('textContent')
  34. sources = div.find_element_by_xpath('//ul[@class="sitelist"]').get_attribute('textContent')
  35. if tv_name == title and u'电视剧' in _type and u'响巢看看' in sources:
  36. sql = """
  37. update scrapy.wangju_url set url_kankan = '%s' where id = %s
  38. """
  39. sql = sql % (href, _id)
  40. Mysql.execute(sql, conn=conn)
  41. need_blank = False
  42. except Exception, e:
  43. continue
  44. if need_blank:
  45. sql = """
  46. update scrapy.wangju_url set url_kankan = '%s' where id = %s
  47. """
  48. sql = sql % ('', _id)
  49. Mysql.execute(sql, conn=conn)
  50. driver.quit()