scrapy_tv_unhandle.py

#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
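# The reload()/sys.setdefaultencoding() pair above is a Python 2 hack that
# makes UTF-8 the interpreter's default codec, so the implicit str/unicode
# conversions of the Chinese series names below do not raise
# UnicodeDecodeError.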
  11. """
  12. 从爱奇艺中爬取百度百科没有爬到的内容
  13. """
# Crawl the search result pages.
def scrapy_url():
    conn = Mysql.createOfflineConn()
    # Checkpoint: resume after the highest tv_id already scraped.
    sql = """
        select max(tv_id) as tv_id from scrapy.iqiyi_dianshiju_detail
    """
    max_id = Mysql.getOne(sql, conn=conn)
    # MAX() returns NULL on an empty table, so guard against None as well as 0.
    if max_id is None or not max_id[0]:
        max_tv_id = 0
    else:
        max_tv_id = max_id[0]
    sql = """
        select id, name from tv_lib.yxb_tv_series where id > %s and status = 12 order by id asc
    """
    sql = sql % (max_tv_id,)
    rows = Mysql.getAll(sql, conn=conn)

    # Two headless PhantomJS instances: one for search pages, one for detail pages.
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.PhantomJS()
    driver2.set_page_load_timeout(10)
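    # set_page_load_timeout() makes driver.get() raise TimeoutException after
    # 10 s; the handlers below absorb that and parse whatever has rendered.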
    for row in rows:
        _id = row['id']
        name = row['name']
        url = 'http://so.iqiyi.com/so/q_' + quote(str(name))
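        # Hedged addition: random and time are imported but never used, which
        # suggests a crawl delay was intended; sleep 1-3 s per series so the
        # scraper does not hammer so.iqiyi.com. Remove if throughput matters.
        time.sleep(random.uniform(1, 3))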
        try:
            driver.get(url)
        except Exception, e:
            # Load timed out: stop the page and parse what rendered so far.
            driver.execute_script('window.stop()')
        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
        for li in lis:
            try:
                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                # Only iqiyi.com/lib pages carry the structured detail block.
                if 'www.iqiyi.com/lib' in href:
                    print href
                    try:
                        driver2.get(href)
                    except:
                        pass
                    content = driver2.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
                    if content is None:
                        content = ''
                    desc = driver2.find_element_by_xpath('//div[@class="mod-body introduce-info"]').get_attribute('textContent')
                    if desc is None:
                        desc = ''
                    content = content + '\n' + '概述:' + desc  # '概述' means "Summary"
                    sql = """
                        insert into scrapy.iqiyi_dianshiju_detail (tv_id, tv_name, title, detail_info_text, url) values (%s, %s, %s, %s, %s)
                    """
                    value = (_id, name, title, content, href)
                    Mysql.insertOne(sql, value=value, conn=conn)
            except Exception, e:
                # Skip any result whose markup does not match and move on.
                print e
                continue
    driver.quit()
    driver2.quit()


if __name__ == '__main__':
    scrapy_url()
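
# Usage note: run directly with Python 2 (PhantomJS must be on PATH and
# fty_util must reach both the scrapy and tv_lib schemas):
#   python scrapy_tv_unhandle.py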