scrapy_tianyancha.py 2.8 KB

#!/usr/bin/env python
# coding=utf-8
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

# Python 2: force UTF-8 as the process-wide default encoding so the
# Chinese page text survives the round trip into MySQL.
reload(sys)
sys.setdefaultencoding('utf8')
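
# Stage 1: walk the paginated search results on hangzhou.tianyancha.com
# and store each company's name and detail-page URL.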
def scrapy_tianyancha():
    conn = Mysql.createOfflineConn()
    # 32 result pages; the key parameter is the percent-encoding of
    # u'文化传媒' ("culture media").
    urls = []
    for i in range(1, 33):
        urls.append('http://hangzhou.tianyancha.com/search/p' + str(i) + '?key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92')
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for url in urls:
        try:
            driver.get(url)
        except Exception:
            # Page-load timeout: log the URL and abort the pending load.
            print url
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass
        # Give the Angular front end (the ng-view container) time to render.
        time.sleep(10)
        try:
            divs = driver.find_elements_by_xpath('//div[@id="ng-view"]/div[2]/div/div/div[1]/div[3]/div')
        except Exception:
            continue
        for div in divs:
            try:
                link = div.find_element_by_xpath('./div[2]/div/div[1]/a')
                title = link.get_attribute('textContent')
                href = link.get_attribute('href')
                sql = """
                    insert into scrapy.scrapy_tianyancha (name, url) values (%s, %s)
                """
                value = (title, href)
                Mysql.insertOne(sql, value=value, conn=conn)
            except Exception:
                pass
    driver.quit()
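
# Stage 2: visit every stored detail URL and fill in the company
# description (content1) and the base-info blocks (content2).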
def parse_detail():
    conn = Mysql.createOfflineConn()
    # Re-crawl only the rows whose detail text is still missing.
    sql = """
        select id, url from scrapy.scrapy_tianyancha where content1 = '' or content1 is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        url = row['url']
        try:
            driver.get(url)
        except Exception:
            print url
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass
        time.sleep(5)
        try:
            # Company introduction plus the concatenated base-info blocks.
            content1 = driver.find_element_by_xpath('//div[@class="company_info_text"]').get_attribute('textContent')
            content2_list = driver.find_elements_by_xpath('//div[@class="baseinfo-module-item"]')
            content2 = ''
            for content in content2_list:
                content2 += content.get_attribute('textContent')
        except Exception:
            content1 = ''
            content2 = ''
        # The helper is called without bind parameters here, so escape
        # backslashes and quotes before interpolating the scraped text.
        content1 = content1.replace('\\', '\\\\').replace("'", "\\'")
        content2 = content2.replace('\\', '\\\\').replace("'", "\\'")
        sql = """
            update scrapy.scrapy_tianyancha set content1 = '%s', content2 = '%s' where id = %s
        """
        sql = sql % (content1, content2, _id)
        Mysql.execute(sql, conn=conn)
    driver.quit()
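
# Run the stages one at a time: stage 1 populates the name/url rows,
# stage 2 fills in the details; stage 1 stays commented out once it has run.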
if __name__ == '__main__':
    # scrapy_tianyancha()
    parse_detail()
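
# The backing table isn't defined in this file. A minimal sketch of a
# schema the statements above would work against (column types are an
# assumption inferred from how the columns are used):
#
#   create table scrapy.scrapy_tianyancha (
#       id       bigint auto_increment primary key,
#       name     varchar(255),
#       url      varchar(512),
#       content1 text,
#       content2 text
#   ) default charset utf8;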