12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- #/usr/bin/env python
- #coding=utf-8
- import random
- import sys
- import time
- from selenium import webdriver
- from urllib import quote
- from fty_util.common import Mysql
- reload(sys)
- sys.setdefaultencoding('utf8')
def scrapy_tianyancha():
    """Scrape company name/URL pairs from tianyancha.com search pages.

    Walks 32 paginated Hangzhou search-result pages for the URL-encoded
    keyword and inserts each (name, url) row into
    scrapy.scrapy_tianyancha. Network/DB side effects only; returns None.
    """
    conn = Mysql.createOfflineConn()
    # 32 result pages; the key parameter is "文化传媒" percent-encoded.
    keyword = '%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92'
    urls = ['http://hangzhou.tianyancha.com/search/p%d?key=%s' % (page, keyword)
            for page in range(1, 33)]

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    # Parameterized insert; values are bound by the Mysql helper, not interpolated.
    sql = """
    insert into scrapy.scrapy_tianyancha (name, url) values (%s, %s)
    """
    for url in urls:
        try:
            driver.get(url)
        except Exception:
            # Load timed out: report, stop the page, and scrape what rendered.
            print(url)
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass
        time.sleep(10)  # give the Angular view time to render
        try:
            divs = driver.find_elements_by_xpath('//div[@id="ng-view"]/div[2]/div/div/div[1]/div[3]/div')
        except Exception:
            continue
        for div in divs:
            try:
                # Look the anchor up once (original issued the same XPath twice).
                anchor = div.find_element_by_xpath('./div[2]/div/div[1]/a')
                title = anchor.get_attribute('textContent')
                href = anchor.get_attribute('href')
                Mysql.insertOne(sql, value=(title, href), conn=conn)
            except Exception:
                # Best-effort scraping: skip rows whose markup does not match.
                pass
    driver.quit()
def parse_detail():
    """Fill in detail text for rows scraped by scrapy_tianyancha.

    Selects every row whose content1 is empty/NULL, loads its URL in
    Firefox, extracts the company-info text blocks, and writes them back
    as content1/content2. Network/DB side effects only; returns None.
    """
    conn = Mysql.createOfflineConn()
    sql = """
    select id, url from scrapy.scrapy_tianyancha where content1 = '' or content1 is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    def _esc(text):
        # Escape backslashes and single quotes for a MySQL quoted literal.
        # The original interpolated raw page text into the SQL string, so any
        # quote in the scraped content broke the statement (and was an
        # injection vector). NOTE(review): a parameterized query would be
        # safer still, if the Mysql helper's execute() supports bind values.
        return text.replace('\\', '\\\\').replace("'", "\\'")

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        url = row['url']
        try:
            driver.get(url)
        except Exception:
            # Load timed out: report, stop the page, and scrape what rendered.
            print(url)
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass

        time.sleep(5)  # let the page finish rendering
        try:
            content1 = driver.find_element_by_xpath('//div[@class="company_info_text"]').get_attribute('textContent')
            # Concatenate the text of every base-info block (no redundant
            # './' self-lookup as in the original).
            content2 = ''.join(
                item.get_attribute('textContent')
                for item in driver.find_elements_by_xpath('//div[@class="baseinfo-module-item"]'))
        except Exception:
            content1 = ''
            content2 = ''
        update_sql = """
        update scrapy.scrapy_tianyancha set content1 = '%s', content2 = '%s' where id = %s
        """
        update_sql = update_sql % (_esc(content1), _esc(content2), _id)
        Mysql.execute(update_sql, conn=conn)
    driver.quit()
if __name__ == '__main__':
    # Stage 1 (already run, left for reference): scrapy_tianyancha()
    parse_detail()
|