scrapy_tianyancha.py 2.8 KB

#!/usr/bin/env python
# coding=utf-8
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

# Python 2: force UTF-8 as the process-wide default encoding so the
# Chinese page text survives the round trip into MySQL.
reload(sys)
sys.setdefaultencoding('utf8')
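
# Stage 1: walk the paginated search results on hangzhou.tianyancha.com
# and store each company's name and detail-page URL.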
def scrapy_tianyancha():
    conn = Mysql.createOfflineConn()
    # 32 result pages; the key parameter is the percent-encoding of
    # u'文化传媒' ("culture media").
    urls = []
    for i in range(1, 33):
        urls.append('http://hangzhou.tianyancha.com/search/p' + str(i) + '?key=%E6%96%87%E5%8C%96%E4%BC%A0%E5%AA%92')
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for url in urls:
        try:
            driver.get(url)
        except Exception:
            # Page-load timeout: log the URL and abort the pending load.
            print url
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass
        # Give the Angular front end (the ng-view container) time to render.
        time.sleep(10)
        try:
            divs = driver.find_elements_by_xpath('//div[@id="ng-view"]/div[2]/div/div/div[1]/div[3]/div')
        except Exception:
            continue
        for div in divs:
            try:
                link = div.find_element_by_xpath('./div[2]/div/div[1]/a')
                title = link.get_attribute('textContent')
                href = link.get_attribute('href')
                sql = """
                    insert into scrapy.scrapy_tianyancha (name, url) values (%s, %s)
                """
                value = (title, href)
                Mysql.insertOne(sql, value=value, conn=conn)
            except Exception:
                pass
    driver.quit()
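
# Stage 2: visit every stored detail URL and fill in the company
# description (content1) and the base-info blocks (content2).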
def parse_detail():
    conn = Mysql.createOfflineConn()
    # Re-crawl only the rows whose detail text is still missing.
    sql = """
        select id, url from scrapy.scrapy_tianyancha where content1 = '' or content1 is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        url = row['url']
        try:
            driver.get(url)
        except Exception:
            print url
            try:
                driver.execute_script('window.stop()')
            except Exception:
                pass
        time.sleep(5)
        try:
            # Company introduction plus the concatenated base-info blocks.
            content1 = driver.find_element_by_xpath('//div[@class="company_info_text"]').get_attribute('textContent')
            content2_list = driver.find_elements_by_xpath('//div[@class="baseinfo-module-item"]')
            content2 = ''
            for content in content2_list:
                content2 += content.get_attribute('textContent')
        except Exception:
            content1 = ''
            content2 = ''
        # The helper is called without bind parameters here, so escape
        # backslashes and quotes before interpolating the scraped text.
        content1 = content1.replace('\\', '\\\\').replace("'", "\\'")
        content2 = content2.replace('\\', '\\\\').replace("'", "\\'")
        sql = """
            update scrapy.scrapy_tianyancha set content1 = '%s', content2 = '%s' where id = %s
        """
        sql = sql % (content1, content2, _id)
        Mysql.execute(sql, conn=conn)
    driver.quit()
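
# Run the stages one at a time: stage 1 populates the name/url rows,
# stage 2 fills in the details; stage 1 stays commented out once it has run.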
if __name__ == '__main__':
    # scrapy_tianyancha()
    parse_detail()
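
# The backing table isn't defined in this file. A minimal sketch of a
# schema the statements above would work against (column types are an
# assumption inferred from how the columns are used):
#
#   create table scrapy.scrapy_tianyancha (
#       id       bigint auto_increment primary key,
#       name     varchar(255),
#       url      varchar(512),
#       content1 text,
#       content2 text
#   ) default charset utf8;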