#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time
import collections

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_website():
    conn = Mysql.createOfflineConn()

    # Load the WeChat subscription accounts (account id -> name) into an ordered dict
    websites_dict = collections.OrderedDict()
    sql = """
        select name, account from odl.basic_weixin_subscribe where is_delete != 1 order by id asc
    """
    websites = Mysql.getAll(sql, conn=conn)
    for website in websites:
        name = website['name']
        account = website['account']
        websites_dict[account] = name

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    start_url = 'http://weixin.sogou.com/'

    sql = """
        select tv_id, tv_name from odl.ad_tv_lib where tv_id order by id asc
    """
    tvs = Mysql.getAll(sql, conn=conn)
    for tv in tvs:
        tv_id = tv['tv_id']
        tv_name = tv['tv_name']
        try:
            driver.get(start_url)
        except Exception as e:
            # A page-load timeout is not fatal; continue with whatever has loaded
            pass

        try:
            input_box = driver.find_element_by_id('upquery')
            submit_button = driver.find_element_by_class_name('swz')
        except Exception as e:
            # Search box not found: refresh the page and retry once
            driver.refresh()
            try:
                input_box = driver.find_element_by_id('upquery')
                submit_button = driver.find_element_by_class_name('swz')
            except Exception as e:
                continue

        # Enter the search keyword (TV show name) and submit
        try:
            input_box.clear()
            input_box.send_keys(tv_name)
            submit_button.click()
        except Exception as e:
            print 'failed to submit search request'

        for account in websites_dict:
            name = websites_dict.get(account)
            input_box = None
            submit_button = None

            time.sleep(5)
            # Open the advanced search panel and restrict results to this WeChat account
            js = 'document.getElementsByClassName("time-box float")[2].style.display="block"'
            driver.execute_script(js)
            js = 'document.getElementsByClassName("s-sea")[0].value = "' + account + '"'
            driver.execute_script(js)
            js = 'document.getElementById("search_enter").click()'
            driver.execute_script(js)

            # s_sea = driver.find_element_by_class_name('s-sea')
            # search_enter = driver.find_element_by_id('search_enter')
            # s_sea.clear()
            # s_sea.send_keys(account)
            # search_enter.click()

            time.sleep(10)
            driver.execute_script('window.stop()')
            # driver.refresh()

            # Pagination block
            page = None
            try:
                page = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
            except:
                pass

            count = 0
            # No pagination: there are at most ten results (possibly none)
            if page is None or len(page) == 0:
                try:
                    divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
                    if divs is not None and len(divs) > 0:
                        count = len(divs)
                except Exception as e:
                    count = 0
            # Pagination exists: jump to the last page and count the results on it
            else:
                try:
                    page_nums = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
                    max_page_num = 1
                    max_page_href = ''
                    for page_num in page_nums:
                        href = page_num.get_attribute('href')
                        page_num_text = page_num.get_attribute('textContent')
                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
                        # A plain number is a page link; keep track of the largest one
                        if page_num_text.isdigit():
                            page_num_text = int(page_num_text)
                            if page_num_text > max_page_num:
                                max_page_num = page_num_text
                                max_page_href = href
                        # Stop at the "下一页" (next page) link
                        elif page_num_text == '下一页':
                            break

                    try:
                        driver.get(max_page_href)
                    except Exception as e:
                        pass
                    try:
                        divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
                        if divs is not None and len(divs) > 0:
                            count = len(divs)
                    except Exception as e:
                        count = 0
                    # Every full page before the last one holds ten results
                    count = (max_page_num - 1) * 10 + count
                except:
                    continue

            if count != 0:
                sql = """
                    insert into scrapy.scrapy_subscribe_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count)
                    values (%s, %s, %s, %s, %s, %s)
                """
                value = (tv_id, tv_name, 2, name, '', count)
                Mysql.insertOne(sql, value=value, conn=conn)

    driver.quit()


if __name__ == '__main__':
    scrapy_website()