#!/usr/bin/env python
# coding=utf-8
"""Scrape wasu.cn (huashu TV) for show URLs and play-page hit counters.

Two passes over the ``scrapy.wangju_url`` table:

* ``scrapy_url``  -- search wasu.cn by show name and store the result links.
* ``scrapy_data`` -- open each stored link, follow the first episode and
  record the play-page hit-counter text into ``scrapy.wangju_all_url``.

Python 2 script (uses ``urllib.quote`` / ``sys.setdefaultencoding``).
"""
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

# Python 2 hack: make implicit str/unicode coercions use UTF-8, since show
# names contain non-ASCII characters that flow into URLs and SQL strings.
reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_url():
    """Search wasu.cn for every show lacking a huashutv URL and persist it.

    For each row of ``scrapy.wangju_url`` where ``url_huashutv`` is NULL,
    queries the wasu.cn search page and stores the comma-joined result
    anchors back into the row. An empty string is stored when the search
    yields nothing, so the row is not re-selected on the next run.
    """
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_huashutv is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    try:
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = 'http://www.wasu.cn/Search/show/k/' + quote(str(tv_name))
            try:
                driver.get(url)
            except Exception:
                # Page-load timeout: stop loading and scrape whatever arrived.
                driver.execute_script('window.stop()')

            href_list = []
            for div in driver.find_elements_by_xpath('//div[@id="agg_list"]/div'):
                try:
                    anchor = div.find_element_by_xpath('./div[1]/a[1]')
                    href_list.append(anchor.get_attribute('href'))
                except Exception:
                    # Result card without the expected anchor; skip it.
                    pass

            # ','.join([]) == '' -- a single UPDATE covers both the found and
            # not-found cases with the exact values the two original branches
            # wrote.
            # NOTE(review): hrefs are interpolated straight into SQL; this is
            # only safe while they contain no quote characters. Prefer a
            # parameterized Mysql.execute if the helper supports it -- confirm.
            sql = """
                update scrapy.wangju_url set url_huashutv = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
    finally:
        # Always release the browser, even if a row blows up mid-loop.
        driver.quit()


def scrapy_data():
    """Harvest the play-page hit counter for every stored huashutv URL.

    For each row with a non-empty ``url_huashutv`` (comma-separated list),
    visits each wasu.cn link, follows the first episode anchor and inserts
    the ``play_vod_hits`` element text into ``scrapy.wangju_all_url`` with
    source tag ``huashutv``.
    """
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_huashutv from scrapy.wangju_url where url_huashutv is not null and url_huashutv != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    try:
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            for url in row['url_huashutv'].split(','):
                # Only wasu.cn links are scrapeable here.
                if 'www.wasu.cn' not in url:
                    continue
                try:
                    driver.get(url)
                except Exception:
                    driver.execute_script('window.stop()')

                # First episode link in the episode list, if present.
                try:
                    href = driver.find_element_by_xpath(
                        '//div[@id="con_telelist_1"]/ul/li[1]/a').get_attribute('href')
                except Exception:
                    href = None
                if href is None or 'www.wasu.cn' not in href:
                    continue

                print(href)
                try:
                    driver.get(href)
                except Exception:
                    driver.execute_script('window.stop()')

                # Hit-counter text; if the element is missing there is nothing
                # to store for this url.
                try:
                    content = driver.find_element_by_xpath(
                        '//div[@id="play_vod_hits"]').get_attribute('textContent')
                except Exception:
                    continue

                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, '', content, 'huashutv')
                Mysql.insertOne(sql, value=value, conn=conn)
    finally:
        # Always release the browser, even if a row blows up mid-loop.
        driver.quit()


if __name__ == '__main__':
    scrapy_data()
    # scrapy_url()