#!/usr/bin/env python
# coding=utf-8

import re
import sys

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
Scraping rules for 乐视 (Le.com) video.
"""

def scrapy_url():
    """Search Le.com for each TV name and store the detail-page URLs found."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception:
            # Page load timed out: stop loading and parse whatever rendered.
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
                href_list.append(href)
            except Exception:
                pass
        if len(href_list) > 0:
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
        else:
            # No search result: store an empty string so the row is not retried.
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()

def scrapy_data():
    """Visit each stored detail URL and save the show's introduction text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_leshi = row['url_leshi']
        urls = url_leshi.split(',')
        for url in urls:
            if 'www.le.com' not in url:
                continue
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            try:
                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
            except Exception:
                href = None
            if href is not None and 'www.le.com' in href:
                print href
                try:
                    driver.get(href)
                except Exception:
                    driver.execute_script('window.stop()')
                try:
                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
                except Exception:
                    continue
                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, '', content, 'leshi')
                Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()

def parse_wangju_all_url_title():
    """Backfill the title column for rows scraped from Le.com."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print e
            driver.execute_script('window.stop()')
        try:
            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
        except Exception:
            title = ''
        # Note: title is interpolated directly into the SQL string, so a title
        # containing a single quote will break this statement.
        sql = """
            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
        """
        sql = sql % (title, 'leshi', url)
        Mysql.execute(sql, conn=conn)
    driver.quit()
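# A minimal refactoring sketch, not used by the functions above: every scraper
# repeats the same load-then-window.stop() pattern, which could be factored out
# as below. `safe_get` is a hypothetical helper introduced here for
# illustration only.
def safe_get(driver, url):
    """Load a page; on timeout, stop loading instead of failing."""
    try:
        driver.get(url)
    except Exception:
        # set_page_load_timeout(10) raises once the budget is exhausted;
        # window.stop() keeps whatever part of the DOM has already rendered.
        driver.execute_script('window.stop()')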
def parse_content():
    """Extract the score and play count from the saved introduction text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']
        # The score is the first number in the text, e.g. "7.9".
        score = '0'
        m = re.search(ur'[0-9]+(\.[0-9]+)?', content)
        if m is not None:
            score = m.group(0)
        # The play count looks like "播放数:1.2亿" or "播放数:3400万"; the full
        # matched string (prefix and unit included) is stored.
        play = '0'
        m = re.search(ur'播放数:[0-9]+(\.[0-9]+)?[亿万]', content)
        if m is not None:
            play = m.group(0)
        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)

if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    # parse_wangju_all_url_title()
    parse_content()
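# A quick, hedged check of the parse_content() patterns against assumed sample
# text (not real scraped data); in a Python 2 shell:
#
#   >>> import re
#   >>> content = u'评分:7.9 播放数:1.2亿'
#   >>> re.search(ur'[0-9]+(\.[0-9]+)?', content).group(0)
#   u'7.9'
#   >>> re.search(ur'播放数:[0-9]+(\.[0-9]+)?[亿万]', content).group(0)
#   u'播放数:1.2亿'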