#!/usr/bin/env python
# coding=utf-8
import re
import sys

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
LeTV (乐视, le.com) video scraping rules.
"""
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        need_blank = True  # set to False once at least one result URL is saved for this row
        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception:
            # page load timed out; stop loading and work with what has rendered
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
                href_list.append(href)
            except Exception:
                pass
        if len(href_list) > 0:
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
            need_blank = False
        if need_blank:
            # no result found: write an empty string so the row is not picked up again
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()
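
# Step 2: visit the stored search-result URLs, follow the first detail link on
# each page and save that page's Info block text (together with the originating
# URL) into scrapy.wangju_all_url.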
def scrapy_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_leshi = row['url_leshi']
        # a row may hold several comma-separated search-result URLs
        urls = url_leshi.split(',')
        for url in urls:
            if 'www.le.com' not in url:
                continue
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')

            # link to the detail page of the first entry in the episode list
            try:
                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
            except Exception:
                href = None

            if href is not None and 'www.le.com' in href:
                print href
                try:
                    driver.get(href)
                except Exception:
                    driver.execute_script('window.stop()')
                try:
                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
                except Exception:
                    continue

                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, '', content, 'leshi')
                Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()
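
# Step 3: re-visit the URLs saved in scrapy.wangju_all_url and fill in the
# title column.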
def parse_wangju_all_url_title():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print e
            driver.execute_script('window.stop()')
        try:
            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
        except Exception:
            title = ''
        sql = """
            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
        """
        sql = sql % (title, 'leshi', url)
        Mysql.execute(sql, conn=conn)
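
# Step 4: extract the score and the play count ("播放数") from the saved
# content text and store them in the score / playtimes columns.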
def parse_content():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']
        # the first number found in the content text is treated as the score
        m = re.search(ur'([0-9]+[.]?)+', content)
        score = '0'
        if m is not None:
            score = m.group(0)
        # play count, e.g. "播放数:1.2亿" or "播放数:3500万"
        play = '0'
        m = re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content)
        if m is not None:
            play = m.group(0)
        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)
if __name__ == '__main__':
    # intended run order: scrapy_url() -> scrapy_data() -> parse_wangju_all_url_title() -> parse_content()
    # scrapy_data()
    # scrapy_url()

    # parse_wangju_all_url_title()
    parse_content()