- #!/usr/bin/env python
- # coding=utf-8
- import random
- import sys
- import time
- from selenium import webdriver
- from urllib import quote
- from fty_util.common import Mysql
- reload(sys)
- sys.setdefaultencoding('utf8')
- """
- 腾讯视频爬取规则
- 1、scrapy_url 通过搜索页面,爬取搜索到的最有可能是电视剧页面的url
- 2、scrapy_data 进入搜索到的详情页面,爬取评分,每集url(播放数量在每集页面上显示)
- 3、todo 爬取每页详情页
- 腾讯视频通过搜索到的详情页面没有播放数量和评论数量,需要一个个页面解析
- 搜索页面-->搜索详情页面-->播放页面(只需取第一集播放页面即可)
- 所以只有在播放页面爬取到播放量即可。
- """
- def scrapy_url():
- conn = Mysql.createOfflineConn()
- sql = """
- select id, tv_name from scrapy.wangju_url order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- for row in rows:
- driver = webdriver.PhantomJS()
- driver.set_page_load_timeout(10)
- _id = row['id']
- tv_name = row['tv_name']
- url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
- try:
- driver.get(url)
- except Exception, e:
- driver.execute_script('window.stop()')
- divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
- for div in divs:
- try:
- title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
- href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
- if 'v.qq.com/detail' in href:
- print href
- sql = """
- insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
- """
- value = (_id, tv_name, href, title, '', 'tengxun')
- Mysql.insertOne(sql, value=value, conn=conn)
- time.sleep(1)
- except Exception, e:
- print e
- continue
- driver.quit()
- # Crawl the detail pages found by the search
- def scrapy_data():
- conn = Mysql.createOfflineConn()
- # sql = """
- # select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
- # """
- sql = """
- select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- for row in rows:
- driver = webdriver.PhantomJS()
- driver.set_page_load_timeout(10)
- _id = row['id']
- tv_name = row['tv_name']
- url = row['url']
- try:
- driver.get(url)
- except Exception, e:
- driver.execute_script('window.stop()')
- # Scrape the page content
- try:
- content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
- except Exception, e:
- content = ''
- try:
- pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
- if pagelist is not None:
- data_list = []
- for page in pagelist:
- num = page.find_element_by_xpath('./a/span').text
- num = num.replace(' ', '').replace('\n', '')
- href = page.find_element_by_xpath('./a').get_attribute('href')
- if 'v.qq.com' in href:
- data_list.append((_id, tv_name, num, href, 'tengxun'))
- # Insert the per-episode rows
- if data_list is not None and len(data_list) > 0:
- sql = """
- insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
- """
- Mysql.insertMany(sql, data_list, conn)
- except Exception, e:
- pass
-
- # Update the content column for this detail page
- sql = """
- update scrapy.wangju_all_url set content = %s where url = %s
- """
- value = (content, url)
- Mysql.execute(sql, param=value, conn=conn)
- driver.quit()
- # Crawl the first-episode play pages to get the play count
- def scrapy_play_page():
- conn = Mysql.createOfflineConn()
- sql = """
- select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- for row in rows:
- driver = webdriver.Firefox()
- driver.set_page_load_timeout(10)
- _id = row['id']
- tv_name = row['tv_name']
- url = row['url']
- if 'v.qq.com' not in url:
- driver.quit()
- continue
- else:
- try:
- driver.get(url)
- except Exception, e:
- print e
- driver.execute_script('window.stop()')
- try:
- count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
- except Exception, e:
- print e
- count = 0
-
- print count
- sql = """
- update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
- """
- sql = sql % (count, _id)
- Mysql.execute(sql, conn=conn)
- driver.quit()
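- # Combined pass over the detail pages: scrape the rating, follow the
- # episode-1 link to its play page for the play count, and delete detail
- # rows whose episode list has no v.qq.com link.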
- def parse_wangju_all_url_data():
- conn = Mysql.createOfflineConn()
- sql = """
- select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- driver = webdriver.Firefox()
- driver.set_page_load_timeout(10)
- driver2 = webdriver.Firefox()
- driver2.set_page_load_timeout(10)
- for row in rows:
- _id = row['id']
- tv_name = row['tv_name']
- url = row['url']
-
- try:
- driver.get(url)
- except Exception, e:
- print e
- driver.execute_script('window.stop()')
- try:
- score = driver.find_element_by_xpath('//div[@class="video_score"]').text
- score = score.replace(' ', '').replace('\n', '')
- except:
- score = ''
-
- try:
- pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
- except:
- pagelist = None
-
- try:
- page_dict = dict()
- if pagelist is not None:
- for page in pagelist:
- episode = page.find_element_by_xpath('./a').get_attribute('href')
- episode_text = page.find_element_by_xpath('./a/span').text
- page_dict[episode_text] = episode
- if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
- try:
- driver2.get(page_dict.get('1'))
- except Exception, e:
- print e
- driver2.execute_script('window.stop()')
- try:
- count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
- except Exception, e:
- print e
- count = 0
- sql = """
- update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
- """
- sql = sql % (score, count, url)
- Mysql.execute(sql, conn=conn)
- else:
- sql = """
- delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
- """
- sql = sql % (url, 'tengxun')
- Mysql.execute(sql, conn=conn)
- except Exception, e:
- continue
-
-
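- # Intended order: scrapy_url builds the candidate detail-page list,
- # scrapy_data collects the episode URLs, then scrapy_play_page or
- # parse_wangju_all_url_data fills in the play counts; run one step at a time.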
- if __name__ == '__main__':
- # scrapy_url()
- # scrapy_data()
- # scrapy_play_page()
- parse_wangju_all_url_data()