#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
Tencent Video scraping rules
1. scrapy_url: from the search page, scrape the URL that is most likely the TV series detail page.
2. scrapy_data: open each detail page found by the search and scrape the rating and the per-episode URLs
   (play counts are shown on each episode page).
3. todo: scrape every detail page.
The detail pages reached through Tencent Video search show neither play counts nor comment counts,
so the pages have to be parsed one by one:
    search page --> search detail page --> play page (only the first episode's play page is needed)
The play count therefore only has to be scraped from the play page.
(An assumed sketch of the tables this script reads and writes is at the end of this file.)
"""

def scrapy_url():
    """Search every TV series name and store candidate detail-page URLs."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
        for div in divs:
            try:
                title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
                if 'v.qq.com/detail' in href:
                    print href
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'tengxun')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception, e:
                print e
                continue
        driver.quit()


# Scrape the detail pages found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        # Scrape the page text
        try:
            content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
        except Exception, e:
            content = ''
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
            if pagelist is not None:
                data_list = []
                for page in pagelist:
                    num = page.find_element_by_xpath('./a/span').text
                    num = num.replace(' ', '').replace('\n', '')
                    href = page.find_element_by_xpath('./a').get_attribute('href')
                    if 'v.qq.com' in href:
                        data_list.append((_id, tv_name, num, href, 'tengxun'))
                # Insert the per-episode rows
                if data_list is not None and len(data_list) > 0:
                    sql = """
                        insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                    """
                    Mysql.insertMany(sql, data_list, conn)
        except Exception, e:
            pass

        # Update the content column
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()


# Scrape the play pages
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'v.qq.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
            except Exception, e:
                print e
                count = 0
            print count
            sql = """
                update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
            driver.quit()


def parse_wangju_all_url_data():
    """Scrape the rating and first-episode play count for every detail page, or delete unusable rows."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception, e:
            print e
            driver.execute_script('window.stop()')
        # Rating shown on the detail page
        try:
            score = driver.find_element_by_xpath('//div[@class="video_score"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        # Episode list on the detail page
        try:
            pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
        except:
            pagelist = None
        try:
            page_dict = dict()
            if pagelist is not None:
                for page in pagelist:
                    episode = page.find_element_by_xpath('./a').get_attribute('href')
                    episode_text = page.find_element_by_xpath('./a/span').text
                    page_dict[episode_text] = episode
            if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
                # Open the first episode's play page and read its play count
                try:
                    driver2.get(page_dict.get('1'))
                except Exception, e:
                    print e
                    driver2.execute_script('window.stop()')
                try:
                    count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
                except Exception, e:
                    print e
                    count = 0
                sql = """
                    update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
                """
                sql = sql % (score, count, url)
                Mysql.execute(sql, conn=conn)
            else:
                # No usable first-episode link: drop the row
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'tengxun')
                Mysql.execute(sql, conn=conn)
        except Exception, e:
            continue
    driver.quit()
    driver2.quit()


if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    parse_wangju_all_url_data()
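
# ---------------------------------------------------------------------------
# Rough sketch of the tables this script touches, inferred only from the SQL
# statements above. This is NOT the authoritative schema of the scrapy
# database: the column types, lengths and keys below are assumptions added
# for illustration; only the table and column names come from this file.
#
#   create table scrapy.wangju_url (
#       id                 int,            -- series id (assumed primary key)
#       tv_name            varchar(255),   -- series name used as the search keyword
#       url_tengxun        varchar(1024),  -- detail-page url (legacy column, see commented query)
#       tengxun_fenji      varchar(255),   -- per-episode marker (legacy column, assumed type)
#       tengxun_playtimes  varchar(64)     -- play count written by scrapy_play_page()
#   );
#
#   create table scrapy.wangju_all_url (
#       id        int,             -- id of the matching row in wangju_url
#       tv_name   varchar(255),
#       url       varchar(1024),   -- detail-page url found by scrapy_url()
#       title     varchar(255),
#       content   text,            -- page text written by scrapy_data()
#       source    varchar(32),     -- always 'tengxun' in this script
#       score     varchar(32),     -- rating written by parse_wangju_all_url_data()
#       playtimes varchar(64)      -- play count written by parse_wangju_all_url_data()
#   );
#
#   create table scrapy.wangju_fenji_url (
#       id      int,               -- id of the matching row in wangju_url
#       tv_name varchar(255),
#       num     varchar(16),       -- episode number as shown on the page
#       url     varchar(1024),     -- episode play-page url
#       source  varchar(32)        -- always 'tengxun' in this script
#   );
# ---------------------------------------------------------------------------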