#!/usr/bin/env python
#coding=utf-8
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
iQiyi crawl rules
1. scrapy_url: crawl the search page and collect the detail-page urls of the TV series it finds
2. scrapy_data: visit each detail page found and scrape its description and per-episode urls (play counts are shown on the episode pages)
3. scrapy_play_page: visit the play page of episode 1 and scrape the play count
4. todo: crawl the per-page stats every day

The detail pages reached from search carry neither play counts nor comment counts, so each page has to be parsed one by one:
search page --> search detail page --> play page (episode 1 is enough) --> real detail page (scrape play and comment counts; comments cannot be scraped yet)
So scraping the play count from the play page is sufficient.
"""

# Crawl the search pages
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception, e:
            # Page load timed out; keep whatever has rendered so far
            driver.execute_script('window.stop()')
        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
        for li in lis:
            try:
                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                # Only library (detail) pages are wanted
                if 'www.iqiyi.com/lib' in href:
                    print href
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'iqiyi')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception, e:
                print e
                continue
        driver.quit()

# Crawl the detail pages found by search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        title = row['title']
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        # Scrape the description text
        try:
            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
        except Exception, e:
            content = ''
        # Scrape the episode list (note: find_elements_by_xpath returns []
        # rather than raising when nothing matches, so the fallback below
        # only fires on page-level errors)
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
        except Exception, e:
            # No hidden episode list: fall back to the visible one
            try:
                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
            except Exception, e:
                pagelist = None
        if pagelist is not None:
            # Episodes exist: collect each episode url so play and comment counts can be scraped later
            data_list = []
            for page in pagelist:
                num = page.find_element_by_xpath('./a').get_attribute('title')
                num = num.replace(' ', '').replace('\n', '')
                href = page.find_element_by_xpath('./a').get_attribute('href')
                if 'www.iqiyi.com' in href:
                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
            # Insert the per-episode rows
            if len(data_list) > 0:
                sql = """
                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                """
                Mysql.insertMany(sql, data_list, conn)
        # Update the description
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()
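# Every crawler in this module repeats the same "load url, swallow the
# timeout, window.stop()" sequence. A minimal refactoring sketch the
# functions could share; fetch_page is hypothetical and not part of the
# original script:
def fetch_page(driver, url):
    """Load url; on timeout, stop the page and keep what has rendered."""
    try:
        driver.get(url)
    except Exception, e:
        print e
        driver.execute_script('window.stop()')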
# Crawl the play pages
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            # Play count shown on the episode play page
            try:
                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
            except Exception, e:
                print e
                count = 0
            print count
            sql = """
                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
            driver.quit()

# Crawl the play pages daily (iQiyi exposes only per-episode comment counts, not per-episode play counts)
def scrapy_play_page_everyday():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        _id = row['id']
        tv_name = row['tv_name']
        num = row['num']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            # Not an iQiyi play page: drop the stale episode row
            driver.quit()
            sql = """
                delete from scrapy.wangju_fenji_url where url = '%s'
            """
            sql = sql % (url,)
            Mysql.execute(sql, conn=conn)
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
            except Exception, e:
                print e
                commenttimes = ''
            print url
            print commenttimes
            # sql = """
            #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
            # """
            # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
            driver.quit()

# Backfill score and play count for rows whose playtimes is still empty or zero
def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception, e:
            print e
            driver.execute_script('window.stop()')
        # Score on the detail page
        try:
            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        # Episode list on the detail page
        try:
            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
        except Exception, e:
            pagelist = None
        try:
            if pagelist is not None:
                # Map episode label -> episode url
                page_dict = dict()
                for page in pagelist:
                    try:
                        episode = page.find_element_by_xpath('./a').get_attribute('href')
                        episode_text = page.find_element_by_xpath('./a').text
                        page_dict[episode_text] = episode
                    except:
                        continue
                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
                    # Open episode 1 in the second browser and read the play count
                    try:
                        driver2.get(page_dict.get('1'))
                        time.sleep(5)
                    except Exception, e:
                        print e
                        driver2.execute_script('window.stop()')
                    try:
                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
                    except Exception, e:
                        print e
                        count = '0'
                    print count
                    sql = """
                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
                    """
                    sql = sql % (score, count, url)
                    Mysql.execute(sql, conn=conn)
                else:
                    # Episode 1 not hosted on iQiyi: drop the row
                    sql = """
                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                    """
                    sql = sql % (url, 'iqiyi')
                    Mysql.execute(sql, conn=conn)
            else:
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'iqiyi')
                Mysql.execute(sql, conn=conn)
        except Exception, e:
            continue
    driver.quit()
    driver2.quit()

if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    # scrapy_play_page_everyday()
    parse_wangju_all_url_data()
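# Note: most UPDATE/DELETE statements above splice values in with the %
# operator, which breaks as soon as a url or title contains a quote. A
# minimal sketch of the safer form, reusing the parameterized
# Mysql.execute(sql, param=..., conn=...) call that scrapy_data() already
# uses (assuming the fty_util wrapper escapes params; not verified against
# its source):
#
#     sql = """
#         update scrapy.wangju_all_url set score = %s, playtimes = %s where url = %s and source = 'iqiyi'
#     """
#     Mysql.execute(sql, param=(score, count, url), conn=conn)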