#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
- """
- 爱奇艺爬取规则
- 1、scrapy_url 通过搜索页面,爬取搜索到的电视剧页面url
- 2、scrapy_data 进入搜索到的详情页面,爬取内容、每集url(播放数量在每集页面上显示)
- 3、scrapy_play_page 进入第一集的播放页面,爬取播放记录数
- 4、todo 每天爬取每页信息
- 爱奇艺通过搜索到的详情页面没有播放数量和评论数量,需要一个个页面解析
- 搜索页面-->搜索详情页面-->播放页面(只需取第一集播放页面即可)-->真实详情页面(爬取播放数量和评论数量(评论暂时爬不到))
- 所以只要在播放页面爬取到播放量即可。
- """
# Crawl the search results page
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception as e:
            # Page load timed out: stop loading and parse what is already there
            driver.execute_script('window.stop()')
        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
        for li in lis:
            try:
                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                # Only library (detail) pages are of interest
                if 'www.iqiyi.com/lib' in href:
                    print href
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'iqiyi')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception as e:
                print e
                continue
        driver.quit()
# Crawl the detail pages found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        title = row['title']

        try:
            driver.get(url)
        except Exception as e:
            driver.execute_script('window.stop()')

        # Crawl the description text
        try:
            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
        except Exception as e:
            content = ''

        # Crawl the episode list
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
        except Exception as e:
            # If there is no hidden episode list, fall back to the visible one
            try:
                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
            except Exception as e:
                pagelist = None

        if pagelist is not None:
            # If episodes exist, collect each episode URL; they are used later
            # to crawl the play and comment counts
            data_list = []
            for page in pagelist:
                num = page.find_element_by_xpath('./a').get_attribute('title')
                num = num.replace(' ', '').replace('\n', '')
                href = page.find_element_by_xpath('./a').get_attribute('href')
                if 'www.iqiyi.com' in href:
                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
            # Insert the per-episode rows
            if data_list:
                sql = """
                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                """
                Mysql.insertMany(sql, data_list, conn)

        # Update the description
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()
# Crawl the play page
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception as e:
                print e
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
            except Exception as e:
                print e
                count = 0

            print count
            sql = """
                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
            driver.quit()
# Crawl the play pages daily (iQiyi only exposes per-episode comment counts, not per-episode play counts)
def scrapy_play_page_everyday():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        _id = row['id']
        tv_name = row['tv_name']
        num = row['num']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            # Not an iQiyi play page: drop the row and move on
            driver.quit()
            sql = """
                delete from scrapy.wangju_fenji_url where url = '%s'
            """
            sql = sql % (url,)
            Mysql.execute(sql, conn=conn)
            continue
        else:
            try:
                driver.get(url)
            except Exception as e:
                print e
                driver.execute_script('window.stop()')
            try:
                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
            except Exception as e:
                print e
                commenttimes = ''

            print url
            print commenttimes

            # sql = """
            #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
            # """
            # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
            driver.quit()
# Re-crawl detail pages whose play count is still empty or zero: read the score,
# find the episode-1 link, open it in a second browser and read the play count
def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']

        try:
            driver.get(url)
        except Exception as e:
            print e
            driver.execute_script('window.stop()')
        try:
            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        try:
            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
        except Exception as e:
            pagelist = None
        try:
            if pagelist is not None:
                page_dict = dict()
                for page in pagelist:
                    try:
                        episode = page.find_element_by_xpath('./a').get_attribute('href')
                        episode_text = page.find_element_by_xpath('./a').text
                        page_dict[episode_text] = episode
                    except:
                        continue
                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
                    # Open the play page of episode 1 and read the play count
                    try:
                        driver2.get(page_dict.get('1'))
                        time.sleep(5)
                    except Exception as e:
                        print e
                        driver2.execute_script('window.stop()')
                    try:
                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
                    except Exception as e:
                        print e
                        count = '0'
                    print count
                    sql = """
                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
                    """
                    sql = sql % (score, count, url)
                    Mysql.execute(sql, conn=conn)
                else:
                    # No usable episode-1 URL: drop the row
                    sql = """
                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                    """
                    sql = sql % (url, 'iqiyi')
                    Mysql.execute(sql, conn=conn)
            else:
                # No episode list at all: drop the row
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'iqiyi')
                Mysql.execute(sql, conn=conn)
        except Exception as e:
            continue
    driver.quit()
    driver2.quit()
if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    # scrapy_play_page_everyday()
    parse_wangju_all_url_data()
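
# --- Sketch, not part of the original script --------------------------------
# Several UPDATE/DELETE statements above are assembled with string formatting
# (sql = sql % (...)), which breaks as soon as a title or URL contains a quote
# character. The Mysql helper is already called with a param argument elsewhere
# in this file (Mysql.execute(sql, param=value, conn=conn)), so the same
# statements could use placeholders instead. A minimal sketch, assuming param
# is passed through to the underlying cursor (hypothetical helper name, not
# called by the script above):
def update_playtimes_safely(conn, score, count, url):
    sql = """
        update scrapy.wangju_all_url set score = %s, playtimes = %s where url = %s and source = 'iqiyi'
    """
    Mysql.execute(sql, param=(score, count, url), conn=conn)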