#!/usr/bin/env python
# coding=utf-8
"""Crawl iQIYI TV-series category data.

Two steps:
1. Crawl the search/listing result pages and find the matching TV series.
2. Scrape the category info from each saved detail-page URL.
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
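
# NOTE: the scrapy.* tables used below are not created in this file. Judging
# only from the INSERT/SELECT statements, they are assumed to look roughly
# like the following (column types are guesses, adjust to the real schema):
#
#   CREATE TABLE scrapy.iqiyi_dianshiju_url      (id INT AUTO_INCREMENT PRIMARY KEY, url VARCHAR(512));
#   CREATE TABLE scrapy.iqiyi_dianshiju_detail   (id INT PRIMARY KEY, tv_name VARCHAR(256),
#                                                 detail_info_text TEXT, detail_info_html MEDIUMTEXT, url VARCHAR(512));
#   CREATE TABLE scrapy.tengxun_dianshiju_url    (id INT AUTO_INCREMENT PRIMARY KEY, url VARCHAR(512), detail_url VARCHAR(512));
#   CREATE TABLE scrapy.tengxun_dianshiju_detail (tv_name VARCHAR(256), detail_info_text TEXT,
#                                                 detail_info_html MEDIUMTEXT, cover_url VARCHAR(512), detail_url VARCHAR(512));
#
# fty_util.common.Mysql is the project's own DB helper; from its usage in this
# file it is assumed to expose createScrapyConn(), getOne(), getAll(),
# insertOne(), insertMany() and update().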
class DSJ_All(object):

    # Step 1 (iQIYI): crawl the listing pages and save the TV-series page URLs
    def get_iqiyi_url(self):

        # listing pages to crawl
        start_urls = [
            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
        ]
        # open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # open the listing page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                # the last link of the pager is the "next page" button
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
                except:
                    # pager not loaded yet, retry
                    continue

                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
                sql_insert = """
                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception, e:
                        print 'no url found for this item'
                        continue
                    time.sleep(random.uniform(0, 2))
                Mysql.insertMany(sql_insert, data_list, conn)

                # click "下一页" (next page) if present, otherwise stop paging
                try:
                    next_page_text = next_page.text
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()
    # Step 2 (iQIYI): scrape the detail page of every saved URL
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # database connection
        conn = Mysql.createScrapyConn()

        # resume after the last detail row that was already scraped
        sql = """
            select max(id) from scrapy.iqiyi_dianshiju_detail
        """
        max_id = Mysql.getOne(sql, conn=conn)
        max_id = max_id[0]
        if max_id is None:
            max_id = 0

        # fetch all URLs that have not been scraped yet
        sql = """
            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
        """
        sql = sql % (max_id)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            print url
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
            # detail section as HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # detail section as plain text
            detail_info_text = detail_info.text
            # TV-series name
            tv_name = detail_info.find_element_by_xpath('h1/a').text

            # save to the database
            sql = """
                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, detail_info_text, detail_info_html, url)
            Mysql.insertOne(sql, value=value, conn=conn)
            time.sleep(random.uniform(1, 5))
        driver.quit()
    # Step 1 (Tencent Video): crawl the listing pages and save the TV-series page URLs
    def get_tengxun_url(self):
        start_urls = [
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
        ]
        # open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # open the listing page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
                print lis
                sql_insert = """
                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception, e:
                        print 'no url found for this item'
                        continue
                    time.sleep(1)
                Mysql.insertMany(sql_insert, data_list, conn)

                # the last link of the pager is the "next page" button
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
                except:
                    is_next = False
                    continue
                # click "下一页" (next page) if present, otherwise stop paging
                try:
                    next_page_text = next_page.text
                    next_page_url = next_page.get_attribute('href')
                    if next_page_url == 'javascript:;':
                        is_next = False
                        continue
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()
    # Tencent Video: resolve the "detail" page URL for every saved listing URL
    def get_tengxun_detail_url(self):
        # open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # database connection
        conn = Mysql.createScrapyConn()

        sql = """
            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            # open the page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            # some URLs already redirect straight to the detail page
            if re.match(r'(.*)detail(.*)', driver.current_url):
                print driver.current_url
                sql = """
                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                """
                sql = sql % (driver.current_url, _id)
                Mysql.update(sql, conn=conn)
                continue

            # otherwise look for a link that points to the detail page
            try:
                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
                print a_list
                for a in a_list:
                    detail_href = a.get_attribute('href')
                    if re.match(r'(.*)detail(.*)', detail_href):
                        print detail_href
                        sql = """
                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                        """
                        sql = sql % (detail_href, _id)
                        Mysql.update(sql, conn=conn)
                        break
            except Exception, e:
                print e
            time.sleep(random.uniform(0, 3))

        driver.quit()
    # Step 2 (Tencent Video): scrape every detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # database connection
        conn = Mysql.createScrapyConn()

        # fetch all TV series that need to be scraped
        sql = """
            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            url = row['url']
            detail_url = row['detail_url']
            try:
                driver.get(detail_url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
            # detail section as HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # detail section as plain text
            detail_info_text = detail_info.text
            # TV-series name
            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text

            # parameterized insert (as in get_iqiyi_detail) so quotes in the
            # scraped text/HTML do not break the SQL statement
            sql = """
                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values (%s, %s, %s, %s, %s)
            """
            value = (tv_name, detail_info_text, detail_info_html, url, detail_url)
            Mysql.insertOne(sql, value=value, conn=conn)
        driver.quit()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'no method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_All()
    try:
        getattr(obj, sys.argv[1])()
    except Exception, e:
        print e
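
# Usage sketch (the script filename is an assumption; pass the method to run
# as the only command-line argument):
#   python dsj_all.py get_iqiyi_url            # step 1: collect iQIYI listing URLs
#   python dsj_all.py get_iqiyi_detail         # step 2: scrape each saved iQIYI page
#   python dsj_all.py get_tengxun_url          # step 1: collect Tencent Video listing URLs
#   python dsj_all.py get_tengxun_detail_url   # resolve Tencent Video detail-page URLs
#   python dsj_all.py get_tengxun_detail       # step 2: scrape each Tencent Video detail page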