#!/usr/bin/env python
# coding=utf-8
"""iQIYI TV series category crawler.

The crawl is done in two steps:
1. Crawl the list/search result pages and save the URLs of the matching TV series.
2. Visit each saved detail-page URL and crawl its category/detail information.

The get_tengxun_* methods do the same two steps for Tencent Video (v.qq.com).
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


class DSJ_All(object):

    # Step 1 (iqiyi): collect TV series page URLs from the list pages
    def get_iqiyi_url(self):
        # List pages to crawl (other year ranges are commented out)
        start_urls = [
            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
        ]

        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                # Grab the last pager link ("next page"); retry until the pager is present
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
                except:
                    continue

                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
                sql_insert = """
                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'not found'
                        continue
                    time.sleep(random.uniform(0, 2))
                Mysql.insertMany(sql_insert, data_list, conn)

                # Follow the pager until there is no "next page" link
                try:
                    next_page_text = next_page.find_element_by_xpath('.').text
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()

    # Step 2 (iqiyi): crawl each saved detail page
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Resume from the largest id already stored in the detail table
        sql = """
            select max(id) from scrapy.iqiyi_dianshiju_detail
        """
        max_id = Mysql.getOne(sql, conn=conn)
        max_id = max_id[0]
        if max_id is None:
            max_id = 0

        # Fetch the URLs that still need to be crawled
        sql = """
            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
        """
        sql = sql % (max_id)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            print url
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
            # Detail section HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail section plain text
            detail_info_text = detail_info.find_element_by_xpath('.').text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('h1/a').text

            # Save to the database
            sql = """
                insert into scrapy.iqiyi_dianshiju_detail
                    (id, tv_name, detail_info_text, detail_info_html, url)
                values (%s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, detail_info_text, detail_info_html, url)
            Mysql.insertOne(sql, value=value, conn=conn)
            time.sleep(random.uniform(1, 5))
        driver.quit()
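    # A minimal sketch of the two tables the iqiyi steps above write to. Column
    # names come from the INSERT/SELECT statements in this file; the types and
    # keys are an assumption, not the original DDL:
    #
    #   CREATE TABLE scrapy.iqiyi_dianshiju_url (
    #       id  INT PRIMARY KEY AUTO_INCREMENT,
    #       url VARCHAR(512)
    #   );
    #
    #   CREATE TABLE scrapy.iqiyi_dianshiju_detail (
    #       id               INT PRIMARY KEY,
    #       tv_name          VARCHAR(255),
    #       detail_info_text TEXT,
    #       detail_info_html MEDIUMTEXT,
    #       url              VARCHAR(512)
    #   );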
    # Step 1 (Tencent Video): collect TV series page URLs from the list pages
    def get_tengxun_url(self):
        # List pages to crawl (other year ranges are commented out)
        start_urls = [
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
        ]

        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            is_next = True
            while is_next:
                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
                print lis
                sql_insert = """
                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'not found'
                        continue
                    time.sleep(1)
                Mysql.insertMany(sql_insert, data_list, conn)

                # Follow the pager until there is no "next page" link
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
                except:
                    is_next = False
                    continue
                try:
                    next_page_text = next_page.find_element_by_xpath('.').text
                    next_page_url = next_page.find_element_by_xpath('.').get_attribute('href')
                    # A 'javascript:;' href means the last page has been reached
                    if next_page_url == 'javascript:;':
                        is_next = False
                        continue
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except:
                    is_next = False
                time.sleep(10)
        driver.quit()

    # Resolve the detail-page URL for each saved Tencent list URL
    def get_tengxun_detail_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Fetch the rows whose detail_url has not been resolved yet
        sql = """
            select id, url from scrapy.tengxun_dianshiju_url
            where detail_url is null or detail_url = '' order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            # Open the page
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')

            # If the page already redirected to a detail URL, store it directly
            if re.match(r'(.*)detail(.*)', driver.current_url):
                print driver.current_url
                sql = """
                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                """
                sql = sql % (driver.current_url, _id)
                Mysql.update(sql, conn=conn)
                continue

            # Otherwise look for an album link that points to a detail page
            try:
                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
                print a_list
                for a in a_list:
                    detail_href = a.find_element_by_xpath('.').get_attribute('href')
                    if re.match(r'(.*)detail(.*)', detail_href):
                        print detail_href
                        sql = """
                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                        """
                        sql = sql % (detail_href, _id)
                        Mysql.update(sql, conn=conn)
                        break
            except Exception as e:
                print e
            time.sleep(random.uniform(0, 3))
        driver.quit()

    # Step 2 (Tencent Video): crawl each resolved detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()

        # Fetch all TV series that need to be crawled
        sql = """
            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            url = row['url']
            detail_url = row['detail_url']
            try:
                driver.get(detail_url)
            except:
                driver.execute_script('window.stop()')

            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
            # Detail section HTML
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail section plain text
            detail_info_text = detail_info.find_element_by_xpath('.').text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text

            # Save to the database with a parameterized insert (as in get_iqiyi_detail)
            # so quotes in the scraped text/HTML cannot break the statement
            sql = """
                insert into scrapy.tengxun_dianshiju_detail
                    (tv_name, detail_info_text, detail_info_html, cover_url, detail_url)
                values (%s, %s, %s, %s, %s)
            """
            value = (tv_name, detail_info_text, detail_info_html, url, detail_url)
            Mysql.insertOne(sql, value=value, conn=conn)
        driver.quit()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'No method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_All()
    try:
        # Dispatch to the DSJ_All method named on the command line
        getattr(obj, sys.argv[1])()
    except Exception as e:
        print e
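# Usage sketch (the script file name below is hypothetical; pass one of the
# DSJ_All method names as the single command-line argument):
#
#   python dsj_all.py get_iqiyi_url            # step 1: collect iqiyi list-page links
#   python dsj_all.py get_iqiyi_detail         # step 2: crawl the saved iqiyi detail pages
#   python dsj_all.py get_tengxun_url          # step 1 for v.qq.com
#   python dsj_all.py get_tengxun_detail_url   # resolve v.qq.com detail-page URLs
#   python dsj_all.py get_tengxun_detail       # step 2 for v.qq.com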