#!/usr/bin/env python
# coding=utf-8
"""Crawl TV drama categories from iQiyi and Tencent Video.

Two steps for each site:
Step 1: crawl the search result page and find the matching TV drama.
Step 2: crawl the category tags from the saved detail-page URL.
"""
import re
import sys

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


class DSJ_Categories(object):

    # Step 1 (iQiyi): find each drama's detail-page URL from the search results
    def get_iqiyi_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that still need crawling
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and (iqiyi_url is null or iqiyi_url = '')
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
            # Open the search page; on timeout, stop loading and work with what we have
            try:
                driver.get(start_url)
            except:
                driver.execute_script('window.stop()')
            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
            for li in lis:
                try:
                    # Only take the result whose first episode label is '1'
                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
                    if '1' == first_num.strip():
                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
                        print href
                        sql = """
                            update scrapy.tv_category_scrapy set iqiyi_url = %s where id = %s
                        """
                        Mysql.update(sql, param=(href, _id), conn=conn)
                        break
                except Exception, e:
                    print 'not found'
                    continue
        driver.quit()

    # Step 2 (iQiyi): crawl category tags from each saved detail page
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that have a URL but no categories yet
        sql = """
            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy
            where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = row['iqiyi_url']
            print url
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')
            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())
            # Save to the database
            sql = """
                update scrapy.tv_category_scrapy set iqiyi_types = %s where id = %s
            """
            Mysql.update(sql, param=(' '.join(cats_set), _id), conn=conn)
        driver.quit()

    # Step 1 (Tencent Video): find each drama's detail-page URL from the search results
    def get_tengxun_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that still need crawling
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
            # Open the search page; on timeout, stop loading and work with what we have
            try:
                driver.get(start_url)
            except:
                driver.execute_script('window.stop()')
            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
            for div in divs:
                try:
                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
                    print href
                    # Only keep links that point to a detail page
                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
                    if matchObj:
                        sql = """
                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
                        """
                        Mysql.update(sql, param=(href, _id), conn=conn)
                        break
                except Exception, e:
                    print 'not found'
                    print e
                    continue
        driver.quit()

    # Step 2 (Tencent Video): crawl category tags from each saved detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that have a URL but no categories yet
        sql = """
            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            tengxun_url = row['tengxun_url']
            print tengxun_url
            # Open the detail page; on timeout, stop loading and work with what we have
            try:
                driver.get(tengxun_url)
            except:
                driver.execute_script('window.stop()')
            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())
            # Save to the database
            sql = """
                update scrapy.tv_category_scrapy set tengxun_types = %s where id = %s
            """
            Mysql.update(sql, param=(' '.join(cats_set), _id), conn=conn)
        driver.quit()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'no method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_Categories()
    try:
        getattr(obj, sys.argv[1])()
    except Exception, e:
        print e
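
# Usage sketch: the __main__ block dispatches to the method named by the sole
# CLI argument via getattr; the script filename below is an assumption.
#   python dsj_categories.py get_iqiyi_url      # step 1: save iQiyi detail URLs
#   python dsj_categories.py get_iqiyi_detail   # step 2: scrape iQiyi category tags
#   python dsj_categories.py get_tengxun_url    # step 1: save Tencent detail URLs
#   python dsj_categories.py get_tengxun_detail # step 2: scrape Tencent category tags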