#!/usr/bin/env python
# coding=utf-8
"""Crawl TV-series category tags from iQIYI and Tencent Video.

The crawl runs in two steps per site:
  1. Crawl the search results page and save the URL of each matching series.
  2. Open each saved detail-page URL and extract the category tags.
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

# Python 2 workaround so that non-ASCII titles can be concatenated and printed.
reload(sys)
sys.setdefaultencoding('utf8')

class DSJ_Categories(object):

    # Step 1 (iQIYI): crawl the search page and save each series' detail-page URL.
    def get_iqiyi_url(self):
        # Start a Firefox browser.
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that still need an iQIYI URL.
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and (iqiyi_url is null or iqiyi_url = '')
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
            # Open the search page; stop loading if it times out.
            try:
                driver.get(start_url)
            except Exception:
                driver.execute_script('window.stop()')

            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
            for li in lis:
                try:
                    # Keep the first result whose first episode link reads "1".
                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
                    if '1' == first_num.strip():
                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
                        print href
                        sql = """
                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
                        """
                        sql = sql % (href, _id)
                        Mysql.update(sql, conn=conn)
                        break
                except Exception as e:
                    print 'not found'
                    continue
        driver.quit()

    # Step 2 (iQIYI): open each saved detail page and extract the category tags.
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that have an iQIYI URL but no categories yet.
        sql = """
            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy
            where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = row['iqiyi_url']
            print url
            # Open the detail page; stop loading if it times out.
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')

            # Collect the category tags shown on the page.
            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())

            # Save the categories back to the database.
            sql = """
                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            Mysql.update(sql, conn=conn)
        driver.quit()

    # Step 1 (Tencent Video): crawl the search page and save each series' detail-page URL.
    def get_tengxun_url(self):
        # Start a Firefox browser.
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that still need a Tencent Video URL.
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
            # Open the search page; stop loading if it times out.
            try:
                driver.get(start_url)
            except Exception:
                driver.execute_script('window.stop()')

            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
            for div in divs:
                try:
                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
                    print href
                    # Only keep links that point to a "detail" page.
                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
                    if matchObj:
                        sql = """
                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
                        """
                        value = (href, _id)
                        Mysql.update(sql, param=value, conn=conn)
                        break
                except Exception as e:
                    print 'not found'
                    print e
                    continue
        driver.quit()

    # Step 2 (Tencent Video): open each saved detail page and extract the category tags.
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)

        # Database connection.
        conn = Mysql.createScrapyConn()

        # Fetch all series that have a Tencent Video URL but no categories yet.
        sql = """
            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy
            where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null
            order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            tengxun_url = row['tengxun_url']
            print tengxun_url
            # Open the detail page; stop loading if it times out.
            try:
                driver.get(tengxun_url)
            except Exception:
                driver.execute_script('window.stop()')

            # Collect the category tags shown on the page.
            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())

            # Save the categories back to the database.
            sql = """
                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            Mysql.update(sql, conn=conn)
        driver.quit()

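# ---------------------------------------------------------------------------
# The methods above rely on the project-local helper fty_util.common.Mysql,
# which is not shown in this file. The sketch below only illustrates the
# interface this script assumes (createScrapyConn / getAll / update); the
# connection parameters and the MySQLdb-based implementation are assumptions,
# not the real fty_util code.
# ---------------------------------------------------------------------------
class _MysqlSketch(object):
    """Hypothetical stand-in for fty_util.common.Mysql (illustration only)."""

    @staticmethod
    def createScrapyConn():
        # Connection parameters are placeholders.
        import MySQLdb
        return MySQLdb.connect(host='127.0.0.1', user='scrapy', passwd='***',
                               db='scrapy', charset='utf8')

    @staticmethod
    def getAll(sql, param=None, conn=None):
        # Return all rows as dicts, matching the row['id'] access used above.
        import MySQLdb.cursors
        cursor = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)
        cursor.execute(sql, param)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    @staticmethod
    def update(sql, param=None, conn=None):
        # Execute an UPDATE and commit immediately.
        cursor = conn.cursor()
        cursor.execute(sql, param)
        conn.commit()
        cursor.close()
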
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'no method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_Categories()
    try:
        # Dispatch to the method named on the command line.
        getattr(obj, sys.argv[1])()
    except Exception as e:
        print e
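
# Usage sketch (the file name below is a placeholder for however this module is saved):
#   python dsj_categories.py get_iqiyi_url        # step 1: collect iQIYI detail-page URLs
#   python dsj_categories.py get_iqiyi_detail     # step 2: crawl iQIYI category tags
#   python dsj_categories.py get_tengxun_url      # step 1: collect Tencent Video detail-page URLs
#   python dsj_categories.py get_tengxun_detail   # step 2: crawl Tencent Video category tags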