#!/usr/bin/env python
# coding=utf-8
"""Scrape iQiyi TV-series detail pages for entries Baidu Baike crawling missed."""
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_url():
    """Crawl iQiyi search results for un-scraped TV series and persist details.

    Resumes from the highest tv_id already present in
    scrapy.iqiyi_dianshiju_detail, searches iQiyi for each remaining series
    name (status = 12), follows library ("www.iqiyi.com/lib") result links,
    and inserts the detail-page text plus summary into MySQL.
    """
    conn = Mysql.createOfflineConn()

    # Find where the previous run stopped so the crawl can resume.
    sql = """
        select max(tv_id) as tv_id from scrapy.iqiyi_dianshiju_detail
    """
    max_id = Mysql.getOne(sql, conn=conn)
    if max_id is None or max_id[0] == 0:
        max_tv_id = 0
    else:
        max_tv_id = max_id[0]

    sql = """
        select id, name from tv_lib.yxb_tv_series where id > %s and status = 12 order by id asc
    """
    # max_tv_id originates from our own table (an integer id), so plain
    # interpolation is acceptable here; Mysql.getAll's parameter support is
    # not visible from this file — TODO confirm and parameterize if possible.
    sql = sql % (max_tv_id,)
    rows = Mysql.getAll(sql, conn=conn)

    # Two drivers: one keeps the search-result DOM while the other loads
    # detail pages, so a detail-page timeout cannot clobber the result list.
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.PhantomJS()
    driver2.set_page_load_timeout(10)

    try:
        for row in rows:
            _id = row['id']
            name = row['name']
            url = 'http://so.iqiyi.com/so/q_' + quote(str(name))
            try:
                driver.get(url)
            except Exception:
                # Page-load timeout: stop loading and parse what arrived.
                driver.execute_script('window.stop()')

            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
            for li in lis:
                try:
                    title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                    href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                    # Only library pages carry the structured detail block.
                    if 'www.iqiyi.com/lib' not in href:
                        continue
                    print(href)
                    try:
                        driver2.get(href)
                    except Exception:
                        # Same timeout handling as the search page; the
                        # original silently swallowed this error.
                        driver2.execute_script('window.stop()')
                    content = driver2.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
                    if content is None:
                        content = ''
                    # BUG FIX: original asked for 'textContext' (a typo), so
                    # desc was always None and the summary was never saved.
                    desc = driver2.find_element_by_xpath('//div[@class="mod-body introduce-info"]').get_attribute('textContent')
                    if desc is None:
                        desc = ''
                    content = content + '\n' + '概述:' + desc
                    sql = """
                        insert into scrapy.iqiyi_dianshiju_detail (tv_id, tv_name, title, detail_info_text, url) values (%s, %s, %s, %s, %s)
                    """
                    value = (_id, name, title, content, href)
                    Mysql.insertOne(sql, value=value, conn=conn)
                except Exception as e:
                    # Skip this search result but keep crawling the rest.
                    print(e)
                    continue
    finally:
        # Always release the PhantomJS processes, even on an unexpected error
        # (the original leaked them if anything escaped the loop).
        driver.quit()
        driver2.quit()


if __name__ == '__main__':
    scrapy_url()