1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- #/usr/bin/env python
- #coding=utf-8
- import random
- import sys
- import time
- from selenium import webdriver
- from urllib import quote
- from fty_util.common import Mysql
- reload(sys)
- sys.setdefaultencoding('utf8')
- """
- 从爱奇艺中爬取百度百科没有爬到的内容
- """
- # 爬取搜索页面
- def scrapy_url():
- conn = Mysql.createOfflineConn()
- sql = """
- select max(tv_id) as tv_id from scrapy.iqiyi_dianshiju_detail
- """
- max_id = Mysql.getOne(sql, conn=conn)
- if max_id is None or max_id[0] == 0:
- max_tv_id = 0
- else:
- max_tv_id = max_id[0]
- sql = """
- select id, name from tv_lib.yxb_tv_series where id > %s and status = 12 order by id asc
- """
- sql = sql % (max_tv_id,)
- rows = Mysql.getAll(sql, conn=conn)
- driver = webdriver.PhantomJS()
- driver.set_page_load_timeout(10)
- driver2 = webdriver.PhantomJS()
- driver2.set_page_load_timeout(10)
- for row in rows:
- _id = row['id']
- name = row['name']
- url = 'http://so.iqiyi.com/so/q_' + quote(str(name))
- try:
- driver.get(url)
- except Exception, e:
- driver.execute_script('window.stop()')
- lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
- for li in lis:
- try:
- title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
- href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
- if 'www.iqiyi.com/lib' in href:
- print href
- try:
- driver2.get(href)
- except:
- pass
- content = driver2.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
- if content is None:
- content = ''
- desc = driver2.find_element_by_xpath('//div[@class="mod-body introduce-info"]').get_attribute('textContext')
- if desc is None:
- desc = ''
-
- content = content + '\n' + '概述:' + desc
- sql = """
- insert into scrapy.iqiyi_dianshiju_detail (tv_id, tv_name, title, detail_info_text, url) values (%s, %s, %s, %s, %s)
- """
- value = (_id, name, title, content, href)
- Mysql.insertOne(sql, value=value, conn=conn)
- except Exception, e:
- print e
- continue
- driver.quit()
- driver2.quit()
# Script entry point: run the scraper when executed directly.
if __name__ == "__main__":
    scrapy_url()
|