#!/usr/bin/env python
# coding=utf-8
"""
Crawl marketing articles about new TV dramas via Baidu site-restricted search.
"""
import collections
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
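
# NOTE: This is a legacy Python 2 / PhantomJS script (reload + setdefaultencoding,
# print statements). PhantomJS is unmaintained and recent Selenium releases have
# dropped support for it, so running this today likely needs an old Selenium
# version or a headless-browser swap (see the note at the end of the file).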


def scrapy_website():
    conn = Mysql.createOfflineConn()

    # Empty the scrapy.scrapy_article table before a fresh crawl
    sql = """
        truncate table scrapy.scrapy_article
    """
    Mysql.execute(sql, conn=conn)
    # Map each site's update_url to its name, in id order
    websites_dict = collections.OrderedDict()
    sql = """
        select name, update_url from odl.basic_websites order by id asc
    """
    websites = Mysql.getAll(sql, conn=conn)
    for website in websites:
        websites_dict[website['update_url']] = website['name']
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.PhantomJS()
    driver2.set_page_load_timeout(10)
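    # Two browser instances: `driver` keeps the Baidu result list open while
    # `driver2` fetches the later result pages, presumably so the pagination
    # links in the first window stay available mid-iteration.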
    start_url = 'http://www.baidu.com/'
    sql = """
        select id, tv_name from yxb.ad_tv_lib where source = 1 order by id asc
    """
    tvs = Mysql.getAll(sql, conn=conn)
    for tv in tvs:
        tv_id = tv['id']
        tv_name = tv['tv_name']
        for update_url in websites_dict:
            name = websites_dict.get(update_url)
            try:
                driver.get(start_url)
            except Exception:
                pass
            # Search query: the result title must contain the drama name and
            # results are restricted to the current site
            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
            print line
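            # The query is injected straight into Baidu's search box via JS.
            # Caveat: this assumes `line` contains no double quote; one in a
            # drama title would break the injected statement.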
            try:
                # Fill the search box and click the search button with executed
                # JS (an earlier, commented-out revision drove the elements
                # directly via send_keys/click)
                js = 'document.getElementById("kw").value = "' + line + '"'
                driver.execute_script(js)
                js = 'document.getElementById("su").click()'
                driver.execute_script(js)
            except Exception:
                print 'click request failed'
            time.sleep(1)
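            # Fixed one-second pause for the results to render; a Selenium
            # WebDriverWait on the results container would be a more robust
            # alternative.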
            # Pagination links, if any
            page = None
            try:
                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
            except Exception:
                pass
            # No pagination: there are at most ten records, or none at all
            if page is None or len(page) == 0:
                try:
                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                    for div in divs:
                        try:
                            # Title and link sit in the h3 anchor of each result div
                            a = div.find_element_by_xpath('./h3/a')
                            div_title = a.get_attribute('textContent').replace(' ', '').replace('\n', '')
                            div_href = a.get_attribute('href')
                            div_content = div.get_attribute('textContent')
                            # Pull a "2016年10月1日"-style date out of the snippet
                            m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                            if m is not None:
                                # Normalize to "2016-10-1"
                                div_date = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
                                sql = """
                                    insert into scrapy.scrapy_article
                                        (tv_id, tv_name, search_type, media_name, search_restrict_url,
                                         scrapy_url, scrapy_title, scrapy_date)
                                    values (%s, %s, %s, %s, %s, %s, %s, %s)
                                """
                                value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                                Mysql.insertOne(sql, value=value, conn=conn)
                        except Exception:
                            pass
                except Exception as e:
                    print e
            # Pagination exists: harvest this first page, then follow the
            # numbered page links
            else:
                try:
                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                except Exception:
                    divs = []
                for div in divs:
                    try:
                        a = div.find_element_by_xpath('./h3/a')
                        div_title = a.get_attribute('textContent').replace(' ', '').replace('\n', '')
                        div_href = a.get_attribute('href')
                        div_content = div.get_attribute('textContent')
                        m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                        if m is not None:
                            div_date = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
                            sql = """
                                insert into scrapy.scrapy_article
                                    (tv_id, tv_name, search_type, media_name, search_restrict_url,
                                     scrapy_url, scrapy_title, scrapy_date)
                                values (%s, %s, %s, %s, %s, %s, %s, %s)
                            """
                            value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                            Mysql.insertOne(sql, value=value, conn=conn)
                    except Exception:
                        pass
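                # Walk the numbered page links; every link numbered higher than
                # any seen so far is fetched in the second browser and parsed
                # the same way as the first page.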
                try:
                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
                    max_page_num = 1
                    max_page_href = ''
                    for page_num in page_nums:
                        href = page_num.get_attribute('href')
                        page_num_text = page_num.get_attribute('textContent')
                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
                        # A plain number is a page link
                        if page_num_text.isdigit():
                            page_num_text = int(page_num_text)
                            if page_num_text > max_page_num:
                                max_page_num = page_num_text
                                max_page_href = href
                                try:
                                    driver2.get(max_page_href)
                                except Exception as e:
                                    print e
                                divs = driver2.find_elements_by_xpath('//div[@id="content_left"]/div')
                                for div in divs:
                                    try:
                                        a = div.find_element_by_xpath('./h3/a')
                                        div_title = a.get_attribute('textContent').replace(' ', '').replace('\n', '')
                                        div_href = a.get_attribute('href')
                                        div_content = div.get_attribute('textContent')
                                        m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                                        if m is not None:
                                            div_date = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
                                            sql = """
                                                insert into scrapy.scrapy_article
                                                    (tv_id, tv_name, search_type, media_name, search_restrict_url,
                                                     scrapy_url, scrapy_title, scrapy_date)
                                                values (%s, %s, %s, %s, %s, %s, %s, %s)
                                            """
                                            value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                                            Mysql.insertOne(sql, value=value, conn=conn)
                                    except Exception:
                                        pass
                        # The "next page" link marks the end of the numbered links
                        elif page_num_text == '下一页>':
                            break
                except Exception as e:
                    print e
                    continue
    driver.quit()
    driver2.quit()
    Mysql.close(conn=conn)


if __name__ == '__main__':
    scrapy_website()
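
# NOTE: PhantomJS has been retired; a minimal sketch of swapping in headless
# Chrome instead (assumes a compatible chromedriver on PATH; untested here):
#
#     from selenium.webdriver.chrome.options import Options
#     opts = Options()
#     opts.add_argument('--headless')
#     driver = webdriver.Chrome(options=opts)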