#!/usr/bin/env python
# coding=utf-8
"""
Scrape per-site Baidu result counts for marketing articles about TV shows.
(Original title: 营销文章数量爬取)
"""
import collections
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

# Python 2 only: site.py removes sys.setdefaultencoding from builtins, so
# reload(sys) is needed to restore it before forcing utf-8 as the default
# encoding for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf8')
def scrapy_website():
    """Scrape per-site Baidu result counts for newly added TV shows.

    For every TV show in ``odl.ad_tv_lib`` whose ``tv_id`` is greater than
    the largest one already recorded in ``scrapy.scrapy_article_count``,
    run a Baidu search restricted to each media site
    (``intitle:<tv_name> site:<update_url>``), estimate the number of
    result entries, and insert non-zero counts into
    ``scrapy.scrapy_article_count``.
    """
    conn = Mysql.createOfflineConn()
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    try:
        # Ordered mapping keeps the sites in their DB (id) order.
        websites_dict = _load_websites(conn)
        max_id = _last_counted_tv_id(conn)
        start_url = 'http://www.baidu.com/'
        sql = """
        select tv_id, tv_name from odl.ad_tv_lib where tv_id > %s order by id asc
        """
        # max_id is an integer read from our own DB, not user input, so the
        # plain %-interpolation is tolerated; prefer a parameterized query
        # if the Mysql helper supports one.
        sql = sql % (max_id, )
        tvs = Mysql.getAll(sql, conn=conn)
        for tv in tvs:
            tv_id = tv['tv_id']
            tv_name = tv['tv_name']
            for update_url in websites_dict:
                name = websites_dict[update_url]
                # Search restriction, e.g. "intitle:Foo site:example.com".
                line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
                count = _scrape_result_count(driver, start_url, line)
                if count != 0:
                    sql = """
                    insert into scrapy.scrapy_article_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (tv_id, tv_name, 1, name, line, count)
                    Mysql.insertOne(sql, value=value, conn=conn)
    finally:
        # Always release the PhantomJS process, even when an exception
        # escapes the loops (the original leaked it in that case).
        driver.quit()


def _load_websites(conn):
    """Return an OrderedDict mapping update_url -> site name, in id order."""
    websites_dict = collections.OrderedDict()
    sql = """
    select name, update_url from odl.basic_websites order by id asc
    """
    for website in Mysql.getAll(sql, conn=conn):
        websites_dict[website['update_url']] = website['name']
    return websites_dict


def _last_counted_tv_id(conn):
    """Return the largest tv_id already counted, or 0 when there is none."""
    sql = """
    select max(tv_id) as tv_id from scrapy.scrapy_article_count
    """
    row = Mysql.getOne(sql, conn=conn)
    # max() over an empty table yields NULL/None; the original compared it
    # only against 0, letting None leak into the follow-up query
    # ("tv_id > None").
    if row is None or row[0] is None or row[0] == 0:
        return 0
    return row[0]


def _scrape_result_count(driver, start_url, query):
    """Run one Baidu search for *query* and return the estimated number of
    result entries (0 when nothing was found or scraping failed)."""
    try:
        driver.get(start_url)
    except Exception:
        # Page-load timeouts are tolerated; the page is often still usable.
        pass
    try:
        # Fill the search box and click the submit button via JS; this is
        # more robust than locating the elements through the driver API.
        driver.execute_script('document.getElementById("kw").value = "' + query + '"')
        driver.execute_script('document.getElementById("su").click()')
    except Exception:
        print('点击请求失败')
    time.sleep(1)
    # Pagination block under the results.
    try:
        page = driver.find_elements_by_xpath('//div[@id="page"]/a')
    except Exception:
        page = None
    # No pagination: at most one page of results (ten entries or fewer).
    if not page:
        return _count_entries(driver)
    try:
        max_page_num = 1
        max_page_href = ''
        for anchor in driver.find_elements_by_xpath('//div[@id="page"]/a'):
            # Reading the attributes directly; the original took a detour
            # through find_element_by_xpath('.') for no effect.
            href = anchor.get_attribute('href')
            text = anchor.get_attribute('textContent')
            text = text.replace(' ', '').replace('\n', '')
            if text.isdigit():
                num = int(text)
                if num > max_page_num:
                    max_page_num = num
                    max_page_href = href
            elif text == '下一页>':
                # The "next page" link marks the end of the numbered links.
                break
        try:
            driver.get(max_page_href)
        except Exception:
            pass
        # Every full page before the last one holds 10 entries.
        return (max_page_num - 1) * 10 + _count_entries(driver)
    except Exception:
        # Mirror the original behaviour: any scraping error means "skip
        # this site" (no row is inserted for a zero count).
        return 0


def _count_entries(driver):
    """Count result <div>s in Baidu's left content column (0 on failure)."""
    try:
        divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
    except Exception:
        return 0
    return len(divs) if divs else 0
# Run the scraper when this file is executed as a script.
if __name__ == '__main__':
    scrapy_website()