#!/usr/bin/env python
#coding=utf-8
"""
scrapy_website_count.py: crawl marketing article counts (per TV title, per
site) from Baidu search results.
"""
import random
import sys
import time
import collections

from selenium import webdriver
from urllib import quote
from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
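
# Python 2 script: reload(sys)/setdefaultencoding and the `except Exception, e`
# syntax below are Python 2 only. fty_util.common.Mysql is an in-house DB
# wrapper; judging from its usage here, getAll() returns a list of dict rows,
# getOne() an indexable single row, and insertOne() runs a parameterized
# INSERT.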

def scrapy_website():
    conn = Mysql.createOfflineConn()

    # Load site url/name pairs into an ordered dict (update_url -> name).
    websites_dict = collections.OrderedDict()
    sql = """
        select name, update_url from odl.basic_websites order by id asc
    """
    websites = Mysql.getAll(sql, conn=conn)
    for website in websites:
        name = website['name']
        update_url = website['update_url']
        websites_dict[update_url] = name
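
    # websites_dict now maps update_url to site name, preserving the table's
    # id order, e.g. (hypothetical values):
    # OrderedDict([('news.example.com', u'Example News'), ...])
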
    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)

    sql = """
        select max(tv_id) as tv_id from scrapy.scrapy_article_count
    """
    max_tv_id = Mysql.getOne(sql, conn=conn)
    if max_tv_id is None or max_tv_id[0] == 0:
        max_id = 0
    else:
        max_id = max_tv_id[0]
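
    # Resume point: max(tv_id) already written to scrapy_article_count acts as
    # a checkpoint, so a restarted run continues from the first unprocessed
    # show instead of re-searching everything.
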
    start_url = 'http://www.baidu.com/'
    sql = """
        select tv_id, tv_name from odl.ad_tv_lib where tv_id > %s order by id asc
    """
    sql = sql % (max_id, )
    tvs = Mysql.getAll(sql, conn=conn)
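    # max_id is an integer read from our own table, so the %-substitution
    # above is harmless here; untrusted values would need real parameter
    # binding instead.
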
    for tv in tvs:
        tv_id = tv['tv_id']
        tv_name = tv['tv_name']
        for update_url in websites_dict:
            name = websites_dict.get(update_url)
            try:
                driver.get(start_url)
            except Exception, e:
                pass
            # input_box = None
            # submit_button = None
            # try:
            #     input_box = driver.find_element_by_id('kw')
            #     submit_button = driver.find_element_by_id('su')
            # except Exception, e:
            #     driver.refresh()
            # Search query: restrict matches to the show title on this site.
            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
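            # Baidu understands the intitle:/site: operators, so a query like
            # 'intitle:ShowName site:news.example.com' (hypothetical values)
            # only matches pages on that site whose title mentions the show.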
            try:
                js = 'document.getElementById("kw").value = "' + line + '"'
                driver.execute_script(js)
                js = 'document.getElementById("su").click()'
                driver.execute_script(js)
                # input_box.clear()
                # input_box.send_keys(line)
                # submit_button.click()
            except Exception, e:
                print 'click request failed'
            time.sleep(1)
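
            # The query was typed into Baidu's #kw box and submitted by
            # clicking #su via injected JS rather than the commented-out
            # send_keys() path. The injected string is not escaped, so a
            # tv_name containing a double quote would break the JS; the fixed
            # 1-second sleep stands in for an explicit wait on the results.
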
            # Pagination block.
            page = None
            try:
                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
            except:
                pass
            count = 0
            # No pagination means the results fit on one page: ten or fewer
            # hits, or none at all.
            if page is None or len(page) == 0:
                try:
                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                    if divs is not None and len(divs) > 0:
                        count = len(divs)
                except Exception, e:
                    count = 0
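            # With a single results page, each div under #content_left is
            # treated as one hit, so len(divs) above is the total count.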
            # Pagination exists: find the highest numbered page link and load
            # that last page to see how many hits it holds.
            else:
                try:
                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
                    max_page_num = 1
                    max_page_href = ''
                    for page_num in page_nums:
                        href = page_num.find_element_by_xpath('.').get_attribute('href')
                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
                        # Purely numeric text is a page-number link.
                        if page_num_text.isdigit():
                            page_num_text = int(page_num_text)
                            if page_num_text > max_page_num:
                                max_page_num = page_num_text
                                max_page_href = href
                        # '下一页>' is Baidu's "next page" link; it comes after
                        # the numbered links, so stop scanning.
                        elif page_num_text == '下一页>':
                            break
                    try:
                        driver.get(max_page_href)
                    except Exception, e:
                        pass
                    try:
                        divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                        if divs is not None and len(divs) > 0:
                            count = len(divs)
                    except Exception, e:
                        count = 0
                    count = (max_page_num - 1) * 10 + count
                except:
                    continue
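            # Baidu lists 10 hits per page, so count is (max_page_num - 1)
            # full pages of 10 plus the hits found on the last page. Persist
            # one row per (show, site) pair that produced any hits: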
            if count != 0:
                sql = """
                    insert into scrapy.scrapy_article_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count)
                    values (%s, %s, %s, %s, %s, %s)
                """
                value = (tv_id, tv_name, 1, name, line, count)
                Mysql.insertOne(sql, value=value, conn=conn)

    driver.quit()
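
# Run directly (Python 2): `python scrapy_website_count.py`. Each run resumes
# after max(tv_id) already recorded in scrapy.scrapy_article_count.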
if __name__ == '__main__':
    scrapy_website()