# scrapy_gongzhonghao_count.py
#!/usr/bin/env python
  2. #coding=utf-8
  3. import random
  4. import sys
  5. import time
  6. import collections
  7. from selenium import webdriver
  8. from urllib import quote
  9. from fty_util.common import Mysql
  10. reload(sys)
  11. sys.setdefaultencoding('utf8')
  12. def scrapy_website():
  13. conn = Mysql.createOfflineConn()
  14. # 将网站url和名称 放入有序字典中
  15. websites_dict = collections.OrderedDict()
  16. sql = """
  17. select name, account from odl.basic_weixin_subscribe where is_delete != 1 order by id asc
  18. """
  19. websites = Mysql.getAll(sql, conn=conn)
  20. for website in websites:
  21. name = website['name']
  22. account = website['account']
  23. websites_dict[account] = name
  24. driver = webdriver.Firefox()
  25. driver.set_page_load_timeout(10)
  26. start_url = 'http://weixin.sogou.com/'
  27. sql = """
  28. select tv_id, tv_name from odl.ad_tv_lib where tv_id order by id asc
  29. """
  30. tvs = Mysql.getAll(sql, conn=conn)
  31. for tv in tvs:
  32. tv_id = tv['tv_id']
  33. tv_name = tv['tv_name']
  34. try:
  35. driver.get(start_url)
  36. except Exception, e:
  37. pass
  38. try:
  39. input_box = driver.find_element_by_id('upquery')
  40. submit_button = driver.find_element_by_class_name('swz')
  41. except Exception, e:
  42. driver.refresh()
  43. # 搜索条件
  44. try:
  45. input_box.clear()
  46. input_box.send_keys(tv_name)
  47. submit_button.click()
  48. except Exception, e:
  49. print '点击请求失败'
  50. for account in websites_dict:
  51. name = websites_dict.get(account)
  52. input_box = None
  53. submit_button = None
  54. time.sleep(5)
  55. js = 'document.getElementsByClassName("time-box float")[2].style.display="block"'
  56. driver.execute_script(js)
  57. js = 'document.getElementsByClassName("s-sea")[0].value = "' + account + '"'
  58. driver.execute_script(js)
  59. js = 'document.getElementById("search_enter").click()'
  60. driver.execute_script(js)
  61. # s_sea = driver.find_element_by_class_name('s-sea')
  62. # search_enter = driver.find_element_by_id('search_enter')
  63. # s_sea.clear()
  64. # s_sea.send_keys(account)
  65. # search_enter.click()
  66. time.sleep(10)
  67. driver.execute_script('window.stop()')
  68. # driver.refresh()
  69. # 分页块
  70. page = None
  71. try:
  72. page = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
  73. except:
  74. pass
  75. count = 0
  76. # 如果分页不存在,说明记录在十条以内或没有记录
  77. if page is None or len(page) == 0:
  78. try:
  79. divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
  80. if divs is not None and len(divs) > 0:
  81. count = len(divs)
  82. except Exception, e:
  83. count = 0
  84. # 如果分页存在,判断最后一页是不是10
  85. else:
  86. try:
  87. page_nums = driver.find_elements_by_xpath('//div[@id="pagebar_container"]/a')
  88. max_page_num = 1
  89. max_page_href= ''
  90. for page_num in page_nums:
  91. href = page_num.find_element_by_xpath('.').get_attribute('href')
  92. page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
  93. page_num_text = page_num_text.replace(' ', '').replace('\n', '')
  94. # 如果只是数字
  95. if page_num_text.isdigit():
  96. page_num_text = int(page_num_text)
  97. if page_num_text > max_page_num:
  98. max_page_num = page_num_text
  99. max_page_href = href
  100. # 如果是下一页字符串
  101. elif page_num_text == '下一页':
  102. break
  103. try:
  104. driver.get(max_page_href)
  105. except Exception, e:
  106. pass
  107. try:
  108. divs = driver.find_elements_by_xpath('//ul[@class="news-list"]/li')
  109. if divs is not None and len(divs) > 0:
  110. count = len(divs)
  111. except Exception, e:
  112. count = 0
  113. count = (max_page_num - 1) * 10 + count
  114. except:
  115. continue
  116. if count != 0:
  117. sql = """
  118. insert into scrapy.scrapy_subscribe_count (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_count) values (%s, %s, %s, %s, %s, %s)
  119. """
  120. value = (tv_id, tv_name, 2, name, '', count)
  121. Mysql.insertOne(sql, value=value, conn=conn)
  122. driver.quit()
  123. if __name__ == '__main__':
  124. scrapy_website()