scrapy_website_count_new.py 9.3 KB

#!/usr/bin/env python
#coding=utf-8
"""
Crawl marketing articles for newly released TV dramas.
"""
import collections
import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
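# reload(sys) is needed in Python 2 because site.py deletes setdefaultencoding;
# forcing utf-8 here lets the Chinese titles below mix with byte strings without
# raising UnicodeDecodeError.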


def scrapy_website():
    conn = Mysql.createOfflineConn()

    # Empty the scrapy.scrapy_article table before a fresh crawl
    sql = """
        truncate table scrapy.scrapy_article
    """
    Mysql.execute(sql, conn=conn)

    # Map each website's update_url to its name, preserving insertion order
    websites_dict = collections.OrderedDict()
    sql = """
        select name, update_url from odl.basic_websites order by id asc
    """
    websites = Mysql.getAll(sql, conn=conn)
    for website in websites:
        name = website['name']
        update_url = website['update_url']
        websites_dict[update_url] = name

    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.PhantomJS()
    driver2.set_page_load_timeout(10)
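
    # driver submits the Baidu searches; driver2 is a second headless browser used
    # only to open the numbered result pages found in the pagination bar.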
    start_url = 'http://www.baidu.com/'
    sql = """
        select id, tv_name from yxb.ad_tv_lib where source = 1 order by id asc
    """
    tvs = Mysql.getAll(sql, conn=conn)
    for tv in tvs:
        tv_id = tv['id']
        tv_name = tv['tv_name']
        for update_url in websites_dict:
            name = websites_dict.get(update_url)
            try:
                driver.get(start_url)
            except Exception, e:
                pass
            # input_box = None
            # submit_button = None
            # try:
            #     input_box = driver.find_element_by_id('kw')
            #     submit_button = driver.find_element_by_id('su')
            # except Exception, e:
            #     driver.refresh()

            # Search query: restrict Baidu to this drama on this website
            line = 'intitle:' + tv_name + ' ' + 'site:' + update_url
            print line
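            # intitle: asks Baidu to return only pages whose title contains the
            # drama name, and site: limits the hits to the target media site.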
            try:
                # input_box.clear()
                # input_box.send_keys(line)
                # submit_button.click()
                js = 'document.getElementById("kw").value = "' + line + '"'
                driver.execute_script(js)
                js = 'document.getElementById("su").click()'
                driver.execute_script(js)
            except Exception, e:
                print 'failed to submit the search request'
            # Give the results page a moment to render
            time.sleep(1)

            # Pagination bar
            page = None
            try:
                page = driver.find_elements_by_xpath('//div[@id="page"]/a')
            except:
                pass
            count = 0
            # No pagination bar: there are at most ten results (possibly none)
            if page is None or len(page) == 0:
                try:
                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                    if divs is not None and len(divs) > 0:
                        count = len(divs)
                        for div in divs:
                            try:
                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
                                div_title = div_title.replace(' ', '').replace('\n', '')
                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                                if m is not None:
                                    div_date = m.group(0)
                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日', '')
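                                    # e.g. u'2016年5月3日' becomes '2016-5-3'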
                                    sql = """
                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
                                    """
                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                                    Mysql.insertOne(sql, value=value, conn=conn)
                            except:
                                pass
                except Exception, e:
                    print e
                    count = 0
            # Pagination exists: scrape this page, then follow the numbered page links
            else:
                try:
                    divs = driver.find_elements_by_xpath('//div[@id="content_left"]/div')
                except:
                    divs = None
                if divs is not None and len(divs) > 0:
                    # count = len(divs)
                    for div in divs:
                        try:
                            try:
                                div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
                                div_title = div_title.replace(' ', '').replace('\n', '')
                                div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
                                div_content = div.find_element_by_xpath('.').get_attribute('textContent')
                                m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                                if m is not None:
                                    div_date = m.group(0)
                                    div_date = div_date.replace('年', '-').replace('月', '-').replace('日', '')
                                    sql = """
                                        insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
                                    """
                                    value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                                    Mysql.insertOne(sql, value=value, conn=conn)
                            except:
                                pass
                        except Exception, e:
                            pass
                try:
                    page_nums = driver.find_elements_by_xpath('//div[@id="page"]/a')
                    max_page_num = 1
                    max_page_href = ''
                    for page_num in page_nums:
                        href = page_num.find_element_by_xpath('.').get_attribute('href')
                        page_num_text = page_num.find_element_by_xpath('.').get_attribute('textContent')
                        page_num_text = page_num_text.replace(' ', '').replace('\n', '')
                        # Link text is a plain page number
                        if page_num_text.isdigit():
                            page_num_text = int(page_num_text)
                            if page_num_text > max_page_num:
                                max_page_num = page_num_text
                                max_page_href = href
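                                # Load the newly found page in the second browser
                                # and record its results as well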
                                try:
                                    driver2.get(max_page_href)
                                except Exception, e:
                                    print e
                                    pass
                                divs = driver2.find_elements_by_xpath('//div[@id="content_left"]/div')
                                if divs is not None and len(divs) > 0:
                                    # count = len(divs)
                                    for div in divs:
                                        try:
                                            div_title = div.find_element_by_xpath('./h3/a').get_attribute('textContent')
                                            div_title = div_title.replace(' ', '').replace('\n', '')
                                            div_href = div.find_element_by_xpath('./h3/a').get_attribute('href')
                                            div_content = div.find_element_by_xpath('.').get_attribute('textContent')
                                            m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', div_content)
                                            if m is not None:
                                                div_date = m.group(0)
                                                div_date = div_date.replace('年', '-').replace('月', '-').replace('日', '')
                                                sql = """
                                                    insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
                                                """
                                                value = (tv_id, tv_name, 1, name, line, div_href, div_title, div_date)
                                                Mysql.insertOne(sql, value=value, conn=conn)
                                        except:
                                            pass
                        # Stop once the "next page" (下一页) link is reached
                        elif page_num_text == '下一页>':
                            break
                except Exception, e:
                    print e
                    continue

    driver.quit()
    driver2.quit()
    Mysql.close(conn=conn)
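

# The per-result parsing and insert logic above appears three times. A helper along
# the lines of this sketch could replace those copies; save_results and its
# parameters are hypothetical names (not part of the original script), and it assumes
# the same Mysql helper, table schema and Selenium elements used in scrapy_website().
def save_results(divs, tv_id, tv_name, media_name, search_line, conn):
    """Parse Baidu result divs and insert one row per article that carries a date."""
    for div in divs:
        try:
            link = div.find_element_by_xpath('./h3/a')
            title = link.get_attribute('textContent').replace(' ', '').replace('\n', '')
            href = link.get_attribute('href')
            content = div.get_attribute('textContent')
            m = re.search(ur'[0-9]+年[0-9]+月[0-9]+日', content)
            if m is None:
                continue
            date = m.group(0).replace('年', '-').replace('月', '-').replace('日', '')
            sql = """
                insert into scrapy.scrapy_article (tv_id, tv_name, search_type, media_name, search_restrict_url, scrapy_url, scrapy_title, scrapy_date) values (%s, %s, %s, %s, %s, %s, %s, %s)
            """
            value = (tv_id, tv_name, 1, media_name, search_line, href, title, date)
            Mysql.insertOne(sql, value=value, conn=conn)
        except Exception:
            pass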


if __name__ == '__main__':
    scrapy_website()