scrapy_tengxun.py

#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
  11. """
  12. 腾讯视频爬取规则
  13. 1、scrapy_url 通过搜索页面,爬取搜索到的最有可能是电视剧页面的url
  14. 2、scrapy_data 进入搜索到的详情页面,爬取评分,每集url(播放数量在每集页面上显示)
  15. 3、todo 爬取每页详情页
  16. 腾讯视频通过搜索到的详情页面没有播放数量和评论数量,需要一个个页面解析
  17. 搜索页面-->搜索详情页面-->播放页面(只需取第一集播放页面即可)
  18. 所以只有在播放页面爬取到播放量即可。
  19. """
def scrapy_url():
    # Step 1: search each series name and store result URLs that look like detail pages.
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'https://v.qq.com/x/search/?q=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception as e:
            # Page load timed out: stop loading and parse whatever has rendered.
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
        for div in divs:
            try:
                title = div.find_element_by_xpath('./div[1]/div/h2/a/em').text
                href = div.find_element_by_xpath('./div[1]/div/h2/a').get_attribute('href')
                if 'v.qq.com/detail' in href:
                    print(href)
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'tengxun')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception as e:
                print(e)
                continue
        driver.quit()
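# Every function in this file repeats the same load-with-timeout pattern: call
# driver.get(url) and, if the 10-second page-load timeout fires, stop the load
# so the partially rendered DOM can still be queried. A minimal sketch of that
# pattern as a reusable helper (the name safe_get is illustrative only, it is
# not part of the original script):
def safe_get(driver, url):
    try:
        driver.get(url)
    except Exception as e:
        # Timed out (or otherwise failed): stop loading and carry on with
        # whatever has rendered so far.
        print(e)
        driver.execute_script('window.stop()')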
# Scrape each detail page found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_tengxun from scrapy.wangju_url where url_tengxun is not null and url_tengxun != '' and tengxun_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            driver.execute_script('window.stop()')
        # Scrape the page text of the detail page
        try:
            content = driver.find_element_by_xpath('//div[@class="container_inner"]').get_attribute('textContent')
        except Exception as e:
            content = ''
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_episode"]/span')
            if pagelist is not None:
                data_list = []
                for page in pagelist:
                    num = page.find_element_by_xpath('./a/span').text
                    num = num.replace(' ', '').replace('\n', '')
                    href = page.find_element_by_xpath('./a').get_attribute('href')
                    if 'v.qq.com' in href:
                        data_list.append((_id, tv_name, num, href, 'tengxun'))
                # Insert the per-episode URLs
                if data_list is not None and len(data_list) > 0:
                    sql = """
                        insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                    """
                    Mysql.insertMany(sql, data_list, conn)
        except Exception as e:
            pass
        # Update the scraped content on the detail-page record
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()
# Scrape the play page (only the first episode is needed for the play count)
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'tengxun' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'v.qq.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception as e:
                print(e)
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
            except Exception as e:
                print(e)
                count = 0
            print(count)
            sql = """
                update scrapy.wangju_url set tengxun_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
            driver.quit()
# For each stored detail page, scrape the rating and the first episode's play count
def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'tengxun' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print(e)
            driver.execute_script('window.stop()')
        # Rating shown on the detail page
        try:
            score = driver.find_element_by_xpath('//div[@class="video_score"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        # Episode list on the detail page
        try:
            pagelist = driver.find_elements_by_xpath('//span[@class="item"]')
        except:
            pagelist = None
        try:
            page_dict = dict()
            if pagelist is not None:
                for page in pagelist:
                    episode = page.find_element_by_xpath('./a').get_attribute('href')
                    episode_text = page.find_element_by_xpath('./a/span').text
                    page_dict[episode_text] = episode
            if page_dict.get('1') is not None and 'v.qq.com' in page_dict.get('1'):
                # Open the first episode in the second browser to read the play count
                try:
                    driver2.get(page_dict.get('1'))
                except Exception as e:
                    print(e)
                    driver2.execute_script('window.stop()')
                try:
                    count = driver2.find_element_by_xpath('//em[@id="mod_cover_playnum"]').text
                except Exception as e:
                    print(e)
                    count = 0
                sql = """
                    update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'tengxun'
                """
                sql = sql % (score, count, url)
                Mysql.execute(sql, conn=conn)
            else:
                # No usable first-episode link: drop the record
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'tengxun')
                Mysql.execute(sql, conn=conn)
        except Exception as e:
            continue
    driver.quit()
    driver2.quit()
if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    parse_wangju_all_url_data()