scrapy_iqiyi.py

#!/usr/bin/env python
# coding=utf-8
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql
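
# NOTE: fty_util.common.Mysql is an in-house database helper that is not part of this
# file. Judging only from how it is called below, its interface is assumed to look
# roughly like this (a sketch of the assumed API, not the real implementation):
#   conn = Mysql.createOfflineConn()               # open a connection to the offline MySQL instance
#   rows = Mysql.getAll(sql, conn=conn)            # run a SELECT, return a list of dict-like rows
#   Mysql.insertOne(sql, value=params, conn=conn)  # parameterized single-row INSERT
#   Mysql.insertMany(sql, param_rows, conn)        # parameterized multi-row INSERT
#   Mysql.execute(sql, param=params, conn=conn)    # run an UPDATE / DELETE (param is optional)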

reload(sys)
sys.setdefaultencoding('utf8')

"""
iQIYI crawling rules
1. scrapy_url: crawl the search pages and collect the URLs of the series pages they return
2. scrapy_data: open each detail page found by the search and crawl its description and the URL of every episode (the play count is shown on the episode page)
3. scrapy_play_page: open the play page of episode 1 and crawl the play count
4. todo: crawl every page once a day
The detail pages reached from the search carry neither a play count nor a comment count, so every page has to be parsed one by one:
search page --> search detail page --> play page (only episode 1 is needed) --> real detail page (crawl the play count and the comment count; the comment count cannot be crawled yet)
So it is enough to read the play count from the play page.
"""

# Crawl the search pages
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.iqiyi.com/so/q_' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
        for li in lis:
            try:
                title = li.find_element_by_xpath('./div/h3/a').get_attribute('title')
                href = li.find_element_by_xpath('./div/h3/a').get_attribute('href')
                if 'www.iqiyi.com/lib' in href:
                    print href
                    sql = """
                        insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                    """
                    value = (_id, tv_name, href, title, '', 'iqiyi')
                    Mysql.insertOne(sql, value=value, conn=conn)
                    time.sleep(1)
            except Exception, e:
                print e
                continue
        driver.quit()

# Crawl the detail pages found by the search
def scrapy_data():
    conn = Mysql.createOfflineConn()
    # sql = """
    #     select id, tv_name, url_iqiyi from scrapy.wangju_url where url_iqiyi is not null and url_iqiyi != '' and iqiyi_fenji is null order by id asc
    # """
    sql = """
        select id, tv_name, url, title from scrapy.wangju_all_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.PhantomJS()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        title = row['title']
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        # Crawl the description
        try:
            content = driver.find_element_by_xpath('//div[@class="result_detail"]').get_attribute('textContent')
        except Exception, e:
            content = ''
        # Crawl the episode list
        try:
            pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div[3]/div/ul/li')
        except Exception, e:
            # If there are no hidden episodes, fall back to the visible ones
            try:
                pagelist = driver.find_elements_by_xpath('//div[@class="mod_album_lists clearfix"]/div/ul/li')
            except Exception, e:
                pagelist = None
        if pagelist is not None:
            # If episodes exist, collect the URL of every episode so play and comment counts can be crawled later
            data_list = []
            for page in pagelist:
                num = page.find_element_by_xpath('./a').get_attribute('title')
                num = num.replace(' ', '').replace('\n', '')
                href = page.find_element_by_xpath('./a').get_attribute('href')
                if 'www.iqiyi.com' in href:
                    data_list.append((_id, tv_name, num, href, 'iqiyi'))
            # Insert the episode rows
            if data_list is not None and len(data_list) > 0:
                sql = """
                    insert into scrapy.wangju_fenji_url (id, tv_name, num, url, source) values (%s, %s, %s, %s, %s)
                """
                Mysql.insertMany(sql, data_list, conn)
        # Update the description
        sql = """
            update scrapy.wangju_all_url set content = %s where url = %s
        """
        value = (content, url)
        Mysql.execute(sql, param=value, conn=conn)
        driver.quit()

# Crawl the play pages
def scrapy_play_page():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_fenji_url where source = 'iqiyi' and num = '1' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                count = driver.find_element_by_xpath('//span[@id="widget-playcount"]').text
            except Exception, e:
                print e
                count = 0
            print count
            sql = """
                update scrapy.wangju_url set iqiyi_playtimes = '%s' where id = %s
            """
            sql = sql % (count, _id)
            Mysql.execute(sql, conn=conn)
        driver.quit()
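
# The UPDATE in scrapy_play_page above is built with string interpolation
# (sql % (count, _id)). The same statement could be run with placeholders instead,
# as the other queries in this file already do through Mysql.execute(sql, param=...,
# conn=...); a sketch, assuming the helper forwards `param` to the DB driver:
#
#   sql = """
#       update scrapy.wangju_url set iqiyi_playtimes = %s where id = %s
#   """
#   Mysql.execute(sql, param=(count, _id), conn=conn)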

# Crawl the play pages every day (iQIYI only exposes a per-episode comment count, not a per-episode play count)
def scrapy_play_page_everyday():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, num, url from scrapy.wangju_fenji_url where source = 'iqiyi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        _id = row['id']
        tv_name = row['tv_name']
        num = row['num']
        url = row['url']
        if 'www.iqiyi.com' not in url:
            driver.quit()
            sql = """
                delete from scrapy.wangju_fenji_url where url = '%s'
            """
            sql = sql % (url,)
            Mysql.execute(sql, conn=conn)
            continue
        else:
            try:
                driver.get(url)
            except Exception, e:
                print e
                driver.execute_script('window.stop()')
            try:
                commenttimes = driver.find_element_by_xpath('//a[@class="blm-tab"]/em/i').text
            except Exception, e:
                print e
                commenttimes = ''
            print url
            print commenttimes
            # sql = """
            #     insert into scrapy.wangju_fenji_data (id, tv_name, num, source, palytimes, commenttimes) values (%s, %s, %s, %s, %s, %s)
            # """
            # value = (_id, tv_name, num, 'iqiyi', playtimes, commenttimes)
        driver.quit()

def parse_wangju_all_url_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'iqiyi' and (playtimes = '' or playtimes = '0') order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    driver2 = webdriver.Firefox()
    driver2.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception, e:
            print e
            driver.execute_script('window.stop()')
        try:
            score = driver.find_element_by_xpath('//span[@class="score_font"]').text
            score = score.replace(' ', '').replace('\n', '')
        except:
            score = ''
        try:
            pagelist = driver.find_elements_by_xpath('//li[@class="album_item"]')
        except Exception, e:
            pagelist = None
        try:
            if pagelist is not None:
                page_dict = dict()
                for page in pagelist:
                    try:
                        episode = page.find_element_by_xpath('./a').get_attribute('href')
                        episode_text = page.find_element_by_xpath('./a').text
                        page_dict[episode_text] = episode
                    except:
                        continue
                if page_dict.get('1') is not None and 'www.iqiyi.com' in page_dict.get('1'):
                    try:
                        driver2.get(page_dict.get('1'))
                        time.sleep(5)
                    except Exception, e:
                        print e
                        driver2.execute_script('window.stop()')
                    try:
                        count = driver2.find_element_by_xpath('//a[@id="chartTrigger"]/span').text
                    except Exception, e:
                        print e
                        count = '0'
                    print count
                    sql = """
                        update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'iqiyi'
                    """
                    sql = sql % (score, count, url)
                    Mysql.execute(sql, conn=conn)
                else:
                    sql = """
                        delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                    """
                    sql = sql % (url, 'iqiyi')
                    Mysql.execute(sql, conn=conn)
            else:
                sql = """
                    delete from scrapy.wangju_all_url where url = '%s' and source = '%s'
                """
                sql = sql % (url, 'iqiyi')
                Mysql.execute(sql, conn=conn)
        except Exception, e:
            continue
    # Close both browsers once all rows have been processed
    driver.quit()
    driver2.quit()

if __name__ == '__main__':
    # scrapy_url()
    # scrapy_data()
    # scrapy_play_page()
    # scrapy_play_page_everyday()
    parse_wangju_all_url_data()