i_t_dsj_all.py

#!/usr/bin/env python
# coding=utf-8
"""Crawl iQiyi TV series category data.

Done in two steps:
Step 1: crawl the search/list result pages and save the TV series that match the filters.
Step 2: crawl the category information from each saved detail-page URL.
The same two-step flow is also implemented for Tencent Video (tengxun) below.
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

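# The fty_util.common.Mysql helper is an in-house wrapper whose implementation is
# not shown in this file. From the call sites below it is assumed to expose roughly
# this interface (a sketch inferred from usage, not the real module):
#
#   conn = Mysql.createScrapyConn()                 # open a connection to the scrapy schema
#   row  = Mysql.getOne(sql, conn=conn)             # fetch a single row as a tuple
#   rows = Mysql.getAll(sql, conn=conn)             # fetch all rows as dict-like records
#   Mysql.insertOne(sql, value=value, conn=conn)    # parameterized single insert
#   Mysql.insertMany(sql, data_list, conn)          # parameterized bulk insert
#   Mysql.update(sql, conn=conn)                    # execute an already-formatted update
#
# Destination tables referenced by the SQL in this script (column names come from
# the statements below; anything beyond the names is an assumption):
#
#   scrapy.iqiyi_dianshiju_url      (id, url)
#   scrapy.iqiyi_dianshiju_detail   (id, tv_name, detail_info_text, detail_info_html, url)
#   scrapy.tengxun_dianshiju_url    (id, url, detail_url)
#   scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url)
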
class DSJ_All(object):

    # Crawl TV series list-page links (iQiyi)
    def get_iqiyi_url(self):
        # List pages to crawl
        start_urls = [
            # 'http://www.iqiyi.com/lib/dianshiju/,,2017_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2016_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2015_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2014-2011_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,2010-2000_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,90%E5%B9%B4%E4%BB%A3_4_1.html',
            # 'http://www.iqiyi.com/lib/dianshiju/,,80%E5%B9%B4%E4%BB%A3_4_1.html',
            'http://www.iqiyi.com/lib/dianshiju/,,%E6%9B%B4%E6%97%A9_4_1.html'
        ]
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()
        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            is_next = True
            while is_next:
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod-page"]/a')[-1]
                except Exception:
                    # No pagination bar found, stop paging this list
                    is_next = False
                    continue
                lis = driver.find_elements_by_xpath('//div[@class="wrapper-piclist"]/ul/li')
                sql_insert = """
                    insert into scrapy.iqiyi_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('.//div[1]/a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'no link found'
                        continue
                    time.sleep(random.uniform(0, 2))
                Mysql.insertMany(sql_insert, data_list, conn)
                # Follow pagination until there is no "next page" link
                try:
                    next_page_text = next_page.text
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except Exception:
                    is_next = False
                time.sleep(10)
        driver.quit()
    # Crawl each saved iQiyi detail page
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Resume after the largest id that has already been crawled
        sql = """
            select max(id) from scrapy.iqiyi_dianshiju_detail
        """
        max_id = Mysql.getOne(sql, conn=conn)
        max_id = max_id[0]
        if max_id is None:
            max_id = 0
        # Fetch all URLs that still need crawling
        sql = """
            select id, url from scrapy.iqiyi_dianshiju_url where id > '%s' order by id asc
        """
        sql = sql % (max_id)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            print url
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            detail_info = driver.find_element_by_xpath('//div[@class="result_detail"]')
            # Detail HTML content
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail text content
            detail_info_text = detail_info.text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('h1/a').text
            # Save to the database
            sql = """
                insert into scrapy.iqiyi_dianshiju_detail (id, tv_name, detail_info_text, detail_info_html, url) values (%s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, detail_info_text, detail_info_html, url)
            Mysql.insertOne(sql, value=value, conn=conn)
            time.sleep(random.uniform(1, 5))
        driver.quit()
    # Crawl TV series list-page links (Tencent Video)
    def get_tengxun_url(self):
        start_urls = [
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=2017',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=859',
            # 'http://v.qq.com/x/list/tv?offset=0&sort=5&iyear=860',
            # 'http://v.qq.com/x/list/tv?iyear=861&offset=0&sort=5',
            # 'http://v.qq.com/x/list/tv?sort=5&offset=0&iyear=862',
            # 'http://v.qq.com/x/list/tv?iyear=863&sort=5&offset=0',
            # 'http://v.qq.com/x/list/tv?sort=5&iyear=864&offset=0',
            'http://v.qq.com/x/list/tv?iyear=865&sort=5&offset=0',
            'http://v.qq.com/x/list/tv?iyear=866&offset=0&sort=5'
        ]
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()
        for url in start_urls:
            # Open the list page
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            is_next = True
            while is_next:
                lis = driver.find_elements_by_xpath('//div[@class="mod_bd"]/div/ul/li')
                print lis
                sql_insert = """
                    insert into scrapy.tengxun_dianshiju_url (url) values (%s)
                """
                data_list = []
                for li in lis:
                    try:
                        tv_url = li.find_element_by_xpath('a').get_attribute('href')
                        print tv_url
                        data_list.append((tv_url,))
                    except Exception as e:
                        print 'no link found'
                        continue
                    time.sleep(1)
                Mysql.insertMany(sql_insert, data_list, conn)
                # Follow pagination until the "next page" link is exhausted
                try:
                    next_page = driver.find_elements_by_xpath('//div[@class="mod_pages"]/a')[-1]
                except Exception:
                    is_next = False
                    continue
                try:
                    next_page_text = next_page.text
                    next_page_url = next_page.get_attribute('href')
                    if next_page_url == 'javascript:;':
                        is_next = False
                        continue
                    if next_page_text == '下一页':
                        next_page.click()
                    else:
                        is_next = False
                except Exception:
                    is_next = False
                time.sleep(10)
        driver.quit()
    # Resolve the detail-page URL for each saved Tencent Video list entry
    def get_tengxun_detail_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        # Database connection
        conn = Mysql.createScrapyConn()
        sql = """
            select id, url from scrapy.tengxun_dianshiju_url where detail_url is null or detail_url = '' order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            url = row['url']
            # Open the page
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            # Some URLs redirect directly to the detail page
            if re.match(r'(.*)detail(.*)', driver.current_url):
                print driver.current_url
                sql = """
                    update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                """
                sql = sql % (driver.current_url, _id)
                Mysql.update(sql, conn=conn)
                continue
            # Otherwise look for an album link that points to the detail page
            try:
                a_list = driver.find_elements_by_xpath('//a[@class="album_title"]')
                print a_list
                for a in a_list:
                    detail_href = a.get_attribute('href')
                    if re.match(r'(.*)detail(.*)', detail_href):
                        print detail_href
                        sql = """
                            update scrapy.tengxun_dianshiju_url set detail_url = '%s' where id = '%s'
                        """
                        sql = sql % (detail_href, _id)
                        Mysql.update(sql, conn=conn)
                        break
            except Exception as e:
                print e
            time.sleep(random.uniform(0, 3))
        driver.quit()
    # Crawl each resolved Tencent Video detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV series that need crawling
        sql = """
            select url, detail_url from scrapy.tengxun_dianshiju_url order by id asc
        """
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            url = row['url']
            detail_url = row['detail_url']
            try:
                driver.get(detail_url)
            except Exception:
                driver.execute_script('window.stop()')
            detail_info = driver.find_element_by_xpath('//div[@class="container_inner"]')
            # Detail HTML content
            detail_info_html = detail_info.get_attribute('innerHTML')
            # Detail text content
            detail_info_text = detail_info.text
            # TV series name
            tv_name = detail_info.find_element_by_xpath('.//div[@class="video_title_collect cf"]/h1/a').text
            # Save to the database (parameterized so quotes in the scraped text cannot break the SQL)
            sql = """
                insert into scrapy.tengxun_dianshiju_detail (tv_name, detail_info_text, detail_info_html, cover_url, detail_url) values (%s, %s, %s, %s, %s)
            """
            value = (tv_name, detail_info_text, detail_info_html, url, detail_url)
            Mysql.insertOne(sql, value=value, conn=conn)
        driver.quit()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'No method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_All()
    try:
        getattr(obj, sys.argv[1])()
    except Exception as e:
        print e
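
# Typical invocation: the first CLI argument selects which step to run, e.g.
#
#   python i_t_dsj_all.py get_iqiyi_url            # step 1: collect iQiyi list-page URLs
#   python i_t_dsj_all.py get_iqiyi_detail         # step 2: crawl each saved iQiyi detail page
#   python i_t_dsj_all.py get_tengxun_url
#   python i_t_dsj_all.py get_tengxun_detail_url
#   python i_t_dsj_all.py get_tengxun_detail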