#!/usr/bin/env python
# coding=utf-8
"""TV-series category crawler (iqiyi + Tencent video).

Two steps per site:
  1. Crawl the search-result pages to find the matching TV series.
  2. Crawl the category tags from each saved detail-page URL.
"""
import random
import re
import sys
import time
from urllib import quote

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
  16. class DSJ_Categories(object):
  17. # 爬取电视剧链接地址
  18. def get_iqiyi_url(self):
  19. # 打开Firefox浏览器
  20. # driver = webdriver.Firefox()
  21. driver = webdriver.PhantomJS()
  22. driver.set_page_load_timeout(10)
  23. # 数据库连接
  24. conn = Mysql.createScrapyConn()
  25. # 获取所有需要爬取的电视剧
  26. # sql = """
  27. # select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
  28. # """
  29. sql = """
  30. select id, tv_name from scrapy.tv_category_scrapy where id > 5598 order by id asc
  31. """
  32. # rows = conn.getAll(sql)
  33. rows = Mysql.getAll(sql, conn=conn)
  34. for row in rows:
  35. _id = row['id']
  36. tv_name = row['tv_name']
  37. print tv_name
  38. start_url = "http://so.iqiyi.com/so/q_" + quote(str(tv_name)) + "?source=input"
  39. # 打开主页
  40. try:
  41. driver.get(start_url)
  42. except:
  43. driver.execute_script('window.stop()')
  44. lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
  45. for li in lis:
  46. try:
  47. first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
  48. if '1' == first_num.strip():
  49. href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
  50. print href
  51. sql = """
  52. update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
  53. """
  54. sql = sql % (href, _id)
  55. # conn.update(sql)
  56. Mysql.update(sql, conn=conn)
  57. break
  58. except Exception, e:
  59. print '没有'
  60. continue
  61. break
  62. driver.quit()
  63. # 爬取具体页面
  64. def get_iqiyi_detail(self):
  65. driver = webdriver.PhantomJS()
  66. driver.set_page_load_timeout(10)
  67. # 数据库连接
  68. conn = Mysql.createScrapyConn()
  69. # 获取所有需要爬取的电视剧
  70. sql = """
  71. select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
  72. """
  73. # rows = conn.getAll(sql)
  74. rows = Mysql.getAll(sql, conn=conn)
  75. for row in rows:
  76. _id = row['id']
  77. tv_name = row['tv_name']
  78. url = row['iqiyi_url']
  79. print url
  80. try:
  81. driver.get(url)
  82. except:
  83. driver.execute_script('window.stop()')
  84. cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
  85. cats_set = set()
  86. for cat in cats:
  87. cats_set.add(cat.find_element_by_xpath('.').text.strip())
  88. #存入数据库
  89. sql = """
  90. update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
  91. """
  92. sql = sql % (' '.join(cats_set), _id)
  93. # conn.update(sql)
  94. Mysql.update(sql, conn=conn)
  95. driver.quit()
  96. # 爬取电视剧链接地址
  97. def get_tengxun_url(self):
  98. # 打开Firefox浏览器
  99. driver = webdriver.PhantomJS()
  100. driver.set_page_load_timeout(10)
  101. # 数据库连接
  102. conn = Mysql.createScrapyConn()
  103. # 获取所有需要爬取的电视剧
  104. sql = """
  105. select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
  106. """
  107. # rows = conn.getAll(sql)
  108. rows = Mysql.getAll(sql, conn=conn)
  109. for row in rows:
  110. _id = row['id']
  111. tv_name = row['tv_name']
  112. print tv_name
  113. start_url = "http://v.qq.com/x/search/?q=" + quote(str(tv_name)) + "&stag=0"
  114. # 打开主页
  115. try:
  116. driver.get(start_url)
  117. except:
  118. driver.execute_script('window.stop()')
  119. divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
  120. for div in divs:
  121. try:
  122. href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
  123. print href
  124. matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
  125. if matchObj:
  126. sql = """
  127. update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
  128. """
  129. # sql = sql % (href, _id)
  130. value = (href, _id)
  131. # conn.update(sql)
  132. Mysql.update(sql, param=value, conn=conn)
  133. break
  134. except Exception, e:
  135. print '没有'
  136. print e
  137. continue
  138. driver.quit()
  139. # 爬取具体页面
  140. def get_tengxun_detail(self):
  141. driver = webdriver.PhantomJS()
  142. driver.set_page_load_timeout(10)
  143. # 数据库连接
  144. conn = Mysql.createScrapyConn()
  145. # 获取所有需要爬取的电视剧
  146. sql = """
  147. select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
  148. """
  149. # rows = Mysql.getAll(sql)
  150. rows = Mysql.getAll(sql, conn=conn)
  151. for row in rows:
  152. _id = row['id']
  153. tv_name = row['tv_name']
  154. tengxun_url = row['tengxun_url']
  155. print tengxun_url
  156. # 打开主页
  157. try:
  158. driver.get(tengxun_url)
  159. except:
  160. driver.execute_script('window.stop()')
  161. cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
  162. cats_set = set()
  163. for cat in cats:
  164. cat_name = cat.find_element_by_xpath('.').text
  165. cats_set.add(cat_name)
  166. #存入数据库
  167. sql = """
  168. update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
  169. """
  170. sql = sql % (' '.join(cats_set), _id)
  171. # conn.update(sql)
  172. Mysql.update(sql, conn=conn)
  173. driver.quit()
  174. if __name__ == '__main__':
  175. if len(sys.argv) != 2:
  176. print '没有输入参数,退出'
  177. sys.exit(0)
  178. print 'method name is ' + sys.argv[1]
  179. obj = DSJ_Categories()
  180. try:
  181. getattr(obj, sys.argv[1])()
  182. except Exception, e:
  183. print e