i_t_dsj_categories.py

#!/usr/bin/env python
# coding=utf-8
"""Crawl iQiyi TV drama categories.

Two steps:
Step 1: crawl the search result pages and record the detail-page URL of each matching TV drama.
Step 2: open each saved detail-page URL and crawl its category tags.
"""
import random
import re
import sys
import time

from selenium import webdriver

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
class DSJ_Categories(object):

    # Step 1 (iQiyi): crawl the detail-page URL for each TV drama
    def get_iqiyi_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that still need to be crawled
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and (iqiyi_url is null or iqiyi_url = '') order by id asc
        """
        # rows = conn.getAll(sql)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://so.iqiyi.com/so/q_" + tv_name + "?source=input"
            # Open the search page; stop loading on timeout
            try:
                driver.get(start_url)
            except:
                driver.execute_script('window.stop()')
            lis = driver.find_elements_by_xpath('//div[@class="mod_result"]/ul/li')
            for li in lis:
                try:
                    # A result whose first list entry reads '1' is treated as the match
                    first_num = li.find_element_by_xpath('.//div[@class="info_item mt15"]/div/div[2]/ul[1]/li[1]/a').text
                    if '1' == first_num.strip():
                        href = li.find_element_by_xpath('.//div/h3/a').get_attribute('href')
                        print href
                        sql = """
                            update scrapy.tv_category_scrapy set iqiyi_url = '%s' where id = '%s'
                        """
                        sql = sql % (href, _id)
                        # conn.update(sql)
                        Mysql.update(sql, conn=conn)
                        break
                except Exception, e:
                    print 'not found'
                    continue
        driver.quit()
    # Step 2 (iQiyi): crawl the category tags from each detail page
    def get_iqiyi_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that already have an iQiyi URL but no category tags yet
        sql = """
            select id, tv_name, iqiyi_url from scrapy.tv_category_scrapy where id > 4573 and iqiyi_url is not null and iqiyi_url != '' and iqiyi_types is null order by id asc
        """
        # rows = conn.getAll(sql)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = row['iqiyi_url']
            print url
            # Open the detail page; stop loading on timeout
            try:
                driver.get(url)
            except:
                driver.execute_script('window.stop()')
            cats = driver.find_elements_by_xpath('//div[@class="look_point"]/a')
            cats_set = set()
            for cat in cats:
                cats_set.add(cat.text.strip())
            # Save to database
            sql = """
                update scrapy.tv_category_scrapy set iqiyi_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            # conn.update(sql)
            Mysql.update(sql, conn=conn)
        driver.quit()
    # Step 1 (Tencent Video): crawl the detail-page URL for each TV drama
    def get_tengxun_url(self):
        # Open a Firefox browser
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that still need to be crawled
        sql = """
            select id, tv_name from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is null order by id asc
        """
        # rows = conn.getAll(sql)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            print tv_name
            start_url = "http://v.qq.com/x/search/?q=" + tv_name + "&stag=0"
            # Open the search page; stop loading on timeout
            try:
                driver.get(start_url)
            except:
                driver.execute_script('window.stop()')
            divs = driver.find_elements_by_xpath('//div[@class="wrapper_main"]/div')
            for div in divs:
                try:
                    href = div.find_element_by_xpath('div[1]/div/h2/a').get_attribute('href')
                    print href
                    # Only keep links that point to a detail page
                    matchObj = re.match(r'(.*)detail(.*)', href, re.M | re.I)
                    if matchObj:
                        sql = """
                            update scrapy.tv_category_scrapy set tengxun_url = %s where id = %s
                        """
                        # sql = sql % (href, _id)
                        value = (href, _id)
                        # conn.update(sql)
                        Mysql.update(sql, param=value, conn=conn)
                        break
                except Exception, e:
                    print 'not found'
                    print e
                    continue
        driver.quit()
    # Step 2 (Tencent Video): crawl the category tags from each detail page
    def get_tengxun_detail(self):
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(10)
        # Database connection
        conn = Mysql.createScrapyConn()
        # Fetch all TV dramas that already have a Tencent URL but no category tags yet
        sql = """
            select id, tv_name, tengxun_url from scrapy.tv_category_scrapy where id > 4573 and tengxun_url is not null and tengxun_url != '' and tengxun_types is null order by id asc
        """
        # rows = Mysql.getAll(sql)
        rows = Mysql.getAll(sql, conn=conn)
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            tengxun_url = row['tengxun_url']
            print tengxun_url
            # Open the detail page; stop loading on timeout
            try:
                driver.get(tengxun_url)
            except:
                driver.execute_script('window.stop()')
            cats = driver.find_elements_by_xpath('//div[@class="tag_list"]/a')
            cats_set = set()
            for cat in cats:
                cat_name = cat.text
                cats_set.add(cat_name)
            # Save to database
            sql = """
                update scrapy.tv_category_scrapy set tengxun_types = '%s' where id = '%s'
            """
            sql = sql % (' '.join(cats_set), _id)
            # conn.update(sql)
            Mysql.update(sql, conn=conn)
        driver.quit()
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'no method name given, exiting'
        sys.exit(0)
    print 'method name is ' + sys.argv[1]
    obj = DSJ_Categories()
    try:
        # Dispatch to the method named on the command line
        getattr(obj, sys.argv[1])()
    except Exception, e:
        print e
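
# Usage sketch (assumes Python 2, fty_util.common.Mysql, a reachable scrapy.tv_category_scrapy
# table, and a local Firefox driver for Selenium):
#   python i_t_dsj_categories.py get_iqiyi_url        # step 1: collect iQiyi detail-page URLs
#   python i_t_dsj_categories.py get_iqiyi_detail     # step 2: crawl iQiyi category tags
#   python i_t_dsj_categories.py get_tengxun_url      # step 1: collect Tencent Video detail-page URLs
#   python i_t_dsj_categories.py get_tengxun_detail   # step 2: crawl Tencent Video category tags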