scrapy_leshi.py

#!/usr/bin/env python
#coding=utf-8
import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')

"""
LeTV (乐视, le.com) video scraping rules.
"""

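# Pipeline (stages are run one at a time from the __main__ block at the bottom):
#   1. scrapy_url()                 - search so.le.com for each tv_name and store the result links in wangju_url.url_leshi
#   2. scrapy_data()                - open each stored link and save the detail page's Info text into wangju_all_url
#   3. parse_wangju_all_url_title() - fill in the title column for the saved pages
#   4. parse_content()              - extract score and play count from the saved Info text
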
def scrapy_url():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_leshi is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.le.com/s?wd=' + quote(str(tv_name))
        try:
            driver.get(url)
        except Exception as e:
            # page-load timeout: stop loading and work with whatever has rendered
            driver.execute_script('window.stop()')

        # collect the detail-page links from the search results
        divs = driver.find_elements_by_xpath('//div[@class="So-detail Tv-so"]')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div/div[2]/div[1]/h1/a').get_attribute('href')
                href_list.append(href)
            except Exception as e:
                pass

        need_blank = True
        if len(href_list) > 0:
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
            need_blank = False
        if need_blank:
            # no result found: write an empty string so the row is not picked up again
            sql = """
                update scrapy.wangju_url set url_leshi = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()

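# Optional helper (not used above): a minimal sketch of waiting explicitly for the
# search-result container instead of relying on the page-load timeout plus
# window.stop(). The XPath matches the one used in scrapy_url(); the 10-second
# timeout is an assumption, not a value taken from the original flow.
def _wait_for_search_results(driver, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@class="So-detail Tv-so"]')))
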
def scrapy_data():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_leshi from scrapy.wangju_url where url_leshi is not null and url_leshi != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_leshi = row['url_leshi']
        urls = url_leshi.split(',')
        for url in urls:
            # only follow links on the main le.com site
            if 'www.le.com' not in url:
                continue
            try:
                driver.get(url)
            except Exception as e:
                driver.execute_script('window.stop()')

            # link from the episode page to the album (series) page
            try:
                href = driver.find_element_by_xpath('//div[@id="j-adv-tv"]/div[2]/div[1]/div[2]/div[1]/div[2]/dl[1]/dt/a').get_attribute('href')
            except Exception as e:
                href = None

            if href is not None and 'www.le.com' in href:
                print href
                try:
                    driver.get(href)
                except Exception as e:
                    driver.execute_script('window.stop()')
                # raw text of the Info box; score and play count are parsed out of it later
                try:
                    content = driver.find_element_by_xpath('//div[@class="Info"]').get_attribute('textContent')
                except Exception as e:
                    continue
                sql = """
                    insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
                """
                value = (_id, tv_name, url, '', content, 'leshi')
                Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()

def parse_wangju_all_url_title():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)

    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)

    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        try:
            driver.get(url)
        except Exception as e:
            print e
            driver.execute_script('window.stop()')
        try:
            title = driver.find_element_by_xpath('//div[@class="listPic active"]/div[1]/p/i').text
        except Exception as e:
            title = ''
        sql = """
            update scrapy.wangju_all_url set title = '%s' where source = '%s' and url = '%s'
        """
        sql = sql % (title, 'leshi', url)
        Mysql.execute(sql, conn=conn)
    driver.quit()

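# The title is interpolated straight into the update statement above, so a title
# containing a single quote would break the SQL. A minimal escaping sketch
# (assuming the Mysql helper offers no parameter-binding variant of execute()):
def _escape_sql_value(value):
    # escape backslashes first, then single quotes
    return value.replace("\\", "\\\\").replace("'", "\\'")
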
def parse_content():
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'leshi' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']

        # the first number in the Info text is taken as the score
        m = re.search(ur'([0-9]+[.]?)+', content)
        score = '0'
        if m is not None:
            score = m.group(0)

        # play count, e.g. "播放数:3.2亿" or "播放数:5600万"
        play = '0'
        m = re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content)
        if m is not None:
            play = m.group(0)

        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'leshi'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)

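# Example of what the two patterns in parse_content() are expected to pull out of
# the Info text. The sample string below is an assumption about the page format,
# not text captured from le.com:
#   content = u'7.9 播放数:3.2亿 ...'
#   re.search(ur'([0-9]+[.]?)+', content).group(0)                   -> u'7.9'
#   re.search(ur'播放数:([0-9]+[.]?)+[(亿)(万)]', content).group(0)   -> u'播放数:3.2亿'
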
if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    # parse_wangju_all_url_title()
    parse_content()