scrapy_pptv.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. #/usr/bin/env python
  2. #coding=utf-8
  3. import random
  4. import sys
  5. import time
  6. from selenium import webdriver
  7. from urllib import quote
  8. from fty_util.common import Mysql
  9. reload(sys)
  10. sys.setdefaultencoding('utf8')
  11. def scrapy_url():
  12. conn = Mysql.createOfflineConn()
  13. sql = """
  14. select id, tv_name from scrapy.wangju_url where url_pptv is null order by id asc
  15. """
  16. rows = Mysql.getAll(sql, conn=conn)
  17. driver = webdriver.Firefox()
  18. driver.set_page_load_timeout(10)
  19. for row in rows:
  20. _id = row['id']
  21. tv_name = row['tv_name']
  22. url = 'http://search.pptv.com/s_video?kw=' + quote(str(tv_name))
  23. need_blank = True
  24. try:
  25. driver.get(url)
  26. except Exception, e:
  27. driver.execute_script('window.stop()')
  28. divs = driver.find_elements_by_xpath('//div[@id="search-result"]/div')
  29. href_list = []
  30. for div in divs:
  31. try:
  32. href = div.find_element_by_xpath('./div[2]/dl/dd/p/a').get_attribute('href')
  33. href_list.append(href)
  34. except Exception, e:
  35. pass
  36. if len(href_list) > 0:
  37. sql = """
  38. update scrapy.wangju_url set url_pptv = '%s' where id = %s
  39. """
  40. sql = sql % (','.join(href_list), _id)
  41. Mysql.execute(sql, conn=conn)
  42. need_blank = False
  43. if need_blank:
  44. sql = """
  45. update scrapy.wangju_url set url_pptv = '%s' where id = %s
  46. """
  47. sql = sql % ('', _id)
  48. Mysql.execute(sql, conn=conn)
  49. driver.quit()
  50. def parse_unique_url():
  51. conn = Mysql.createOfflineConn()
  52. sql = """
  53. select id, tv_name, url_pptv from scrapy.wangju_url where url_pptv is not null and url_pptv != '' and pptv_finished is null order by id asc
  54. """
  55. rows = Mysql.getAll(sql, conn=conn)
  56. driver = webdriver.Firefox()
  57. driver.set_page_load_timeout(10)
  58. for row in rows:
  59. _id = row['id']
  60. tv_name = row['tv_name']
  61. url_pptv = row['url_pptv']
  62. urls = url_pptv.split(',')
  63. for url in urls:
  64. try:
  65. driver.get(url)
  66. except Exception, e:
  67. try:
  68. driver.execute_script('window.stop()')
  69. except:
  70. continue
  71. try:
  72. nav_type = driver.find_element_by_xpath('//div[@class="module module-bread-nav cf"]/p/a').text
  73. if nav_type != u'电视剧':
  74. continue
  75. else:
  76. title = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[1]/h3').text
  77. content = driver.find_element_by_xpath('//div[@class="module-dpage-info"]/div[2]').get_attribute('textContent')
  78. sql = """
  79. insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
  80. """
  81. value = (_id, tv_name, url, title, content, 'pptv')
  82. Mysql.insertOne(sql, value=value, conn=conn)
  83. except Exception, e:
  84. pass
  85. sql = """
  86. update scrapy.wangju_url set pptv_finished = '%s' where id = %s
  87. """
  88. sql = sql % ('1', _id)
  89. Mysql.execute(sql, conn=conn)
  90. driver.quit()
def scrapy_fenji():
    """Placeholder: per-episode (分集) scraping is not implemented yet."""
    pass
  93. def parse_content():
  94. conn = Mysql.createOfflineConn()
  95. sql = """
  96. select id, tv_name, url, content from scrapy.wangju_all_url where source = 'pptv' order by id asc
  97. """
  98. rows = Mysql.getAll(sql, conn=conn)
  99. for row in rows:
  100. _id = row['id']
  101. tv_name = row['tv_name']
  102. url = row['url']
  103. content = row['content']
  104. import re
  105. m = re.search(ur'评分:\d+(.)\d+', content)
  106. score = '0'
  107. if m is not None:
  108. score = m.group(0)
  109. play = '0'
  110. m = re.search(ur'播放:\d+(.)\d+[(亿)(万)]', content)
  111. if m is not None:
  112. play = m.group(0)
  113. sql = """
  114. update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'pptv'
  115. """
  116. sql = sql % (score, play, url)
  117. Mysql.execute(sql, conn=conn)
if __name__ == '__main__':
    # Pipeline stages; uncomment/run one stage at a time, in order.
    # scrapy_url()
    # parse_unique_url()
    parse_content()