scrapy_sohu.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape so.tv.sohu.com for the TV shows listed in scrapy.wangju_url: collect
search-result URLs, fetch the detail pages, then parse score and play counts."""
import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_url():
    """Search each TV name on so.tv.sohu.com and store the result URLs in wangju_url."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_sohu is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://so.tv.sohu.com/mts?box=1&wd=' + quote(str(tv_name))
        need_blank = True
        try:
            driver.get(url)
        except Exception:
            # Load timed out: stop the page and work with whatever has rendered.
            driver.execute_script('window.stop()')
        # Collect the detail-page link of every result block on the search page.
        divs = driver.find_elements_by_xpath('//div[@class="wrap cfix"]/div')
        href_list = []
        for div in divs:
            try:
                href = div.find_element_by_xpath('./div/div[2]/div[1]/h2/a').get_attribute('href')
                href_list.append(href)
            except Exception:
                pass
        if len(href_list) > 0:
            sql = """
                update scrapy.wangju_url set url_sohu = '%s' where id = %s
            """
            sql = sql % (','.join(href_list), _id)
            Mysql.execute(sql, conn=conn)
            need_blank = False
        if need_blank:
            # No result: store an empty string so the row is not picked up again.
            sql = """
                update scrapy.wangju_url set url_sohu = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()


def scrapy_data():
    """Open every stored Sohu URL and save its title and description text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_sohu from scrapy.wangju_url where url_sohu is not null and url_sohu != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_sohu = row['url_sohu']
        urls = url_sohu.split(',')
        for url in urls:
            try:
                driver.get(url)
            except Exception:
                driver.execute_script('window.stop()')
            try:
                # First detail-page layout: description block only, no title block.
                title = ''
                content = driver.find_element_by_xpath('//div[@class="infoR r"]').get_attribute('textContent')
            except Exception:
                try:
                    # Alternative layout: separate title and description blocks.
                    title = driver.find_element_by_xpath('//div[@class="drama-name area rel cfix "]').get_attribute('textContent')
                    content = driver.find_element_by_xpath('//div[@class="drama-infoR"]').get_attribute('textContent')
                except Exception:
                    continue
            sql = """
                insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, url, title, content, 'sohu')
            Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()


def parse_wangju_all_url_data():
    """Parse the score and total play count out of the saved page text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'sohu' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']
        # Score, e.g. text like "评分:8.5" ("rating: 8.5").
        score = '0'
        m = re.search(ur'评分:\d+(.)\d+', content)
        if m is not None:
            score = m.group(0)
        # Total plays, e.g. text like "总播放:1.2亿" ("total plays: 120 million").
        play = '0'
        m = re.search(ur'总播放:\d+(.)\d+[(亿)(万)]', content)
        if m is not None:
            play = m.group(0)
        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'sohu'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)
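

# A minimal sketch of a hypothetical helper (not called above) showing how the bare
# number could be captured instead of the full matched text such as "评分:8.5":
def extract_number(pattern, text, default='0'):
    """Return the first captured group of pattern found in text, or default."""
    m = re.search(pattern, text)
    return m.group(1) if m is not None else default
# e.g. extract_number(ur'评分:(\d+\.?\d*)', content) would give '8.5'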


if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    parse_wangju_all_url_data()
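
# Run order (inferred from the columns each step reads and writes; Firefox with a
# matching Selenium driver and the fty_util MySQL helper are assumed to be set up):
#   scrapy_url()                 -- search so.tv.sohu.com and fill wangju_url.url_sohu
#   scrapy_data()                -- fetch each stored URL and fill wangju_all_url
#   parse_wangju_all_url_data()  -- parse score/playtimes from wangju_all_url.content
# Uncomment the matching call in the __main__ block above to run a single step.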