# scrapy_huashutv.py
  1. #/usr/bin/env python
  2. #coding=utf-8
  3. import random
  4. import sys
  5. import time
  6. from selenium import webdriver
  7. from urllib import quote
  8. from fty_util.common import Mysql
  9. reload(sys)
  10. sys.setdefaultencoding('utf8')
  11. def scrapy_url():
  12. conn = Mysql.createOfflineConn()
  13. sql = """
  14. select id, tv_name from scrapy.wangju_url where url_huashutv is null order by id asc
  15. """
  16. rows = Mysql.getAll(sql, conn=conn)
  17. driver = webdriver.Firefox()
  18. driver.set_page_load_timeout(10)
  19. for row in rows:
  20. _id = row['id']
  21. tv_name = row['tv_name']
  22. url = 'http://www.wasu.cn/Search/show/k/' + quote(str(tv_name))
  23. need_blank = True
  24. try:
  25. driver.get(url)
  26. except Exception, e:
  27. driver.execute_script('window.stop()')
  28. divs = driver.find_elements_by_xpath('//div[@id="agg_list"]/div')
  29. href_list = []
  30. for div in divs:
  31. try:
  32. href = div.find_element_by_xpath('./div[1]/a[1]').get_attribute('href')
  33. href_list.append(href)
  34. except Exception, e:
  35. pass
  36. if len(href_list) > 0:
  37. sql = """
  38. update scrapy.wangju_url set url_huashutv = '%s' where id = %s
  39. """
  40. sql = sql % (','.join(href_list), _id)
  41. Mysql.execute(sql, conn=conn)
  42. need_blank = False
  43. if need_blank:
  44. sql = """
  45. update scrapy.wangju_url set url_huashutv = '%s' where id = %s
  46. """
  47. sql = sql % ('', _id)
  48. Mysql.execute(sql, conn=conn)
  49. driver.quit()
  50. def scrapy_data():
  51. conn = Mysql.createOfflineConn()
  52. sql = """
  53. select id, tv_name, url_huashutv from scrapy.wangju_url where url_huashutv is not null and url_huashutv != '' order by id asc
  54. """
  55. rows = Mysql.getAll(sql, conn=conn)
  56. driver = webdriver.Firefox()
  57. driver.set_page_load_timeout(10)
  58. for row in rows:
  59. _id = row['id']
  60. tv_name = row['tv_name']
  61. url_huashutv = row['url_huashutv']
  62. urls = url_huashutv.split(',')
  63. for url in urls:
  64. if 'www.wasu.cn' not in url:
  65. continue
  66. try:
  67. driver.get(url)
  68. except Exception, e:
  69. driver.execute_script('window.stop()')
  70. try:
  71. href = driver.find_element_by_xpath('//div[@id="con_telelist_1"]/ul/li[1]/a').get_attribute('href')
  72. except Exception, e:
  73. href = None
  74. if href is not None and 'www.wasu.cn' in href:
  75. print href
  76. try:
  77. driver.get(href)
  78. except Exception, e:
  79. driver.execute_script('window.stop()')
  80. try:
  81. content = driver.find_element_by_xpath('//div[@id="play_vod_hits"]').get_attribute('textContent')
  82. except Exception, e:
  83. continue
  84. sql = """
  85. insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
  86. """
  87. value = (_id, tv_name, url, '', content, 'huashutv')
  88. Mysql.insertOne(sql, value=value, conn=conn)
  89. driver.quit()
if __name__ == '__main__':
    # Phase 2: follow the URLs gathered earlier and record play counts.
    scrapy_data()
    # Phase 1 (run first against a fresh table to collect the URLs):
    # scrapy_url()