scrapy_youku.py 7.2 KB

#!/usr/bin/env python
# coding=utf-8
# Scrape Youku (soku.com) search results and detail pages for TV series and
# store the results in MySQL.
import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')
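
# Note: fty_util.common.Mysql is an in-house helper whose implementation is not
# shown here. Based solely on how it is called below, it is assumed to expose:
#   Mysql.createOfflineConn()                      -> connection object
#   Mysql.getAll(sql, conn=conn)                   -> list of dict-like rows
#   Mysql.execute(sql, conn=conn)                  -> run a statement
#   Mysql.insertOne(sql, value=tuple, conn=conn)   -> parameterized insert
#   Mysql.update(sql, param=tuple, conn=conn)      -> parameterized update
#   Mysql.close(conn=conn)                         -> close the connection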

def scrapy_url():
    """Search soku.com for each TV series and store the matching Youku URL."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_youku is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://www.soku.com/search_video/q_' + quote(str(tv_name))
        need_blank = True
        try:
            driver.get(url)
        except Exception, e:
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="sk-express"]/div/div')
        for div in divs:
            try:
                title = div.find_element_by_xpath('./div/div[2]/div[1]/div/h2/a[1]').get_attribute('textContent')
                title = title.replace(' ', '').replace('\n', '')
                href = div.find_element_by_xpath('//div[@class="info_cont"]/p/a').get_attribute('href')
                # jishu: the episode list items, used to tell series results from other hits
                jishu = None
                try:
                    jishu = div.find_elements_by_xpath('//div[@class="s_items all site14 "]/ul/li')
                except Exception, e:
                    pass
                if jishu is None or len(jishu) == 0:
                    try:
                        # jishu = div.find_elements_by_xpath('//div[@class="s_items site14 "]/ul/li')
                        jishu = div.find_elements_by_xpath('//div[@class="s_detail"]/div[4]/ul/li')
                    except Exception, e:
                        pass
                if tv_name in title and jishu is not None and len(jishu) > 0:
                    sql = """
                        update scrapy.wangju_url set url_youku = '%s' where id = %s
                    """
                    sql = sql % (href, _id)
                    Mysql.execute(sql, conn=conn)
                    need_blank = False
            except Exception, e:
                pass
        # No matching result: store an empty string so the row is not picked up again.
        if need_blank:
            sql = """
                update scrapy.wangju_url set url_youku = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()
    Mysql.close(conn=conn)

def scrapy_data():
    """Fetch each stored Youku detail page and save its description text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_youku from scrapy.wangju_url where url_youku is not null and url_youku != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_youku = row['url_youku']
        try:
            driver.get(url_youku)
        except Exception, e:
            driver.execute_script('window.stop()')
        # The detail text lives in one of two containers depending on page layout.
        try:
            content = driver.find_element_by_xpath('//div[@class="detailinfo"]').get_attribute('textContent')
        except Exception, e:
            try:
                content = driver.find_element_by_xpath('//div[@class="p-base"]').get_attribute('textContent')
            except Exception, e:
                continue
        sql = """
            insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
        """
        value = (_id, tv_name, url_youku, '', content, 'youku')
        Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()
    Mysql.close(conn=conn)

def parse_content():
    """Extract the score and play count from the saved Youku detail text."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'youku' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']
        score = '0'
        m = re.search(ur'评分: ([0-9]+[.]?)+', content)  # 评分 = rating
        if m is not None:
            score = m.group(0)
        play = '0'
        m = re.search(ur'播放数:([0-9]+[,]?)+', content)  # 播放数 = play count
        if m is not None:
            play = m.group(0)
        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'youku'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)
    Mysql.close(conn=conn)
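
# For reference: group(0) keeps the Chinese label as part of the match, so with
# page text such as u'评分: 9.2 ... 播放数:1,234,567' (made-up sample values) the
# stored columns become score = u'评分: 9.2' and playtimes = u'播放数:1,234,567'.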

# def parse_detail_content():
#     conn = Mysql.createOfflineConn()
#     sql = """
#         select id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
#     """
#     rows = Mysql.getAll(sql, conn=conn)
#     for row in rows:
#         _id = row['id']
#         detail_info_text = row['detail_info_text']
#         # sql = """
#         #     update scrapy.iqiyi_dianshiju_detail aa inner join scrapy.iqiyi_dianshiju_detail_copy bb on aa.id = bb.id set aa.detail_info_text = bb.detail_info_text
#         # """
#         # Mysql.update(sql, conn=conn)
#         if detail_info_text is not None:
#             # content = ''
#             # (line0, line1) = tuple(detail_info_text.split(u'评分'))
#             # line0 = line0.replace('\n', '')
#             # content = line0 + '\n' + line1
#             for line in detail_info_text.split('\n'):
#                 sql = """
#                     update scrapy.iqiyi_dianshiju_detail set detail_info_text = %s where id = %s
#                 """
#                 value = (content, _id)
#                 Mysql.update(sql, param=value, conn=conn)
#     Mysql.close(conn=conn)

def update_tv_lib():
    """Parse the iqiyi detail text line by line and update the tv_lib record."""
    conn = Mysql.createOfflineConn()
    sql = """
        select tv_id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        tv_id = row['tv_id']
        detail_info_text = row['detail_info_text']
        lines = detail_info_text.split('\n')
        director = ''
        actors = ''
        product_area = ''
        premiere_time = ''
        _type = ''
        for i in range(len(lines)):
            line = lines[i]
            if u'导演' in line:        # director
                director = line.replace(u'导演:', '')
            if u'主演' in line:        # leading actors
                actors = line.replace(u'主演:', '')
            if u'地区' in line:        # production area
                product_area = line.replace(u'地区:', '')
            if u'首播时间' in line:    # premiere date
                premiere_time = line.replace(u'首播时间:', '')
            if u'看点' in line:        # highlights; the type is on the next line
                print lines[i + 1]
                _type = lines[i + 1]
            # if u'更新时间' in line:  # update time (unused)
            #     gengxin = lines[i + 1]
        sql = """
            update tv_lib.yxb_tv_series set level = %s, type = %s, script_form = %s, director = %s, product_area = %s, actors = %s, premiere_time = %s where id = %s
        """
        value = (5, _type, 1, director, product_area, actors, premiere_time, tv_id)
        Mysql.update(sql, param=value, conn=conn)
    Mysql.close(conn=conn)

if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    # parse_content()
    # parse_detail_content()
    update_tv_lib()
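    # Assumed usage (not enforced by the script): run scrapy_url() first to
    # resolve the Youku URLs, then scrapy_data() to fetch the detail text, then
    # parse_content() to extract score/playtimes. update_tv_lib() works off
    # scrapy.iqiyi_dianshiju_detail instead and can be run independently.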