#!/usr/bin/env python
# coding=utf-8
"""Scrapers that fill scrapy.wangju_url / scrapy.wangju_all_url with Youku
search results and detail-page text, then push parsed fields into tv_lib.

Python 2 script (print statement, urllib.quote, reload(sys) below).
"""
import random
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

# Python 2 hack: make utf-8 the default codec so implicit str/unicode
# mixing of the Chinese titles below does not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
def scrapy_url():
    """Search soku.com for every TV series lacking a Youku URL and store the
    first result whose title contains the series name and which lists
    episodes. Rows with no usable match get an empty-string URL so they are
    not retried on the next run.

    Side effects: reads/updates scrapy.wangju_url; drives a Firefox browser.
    """
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_youku is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    try:
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url = 'http://www.soku.com/search_video/q_' + quote(str(tv_name))
            # BUG FIX: this assignment was commented out in the original, so
            # the `if need_blank` check below raised NameError (or reused the
            # previous row's stale value).
            need_blank = True
            try:
                driver.get(url)
            except Exception:
                # Page load hit the 10s timeout; stop loading and parse
                # whatever has rendered so far.
                driver.execute_script('window.stop()')
            divs = driver.find_elements_by_xpath('//div[@class="sk-express"]/div/div')
            for div in divs:
                try:
                    title = div.find_element_by_xpath('./div/div[2]/div[1]/div/h2/a[1]').get_attribute('textContent')
                    title = title.replace(' ', '').replace('\n', '')
                    href = div.find_element_by_xpath('//div[@class="info_cont"]/p/a').get_attribute('href')
                    jishu = None
                    try:
                        jishu = div.find_elements_by_xpath('//div[@class="s_items all site14 "]/ul/li')
                    except Exception:
                        pass
                    if not jishu:
                        # Alternate episode-list layout.
                        try:
                            jishu = div.find_elements_by_xpath('//div[@class="s_detail"]/div[4]/ul/li')
                        except Exception:
                            pass
                    if tv_name in title and jishu:
                        # Parameterized update: the scraped href may contain
                        # quotes that would break string-built SQL.
                        sql = """
                            update scrapy.wangju_url set url_youku = %s where id = %s
                        """
                        Mysql.update(sql, param=(href, _id), conn=conn)
                        need_blank = False
                except Exception:
                    # Result card without the expected structure; skip it.
                    pass
            if need_blank:
                # Mark as processed-but-empty so the row is excluded from the
                # `url_youku is null` selection next time.
                sql = """
                    update scrapy.wangju_url set url_youku = %s where id = %s
                """
                Mysql.update(sql, param=('', _id), conn=conn)
    finally:
        # Always release the browser, even if a row blows up mid-loop.
        driver.quit()
def scrapy_data():
    """Fetch each stored Youku URL and save the detail-block text of the page
    into scrapy.wangju_all_url with source = 'youku'.

    Rows whose page shows neither known detail container are skipped.
    Side effects: reads scrapy.wangju_url, inserts into scrapy.wangju_all_url;
    drives a Firefox browser.
    """
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_youku from scrapy.wangju_url where url_youku is not null and url_youku != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    try:
        for row in rows:
            _id = row['id']
            tv_name = row['tv_name']
            url_youku = row['url_youku']
            # (Removed dead `need_blank` flag: it was assigned but never read.)
            try:
                driver.get(url_youku)
            except Exception:
                # Load timed out: stop and scrape whatever rendered.
                driver.execute_script('window.stop()')
            try:
                content = driver.find_element_by_xpath('//div[@class="detailinfo"]').get_attribute('textContent')
            except Exception:
                # Fallback detail container used by some pages.
                try:
                    content = driver.find_element_by_xpath('//div[@class="p-base"]').get_attribute('textContent')
                except Exception:
                    # No recognizable detail block; skip this row.
                    continue
            sql = """
                insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
            """
            value = (_id, tv_name, url_youku, '', content, 'youku')
            Mysql.insertOne(sql, value=value, conn=conn)
    finally:
        # Always release the browser, even on an unexpected error.
        driver.quit()
- def parse_content():
- conn = Mysql.createOfflineConn()
- sql = """
- select id, tv_name, url, content from scrapy.wangju_all_url where source = 'youku' order by id asc
- """
- rows = Mysql.getAll(sql, conn=conn)
- for row in rows:
- _id = row['id']
- tv_name = row['tv_name']
- url = row['url']
- content = row['content']
- import re
- m = re.search(ur'评分: ([0-9]+[.]?)+', content)
- score = '0'
- if m is not None:
- score = m.group(0)
- play = '0'
- m = re.search(ur'播放数:([0-9]+[,]?)+', content)
- if m is not None:
- play = m.group(0)
- sql = """
- update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'youku'
- """
- sql = sql % (score, play, url)
- Mysql.execute(sql, conn=conn)
- # def parse_detail_content():
- # conn = Mysql.createOfflineConn()
- # sql = """
- # select id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
- # """
- # rows = Mysql.getAll(sql, conn=conn)
- # for row in rows:
- # _id = row['id']
- # detail_info_text = row['detail_info_text']
- # # sql = """
- # # update scrapy.iqiyi_dianshiju_detail aa inner join scrapy.iqiyi_dianshiju_detail_copy bb on aa.id = bb.id set aa.detail_info_text = bb.detail_info_text
- # # """
- # # Mysql.update(sql, conn=conn)
- # if detail_info_text is not None:
- # # content = ''
- # # (line0, line1) = tuple(detail_info_text.split(u'评分'))
- # # line0 = line0.replace('\n', '')
- # # content = line0 + '\n' + line1
- # for line in detail_info_text.split('\n'):
-
- # sql = """
- # update scrapy.iqiyi_dianshiju_detail set detail_info_text = %s where id = %s
- # """
- # value = (content, _id)
- # Mysql.update(sql, param=value, conn=conn)
- # Mysql.close(conn=conn)
def update_tv_lib():
    """Parse director / actors / area / premiere-time / type out of the iqiyi
    detail text and copy them onto tv_lib.yxb_tv_series (level and
    script_form are set to the fixed values 5 and 1).

    Side effects: reads scrapy.iqiyi_dianshiju_detail, updates
    tv_lib.yxb_tv_series, closes the connection when done.
    """
    conn = Mysql.createOfflineConn()
    sql = """
        select tv_id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        tv_id = row['tv_id']
        detail_info_text = row['detail_info_text']
        if not detail_info_text:
            # NULL/empty detail text: nothing to parse for this row.
            continue
        lines = detail_info_text.split('\n')
        director = ''
        actors = ''
        product_area = ''
        premiere_time = ''
        _type = ''
        for i, line in enumerate(lines):
            # Each labelled line carries its value after the Chinese label.
            if u'导演' in line:
                director = line.replace(u'导演:', '')
            if u'主演' in line:
                actors = line.replace(u'主演:', '')
            if u'地区' in line:
                product_area = line.replace(u'地区:', '')
            if u'首播时间' in line:
                premiere_time = line.replace(u'首播时间:', '')
            # The type value sits on the line *after* the u'看点' label.
            # BUG FIX: guard the lookahead so a trailing label cannot raise
            # IndexError; the stray debug print was also removed.
            if u'看点' in line and i + 1 < len(lines):
                _type = lines[i + 1]
        sql = """
            update tv_lib.yxb_tv_series set level = %s, type = %s, script_form = %s, director = %s, product_area = %s, actors = %s, premiere_time = %s where id = %s
        """
        value = (5, _type, 1, director, product_area, actors, premiere_time, tv_id)
        Mysql.update(sql, param=value, conn=conn)
    Mysql.close(conn=conn)
if __name__ == '__main__':
    # Pipeline stages; the earlier steps are kept here (disabled) so the
    # full sequence is visible. Only the final stage currently runs.
    # scrapy_data()
    # scrapy_url()
    # parse_content()
    # parse_detail_content()
    update_tv_lib()