#!/usr/bin/env python
# coding=utf-8

import random
import re
import sys
import time

from selenium import webdriver
from urllib import quote

from fty_util.common import Mysql

reload(sys)
sys.setdefaultencoding('utf8')


def scrapy_url():
    """Search soku.com for every series that has no Youku URL yet and store the first matching link."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name from scrapy.wangju_url where url_youku is null order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = 'http://www.soku.com/search_video/q_' + quote(str(tv_name))
        need_blank = True
        try:
            driver.get(url)
        except Exception:
            # Page-load timeout: stop loading and work with whatever has rendered.
            driver.execute_script('window.stop()')
        divs = driver.find_elements_by_xpath('//div[@class="sk-express"]/div/div')
        for div in divs:
            try:
                title = div.find_element_by_xpath('./div/div[2]/div[1]/div/h2/a[1]').get_attribute('textContent')
                title = title.replace(' ', '').replace('\n', '')
                href = div.find_element_by_xpath('//div[@class="info_cont"]/p/a').get_attribute('href')
                # Episode list: try the "all episodes" layout first, then the detail layout.
                jishu = None
                try:
                    jishu = div.find_elements_by_xpath('//div[@class="s_items all site14 "]/ul/li')
                except Exception:
                    pass
                if jishu is None or len(jishu) == 0:
                    try:
                        # jishu = div.find_elements_by_xpath('//div[@class="s_items site14 "]/ul/li')
                        jishu = div.find_elements_by_xpath('//div[@class="s_detail"]/div[4]/ul/li')
                    except Exception:
                        pass
                if tv_name in title and jishu is not None and len(jishu) > 0:
                    sql = """
                        update scrapy.wangju_url set url_youku = '%s' where id = %s
                    """
                    sql = sql % (href, _id)
                    Mysql.execute(sql, conn=conn)
                    need_blank = False
            except Exception:
                pass
        if need_blank:
            # No match found: store an empty string so this row is skipped on the next run.
            sql = """
                update scrapy.wangju_url set url_youku = '%s' where id = %s
            """
            sql = sql % ('', _id)
            Mysql.execute(sql, conn=conn)
    driver.quit()


def scrapy_data():
    """Open each stored Youku page and save its description text into scrapy.wangju_all_url."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url_youku from scrapy.wangju_url where url_youku is not null and url_youku != '' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(10)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url_youku = row['url_youku']
        try:
            driver.get(url_youku)
        except Exception:
            driver.execute_script('window.stop()')
        # The detail text lives in "detailinfo" on older pages and in "p-base" on newer ones.
        try:
            content = driver.find_element_by_xpath('//div[@class="detailinfo"]').get_attribute('textContent')
        except Exception:
            try:
                content = driver.find_element_by_xpath('//div[@class="p-base"]').get_attribute('textContent')
            except Exception:
                continue
        sql = """
            insert into scrapy.wangju_all_url (id, tv_name, url, title, content, source) values (%s, %s, %s, %s, %s, %s)
        """
        value = (_id, tv_name, url_youku, '', content, 'youku')
        Mysql.insertOne(sql, value=value, conn=conn)
    driver.quit()


def parse_content():
    """Pull the score and play-count fragments out of the saved page text and write them back."""
    conn = Mysql.createOfflineConn()
    sql = """
        select id, tv_name, url, content from scrapy.wangju_all_url where source = 'youku' order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        _id = row['id']
        tv_name = row['tv_name']
        url = row['url']
        content = row['content']
        score = '0'
        m = re.search(ur'评分: ([0-9]+[.]?)+', content)
        if m is not None:
            score = m.group(0)
        play = '0'
        m = re.search(ur'播放数:([0-9]+[,]?)+', content)
        if m is not None:
            play = m.group(0)
        sql = """
            update scrapy.wangju_all_url set score = '%s', playtimes = '%s' where url = '%s' and source = 'youku'
        """
        sql = sql % (score, play, url)
        Mysql.execute(sql, conn=conn)


# def parse_detail_content():
#     conn = Mysql.createOfflineConn()
#     sql = """
#         select id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
#     """
#     rows = Mysql.getAll(sql, conn=conn)
#     for row in rows:
#         _id = row['id']
#         detail_info_text = row['detail_info_text']
#         # sql = """
#         #     update scrapy.iqiyi_dianshiju_detail aa inner join scrapy.iqiyi_dianshiju_detail_copy bb on aa.id = bb.id set aa.detail_info_text = bb.detail_info_text
#         # """
#         # Mysql.update(sql, conn=conn)
#         if detail_info_text is not None:
#             # content = ''
#             # (line0, line1) = tuple(detail_info_text.split(u'评分'))
#             # line0 = line0.replace('\n', '')
#             # content = line0 + '\n' + line1
#             for line in detail_info_text.split('\n'):
#                 sql = """
#                     update scrapy.iqiyi_dianshiju_detail set detail_info_text = %s where id = %s
#                 """
#                 value = (content, _id)
#                 Mysql.update(sql, param=value, conn=conn)
#     Mysql.close(conn=conn)


def update_tv_lib():
    """Parse the iqiyi detail text (director, actors, area, premiere time, genre) and update tv_lib.yxb_tv_series."""
    conn = Mysql.createOfflineConn()
    sql = """
        select tv_id, detail_info_text from scrapy.iqiyi_dianshiju_detail order by id asc
    """
    rows = Mysql.getAll(sql, conn=conn)
    for row in rows:
        tv_id = row['tv_id']
        detail_info_text = row['detail_info_text']
        lines = detail_info_text.split('\n')
        director = ''
        actors = ''
        product_area = ''
        premiere_time = ''
        _type = ''
        for i in range(len(lines)):
            line = lines[i]
            if u'导演' in line:
                director = line.replace(u'导演:', '')
            if u'主演' in line:
                actors = line.replace(u'主演:', '')
            if u'地区' in line:
                product_area = line.replace(u'地区:', '')
            if u'首播时间' in line:
                premiere_time = line.replace(u'首播时间:', '')
            if u'看点' in line and i + 1 < len(lines):
                # The genre/highlight value sits on the line after the "看点" label.
                print lines[i + 1]
                _type = lines[i + 1]
            # if u'更新时间' in line:
            #     gengxin = lines[i+1]
        sql = """
            update tv_lib.yxb_tv_series set level = %s, type = %s, script_form = %s, director = %s, product_area = %s, actors = %s, premiere_time = %s where id = %s
        """
        value = (5, _type, 1, director, product_area, actors, premiere_time, tv_id)
        Mysql.update(sql, param=value, conn=conn)
    Mysql.close(conn=conn)


if __name__ == '__main__':
    # scrapy_data()
    # scrapy_url()
    # parse_content()
    # parse_detail_content()
    update_tv_lib()
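

# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original pipeline: parse_content() stores the
# whole regex match, label included (e.g. u'评分: 8.5', u'播放数:1,234,567').
# If only the numeric part were wanted, a grouped pattern like the hypothetical
# helper below could be used. extract_number() is illustrative only; nothing in
# this script calls it.
def extract_number(text, label):
    """Return the digits (optionally with '.' or ',') that follow label, or '0'."""
    import re
    m = re.search(label + ur'[::]?\s*([0-9][0-9.,]*)', text)
    return m.group(1) if m else '0'
# Example usage (assumed input format): extract_number(u'评分: 8.5', u'评分')
# would return u'8.5', and extract_number(u'播放数:1,234', u'播放数') would
# return u'1,234'.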