class Baiduserp::Parser
Public Instance Methods
_parse_ads_right(file)
click to toggle source
# File lib/baiduserp/parser/ads_right.rb, line 2
#
# Builds one hash for every `div.EC_im` node found in the parsed document.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] one hash per node with :rank, :title, :content, :url
def _parse_ads_right(file)
  file[:doc].search('div.EC_im').map do |ad|
    # The first 'bdfs' occurrence is stripped from the id attribute and the
    # remaining number is shifted by one to form the rank.
    position = ad['id'].sub('bdfs', '').to_i + 1
    {
      :rank    => position,
      :title   => Baiduserp::Helper.get_content_safe(ad.search('a.EC_t')),
      :content => Baiduserp::Helper.get_content_safe(ad.search('a.EC_desc/font')),
      :url     => Baiduserp::Helper.get_content_safe(ad.search('font.EC_url'))
    }
  end
end
_parse_ads_top(file)
click to toggle source
# File lib/baiduserp/parser/ads_top.rb, line 2
#
# Walks the direct children of `div#content_left` and records every child
# whose class attribute contains 'ec_pp_f'. Scanning stops at the first child
# whose numeric id lies between 1 and 2999.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] hashes with :rank, :id, :title, :content, :url;
#   empty when `div#content_left` is not present
def _parse_ads_top(file)
  ads = []
  container = file[:doc].search('div#content_left').first
  return ads if container.nil?

  container.children.each do |node|
    node_id = node['id'].to_i
    break if node_id.between?(1, 2999)
    next unless node['class'].to_s.include?('ec_pp_f')

    # <div> nodes use one set of selectors; anything else (the original
    # comment names <table>) uses another.
    title_sel, content_sel, url_sel =
      if node.name == 'div'
        ['div.ec_title', 'div.ec_desc', 'span.ec_url']
      else
        ['td.EC_header/a', 'a.EC_desc', 'a.EC_url']
      end

    ads << {
      :rank    => ads.size + 1, # ranks count recorded entries, starting at 1
      :id      => node_id,
      :title   => Baiduserp::Helper.get_content_safe(node.search(title_sel)),
      :content => Baiduserp::Helper.get_content_safe(node.search(content_sel)),
      :url     => Baiduserp::Helper.get_content_safe(node.search(url_sel))
    }
  end
  ads
end
_parse_con_ar(file)
click to toggle source
# File lib/baiduserp/parser/con_ar.rb, line 2
#
# Lists the children of `div#content_right div#con-ar` whose class attribute
# contains 'result-op'.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] hashes with :tpl (the raw `tpl` attribute) and
#   :data_click (the `data-click` attribute passed through
#   Baiduserp::Helper.parse_data_click); empty when the container is missing
def _parse_con_ar(file)
  sidebar = file[:doc].search("div#content_right div#con-ar").first
  return [] if sidebar.nil?

  ops = sidebar.children.select { |child| child['class'].to_s.include?('result-op') }
  ops.map do |op|
    {
      :tpl        => op['tpl'],
      :data_click => Baiduserp::Helper.parse_data_click(op['data-click'])
    }
  end
end
_parse_pinpaizhuanqu(file)
click to toggle source
# File lib/baiduserp/parser/pinpaizhuanqu.rb, line 2
#
# Reports whether the third child of the `content_left` div is a <script>
# element.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Boolean] false when the div is missing or has fewer than three
#   children
def _parse_pinpaizhuanqu(file)
  part = file[:doc].search("div[@id='content_left']").first
  return false if part.nil?

  # Guard against short child lists: indexing past the end yields nil, and
  # calling #name on it used to raise NoMethodError.
  third = part.children[2]
  !third.nil? && third.name == 'script'
end
_parse_ranks(file)
click to toggle source
# File lib/baiduserp/parser/ranks.rb, line 2
#
# Collects the children of the `content_left` div whose numeric id lies
# between 1 and 2999, one hash per child.
#
# Each hash carries :rank (the numeric id), :result_op (whether the class
# attribute contains 'result-op'), the raw :fk, :srcid, :tpl and :mu
# attributes, :url, :title, :content and :baiduopen (whether any link inside
# the child points at open.baidu.com).
#
# When the `h3/a` link is a `http://www.baidu.com/link?` redirect, its target
# is taken from the 'location' header of the response returned by
# Baiduserp::Client.get_rank_url.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] empty when the container is missing
def _parse_ranks(file)
  result = []
  part = file[:doc].search("div[@id='content_left']").first
  return result if part.nil?

  part.children.each do |table|
    next if table.nil?
    id = table['id'].to_i
    next unless id > 0 && id < 3000

    r = {:rank => id}
    r[:result_op] = table['class'].to_s.include?('result-op')
    r[:fk] = table['fk']
    r[:srcid] = table['srcid']
    r[:tpl] = table['tpl']
    r[:mu] = table['mu']

    link = table.search('h3/a').first
    # A link without an href yields a nil url instead of raising.
    url = link.nil? ? nil : link['href']
    if !url.nil? && url.include?('http://www.baidu.com/link?')
      # Random sub-second pause, taken only when a redirect lookup is
      # actually performed.
      sleep(rand)
      url = Baiduserp::Client.get_rank_url(url).headers['location']
    end
    r[:url] = url

    r[:title] = Baiduserp::Helper.get_content_safe(table.search('h3'))
    r[:content] = Baiduserp::Helper.get_content_safe(table.search('div.c-abstract'))
    r[:baiduopen] = table.search('a').any? do |anchor|
      anchor['href'].to_s.include?('open.baidu.com')
    end
    result << r
  end
  result
end
_parse_result_num(file)
click to toggle source
# File lib/baiduserp/parser/result_num.rb, line 4
#
# Extracts the result count from the text between '百度为您找到相关结果'
# and the next '个' in the raw HTML.
#
# Only the first occurrence of the phrase is used, and the capture stops at
# the first '个' so trailing text on the same line is never absorbed. The
# '约' prefix and thousands separators are dropped before conversion. When
# the text contains '万', the part before it is multiplied by 10000 and the
# part after it is added.
#
# @param file [Hash] must hold the raw page source under :html
# @return [Integer] the count, or 0 when the phrase is not found
def _parse_result_num(file)
  html = file[:html]
  str = html[/百度为您找到相关结果(.*?)个/, 1].to_s
  str = str.delete('约,')

  if str.include?('万')
    whole, rest = str.split('万', 2)
    whole.to_i * 10000 + rest.to_i
  else
    str.to_i
  end
end
_parse_right_hotel(file)
click to toggle source
# File lib/baiduserp/parser/right_hotel.rb, line 2
#
# Reads the title of the first `div[@tpl="right_hotel"]` node.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Hash, nil] {:title => ...} taken from `div.opr-hotel-title`, or
#   nil when no such node exists
def _parse_right_hotel(file)
  matches = file[:doc].search('div[@tpl="right_hotel"]')
  hotel = matches.nil? ? nil : matches.first
  return nil if hotel.nil?

  { :title => Baiduserp::Helper.get_content_safe(hotel.search('div.opr-hotel-title')) }
end
_parse_right_personinfo(file)
click to toggle source
# File lib/baiduserp/parser/right_personinfo.rb, line 2
#
# Gathers four text fields from the `div[@tpl="right_personinfo"]` nodes.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Hash, nil] hash with :title, :info_summary, :info and :source, or
#   nil when :title, :info and :source are all nil (:info_summary is not part
#   of that check)
def _parse_right_personinfo(file)
  panel = file[:doc].search('div[@tpl="right_personinfo"]')
  return nil if panel.nil?

  text_of = lambda do |selector|
    Baiduserp::Helper.get_content_safe(panel.search(selector))
  end

  person = {
    :title        => text_of.call('span.opr-personinfo-subtitle-large'),
    :info_summary => text_of.call('div.opr-personinfo-summary'),
    :info         => text_of.call('div.opr-personinfo-info'),
    :source       => text_of.call('div.opr-personinfo-source a')
  }
  return nil if [:title, :info, :source].all? { |key| person[key].nil? }

  person
end
_parse_right_relaperson(file)
click to toggle source
# File lib/baiduserp/parser/right_relaperson.rb, line 2
#
# Describes every `div[@tpl="right_relaperson"]` node by its heading and the
# `title` attributes of its name links.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] hashes with :title (content of the first
#   `div.cr-title/span`, nil when absent) and :names (title attributes of the
#   `p.opr-relaperson-name/a` links)
def _parse_right_relaperson(file)
  blocks = file[:doc].search('div[@tpl="right_relaperson"]')
  return nil if blocks.nil?

  blocks.map do |block|
    heading = block.search('div.cr-title/span').first
    {
      :title => heading.nil? ? nil : heading.content,
      :names => block.search('p.opr-relaperson-name/a').map { |anchor| anchor['title'] }
    }
  end
end
_parse_right_weather(file)
click to toggle source
# File lib/baiduserp/parser/right_weather.rb, line 2
#
# Reads the title and the week link of the first
# `div[@tpl="right_weather"]` node.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Hash, nil] {:title => ..., :week => href-or-nil}, or nil when no
#   such node exists
def _parse_right_weather(file)
  rw = file[:doc].search('div[@tpl="right_weather"]')
  return nil if rw.nil?
  rw = rw.first
  return nil if rw.nil?

  title = Baiduserp::Helper.get_content_safe(rw.search('div.opr-weather-title'))
  # The week link may be absent; report nil rather than raising on nil['href'].
  week_link = rw.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']
  {:title => title, :week => week}
end
_parse_zhixin(file)
click to toggle source
# File lib/baiduserp/parser/zhixin.rb, line 2
#
# Copies selected attributes from every `.result-zxl` node inside
# `div#content_left`.
#
# @param file [Hash] must hold the parsed document under :doc
# @return [Array<Hash>] hashes with the raw :id, :srcid, :fk, :tpl and :mu
#   attributes plus :data_click (the `data-click` attribute passed through
#   Baiduserp::Helper.parse_data_click)
def _parse_zhixin(file)
  file[:doc].search("div#content_left .result-zxl").map do |node|
    entry = {}
    %w[id srcid fk tpl mu].each { |attr| entry[attr.to_sym] = node[attr] }
    entry[:data_click] = Baiduserp::Helper.parse_data_click(node['data-click'])
    entry
  end
end
get_search_html(keyword,page=1)
click to toggle source
# File lib/baiduserp/parser.rb, line 47
#
# Builds the Baidu search URL for a keyword and page, and returns the body of
# the response obtained through Client.get_serp.
#
# The keyword is form-encoded, so spaces become '+' and reserved characters
# such as '&', '#' and '+' are percent-encoded instead of corrupting the
# query string. URI.escape, used previously, left those characters untouched
# and no longer exists in Ruby 3.0+.
#
# @param keyword [String] search terms
# @param page [Integer, #to_i] 1-based page number; pages above 1 add
#   `&pn=<page-1>0` to the URL
# @return the `body` of the object returned by Client.get_serp
def get_search_html(keyword, page = 1)
  query = URI.encode_www_form_component(keyword)
  paging = page.to_i > 1 ? "&pn=#{page.to_i - 1}0" : ""
  # inputT is sent as a random value in 1000..1999.
  serp_url = "http://www.baidu.com/s?wd=#{query}#{paging}&ie=utf-8&inputT=#{1000 + rand(1000)}"
  Client.get_serp(serp_url).body
end
parse(html)
click to toggle source
# File lib/baiduserp/parser.rb, line 12
#
# Parses a result page by calling every method of this object whose name
# starts with `_parse_` and storing each return value in a
# Baiduserp::Result under the method name minus that prefix.
#
# The input string is no longer modified in place, so frozen strings are
# accepted. Only StandardError is intercepted: signals, exit requests and
# other non-standard exceptions propagate untouched.
#
# @param html [String] raw page source
# @return [Baiduserp::Result] one entry per `_parse_*` method
# @raise [RuntimeError] when a `_parse_*` method raises a StandardError; the
#   page is first dumped to /tmp and the error details are printed
def parse(html)
  html = html.encode('UTF-8', 'UTF-8', :invalid => :replace)
  @file = {}
  @serp = Baiduserp::Result.new
  @file[:html] = html
  @file[:doc] = Nokogiri::HTML(html)

  methods.each do |m|
    next unless m =~ /^_parse_/
    begin
      @serp[m.to_s.sub('_parse_', '').to_sym] = send(m, @file)
    rescue StandardError => e
      issue_file = "/tmp/baiduserp_issue_#{Time.now.strftime("%Y%m%d%H%M%S")}.html"
      # Block form closes (and flushes) the dump file before the re-raise.
      File.open(issue_file, 'w') { |dump| dump.puts(html) }
      puts "Notice:"
      puts "Baiduserp gem have a bug, please email to zmingqian@qq.com to report it."
      puts "Please attach file #{issue_file} in the email and the error information below, thanks!"
      puts e.message
      puts e.inspect
      puts e.backtrace
      raise "Baiduserp Parser Get An Error!"
    end
  end
  @serp
end
parse_file(file_path)
click to toggle source
# File lib/baiduserp/parser.rb, line 55
#
# Parses a page taken from a local file when `file_path` names an existing
# file; otherwise hands `file_path` to Client.get_serp and parses the body of
# the response.
#
# File.read is used so the handle is closed after reading and a path starting
# with '|' is never treated as a command, as Kernel#open would. File.exist?
# replaces File.exists?, which no longer exists in Ruby 3.2+.
#
# @param file_path [String] local path, or a location passed to
#   Client.get_serp
# @return whatever #parse returns for the loaded source
def parse_file(file_path)
  html =
    if File.exist?(file_path)
      File.read(file_path)
    else
      Client.get_serp(file_path).body
    end
  parse html
end
search(keyword,page=1)
click to toggle source
# File lib/baiduserp/parser.rb, line 42
#
# Fetches the result page for a keyword via #get_search_html and parses it
# with #parse.
#
# @param keyword [String] search terms
# @param page [Integer] page number, forwarded unchanged
# @return whatever #parse returns for the fetched page
def search(keyword,page=1)
  parse(get_search_html(keyword, page))
end