class Serper::Baidu
Public Instance Methods
_parse_ads_right(file)
click to toggle source
# File lib/serper/baidu/parser.rb, line 2
#
# Collects the certified-ad badges found in the right-hand ad container.
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns an Array of Hashes ({url:, rank:}); rank is the 1-based position
# of the badge in document order.
def _parse_ads_right(file)
  badges = file[:doc].search('div#ec_im_container span a.c-icon.efc-cert')

  badges.each_with_index.map do |badge, position|
    advertiser =
      begin
        cert = Serper::Helper.parse_data_click(badge['data-renzheng'])
        target = Addressable::URI.parse(cert['identity']['a']['url'])
        target.query_values['wd'].to_s.sub('@v','')
      rescue StandardError
        # Best effort: any missing or malformed piece yields an empty url.
        ''
      end

    { url: advertiser, rank: position + 1 }
  end
end
_parse_ads_top(file)
click to toggle source
# File lib/serper/baidu/parser.rb, line 14
#
# Collects the certified-ad badges that appear at the top of the left column,
# i.e. inside the children of div#content_left that precede the first child
# whose id attribute parses to a positive integer (presumably the first
# organic result; confirm against live markup).
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns an Array of Hashes ({url:, rank:}); rank counts badges from 1.
# Returns an empty Array when the page has no div#content_left.
def _parse_ads_top(file)
  result = []
  rank = 0

  container = file[:doc].search('div#content_left').first
  # Guard like the sibling parsers do: a page without the left column has no top ads.
  return result if container.nil?

  container.children.each do |child|
    break if child['id'].to_i > 0

    child.search('span a.c-icon.efc-cert').each do |badge|
      rank += 1
      url =
        begin
          cert = Serper::Helper.parse_data_click(badge['data-renzheng'])
          Addressable::URI.parse(cert['identity']['a']['url']).query_values['wd'].to_s.sub('@v', '')
        rescue StandardError
          # Best effort: any missing or malformed piece yields an empty url.
          ''
        end
      result << { url: url, rank: rank }
    end
  end

  result
end
_parse_con_ar(file)
click to toggle source
# File lib/serper/baidu/parser.rb, line 29
#
# Extracts the "result-op" cards from the right-hand div#con-ar panel.
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns an Array of Hashes ({tpl:, data_click:}), or an empty Array when
# the panel is not present.
def _parse_con_ar(file)
  panel = file[:doc].search("div#content_right div#con-ar").first
  return [] if panel.nil?

  cards = panel.children.select do |node|
    node['class'].to_s.include?('result-op')
  end

  cards.map do |card|
    {
      :tpl => card['tpl'],
      :data_click => Serper::Helper.parse_data_click(card['data-click'])
    }
  end
end
_parse_ranks(file)
click to toggle source
# Reports whether the third child of div#content_left is a <script> element.
# NOTE(review): the method name suggests this detects Baidu's "brand zone"
# (pinpai zhuanqu) block; confirm against live markup.
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns true or false. Returns false when the container is missing or has
# fewer than three children.
def _parse_pinpaizhuanqu(file)
  part = file[:doc].search("div[@id='content_left']").first
  return false if part.nil?

  third = part.children[2]
  # A short container has no third child; treat that as "not present" rather than raising.
  return false if third.nil?

  third.name == 'script'
end
# File lib/serper/baidu/parser.rb, line 49
#
# Extracts the numbered results from div#content_left.
#
# A child counts as a result when its id attribute parses to an integer
# strictly between 0 and 3000. For each one the method records rank, the
# result-op flag, the fk/srcid/tpl/mu attributes, the landing url, title,
# abstract text and whether any link points at open.baidu.com.
#
# Hrefs containing "//www.baidu.com/link?" are resolved through
# Serper::Crawler.get_rank_url and replaced by the response's "location"
# header; a random sub-second pause precedes each such request.
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns an Array of Hashes, empty when the container is missing.
def _parse_ranks(file)
  result = []
  part = file[:doc].search("div[@id='content_left']").first
  return result if part.nil?

  part.children.each do |table|
    next if table.nil?

    id = table['id'].to_i
    next unless id > 0 && id < 3000

    r = { :rank => id }
    r[:result_op] = table['class'].to_s.include?('result-op')
    r[:fk] = table['fk']
    r[:srcid] = table['srcid']
    r[:tpl] = table['tpl']
    r[:mu] = table['mu']

    url = nil
    anchor = table.search('h3/a').first
    unless anchor.nil?
      url = anchor['href']
      # An anchor without href leaves url nil instead of raising on nil.include?.
      if url && url.include?('//www.baidu.com/link?')
        # Only add a scheme to protocol-relative hrefs; absolute ones already have it.
        target = url.start_with?('//') ? "http:#{url}" : url
        # Pause only when a request is actually about to be made.
        sleep(rand)
        url = Serper::Crawler.get_rank_url(target).headers['location']
      end
    end
    r[:url] = url

    r[:title] = Serper::Helper.get_content_safe(table.search('h3'))
    r[:content] = Serper::Helper.get_content_safe(table.search('div.c-abstract'))
    r[:baiduopen] = table.search('a').any? do |link|
      link['href'].to_s.include?('open.baidu.com')
    end

    result << r
  end

  result
end
_parse_zhixin(file)
click to toggle source
# Extracts the weather widget (the first element with tpl="right_weather").
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns nil when the widget is absent, otherwise a Hash with:
#   :title - text of div.opr-weather-title, via Serper::Helper.get_content_safe
#   :week  - href of the first a.opr-weather-week link, or nil when the
#            widget has no such link
def _parse_right_weather(file)
  rw = file[:doc].search('div[@tpl="right_weather"]')
  return nil if rw.nil?

  rw = rw.first
  return nil if rw.nil?

  title = Serper::Helper.get_content_safe(rw.search('div.opr-weather-title'))

  # The week link may be missing; report nil instead of raising on nil['href'].
  week_link = rw.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']

  { :title => title, :week => week }
end
# File lib/serper/baidu/parser.rb, line 171
#
# Extracts every ".result-zxl" element inside div#content_left.
#
# file - Hash whose :doc entry responds to #search (presumably a parsed
#        Nokogiri document; confirm against the caller).
#
# Returns an Array of Hashes carrying the element's id, srcid, fk, tpl and mu
# attributes plus its data-click attribute run through
# Serper::Helper.parse_data_click.
def _parse_zhixin(file)
  cards = file[:doc].search("div#content_left .result-zxl")

  cards.map do |card|
    {
      :id => card['id'],
      :srcid => card['srcid'],
      :fk => card['fk'],
      :tpl => card['tpl'],
      :mu => card['mu'],
      :data_click => Serper::Helper.parse_data_click(card['data-click'])
    }
  end
end
_weight_of_ads_right(serp_result,side_rank)
click to toggle source
# File lib/serper/baidu/weight.rb, line 89
#
# Builds the weight entries for the right-hand ads.
#
# serp_result - Hash whose :ads_right entry is a list of {url:, rank:} Hashes.
# side_rank   - Integer count of entries already placed on this side; it is
#               incremented once per ad.
#
# Returns [entries, side_rank]: entries is an Array of Hashes of type 'SEM'
# whose side_weight is 1.0 / side_rank, and side_rank is the updated counter.
def _weight_of_ads_right(serp_result, side_rank)
  entries = serp_result[:ads_right].map do |ad|
    side_rank += 1
    link = ad[:url].to_s

    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0 / side_rank.to_f
    }
  end

  [entries, side_rank]
end
_weight_of_ads_top(serp_result,side_rank)
click to toggle source
# File lib/serper/baidu/weight.rb, line 69
#
# Builds the weight entries for the ads at the top of the left column.
#
# serp_result - Hash whose :ads_top entry is a list of {url:, rank:} Hashes.
# side_rank   - Integer count of entries already placed on this side; it is
#               incremented once per ad.
#
# Returns [entries, side_rank]: entries is an Array of Hashes of type 'SEM'
# whose side_weight is 1.0 / side_rank, and side_rank is the updated counter.
def _weight_of_ads_top(serp_result, side_rank)
  entries = serp_result[:ads_top].map do |ad|
    side_rank += 1
    link = ad[:url].to_s

    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0 / side_rank.to_f
    }
  end

  [entries, side_rank]
end
_weight_of_con_ar(serp_result,side_rank)
click to toggle source
# File lib/serper/baidu/weight.rb, line 109
#
# Builds the weight entries for the right-hand con-ar cards.
#
# serp_result - Hash whose :con_ar entry is a list of {tpl:, data_click:}
#               Hashes; the url is read from data_click['mu'].
# side_rank   - Integer count of entries already placed on this side; it is
#               incremented once per card.
#
# Returns [entries, side_rank]: entries is an Array of Hashes of type
# 'Special' whose side_weight is the fixed weight_config[:con_ar_weight]
# (as a Float), and side_rank is the updated counter.
def _weight_of_con_ar(serp_result, side_rank)
  entries = serp_result[:con_ar].each_with_index.map do |card, index|
    side_rank += 1
    link = card[:data_click]['mu'].to_s

    {
      type: 'Special',
      name: card[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:con_ar_weight]
    }
  end

  [entries, side_rank]
end
_weight_of_ranks(serp_result,side_rank)
click to toggle source
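The _weight_of_* methods each return a two-element array: an array of hashes and the updated side_rank. Each hash includes: type, name, site, subdomain, path, side_rank, part_rank and side_weight.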
# File lib/serper/baidu/weight.rb, line 31
#
# Builds the weight entries for the numbered results.
#
# An entry is 'Special' when the result is flagged :baiduopen or carries a
# non-empty :mu (in which case :mu replaces the url); otherwise it is 'SEO'.
# The base weight is 1.0 / side_rank. Special entries are multiplied by
# weight_config[:baiduopen_weight] when flagged :baiduopen, and by
# weight_config[:rank_special_weight] otherwise.
#
# serp_result - Hash whose :ranks entry is a list of result Hashes
#               (:url, :mu, :tpl, :rank, :baiduopen are read).
# side_rank   - Integer count of entries already placed on this side; it is
#               incremented once per result.
#
# Returns [entries, side_rank].
def _weight_of_ranks(serp_result, side_rank)
  entries = serp_result[:ranks].map do |item|
    side_rank += 1

    mu = item[:mu].to_s
    link = item[:url].to_s
    link = mu unless mu.empty?
    kind = item[:baiduopen] || !mu.empty? ? 'Special' : 'SEO'

    site = Serper::Helper.parse_site(link)
    subdomain = Serper::Helper.parse_subdomain(link)
    path = Serper::Helper.parse_path(link)

    weight = 1.0 / side_rank.to_f
    if item[:baiduopen]
      weight *= weight_config[:baiduopen_weight].to_f
    elsif kind == 'Special'
      weight *= weight_config[:rank_special_weight].to_f
    end

    {
      type: kind,
      name: item[:tpl].to_s,
      site: site,
      subdomain: subdomain,
      path: path,
      mu: mu,
      side_rank: side_rank,
      part_rank: item[:rank],
      side_weight: weight
    }
  end

  [entries, side_rank]
end
_weight_of_zhixin(serp_result,side_rank)
click to toggle source
# File lib/serper/baidu/weight.rb, line 127
#
# Builds the weight entries for the zhixin cards.
#
# serp_result - Hash whose :zhixin entry is a list of Hashes (:mu and :tpl
#               are read).
# side_rank   - Integer count of entries already placed on this side; it is
#               incremented once per card.
#
# Returns [entries, side_rank]: entries is an Array of Hashes of type
# 'Special' whose side_weight is the fixed weight_config[:zhixin_weight]
# (as a Float), and side_rank is the updated counter.
def _weight_of_zhixin(serp_result, side_rank)
  entries = serp_result[:zhixin].each_with_index.map do |card, index|
    side_rank += 1
    link = card[:mu].to_s

    {
      type: 'Special',
      name: card[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:zhixin_weight]
    }
  end

  [entries, side_rank]
end
serp_url(keyword,page)
click to toggle source
# File lib/serper/baidu/crawler.rb, line 2
#
# Builds the Baidu search URL for a keyword and a result page.
#
# keyword - String search phrase; spaces become "+" and every other
#           character outside the form-encoding safe set is percent-encoded.
# page    - page number (anything responding to #to_i). Pages above 1 add a
#           "pn" offset of (page - 1) followed by a literal 0, e.g. page 3
#           gives pn=20; page 1 or lower adds no offset.
#
# Returns the URL String. The inputT parameter is a random integer in
# 1000..1999, so two calls with the same arguments may differ there.
def serp_url(keyword, page)
  # URI.escape no longer exists on Ruby 3.0+, and it left reserved characters
  # such as "&", "=", "#" and "+" untouched, so they leaked into the query
  # string. Form-encoding only the keyword keeps the rest of the URL verbatim.
  query = URI.encode_www_form_component(keyword)
  offset = page.to_i > 1 ? "&pn=#{page.to_i - 1}0" : ''

  "http://www.baidu.com/s?wd=#{query}#{offset}&ie=utf-8&inputT=#{1000 + rand(1000)}"
end
weight_config()
click to toggle source
# File lib/serper/baidu/weight.rb, line 2
#
# Returns the weighting configuration as a fresh Hash on every call:
#   :left_parts / :right_parts             - part names listed for each side
#   :left_part_weight / :right_part_weight - numeric weight for each side
#   :zhixin_weight, :baiduopen_weight,
#   :rank_special_weight, :con_ar_weight   - weights read by the
#                                            _weight_of_* methods
def weight_config
  layout = {
    left_parts: [:ads_top, :zhixin, :ranks],
    right_parts: [:con_ar, :ads_right]
  }
  side_weights = { left_part_weight: 8, right_part_weight: 2 }
  multipliers = {
    zhixin_weight: 3.5,
    baiduopen_weight: 3,
    rank_special_weight: 2,
    con_ar_weight: 2
  }

  layout.merge(side_weights).merge(multipliers)
end