class Serper::Baidu

Public Instance Methods

_parse_ads_right(file) click to toggle source
# File lib/serper/baidu/parser.rb, line 2
# Builds the ranked list of ad links matched by the selector
# `div#ec_im_container span a.c-icon.efc-cert` in `file[:doc]`.
#
# Matches are numbered 1, 2, 3... in the order the search returns them.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Array<Hash>] one `{url:, rank:}` hash per matched anchor; `url` is
#   '' whenever the anchor's `data-renzheng` payload cannot be decoded
def _parse_ads_right(file)
  anchors = file[:doc].search('div#ec_im_container span a.c-icon.efc-cert')

  anchors.each_with_index.map do |anchor, position|
    landing =
      begin
        payload = Serper::Helper.parse_data_click(anchor['data-renzheng'])
        target = Addressable::URI.parse(payload['identity']['a']['url'])
        target.query_values['wd'].to_s.sub('@v', '')
      rescue StandardError
        # Best effort: any failure while decoding the payload yields ''.
        ''
      end

    { url: landing, rank: position + 1 }
  end
end
_parse_ads_top(file) click to toggle source
# File lib/serper/baidu/parser.rb, line 14
# Builds the ranked list of ad links that appear at the top of
# `div#content_left`, i.e. before the first child whose `id` attribute
# converts to a positive integer.
#
# Within each leading child, anchors matching `span a.c-icon.efc-cert` are
# numbered 1, 2, 3... across the whole section.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Array<Hash>] one `{url:, rank:}` hash per matched anchor; `url` is
#   '' whenever the anchor's `data-renzheng` payload cannot be decoded.
#   Returns [] when the page has no `div#content_left`.
def _parse_ads_top(file)
  result = []
  container = file[:doc].search('div#content_left').first
  # Same guard as _parse_ranks: a page without the left column has no ads.
  return result if container.nil?

  rank = 0
  container.children.each do |child|
    # A positive numeric id marks the first numbered result; stop scanning there.
    break if child['id'].to_i > 0

    child.search('span a.c-icon.efc-cert').each do |anchor|
      rank += 1
      url =
        begin
          payload = Serper::Helper.parse_data_click(anchor['data-renzheng'])
          Addressable::URI.parse(payload['identity']['a']['url']).query_values['wd'].to_s.sub('@v', '')
        rescue StandardError
          # Best effort: any failure while decoding the payload yields ''.
          ''
        end
      result << { url: url, rank: rank }
    end
  end
  result
end
_parse_con_ar(file) click to toggle source
# File lib/serper/baidu/parser.rb, line 29
# Collects the `result-op` entries found directly under
# `div#content_right div#con-ar` in `file[:doc]`.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Array<Hash>] one `{tpl:, data_click:}` hash per child whose class
#   attribute contains 'result-op'; [] when the container is absent
def _parse_con_ar(file)
  container = file[:doc].search("div#content_right div#con-ar").first
  return [] if container.nil?

  container.children.each_with_object([]) do |node, entries|
    next unless node['class'].to_s.include?('result-op')

    entries << {
      tpl: node['tpl'],
      data_click: Serper::Helper.parse_data_click(node['data-click'])
    }
  end
end
_parse_ranks(file) click to toggle source

# Reports whether the third child node of `div#content_left` is a `script`
# element.
#
# NOTE(review): presumably this is how a "pinpaizhuanqu" block is detected on
# the page - confirm against the page markup this was written for.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Boolean] true only when a third child exists and is named 'script';
#   false when the container is missing or has fewer than three children
def _parse_pinpaizhuanqu(file)
  part = file[:doc].search("div[@id='content_left']").first
  return false if part.nil?

  third = part.children[2]
  # Out-of-range index yields nil; treat a short container as "not present".
  return false if third.nil?

  third.name == 'script'
end

# File lib/serper/baidu/parser.rb, line 49
# Extracts the numbered results from `div#content_left` in `file[:doc]`.
#
# Only children whose `id` attribute converts to an integer in 1...3000 are
# kept. For each one the method records a hash with these keys:
#   :rank, :result_op, :fk, :srcid, :tpl, :mu - read from the node attributes
#   :url       - href of the first `h3/a` link (nil when absent); hrefs that
#                point at `//www.baidu.com/link?` are resolved through
#                Serper::Crawler.get_rank_url and replaced by the response's
#                'location' header
#   :title     - text of the `h3` node(s) via Serper::Helper.get_content_safe
#   :content   - text of `div.c-abstract` via Serper::Helper.get_content_safe
#   :baiduopen - true when any link inside the node points at open.baidu.com
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Array<Hash>] one hash per numbered result, in document order;
#   [] when the page has no `div#content_left`
def _parse_ranks(file)
  result = []
  part = file[:doc].search("div[@id='content_left']").first
  return result if part.nil?

  part.children.each do |table|
    next if table.nil?
    id = table['id'].to_i
    next unless id > 0 && id < 3000

    r = {:rank => id}
    r[:result_op] = table['class'].to_s.include?('result-op')
    r[:fk] = table['fk']
    r[:srcid] = table['srcid']
    r[:tpl] = table['tpl']
    r[:mu] = table['mu']

    link = table.search('h3/a').first
    url = link.nil? ? nil : link['href']
    # `to_s` keeps a link without an href from raising; it simply stays nil.
    if url.to_s.include?('//www.baidu.com/link?')
      # Random pause before each outgoing request only (not for every result).
      sleep(rand)
      # Add a scheme only to protocol-relative hrefs; absolute ones go as-is.
      request_url = url.start_with?('//') ? "http:#{url}" : url
      url = Serper::Crawler.get_rank_url(request_url).headers['location']
    end
    r[:url] = url

    r[:title] = Serper::Helper.get_content_safe(table.search('h3'))
    r[:content] = Serper::Helper.get_content_safe(table.search('div.c-abstract'))

    r[:baiduopen] = table.search('a').any? do |anchor|
      anchor['href'].to_s.include?('open.baidu.com')
    end

    result << r
  end
  result
end
_parse_zhixin(file) click to toggle source

# Reads the first node matching `div[@tpl="right_weather"]` in `file[:doc]`.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Hash, nil] `{:title => ..., :week => ...}` where :title is the text
#   of `div.opr-weather-title` (via Serper::Helper.get_content_safe) and :week
#   is the href of the first `a.opr-weather-week` link, or nil when that link
#   is absent. Returns nil when no matching node exists.
def _parse_right_weather(file)
  widgets = file[:doc].search('div[@tpl="right_weather"]')
  return nil if widgets.nil?

  widget = widgets.first
  return nil if widget.nil?

  title = Serper::Helper.get_content_safe(widget.search('div.opr-weather-title'))

  # The week link may be missing; report nil instead of raising.
  week_link = widget.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']

  {:title => title, :week => week}
end

# File lib/serper/baidu/parser.rb, line 171
# Collects every node matching `div#content_left .result-zxl` in `file[:doc]`.
#
# @param file [Hash] must hold the parsed page under the :doc key
# @return [Array<Hash>] one hash per match with the node's `id`, `srcid`,
#   `fk`, `tpl` and `mu` attributes plus :data_click, the node's `data-click`
#   attribute run through Serper::Helper.parse_data_click
def _parse_zhixin(file)
  file[:doc].search("div#content_left .result-zxl").map do |node|
    {
      id: node['id'],
      srcid: node['srcid'],
      fk: node['fk'],
      tpl: node['tpl'],
      mu: node['mu'],
      data_click: Serper::Helper.parse_data_click(node['data-click'])
    }
  end
end
_weight_of_ads_right(serp_result,side_rank) click to toggle source
# File lib/serper/baidu/weight.rb, line 89
# Turns `serp_result[:ads_right]` into weighted entries.
#
# `side_rank` is incremented once per ad, and each entry's :side_weight is
# the reciprocal of the side rank it received.
#
# @param serp_result [Hash] must hold the parsed ads under :ads_right; each
#   ad is read through its :url and :rank keys
# @param side_rank [Integer] running position counter before these ads
# @return [Array(Array<Hash>, Integer)] the entries and the updated counter
def _weight_of_ads_right(serp_result, side_rank)
  entries = serp_result[:ads_right].map do |ad|
    side_rank += 1
    link = ad[:url].to_s

    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0 / side_rank.to_f
    }
  end

  [entries, side_rank]
end
_weight_of_ads_top(serp_result,side_rank) click to toggle source
# File lib/serper/baidu/weight.rb, line 69
# Turns `serp_result[:ads_top]` into weighted entries.
#
# `side_rank` is incremented once per ad, and each entry's :side_weight is
# the reciprocal of the side rank it received.
#
# @param serp_result [Hash] must hold the parsed ads under :ads_top; each ad
#   is read through its :url and :rank keys
# @param side_rank [Integer] running position counter before these ads
# @return [Array(Array<Hash>, Integer)] the entries and the updated counter
def _weight_of_ads_top(serp_result, side_rank)
  entries = serp_result[:ads_top].map do |ad|
    side_rank += 1
    link = ad[:url].to_s

    {
      type: 'SEM',
      name: '',
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: ad[:rank],
      side_weight: 1.0 / side_rank.to_f
    }
  end

  [entries, side_rank]
end
_weight_of_con_ar(serp_result,side_rank) click to toggle source
# File lib/serper/baidu/weight.rb, line 109
# Turns `serp_result[:con_ar]` into weighted entries of type 'Special'.
#
# `side_rank` is incremented once per item. Every entry gets the same
# :side_weight, `1.0 * weight_config[:con_ar_weight]`, and :part_rank is the
# item's 1-based position in the list.
#
# @param serp_result [Hash] must hold the parsed items under :con_ar; each
#   item is read through :tpl and `[:data_click]['mu']`
# @param side_rank [Integer] running position counter before these items
# @return [Array(Array<Hash>, Integer)] the entries and the updated counter
def _weight_of_con_ar(serp_result, side_rank)
  entries = serp_result[:con_ar].each_with_index.map do |item, index|
    side_rank += 1
    link = item[:data_click]['mu'].to_s

    {
      type: 'Special',
      name: item[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:con_ar_weight]
    }
  end

  [entries, side_rank]
end
_weight_of_ranks(serp_result,side_rank) click to toggle source

The _weight_of_* functions each return an array of hashes. Each hash includes: type, name, site, weight.

# File lib/serper/baidu/weight.rb, line 31
# Turns `serp_result[:ranks]` into weighted entries.
#
# Per item:
# - `side_rank` is incremented and the base weight is its reciprocal.
# - The entry is 'Special' when the item has a non-empty :mu or a truthy
#   :baiduopen, otherwise 'SEO'.
# - A non-empty :mu replaces :url as the address that is split into
#   site/subdomain/path.
# - 'Special' entries have the base weight multiplied by
#   `weight_config[:baiduopen_weight]` when :baiduopen is truthy, else by
#   `weight_config[:rank_special_weight]`.
#
# @param serp_result [Hash] must hold the parsed results under :ranks; each
#   item is read through :url, :mu, :tpl, :baiduopen and :rank
# @param side_rank [Integer] running position counter before these results
# @return [Array(Array<Hash>, Integer)] the entries and the updated counter
def _weight_of_ranks(serp_result, side_rank)
  entries = serp_result[:ranks].map do |item|
    side_rank += 1

    link = item[:url].to_s
    mu = item[:mu].to_s
    link = mu unless mu.empty?

    special = !mu.empty? || item[:baiduopen]
    kind = special ? 'Special' : 'SEO'

    site = Serper::Helper.parse_site(link)
    subdomain = Serper::Helper.parse_subdomain(link)
    path = Serper::Helper.parse_path(link)

    side_weight = 1.0 / side_rank.to_f
    if special
      factor_key = item[:baiduopen] ? :baiduopen_weight : :rank_special_weight
      side_weight = side_weight * weight_config[factor_key].to_f
    end

    {
      type: kind,
      name: item[:tpl].to_s,
      site: site,
      subdomain: subdomain,
      path: path,
      mu: mu,
      side_rank: side_rank,
      part_rank: item[:rank],
      side_weight: side_weight
    }
  end

  [entries, side_rank]
end
_weight_of_zhixin(serp_result,side_rank) click to toggle source
# File lib/serper/baidu/weight.rb, line 127
# Turns `serp_result[:zhixin]` into weighted entries of type 'Special'.
#
# `side_rank` is incremented once per item. Every entry gets the same
# :side_weight, `1.0 * weight_config[:zhixin_weight]`, and :part_rank is the
# item's 1-based position in the list.
#
# @param serp_result [Hash] must hold the parsed items under :zhixin; each
#   item is read through its :mu and :tpl keys
# @param side_rank [Integer] running position counter before these items
# @return [Array(Array<Hash>, Integer)] the entries and the updated counter
def _weight_of_zhixin(serp_result, side_rank)
  entries = serp_result[:zhixin].each_with_index.map do |item, index|
    side_rank += 1
    link = item[:mu].to_s

    {
      type: 'Special',
      name: item[:tpl],
      site: Serper::Helper.parse_site(link),
      subdomain: Serper::Helper.parse_subdomain(link),
      path: Serper::Helper.parse_path(link),
      side_rank: side_rank,
      part_rank: index + 1,
      side_weight: 1.0 * weight_config[:zhixin_weight]
    }
  end

  [entries, side_rank]
end
serp_url(keyword,page) click to toggle source
# File lib/serper/baidu/crawler.rb, line 2
# Builds the Baidu search URL for a keyword and result page.
#
# The keyword is form-encoded (spaces become '+', every other reserved or
# non-ASCII byte is percent-encoded) so that characters such as '&', '+' or
# '=' inside the keyword cannot alter the query string. `URI.escape`, used
# previously, left those characters untouched and no longer exists in
# Ruby 3.0+.
#
# @param keyword [String] search phrase
# @param page [Integer, String] 1-based page number; pages above 1 add
#   `&pn=<(page - 1) * 10>` to the URL
# @return [String] the URL, including a random `inputT` value in 1000..1999
def serp_url(keyword, page)
  query = URI.encode_www_form_component(keyword)
  page_number = page.to_i
  offset = page_number > 1 ? "&pn=#{page_number - 1}0" : ''
  "http://www.baidu.com/s?wd=#{query}#{offset}&ie=utf-8&inputT=#{1000 + rand(1000)}"
end
weight_config() click to toggle source
# File lib/serper/baidu/weight.rb, line 2
# Returns the weighting settings as a freshly built hash on every call.
#
# The `_weight_of_*` methods in this file read :zhixin_weight,
# :baiduopen_weight, :rank_special_weight and :con_ar_weight as
# multipliers for :side_weight.
# NOTE(review): :left_parts, :right_parts, :left_part_weight and
# :right_part_weight are not read by the methods shown here - presumably
# they are consumed by the code that combines the per-part results; confirm.
#
# @return [Hash{Symbol => Object}]
def weight_config
  left_parts = [:ads_top, :zhixin, :ranks]
  right_parts = [:con_ar, :ads_right]

  {
    left_parts: left_parts,
    right_parts: right_parts,
    left_part_weight: 8,
    right_part_weight: 2,
    zhixin_weight: 3.5,
    baiduopen_weight: 3,
    rank_special_weight: 2,
    con_ar_weight: 2
  }
end