class Meiriyigua::BaiduCrawl

Public Class Methods

new() click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 8
# Creates the crawler and keeps the agent returned by
# CrawlClient.create_agent in @agent; get_intro issues its requests
# through that agent.
def initialize
  @agent = CrawlClient.create_agent
end

Public Instance Methods

get_intro(title) click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 31
# Searches Baidu for +title+ and returns the abstracts of the results.
#
# The Baidu home page is loaded through @agent, the form named "f1" is
# filled in (field "wd") and submitted, and the text of every
# "div.c-container div.c-abstract" node on the result page is collected.
# NOTE(review): the calls used here (get / form_with / field_with / submit /
# search) look like the Mechanize API - confirm against
# CrawlClient.create_agent.
#
# title - the search keyword.
#
# Returns a String with one "提示N\r\n <abstract>\r\n\r\n" entry per abstract
# (N starts at 1), or "" when there are no abstracts. Any StandardError is
# logged to standard output with its backtrace and "" is returned, so the
# caller only ever has to check for an empty string.
def get_intro(title)
  page = @agent.get('http://www.baidu.com/')
  sleep 1 # fixed pause between loading the home page and submitting the search

  search_form = page.form_with(:name => "f1")
  # Fail with a descriptive error instead of a NoMethodError on nil when the
  # page layout no longer contains the expected form or input.
  raise "未找到百度搜索表单 f1" unless search_form

  keyword_field = search_form.field_with(:name => "wd")
  raise "未找到百度搜索输入框 wd" unless keyword_field

  keyword_field.value = title
  search_results = @agent.submit(search_form)

  abstracts = search_results.search('div.c-container div.c-abstract')
  # Build the text without mutating a string literal.
  abstracts.each_with_index.map { |node, i| "提示#{i + 1}\r\n #{node.text}\r\n\r\n" }.join
rescue StandardError => e
  puts "抓取百度简介出错了 #{e.class} #{e.message}\n#{e.backtrace.join("\n")}"
  ""
end
run() click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 12
# Fetches and stores a Baidu intro for every UrlRecord whose baidu_at is nil.
#
# For each record the title of its page record is passed to get_intro,
# followed by CrawlClient.random_sleep. A non-empty intro is assigned to the
# page record and the URL record is stamped with the current time. One line
# per record is printed to standard output, ending in "成功" or "失败".
#
# The record is reported as successful only when both saves succeed.
# NOTE(review): this assumes save returns a falsy value on failure, as in
# DataMapper / ActiveRecord - confirm for the ORM in use.
def run
  UrlRecord.all(:baidu_at => nil).each do |url_record|
    page_record = url_record.page_record
    baidu_intro = get_intro(page_record.title)
    CrawlClient.random_sleep

    if baidu_intro.empty?
      stored = false
    else
      page_record.baidu_intro = baidu_intro
      url_record.baidu_at = Time.now
      # Persist the timestamp only after the intro itself has been saved;
      # otherwise the record would never be picked up again although its
      # intro was lost.
      stored = page_record.save && url_record.save
    end

    puts "抓取百度简介 #{url_record.url} #{stored ? '成功' : '失败'}"
  end
end