class Meiriyigua::BaiduCrawl
Public Class Methods
new()
click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 8
#
# Sets up the crawler by storing the object returned from
# CrawlClient.create_agent in @agent; get_intro issues all of its HTTP
# requests through that agent.
def initialize
  @agent = CrawlClient.create_agent
end
Public Instance Methods
get_intro(title)
click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 31
#
# Searches Baidu for +title+ and returns the result abstracts as one string.
#
# Every abstract found under 'div.c-container div.c-abstract' is rendered as
# "提示<n>\r\n <abstract text>\r\n\r\n" (n starts at 1) and the pieces are
# concatenated in page order.
#
# title - the search query; it is written into the "wd" field of the form
#         named "f1" on the Baidu home page.
#
# Returns the concatenated abstracts, or "" when the title is blank, when no
# abstract matched, or when any StandardError was raised while crawling (the
# error class, message and backtrace are printed to stdout in that case).
#
# NOTE(review): @agent is presumably a Mechanize agent (get / form_with /
# field_with / submit / search) -- confirm against CrawlClient.create_agent.
def get_intro(title)
  # A blank query cannot yield abstracts; skip the two HTTP requests and the
  # one-second pause instead of searching for nothing.
  return "" if title.to_s.strip.empty?

  page = @agent.get('http://www.baidu.com/')
  sleep 1 # fixed pause between loading the home page and submitting the search
  search_form = page.form_with(:name => "f1")
  search_form.field_with(:name => "wd").value = title
  search_results = @agent.submit(search_form)

  # Build the result with map/join rather than appending to a "" literal, so
  # the method keeps working if the file ever enables frozen string literals.
  abstracts = search_results.search('div.c-container div.c-abstract')
  abstracts.each_with_index.map { |node, i| "提示#{i + 1}\r\n #{node.text}\r\n\r\n" }.join
rescue StandardError => e
  # Best effort: report the failure and let the caller treat "" as "no intro".
  puts "抓取百度简介出错了 #{e.class} #{e.message}\n#{e.backtrace.join("\n")}"
  ""
end
run()
click to toggle source
# File lib/meiriyigua/baidu_crawl.rb, line 12
#
# Fetches a Baidu intro for every UrlRecord whose baidu_at is nil.
#
# For each record the title of its page_record is passed to get_intro,
# followed by CrawlClient.random_sleep. A non-empty intro is written to
# page_record.baidu_intro and saved; only when that save succeeds is
# url_record.baidu_at set to the current time and saved. One line
# "抓取百度简介 <url> 成功" or "抓取百度简介 <url> 失败" is printed per record.
#
# A record is reported as 成功 only when both saves return a truthy value, so
# a record whose intro could not be stored keeps baidu_at == nil and is picked
# up again on the next run instead of being silently marked as done.
#
# NOTE(review): assumes #save returns a falsy value on failure (the
# DataMapper / ActiveRecord convention) -- confirm for the ORM in use.
def run
  UrlRecord.all(:baidu_at => nil).each do |url_record|
    page_record = url_record.page_record
    baidu_intro = get_intro(page_record.title)
    CrawlClient.random_sleep

    stored = false
    unless baidu_intro.empty?
      page_record.baidu_intro = baidu_intro
      # Stamp the URL only after the intro itself has been persisted.
      if page_record.save
        url_record.baidu_at = Time.now
        stored = url_record.save
      end
    end

    puts "抓取百度简介 #{url_record.url} #{stored ? '成功' : '失败'}"
  end
end