class Abrupt::Crawler
Crawler
Crawls a website, including all followed URLs, and performs the Abrupt services on each page. BETA!
Constants
- SERVICE_MAPPING
Public Class Methods
new(uri, *args)
click to toggle source
# File lib/abrupt/crawler.rb, line 28
# Sets up a crawler for the given start uri.
#
# @param uri [String] start uri; parsed and normalized with Addressable::URI
# @param args [Array] optional; only the first element is read, as an options
#   hash with the keys :services, :lang and :nofollow
def initialize(uri, *args)
  @uri = Addressable::URI.parse(uri).normalize
  # Without an options hash args.first is nil; fall back to an empty hash so
  # the lookups below do not raise NoMethodError.
  opts = args.first || {}
  @options = {
    lang: 'en',
    services: %w(r i s c l p),
    depth: '3',
    word_limit: 20
  }
  @options[:services] = opts[:services] if opts[:services]
  @options[:lang] = opts[:lang] if opts[:lang]
  # Links are followed unless :nofollow is set.
  @follow_links = !opts[:nofollow]
  @result = {}
end
Public Instance Methods
canonize_html(html)
click to toggle source
# File lib/abrupt/crawler.rb, line 109
# Runs Service::AbsoluteUrl over html, using the crawled site as base url.
#
# The base url consists of the scheme and host of @uri plus its port when one
# is explicitly present.
#
# @param html [String] the html handed to Service::AbsoluteUrl
# @return whatever Service::AbsoluteUrl#execute returns
def canonize_html(html)
  baseurl = "#{@uri.scheme}://#{@uri.host}"
  # Keep an explicit port; without it a site served on e.g. :3000 would get a
  # base url that points at the default port.
  baseurl += ":#{@uri.port}" if @uri.port
  Service::AbsoluteUrl.new(html, baseurl: baseurl).execute
end
crawl(uri = nil)
click to toggle source
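Crawls a page, stores the service results in the result hash and, when link following is enabled, recursively crawls the same-host links found on that page. Returns the whole result hash passed through Service::Base.transform_hash.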
@param uri [String] the uri to crawl @return [JSON] result
# File lib/abrupt/crawler.rb, line 48
# Crawls one page and, when link following is enabled, the same-host links
# found on it. Pages that already have an entry in @result are not fetched
# again.
#
# @param uri [String, nil] the uri to crawl; defaults to the start uri with
#   append_last_slash applied
# @return the result of Service::Base.transform_hash for the collected results
def crawl(uri = nil)
  Abrupt.log '.'
  uri ||= @uri.to_str.append_last_slash

  unless @result[uri]
    page = fetch_html(uri)
    # Register the page before recursing so it is never visited twice.
    @result[uri] = {}
    @result[uri] = perform_services(page) if page
    # new_uris.select! { |url| same_host?(url) } # filter
    if @follow_links
      uris_with_same_host(uri).uniq.each { |link| crawl(link) }
    end
  end

  Service::Base.transform_hash(@result)
end
fetch_html(uri)
click to toggle source
# File lib/abrupt/crawler.rb, line 72
# Fetches uri with RestClient and returns the body when it is an html document.
#
# @param uri [String] the uri to fetch; stripped and normalized first
# @return [String, false, nil] the body for an html response with a status in
#   200...400, false for any other status code, nil for non-html content or
#   when normalizing or fetching raised an error
def fetch_html(uri)
  # Normalizing happens inside the rescued body so that a malformed uri is
  # reported like any other fetch error instead of aborting the caller.
  uri = Addressable::URI.parse(uri.strip).normalize.to_str
  response = ::RestClient.get uri, accept: :html
  content_type = response.headers[:content_type].to_s
  case response.code
  when 200...400
    response.to_str if html?(content_type)
  else
    false
  end
rescue => e
  puts "error fetching html on #{uri}"
  puts e
  nil
end
html?(content_type)
click to toggle source
# File lib/abrupt/crawler.rb, line 90
# Tells whether a Content-Type header value denotes an html document.
#
# Media types are case-insensitive, so the value is stripped and lower-cased
# before it is compared; nil counts as not html.
#
# @param content_type [String, nil] the Content-Type header value
# @return [Boolean] true when the value starts with text/html
def html?(content_type)
  content_type.to_s.strip.downcase.start_with?('text/html')
end
init_services_hash(html)
click to toggle source
# File lib/abrupt/crawler.rb, line 98
# Instantiates every service selected in @options[:services] for html.
# Each service receives only the options its class lists in
# available_options, looked up in @options by symbol.
#
# @param html [String] passed to each service constructor
# @return [Hash] service instances keyed by the keyname of their class
def init_services_hash(html)
  @options[:services].each_with_object({}) do |code, services|
    klass = SERVICE_MAPPING[code.to_sym]
    settings = klass.available_options.each_with_object({}) do |name, picked|
      picked[name] = @options[name.to_sym]
    end
    instance = klass.new(html, settings)
    services[klass.keyname] = instance
  end
end
perform_services(html)
click to toggle source
# File lib/abrupt/crawler.rb, line 115
# Runs all configured services on html after passing it through
# canonize_html.
#
# @param html [String] the fetched page
# @return [Hash] the execute result of each service, keyed by the symbolized
#   key that init_services_hash assigned to it
def perform_services(html)
  canonical = canonize_html(html)
  init_services_hash(canonical).each_with_object({}) do |(field, service), collected|
    collected[field.to_sym] = service.execute
  end
end
same_host?(uri)
click to toggle source
# File lib/abrupt/crawler.rb, line 94
# Tells whether uri points to the same host as the crawler's start uri.
#
# Host names are compared case-insensitively. Blank values and uris that
# Addressable rejects as invalid count as not being on the same host.
#
# @param uri [String, nil] the uri (typically a link href) to check
# @return [Boolean]
def same_host?(uri)
  return false if uri.to_s.empty?

  host = Addressable::URI.parse(uri).host
  base = @uri.host
  # A missing host on either side can only match another missing host.
  return host.eql?(base) if host.nil? || base.nil?

  host.downcase.eql?(base.downcase)
rescue Addressable::URI::InvalidURIError
  # A malformed link must not abort link filtering.
  false
end
uris_with_same_host(uri)
click to toggle source
TODO: maybe as class method
# File lib/abrupt/crawler.rb, line 62
# Collects the hrefs stored under [:link]['a'] in the result of uri that
# same_host? accepts.
#
# @param uri [String] key of an already crawled page in @result
# @return [Array] the matching hrefs; empty when the page has no such entries
def uris_with_same_host(uri)
  link_data = @result[uri][:link]
  anchors = link_data && link_data['a']
  return [] unless anchors

  anchors.to_a.map { |anchor| anchor['href'] }.select { |href| same_host?(href) }
end