class Abrupt::Crawler

Crawler for a website, including all followed URLs, that performs the Abrupt services on each crawled page. BETA!

Constants

SERVICE_MAPPING

Public Class Methods

new(uri, *args) click to toggle source
# File lib/abrupt/crawler.rb, line 28
# Builds a crawler for the given start URI.
#
# @param uri [String] start URI; parsed and normalized with Addressable
# @param args [Array] optional; only the first element is read, as an
#   options hash with the keys :services, :lang and :nofollow
def initialize(uri, *args)
  @uri = Addressable::URI.parse(uri).normalize
  # The options hash is optional: fall back to an empty one so that calling
  # with only a URI does not fail on nil[:services].
  opts = args.first || {}
  @options = {
      lang: 'en',
      services: %w(r i s c l p),
      depth: '3',
      word_limit: 20
  }
  # Only :services and :lang can be overridden by the caller.
  @options[:services] = opts[:services] if opts[:services]
  @options[:lang] = opts[:lang] if opts[:lang]
  # Links are followed unless :nofollow is truthy.
  @follow_links = !opts[:nofollow]
  @result = {}
end

Public Instance Methods

canonize_html(html) click to toggle source
# File lib/abrupt/crawler.rb, line 109
# Runs Service::AbsoluteUrl over +html+ with the crawler's base URL and
# returns whatever its #execute returns.
#
# @param html [String] the markup handed to Service::AbsoluteUrl
# @return the result of Service::AbsoluteUrl#execute (presumably the markup
#   with absolute URLs; confirm against that service)
def canonize_html(html)
  baseurl = "#{@uri.scheme}://#{@uri.host}"
  # Keep an explicit port (e.g. localhost:3000); without it the base URL
  # would point at the scheme's default port. A nil port adds nothing.
  baseurl += ":#{@uri.port}" if @uri.port
  converter = Service::AbsoluteUrl.new(html, baseurl: baseurl)
  converter.execute
end
crawl(uri = nil) click to toggle source

Crawls a page, saves the service results in the result hash and, unless following links is disabled, recursively crawls the same-host URIs found on that page.

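@param uri [String] the URI to crawl (defaults to the crawler's start URI) @return [JSON] the accumulated results, passed through Service::Base.transform_hash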

# File lib/abrupt/crawler.rb, line 48
# Crawls one page and, when link following is enabled, every same-host link
# found on it. Results are collected per URI in @result.
#
# @param uri [String, nil] the URI to crawl; when omitted, the crawler's own
#   URI is used after String#append_last_slash has been applied to it
# @return the value of Service::Base.transform_hash for the result hash
def crawl(uri = nil)
  Abrupt.log '.'
  target = uri || @uri.to_str.append_last_slash
  # A URI that already has an entry was visited before: nothing to fetch.
  return Service::Base.transform_hash(@result) if @result[target]

  page = fetch_html(target)
  # Register the page even when fetching failed, so it is not retried.
  @result[target] = {}
  @result[target] = perform_services(page) if page
  if @follow_links
    uris_with_same_host(target).uniq.each { |link| crawl(link) }
  end
  Service::Base.transform_hash(@result)
end
fetch_html(uri) click to toggle source
# File lib/abrupt/crawler.rb, line 72
# Fetches +uri+ with RestClient and returns the body if it is HTML.
#
# @param uri [String] the URI to fetch; surrounding whitespace is stripped
#   and the URI is normalized before the request
# @return [String, false, nil] the response body for a 200...400 response
#   whose content type passes #html?; false for any other status code; nil
#   for a non-HTML response or when parsing or fetching raised an error
def fetch_html(uri)
  # Parsing is covered by the method-level rescue so that a malformed URI is
  # reported like any other fetch failure instead of aborting the caller.
  uri = Addressable::URI.parse(uri.strip).normalize.to_str
  response = ::RestClient.get uri, accept: :html
  content_type = response.headers[:content_type].to_s
  case response.code
  when 200...400
    response.to_str if html?(content_type)
  else
    false
  end
rescue => e
  puts "error fetching html on #{uri}"
  puts e
  nil
end
html?(content_type) click to toggle source
# File lib/abrupt/crawler.rb, line 90
# Tells whether a Content-Type header value denotes HTML.
#
# Media types are case-insensitive, so the value is lower-cased before the
# comparison; nil and surrounding whitespace are tolerated.
#
# @param content_type [String, nil] raw Content-Type header value
# @return [Boolean] true when the value starts with "text/html"
def html?(content_type)
  content_type.to_s.strip.downcase.start_with?('text/html')
end
init_services_hash(html) click to toggle source
# File lib/abrupt/crawler.rb, line 98
# Instantiates every configured service for the given markup.
#
# Each entry of @options[:services] is looked up in SERVICE_MAPPING; the
# service receives +html+ plus the subset of @options named by its
# .available_options.
#
# @param html [String] markup passed to each service constructor
# @return [Hash] service instances keyed by their class's .keyname
def init_services_hash(html)
  @options[:services].each_with_object({}) do |code, services|
    service_class = SERVICE_MAPPING[code.to_sym]
    # Hand over only the options this service declares; the value is looked
    # up under the symbolized name, the key stays as declared.
    service_opts = service_class.available_options.each_with_object({}) do |name, picked|
      picked[name] = @options[name.to_sym]
    end
    instance = service_class.new(html, service_opts)
    services[service_class.keyname] = instance
  end
end
perform_services(html) click to toggle source
# File lib/abrupt/crawler.rb, line 115
# Runs all configured services on a page.
#
# The markup first goes through #canonize_html, then every service built by
# #init_services_hash is executed.
#
# @param html [String] the fetched markup
# @return [Hash{Symbol => Object}] each service's #execute result, keyed by
#   the symbolized service key
def perform_services(html)
  canonical = canonize_html(html)
  init_services_hash(canonical).each_with_object({}) do |(json_field, service), collected|
    collected[json_field.to_sym] = service.execute
  end
end
same_host?(uri) click to toggle source
# File lib/abrupt/crawler.rb, line 94
# Tells whether +uri+ points at the same host as the crawler's own URI.
#
# Host names are case-insensitive, so both sides are lower-cased before the
# comparison. A blank or unparsable URI is never on the same host.
#
# @param uri [String, nil] the URI (typically a link href) to check
# @return [Boolean]
def same_host?(uri)
  return false if uri.to_s.empty?

  host = Addressable::URI.parse(uri).host
  host.to_s.downcase.eql?(@uri.host.to_s.downcase)
rescue Addressable::URI::InvalidURIError
  # A malformed href must not abort the crawl; treat it as foreign.
  false
end
uris_with_same_host(uri) click to toggle source

TODO: consider making this a class method.

# File lib/abrupt/crawler.rb, line 62
# Collects the hrefs of a crawled page's 'a' links that pass #same_host?.
#
# Reads the entry stored for +uri+ in @result, which must already exist.
#
# @param uri [String] key of an already crawled page in @result
# @return [Array] the matching href values in page order; empty when the
#   page has no :link entry or no 'a' entry beneath it
def uris_with_same_host(uri)
  page_links = @result[uri][:link]
  anchors = page_links && page_links['a']
  return [] unless anchors

  anchors.to_a.each_with_object([]) do |anchor, found|
    href = anchor['href']
    # Skip nil hrefs even if the host check accepts them.
    found << href if same_host?(href) && !href.nil?
  end
end