class Tansaku::Crawler

Constants

DEFAULT_USER_AGENT
  The default value for the user_agent option; sent as the User-Agent request header unless overridden.

Attributes

additional_list[R]
  Path to an optional file of extra paths to probe, one per line.
base_uri[R]
  The parsed base URI that candidate paths are joined onto.
headers[R]
  Extra request headers merged into every request.
host[R]
  Optional override for the Host request header.
max_concurrent_requests[R]
  Maximum number of in-flight requests; defaults to Etc.nprocessors * 8.
type[R]
  Name of the built-in path list to crawl ("all" by default).
user_agent[R]
  Value sent as the User-Agent request header.

Public Class Methods

new( base_uri, additional_list: nil, headers: {}, host: nil, max_concurrent_requests: nil, type: "all", user_agent: DEFAULT_USER_AGENT )

Builds a crawler for base_uri. Raises ArgumentError if base_uri is not an http(s) URI, or if additional_list is given but does not name an existing file.
# File lib/tansaku/crawler.rb, line 26
def initialize(
  base_uri,
  additional_list: nil,
  headers: {},
  host: nil,
  max_concurrent_requests: nil,
  type: "all",
  user_agent: DEFAULT_USER_AGENT
)
  @base_uri = URI.parse(base_uri)
  raise ArgumentError, "Invalid URI" unless valid_uri?

  @additional_list = additional_list
  unless additional_list.nil?
    raise ArgumentError, "Invalid path" unless valid_path?
  end

  @headers = headers
  @host = host
  @max_concurrent_requests = max_concurrent_requests || Etc.nprocessors * 8
  @type = type
  @user_agent = user_agent
end
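
A minimal construction sketch; the target URL, header value and option choices below are hypothetical, and every keyword argument is optional:

  require "tansaku"

  crawler = Tansaku::Crawler.new(
    "https://staging.example.com/",                  # must be http or https
    headers: { "authorization" => "Bearer TOKEN" },  # merged into every request
    host: "www.example.com",                         # overrides the Host header
    max_concurrent_requests: 16                      # defaults to Etc.nprocessors * 8
  )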

Public Instance Methods

crawl()

Sends a HEAD request for every candidate path and returns a Hash mapping each URL that responded with an "online" status (see online? below) to that status. Requests run concurrently, bounded by max_concurrent_requests; connection errors and timeouts are skipped silently.
# File lib/tansaku/crawler.rb, line 50
def crawl
  results = {}
  Async do
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
    internet = Async::HTTP::Internet.new

    paths.each do |path|
      semaphore.async do
        url = url_for(path)
        res = internet.head(url, default_request_headers)

        results[url] = res.status if online?(res.status)
      rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
        next
      end
    end
    barrier.wait
  end
  results
end
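
A usage sketch (hypothetical target URL). crawl blocks until every path has been probed and returns a plain Hash, so the result can be inspected with ordinary Hash methods:

  crawler = Tansaku::Crawler.new("https://staging.example.com/")
  results = crawler.crawl
  results.each do |url, status|
    puts "#{status}\t#{url}"
  end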

Private Instance Methods

default_request_headers()

Merges the user-supplied headers with the host and user-agent entries, dropping nil values, and memoizes the result.
# File lib/tansaku/crawler.rb, line 100
def default_request_headers
  @default_request_headers ||= headers.merge({ "host" => host, "user-agent" => user_agent }.compact)
end
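
The merge itself is plain Hash#merge with Hash#compact, so a nil host simply drops out. A self-contained sketch with hypothetical values:

  { "x-api-key" => "secret" }.merge(
    { "host" => nil, "user-agent" => "Tansaku" }.compact
  )
  #=> { "x-api-key" => "secret", "user-agent" => "Tansaku" }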

online?(status)

Treats 200, 204, 301, 302, 307, 401 and 403 as evidence that the path exists.
# File lib/tansaku/crawler.rb, line 74
def online?(status)
  [200, 204, 301, 302, 307, 401, 403].include? status.to_i
end
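
For illustration, the statuses above divide as follows (online? is private, so this restates its membership test rather than calling it):

  [200, 301, 403, 404, 500].map { |s| [200, 204, 301, 302, 307, 401, 403].include?(s) }
  #=> [true, true, true, false, false]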

paths()

The candidate paths: the built-in list for type, plus any lines read from additional_list.
# File lib/tansaku/crawler.rb, line 86
def paths
  paths = Path.get_by_type(type)
  paths += File.readlines(File.expand_path(additional_list, __dir__)) if additional_list
  paths.map(&:chomp).compact
end
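
additional_list is a plain-text file with one path per line; trailing newlines are stripped by chomp. A hypothetical file:

  .env
  backup.zip
  staging/

Note that the file is read via File.expand_path(additional_list, __dir__), so a relative path is resolved against the crawler's own source directory; an absolute path is the safer choice.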

url_for(path)

Joins base_uri with the URL-escaped path and returns the result as a String.
# File lib/tansaku/crawler.rb, line 92
def url_for(path)
  URI(base_uri + CGI.escape(path)).to_s
end
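
A self-contained sketch of the join (hypothetical URL). Because CGI.escape also encodes slashes, a nested path becomes a single escaped segment:

  require "cgi"
  require "uri"

  URI(URI.parse("https://example.com/") + CGI.escape("robots.txt")).to_s
  #=> "https://example.com/robots.txt"

  URI(URI.parse("https://example.com/") + CGI.escape(".git/config")).to_s
  #=> "https://example.com/.git%2Fconfig"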

urls()

All candidate URLs, i.e. url_for applied to each entry of paths.
# File lib/tansaku/crawler.rb, line 96
def urls
  paths.map { |path| url_for path }
end

valid_path?()

True if additional_list names an existing file.
# File lib/tansaku/crawler.rb, line 82
def valid_path?
  File.exist?(additional_list)
end

valid_uri?()

True if base_uri uses the http or https scheme.
# File lib/tansaku/crawler.rb, line 78
def valid_uri?
  ["http", "https"].include? base_uri.scheme
end