class SpidrCLI::Options
Constants
- METHODS
Spidr methods
Attributes
columns[R]
content_types[R]
header[R]
spidr_method[R]
spidr_options[R]
url[R]
usage_doc[R]
Public Class Methods
new(argv = ARGV)
click to toggle source
# File lib/spidr_cli/options.rb, line 11 def initialize(argv = ARGV) @url = nil @columns = %w[url] @content_types = nil @header = false @usage_doc = nil @spidr_method = 'site' @spidr_options = {} parse_options(argv) end
Private Instance Methods
option_hash(value)
click to toggle source
# File lib/spidr_cli/options.rb, line 220 def option_hash(value) value.map { |v| v.split('=') }.to_h end
parse_options(argv)
click to toggle source
# File lib/spidr_cli/options.rb, line 25 def parse_options(argv) proxy_options = {} @spidr_method = METHODS[argv.first] if METHODS.key?(argv.first) OptionParser.new do |parser| @usage_doc = parser.to_s parser.banner = 'Usage: spidr [<method>] [options] <url>' parser.default_argv = argv parser.on('--columns=[val1,val2]', Array, 'Columns in output') do |value| @columns = value || columns end parser.on('--content-types=[val1,val2]', Array, 'Formats to output (html, javascript, css, json, ..)') do |value| @content_types = value end parser.on('--[no-]header', 'Include the header') do |value| @header = value end # Spidr::Sanitizers options parser.on('--[no-]strip-fragments', 'Specifies whether the Agent will strip URI fragments (default: true)') do |value| spidr_options[:strip_fragments] = value end parser.on('--[no-]strip-query', 'Specifies whether the Agent will strip URI query (default: false)') do |value| spidr_options[:strip_query] = value end # Spidr::Filters options parser.on('--schemes=[http,https]', Array, 'Only spider links with certain scheme') do |value| spidr_options[:schemes] = value if value end parser.on('--host=[example]', String, 'Only spider links on certain host') do |value| spidr_options[:host] = value if value end # NOTE: --hosts is overriden # @see https://github.com/postmodern/spidr/blob/master/lib/spidr/agent.rb#L273 parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts (ignored unless method is "start_at")') do |value| spidr_options[:hosts] = to_option_regexp_array(value) if value end # NOTE: --ignore-hosts is overriden # @see https://github.com/postmodern/spidr/blob/master/lib/spidr/agent.rb#L273 parser.on('--ignore-hosts=[www.example.com]', Array, 'Do not spider links on certain hosts (ignored unless method is "start_at")') do |value| spidr_options[:ignore_hosts] = to_option_regexp_array(value) if value end parser.on('--ports=[80, 443]', Array, 'Only spider links on certain ports') do |value| spidr_options[:ports] = to_option_int_array(value) if value end parser.on('--ignore-ports=[8000, 8080, 3000]', Array, 'Do not spider links on certain ports') do |value| spidr_options[:ignore_ports] = to_option_int_array(value) if value end parser.on('--links=[/blog/]', Array, 'Only spider links on certain link patterns') do |value| spidr_options[:links] = to_option_regexp_array(value) if value end parser.on('--ignore-links=[/blog/]', Array, 'Do not spider links on certain link patterns') do |value| spidr_options[:ignore_links] = to_option_regexp_array(value) if value end parser.on('--urls=[/blog/]', Array, 'Only spider links on certain urls') do |value| spidr_options[:urls] = to_option_regexp_array(value) if value end parser.on('--ignore-urls=[/blog/]', Array, 'Do not spider links on certain urls') do |value| spidr_options[:ignore_urls] = to_option_regexp_array(value) if value end parser.on('--exts=[htm]', Array, 'Only spider links on certain extensions') do |value| spidr_options[:exts] = to_option_regexp_array(value) if value end parser.on('--ignore-exts=[cfm]', Array, 'Do not spider links on certain extensions') do |value| spidr_options[:ignore_exts] = to_option_regexp_array(value) if value end # Spidr::Agent options parser.on('--open-timeout=val', Integer, 'Open timeout') do |value| spidr_options[:open_timeout] = value end parser.on('--read-timeout=val', Integer, 'Read timeout') do |value| spidr_options[:read_timeout] = value end parser.on('--ssl-timeout=val', Integer, 'SSL timeout') do |value| spidr_options[:ssl_timeout] = value end parser.on('--continue-timeout=val', Integer, 'Continue timeout') do |value| spidr_options[:continue_timeout] = value end parser.on('--keep-alive-timeout=val', Integer, 'Keep alive timeout') do |value| spidr_options[:keep_alive_timeout] = value end parser.on('--proxy-host=val', String, 'The host the proxy is running on') do |value| proxy_options[:host] = value end parser.on('--proxy-port=val', Integer, 'The port the proxy is running on') do |value| proxy_options[:port] = value end parser.on('--proxy-user=val', String, 'The user to authenticate with the proxy') do |value| proxy_options[:user] = value end parser.on('--proxy-password=val', String, 'The password to authenticate with the proxy') do |value| proxy_options[:password] = value end parser.on('--default-headers=[key1=val1,key2=val2]', Array, 'Default headers to set for every request') do |value| spidr_options[:default_headers] = option_hash(value || []) end parser.on('--host-header=val', String, 'The HTTP Host header to use with each request') do |value| spidr_options[:host_header] = value end parser.on('--host-headers=[key1=val1,key2=val2]', Array, 'The HTTP Host headers to use for specific hosts') do |value| spidr_options[:host_headers] = option_hash(value || []) end parser.on('--user-agent=val', String, 'The User-Agent string to send with each requests') do |value| spidr_options[:user_agent] = value end parser.on('--referer=val', String, 'The Referer URL to send with each request') do |value| spidr_options[:referer] = value end parser.on('--delay=val', Integer, 'The number of seconds to pause between each request') do |value| spidr_options[:delay] = value end parser.on('--queue=[val1,val2]', Array, 'The initial queue of URLs to visit') do |value| spidr_options[:queue] = value end parser.on('--history=[val1,val2]', Array, 'The initial list of visited URLs') do |value| spidr_options[:history] = value end parser.on('--limit=val', Integer, 'The maximum number of pages to visit') do |value| spidr_options[:limit] = value end parser.on('--max-depth=val', Integer, 'The maximum link depth to follow') do |value| spidr_options[:max_depth] = value end parser.on('--[no-]robots', 'Respect Robots.txt') do |value| spidr_options[:robots] = value end # Boilerplate CLI parser.on('-h', '--help', 'How to use') do puts parser exit end parser.on_tail('--version', 'Show version') do puts "Spidr version #{Spidr::VERSION} (SpidrCLI version #{SpidrCLI::VERSION})" exit end end.parse! if @spidr_method != 'start_at' && (spidr_options.key?(:hosts) || spidr_options.key?(:ignore_hosts)) raise(ArgumentError, '--hosts and --ignore-hosts argument are only valid if spidr method is "start_at"') end spidr_options[:proxy] = proxy_options unless proxy_options.empty? @url = argv.last end
to_option_int_array(value)
click to toggle source
# File lib/spidr_cli/options.rb, line 212 def to_option_int_array(value) value.map { |v| Integer(v) } end
to_option_regexp_array(value)
click to toggle source
# File lib/spidr_cli/options.rb, line 216 def to_option_regexp_array(value) value.map { |v| Regexp.new(v) } end