class Apollo::CrawlerProgram
Public Class Methods
console_table(headings, rows)
click to toggle source
Show tabular data in form of CLI table
# File lib/apollo_crawler/program/crawler_program.rb, line 98 def self.console_table(headings, rows) rows = rows.map do |o| i = o.new res = [] headings.each do |h| res << i.instance_eval(h) end res end table = Terminal::Table.new :headings => headings, :rows => rows puts table end
get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 80 def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES) res = modules.map do |name| Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")].each do |path| path end end res.flatten.sort end
new()
click to toggle source
Initializer - Constructor
Calls superclass method
Apollo::BaseProgram::new
# File lib/apollo_crawler/program/crawler_program.rb, line 70 def initialize super @options = {} at_exit { at_exit_handler } end
register_modules(modules = APOLLO_CRAWLER_MODULES)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 90 def self.register_modules(modules = APOLLO_CRAWLER_MODULES) get_modules_paths(modules).each do |file| # puts "Adding module '#{file}'" require file end end
Public Instance Methods
at_exit_handler()
click to toggle source
At Exit handler
# File lib/apollo_crawler/program/crawler_program.rb, line 511 def at_exit_handler() # if(@options[:verbose]) # puts "Running at_exit_handler" # end # TODO: Flush caches # TODO: End gracefully # Force exit event machine # EventMachine.stop end
generate_crawler(name, url = nil, matcher = nil, options = @options)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 265 def generate_crawler(name, url = nil, matcher = nil, options = @options) name = name.titleize.gsub(" ", "") if(@options[:verbose]) puts "Generating new crawler '#{name}'" end template_path = RbConfig::CRAWLER_TEMPLATE_PATH puts template_path if(File.exists?(template_path) == false) puts "Template file '#{template_path}' does not exists!" return -1 end if(options[:verbose]) puts "Using template '#{template_path}'" end unless(options[:silent]) dest_path = File.join(Dir.pwd, "#{name.underscore}.rb") end url = url ? url : "http://some-url-here" matcher = matcher ? matcher : "//a" placeholders = { "CRAWLER_CLASS_NAME" => name, "CRAWLER_NAME" => name.titleize, "CRAWLER_URL" => url, "CRAWLER_MATCHER" => matcher } puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'" File.open(template_path, 'r') do |tmpl| File.open(dest_path, 'w') do |crawler| while line = tmpl.gets #puts line placeholders.each do |k, v| line.gsub!(k, v) end crawler.puts line end end end return 0 end
get_crawlers(args, options = @options)
click to toggle source
Get crawlers passd to cmd-line
# File lib/apollo_crawler/program/crawler_program.rb, line 363 def get_crawlers(args, options = @options) crawlers = [] if(args.length > 0) crawlers << args.shift end if(options[:run_all]) crawlers = @crawlers.keys end return crawlers end
get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 315 def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses) res = [] crawlers.each do |crawler| next if crawler.nil? crawler_classes.each do |klass| next if klass.nil? crawler_name = crawler.to_s.split('::').last.downcase klass_name = klass.to_s.split('::').last.downcase.gsub("crawler", "") # puts "#{crawler_name} => #{klass_name}" if crawler_name == klass_name || crawler_name == "#{klass_name}crawler" res << klass break end end end res end
init_additional_crawlers(dirs)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 401 def init_additional_crawlers(dirs) # puts "Initializing aditional crawlers ..." dirs.each do |dir| if(@options[:verbose]) puts "Registering additional crawler dir '#{dir}'" end Dir.glob("#{dir}/*.rb").each do |f| if(@options[:verbose]) puts "Registering crawler '#{f}'" end require f end end end
init_options()
click to toggle source
Initialize command-line options
# File lib/apollo_crawler/program/crawler_program.rb, line 114 def init_options() @options[:env] = Apollo::ENV @options[:doc_limit] = nil @options[:verbose] = false @options[:version] = nil @options[:cache_dirs] = [ RbConfig::CACHES_DIR ] @options[:crawler_dirs] = [ RbConfig::CRAWLERS_DIR ] @options[:formatter_dirs] = [ RbConfig::FORMATTERS_DIR ] @options[:generate_crawler] = nil end
init_options_parser()
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 136 def init_options_parser() @optparser = OptionParser.new do | opts | opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]" opts.separator "" opts.separator "Specific options:" # This displays the help screen, all programs are # assumed to have this option. opts.on('-h', '--help', 'Display this screen') do @options[:show_help] = true end opts.on('-a', '--all', 'Run all crawlers') do @options[:run_all] = true end opts.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") do |name| @options[:env] = name end opts.on('-f', '--format [NAME]', "Formatter used") do |name| @options[:formatter] = name end opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name| @options[:generate_crawler] = name end opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path| @options[:crawler_dirs] << path init_additional_crawlers([path]) end opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count| @options[:doc_limit] = count.to_i end opts.on('-v', '--verbose', 'Enable verbose output') do @options[:verbose] = true end opts.on('-V', '--version', 'Show version info') do @options[:version] = true end opts.on('-l', '--list-crawlers', 'List of crawlers') do @options[:list_crawlers] = true end opts.on(nil, '--list-formatters', 'List of formatters available') do @options[:list_formatters] = true end # opts.on('-q', '--query [QUERY]', 'Query crawler database for phrase') do |query| # @options[:query] = query # end opts.on('-s', '--silent', 'Silent mode - do not print processed document') do @options[:silent] = true end end end
init_program(args)
click to toggle source
Init program
# File lib/apollo_crawler/program/crawler_program.rb, line 418 def init_program(args) init_options() init_options_parser() CrawlerProgram.register_modules() parse_options(args) init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES) load_config_file() res = process_options(args) if res != nil return res end return nil end
init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 376 def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, options = @options) dirs.each do |dir| if(File.directory?(dir) == false) if(options[:verbose]) puts "Creating '#{dir}'" end FileUtils.mkpath(dir) end end init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb')) end
init_user_config_file(config_path, dest_path, options = @options)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 390 def init_user_config_file(config_path, dest_path, options = @options) # Create user config file if(File.exists?(config_path) && File.exists?(dest_path) == false) if(options[:verbose]) puts "Creating user config file '#{config_path}' => '#{dest_path}'" end FileUtils.cp(config_path, dest_path) end end
load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
click to toggle source
Load global options first Merge it with local options (if they exists)
# File lib/apollo_crawler/program/crawler_program.rb, line 247 def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH) if(File.exists?(config)) if(@options[:verbose]) puts "Loading config '#{config}'" end require config else if(@options[:verbose]) # TODO: Add support for initial rake task generation # Something like this: # rake config:init # Initializes config files with # their defaults (if not exists already) puts "Default config does not exist, skipping - '#{config}'" end end end
parse_options(args = ARGV)
click to toggle source
Parse the options passed to command-line
# File lib/apollo_crawler/program/crawler_program.rb, line 202 def parse_options(args = ARGV) # Parse the command-line. Remember there are two forms # of the parse method. The 'parse' method simply parses # ARGV, while the 'parse!' method parses ARGV and removes # any options found there, as well as any parameters for # the options. What's left is the list of files to resize. @optparser.parse!(args) end
process_docs_handler(docs, options = options, formatter)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 492 def process_docs_handler(docs, options = options, formatter) if(docs.nil?) return docs end if(docs.kind_of?(Array) == false) docs = [docs] end if options[:silent] != true docs.each do |doc| puts formatter.format(doc) end end return docs end
process_options(args)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 211 def process_options(args) if(@options[:version]) puts Apollo::VERSION return 0 end if(@options[:show_help]) puts @optparser return 0 end if(@options[:generate_crawler]) name = @options[:generate_crawler] url = args.length > 0 ? args[0] : nil matcher = args.length > 1 ? args[1] : nil return self.generate_crawler(name, url, matcher) end if(@options[:list_formatters]) objs = Apollo::Formatter::BaseFormatter.subclasses CrawlerProgram.console_table(['name', 'self.class'], objs) return 0 end if(@options[:list_crawlers]) objs = Apollo::Crawler::BaseCrawler.subclasses CrawlerProgram.console_table(['name', 'self.class'], objs) return 0 end return nil end
request_exit(code = 0)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 482 def request_exit(code = 0) begin exit(0) rescue SystemExit => e # puts "rescued a SystemExit exception, reason: '#{e.to_s}'" end return code end
run(args = ARGV)
click to toggle source
Run Program
# File lib/apollo_crawler/program/crawler_program.rb, line 447 def run(args = ARGV) res_code = init_program(args) if res_code.nil? == false return request_exit(res_code) end if(@options[:verbose]) puts "Running environment '#{@options[:env]}'" end # Look for query if(@options[:query]) res_code = run_query(@options[:query], @options) return request_exit(res_code) end # Parse remaining arguments as crawlers crawler_names = get_crawlers(args) if(crawler_names.nil? || crawler_names.empty?) puts @optparser return request_exit(0) end # Get crawlers by their names crawlers = get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses) if(crawlers.nil? || crawlers.empty?) puts @optparser return request_exit(0) end res_code = run_crawlers(crawlers, args, @options) return request_exit(res_code) end
run_crawlers(crawlers, args, options = @options)
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 337 def run_crawlers(crawlers, args, options = @options) crawlers.each do |crawler| if(options[:verbose]) puts "Running '#{crawler}'" end opts = { :doc_limit => options[:doc_limit] } # Run crawlers instance = crawler.new if(args.nil? || args.empty?) args = instance.url end res = instance.etl(args, opts) do | docs | process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new) end end return 0 end
run_query(query, options = {})
click to toggle source
# File lib/apollo_crawler/program/crawler_program.rb, line 438 def run_query(query, options = {}) if(options[:verbose]) puts "Investigating query '#{query}'" end return 0 end