class Apollo::Crawler::BaseCrawler
Public Class Methods
create_metadoc(url, doc)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 172
  def self.create_metadoc(url, doc)
    body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})

    return {
      'url' => url,
      'doc' => body,
      'hash' => Digest::SHA256.new.update(body).hexdigest,
      'created_at' => Time.now.utc,
      'expires_at' => nil,
      'version' => 0
    }
  end
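For illustration, a sketch of the resulting hash; the input is assumed to be a hash carrying the raw page under a :body key, matching what fetch_document passes in from the fetcher:

  # Sketch only; assumes the apollo_crawler gem is loaded and that the
  # fetcher result is a hash with the raw page under :body.
  raw = { :body => "<html><body>Hello</body></html>" }
  metadoc = Apollo::Crawler::BaseCrawler.create_metadoc("http://example.com/", raw)

  metadoc['url']        # => "http://example.com/"
  metadoc['hash']       # => SHA-256 hex digest of the UTF-8 encoded body
  metadoc['version']    # => 0
  metadoc['expires_at'] # => nil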
fetch(url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 50
  def self.fetch(url)
    RbConfig::DEFAULT_FETCHER.fetch(url)
  end
name_re()
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 37
  def self.name_re()
    return /crawler$/
  end
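The pattern is anchored at the end of the string and is case-sensitive, for example:

  # Sketch; assumes the apollo_crawler gem is loaded.
  Apollo::Crawler::BaseCrawler.name_re =~ "google_crawler"  # => 7 (matches the trailing "crawler")
  Apollo::Crawler::BaseCrawler.name_re =~ "GoogleCrawler"   # => nil (case-sensitive)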
new()
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 32
  def initialize
    @backlog = []
    @visited = []
  end
try_get_doc(root, url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 62
  def self.try_get_doc(root, url)
    doc = BaseCrawler.try_get_url(root, url)
    # TODO: Set expiration header
    return {
      :doc => doc,
      :url => url
    }
  end
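Note that :doc here holds the joined URI (or nil), not a downloaded document, for example:

  # Sketch; assumes the apollo_crawler gem is loaded.
  Apollo::Crawler::BaseCrawler.try_get_doc("http://example.com/", "/about")
  # => { :doc => #<URI::HTTP http://example.com/about>, :url => "/about" }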
try_get_url(root, url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 54
  def self.try_get_url(root, url)
    begin
      return URI.join(root, url)
    rescue
      return nil
    end
  end
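For example, a relative path is resolved against the root, while an unparsable value makes URI.join raise and the method return nil:

  # Sketch; assumes the apollo_crawler gem is loaded.
  Apollo::Crawler::BaseCrawler.try_get_url("http://example.com/", "/about")
  # => #<URI::HTTP http://example.com/about>

  Apollo::Crawler::BaseCrawler.try_get_url("http://example.com/", "not a url")
  # => nil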
Public Instance Methods
enqueue_url(url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 130
  def enqueue_url(url)
    urls = []
    return urls if url.nil?

    # We support both - a list of URLs or a single URL
    if(url.kind_of?(Array))
      urls = urls.concat(url)
    else
      urls << url
    end

    urls.each do |u|
      if(url_processed?(u) == false)
        @backlog << u
      end
    end
  end
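A usage sketch (crawler stands for any BaseCrawler instance); URLs already in the backlog or already visited are skipped:

  # Sketch; 'crawler' is assumed to be an instance of a BaseCrawler subclass.
  crawler.enqueue_url("http://example.com/a")
  crawler.enqueue_url(["http://example.com/a",    # already queued, skipped
                       "http://example.com/b"])   # added to the backlog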
etl(url=nil, opts={}) { |doc| ... }
- (0) Figure out URL
- (1) Extract Data
- (2) Extract Links
- (3) Go to (0) eventually
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 76
  def etl(url=nil, opts={}, &block)
    # Look for the passed URL, fall back to the default, and fail if neither is valid
    if(url.nil? || url.empty?)
      url = self.url
    end

    # TODO: Be more aggressive, use assert, it is the client's responsibility!
    if(url.nil?)
      return nil
    end

    enqueue_url(url)

    # Counter of processed documents (pages)
    docs_processed = 0

    res = []
    # TODO: Respect limit of documents/urls processed
    while(@backlog.empty? == false)
      url = @backlog.shift
      # puts "Processing '#{url}'"
      doc = self.process_url(url)

      # Increase counter of processed documents
      docs_processed = docs_processed + 1

      @visited << url

      # Process document if it was successfully retrieved
      if(!doc.nil?)
        # TODO: Use log4r and log it only on info level
        if block_given?
          yield doc
        end

        # Add document to queue of results
        res << doc

        enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
      end

      # Break if the limit of documents to process was reached
      break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
    end

    # Return processed documents
    return res
  end
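A sketch of how a concrete crawler might drive etl; the class name, URL, and CSS selectors below are illustrative assumptions, not part of the gem:

  # Sketch only; requires the apollo_crawler gem and network access.
  require 'apollo_crawler'   # assumed entry point of the gem

  class ExampleCrawler < Apollo::Crawler::BaseCrawler
    def url
      "http://example.com/"
    end

    def extract_data(doc)
      doc.css('h1').map { |node| { :text => node.text } }
    end

    def extract_links(doc)
      doc.css('a[href^="http"]').map { |a| { :link => a['href'] } }
    end
  end

  # Crawl at most five documents, yielding each ETL result as it is produced
  results = ExampleCrawler.new.etl(nil, :doc_limit => 5) do |doc|
    puts doc[:data].inspect
  end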
extract_data(doc)
Extracts data from the document. The base implementation returns an empty array and is intended to be overridden by concrete crawlers.
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 228
  def extract_data(doc)
    res = []
    return res
  end
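A sketch of a possible override in a subclass; doc is the Nokogiri document returned by fetch_document, and the selector and keys are illustrative:

  # Illustrative override; selector and result keys are assumptions.
  def extract_data(doc)
    doc.xpath('//title').map { |node| { :title => node.text.strip } }
  end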
extract_links(doc)
Extracts links to other documents from this document. The base implementation returns an empty array and is intended to be overridden by concrete crawlers.
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 234
  def extract_links(doc)
    res = []
    return res
  end
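A sketch of a possible override; etl reads each entry's :link key when enqueuing follow-up URLs, so the returned hashes should provide it (the selector and filtering are illustrative):

  # Illustrative override; keeps only absolute http(s) links.
  def extract_links(doc)
    doc.css('a').map { |a| a['href'] }
       .compact
       .select { |href| href.start_with?('http') }
       .map { |href| { :link => href } }
  end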
fetch_document(url)
Fetches the document at the given URL (falling back to the crawler's default url), using the cache and retrying failed downloads up to three times before parsing the result with Nokogiri.
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 186
  def fetch_document(url)
    # TODO: Refactor the following idiom
    if(url == nil)
      url = self.url
    end

    if(url.nil?)
      return nil
    end

    url = url.to_s

    # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
    cache = Apollo::Cache::Factory.instance.construct
    metadoc = cache.try_get(url) do
      max_attempts = 3
      attempt_no = 0
      success = false

      doc = nil
      while(attempt_no < max_attempts && success == false) do
        begin
          doc = BaseCrawler.fetch(url)
          success = true
        rescue Exception => e
          puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
          sleep 1

          attempt_no = attempt_no + 1
          success = false
        end
      end

      # Create metadata
      BaseCrawler.create_metadoc(url, doc)
    end

    # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
    return Nokogiri::HTML(metadoc['doc'])
  end
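A usage sketch (crawler stands for any BaseCrawler instance); the return value is a parsed Nokogiri document:

  # Sketch; 'crawler' is assumed to be an instance of a BaseCrawler subclass.
  doc = crawler.fetch_document("http://example.com/")
  doc.css('title').text unless doc.nil?   # => page title, e.g. "Example Domain"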
name()
Name of the crawler
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 42
  def name
    return "Crawler Base"
  end
process_url(url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 147
  def process_url(url)
    doc = self.fetch_document(url)
    if(doc.nil?)
      return nil
    end

    # Try to extract data from the document
    data = self.extract_data(doc)

    # Try to extract links to other documents
    links = self.extract_links(doc)

    # TODO: Make it configurable whether links extracted from the doc should be printed
    # puts links.inspect

    # Format the ETL result
    res = {
      :crawler => self.class.name,
      :data => data,
      :links => links
    }

    return res
  end
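The result is a plain hash; a sketch of its shape (the values depend on the subclass's extractors):

  # Sketch; 'crawler' is assumed to be an instance of a BaseCrawler subclass.
  res = crawler.process_url("http://example.com/")
  # => {
  #      :crawler => "ExampleCrawler",   # class name of the crawler
  #      :data    => [...],              # whatever extract_data returned
  #      :links   => [...]               # whatever extract_links returned
  #    }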
url()
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 46
  def url
    return nil
  end
url_processed?(url)
  # File lib/apollo_crawler/crawler/base_crawler.rb, line 126
  def url_processed?(url)
    return @backlog.include?(url) || @visited.include?(url)
  end