class Xapian::Indexer::Spider
Represents a process that indexes resources into the database and follows links to related resources.
Attributes
resources[R]
Public Class Methods
new(database, generator, controller, options = {})
click to toggle source
database = Xapian::Database.new(ARGV)
# File lib/xapian/indexer/spider.rb, line 32
#
# Stores the collaborators handed in by the caller and starts with an empty
# link queue and an empty set of already-visited links.
#
# options[:logger] supplies the logger; when it is absent (or nil/false) a
# Logger writing to $stdout is created instead.
def initialize(database, generator, controller, options = {})
  @database, @generator, @controller = database, generator, controller

  # Links still waiting to be processed.
  @links = []
  # Links that have already been visited.
  @touched = Set.new

  @logger = options[:logger] || Logger.new($stdout)
end
Public Instance Methods
add(root)
click to toggle source
# File lib/xapian/indexer/spider.rb, line 45
#
# Queues one or more starting links.
#
# A String is appended to the queue as a single link; an Array has all of its
# elements appended. Any other kind of argument is not queued and is reported
# through the logger as an error.
def add(root)
  case root
  when String then @links << root
  when Array  then @links += root
  else @logger.error "Could not add roots #{root.inspect}!"
  end
end
process(options = {}, &block)
click to toggle source
# File lib/xapian/indexer/spider.rb, line 106
#
# Works through the queued links one breadth at a time: every link in the
# current queue is visited, and the links reported for those resources form
# the queue for the next breadth. A link is never visited twice.
#
# For each link a Fetch is created. When the fetch has an archived resource
# that is still fresh, nothing is re-indexed; otherwise the resource is
# fetched and, if it has content, written to the database.
#
# The optional block is called with every link reported by a fetch and returns
# the link to follow, or nil to drop it. Without a block every reported link is
# followed.
#
# options[:count] - return as soon as MORE than this many links have been
#                   visited; links discovered so far are appended to the queue
#                   before returning.
# options[:depth] - return as soon as MORE than this many breadths have been
#                   completed.
#
# Returns the number of links visited (also when the queue simply runs dry).
def process(options = {}, &block)
  count = 0
  depth = 0

  until @links.empty?
    new_links = []

    @links.each do |link|
      # Mark and sweep - don't review the same resource twice!
      next unless @touched.add?(link)

      # Create a new fetch from the database...
      fetch = Fetch.new(@database, @controller, link)
      resource = fetch.current_resource

      # Does it already exist in the current database (and fresh?)
      if fetch.archived_resource && fetch.archived_resource.fresh?
        @logger.info "Still fresh #{resource.name}..."
      else
        fetch_and_index(resource)
      end

      # Only filter through the block when one was given; `map(&nil)` would
      # return an Enumerator, which has no #compact before Ruby 3.1.
      discovered = fetch.links || []
      discovered = block ? discovered.map(&block) : discovered.to_a
      new_links += discovered.compact

      count += 1

      if options[:count] && count > options[:count]
        # If we have to leave before finishing this breadth...
        @links += new_links

        return count
      end
    end

    @links = new_links
    depth += 1

    return count if options[:depth] && depth > options[:depth]
  end

  count
end

# Fetches the resource and, when that succeeded and the resource has content,
# stores it in the database under its name digest. Fetch errors are logged
# (message plus backtrace) rather than propagated.
def fetch_and_index(resource)
  begin
    @logger.info "Indexing #{resource.name}..."
    resource.fetch!
  rescue => error
    @logger.error "Could not fetch resource #{resource.name}: #{error}!"
    Array(error.backtrace).each { |line| @logger.error(line) }
  end

  # Did we fetch a resource and was it indexable?
  unless resource.fetched?
    @logger.warn "Could not fetch resource #{resource.name}!"
    return
  end

  unless resource.content?
    @logger.warn "Resource was not indexable #{resource.name}!"
    return
  end

  doc = Xapian::Document.new
  doc.data = @controller.save(resource)
  doc.add_term(resource.name_digest)

  @generator.document = doc
  @generator.index_text(resource.content)

  @database.replace_document(resource.name_digest, doc)
end
private :fetch_and_index
remove_old!()
click to toggle source
# File lib/xapian/indexer/spider.rb, line 172
#
# Walks the postings returned for the empty term and, for each one, rebuilds
# the stored resource from the document's data via the controller. Documents
# whose resource is no longer fresh are logged and deleted from the database;
# fresh ones are left alone.
def remove_old!
  @database.postlist("").each do |posting|
    stored = @database.document(posting.docid)
    revived = @controller.recreate(stored.data)

    # Nothing to do while the indexed copy is still current.
    next if revived.fresh?

    @logger.info "Removing expired index for #{revived.name}."
    @database.delete_document(posting.docid)
  end
end