class Xapian::Indexer::Spider

Represents a process that consumes resources into the database and follows links to related resources.

Attributes

resources[R]

Public Class Methods

new(database, generator, controller, options = {}) click to toggle source

database = Xapian::Database.new(ARGV)

# File lib/xapian/indexer/spider.rb, line 32
# Sets up a spider with no queued links and nothing marked as visited.
#
# database, generator and controller are stored as given for later use
# by the instance methods. options[:logger] supplies the logger; when it
# is missing (or nil/false) a Logger writing to $stdout is created.
def initialize(database, generator, controller, options = {})
        @database, @generator, @controller = database, generator, controller

        # Links already handled, and the queue of links still waiting.
        @touched = Set.new
        @links = []

        @logger = options[:logger]
        @logger ||= Logger.new($stdout)
end

Public Instance Methods

add(root) click to toggle source
# File lib/xapian/indexer/spider.rb, line 45
# Queues one or more root links for a later call to process.
#
# root may be a single String link or an Array of links. Any other value
# is rejected and reported through the logger's error channel.
def add(root)
        if root.is_a?(String)
                @links.push(root)
        elsif root.is_a?(Array)
                # Builds a new array rather than mutating the current one.
                @links = @links + root
        else
                @logger.error("Could not add roots #{root.inspect}!")
        end
end
process(options = {}, &block) click to toggle source
# File lib/xapian/indexer/spider.rb, line 106
# Works through the queued links breadth by breadth, indexing each
# resource that is not already archived and fresh, and queueing the links
# each fetch reports.
#
# options[:count] - optional; the call returns as soon as more than this
#                   many links have been visited. Links discovered so far
#                   are kept in the queue so a later call can resume.
# options[:depth] - optional; the call returns once more than this many
#                   breadths have been completed (0 means the current
#                   queue only).
# block           - optional; called with each discovered link. Its return
#                   value is queued instead of the link, and nil results
#                   are dropped. Without a block every discovered link is
#                   queued unchanged.
#
# Returns the number of links visited during this call. Links skipped
# because they were already visited are not counted.
def process(options = {}, &block)
        count = 0
        depth = 0

        until @links.empty?
                new_links = []

                @links.each do |link|
                        # Mark and sweep - never review the same resource twice.
                        # Set#add? returns nil when the link was already present.
                        next unless @touched.add?(link)

                        # Create a new fetch from the database...
                        fetch = Fetch.new(@database, @controller, link)
                        resource = fetch.current_resource
                        archived = fetch.archived_resource

                        if archived && archived.fresh?
                                @logger.info "Still fresh #{resource.name}..."
                        else
                                # Fetch the resource; a failure is logged and the crawl carries on.
                                begin
                                        @logger.info "Indexing #{resource.name}..."
                                        resource.fetch!
                                rescue => error
                                        @logger.error "Could not fetch resource #{resource.name}: #{error}!"
                                        (error.backtrace || []).each { |line| @logger.error(line) }
                                end

                                # Did we fetch a resource and was it indexable?
                                if resource.fetched?
                                        if resource.content?
                                                doc = Xapian::Document.new
                                                doc.data = @controller.save(resource)
                                                doc.add_term(resource.name_digest)

                                                @generator.document = doc
                                                @generator.index_text(resource.content)
                                                @database.replace_document(resource.name_digest, doc)
                                        else
                                                @logger.warn "Resource was not indexable #{resource.name}!"
                                        end
                                else
                                        @logger.warn "Could not fetch resource #{resource.name}!"
                                end
                        end

                        # Only filter through the block when one was supplied; mapping
                        # with a nil block would produce an Enumerator, not an Array.
                        found = fetch.links || []
                        found = block ? found.map(&block).compact : found.to_a
                        new_links.concat(found)

                        count += 1

                        if options[:count] && count > options[:count]
                                # Leaving before this breadth is finished: keep what was found.
                                # This rebinds @links, so the array being iterated is untouched.
                                @links += new_links
                                return count
                        end
                end

                @links = new_links

                depth += 1

                return count if options[:depth] && depth > options[:depth]
        end

        # Report the tally on normal completion too, as the limit exits do.
        count
end
remove_old!() click to toggle source
# File lib/xapian/indexer/spider.rb, line 172
# Deletes index entries whose stored resource is no longer fresh.
#
# Iterates the database postlist for the empty term (presumably this
# enumerates every document - confirm against the Xapian bindings),
# rebuilds each resource from the document data through the controller,
# and removes the document when the resource reports it is not fresh.
def remove_old!
        @database.postlist("").each do |entry|
                docid = entry.docid
                stored = @controller.recreate(@database.document(docid).data)

                # Fresh resources stay in the index.
                next if stored.fresh?

                @logger.info("Removing expired index for #{stored.name}.")
                @database.delete_document(docid)
        end
end