class Xapian::Indexer::Extractors::HTML
Extracts indexable metadata (description, title, keywords, links and body text) from an HTML resource that will be indexed.
Constants
- NBSP
- WHITESPACE
Public Class Methods
new(options = {})
click to toggle source
# File lib/xapian/indexer/extractors/html.rb, line 31
# Keeps the given options hash and selects the logger used by this extractor.
#
# options - Hash of settings; a truthy options[:logger] is used as the logger,
#           otherwise a new Logger writing to $stderr is created.
def initialize(options = {})
  @options = options

  @logger = options[:logger]
  @logger ||= Logger.new($stderr)
end
Public Instance Methods
call(resource, status, headers, data)
click to toggle source
# File lib/xapian/indexer/extractors/html.rb, line 37
# Extracts indexable metadata from an HTML document.
#
# resource - object whose #name is the document's URI string; it is the base
#            for relative links unless the document supplies a <base href>.
# status   - accepted but not used by this method.
# headers  - accepted but not used by this method.
# data     - the HTML markup, handed to Nokogiri::HTML.parse.
#
# Returns a Hash. :links (unique URI strings with fragments removed) is always
# present; :description, :title, :keywords and :content are set only when the
# corresponding markup is found.
def call(resource, status, headers, data)
  html = Nokogiri::HTML.parse(data)
  result = {}

  # Extract description
  meta_description = html.css("meta[name='description']").first
  if meta_description
    result[:description] = meta_description['content']
  else
    # Use the first paragraph as a description
    first_paragraph = html.search("p").first
    if first_paragraph
      result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
    end
  end

  # A <base> tag may lack href (e.g. only a target attribute); in that case
  # the document's own URI is the base.
  base_tag = html.at('html/head/base')
  base_href = base_tag && base_tag['href']
  base = URI.parse(base_href || resource.name)

  # A relative <base href> is itself resolved against the document URI,
  # otherwise every relative link below would fail to merge.
  base = URI.parse(resource.name) + base if base_href && base.relative?

  links = []
  html.css('a').each do |link|
    href = (link['href'] || "").to_s.gsub(/ /, '%20')

    begin
      target = base + href
      # Remove any fragment at the end of the URI.
      target.fragment = nil
      links << target.to_s
    rescue URI::Error => error
      # A single malformed href must not abort extraction of the whole page.
      @logger.warn("Could not add link #{href}: #{error}") if @logger
    end
  end

  result[:links] = links.uniq

  # Extract title, preferring <title> over the first <h1>
  heading = html.at('html/head/title') || html.search('h1').first
  result[:title] = heading.inner_text.gsub(WHITESPACE, " ") if heading

  # Extract keywords. "keywords" is the standard metadata name; the singular
  # "keyword" is still accepted for backward compatibility.
  meta_keywords = html.css("meta[name='keywords'], meta[name='keyword']").first
  keywords = meta_keywords && meta_keywords['content']
  result[:keywords] = keywords.gsub(WHITESPACE, " ") if keywords

  # Remove junk elements from the html
  ['script', 'link', 'meta', 'style', 'form', '.noindex'].each do |selector|
    html.css(selector).remove
  end

  body = html.at('html/body')
  if body
    # We also convert NBSP characters to inner space.
    result[:content] = body.inner_text.gsub(WHITESPACE, " ")
  end

  result
end