class SemanticCrawler::Websites::MicroData

Extract microdata from a website and output it as JSON

Attributes

microdata[RW]
url[RW]

Public Class Methods

new(url) click to toggle source
# File lib/semantic_crawler/websites/micro_data.rb, line 12
# Fetches the document at +url+, extracts its microdata items and stores the
# converted result in +microdata+. The source location is kept in +url+.
#
# url:: location passed to +open+; fetching an HTTP(S) address this way
#       presumably relies on open-uri being loaded elsewhere -- TODO confirm.
#
# Raises RuntimeError ("Not implemented!") when extract_microdata cannot
# handle the extracted items (for example when there are none).
def initialize(url)
  # Remember where the data came from; the accessor was otherwise never set.
  self.url = url

  # NOTE(review): Kernel#open treats a leading "|" as a command to run, so
  # +url+ must not come from untrusted input. Consider URI.open where the
  # supported Ruby versions allow it.
  # The block form closes the underlying handle once parsing is done.
  doc = open(url) { |page| Nokogiri::HTML(page) }

  items = Microdata::Document.new(doc.to_s).extract_items
  self.microdata = extract_microdata(items)
end

Public Instance Methods

to_json() click to toggle source
# File lib/semantic_crawler/websites/micro_data.rb, line 19
# Serializes the extracted microdata as a JSON string.
#
# Any arguments are forwarded to +microdata.to_json+. JSON generators call
# +to_json+ with a state argument, so a zero-arity definition would raise
# ArgumentError when this object is serialized as part of a larger structure.
#
# Returns whatever +microdata.to_json+ returns.
def to_json(*args)
  microdata.to_json(*args)
end
to_s() click to toggle source
# File lib/semantic_crawler/websites/micro_data.rb, line 23
# Returns the stored +microdata+ value unchanged.
#
# NOTE(review): this hands back the microdata object itself rather than a
# String, which is unusual for +to_s+ -- confirm whether callers rely on it.
def to_s
  self.microdata
end

Private Instance Methods

extract_microdata(items) click to toggle source
# File lib/semantic_crawler/websites/micro_data.rb, line 28
# Recursively converts extracted microdata into plain Ruby structures.
#
# items:: a non-empty Array. If its first element is a String the array is
#         treated as a list of leaf property values and returned as-is.
#         Otherwise every element is expected to respond to +type+ (whose
#         first entry is used as the key) and +properties+ (key/value pairs
#         whose values are themselves converted recursively).
#
# Returns the original Array for leaf values, or a Hash that maps each item
# type to an Array of property hashes (property names as Strings).
#
# Raises RuntimeError ("Not implemented!") when +items+ is not an Array or
# has no truthy first element (this includes an empty Array).
def extract_microdata(items)
  raise "Not implemented!" unless items.is_a?(Array) && items.first

  # Only the first element is inspected to decide between leaves and items.
  return items if items.first.is_a?(String)

  items.each_with_object({}) do |item, result|
    properties = {}
    item.properties.each do |key, value|
      properties[key.to_s] = extract_microdata(value)
    end

    # Create the per-type list here rather than inside the property loop, so
    # an item without any properties is still recorded instead of failing on
    # a nil entry.
    (result[item.type.first] ||= []) << properties
  end
end