class Tychus::Parsers::SchemaOrgParser

Attributes

review_doc[R]
root_doc[R]
video_object_doc[R]

Public Class Methods

new(uri) click to toggle source
Calls superclass method Tychus::Parsers::Base::new
# File lib/tychus/parsers/schema_org_parser.rb, line 8
# Sets up the CSS selectors for the Schema.org microdata scopes this parser
# cares about, hands +uri+ to the superclass constructor, and then removes
# the Review and VideoObject scopes from the recipe document.
#
# uri - passed through unchanged to the superclass (bare +super+ forwards
#       the method's own arguments).
#
# NOTE(review): the selectors are assigned before +super+ runs; presumably
# the base initializer reads +root_doc+ to locate the recipe node - confirm
# before reordering these statements.
def initialize(uri)
  # Attribute selectors matching elements by their microdata +itemtype+.
  @root_doc = '[itemtype="http://schema.org/Recipe"]'
  @review_doc = '[itemtype="http://schema.org/Review"]'
  # NOTE(review): this selector uses the "www.schema.org" host while the two
  # above use "schema.org" - confirm whether the difference is intentional.
  @video_object_doc = '[itemtype="http://www.schema.org/VideoObject"]'
  super
  # Remove nested scopes, presumably so their itemprops (name, description,
  # ...) are not mistaken for the recipe's own - TODO confirm.
  strip_review_microformat
  strip_video_object_microformat
end

Public Instance Methods

itemprop_node_for(property) click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 17
# Finds the first node in the recipe document whose +itemprop+ attribute
# equals +property+.
#
# property - itemprop name (Symbol or String); interpolated into the CSS
#            attribute selector as-is.
#
# Returns the first matching node, or a fresh NullObject when the query
# yields nothing, so callers can chain calls without a nil check.
def itemprop_node_for(property)
  selector = "[itemprop='#{property}']"
  matches = recipe_doc.css(selector)
  matches.first || NullObject.new
end
parse_author() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 21
# Reads the recipe author.
#
# Returns the +content+ of the node located for the +author+ itemprop.
def parse_author
  author_node = itemprop_node_for(:author)
  author_node.content
end
parse_cook_time() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 30
# Reads the recipe's cook time.
#
# Returns whatever parse_duration extracts from the +cookTime+ itemprop
# node (the markup is expected to carry an ISO 8601 duration).
def parse_cook_time
  cook_time_node = itemprop_node_for(:cookTime)
  parse_duration(cook_time_node)
end
parse_description() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 25
# Reads the recipe description.
#
# Only the first +description+ itemprop node is consulted (that is what
# itemprop_node_for hands back); whether the first one is always the right
# one is an open question carried over from the original author.
#
# Returns the node's +content+.
def parse_description
  description_node = itemprop_node_for(:description)
  description_node.content
end
parse_duration(node) click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 35
# Pulls a duration value out of +node+, choosing the attribute by tag name.
#
# Sites mark durations up differently:
#   * Allrecipes uses a <time> element  -> value lives in +datetime+
#   * Foodnetwork uses a <meta> element -> value lives in +content+
#     (the standard form according to Schema.org/Recipe)
# <span> elements are treated like <meta>.
#
# node - object responding to +name+ and +attr+.
#
# Returns the attribute value, or a new NullObject for any other tag name.
def parse_duration(node)
  duration_attribute =
    case node.name
    when "time" then 'datetime'
    when "meta", "span" then 'content'
    end

  return NullObject.new unless duration_attribute

  node.attr(duration_attribute)
end
parse_image() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 49
# Reads the recipe image location.
#
# Returns the +src+ attribute of the node located for the +image+ itemprop.
def parse_image
  image_node = itemprop_node_for(:image)
  image_node.attr('src')
end
parse_ingredients() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 53
# Collects the text of every +ingredients+ itemprop node in the recipe
# document (all matches, not just the first one).
#
# For each node the text has runs of spaces collapsed, surrounding
# whitespace trimmed, and is then split on CRLF, so a single node may
# contribute several entries. The pieces produced by the split are not
# trimmed again.
#
# Returns a flat Array of Strings.
def parse_ingredients
  ingredient_nodes = recipe_doc.css('[itemprop="ingredients"]')

  ingredient_nodes.flat_map do |ingredient|
    ingredient.content.squeeze(" ").strip.split("\r\n")
  end
end
parse_name() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 66
# Reads the recipe name.
#
# Returns the +content+ of the node located for the +name+ itemprop.
def parse_name
  name_node = itemprop_node_for(:name)
  name_node.content
end
parse_prep_time() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 70
# Reads the recipe's preparation time.
#
# Returns whatever parse_duration extracts from the +prepTime+ itemprop
# node.
def parse_prep_time
  prep_time_node = itemprop_node_for(:prepTime)
  parse_duration(prep_time_node)
end
parse_recipe_instructions() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 74
# Extracts the individual instruction steps from the +recipeInstructions+
# itemprop node.
#
# Markup differs per site:
#   Allrecipes:  <li><span>lorem ipsum</span></li>
#   FoodNetwork: <p>lorem ipsum</p>
# Child elements that are headers (e.g. "Directions") or divs (e.g.
# .categories on Foodnetwork recipes) are skipped.
#
# For each remaining child the text has runs of spaces collapsed and
# trailing whitespace removed, then is split on the sequence
# CRLF, space, CRLF, space (inside a double-quoted Ruby string "\s" is a
# literal space). Blank pieces are discarded.
#
# Returns an Array of step Strings.
def parse_recipe_instructions
  skipped_tags = /^(h.|div)$/

  instruction_root = itemprop_node_for(:recipeInstructions)
  step_nodes = instruction_root.element_children.reject do |child|
    child.name =~ skipped_tags
  end

  fragments = step_nodes.map do |step|
    step.content.squeeze(" ").rstrip.split("\r\n\s\r\n\s")
  end

  fragments.flatten.reject { |fragment| fragment.blank? }
end
parse_recipe_yield() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 93
# Reads the recipe yield (e.g. number of servings).
#
# Returns the +content+ of the node located for the +recipeYield+ itemprop.
def parse_recipe_yield
  yield_node = itemprop_node_for(:recipeYield)
  yield_node.content
end
parse_total_time() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 97
# Reads the recipe's total time.
#
# Returns whatever parse_duration extracts from the +totalTime+ itemprop
# node (the markup is expected to carry an ISO 8601 duration).
def parse_total_time
  total_time_node = itemprop_node_for(:totalTime)
  parse_duration(total_time_node)
end
strip_review_microformat() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 102
# Removes every node matching the +review_doc+ selector from the recipe
# document.
#
# Returns whatever +remove+ returns for the matched set.
def strip_review_microformat
  review_nodes = recipe_doc.css(review_doc)
  review_nodes.remove
end
strip_video_object_microformat() click to toggle source
# File lib/tychus/parsers/schema_org_parser.rb, line 106
# Removes every node matching the +video_object_doc+ selector from the
# recipe document.
#
# Returns whatever +remove+ returns for the matched set.
def strip_video_object_microformat
  video_nodes = recipe_doc.css(video_object_doc)
  video_nodes.remove
end