module Buzzsaw::DSL

Constants

ENCODING_EXCEPTION

Public Instance Methods

asciify_target_text(target) click to toggle source
# File lib/buzzsaw/dsl.rb, line 240
# Replaces every U+00E2 character in +target+ with a double quote, then
# transliterates the result with String#to_ascii.
#
# The character is compared directly rather than by searching its
# String#dump output for "u{e2}": the dump escape format differs between
# Ruby versions ("\u{e2}" vs "\u00E2"), so the substring test silently
# stops matching on newer interpreters.
#
# target - the String to clean up; a nil/false target yields nil.
#
# Returns whatever #to_ascii returns for the rewritten string.
def asciify_target_text(target)
  return unless target
  rewritten = target.each_char.map { |chr| chr == "\u00E2" ? '"' : chr }.join
  rewritten.to_ascii
end
capture_target_text(text, pattern) click to toggle source
# File lib/buzzsaw/dsl.rb, line 200
# Narrows +text+ down to the portion a rule is interested in.
#
# text    - the String to process; a nil/false text yields nil.
# pattern - value handed to String#[] (for example a Regexp). When it is
#           nil/false, every run of whitespace in +text+ is collapsed to a
#           single space instead.
#
# Returns the captured String (or nil when String#[] finds nothing).
def capture_target_text(text, pattern)
  return unless text
  return text[pattern] if pattern

  text.gsub(/\s+/, " ")
end
collect_by_xpath(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 19
# Gathers the text of all nodes selected by args[:xpath] into one value.
#
# args - Hash of options (keys are symbolized in place). A truthy :pattern
#        is copied into both :capture and :match before the lookup.
#
# Returns args[:label] when a label is given and text was found; otherwise
# the collected text passed through #asciify_target_text.
def collect_by_xpath(args)
  args.symbolize_keys!
  if args[:pattern]
    pattern = args[:pattern]
    args[:capture] = pattern
    args[:match] = pattern
  end

  target = collect_target_text(args, get_nodes(args))
  if args[:label] && target.present?
    args[:label]
  else
    asciify_target_text(target)
  end
end
collect_target_text(args, nodes) click to toggle source
# File lib/buzzsaw/dsl.rb, line 182
# Filters +nodes+ by args[:match], joins the survivors with args[:join] and
# applies the args[:capture] pattern to the joined string.
#
# args  - Hash with optional :match, :join and :capture entries.
# nodes - collection of node texts; it is filtered in place.
#
# Returns the captured text, or nil when ENCODING_EXCEPTION is raised along
# the way.
def collect_target_text(args, nodes)
  begin
    match_target_text!(nodes, args[:match])
    joined = join_target_text(nodes, args[:join])
    capture_target_text(joined, args[:capture])
  rescue ENCODING_EXCEPTION
    nil
  end
end
filter_target_text(target, filter_list) click to toggle source
# File lib/buzzsaw/dsl.rb, line 139
# Runs +target+ through each entry of +filter_list+ in order and returns the
# stripped result. Filtering stops having any effect as soon as the
# intermediate value is no longer present.
#
# target      - the String to filter. It is never modified in place.
# filter_list - Enumerable of filters. Each filter is either
#               * a String naming a public method of this object, which is
#                 called with the current text and whose result replaces it, or
#               * a Hash (keys are symbolized in place) using the first of
#                 :accept  - keep only target[value]
#                 :reject  - remove what String#slice! would remove for value
#                 :prefix  - prepend value
#                 :postfix - append value
#
# NOTE(review): a String filter that does not name a public method falls
# through to filter[:accept], i.e. String#[] with a Symbol, which raises.
#
# Returns the filtered String with surrounding whitespace stripped, or nil.
def filter_target_text(target, filter_list)
  filter_list.each do |filter|
    next unless target.present?
    filter.symbolize_keys! if filter.is_a?(Hash)
    if filter.is_a?(String) && respond_to?(filter)
      target = send(filter, target)
    elsif filter[:accept]
      target = target[filter[:accept]]
    elsif filter[:reject]
      # slice! edits its receiver; operate on a copy so the caller's string
      # is left untouched and frozen input does not raise.
      target = target.dup
      target.slice!(filter[:reject])
    elsif filter[:prefix]
      target = "#{filter[:prefix]}#{target}"
    elsif filter[:postfix]
      target = "#{target}#{filter[:postfix]}"
    end
  end
  target.try(:strip)
end
Also aliased as: filters
filters(target, filter_list)
Alias for: filter_target_text
find_by_meta_tag(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 70
# Reads the content of a <meta> tag selected through
# #get_nodes_for_meta_attribute and #get_content_for_meta_nodes.
#
# args - Hash of options (keys are symbolized in place). :match is accepted
#        as a legacy spelling of :pattern. When a pattern is present only
#        the part of the content selected by String#[] is kept.
#
# Returns nil when no content is found, args[:label] when a label is given
# and the (possibly narrowed) content is present, otherwise the content.
def find_by_meta_tag(args)
  args.symbolize_keys!
  args[:pattern] ||= args[:match] # Backwards compatibility

  content = get_content_for_meta_nodes(get_nodes_for_meta_attribute(args))
  return unless content

  pattern = args[:pattern]
  content = content[pattern] if pattern

  label = args[:label]
  if label && content.present?
    label
  else
    content
  end
end
Also aliased as: label_by_meta_tag
find_by_schema_tag(value) click to toggle source
# File lib/buzzsaw/dsl.rb, line 82
# Looks for an element whose itemprop attribute equals +value+ in upper
# case, lower case or capitalized form (tried in that order).
#
# value - the itemprop name as a String.
#
# Returns the text of the first element found, stripped and with whitespace
# runs collapsed to one space, or nil when nothing is found or the text is
# not present.
def find_by_schema_tag(value)
  variants = [value.upcase, value.downcase, value.capitalize]
  hits = variants.map { |name| doc.at_xpath("//*[@itemprop=\"#{name}\"]") }.compact
  return if hits.empty?

  text = hits.first.text.strip.gsub(/\s+/, " ")
  text.present? ? text : nil
end
find_by_xpath(args) click to toggle source

Main DSL methods

# File lib/buzzsaw/dsl.rb, line 9
# Finds the first matching node text for args[:xpath].
#
# args - Hash of options (keys are symbolized in place). A truthy :pattern
#        is copied into both :capture and :match before the lookup.
#
# Returns args[:label] when a label is given and text was found; otherwise
# the found text passed through #asciify_target_text.
def find_by_xpath(args)
  args.symbolize_keys!
  if args[:pattern]
    pattern = args[:pattern]
    args[:capture] = pattern
    args[:match] = pattern
  end

  target = find_target_text(args, get_nodes(args))
  if args[:label] && target.present?
    args[:label]
  else
    asciify_target_text(target)
  end
end
find_in_table(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 29
# Extracts text from a table located by args[:xpath].
#
# args - Hash of options (keys are symbolized in place):
#        :xpath   - XPath of the table element.
#        :row     - an Integer is used as a row index; anything else is
#                   used as a matcher against each row's text.
#        :column  - same convention as :row, applied to the cells of the
#                   selected row. When omitted the whole row text is used.
#        :capture - optional pattern applied to the result with String#[].
#
# Returns the selected text, or nil when the table, row or column cannot
# be found.
def find_in_table(args)
  args.symbolize_keys!

  xpath   = args[:xpath]
  capture = args[:capture]

  # Integer, not Fixnum: Fixnum was deprecated in Ruby 2.4 and removed in
  # 3.2, where referencing it raises NameError.
  row_index, match_row = args[:row].is_a?(Integer) ? [args[:row], nil] : [nil, args[:row]]
  column_index, match_column = args[:column].is_a?(Integer) ? [args[:column], nil] : [nil, args[:column]]

  return unless table = doc.at_xpath(xpath)

  # Rows match first
  return unless row = match_table_element(table, "tr", match_row, row_index)
  unless match_column || column_index
    return capture ? row.text[capture] : row.text
  end

  # Now columns
  return unless col = match_table_element(row, "td", match_column, column_index)

  capture ? col.text[capture] : col.text
end
find_target_text(args, nodes) click to toggle source
# File lib/buzzsaw/dsl.rb, line 171
# Filters +nodes+ by args[:match], takes the first survivor (stripped) and
# applies the args[:capture] pattern to it.
#
# args  - Hash with optional :match and :capture entries.
# nodes - collection of node texts; it is filtered in place.
#
# Returns the captured text, or nil when ENCODING_EXCEPTION is raised along
# the way.
def find_target_text(args, nodes)
  begin
    match_target_text!(nodes, args[:match])
    first_match = nodes.first.try(:strip)
    capture_target_text(first_match, args[:capture])
  rescue ENCODING_EXCEPTION
    nil
  end
end
get_content_for_meta_nodes(nodes) click to toggle source
# File lib/buzzsaw/dsl.rb, line 231
# Pulls the "content" attribute value out of a list of meta nodes.
#
# nodes - collection of nodes responding to #attribute, or nil.
#
# Returns the first available content value, stripped and with runs of
# spaces squeezed to one, or nil when there are no nodes, no node has a
# content attribute, or the value is not present.
def get_content_for_meta_nodes(nodes)
  return if !nodes || nodes.none?

  content_attrs = nodes.map { |meta| meta.attribute("content") }.compact
  return if content_attrs.empty?

  text = content_attrs.first.value.strip.squeeze(" ")
  text.present? ? text : nil
end
get_nodes(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 216
# Evaluates args[:xpath] against the document and collects node texts.
#
# args - Hash containing the :xpath expression.
#
# Returns an Array of each selected node's #text, with nils removed.
def get_nodes(args)
  doc.xpath(args[:xpath]).map { |node| node.text }.compact
end
get_nodes_for_meta_attribute(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 221
# Finds <meta> elements under <head> whose args[:attribute] equals
# args[:value] in upper case, lower case or capitalized form.
#
# args - Hash with :attribute (attribute name) and :value (String).
#
# Returns an Array with the first hit for each casing that matched, in the
# order upcase, downcase, capitalize, or nil when nothing matched.
def get_nodes_for_meta_attribute(args)
  attribute = args[:attribute]
  found = []
  [:upcase, :downcase, :capitalize].each do |casing|
    value = args[:value].send(casing)
    node = doc.at_xpath("//head/meta[@#{attribute}=\"#{value}\"]")
    found << node unless node.nil?
  end
  found.empty? ? nil : found
end
join_target_text(nodes, delimiter) click to toggle source
# File lib/buzzsaw/dsl.rb, line 205
# Concatenates the entries of +nodes+ into one string.
#
# nodes     - collection of node texts; nil or empty yields nil.
# delimiter - separator placed between entries; converted with #to_s, so
#             nil means no separator.
#
# Returns the combined String, or the sole element when there is only one.
def join_target_text(nodes, delimiter)
  return unless nodes.present?

  separator = delimiter.to_s
  nodes.reduce do |joined, node|
    joined.to_s + separator + node.to_s
  end
end
label_by_meta_keywords(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 126
# Labels the page when its meta keywords match args[:pattern].
#
# args - Hash of options (keys are symbolized in place):
#        :pattern - value handed to String#[] on the keywords text.
#        :label   - value to return on a match.
#
# meta_keywords is evaluated once and reused, so the document is not
# queried a second time just to apply the pattern.
#
# Returns args[:label] on a match, otherwise nil.
def label_by_meta_keywords(args)
  args.symbolize_keys!
  keywords = meta_keywords
  return args[:label] if keywords && keywords[args[:pattern]]
end
label_by_meta_tag(args)
Alias for: find_by_meta_tag
label_by_url(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 93
# Labels the page when the string form of +url+ matches args[:pattern].
#
# args - Hash of options (keys are symbolized in place):
#        :pattern - value handed to String#[] on the URL text.
#        :label   - value to return on a match.
#
# Returns args[:label] on a match, otherwise nil.
def label_by_url(args)
  args.symbolize_keys!
  matched = url.to_s[args[:pattern]]
  matched ? args[:label] : nil
end
match_table_element(table, element, match, index) click to toggle source

Private

# File lib/buzzsaw/dsl.rb, line 164
# Selects a descendant +element+ of +table+ by text match or by position.
#
# table   - node responding to #xpath.
# element - tag name to look for (interpolated into ".//<element>").
# match   - optional matcher applied to each candidate's text with
#           String#[]; the first candidate that matches is returned.
# index   - optional position, interpolated into an XPath positional
#           predicate; used when +match+ is absent or found nothing.
#
# Returns the matched element, the result of the positional XPath query,
# or nil when neither selector yields anything.
def match_table_element(table, element, match, index)
  if match
    by_text = table.xpath(".//#{element}").detect { |cell| cell.text && cell.text[match] }
    return by_text if by_text
  end
  return table.xpath(".//#{element}[#{index}]") if index

  nil
end
match_target_text!(nodes, pattern) click to toggle source
# File lib/buzzsaw/dsl.rb, line 193
# Filters +nodes+ in place, keeping only the useful entries.
#
# nodes   - collection of node texts; nothing happens when it is not present.
# pattern - optional matcher. With a pattern an entry is kept when
#           entry[pattern] is present; without one, when the entry itself
#           is present.
#
# Returns the result of select! (nil when nothing was removed).
def match_target_text!(nodes, pattern)
  return unless nodes.present?

  nodes.select! { |text| (pattern ? text[pattern] : text).present? }
end
meta_description() click to toggle source
# File lib/buzzsaw/dsl.rb, line 117
# Shortcut for meta_name with the value 'description'.
def meta_description
  meta_name(value: 'description')
end
meta_image() click to toggle source
# File lib/buzzsaw/dsl.rb, line 118
# Shortcut for meta_name with the value 'image'.
def meta_image
  meta_name(value: 'image')
end
meta_keywords() click to toggle source
# File lib/buzzsaw/dsl.rb, line 116
# Shortcut for meta_name with the value 'keywords'.
def meta_keywords
  meta_name(value: 'keywords')
end
meta_name(args) click to toggle source
# File lib/buzzsaw/dsl.rb, line 107
# Looks up a meta tag by its "name" attribute.
#
# args - Hash of options for find_by_meta_tag (keys are symbolized in
#        place); its :attribute entry is overwritten with 'name'.
#
# Returns whatever find_by_meta_tag returns.
def meta_name(args)
  args.symbolize_keys!
  args[:attribute] = 'name'
  find_by_meta_tag(args)
end
meta_og(value) click to toggle source
# File lib/buzzsaw/dsl.rb, line 113
# Looks up a meta tag through meta_property, prefixing +value+ with "og:".
#
# value - the name to append after the "og:" prefix.
def meta_og(value)
  og_args = { value: "og:#{value}" }
  meta_property(og_args)
end
meta_og_description() click to toggle source
# File lib/buzzsaw/dsl.rb, line 123
# Shortcut for meta_og('description').
def meta_og_description
  meta_og('description')
end
meta_og_image() click to toggle source
# File lib/buzzsaw/dsl.rb, line 124
# Shortcut for meta_og('image').
def meta_og_image
  meta_og('image')
end
meta_og_keywords() click to toggle source
# File lib/buzzsaw/dsl.rb, line 122
# Shortcut for meta_og('keywords').
def meta_og_keywords
  meta_og('keywords')
end
meta_og_title() click to toggle source
# File lib/buzzsaw/dsl.rb, line 121
# Shortcut for meta_og('title').
def meta_og_title
  meta_og('title')
end
meta_price() click to toggle source
# File lib/buzzsaw/dsl.rb, line 119
# Shortcut for meta_name with the value 'price'.
def meta_price
  meta_name(value: 'price')
end
meta_property(args) click to toggle source

Meta tag convenience methods

# File lib/buzzsaw/dsl.rb, line 101
# Looks up a meta tag by its "property" attribute.
#
# args - Hash of options for find_by_meta_tag (keys are symbolized in
#        place); its :attribute entry is overwritten with 'property'.
#
# Returns whatever find_by_meta_tag returns.
def meta_property(args)
  args.symbolize_keys!
  args[:attribute] = 'property'
  find_by_meta_tag(args)
end
meta_title() click to toggle source
# File lib/buzzsaw/dsl.rb, line 115
# Shortcut for meta_name with the value 'title'.
def meta_title
  meta_name(value: 'title')
end
sanitize(text) click to toggle source
# File lib/buzzsaw/dsl.rb, line 211
# Cleans +text+ with Sanitize.clean using an empty element list, then
# decodes HTML entities in the result.
#
# text - the markup String to clean.
#
# Returns the decoded String, or nil when Sanitize.clean returns nothing.
def sanitize(text)
  cleaned = Sanitize.clean(text, elements: [])
  return unless cleaned

  HTMLEntities.new.decode(cleaned)
end
schema_description() click to toggle source
# File lib/buzzsaw/dsl.rb, line 137
# Shortcut for find_by_schema_tag("description").
def schema_description
  find_by_schema_tag("description")
end
schema_name() click to toggle source
# File lib/buzzsaw/dsl.rb, line 136
# Shortcut for find_by_schema_tag("name").
def schema_name
  find_by_schema_tag("name")
end
schema_price() click to toggle source

Schema.org convenience methods

# File lib/buzzsaw/dsl.rb, line 135
# Shortcut for find_by_schema_tag("price").
def schema_price
  find_by_schema_tag("price")
end