module Buzzsaw::DSL
Constants
- ENCODING_EXCEPTION
Public Instance Methods
asciify_target_text(target)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 240 def asciify_target_text(target) return unless target newstr = "" target.each_char { |chr| newstr << (chr.dump["u{e2}"] ? '"' : chr) } newstr.to_ascii end
capture_target_text(text, pattern)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 200 def capture_target_text(text, pattern) return unless text pattern ? text[pattern] : text.gsub(/\s+/," ") end
collect_by_xpath(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 19 def collect_by_xpath(args) args.symbolize_keys! args[:match] = args[:capture] = args[:pattern] if args[:pattern] nodes = get_nodes(args) target = collect_target_text(args, nodes) return args[:label] if args[:label] && target.present? asciify_target_text(target) end
collect_target_text(args, nodes)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 182 def collect_target_text(args, nodes) match_target_text!(nodes, args[:match]) # Reduce the matching nodes result = join_target_text(nodes, args[:join]) # Filter the string with the :capture regex capture_target_text(result, args[:capture]) rescue ENCODING_EXCEPTION end
filter_target_text(target, filter_list)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 139 def filter_target_text(target, filter_list) filter_list.each do |filter| next unless target.present? filter.symbolize_keys! if filter.is_a?(Hash) if filter.is_a?(String) && respond_to?(filter) target = send(filter, target) elsif filter[:accept] target = target[filter[:accept]] elsif filter[:reject] target.slice!(filter[:reject]) elsif filter[:prefix] target = "#{filter[:prefix]}#{target}" elsif filter[:postfix] target = "#{target}#{filter[:postfix]}" end end target.try(:strip) end
Also aliased as: filters
find_by_meta_tag(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 70 def find_by_meta_tag(args) args.symbolize_keys! args[:pattern] ||= args[:match] # Backwards compatibility nodes = get_nodes_for_meta_attribute(args) return unless target = get_content_for_meta_nodes(nodes) target = target[args[:pattern]] if args[:pattern] return args[:label] if args[:label] && target.present? target end
Also aliased as: label_by_meta_tag
find_by_schema_tag(value)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 82 def find_by_schema_tag(value) string_methods = [:upcase, :downcase, :capitalize] nodes = string_methods.map do |method| doc.at_xpath("//*[@itemprop=\"#{value.send(method)}\"]") end.compact return if nodes.empty? content = nodes.first.text.strip.gsub(/\s+/," ") return unless content.present? content end
find_by_xpath(args)
click to toggle source
Main DSL
methods
# File lib/buzzsaw/dsl.rb, line 9 def find_by_xpath(args) args.symbolize_keys! args[:match] = args[:capture] = args[:pattern] if args[:pattern] nodes = get_nodes(args) target = find_target_text(args, nodes) return args[:label] if args[:label] && target.present? asciify_target_text(target) end
find_in_table(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 29 def find_in_table(args) args.symbolize_keys! xpath = args[:xpath] capture = args[:capture] if args[:row].is_a?(Fixnum) match_row = nil row_index = args[:row] else row_index = nil match_row = args[:row] end if args[:column].is_a?(Fixnum) match_column = nil column_index = args[:column] else column_index = nil match_column = args[:column] end return unless table = doc.at_xpath(xpath) # Rows match first return unless row = match_table_element(table, "tr", match_row, row_index) unless match_column || column_index if capture return row.text[capture] else return row.text end end # Now columns return unless col = match_table_element(row, "td", match_column, column_index) return col.text unless capture col.text[capture] end
find_target_text(args, nodes)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 171 def find_target_text(args, nodes) match_target_text!(nodes, args[:match]) # Select the first match result = nodes.first.try(:strip) # Filter match with the :capture regex capture_target_text(result, args[:capture]) rescue ENCODING_EXCEPTION end
get_content_for_meta_nodes(nodes)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 231 def get_content_for_meta_nodes(nodes) return unless nodes && nodes.any? contents = nodes.map { |node| node.attribute("content") }.compact return if contents.empty? content = contents.first.value.strip.squeeze(" ") return unless content.present? content end
get_nodes(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 216 def get_nodes(args) nodes = doc.xpath(args[:xpath]) nodes.map(&:text).compact end
get_nodes_for_meta_attribute(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 221 def get_nodes_for_meta_attribute(args) attribute = args[:attribute] value_variations = [:upcase, :downcase, :capitalize].map { |method| args[:value].send(method) } nodes = value_variations.map do |value| doc.at_xpath("//head/meta[@#{attribute}=\"#{value}\"]") end.compact return if nodes.empty? nodes end
join_target_text(nodes, delimiter)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 205 def join_target_text(nodes, delimiter) return unless nodes.present? delimiter = delimiter.to_s nodes.inject { |a, b| a.to_s + delimiter + b.to_s } end
label_by_meta_keywords(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 126 def label_by_meta_keywords(args) args.symbolize_keys! return args[:label] if meta_keywords && meta_keywords[args[:pattern]] end
label_by_url(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 93 def label_by_url(args) args.symbolize_keys! return args[:label] if "#{url}"[args[:pattern]] end
match_table_element(table, element, match, index)
click to toggle source
Private
# File lib/buzzsaw/dsl.rb, line 164 def match_table_element(table, element, match, index) row = nil row = table.xpath(".//#{element}").detect { |r| r.text && r.text[match] } if match row ||= table.xpath(".//#{element}[#{index}]") if index row end
match_target_text!(nodes, pattern)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 193 def match_target_text!(nodes, pattern) return unless nodes.present? nodes.select! do |node| pattern ? node[pattern].present? : node.present? end end
meta_description()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 117 def meta_description; meta_name(value: 'description'); end
meta_image()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 118 def meta_image; meta_name(value: 'image'); end
meta_keywords()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 116 def meta_keywords; meta_name(value: 'keywords'); end
meta_name(args)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 107 def meta_name(args) args.symbolize_keys! args.merge!(attribute: 'name') find_by_meta_tag(args) end
meta_og(value)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 113 def meta_og(value); meta_property(value: "og:#{value}"); end
meta_og_description()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 123 def meta_og_description; meta_og('description'); end
meta_og_image()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 124 def meta_og_image; meta_og('image'); end
meta_og_keywords()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 122 def meta_og_keywords; meta_og('keywords'); end
meta_og_title()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 121 def meta_og_title; meta_og('title'); end
meta_price()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 119 def meta_price; meta_name(value: 'price'); end
meta_property(args)
click to toggle source
Meta tag convenience methods
# File lib/buzzsaw/dsl.rb, line 101 def meta_property(args) args.symbolize_keys! args.merge!(attribute: 'property') find_by_meta_tag(args) end
meta_title()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 115 def meta_title; meta_name(value: 'title'); end
sanitize(text)
click to toggle source
# File lib/buzzsaw/dsl.rb, line 211 def sanitize(text) return unless str = Sanitize.clean(text, elements: []) HTMLEntities.new.decode(str) end
schema_description()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 137 def schema_description; find_by_schema_tag("description"); end
schema_name()
click to toggle source
# File lib/buzzsaw/dsl.rb, line 136 def schema_name; find_by_schema_tag("name"); end
schema_price()
click to toggle source
Schema.org convenience mthods
# File lib/buzzsaw/dsl.rb, line 135 def schema_price; find_by_schema_tag("price"); end