class Digger::Pattern
Extractor pattern definitions.
Constants
- MATCH_MAX
- TYPES
- TYPES_CSS
- TYPES_JSON
- TYPES_OTHER
- TYPES_REGEXP
Attributes
block[RW]
type[RW]
value[RW]
Public Class Methods
new(hash = {})
click to toggle source
# File lib/digger/pattern.rb, line 8
# Builds a pattern from a hash of attributes.
#
# Only the +type+, +value+ and +block+ entries are applied, through the
# matching attribute writers; keys may be strings or symbols. Every other
# entry is ignored.
def initialize(hash = {})
  hash.each_pair do |key, value|
    next unless %w[type value block].include?(key.to_s)

    send("#{key}=", value)
  end
end
wrap(hash)
click to toggle source
# File lib/digger/pattern.rb, line 27
# Returns a new hash with the same keys as +hash+ in which every value is a
# Pattern: values that already are Pattern instances are kept as they are,
# anything else is passed to Pattern.new.
def self.wrap(hash)
  hash.transform_values do |entry|
    next entry if entry.is_a?(Pattern)

    Pattern.new(entry)
  end
end
Public Instance Methods
css_match(doc)
click to toggle source
# File lib/digger/pattern.rb, line 84
# Runs the CSS selector stored in +value+ against +doc+ and post-processes
# the result according to +type+:
#
# * 'css_many' - every matched node goes through the block; returns the list.
# * 'css_all'  - the whole node set is handed to the block in one call.
# * otherwise  - only the first matched node is handed to the block.
#
# When no custom block is configured, single nodes default to their stripped
# text content (nil-safe); 'css_all' passes no default to safe_block.
#
# doc is a Nokogiri::HTML::Document (per the original inline note).
def css_match(doc)
  nodes = doc.css(value)

  case type
  when 'css_many'
    extractor = safe_block { |node| node&.content&.strip }
    nodes.map { |node| extractor.call(node) }
  when 'css_all'
    safe_block.call(nodes)
  else
    safe_block { |node| node&.content&.strip }.call(nodes.first)
  end
end
get_body(page)
click to toggle source
# File lib/digger/pattern.rb, line 59
# Passes the page's body through the block returned by safe_block and
# returns the result.
def get_body(page)
  extractor = safe_block
  extractor.call(page.body)
end
get_header(page)
click to toggle source
# File lib/digger/pattern.rb, line 54
# Looks up the header named by +value+ (downcased) in page.headers, takes
# the first entry of what is found (nil when the header is absent) and
# passes it through the block returned by safe_block.
def get_header(page)
  headers = page.headers
  entries = headers[value.to_s.downcase] || []
  first_entry = entries.first
  safe_block.call(first_entry)
end
get_lines(page)
click to toggle source
# File lib/digger/pattern.rb, line 67
# Splits the page body on "\n", strips each line, drops the empty ones and
# returns the remaining lines after passing each through the block returned
# by safe_block.
def get_lines(page)
  extractor = safe_block
  lines = page.body.split("\n").map(&:strip).reject(&:empty?)
  lines.map { |line| extractor.call(line) }
end
get_plain(page)
click to toggle source
# File lib/digger/pattern.rb, line 63
# Passes the text of the page's document (nil when page.doc is nil) through
# the block returned by safe_block and returns the result.
def get_plain(page)
  extractor = safe_block
  extractor.call(page.doc&.text)
end
json_match(page)
click to toggle source
# File lib/digger/pattern.rb, line 77
# Fetches a value from the page's JSON data.
#
# The data is obtained by calling the page method named by +type+; +value+
# is parsed as a "$.key[0]"-style path by json_index_keys, and the value
# found by json_fetch is passed through the block returned by safe_block.
def json_match(page)
  document = page.send(type)
  path = json_index_keys(value)
  found = json_fetch(document, path)
  safe_block.call(found)
end
match_page(page)
click to toggle source
# File lib/digger/pattern.rb, line 40
# Applies this pattern to +page+ and returns the extracted result.
#
# Returns nil when the page was not successful or when +type+ belongs to
# none of the known type lists. Otherwise dispatches on +type+:
# regular-expression types match against the body, CSS types against the
# parsed document, JSON types go through json_match, and the remaining
# types call the matching get_<type> method.
def match_page(page)
  return unless page.success?

  return regexp_match(page.body) if TYPES_REGEXP.include?(type) # regular expression
  return css_match(page.doc) if TYPES_CSS.include?(type) # css expression
  return json_match(page) if TYPES_JSON.include?(type)

  send("get_#{type}", page) if TYPES_OTHER.include?(type)
end
regexp_match(body)
click to toggle source
# File lib/digger/pattern.rb, line 99
# Matches the regular expression in +value+ against +body+ (a String).
#
# * 'match_many' - returns every full match, each passed through the block
#   (default: stripped).
# * 'match_all'  - hands the array of all full matches to the block in one
#   call (default: returned unchanged).
# * otherwise    - matches once and passes one capture group to the block.
#   The group number is the position of +type+ inside TYPES_REGEXP.
#   Returns nil when the expression does not match.
#
# The default block of the single-match branch is nil-safe: a capture group
# that did not take part in the match yields nil instead of raising
# NoMethodError from calling strip on nil.
def regexp_match(body)
  if %w[match_many match_all].include?(type)
    regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
    # String#gsub without a replacement returns an enumerator that yields
    # each full match, regardless of capture groups.
    matches = body.gsub(regexp).to_a

    if type == 'match_many'
      extractor = safe_block(&:strip)
      matches.map { |text| extractor.call(text) }
    else
      safe_block.call(matches)
    end
  else
    index = TYPES_REGEXP.index(type)
    matches = body.match(value)
    # Optional groups that did not participate are nil, so strip safely.
    extractor = safe_block { |text| text&.strip }
    extractor.call(matches[index]) unless matches.nil?
  end
end
safe_block(&default_block)
click to toggle source
# File lib/digger/pattern.rb, line 14
# Returns the callable used to post-process extracted values.
#
# * +block+ is nil or a blank string: returns the given default block, or an
#   identity lambda when no default is given.
# * +block+ responds to +call+: returns it unchanged.
# * anything else: +block+ is treated as Ruby source and evaluated; the
#   result of the evaluation is returned.
#
# The former `$SAFE = 2` assignment was removed: levels 2-4 raise
# ArgumentError on Ruby 2.3-2.6, and from Ruby 3.0 on $SAFE is an ordinary
# global variable, so it never provided any sandboxing here.
def safe_block(&default_block)
  if block.nil? || (block.is_a?(String) && block.strip.empty?)
    default_block || ->(v) { v }
  elsif block.respond_to?(:call)
    block
  else
    # SECURITY: this evaluates arbitrary Ruby with full privileges. Only
    # pass string blocks that come from a trusted source.
    eval(block)
  end
end
Private Instance Methods
json_fetch(json, keys)
click to toggle source
# File lib/digger/pattern.rb, line 119
# Walks +json+ along +keys+ and returns the value found at the end.
#
# +keys+ is a list of steps as produced by json_index_keys: each step is a
# hash holding either :index (array position) or :key (hash key). An empty
# list returns +json+ itself.
#
# The +keys+ array is not modified. When an intermediate value is nil (the
# path does not exist), nil is returned instead of raising NoMethodError.
def json_fetch(json, keys)
  keys.reduce(json) do |node, step|
    # Stop early on a missing branch; `break` makes reduce return nil.
    break nil if node.nil?

    node[step[:index] || step[:key]]
  end
end
json_index_keys(keys)
click to toggle source
# File lib/digger/pattern.rb, line 128
# Parses a "$.key[0].other"-style path into a list of lookup steps.
#
# +keys+ is converted with to_s and must contain a line that starts with
# "$" and has no whitespace. Each ".name" segment becomes { key: "name" }
# and each "[n]" segment becomes { index: n }; a bare "$" gives an empty
# list.
#
# Raises ArgumentError when +keys+ is not such a path (previously this
# surfaced as a NoMethodError on nil).
def json_index_keys(keys)
  path = keys.to_s[/^\$\S*$/]
  raise ArgumentError, "invalid JSON path: #{keys.inspect}" if path.nil?

  # Each scan result is [whole segment, key name or nil, index digits or nil].
  path.scan(/(\.(\w+)|\[(\d+)\])/).map do |_segment, key, index|
    key.nil? ? { index: index.to_i } : { key: key }
  end
end