class Digger::Pattern

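Defines the extraction patterns (regular expression, CSS selector, JSON path and others) that are matched against a fetched page.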

Constants

MATCH_MAX
TYPES
TYPES_CSS
TYPES_JSON
TYPES_OTHER
TYPES_REGEXP

Attributes

block[RW]
type[RW]
value[RW]

Public Class Methods

new(hash = {}) click to toggle source
# File lib/digger/pattern.rb, line 8
# Builds a pattern from an options hash.
#
# Only the +type+, +value+ and +block+ entries are used; each one is handed to
# the attribute writer of the same name. Keys are compared after +to_s+, so
# both strings and symbols are accepted. Every other key is ignored.
#
# @param hash [Hash] attribute names mapped to their values
def initialize(hash = {})
  hash.each_pair do |name, setting|
    attribute = name.to_s
    next unless %w[type value block].include?(attribute)

    send("#{attribute}=", setting)
  end
end
wrap(hash) click to toggle source
# File lib/digger/pattern.rb, line 27
# Converts every value of +hash+ into a Pattern while keeping the keys.
#
# Values that already are Pattern instances are reused unchanged; anything
# else is passed to Pattern.new.
#
# @param hash [Hash] names mapped to Pattern instances or Pattern.new arguments
# @return [Hash] a new hash with the same keys and Pattern values
def self.wrap(hash)
  hash.transform_values do |spec|
    next spec if spec.is_a?(Pattern)

    Pattern.new(spec)
  end
end

Public Instance Methods

css_match(doc) click to toggle source
# File lib/digger/pattern.rb, line 84
# Runs the CSS selector stored in +value+ against +doc+.
#
# The result depends on +type+:
# * "css_many" - one result per selected node; by default the node's stripped content
# * "css_all"  - the whole selection is handed to the block in one call
# * otherwise  - only the first selected node; by default its stripped content
#
# @param doc [#css] parsed document (per the original note, a Nokogiri::HTML::Document)
# @return [Object, nil] whatever the block (or the default extractor) returns
def css_match(doc)
  nodes = doc.css(value)
  case type
  when 'css_many'
    extract = safe_block { |node| node&.content&.strip }
    nodes.map { |node| extract.call(node) }
  when 'css_all'
    safe_block.call(nodes)
  else
    extract = safe_block { |node| node&.content&.strip }
    extract.call(nodes.first)
  end
end
get_body(page) click to toggle source
# File lib/digger/pattern.rb, line 59
# Passes the raw page body through the pattern's block.
#
# @param page [#body] fetched page
# @return [Object] the block result (the body itself when no block is configured)
def get_body(page)
  handler = safe_block
  handler.call(page.body)
end
get_header(page) click to toggle source
# File lib/digger/pattern.rb, line 54
# Looks up a response header and passes its first entry through the block.
#
# The header name is +value+ converted to a lower-case string. A header that
# is missing from +page.headers+ yields nil.
#
# @param page [#headers] fetched page; headers are presumably a hash of
#   lower-case names to arrays of values - confirm against the page class
# @return [Object, nil] the block result for the first header entry
def get_header(page)
  header_name = value.to_s.downcase
  entries = page.headers[header_name] || []
  first_entry = entries.first
  safe_block.call(first_entry)
end
get_lines(page) click to toggle source
# File lib/digger/pattern.rb, line 67
# Splits the page body into lines and passes each non-blank line through the block.
#
# Lines are split on "\n" and stripped; lines that are empty after stripping
# are dropped before the block is applied.
#
# @param page [#body] fetched page
# @return [Array] one block result per remaining line
def get_lines(page)
  handler = safe_block
  stripped = page.body.split("\n").map(&:strip)
  stripped.reject(&:empty?).map { |line| handler.call(line) }
end
get_plain(page) click to toggle source
# File lib/digger/pattern.rb, line 63
# Passes the text of the parsed document through the block.
#
# When +page.doc+ is nil the block receives nil.
#
# @param page [#doc] fetched page
# @return [Object, nil] the block result
def get_plain(page)
  handler = safe_block
  document = page.doc
  handler.call(document&.text)
end
json_match(page) click to toggle source
# File lib/digger/pattern.rb, line 77
# Extracts a value from the page's JSON using the path stored in +value+.
#
# The JSON document is obtained by calling the method named by +type+ on
# +page+; the path is parsed by json_index_keys and resolved by json_fetch.
#
# NOTE(review): +type+ is sent to +page+ verbatim. match_page only routes
# here for TYPES_JSON values, but direct callers of this public method are
# not restricted in the same way.
#
# @param page [Object] fetched page responding to the method named by +type+
# @return [Object, nil] the block result for the value found at the path
def json_match(page)
  found = json_fetch(page.send(type), json_index_keys(value))
  safe_block.call(found)
end
match_page(page) click to toggle source
# File lib/digger/pattern.rb, line 40
# Applies this pattern to a fetched page, dispatching on +type+.
#
# Nothing is matched (nil is returned) when the page was not successful or
# when +type+ belongs to none of the known type lists.
#
# @param page [Object] fetched page (uses +success?+, +body+ and +doc+)
# @return [Object, nil] the result of the matcher selected by +type+
def match_page(page)
  return unless page.success?
  # regular expression
  return regexp_match(page.body) if TYPES_REGEXP.include?(type)
  # css expression
  return css_match(page.doc) if TYPES_CSS.include?(type)
  return json_match(page) if TYPES_JSON.include?(type)

  # the remaining types map directly onto a get_<type> method
  send("get_#{type}", page) if TYPES_OTHER.include?(type)
end
regexp_match(body) click to toggle source
# File lib/digger/pattern.rb, line 99
# Matches +body+ against the regular expression held in +value+.
#
# The result depends on +type+:
# * "match_many" - every full match in +body+, each passed through the block
#   (stripped by default)
# * "match_all"  - the array of every full match, handed to the block in one call
# * otherwise    - the first match only; the position of +type+ inside
#   TYPES_REGEXP is used as the capture-group number, and that group is
#   passed through the block (stripped by default)
#
# @param body [String] text to search
# @return [Object, nil] the block result(s); nil when the single-match form
#   finds no match, or (with the default block) when the selected group is nil
def regexp_match(body)
  if %w[match_many match_all].include?(type)
    regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
    # gsub without a block enumerates the full text of every match
    matches = body.gsub(regexp).to_a
    return safe_block.call(matches) if type == 'match_all'

    handler = safe_block(&:strip)
    matches.map { |text| handler.call(text) }
  else
    group = TYPES_REGEXP.index(type)
    found = body.match(value)
    return if found.nil?

    # A group that is absent or took no part in the match is nil, so the
    # default extractor must tolerate nil instead of raising NoMethodError.
    handler = safe_block { |text| text&.strip }
    handler.call(found[group])
  end
end
safe_block(&default_block) click to toggle source
# File lib/digger/pattern.rb, line 14
# Resolves the callable used to post-process a match.
#
# * When +block+ is nil or a blank string, the given default block is
#   returned, or an identity lambda when no default is given.
# * When +block+ already responds to +call+, it is returned unchanged.
# * Otherwise +block+ is evaluated as Ruby source and the result of that
#   evaluation is returned (it is expected to be callable).
#
# @yield [value] default post-processing used when no block is configured
# @return [#call] the callable to apply to matched values
def safe_block(&default_block)
  return default_block || ->(v) { v } if block.nil? || (block.is_a?(String) && block.strip.empty?)
  return block if block.respond_to?(:call)

  # SECURITY: this evaluates arbitrary Ruby source, so +block+ strings must
  # only come from trusted configuration. The former `$SAFE = 2` guard is
  # gone on purpose: levels 2-4 raise ArgumentError on Ruby 2.3-2.7, and from
  # Ruby 3.0 $SAFE is an ordinary global that provides no sandboxing at all.
  eval(block)
end

Private Instance Methods

json_fetch(json, keys) click to toggle source
# File lib/digger/pattern.rb, line 119
# Walks +json+ along +keys+ and returns the value found at the end.
#
# Each path element is a hash carrying either +:index+ (array position) or
# +:key+ (hash key), as produced by json_index_keys. The +keys+ array is left
# untouched, and the walk stops with nil as soon as an intermediate node is
# nil, so a missing branch behaves like a missing final key.
#
# @param json [Hash, Array, Object] parsed JSON document
# @param keys [Array<Hash>] path elements, outermost first
# @return [Object, nil] the value at the path, +json+ itself for an empty
#   path, or nil when the path cannot be followed
def json_fetch(json, keys)
  keys.reduce(json) do |node, part|
    break if node.nil?

    node[part[:index] || part[:key]]
  end
end
json_index_keys(keys) click to toggle source
# File lib/digger/pattern.rb, line 128
# Parses a JSON path such as "$.items[0].name" into path elements.
#
# The path must start with "$" and contain no whitespace. Every ".name"
# segment becomes +{ key: "name" }+ and every "[n]" segment becomes
# +{ index: n }+.
#
# @param keys [#to_s] the JSON path
# @return [Array<Hash>] path elements, outermost first
# @raise [ArgumentError] when +keys+ is not a "$"-prefixed path
def json_index_keys(keys)
  path = keys.to_s[/^\$\S*$/]
  # Fail with a clear message instead of a NoMethodError on nil.
  raise ArgumentError, "invalid JSON path: #{keys.inspect}" if path.nil?

  # Captures are: whole segment, key name, array index.
  path.scan(/(\.(\w+)|\[(\d+)\])/).map do |_segment, key, index|
    key ? { key: key } : { index: index.to_i }
  end
end