module PageRecognizer

Attributes

logger[RW]

Public Class Methods

load(str) click to toggle source
# File lib/pagerecognizer.rb, line 22
def self.load str
  require "nokogiri"
  Nokogiri::HTML(str).css("div").map do |n|
    Struct.new(*%i{ node top left width height }).new Struct.new(:tag_name).new(n.text),
      *n[:style].scan(/(\S+): ([^\;]+)/).to_h.values_at(
                    *%w{ top left width height }
      ).map(&:to_f)
  end.extend Dumpable
end
new(msg, arrays) click to toggle source
Calls superclass method
# File lib/pagerecognizer.rb, line 138
def initialize msg, arrays
  Module.nesting.first.logger.error "#{self.class}: #{msg}"
  @dumps = arrays.map{ |name, array| [name, array.extend(Dumpable).dump] }.to_h
  super msg
end

Public Instance Methods

cols(*heuristics) click to toggle source
# File lib/pagerecognizer.rb, line 233
def cols *heuristics
  heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
  split heuristics, :width, :height, :left, :top
end
recognize() click to toggle source
# File lib/pagerecognizer.rb, line 32
def recognize
  logger = Module.nesting.first.logger

  nodes = []
  try = lambda do
    prev = nodes
    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        var child_nodes = Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 });
        var clickable;
        if (node.nodeName == 'svg') {
          var states = child_nodes.map( function(n){
            return _tap(n.style ? n.style.display : '', function(){ n.style.display = 'none' } );
          } );
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
          var _zip = function(a, b){ return a.map( function(e, i) { return [e, b[i]] } ) };
          _zip(child_nodes, states).forEach( function(_){ _[0].style.display = _[1] } );
        } else {
          clickable = (node === document.elementFromPoint(rect.x + rect.width/2, rect.y + rect.height/2));
        };
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          rect.top, rect.left, rect.width, rect.height, clickable, node
        ] ].concat(node.nodeName == 'svg' ? [] : child_nodes.flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :top, :left, :width, :height, :clickable, :node
    nodes = page.evaluate(code, self).map{ |s| str.new *s }
    nodes.size == prev.size
  end

  if defined? Selenium::WebDriver::Wait
    Selenium::WebDriver::Wait.new(
      message: "number of DOM elements didn't stop to change"
    ).until &try
  else
    t = Time.now
    until try.call
      fail "number of DOM elements didn't stop to change" if Time.now > t + 5
    end
  end
  logger.info "#{nodes.size} DOM nodes found"

  nodes.select! &:clickable
  nodes.reject do |n|
    nodes.any? do |nn|
      cs = [
        nn.top <=> n.top,
        nn.left <=> n.left,
        n.left + n.width <=> nn.left + nn.width,
        n.top + n.height <=> nn.top + nn.height,
      ]
      cs.include?(1) && !cs.include?(-1)
    end
  end.extend Dumpable
end
rows(*heuristics) click to toggle source
# File lib/pagerecognizer.rb, line 229
def rows *heuristics
  heuristics = %i{ AREA HEIGHT WIDTH } if heuristics.empty?
  split heuristics, :height, :width, :top, :left
end

Private Instance Methods

recognize_more() click to toggle source
# File lib/pagerecognizer.rb, line 95
        def recognize_more
  logger = Module.nesting.first.logger

  nodes = []
  try = lambda do
    prev = nodes
    code = "( function(node) {
      var x = scrollX, y = scrollY;
      var _tap = function(x, f){ f(); return x };
      var f = function(node) {
        node.scrollIntoView();
        var rect = JSON.parse(JSON.stringify(node.getBoundingClientRect()));
        rect.top += scrollY;
        rect.left += scrollX;
        return [ [
          node, JSON.stringify([rect.top, rect.left, rect.width, rect.height])
        ] ].concat(Array.from(node.childNodes).filter(function(node) { return node.nodeType == 1 }).flatMap(f));
      };
      return _tap(f(node), function(){ scrollTo(x, y) });
    } )(arguments[0])"
    str = Struct.new :node, :top, :left, :width, :height
    nodes = page.evaluate(code, self).map{ |node, a| str.new node, *JSON.load(a) }
    nodes.size == prev.size
  end

  if defined? Selenium::WebDriver::Wait
    Selenium::WebDriver::Wait.new(
      message: "number of DOM elements didn't stop to change"
    ).until &try
  else
    t = Time.now
    until try.call
      fail "number of DOM elements didn't stop to change" if Time.now > t + 10
    end
  end
  logger.info "#{nodes.size} DOM nodes found"

  nodes.reject!{ |i| i.height.zero? || i.width.zero? }
  nodes
end
split(heuristics, hh, ww, tt, ll) click to toggle source
# File lib/pagerecognizer.rb, line 146
        def split heuristics, hh, ww, tt, ll
  logger = Module.nesting.first.logger

  unstale = unless defined? Selenium::WebDriver::Error::StaleElementReferenceError
    ->(&b){ b.call }
  else
    lambda do |&try|
      t = Time.now
      begin
        try.call
      rescue Selenium::WebDriver::Error::StaleElementReferenceError
        raise if Time.now > t + 10
        retry
      end
    end
  end
  all = unstale.call do recognize_more end.sort_by(&tt)
  logger.info "all nodes: #{all.size}"
  rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
  inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
  raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
  logger.info "inside nodes: #{inside.size}"
  nodes = unstale.call do inside.reject{ |i| %w{ button script svg path a img span }.include? i.node.tag_name } end.uniq{ |i| [i[hh], i[ww], i[tt], i[ll]] }
  logger.info "good nodes: #{nodes.size}"   # only those that might be containers

  large = nodes#.select{ |i| i[ww] > nodes.map(&ww).max / 4 }
  logger.info "large enough and unique: #{large.size}"

  interfere = lambda do |a, b|
    a[tt] < b[tt] + b[hh] &&
    b[tt] < a[tt] + a[hh]
  end

  rest = large.select.with_index do |a, i|
    large.each_with_index.none? do |b, j|
      next if i == j
      a[tt] >= b[tt] && a[tt] + a[hh] <= b[tt] + b[hh] &&
      large.all?{ |c| interfere[a, c] == interfere[b, c] }
    end
  end
  logger.info "not nested: #{rest.size}"
  # rest = rest.sample 50

  # adding the :area field for faster upcoming computations
  struct = Struct.new *large.first.members, :area
  rest.map!{ |i| struct.new *i.values, i.width * i.height }

  require "pcbr"
  pcbr = PCBR.new
  is = []
  max, past = 0, []
  prev = nil
  time = Time.now
  loop do
    rest.each_with_index do |node, i|
      next if is.any?{ |j| i == j || interfere[rest[i], rest[j]] }
      sol = rest.values_at *is, i
      pcbr.store [*is, i].sort, [
        *( is.size                                                                                                                if heuristics.include? :SIZE   ),
        *( sol.map(&:area).inject(:+)                                                                                             if heuristics.include? :AREA   ),
        *( -sol.product(sol).map{ |s1, s2| (s1.width              - s2.width             ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :WIDTH  ),
        *( -sol.product(sol).map{ |s1, s2| (s1.height             - s2.height            ).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :HEIGHT ),
        *( -sol.product(sol).map{ |s1, s2| (s1[ll] + s1[ww] / 2.0 - s2[ll] - s2[ww] / 2.0).abs }.inject(:+) / sol.size / sol.size if heuristics.include? :MIDDLE ),
      ] unless pcbr.table.assoc [*is, i].sort
    end
    if prev && Time.now - time > 1 && (Time.now - prev > (prev - time))
      m = pcbr.table.reject{ |i| i.first.size == 1 }.map(&:last).max
      break if 1 == pcbr.table.count{ |i| i.last == m } || Time.now - time > 5
    end
    break unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.inject(:+) }.max_by(&:last)
    if t.last > max
      prev, max = Time.now, t.last
      logger.debug [Time.now - time, max, t.first]
    end
    past.push (is = t.first).map{ |i| 2**i }.inject(:+)
  end
  # TODO: if multiple with max score, take the max by area
  unless best = pcbr.table.reject{ |is,| is.size == 1 }.max_by(&:last)
    raise ErrorNotEnoughNodes.new "failed to split <#{tag_name}>", all: all, inside: inside, nodes: nodes, large: large, rest: rest
  end
  rest.values_at(*best.first).extend(Dumpable)
end