class ArticleJSON::Import::GoogleDoc::HTML::NodeAnalyzer

Attributes

node[R]

Public Class Methods

new(node) click to toggle source

@param [Nokogiri::HTML::Node] node

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 9
# Store the node that the instance methods of this analyzer inspect.
# @param [Nokogiri::HTML::Node] node
def initialize(node)
  @node = node
end

Public Instance Methods

begins_with?(text) click to toggle source

Check if the first word of the node's text equals the given text. The comparison is case-insensitive and ignores surrounding whitespace. @param [String] text @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 23
# Check whether the first whitespace-separated word of the node's text equals
# the given text. Both sides are stripped and downcased before comparing.
# @param [String] text
# @return [Boolean] false when the node has no text at all
def begins_with?(text)
  words = node.inner_text.strip.downcase.split(' ')
  expected = text.strip.downcase
  words.first == expected
end
br?() click to toggle source

Check if the node is a linebreak. A span only containing whitespaces and
`<br>` tags is considered a linebreak. @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 104
# Check whether the node is a linebreak: either a `br` element itself or a
# node for which `only_includes_brs?` holds. The result is memoized.
# @return [Boolean]
def br?
  return @is_br if defined? @is_br
  # A literal `br` element short-circuits the inspection of the children.
  @is_br = node.name == 'br' ? true : only_includes_brs?
end
embed?() click to toggle source

Check if the node contains an embedded element @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 96
# Check whether the node contains an embedded element, as decided by
# `EmbeddedParser.supported?`. The result is memoized (including `false`).
# @return [Boolean]
def embed?
  @is_embed = EmbeddedParser.supported?(node) unless defined? @is_embed
  @is_embed
end
empty?() click to toggle source

Check if the node is empty, i.e. not containing any text. Images, horizontal lines and linebreaks also have no text, so we have to make sure that the node is none of these. @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 32
# Check whether the node is empty, i.e. its stripped text is blank and it is
# neither an image, a horizontal line nor a linebreak. The result is memoized.
# @return [Boolean]
def empty?
  return @is_empty if defined? @is_empty
  blank_text = node.inner_text.strip.empty?
  # Textless nodes that still carry content must not be treated as empty.
  @is_empty = blank_text && !(image? || hr? || br?)
end
has_text?(text) click to toggle source

Check if the node's text equals a certain text. The comparison is case-insensitive and ignores surrounding whitespace. @param [String] text @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 16
# Check whether the node's whole text equals the given text. Both sides are
# stripped and downcased before comparing.
# @param [String] text
# @return [Boolean]
def has_text?(text)
  actual = node.inner_text.strip.downcase
  expected = text.strip.downcase
  actual == expected
end
heading?() click to toggle source

Check if the node is a header tag between <h1> and <h5> @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 39
# Check whether the node is a heading, i.e. an `h1` to `h5` element that does
# not start a quote or a text box. The result is memoized.
# @return [Boolean]
def heading?
  return @is_heading if defined? @is_heading
  # Quote / text box marker lines take precedence over the heading tag.
  marker_line = quote? || text_box?
  @is_heading = !marker_line && %w[h1 h2 h3 h4 h5].include?(node.name)
end
hr?() click to toggle source

Check if the node is a horizontal line (i.e. `<hr>`) @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 47
# Check whether the node is a horizontal line, i.e. an `hr` element.
# @return [Boolean]
def hr?
  tag_name = node.name
  tag_name == 'hr'
end
image?() click to toggle source

Check if the node contains an image @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 89
# Check whether the node contains an image, i.e. the XPath query `.//img`
# on the node yields at least one result. The result is memoized.
# @return [Boolean]
def image?
  return @is_image if defined? @is_image
  images = node.xpath('.//img')
  @is_image = images.length > 0
end
list?() click to toggle source

Check if the node contains an ordered or unordered list @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 66
# Check whether the node is an unordered (`ul`) or ordered (`ol`) list
# element. The result is memoized.
# @return [Boolean]
def list?
  @is_list = %w[ul ol].include?(node.name) unless defined? @is_list
  @is_list
end
paragraph?() click to toggle source

Check if the node is a normal text paragraph @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 53
# Check whether the node is a normal text paragraph: a `p` element that is
# not empty, does not contain an image, does not start a text box or a quote
# and is not an embed. The result is memoized.
# @return [Boolean]
def paragraph?
  return @is_paragraph if defined? @is_paragraph
  # The checks are evaluated in this order and stop at the first hit.
  disqualified =
    node.name != 'p' ||
    empty? ||
    image? ||
    text_box? ||
    quote? ||
    embed?
  @is_paragraph = !disqualified
end
quote?() click to toggle source

Check if the node starts a quote. Quotes start with a single line saying “Quote:”. @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 82
# Check whether the node starts a quote, i.e. `has_text?('quote:')` holds
# for it. The result is memoized.
# @return [Boolean]
def quote?
  @is_quote = has_text?('quote:') unless defined? @is_quote
  @is_quote
end
text_box?() click to toggle source

Check if the node starts a text box. Text boxes start with a single line saying “Textbox:” or “Highlight:”. @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 74
# Check whether the node starts a text box, i.e. `begins_with?` holds for
# either `textbox:` or `highlight:`. The result is memoized.
# @return [Boolean]
def text_box?
  return @is_text_box if defined? @is_text_box
  @is_text_box = %w[textbox: highlight:].any? { |marker| begins_with?(marker) }
end
type() click to toggle source

Determine the type of this node. The type is one of the elements supported by article_json. @return [Symbol]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 112
# Determine the type of this node by asking the predicates in a fixed order;
# the first one that holds decides the type.
# @return [Symbol] one of :empty, :hr, :heading, :paragraph, :list,
#   :text_box, :quote, :image, :embed or :unknown if no predicate matches
def type
  if empty? then :empty
  elsif hr? then :hr
  elsif heading? then :heading
  elsif paragraph? then :paragraph
  elsif list? then :list
  elsif text_box? then :text_box
  elsif quote? then :quote
  elsif image? then :image
  elsif embed? then :embed
  else :unknown
  end
end

Private Instance Methods

only_includes_brs?() click to toggle source

Return true if the node only contains `<br>`
nodes and empty text. @return [Boolean]

# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 129
# Check whether the node has blank text and its direct children consist
# solely of `br` and `text` nodes, with at least one `br` among them.
# @return [Boolean]
def only_includes_brs?
  if node.inner_text.strip.empty?
    child_names = node.children.map(&:name)
    # Every child must be a `br` or text node, and a `br` has to be present.
    child_names.all? { |child_name| %w[br text].include?(child_name) } &&
      child_names.include?('br')
  else
    false
  end
end