class ArticleJSON::Import::GoogleDoc::HTML::Parser

Public Class Methods

new(html) click to toggle source

@param [String] html

# File lib/article_json/import/google_doc/html/parser.rb, line 7
# Set up the parser state from the HTML export of a document.
#
# The children of the last `<body>` element are exposed through an external
# enumerator so that the parse methods can peek at and consume nodes on
# demand. The text of the last `<head><style>` element (nil when there is
# none) is handed to CSSAnalyzer.
#
# @param [String] html
def initialize(html)
  doc = Nokogiri::HTML(html)

  # `last` returns nil when no <body> element matched; fall back to an empty
  # node list instead of raising NoMethodError on `nil.children`.
  body_nodes = doc.xpath('//body').last&.children || []
  @body_enumerator = body_nodes.to_enum

  css_node = doc.xpath('//head/style').last
  @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
end

Public Instance Methods

parsed_content() click to toggle source

Parse the body of the document and return the result.

@return [Array]

# File lib/article_json/import/google_doc/html/parser.rb, line 17
# Return the parsed body of the document, parsing it on first use.
#
# The result is cached in @parsed_content; as long as that value is truthy,
# later calls return it without invoking parse_body again.
#
# @return [Array]
def parsed_content
  return @parsed_content if @parsed_content

  @parsed_content = parse_body
end

Private Instance Methods

body_has_more_nodes?() click to toggle source

@return [Boolean]

# File lib/article_json/import/google_doc/html/parser.rb, line 128
# Check whether the body enumerator can still yield a node.
#
# Uses `peek`, so the enumerator position is left untouched. `peek` raises
# StopIteration once the enumerator is exhausted, which is mapped to false.
#
# @return [Boolean]
def body_has_more_nodes?
  @body_enumerator.peek
rescue StopIteration
  false
else
  true
end
next_node() click to toggle source

Return the next node if available, and advance the enumerator.

@return [Nokogiri::HTML::Node]

# File lib/article_json/import/google_doc/html/parser.rb, line 123
# Take the next node from the body enumerator, advancing it by one.
#
# @return [Nokogiri::HTML::Node, nil] nil when no nodes are left
def next_node
  return unless body_has_more_nodes?

  @body_enumerator.next
end
nodes_until_hr() click to toggle source

Collect all nodes until a horizontal line, advancing the enumerator.

@return [Array]

# File lib/article_json/import/google_doc/html/parser.rb, line 112
# Gather consecutive body nodes up to, but not including, the next node that
# NodeAnalyzer reports as a horizontal rule.
#
# Every collected node is consumed from the enumerator; the horizontal rule
# itself (if any) is only peeked at and stays in the enumerator.
#
# @return [Array]
def nodes_until_hr
  collected = []
  while body_has_more_nodes?
    break if NodeAnalyzer.new(@body_enumerator.peek).hr?

    collected.push(@body_enumerator.next)
  end
  collected
end
parse_body() click to toggle source

Loop over all body nodes and parse them.

@return [Array]

# File lib/article_json/import/google_doc/html/parser.rb, line 25
# Walk through every remaining body node and collect the parsed elements.
#
# Each node is wrapped in a NodeAnalyzer and stored in @current_node before
# it is handed to parse_current_node. Nodes for which that returns nil or
# false are skipped. The individual parse methods may consume further nodes
# from the enumerator themselves (e.g. captions).
#
# @return [Array] the collected elements, also stored in @parsed_content
def parse_body
  @parsed_content = []
  while body_has_more_nodes?
    @current_node = NodeAnalyzer.new(@body_enumerator.next)
    element = parse_current_node
    @parsed_content << element if element
  end
  @parsed_content
end
parse_current_node() click to toggle source

Parse the current node and return an element, if available.

@return [ArticleJSON::Elements::Base]

# File lib/article_json/import/google_doc/html/parser.rb, line 38
# Dispatch @current_node to the parse method matching its type.
#
# Horizontal rules, empty nodes and unknown nodes produce no element, and
# neither does any type not listed here.
#
# @return [ArticleJSON::Elements::Base, nil]
def parse_current_node
  type = @current_node.type
  case type
  when :heading
    parse_heading
  when :paragraph
    parse_paragraph
  when :list
    parse_list
  when :image
    parse_image
  when :text_box
    parse_text_box
  when :quote
    parse_quote
  when :embed
    parse_embed
  end
end
parse_embed() click to toggle source

@return [ArticleJSON::Elements::Embed]

# File lib/article_json/import/google_doc/html/parser.rb, line 102
# Build an embed element from the current node via EmbeddedParser.
#
# The node following the current one is taken from the enumerator and passed
# along as the caption node (nil if there is no further node).
#
# @return [ArticleJSON::Elements::Embed]
def parse_embed
  embed_node = @current_node.node
  caption = next_node
  EmbeddedParser.build(
    node: embed_node,
    caption_node: caption,
    css_analyzer: @css_analyzer
  )
end
parse_heading() click to toggle source

@return [ArticleJSON::Elements::Heading]

# File lib/article_json/import/google_doc/html/parser.rb, line 52
# Build a heading element from the current node via HeadingParser.
#
# @return [ArticleJSON::Elements::Heading]
def parse_heading
  parser = HeadingParser.new(node: @current_node.node)
  parser.element
end
parse_image() click to toggle source

@return [ArticleJSON::Elements::Image]

# File lib/article_json/import/google_doc/html/parser.rb, line 73
# Build an image element from the current node via ImageParser.
#
# The node following the current one is taken from the enumerator and passed
# along as the caption node (nil if there is no further node).
#
# @return [ArticleJSON::Elements::Image]
def parse_image
  image_node = @current_node.node
  caption = next_node
  parser = ImageParser.new(
    node: image_node,
    caption_node: caption,
    css_analyzer: @css_analyzer
  )
  parser.element
end
parse_list() click to toggle source

@return [ArticleJSON::Elements::List]

# File lib/article_json/import/google_doc/html/parser.rb, line 66
# Build a list element from the current node via ListParser.
#
# @return [ArticleJSON::Elements::List]
def parse_list
  parser = ListParser.new(
    node: @current_node.node,
    css_analyzer: @css_analyzer
  )
  parser.element
end
parse_paragraph() click to toggle source

@return [ArticleJSON::Elements::Paragraph|nil]

# File lib/article_json/import/google_doc/html/parser.rb, line 57
# Build a paragraph element from the current node via ParagraphParser.
#
# Paragraphs that report themselves as blank are dropped.
#
# @return [ArticleJSON::Elements::Paragraph|nil]
def parse_paragraph
  parser = ParagraphParser.new(
    node: @current_node.node,
    css_analyzer: @css_analyzer
  )
  element = parser.element
  return if element.blank?

  element
end
parse_quote() click to toggle source

@return [ArticleJSON::Elements::Quote]

# File lib/article_json/import/google_doc/html/parser.rb, line 95
# Build a quote element via QuoteParser.
#
# The quote's content is the run of nodes returned by nodes_until_hr, i.e.
# the nodes from the current enumerator position up to the next horizontal
# rule.
#
# @return [ArticleJSON::Elements::Quote]
def parse_quote
  quote_nodes = nodes_until_hr
  parser = QuoteParser.new(nodes: quote_nodes, css_analyzer: @css_analyzer)
  parser.element
end
parse_text_box() click to toggle source

@return [ArticleJSON::Elements::TextBox]

# File lib/article_json/import/google_doc/html/parser.rb, line 84
# Build a text box element via TextBoxParser.
#
# The current node is passed as the type node; the box's content is the run
# of nodes returned by nodes_until_hr, i.e. the nodes from the current
# enumerator position up to the next horizontal rule.
#
# @return [ArticleJSON::Elements::TextBox]
def parse_text_box
  type_node = @current_node.node
  content_nodes = nodes_until_hr
  parser = TextBoxParser.new(
    type_node: type_node,
    nodes: content_nodes,
    css_analyzer: @css_analyzer
  )
  parser.element
end