class ArticleJSON::Import::GoogleDoc::HTML::TextParser

Public Class Methods

extract(node:, css_analyzer:) click to toggle source

Extract multiple text nodes from a wrapping node. The wrapping node is usually a paragraph or caption. @param [Nokogiri::HTML::Node] node @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer @return [Array<ArticleJSON::Elements::Text>]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 68
# Build one text element per non-empty child of the given wrapping node.
#
# Children that `NodeAnalyzer` reports as empty are skipped, and the
# remaining children are converted in document order.
#
# @param [Nokogiri::HTML::Node] node
# @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
# @return [Array<ArticleJSON::Elements::Text>]
def extract(node:, css_analyzer:)
  node.children.each_with_object([]) do |child_node, elements|
    next if NodeAnalyzer.new(child_node).empty?

    text_element = new(node: child_node, css_analyzer: css_analyzer).element
    # Mirror `compact`: only nil results are dropped
    elements << text_element unless text_element.nil?
  end
end
new(node:, css_analyzer:) click to toggle source

@param [Nokogiri::HTML::Node] node @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer

# File lib/article_json/import/google_doc/html/text_parser.rb, line 8
# Store the node to parse together with the CSS analyzer that is used to
# resolve its styling.
#
# @param [Nokogiri::HTML::Node] node
# @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
def initialize(node:, css_analyzer:)
  @node, @css_analyzer = node, css_analyzer
end

Public Instance Methods

bold?() click to toggle source

Check if the text node is styled as bold @return [Boolean]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 24
# Whether the text node is styled as bold.
#
# Only `span` nodes carrying a `class` attribute can be bold; the decision
# itself is delegated to the CSS analyzer.
#
# @return [Boolean]
def bold?
  return false unless @node.name == 'span'
  return false unless @node.has_attribute?('class')

  css_class = @node.attribute('class').value
  @css_analyzer.bold?(css_class)
end
content() click to toggle source

The content of the text node, without any markup @return [String]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 15
# Plain text of the node with all markup removed.
#
# `br` children become newlines; every other child contributes its inner
# text. Whitespace around linebreaks is then collapsed so that any run of
# linebreaks ends up as a single "\n".
#
# @return [String]
def content
  fragments = @node.children.map do |child|
    if child.name == 'br'
      "\n"
    else
      child.inner_text
    end
  end
  # Only keep a single consecutive linebreak
  fragments.join('').gsub(/\s*\n\s*/, "\n")
end
element() click to toggle source

@return [ArticleJSON::Elements::Text]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 53
# Build the text element for this node from its content, styling flags and
# optional link target.
#
# @return [ArticleJSON::Elements::Text]
def element
  text_attributes = {
    content: content,
    bold: bold?,
    italic: italic?,
    href: href
  }
  ArticleJSON::Elements::Text.new(**text_attributes)
end
href() click to toggle source

A possible link target for the text, otherwise `nil`. Google redirects (basically all links in a Google Doc HTML export) are stripped. @return [String, nil]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 42
# Link target of the text, if there is one.
#
# A target only exists when the node is a `span` whose first element child
# is an `a` tag with an `href` attribute. The attribute value is passed
# through `strip_google_redirect` before being returned.
#
# @return [String, nil]
def href
  return unless @node.name == 'span'

  link = @node.first_element_child
  return unless link&.name == 'a' && link.has_attribute?('href')

  strip_google_redirect(link.attribute('href').value)
end
italic?() click to toggle source

Check if the text node is styled as italic @return [Boolean]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 32
# Whether the text node is styled as italic.
#
# Only `span` nodes carrying a `class` attribute can be italic; the decision
# itself is delegated to the CSS analyzer.
#
# @return [Boolean]
def italic?
  return false unless @node.name == 'span'
  return false unless @node.has_attribute?('class')

  css_class = @node.attribute('class').value
  @css_analyzer.italic?(css_class)
end

Private Instance Methods

strip_google_redirect(url) click to toggle source

@param [String] url @return [String]

# File lib/article_json/import/google_doc/html/text_parser.rb, line 80
def strip_google_redirect(url)
  uri = URI(url)
  if uri.host && uri.host.match(/google\.com/) && uri.path == '/url'
    params = CGI.parse(uri.query)
    return params['q'].first if params['q'] && params['q'].any?
  end
  url
end