class ArticleJSON::Import::GoogleDoc::HTML::TextParser
Public Class Methods
extract(node:, css_analyzer:)
click to toggle source
Extract multiple text nodes from a wrapping node. The wrapping node is usually a paragraph or caption. @param [Nokogiri::HTML::Node] node @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer @return [Array]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 68
# Build one text element per non-empty child of a wrapping node.
#
# Children that NodeAnalyzer reports as empty are skipped. Every other child
# is handed to a new parser instance and converted through #element; `nil`
# results are left out of the returned list.
#
# @param [Nokogiri::HTML::Node] node
# @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
# @return [Array]
def extract(node:, css_analyzer:)
  node.children.each_with_object([]) do |child_node, elements|
    next if NodeAnalyzer.new(child_node).empty?

    text_element = new(node: child_node, css_analyzer: css_analyzer).element
    elements << text_element unless text_element.nil?
  end
end
new(node:, css_analyzer:)
click to toggle source
@param [Nokogiri::HTML::Node] node @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
# File lib/article_json/import/google_doc/html/text_parser.rb, line 8
# Remember the node to parse and the analyzer that is later asked about
# the node's CSS classes.
#
# @param [Nokogiri::HTML::Node] node
# @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
def initialize(node:, css_analyzer:)
  @node, @css_analyzer = node, css_analyzer
end
Public Instance Methods
bold?()
click to toggle source
Check if the text node is styled as bold @return [Boolean]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 24
# Whether the node is a `span` whose `class` attribute value the CSS
# analyzer reports as bold.
#
# @return [Boolean]
def bold?
  return false unless @node.name == 'span'
  return false unless @node.has_attribute?('class')

  css_class = @node.attribute('class').value
  @css_analyzer.bold?(css_class)
end
content()
click to toggle source
The content of the text node, without any markup. @return [String]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 15
# The plain text of the node with all markup removed.
#
# Each direct `br` child becomes a line break; every other direct child
# contributes its inner text. Whitespace touching a line break is dropped,
# which also folds runs of breaks into a single one.
#
# @return [String]
def content
  fragments = @node.children.map do |child|
    if child.name == 'br'
      "\n"
    else
      child.inner_text
    end
  end
  joined = fragments.join('')
  # Only keep a single consecutive linebreak
  joined.gsub(/\s*\n\s*/, "\n")
end
element()
click to toggle source
@return [ArticleJSON::Elements::Text]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 53
# Wrap the parsed content, style flags and link target in a text element.
#
# @return [ArticleJSON::Elements::Text]
def element
  attributes = {
    content: content,
    bold: bold?,
    italic: italic?,
    href: href
  }
  ArticleJSON::Elements::Text.new(**attributes)
end
href()
click to toggle source
A possible link target for the text, otherwise `nil`. Google redirects (basically all links in a Google Doc HTML export) are stripped. @return [String]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 42
# The link target of the text, or `nil` when there is none.
#
# A target exists only when the node is a `span` whose first element child
# is an `a` carrying an `href` attribute. The attribute value is passed
# through #strip_google_redirect before it is returned.
#
# @return [String]
def href
  return unless @node.name == 'span'

  link = @node.first_element_child
  return unless link && link.name == 'a' && link.has_attribute?('href')

  strip_google_redirect(link.attribute('href').value)
end
italic?()
click to toggle source
Check if the text node is styled as italic @return [Boolean]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 32
# Whether the node is a `span` whose `class` attribute value the CSS
# analyzer reports as italic.
#
# @return [Boolean]
def italic?
  return false unless @node.name == 'span'
  return false unless @node.has_attribute?('class')

  css_class = @node.attribute('class').value
  @css_analyzer.italic?(css_class)
end
Private Instance Methods
strip_google_redirect(url)
click to toggle source
@param [String] url @return [String]
# File lib/article_json/import/google_doc/html/text_parser.rb, line 80
# Unwrap a Google redirect link and return the URL it points to.
#
# A redirect is a URL on a Google host whose path is `/url` and whose query
# carries the real destination in its `q` parameter. Every other input is
# returned unchanged, including strings that cannot be parsed as a URI and
# redirects that have no non-empty `q` value.
#
# @param [String] url
# @return [String]
def strip_google_redirect(url)
  uri = URI(url)
  # Anchor the host check so look-alikes such as `notgoogle.com` or
  # `google.com.example.org` are not mistaken for redirects. An optional
  # two-letter country suffix (e.g. `google.com.au`) is still accepted.
  google_host = uri.host.to_s =~ /(?:\A|\.)google\.com(?:\.[a-z]{2})?\z/i
  return url unless google_host && uri.path == '/url'

  # `uri.query` is nil when the redirect has no query string at all.
  _key, target = URI.decode_www_form(uri.query.to_s).assoc('q')
  target.nil? || target.empty? ? url : target
rescue URI::InvalidURIError, ArgumentError
  # Unparsable URL or malformed query encoding: keep the link as it is.
  url
end