class ArticleJSON::Import::GoogleDoc::HTML::NodeAnalyzer
Attributes
Public Class Methods
@param [Nokogiri::HTML::Node] node
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 9
# Create an analyzer wrapping the given node.
# @param [Nokogiri::HTML::Node] node
def initialize(node)
  # Keep the node around; every predicate of this class inspects it.
  @node = node
end
Public Instance Methods
Check if the node text begins with a certain text @param [String] text @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 23
# Compare the first whitespace-separated word of the node's text with the
# given text. Both sides are stripped and downcased before comparing.
# @param [String] text
# @return [Boolean]
def begins_with?(text)
  normalized_content = node.inner_text.strip.downcase
  expected = text.strip.downcase
  # `first` is nil for a node without any text, which never equals a String
  leading_word = normalized_content.split(' ').first
  leading_word == expected
end
Check if the node is a linebreak. A span only containing whitespaces and
`<br>` tags is considered a linebreak. @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 104
# Tell whether the node is a linebreak: either a `br` element itself or a
# node for which `only_includes_brs?` holds. The result is memoized.
# @return [Boolean]
def br?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_br)
    @is_br = node.name == 'br' || only_includes_brs?
  end
  @is_br
end
Check if the node contains an embedded element @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 96
# Tell whether the node holds an embedded element, as decided by
# `EmbeddedParser.supported?`. The result is memoized.
# @return [Boolean]
def embed?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_embed)
    @is_embed = EmbeddedParser.supported?(node)
  end
  @is_embed
end
Check if the node is empty, i.e. not containing any text. Images, horizontal rules (`<hr>`) and linebreaks also contain no text, so we have to make sure that the node is none of those. @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 32
# Tell whether the node is empty: its stripped text is blank and it is
# neither an image, a horizontal rule nor a linebreak. The result is memoized.
# @return [Boolean]
def empty?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_empty)
    without_text = node.inner_text.strip.empty?
    # Images, <hr> and linebreaks carry no text either but are not "empty"
    @is_empty = without_text && !(image? || hr? || br?)
  end
  @is_empty
end
Check if a node equals a certain text @param [String] text @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 16
# Compare the node's whole text with the given text. Both sides are stripped
# and downcased before comparing.
# @param [String] text
# @return [Boolean]
def has_text?(text)
  actual = node.inner_text.strip.downcase
  expected = text.strip.downcase
  actual == expected
end
Check if the node is a header tag between <h1> and <h5> @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 39
# Tell whether the node is a heading: an `h1` to `h5` element that is neither
# a quote marker nor a text box marker. The result is memoized.
# @return [Boolean]
def heading?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_heading)
    # Quote and text box markers take precedence over the tag name
    @is_heading =
      !(quote? || text_box?) && %w(h1 h2 h3 h4 h5).include?(node.name)
  end
  @is_heading
end
Check if the node is a horizontal line (i.e. `<hr>`) @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 47
# Tell whether the node is a horizontal line, i.e. its tag name is `hr`.
# @return [Boolean]
def hr?
  tag_name = node.name
  tag_name == 'hr'
end
Check if the node contains an image @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 89
# Tell whether the XPath query `.//img` on the node yields at least one
# result. The result is memoized.
# @return [Boolean]
def image?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_image)
    img_count = node.xpath('.//img').length
    @is_image = img_count > 0
  end
  @is_image
end
Check if the node contains an ordered or unordered list @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 66
# Tell whether the node is a list, i.e. its tag name is `ul` or `ol`.
# The result is memoized.
# @return [Boolean]
def list?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_list)
    tag_name = node.name
    @is_list = %w(ul ol).include?(tag_name)
  end
  @is_list
end
Check if the node is a normal text paragraph @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 53
# Tell whether the node is a plain text paragraph: a `p` element that is not
# empty and is not an image, text box, quote or embed. The result is memoized.
# @return [Boolean]
def paragraph?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_paragraph)
    # A <p> only counts as paragraph when no more specific kind applies
    @is_paragraph =
      node.name == 'p' &&
      !(empty? || image? || text_box? || quote? || embed?)
  end
  @is_paragraph
end
Check if the node starts a quote Quotes start with a single line saying “Quote:”. @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 82
# Tell whether the node starts a quote, i.e. `has_text?('quote:')` holds.
# The result is memoized.
# @return [Boolean]
def quote?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_quote)
    @is_quote = has_text?('quote:')
  end
  @is_quote
end
Check if the node starts a text box Text boxes start with a single line saying “Textbox:” or “Highlight:”. @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 74
# Tell whether the node starts a text box, i.e. its text begins with
# "textbox:" or "highlight:" (see `begins_with?`). The result is memoized.
# @return [Boolean]
def text_box?
  # `defined?` instead of `||=` so that a cached `false` is not recomputed
  unless defined?(@is_text_box)
    # Markers are tried in this order; `any?` stops at the first match
    @is_text_box =
      %w(textbox: highlight:).any? { |marker| begins_with?(marker) }
  end
  @is_text_box
end
Determine the type of this node The type is one of the elements supported by article_json. @return [Symbol]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 112
# Determine the type of this node by trying the predicates in a fixed order;
# the first one that holds decides the result.
# @return [Symbol] one of :empty, :hr, :heading, :paragraph, :list,
#   :text_box, :quote, :image, :embed or :unknown
def type
  if empty? then :empty
  elsif hr? then :hr
  elsif heading? then :heading
  elsif paragraph? then :paragraph
  elsif list? then :list
  elsif text_box? then :text_box
  elsif quote? then :quote
  elsif image? then :image
  elsif embed? then :embed
  else :unknown
  end
end
Private Instance Methods
Return true if the node only contains
`<br>` nodes and empty text @return [Boolean]
# File lib/article_json/import/google_doc/html/node_analyzer.rb, line 129
# Tell whether the node has no visible text and its direct children are
# exclusively `br` and `text` nodes, with at least one `br` among them.
# @return [Boolean]
def only_includes_brs?
  return false unless node.inner_text.strip.empty?

  child_names = node.children.map(&:name)
  # All children must be <br> or text nodes, and at least one must be a <br>
  child_names.all? { |child_name| %w(br text).include?(child_name) } &&
    child_names.include?('br')
end