class SpreadsheetBuilder::HtmlParser

Attributes

doc[R]

Public Class Methods

from_erb(file) click to toggle source
# File lib/spreadsheet_builder/html_parser.rb, line 8
def self.from_erb(file)
  html     = File.read(file)
  template = ERB.new(html)
  html     = template.result

  new(html)
end
from_slim(file, options = {}, context = self, &block) click to toggle source
# File lib/spreadsheet_builder/html_parser.rb, line 3
def self.from_slim(file, options = {}, context = self, &block)
  html = Slim::Template.new(file, options).render(context, &block)
  new(html)
end
new(html, *css_paths) click to toggle source
# File lib/spreadsheet_builder/html_parser.rb, line 17
def initialize(html, *css_paths)
  # TODO merge inline style tags into CssParser
  @css_load_paths = css_paths
  @html = html
  @doc  = Nokogiri::HTML(@html) 
end

Public Instance Methods

build(force_level = :none) click to toggle source
# File lib/spreadsheet_builder/html_parser.rb, line 24
def build(force_level = :none)
  SpreadsheetBuilder.from_data(to_data(force_level))
end
css() click to toggle source
# File lib/spreadsheet_builder/html_parser.rb, line 28
def css
  return @css  if @css

  @doc.css('link[rel=stylesheet]').map { |l| 
    href = l["href"].sub(/^\/+/, '')
    # This will have to be updated later with a host
    @css_load_paths << "#{href}" 
  }
  #@css = SpreadsheetBuilder::CssParser.new(@css_load_paths)
  # Figure out the best way to load this
  @css = SpreadsheetBuilder::CssParser.new([])
end
to_data(force_level = :none) click to toggle source

TODO clean this up

# File lib/spreadsheet_builder/html_parser.rb, line 42
def to_data(force_level = :none)
  cells       = [] 
  merges      = []
  col_widths  = {}
  row_heights = {}

  css.reset(force_level)

  tb  = doc.css('table').first 

  # ignoring specified formats for anything other than table tr td/th
  tb_format = css.format_from_node(tb) 

  row = 0
  doc.css('tr').each do |tr|
    tr_format = tb_format.merge(@css.format_from_node(tr))

    increment = true
    tr.css('td, th').each_with_index do |td, col|
       
      # TODO Do we really need rowheight and colwidth now that there
      # is css parsing?
      rowheight = td.attributes["rowheight"]
      colwidth  = td.attributes["colwidth"]
      rowspan   = td.attributes["rowspan"]
      colspan   = td.attributes["colspan"]

      rowheight &&= rowheight.value.to_i
      colwidth  &&= colwidth.value.to_i
      rowspan   &&= rowspan.value.to_i
      colspan   &&= colspan.value.to_i

      add_td_to_cells(row, col, td, tr_format, cells)
      if colspan
        (1..colspan-1).each {|t| 
          add_td_to_cells(row, col+t, td, tr_format, cells)
        }
      end
      if rowspan
        (1..rowspan-2).each {|t| 
          add_td_to_cells(row+t, col, td, tr_format, cells)
        }
        increment = false
      end
      if colspan || rowspan
        merges << [
          row, col, row + (rowspan || 2)-2, col + (colspan || 1)-1
        ]
      end
    end

    row += 1 if increment
  end

  puts cells.inspect
  { cells: cells, merges: { 0 => merges } }
end

Private Instance Methods

add_td_to_cells(row, col, td, tr_format, cells) click to toggle source

TODO Document

# File lib/spreadsheet_builder/html_parser.rb, line 103
def add_td_to_cells(row, col, td, tr_format, cells)
  found = cells.find { |cell| cell[:row] == row && cell[:col] == col}
  unless found 
    td_format = tr_format.merge(css.format_from_node(td))
    cells << { 
      row:    row, 
      col:    col, 
      value:  td.text.strip, 
      format: td_format, 
      path:   td.css_path 
    } 
  else
    add_td_to_cells(row, col + 1, td, tr_format, cells)
  end
end