class Roo::Excelx

Constants

ERROR_VALUES
ExceedsMaxError

Public Class Methods

new(filename_or_stream, options = {}) click to toggle source

initialization and opening of a spreadsheet file values for packed: :zip optional cell_max (int) parameter for early aborting attempts to parse enormous documents.

Calls superclass method Roo::Base::new
# File lib/roo/excelx.rb, line 36
def initialize(filename_or_stream, options = {})
  packed = options[:packed]
  file_warning = options.fetch(:file_warning, :error)
  cell_max = options.delete(:cell_max)
  sheet_options = {}
  sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
  sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false)
  sheet_options[:empty_cell] = (options[:empty_cell] || false)
  shared_options = {}

  shared_options[:disable_html_wrapper] = (options[:disable_html_wrapper] || false)
  unless is_stream?(filename_or_stream)
    file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed)
    basename = find_basename(filename_or_stream)
  end

  # NOTE: Create temp directory and allow Ruby to cleanup the temp directory
  #       when the object is garbage collected. Initially, the finalizer was
  #       created in the Roo::Tempdir module, but that led to a segfault
  #       when testing in Ruby 2.4.0.
  @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root])
  ObjectSpace.define_finalizer(self, self.class.finalize(object_id))

  @shared = Shared.new(@tmpdir, shared_options)
  @filename = local_filename(filename_or_stream, @tmpdir, packed)
  process_zipfile(@filename || filename_or_stream)

  @sheet_names = []
  @sheets = []
  @sheets_by_name = {}

  workbook.sheets.each_with_index do |sheet, index|
    next if options[:only_visible_sheets] && sheet['state'] == 'hidden'

    sheet_name = sheet['name']
    @sheet_names << sheet_name
    @sheets_by_name[sheet_name] = @sheets[index] = Sheet.new(sheet_name, @shared, index, sheet_options)
  end

  if cell_max
    cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions)
    raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max
  end

  super
rescue
  self.class.finalize_tempdirs(object_id)
  raise
end

Public Instance Methods

cell(row, col, sheet = nil) click to toggle source

Returns the content of a spreadsheet-cell. (1,1) is the upper left corner. (1,1), (1,‘A’), (‘A’,1), (‘a’,1) all refers to the cell at the first line and first row.

# File lib/roo/excelx.rb, line 114
def cell(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :value)
end
celltype(row, col, sheet = nil) click to toggle source

returns the type of a cell:

  • :float

  • :string,

  • :date

  • :percentage

  • :formula

  • :time

  • :datetime

# File lib/roo/excelx.rb, line 198
def celltype(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :type)
end
column(column_number, sheet = nil) click to toggle source

returns all values in this column as an array column numbers are 1,2,3,… like in the spreadsheet

# File lib/roo/excelx.rb, line 125
def column(column_number, sheet = nil)
  if column_number.is_a?(::String)
    column_number = ::Roo::Utils.letter_to_number(column_number)
  end
  sheet_for(sheet).column(column_number)
end
comment(row, col, sheet = nil) click to toggle source

returns the comment at (row/col) nil if there is no comment

# File lib/roo/excelx.rb, line 279
def comment(row, col, sheet = nil)
  key = normalize(row, col)
  sheet_for(sheet).comments[key]
end
comment?(row, col, sheet = nil) click to toggle source

true, if there is a comment

# File lib/roo/excelx.rb, line 285
def comment?(row, col, sheet = nil)
  !!comment(row, col, sheet)
end
comments(sheet = nil) click to toggle source
# File lib/roo/excelx.rb, line 289
def comments(sheet = nil)
  sheet_for(sheet).comments.map do |(x, y), comment|
    [x, y, comment]
  end
end
each_row_streaming(options = {}) { |row| ... } click to toggle source

Yield an array of Excelx::Cell Takes options for sheet, pad_cells, and max_rows

# File lib/roo/excelx.rb, line 297
def each_row_streaming(options = {})
  sheet = sheet_for(options.delete(:sheet))
  if block_given?
    sheet.each_row(options) { |row| yield row }
  else
    sheet.to_enum(:each_row, options)
  end
end
empty?(row, col, sheet = nil) click to toggle source
# File lib/roo/excelx.rb, line 232
def empty?(row, col, sheet = nil)
  sheet = sheet_for(sheet)
  key = normalize(row, col)
  cell = sheet.cells[key]
  !cell || cell.empty? ||
    (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column)
end
excelx_format(row, col, sheet = nil) click to toggle source

returns the internal format of an excel cell

# File lib/roo/excelx.rb, line 227
def excelx_format(row, col, sheet = nil)
  key = normalize(row, col)
  sheet_for(sheet).excelx_format(key)
end
excelx_type(row, col, sheet = nil) click to toggle source

returns the internal type of an excel cell

  • :numeric_or_formula

  • :string

Note: this is only available within the Excelx class

# File lib/roo/excelx.rb, line 207
def excelx_type(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :cell_type)
end
excelx_value(row, col, sheet = nil) click to toggle source

returns the internal value of an excelx cell Note: this is only available within the Excelx class

# File lib/roo/excelx.rb, line 214
def excelx_value(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :cell_value)
end
first_column(sheet = nil) click to toggle source

returns the number of the first non-empty column

# File lib/roo/excelx.rb, line 143
def first_column(sheet = nil)
  sheet_for(sheet).first_column
end
first_row(sheet = nil) click to toggle source

returns the number of the first non-empty row

# File lib/roo/excelx.rb, line 133
def first_row(sheet = nil)
  sheet_for(sheet).first_row
end
font(row, col, sheet = nil) click to toggle source

Given a cell, return the cell’s style

# File lib/roo/excelx.rb, line 184
def font(row, col, sheet = nil)
  key = normalize(row, col)
  definition_index = safe_send(sheet_for(sheet).cells[key], :style)
  styles.definitions[definition_index] if definition_index
end
formatted_value(row, col, sheet = nil) click to toggle source

returns the internal value of an excelx cell Note: this is only available within the Excelx class

# File lib/roo/excelx.rb, line 221
def formatted_value(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :formatted_value)
end
formula(row, col, sheet = nil) click to toggle source

Returns the formula at (row,col). Returns nil if there is no formula. The method formula? checks if there is a formula.

# File lib/roo/excelx.rb, line 163
def formula(row, col, sheet = nil)
  key = normalize(row, col)
  safe_send(sheet_for(sheet).cells[key], :formula)
end
formula?(*args) click to toggle source

Predicate methods really should return a boolean value. Hopefully no one was relying on the fact that this previously returned either nil/formula

# File lib/roo/excelx.rb, line 171
def formula?(*args)
  !!formula(*args)
end
formulas(sheet = nil) click to toggle source

returns each formula in the selected sheet as an array of tuples in following format

[row, col, formula], [row, col, formula],…
# File lib/roo/excelx.rb, line 177
def formulas(sheet = nil)
  sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell|
    [x, y, cell.formula]
  end
end
images(sheet = nil) click to toggle source
# File lib/roo/excelx.rb, line 105
def images(sheet = nil)
  images_names = sheet_for(sheet).images.map(&:last)
  images_names.map { |iname| image_files.find { |ifile| ifile[iname] } }
end
label(name) click to toggle source

returns the row,col values of the labelled cell (nil,nil) if label is not defined

# File lib/roo/excelx.rb, line 248
def label(name)
  labels = workbook.defined_names
  return [nil, nil, nil] if labels.empty? || !labels.key?(name)

  [labels[name].row, labels[name].col, labels[name].sheet]
end
labels() click to toggle source

Returns an array which all labels. Each element is an array with

labelname, [row,col,sheetname]
# File lib/roo/excelx.rb, line 257
def labels
  @labels ||= workbook.defined_names.map do |name, label|
    [
      name,
      [label.row, label.col, label.sheet]
    ]
  end
end
last_column(sheet = nil) click to toggle source

returns the number of the last non-empty column

# File lib/roo/excelx.rb, line 148
def last_column(sheet = nil)
  sheet_for(sheet).last_column
end
last_row(sheet = nil) click to toggle source

returns the number of the last non-empty row

# File lib/roo/excelx.rb, line 138
def last_row(sheet = nil)
  sheet_for(sheet).last_row
end
method_missing(method, *args) click to toggle source
Calls superclass method Roo::Base#method_missing
# File lib/roo/excelx.rb, line 86
def method_missing(method, *args)
  if (label = workbook.defined_names[method.to_s])
    safe_send(sheet_for(label.sheet).cells[label.key], :value)
  else
    # call super for methods like #a1
    super
  end
end
row(rownumber, sheet = nil) click to toggle source
# File lib/roo/excelx.rb, line 119
def row(rownumber, sheet = nil)
  sheet_for(sheet).row(rownumber)
end
sheet_for(sheet) click to toggle source
# File lib/roo/excelx.rb, line 99
def sheet_for(sheet)
  sheet ||= default_sheet
  validate_sheet!(sheet)
  @sheets_by_name[sheet] || @sheets[sheet]
end
sheets() click to toggle source
# File lib/roo/excelx.rb, line 95
def sheets
  @sheet_names
end
to_s(sheet = nil) click to toggle source

shows the internal representation of all cells for debugging purposes

# File lib/roo/excelx.rb, line 242
def to_s(sheet = nil)
  sheet_for(sheet).cells.inspect
end

Private Instance Methods

clean_sheet(sheet) click to toggle source
# File lib/roo/excelx.rb, line 308
def clean_sheet(sheet)
  @sheets_by_name[sheet].cells.each_pair do |coord, value|
    next unless value.value.is_a?(::String)

    @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
  end

  @cleaned[sheet] = true
end
extract_images(entries, tmpdir) click to toggle source
# File lib/roo/excelx.rb, line 385
def extract_images(entries, tmpdir)
  img_entries = entries.select { |e| e.name[/media\/image([0-9]+)/] }
  img_entries.each do |entry|
    path = "#{@tmpdir}/roo#{entry.name.gsub(/xl\/|\//, "_")}"
    image_files << path
    entry.extract(path)
  end
end
extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) click to toggle source

Extracts the sheets in order, but it will ignore sheets that are not worksheets.

# File lib/roo/excelx.rb, line 374
def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
  (sheet_ids & sheets.keys).each_with_index do |id, i|
    name = sheets[id]
    entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ }
    path = "#{tmpdir}/roo_sheet#{i + 1}"
    sheet_files << path
    @sheet_files << path
    entry.extract(path)
  end
end
extract_worksheet_ids(entries, path) click to toggle source

Internal: extracts the worksheet_ids from the workbook.xml file. xlsx

documents require a workbook.xml file, so a if the file is missing
it is not a valid xlsx file. In these cases, an ArgumentError is
raised.

wb - a Zip::Entry for the workbook.xml file. path - A String for Zip::Entry’s destination path.

Examples

extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
# => ["rId1", "rId2", "rId3"]

Returns an Array of Strings.

# File lib/roo/excelx.rb, line 332
def extract_worksheet_ids(entries, path)
  wb = entries.find { |e| e.name[/workbook.xml$/] }
  fail ArgumentError 'missing required workbook file' if wb.nil?

  wb.extract(path)
  workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
  workbook_doc.xpath('//sheet').map { |s| s['id'] }
end
extract_worksheet_rels(entries, path) click to toggle source

Internal

wb_rels - A Zip::Entry for the workbook.xml.rels file. path - A String for the Zip::Entry’s destination path.

Examples

extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
# => {
      "rId1"=>"worksheets/sheet1.xml",
      "rId2"=>"worksheets/sheet2.xml",
      "rId3"=>"worksheets/sheet3.xml"
     }

Returns a Hash.

# File lib/roo/excelx.rb, line 356
def extract_worksheet_rels(entries, path)
  wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
  fail ArgumentError 'missing required workbook file' if wb_rels.nil?

  wb_rels.extract(path)
  rels_doc = Roo::Utils.load_xml(path).remove_namespaces!

  relationships = rels_doc.xpath('//Relationship').select do |relationship|
    worksheet_types.include? relationship['Type']
  end

  relationships.each_with_object({}) do |relationship, hash|
    hash[relationship['Id']] = relationship['Target']
  end
end
process_zipfile(zipfilename_or_stream) click to toggle source

Extracts all needed files from the zip file

# File lib/roo/excelx.rb, line 395
def process_zipfile(zipfilename_or_stream)
  @sheet_files = []

  unless is_stream?(zipfilename_or_stream)
    zip_file = Zip::File.open(zipfilename_or_stream)
  else
    zip_file = Zip::CentralDirectory.new
    zip_file.read_from_stream zipfilename_or_stream
  end

  process_zipfile_entries zip_file.to_a.sort_by(&:name)
end
process_zipfile_entries(entries) click to toggle source
# File lib/roo/excelx.rb, line 408
def process_zipfile_entries(entries)
  # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
  #       are not in order. With Numbers 3.1, the first sheet is always
  #       sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
  #       independent of a worksheet's filename (i.e. sheet6.xml can be the
  #       first worksheet).
  #
  #       workbook.xml lists the correct order of worksheets and
  #       workbook.xml.rels lists the filenames for those worksheets.
  #
  #       workbook.xml:
  #         <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
  #         <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
  #       workbook.xml.rel:
  #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
  #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
  sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml")
  sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels")
  extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir)
  extract_images(entries, @tmpdir)

  entries.each do |entry|
    path =
    case entry.name.downcase
    when /richdata/
      # FIXME: Ignore richData as parsing is not implemented yet and can cause
      # Zip::DestinationFileExistsError when including a second "styles.xml" entry
      # see http://schemas.microsoft.com/office/spreadsheetml/2017/richdata2
      nil
    when /sharedstrings.xml$/
      "#{@tmpdir}/roo_sharedStrings.xml"
    when /styles.xml$/
      "#{@tmpdir}/roo_styles.xml"
    when /comments([0-9]+).xml$/
      # FIXME: Most of the time, The order of the comment files are the same
      #       the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
      #       In some situations, this isn't true. The true location of a
      #       sheet's comment file is in the sheet1.xml.rels file. SEE
      #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
      nr = Regexp.last_match[1].to_i
      comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}"
    when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$}
      # NOTE: Chart sheet relationship files were interfering with
      #       worksheets.
      nil
    when /sheet([0-9]+).xml.rels$/
      # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
      #        it also stores the location for sharedStrings, comments,
      #        drawings, etc.
      nr = Regexp.last_match[1].to_i
      rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}"
    when /drawing([0-9]+).xml.rels$/
      # Extracting drawing relationships to make images lists for each sheet
      nr = Regexp.last_match[1].to_i
      image_rels[nr - 1] = "#{@tmpdir}/roo_image_rels#{nr}"
    end

    entry.extract(path) if path
  end
end
safe_send(object, method, *args) click to toggle source
# File lib/roo/excelx.rb, line 469
def safe_send(object, method, *args)
  object.send(method, *args) if object&.respond_to?(method)
end
worksheet_types() click to toggle source
# File lib/roo/excelx.rb, line 473
def worksheet_types
  [
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet', # OOXML Transitional
    'http://purl.oclc.org/ooxml/officeDocument/relationships/worksheet' # OOXML Strict
  ]
end