class Keep::Manifest

Constants

EMPTY_DIR_TOKEN_REGEXP
FILE_NAME_REGEXP
FILE_TOKEN_REGEXP
NON_8BIT_ENCODED_CHAR
STREAM_NAME_REGEXP
STREAM_TOKEN_REGEXP

Public Class Methods

new(manifest_text) click to toggle source

Class to parse a manifest text and provide common views of that data.

# File lib/arvados/keep.rb, line 114
# Wrap a manifest text so it can be inspected through the instance methods.
#
# manifest_text is stored unmodified; nothing is parsed or validated here.
def initialize(manifest_text)
  @text = manifest_text
  # Cache for #files; stays nil until that method computes the list.
  @files = nil
end
unescape(s) click to toggle source
# File lib/arvados/keep.rb, line 140
# Decode the backslash escapes used in a Keep manifest stream or file name.
#
# Two escape forms are recognized: a doubled backslash, which becomes a
# single backslash, and a backslash followed by exactly three octal digits,
# which becomes the character with that code (via Integer#chr). Any other
# text is copied through unchanged.
#
# Returns nil when s is nil, otherwise a new String.
def self.unescape(s)
  return nil if s.nil?

  s.gsub(/\\(\\|[0-7]{3})/) do |escape|
    # Everything after the leading backslash: either "\" or three digits.
    payload = escape[1..-1]
    payload == '\\' ? '\\' : payload.to_i(8).chr
  end
end
valid?(manifest) click to toggle source
# File lib/arvados/keep.rb, line 303
# Report whether manifest passes validate!.
#
# Returns true when validate! completes without raising, and false when it
# raises ArgumentError. Any other exception propagates to the caller.
def self.valid?(manifest)
  validate!(manifest)
  true
rescue ArgumentError
  false
end
validate!(manifest) click to toggle source

Verify that a given manifest is valid according to https://arvados.org/projects/arvados/wiki/Keep_manifest_format.

# File lib/arvados/keep.rb, line 252
# Verify that a manifest text is well-formed, checking one line (stream) at
# a time.
#
# After the trailing newline is removed, each line is split on single
# spaces and must contain, in this order:
#   1. one stream name token,
#   2. one or more tokens matching Locator::LOCATOR_REGEXP,
#   3. one or more file tokens (or the empty-directory placeholder token).
# Stream and file names may not contain "." or ".." path components, and
# tokens matching NON_8BIT_ENCODED_CHAR are rejected.
#
# manifest - the manifest text (String). An empty string is accepted.
#
# Returns true when the manifest is valid.
# Raises ArgumentError, with a message naming the offending line number,
# for the first problem found (including a nil/false manifest).
def self.validate! manifest
  raise ArgumentError.new "No manifest found" if !manifest

  return true if manifest.empty?

  raise ArgumentError.new "Invalid manifest: does not end with newline" if !manifest.end_with?("\n")
  line_count = 0
  manifest.each_line do |line|
    line_count += 1

    # Strip the newline, then tokenize on single spaces.
    words = line[0..-2].split(/ /)
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?

    # --- stream name token ---
    count = 0

    word = words.shift
    # Must run before unescape: an out-of-range octal escape would make
    # Integer#chr raise RangeError instead of a validation error.
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
    unescaped_word = unescape(word)
    count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1

    # --- block locators ---
    count = 0
    word = words.shift
    while word =~ Locator::LOCATOR_REGEXP
      word = words.shift
      count += 1
    end
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0

    # --- file tokens ---
    # `word` now holds the first token that is not a locator (or nil).
    count = 0
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
    # $~[1] below is the capture from the FILE_NAME_REGEXP match, i.e. the
    # unescaped file path, which is checked for "." / ".." components.
    while unescape(word) =~ EMPTY_DIR_TOKEN_REGEXP or
      (word =~ FILE_TOKEN_REGEXP and unescape(word) =~ FILE_NAME_REGEXP and ($~[1].split('/') & ['..', '.']).empty?)
      word = words.shift
      count += 1
      # Every subsequent file token needs the same guard as the first one;
      # otherwise the unescape call in the loop condition could raise
      # RangeError, which callers such as valid? do not rescue.
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
    end

    if word
      # A token was left over that is neither a locator nor a file token.
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
    elsif count == 0
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens"
    end

    # Ruby's split() method silently drops trailing empty tokens
    # (which are not allowed by the manifest format) so we have to
    # check trailing spaces manually.
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n"
  end
  true
end

Public Instance Methods

each_file_spec() { |stream_name| ... } click to toggle source
# File lib/arvados/keep.rb, line 166
# Iterate over every file token in the manifest text.
#
# For each line, the first token is the stream name; following tokens are
# skipped while Locator.valid? accepts them, and every token from the first
# rejected one onward is parsed with split_file_token.
#
# Yields one four-element array per file token:
#   [stream_name, start_pos, file_size, file_name]
# When the file name contains '/', its directory part is appended to the
# stream name and only the final path component is yielded as file_name.
#
# Returns an Enumerator when no block is given, otherwise true.
def each_file_spec
  return to_enum(__method__) unless block_given?

  @text.each_line do |line|
    stream_name = nil
    past_locators = false

    line.scan(/\S+/).each do |token|
      if stream_name.nil?
        stream_name = unescape(token)
        next
      end

      # Sticky flag: after the first non-locator token, everything that
      # follows on this line is treated as a file token.
      past_locators ||= !Locator.valid?(token)
      next unless past_locators

      start_pos, file_size, full_name = split_file_token(token)
      # rpartition yields ["", "", full_name] when there is no '/', so the
      # stream name gains nothing and the whole name is the basename.
      dirname, sep, basename = full_name.rpartition('/')

      yield [stream_name + sep + dirname, start_pos, file_size, basename]
    end
  end
  true
end
each_line() { |stream_name, block_tokens, file_tokens| ... } click to toggle source
# File lib/arvados/keep.rb, line 119
# Iterate over the manifest one stream (line) at a time.
#
# Yields a three-element array per non-blank line:
#   [stream_name, block_tokens, file_tokens]
# stream_name is the unescaped first token. block_tokens is the leading run
# of tokens accepted by Locator.valid?, left as written. file_tokens holds
# every remaining token, each passed through unescape.
#
# Returns an Enumerator when no block is given.
def each_line
  return to_enum(__method__) unless block_given?

  @text.each_line do |line|
    name_token, *rest = line.scan(/\S+/)
    # Ignore blank lines
    next if name_token.nil?

    stream_name = unescape(name_token)
    # Locators form a prefix; once one token fails the check, that token
    # and all later ones are file tokens.
    block_tokens = rest.take_while { |token| Locator.valid?(token) }
    file_tokens = rest.drop(block_tokens.size).map { |token| unescape(token) }

    yield [stream_name, block_tokens, file_tokens]
  end
end
exact_file_count?(want_count) click to toggle source
# File lib/arvados/keep.rb, line 230
# Report whether the manifest holds exactly want_count files.
#
# Asks files_count to stop after want_count + 1 files, which is the
# smallest limit that still reveals a surplus.
def exact_file_count?(want_count)
  capped_count = files_count(want_count + 1)
  capped_count == want_count
end
files() click to toggle source
# File lib/arvados/keep.rb, line 192
# List the files in the manifest with their total sizes.
#
# Sizes reported by each_file_spec are summed per (stream name, file name)
# pair, so a file described by several tokens appears once. The result is
# cached in @files and reused on later calls.
#
# Returns an Array of [stream_name, file_name, size] arrays.
def files
  return @files unless @files.nil?

  totals = Hash.new(0)
  each_file_spec do |stream, _, size, name|
    totals[[stream, name]] += size
  end

  @files = totals.map { |(stream, name), size| [stream, name, size] }
end
files_count(stop_after=nil) click to toggle source
# File lib/arvados/keep.rb, line 205
# Return the number of files represented in this manifest.
#
# Empty-directory placeholders (name "." with size 0) are never counted.
#
# stop_after - optional limit. When given, and the #files cache has not been
#              built yet, the manifest is read incrementally and stop_after
#              is returned as soon as that many distinct files have been
#              seen. This avoids parsing the entire manifest when only a
#              small number of files needs to be confirmed. When the cache
#              already exists, the full count is returned regardless.
def files_count(stop_after=nil)
  if stop_after.nil? || !@files.nil?
    # Full count from the (possibly cached) file list.
    return files.count { |_, name, size| !(name == '.' && size == 0) }
  end

  distinct = {}
  each_file_spec do |stream, _, size, name|
    placeholder = (name == "." && size == 0)
    unless placeholder
      distinct[[stream, name]] = true
      # Early exit: returns from files_count itself, not just the block.
      return stop_after if distinct.size >= stop_after
    end
  end
  distinct.size
end
files_size() click to toggle source
# File lib/arvados/keep.rb, line 225
# Return the total size of all files in this manifest, i.e. the sum of the
# size element of every entry returned by #files (0 when there are none).
def files_size
  total = 0
  files.each { |_, _, size| total += size }
  total
end
has_file?(want_stream, want_file=nil) click to toggle source
# File lib/arvados/keep.rb, line 238
# Report whether the manifest contains a given file.
#
# want_stream - the stream name, or a full path when want_file is omitted.
# want_file   - the file name within the stream. When nil, want_stream is
#               split into directory and base name with File.split.
#
# Returns true as soon as each_file_spec yields a matching stream/file pair,
# false if none matches.
def has_file?(want_stream, want_file=nil)
  want_stream, want_file = File.split(want_stream) if want_file.nil?

  each_file_spec do |stream, _, _, name|
    return true if stream == want_stream && name == want_file
  end
  false
end
minimum_file_count?(want_count) click to toggle source
# File lib/arvados/keep.rb, line 234
# Report whether the manifest holds at least want_count files.
#
# Passes want_count to files_count as its stop_after limit, so counting can
# end as soon as the threshold is reached.
def minimum_file_count?(want_count)
  capped_count = files_count(want_count)
  capped_count >= want_count
end
split_file_token(token) click to toggle source
# File lib/arvados/keep.rb, line 158
# Break a manifest file token of the form "start:size:name" into its parts.
#
# The token is split on the first two colons only, so the name may itself
# contain ':'. Position and size are converted with String#to_i and the
# name is passed through unescape.
#
# Returns [start_pos, file_size, file_name].
# Raises ArgumentError when the token has fewer than three fields.
def split_file_token(token)
  fields = token.split(':', 3)
  raise ArgumentError.new "Invalid file token '#{token}'" if fields.length < 3

  start_pos, filesize, filename = fields
  [start_pos.to_i, filesize.to_i, unescape(filename)]
end
unescape(s) click to toggle source
# File lib/arvados/keep.rb, line 154
# Instance-level convenience wrapper: forwards s to the class method of the
# same name and returns its result, so instance methods can call unescape
# without naming the class.
def unescape(s)
  self.class.unescape(s)
end