class Csvlint::Validator

Constants

ANCHOR_REGEXP
ERROR_MATCHERS
FORMATS
POSSIBLE_DATE_REGEXP
QUOTED_STRING_REGEXP
RELATIONSHIP_REGEXP
REL_REGEXP
REV_REGEXP
SGML_NAME_REGEXP
TITLE_REGEXP
TOKEN_REGEXP
URI_REGEXP

Attributes

content_type[R]
csv_header[R]
current_line[R]
data[R]
dialect[R]
encoding[R]
extension[R]
headers[R]
schema[R]

Public Class Methods

new(source, dialect = {}, schema = nil, options = {}) click to toggle source
# File lib/csvlint/validate.rb, line 62
# Builds a validator for +source+ and immediately runs validation.
#
# source  - the CSV input; kept in @source and, unless nil, handed to
#           parse_extension to derive @extension.
# dialect - dialect settings hash (defaults to {}).
# schema  - optional schema; when present its errors and warnings are
#           appended to this validator's own lists.
# options - hash read for the :lambda and :limit_lines keys.
def initialize(source, dialect = {}, schema = nil, options = {})
  reset # NOTE(review): presumably initialises @errors/@warnings used below — confirm

  # Caller-supplied inputs.
  @source = source
  @dialect = dialect
  @schema = schema
  @lambda = options[:lambda]
  @limit_lines = options[:limit_lines]

  # Parsing state.
  @csv_header = true
  @headers = {}
  @formats = []
  @leading = ""
  @expected_columns = 0
  @col_counts = []
  @line_breaks = []
  @data = [] # it may be advisable to flush this on init?

  @extension = parse_extension(source) unless @source.nil?

  unless @schema.nil?
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  validate
end

Public Instance Methods

build_exception_messages(csvException, errChars, lineNo) click to toggle source
# File lib/csvlint/validate.rb, line 335
# Turns a CSV parsing exception into a validation error.
#
# csvException - the exception raised by the parser; classified via fetch_error.
# errChars     - the offending content, passed through to build_errors.
# lineNo       - the line number passed through to build_errors.
#
# When an explicit (non-Symbol) row separator is configured, the error is a
# quoting error and the current input does not contain that separator, a
# line-break error is reported instead of the quoting error.
def build_exception_messages(csvException, errChars, lineNo)
  #TODO 1 - this is a change in logic, rather than straight refactor of previous error building, however original logic is bonkers
  #TODO 2 - using .kind_of? is a very ugly fix here and it meant to work around instances where :auto symbol is preserved in @csv_options
  type = fetch_error(csvException)
  row_sep = @csv_options[:row_sep]
  quoting_error = [:unclosed_quote, :stray_quote].include?(type)

  # @input.match is only evaluated once the separator is known not to be a Symbol.
  if !row_sep.kind_of?(Symbol) && quoting_error && !@input.match(row_sep)
    build_linebreak_error
  else
    build_errors(type, :structure, lineNo, nil, errChars)
  end
end
build_formats(row) click to toggle source
# File lib/csvlint/validate.rb, line 387
# Tallies the detected format of every non-blank cell in +row+.
#
# @formats holds, per column index, a Hash counting how often each format
# symbol (:numeric, :uri, a date format from date_formats, or :string) was
# seen. nil and empty cells are skipped.
def build_formats(row)
  row.each_with_index do |cell, index|
    next if cell.nil? || cell.empty?

    tally = (@formats[index] ||= Hash.new(0))

    detected =
      if cell.strip[FORMATS[:numeric]] then :numeric
      elsif uri?(cell) then :uri
      elsif possible_date?(cell) then date_formats(cell)
      else :string
      end

    tally[detected] += 1
  end
end
build_linebreak_error() click to toggle source
# File lib/csvlint/validate.rb, line 346
# Records a :line_breaks structure error, but only once: nothing is added
# when @errors already contains an error of that type.
def build_linebreak_error
  already_reported = @errors.any? { |error| error.type == :line_breaks }
  build_errors(:line_breaks, :structure) unless already_reported
end
check_consistency() click to toggle source
# File lib/csvlint/validate.rb, line 407
# Warns about columns whose values do not share a dominant format.
#
# For each column tally in @formats, an :inconsistent_values schema warning
# (with the 1-based column number) is built unless a single format accounts
# for at least 90% of the counted values. Columns without a tally are skipped.
def check_consistency
  @formats.each_with_index do |counts, column|
    next unless counts

    total = counts.values.inject(:+).to_f
    dominant = counts.any? { |_, seen| seen / total >= 0.9 }
    build_warnings(:inconsistent_values, :schema, nil, column + 1) unless dominant
  end
end
check_foreign_keys() click to toggle source
# File lib/csvlint/validate.rb, line 418
# Runs foreign-key validation when the schema is exactly a
# Csvlint::Csvw::TableGroup, then appends the schema's errors and warnings to
# this validator's lists. Does nothing for any other schema (including nil).
def check_foreign_keys
  return unless @schema.instance_of? Csvlint::Csvw::TableGroup

  @schema.validate_foreign_keys
  @errors += @schema.errors
  @warnings += @schema.warnings
end
check_mixed_linebreaks() click to toggle source
# File lib/csvlint/validate.rb, line 319
# Reports a line-break error when more than one distinct line ending has
# been recorded in @line_breaks.
def check_mixed_linebreaks
  distinct_endings = @line_breaks.uniq
  return unless distinct_endings.count > 1

  build_linebreak_error
end
dialect_to_csv_options(dialect) click to toggle source
# File lib/csvlint/validate.rb, line 375
# Maps a dialect hash onto the options hash handed to the CSV parser.
#
# dialect - Hash read for "delimiter", "lineTerminator", "quoteChar" and
#           "skipInitialSpace".
#
# Returns a Hash with :col_sep, :row_sep, :quote_char and :skip_blanks (always
# false).
def dialect_to_csv_options(dialect)
  # NOTE(review): `|| true` makes this value always truthy, so an explicit
  # "skipInitialSpace" => false is ignored and the delimiter is never padded
  # below. Kept as-is to preserve behaviour; confirm the intended semantics
  # before changing it.
  skip_initial_space = dialect["skipInitialSpace"] || true

  col_sep = dialect["delimiter"]
  col_sep = col_sep + " " unless skip_initial_space

  {
    col_sep: col_sep,
    row_sep: dialect["lineTerminator"],
    quote_char: dialect["quoteChar"],
    skip_blanks: false
  }
end
fetch_error(error) click to toggle source
# File lib/csvlint/validate.rb, line 369
# Classifies a parser exception by its message.
#
# The trailing "on/in line N" part of the message is stripped and the rest is
# looked up in ERROR_MATCHERS. Returns :unknown_error when the message does not
# have the expected shape or is not a known key.
def fetch_error(error)
  parsed = error.message.match(/^(.+?)(?: [io]n)? \(?line \d+\)?\.?$/i)
  key = parsed.nil? ? nil : parsed[1]
  ERROR_MATCHERS.fetch(key, :unknown_error)
end
finish() click to toggle source
# File lib/csvlint/validate.rb, line 210
# Runs the end-of-input checks once all rows have been processed.
#
# Builds a :title_row warning when the first recorded column count is below
# the mean column count, a :check_options warning when exactly one column was
# expected, and then runs the consistency, foreign-key, mixed line-break and
# encoding checks in that order.
def finish
  total = @col_counts.inject(:+)
  unless total.nil?
    mean = total / @col_counts.size.to_f
    build_warnings(:title_row, :structure) if @col_counts.first < mean
  end

  # return expected_columns to calling class
  build_warnings(:check_options, :structure) if @expected_columns == 1

  check_consistency
  check_foreign_keys
  check_mixed_linebreaks
  validate_encoding
end
header?() click to toggle source
# File lib/csvlint/validate.rb, line 266
# Whether a header row is in effect: @csv_header must be truthy, in which
# case the dialect's "header" entry decides.
def header?
  return @csv_header unless @csv_header

  @dialect["header"]
end
line_breaks() click to toggle source
# File lib/csvlint/validate.rb, line 323
# Summarises the line endings seen so far: :mixed when more than one distinct
# ending was recorded, otherwise the single ending (nil if none was recorded).
def line_breaks
  distinct = @line_breaks.uniq
  distinct.count > 1 ? :mixed : distinct.first
end
line_breaks_reported?() click to toggle source
# File lib/csvlint/validate.rb, line 282
# True only once @line_breaks_reported has been set to true.
def line_breaks_reported?
  @line_breaks_reported == true
end
locate_schema() click to toggle source
# File lib/csvlint/validate.rb, line 426
def locate_schema

  @source_url = nil
  warn_if_unsuccessful = false
  case @source
    when StringIO
      return
    when File
      @source_url = "file:#{File.expand_path(@source)}"
    else
      @source_url = @source
  end
  unless @schema.nil?
    if @schema.tables[@source_url]
      return
    else
      @schema = nil
    end
  end
  link_schema = nil
  @schema = link_schema if link_schema

  paths = []
  if @source_url =~ /^http(s)?/
    begin
      well_known_uri = URI.join(@source_url, "/.well-known/csvm")
      well_known = open(well_known_uri).read
        # TODO
    rescue OpenURI::HTTPError, URI::BadURIError
    end
  end
  paths = ["{+url}-metadata.json", "csv-metadata.json"] if paths.empty?
  paths.each do |template|
    begin
      template = URITemplate.new(template)
      path = template.expand('url' => @source_url)
      url = URI.join(@source_url, path)
      url = File.new(url.to_s.sub(/^file:/, "")) if url.to_s =~ /^file:/
      schema = Schema.load_from_json(url)
      if schema.instance_of? Csvlint::Csvw::TableGroup
        if schema.tables[@source_url]
          @schema = schema
        else
          warn_if_unsuccessful = true
          build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
        end
      end
    rescue Errno::ENOENT
    rescue OpenURI::HTTPError, URI::BadURIError, ArgumentError
    rescue => e
      STDERR.puts e.class
      STDERR.puts e.message
      STDERR.puts e.backtrace
      raise e
    end
  end
  build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema) if warn_if_unsuccessful
  @schema = nil
end
parse_contents(stream, line = nil) click to toggle source

Analyses the provided CSV and builds errors, warnings and info messages.

# File lib/csvlint/validate.rb, line 172
# Parses one line of CSV and applies the header, format and error checks.
#
# stream - the raw text of the line; also used as the error content.
# line   - the line number (treated as 1 when not present).
#
# The parsed row (nil if parsing raised a malformed-CSV error) is always
# appended to @data. The first line is treated as the header when @csv_header
# is set; every other row updates format tallies and column counts, and is
# checked either against @schema or, without a schema, for ragged rows.
def parse_contents(stream, line = nil)
  # parse_contents will parse one line and apply headers, formats methods and error handle as appropriate
  current_line = line.present? ? line : 1
  all_errors = []

  @csv_options[:encoding] = @encoding

  begin
    row = LineCSV.parse_line(stream, @csv_options)
  rescue LineCSV::MalformedCSVError => e
    build_exception_messages(e, stream, current_line)
  end

  @data << row
  return unless row

  if current_line <= 1 && @csv_header
    # this conditional should be refactored somewhere
    header_cells = row.reject { |col| col.nil? || col.empty? }
    validate_header(header_cells)
    @col_counts << header_cells.size
  else
    build_formats(row)
    filled = row.reject { |col| col.nil? || col.empty? }.size
    @col_counts << filled
    # The first data row fixes the expected column count.
    @expected_columns = row.size if @expected_columns == 0
    build_errors(:blank_rows, :structure, current_line, nil, stream.to_s) if filled == 0

    if @schema
      # Builds errors and warnings related to the provided schema file
      @schema.validate_row(row, current_line, all_errors, @source)
      @errors += @schema.errors
      all_errors += @schema.errors
      @warnings += @schema.warnings
    elsif !row.empty? && row.size != @expected_columns
      build_errors(:ragged_rows, :structure, current_line, nil, stream.to_s)
    end
  end
end
parse_line(line) click to toggle source
# File lib/csvlint/validate.rb, line 136
# Feeds one chunk of input into the validator, buffering incomplete lines.
#
# The chunk is appended to whatever is buffered in @leading. If the result
# ends in a newline and contains an even number of quote characters it is a
# complete record: it is validated, the buffer is cleared and @current_line
# advances. Otherwise (no trailing newline, or the newline sits inside an open
# quote) the text is kept in @leading for the next chunk.
#
# An ArgumentError is reported once as :invalid_encoding, and the line counter
# still advances.
def parse_line(line)
  line = @leading + line
  ends_with_newline = line[-1, 1].include?("\n")

  # An odd number of quotes means the line break is inside a quoted field.
  if ends_with_newline && !line.count(@dialect["quoteChar"]).odd?
    validate_line(line, @current_line)
    @leading = ""
    @current_line += 1
  else
    @leading = line
  end
rescue ArgumentError => ae
  build_errors(:invalid_encoding, :structure, @current_line, nil, @current_line) unless @reported_invalid_encoding
  @current_line += 1
  @reported_invalid_encoding = true
end
report_line_breaks(line_no=nil) click to toggle source
# File lib/csvlint/validate.rb, line 270
# Records the line ending of the current @input and, the first time a
# non-"\r\n" ending is seen, builds a :nonrfc_line_breaks info message.
#
# line_no - line number attached to the info message.
def report_line_breaks(line_no=nil)
  # Return straight away if there's no newline character - i.e. we're on the last line
  return unless @input[-1, 1].include?("\n")

  ending = get_line_break(@input)
  @line_breaks << ending
  return if line_breaks_reported? || ending == "\r\n"

  build_info_messages(:nonrfc_line_breaks, :structure, line_no)
  @line_breaks_reported = true
end
row_count() click to toggle source
# File lib/csvlint/validate.rb, line 331
# Number of entries collected in +data+.
def row_count
  rows = data
  rows.count
end
set_dialect() click to toggle source
# File lib/csvlint/validate.rb, line 286
# Resolves the effective dialect and derives the parser options from it.
#
# Precedence, lowest to highest: built-in defaults, the dialect of the schema
# table for @source_url (if it can be read), then the dialect supplied by the
# caller. Also records whether the header setting was assumed
# (@assumed_header) and whether any dialect was supplied (@supplied_dialect).
def set_dialect
  @assumed_header = @dialect["header"].nil?
  @supplied_dialect = @dialect != {}

  defaults = {
    "header" => true,
    "delimiter" => ",",
    "skipInitialSpace" => true,
    "lineTerminator" => :auto,
    "quoteChar" => '"',
    "trim" => :true
  }

  # Any failure to reach the schema's dialect (e.g. no schema, no matching
  # table) falls back to an empty dialect.
  from_schema =
    begin
      @schema.tables[@source_url].dialect || {}
    rescue
      {}
    end

  @dialect = defaults.merge(from_schema).merge(@dialect || {})

  @csv_header &&= @dialect["header"]
  @csv_options = dialect_to_csv_options(@dialect)
end
validate() click to toggle source
# File lib/csvlint/validate.rb, line 88
# Runs the whole validation pipeline for the configured source.
#
# Sources whose extension matches the Excel pattern only get an :excel
# warning. Otherwise a schema is located (unless one of exactly
# Csvlint::Schema was supplied), the dialect is resolved, and the source is
# validated as a URL when it is a String or as a stream otherwise, followed by
# the final checks in finish.
def validate
  if @extension =~ /.xls(x)?/
    build_warnings(:excel, :context)
    return
  end

  locate_schema unless @schema.instance_of?(Csvlint::Schema)
  set_dialect

  if @source.class != String
    validate_metadata
    validate_stream
  else
    validate_url
  end

  finish
end
validate_encoding() click to toggle source
# File lib/csvlint/validate.rb, line 308
# Builds context warnings about the character encoding.
#
# With a "content-type" header: :no_encoding when it declares no charset,
# :encoding when the declared charset is not UTF-8. Independently of the
# header, :encoding is built when the detected @encoding is not "UTF-8".
def validate_encoding
  declared = @headers["content-type"]
  if declared
    if declared =~ /charset=/
      build_warnings(:encoding, :context) unless declared =~ /charset=utf-8/i
    else
      build_warnings(:no_encoding, :context)
    end
  end

  build_warnings(:encoding, :context) unless @encoding == "UTF-8"
end
validate_header(header) click to toggle source
# File lib/csvlint/validate.rb, line 350
# Checks the header row for empty and duplicate column names.
#
# header - Array of column name strings. When the dialect's "trim" is :true
#          the names are stripped in place before checking.
#
# Warnings carry the 1-based column position. When a schema is present the
# header is also validated against it and the schema's errors and warnings are
# appended. Returns the result of valid?.
def validate_header(header)
  header.each { |name| name.strip! } if @dialect["trim"] == :true

  seen = Set.new
  header.each_with_index do |name, index|
    position = index + 1
    build_warnings(:empty_column_name, :schema, nil, position) if name == ""
    # Set#add? returns nil when the name was already present.
    build_warnings(:duplicate_column_name, :schema, nil, position) unless seen.add?(name)
  end

  if @schema
    @schema.validate_header(header, @source)
    @errors += @schema.errors
    @warnings += @schema.warnings
  end

  valid?
end
validate_line(input = nil, index = nil) click to toggle source
# File lib/csvlint/validate.rb, line 158
# Validates a single complete line of input.
#
# input - the text of the line; stored in @input, and its encoding name is
#         stored in @encoding.
# index - the line number (0 is used when not present).
#
# Reports line breaks, parses the contents and finally invokes the optional
# @lambda callback with this validator. An ArgumentError raised along the way
# is reported once as :invalid_encoding.
def validate_line(input = nil, index = nil)
  @input = input
  line_no =
    if index.present?
      index
    else
      0
    end
  @encoding = input.encoding.to_s

  report_line_breaks(line_no)
  parse_contents(input, line_no)
  @lambda.call(self) unless @lambda.nil?
rescue ArgumentError => ae
  build_errors(:invalid_encoding, :structure, @current_line, nil, index) unless @reported_invalid_encoding
  @reported_invalid_encoding = true
end
validate_metadata() click to toggle source
# File lib/csvlint/validate.rb, line 223
# Inspects the response headers gathered for the source.
#
# From "content-type" it works out whether a header row is present (the
# `header=present|absent` parameter overrides @csv_header), builds
# :no_content_type / :wrong_content_type messages, and builds an
# :assumed_header info message when the header setting had to be assumed.
#
# Each entry of the "link" header is then matched against LINK_HEADER_REGEXP;
# entries with rel "describedby" and a JSON metadata type are loaded as a
# schema, and a :schema_mismatch warning is built when the loaded TableGroup
# has no table for @source_url. Entries that do not match the pattern are
# skipped.
def validate_metadata
  assumed_header = !@supplied_dialect
  unless @headers.empty?
    if @headers["content-type"] =~ /text\/csv/
      @csv_header = @csv_header && true
      assumed_header = @assumed_header.present?
    end
    if @headers["content-type"] =~ /header=(present|absent)/
      @csv_header = true if $1 == "present"
      @csv_header = false if $1 == "absent"
      assumed_header = false
    end
    build_warnings(:no_content_type, :context) if @content_type == nil
    build_errors(:wrong_content_type, :context) unless (@content_type && @content_type =~ /text\/csv/)
  end
  @header_processed = true
  build_info_messages(:assumed_header, :structure) if assumed_header

  # A missing "link" header leaves @link_headers nil and skips the loop below.
  @link_headers = @headers["link"].split(",") rescue nil
  @link_headers.each do |link_header|
    match = LINK_HEADER_REGEXP.match(link_header)
    # A malformed entry yields no match; skip it instead of failing on
    # match["param"] with a NoMethodError on nil.
    next if match.nil?

    # Named groups that did not take part in the match are nil, hence the
    # nil fallbacks.
    uri = match["uri"].gsub(/(^\<|\>$)/, "") rescue nil
    rel = match["rel-relationship"].gsub(/(^\"|\"$)/, "") rescue nil
    param = match["param"]
    param_value = match["param-value"].gsub(/(^\"|\"$)/, "") rescue nil
    if rel == "describedby" && param == "type" && ["application/csvm+json", "application/ld+json", "application/json"].include?(param_value)
      begin
        url = URI.join(@source_url, uri)
        schema = Schema.load_from_json(url)
        if schema.instance_of? Csvlint::Csvw::TableGroup
          if schema.tables[@source_url]
            # NOTE(review): link_schema is assigned but never read in this
            # method, so a matching linked schema is currently not applied —
            # confirm whether it should be stored in @schema.
            link_schema = schema
          else
            build_warnings(:schema_mismatch, :context, nil, nil, @source_url, schema)
          end
        end
      rescue OpenURI::HTTPError
        # The linked metadata could not be fetched; ignore this entry.
      end
    end
  end if @link_headers
end
validate_stream() click to toggle source
# File lib/csvlint/validate.rb, line 105
# Validates a stream source line by line.
#
# Every line yielded by @source.each_line goes through parse_line until the
# line limit is reached. Whatever is still buffered in @leading afterwards
# is validated as the final line.
def validate_stream
  @current_line = 1

  @source.each_line do |chunk|
    break if line_limit_reached?
    parse_line(chunk)
  end

  validate_line(@leading, @current_line) if @leading != ""
end
validate_url() click to toggle source
# File lib/csvlint/validate.rb, line 114
# Validates a source given as a URL by streaming it through Typhoeus.
#
# Response headers are captured into @headers, @content_type and
# @response_code; a 404 builds a :not_found error, any other status goes on
# to validate_metadata. Each body chunk is split into lines and fed to
# parse_line until the line limit is reached. After the request has run, any
# text still buffered in @leading is validated as the last line.
def validate_url
  @current_line = 1
  request = Typhoeus::Request.new(@source, followlocation: true)
  request.on_headers do |response|
    @headers = response.headers || {}
    # Falls back to nil if the headers cannot be indexed (e.g. they are nil).
    @content_type = response.headers["content-type"] rescue nil
    @response_code = response.code
    # NOTE(review): this `return` sits inside a block, so it presumably leaves
    # validate_url itself (skipping the final validate_line) — confirm against
    # how Typhoeus invokes the callback.
    return build_errors(:not_found) if response.code == 404
    validate_metadata
  end
  request.on_body do |chunk|
    # A chunk may end mid-line; parse_line is handed each piece as each_line
    # yields it.
    io = StringIO.new(chunk)
    io.each_line do |line|
      break if line_limit_reached?
      parse_line(line)
    end
  end
  request.run
  # Validate the last line too
  validate_line(@leading, @current_line) unless @leading == ""
end

Private Instance Methods

date_format?(klass, value, format) click to toggle source
# File lib/csvlint/validate.rb, line 549
# Whether +value+ is written exactly in +format+.
#
# klass  - class providing strptime (Date and Time are used by date_formats).
# value  - the string to test.
# format - a strptime/strftime format string.
#
# The value must survive a parse/format round trip unchanged; values that
# cannot be parsed (ArgumentError) yield false.
def date_format?(klass, value, format)
  parsed = klass.strptime(value, format)
  parsed.strftime(format) == value
rescue ArgumentError # invalid date
  false
end
date_formats(col) click to toggle source
# File lib/csvlint/validate.rb, line 523
# Determines which known date/time format +col+ is written in.
#
# Candidates are tried in order; a candidate applies when +col+ matches its
# FORMATS pattern and round-trips through date_format? with the given class
# and format string. Returns the candidate's name, or :string when none
# applies.
def date_formats(col)
  candidates = [
    [:date_db,          Date, '%Y-%m-%d'],
    [:date_short,       Date, '%e %b'],
    [:date_rfc822,      Date, '%e %b %Y'],
    [:date_long,        Date, '%B %e, %Y'],
    [:dateTime_time,    Time, '%H:%M'],
    [:dateTime_hms,     Time, '%H:%M:%S'],
    [:dateTime_db,      Time, '%Y-%m-%d %H:%M:%S'],
    [:dateTime_iso8601, Time, '%Y-%m-%dT%H:%M:%SZ'],
    [:dateTime_short,   Time, '%d %b %H:%M'],
    [:dateTime_long,    Time, '%B %d, %Y %H:%M']
  ]

  hit = candidates.find do |name, klass, pattern|
    col[FORMATS[name]] && date_format?(klass, col, pattern)
  end

  hit ? hit.first : :string
end
get_line_break(line) click to toggle source
# File lib/csvlint/validate.rb, line 559
# Classifies the line ending of +line+ as "\r\n" or "\n".
#
# Only the second-to-last character is inspected: a "\r" there means "\r\n",
# anything else is reported as "\n".
def get_line_break(line)
  tail = line.chars.last(2)
  tail.first == "\r" ? "\r\n" : "\n"
end
line_limit_reached?() click to toggle source
# File lib/csvlint/validate.rb, line 555
# Whether a line limit is configured and @current_line has gone past it.
def line_limit_reached?
  limit_set = @limit_lines.present?
  limit_set && @current_line > @limit_lines
end
parse_extension(source) click to toggle source
# File lib/csvlint/validate.rb, line 488
# Works out the file extension of +source+.
#
# A File yields the extension of its path. Other IO objects, StringIO and
# Tempfile yield "". Anything else is parsed as a URI and the extension of
# its path is returned, or "" when it is not a valid URI.
def parse_extension(source)
  case source
  when File
    File.extname(source.path)
  when IO, StringIO, Tempfile
    # Tempfile is triggered when the revalidate dialect use case happens
    ""
  else
    begin
      File.extname(URI.parse(source).path)
    rescue URI::InvalidURIError
      ""
    end
  end
end
possible_date?(col) click to toggle source
# File lib/csvlint/validate.rb, line 519
# Cheap pre-check before the detailed date detection: returns the part of
# +col+ matched by POSSIBLE_DATE_REGEXP, or nil when it does not match.
def possible_date?(col)
  col.slice(POSSIBLE_DATE_REGEXP)
end
uri?(value) click to toggle source
# File lib/csvlint/validate.rb, line 510
# Whether +value+ is an HTTP or HTTPS URL.
#
# Surrounding whitespace is ignored for both the FORMATS[:uri] pattern check
# and the URI parse, so a padded URL is classified the same as an unpadded
# one. Returns false when the pattern does not match or the value cannot be
# parsed as a URI.
def uri?(value)
  candidate = value.strip
  return false unless candidate[FORMATS[:uri]]

  # URI::HTTPS is a subclass of URI::HTTP, so this covers both schemes.
  URI.parse(candidate).kind_of?(URI::HTTP)
rescue URI::InvalidURIError
  false
end