class Rpareia::Parser

Attributes

project[R]

Public Class Methods

new(xml) click to toggle source
# File lib/rpareia/parser.rb, line 32
# Builds a parser for the given project definition and parses it right away,
# so any error raised by one of the parse steps surfaces from the constructor.
#
# xml - the project definition. It is stored as-is and later handed to
#       Nokogiri::XML by parse_xml (presumably a String or IO - confirm
#       against callers).
def initialize(xml)
  @xml = xml

  parse
end

Private Instance Methods

find_duplicated(arr) click to toggle source
# File lib/rpareia/parser.rb, line 50
# Returns the first element of arr (in array order) that occurs more than
# once, or nil when every element is unique.
#
# Occurrences are counted in one pass through a Hash, so the array is not
# rescanned for every element. Elements are therefore matched by Hash-key
# equality (eql?/hash); for the strings this parser passes in (field names
# and data source ids) that is the same as ==.
#
# arr - Array of values to inspect.
def find_duplicated(arr)
  occurrences = Hash.new(0)
  arr.each { |element| occurrences[element] += 1 }

  arr.detect { |element| occurrences[element] > 1 }
end
parse() click to toggle source
# File lib/rpareia/parser.rb, line 163
# Runs every parsing/validation step. The order is significant:
#
# * parse_xml replaces @xml with the parsed document that the later steps
#   query through xpath.
# * parse_task creates @project, which the later steps write into.
# * parse_data_sources fills @project[:data_sources], which
#   parse_deterministic_linkage reads when checking part field names.
#
# Any exception raised by a step propagates to the caller.
def parse
  parse_xml
  parse_task
  parse_data_sources
  parse_deterministic_linkage
  parse_output
end
parse_data_sources() click to toggle source
# File lib/rpareia/parser.rb, line 54
# Reads every /project/data-sources/data-source element, validates it and
# stores the result in @project[:data_sources] as an array of hashes shaped
# like {id: ..., file: ..., fields: [{name: ..., type: ...}, ...]}.
#
# Expects @project to exist already with the task name under :name.
#
# Raises, per data source and in this order:
#   MissingDataSourceId     - 'id' attribute missing or empty
#   MissingDataSourceFile   - 'file' attribute missing or empty
#   InvalidDataSourceFile   - the file does not exist on disk
#   InvalidDataSourceType   - 'type' is anything other than "delimited"
#   MissingFieldSeparator   - 'field-separator' attribute missing or empty
#   FieldsElementNotFound   - no fields/field children
#   MissingFieldName        - a field without a (non-empty) 'name'
#   InvalidFieldType        - a field whose 'type' is not 'int' or 'string'
#   DuplicatedFieldName     - two fields sharing a name
# and, once all data sources were read:
#   DuplicatedDataSourceId  - two data sources sharing an id
#   InvalidNumberOfSources  - 'linkage' needs exactly two sources,
#                             'deduplication' exactly one
#   InvalidTaskError        - the task name is neither of those
def parse_data_sources
  @project[:data_sources] = []

  @xml.xpath("/project/data-sources/data-source").each do |data_source|
    id = data_source['id'].to_s
    raise MissingDataSourceId if id.empty?

    file = data_source['file'].to_s
    raise MissingDataSourceFile, "Missing file attribute from data source '#{id}'" if file.empty?
    raise InvalidDataSourceFile, "File '#{file}' from data source '#{id}' does not exist" unless File.exist?(file)

    source_type = data_source['type'].to_s
    raise InvalidDataSourceType, "Data source type '#{source_type}' not supported" if source_type != "delimited"

    # The separator is only checked for presence here; it is not copied into
    # the stored data source hash.
    field_separator = data_source['field-separator'].to_s
    raise MissingFieldSeparator, "Missing field separator from data source '#{id}'" if field_separator.empty?

    fields = data_source.xpath("fields/field")
    raise FieldsElementNotFound, "Element 'field' not found on data source '#{id}'" if fields.empty?

    my_fields = fields.map do |field|
      # to_s turns a missing attribute (nil) into "", so a field without a
      # name raises MissingFieldName instead of NoMethodError.
      name = field['name'].to_s
      raise MissingFieldName, "Attribute 'name' not found on data source '#{id}'" if name.empty?

      # Kept separate from source_type so the field loop does not overwrite
      # the data source's own type.
      field_type = field['type']
      unless ['int', 'string'].include?(field_type)
        raise InvalidFieldType, "Invalid type '#{field_type}' from field '#{name}', data source '#{id}'"
      end

      {name: name, type: field_type}
    end

    duplicate = find_duplicated(my_fields.map { |f| f[:name] })
    raise DuplicatedFieldName, "Duplicated field name '#{duplicate}' on data source '#{id}'" if duplicate

    @project[:data_sources] << {
      id: id,
      file: file,
      fields: my_fields
    }
  end

  duplicate = find_duplicated(@project[:data_sources].map { |el| el[:id] })
  raise DuplicatedDataSourceId, "Duplicated data source id '#{duplicate}'" if duplicate

  # The number of data sources required depends on the task being run.
  source_count = @project[:data_sources].size
  case @project[:name]
  when 'linkage'
    if source_count != 2
      raise InvalidNumberOfSources, "Linkage: expected two data-source, #{source_count} given"
    end
  when 'deduplication'
    if source_count != 1
      raise InvalidNumberOfSources, "Deduplication: expected one data-sources, #{source_count} given"
    end
  else
    raise InvalidTaskError, "Invalid task: '#{@project[:name]}'"
  end
end
parse_deterministic_linkage() click to toggle source
# File lib/rpareia/parser.rb, line 113
# Reads /project/deterministic-linkage, which must contain exactly one
# conjunction element holding at least one part element, and stores the parts
# in @project[:parts] as [{field_name: ...}, ...].
#
# Every part's field name must exist on every entry of
# @project[:data_sources], so that list has to be filled before this runs.
#
# Raises:
#   DeterministicLinkageElementNotFound - no deterministic-linkage element
#   ConjunctionElementNotFound          - no conjunction element
#   MultipleConjunctionElements         - more than one conjunction element
#   MissingPart                         - conjunction without part elements
#   MissingFieldName                    - part without a 'field-name'
#   MissingPartFieldNameOnDataSource    - part names a field that a data
#                                         source does not declare
def parse_deterministic_linkage
  deterministic_linkage = @xml.xpath("/project/deterministic-linkage")
  raise DeterministicLinkageElementNotFound, "Missing deterministic-linkage element" if deterministic_linkage.empty?

  conjunction = deterministic_linkage.xpath("conjunction")
  size = conjunction.size
  raise ConjunctionElementNotFound, "Missing conjunction element" if size.zero?
  raise MultipleConjunctionElements, "Only one conjunction element is allowed, #{size} found" if size > 1

  parts = conjunction.xpath("part")
  raise MissingPart, "At least one part element is required" if parts.empty?

  @project[:parts] = parts.map do |part|
    field_name = part['field-name'].to_s
    raise MissingFieldName, "Missing attribute field-name on part element" if field_name.empty?

    {field_name: field_name}
  end

  # Collect each data source's field names once instead of once per part.
  names_by_source = @project[:data_sources].map do |data_source|
    [data_source, data_source[:fields].map { |field| field[:name] }]
  end

  # Parts stay in the outer loop so the first reported error is the first
  # part (in document order) that some data source lacks.
  @project[:parts].each do |part|
    names_by_source.each do |data_source, names|
      next if names.include?(part[:field_name])

      raise MissingPartFieldNameOnDataSource,
            "Field name '#{part[:field_name]}' not found on data source '#{data_source[:id]}'"
    end
  end
end
parse_output() click to toggle source
# File lib/rpareia/parser.rb, line 148
# Reads the single /project/output element and stores its 'deterministic'
# attribute in @project[:output].
#
# An empty attribute value is treated like a missing one, matching how the
# other attribute checks in this parser handle empty values.
#
# Raises:
#   MissingOutputElement          - no output element
#   MultipleOutputElement         - more than one output element
#   MissingDeterministicAttribute - 'deterministic' missing or empty
def parse_output
  output = @xml.xpath("/project/output")

  size = output.size
  raise MissingOutputElement, "Missing output element" if size.zero?
  raise MultipleOutputElement, "Only one output element is allowed, #{size} found" if size > 1

  deterministic = output.first['deterministic']
  @project[:output] = deterministic
  return unless deterministic.to_s.empty?

  raise MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element"
end
parse_task() click to toggle source
# File lib/rpareia/parser.rb, line 46
# Creates @project with the task name read from the 'task' attribute of the
# root project element. @project is replaced wholesale here, so this has to
# run before any step that writes into it.
#
# Raises InvalidTaskError when the attribute is absent, instead of failing
# with NoMethodError on nil.
def parse_task
  task = @xml.xpath("/project/@task").first
  raise InvalidTaskError, "Missing 'task' attribute on project element" if task.nil?

  @project = {name: task.value}
end
parse_xml() click to toggle source
# File lib/rpareia/parser.rb, line 40
# Replaces @xml with the document Nokogiri builds from it and fails when the
# document reports any parse errors, joining them one per line.
#
# NOTE(review): SyntaxError is resolved by normal constant lookup; confirm
# whether a project-specific class or Ruby's built-in SyntaxError is meant.
def parse_xml
  @xml = Nokogiri::XML(@xml)

  problems = @xml.errors
  return if problems.empty?

  raise SyntaxError, problems.join("\n")
end