class Stepmod::Utils::TermsExtractor

Constants

ACCEPTED_STAGES

TODO: we may want a command line option to override this in the future

WITHDRAWN_STATUS

Attributes

cvs_mode[R]
encountered_terms[R]
general_concepts[R]
parsed_bibliography[R]
part_concepts[R]
part_modules[R]
part_resources[R]
resource_concepts[R]
stdout[R]
stepmod_dir[R]
stepmod_path[R]

Public Class Methods

call(stepmod_dir, stdout = $stdout) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 28
# Convenience entry point: builds an extractor for +stepmod_dir+ and runs it.
#
# @param stepmod_dir path of the STEPmod directory, handed to +new+
# @param stdout object that receives log lines (defaults to +$stdout+)
# @return whatever the instance-level +call+ returns
def self.call(stepmod_dir, stdout = $stdout)
  extractor = new(stepmod_dir, stdout)
  extractor.call
end
new(stepmod_dir, stdout) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 32
# Stores the log sink and the STEPmod location, and creates the empty
# accumulators that are filled while files are processed.
#
# @param stepmod_dir path of the STEPmod directory; it is also resolved
#   with +Pathname#realpath+ and kept as +stepmod_path+
# @param stdout object that receives log lines via +puts+
def initialize(stepmod_dir, stdout)
  # Log sink and input location.
  @stdout = stdout
  @stepmod_dir = stepmod_dir
  @stepmod_path = Pathname.new(stepmod_dir).realpath

  # Result accumulators, all empty to begin with.
  @general_concepts, @resource_concepts =
    Array.new(2) { Glossarist::Collection.new }
  @parsed_bibliography, @part_concepts, @part_resources, @part_modules =
    Array.new(4) { [] }
  @encountered_terms = {}
end

Public Instance Methods

call() click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 60
# Runs the extraction over the STEPmod directory given at construction time.
#
# Steps:
# 1. Sets +@cvs_mode+ to the result of +File.which("cvs")+ when the
#    directory contains a +CVS+ entry, otherwise to +nil+.
# 2. Reads +repository_index.xml+ and collects the document XML files of
#    every entry whose +status+ is not +WITHDRAWN_STATUS+ and whose file
#    exists on disk.
# 3. Hands the sorted, de-duplicated file list to +process_term_files+.
#
# @return [Array] +[general_concepts, resource_concepts, parsed_bibliography,
#   part_concepts, part_resources, part_modules]+
def call
  # If we are using the stepmod CVS repository, provide the revision number per file.
  # `Dir.exist?` is used because `Dir.exists?` was removed in Ruby 3.2.
  @cvs_mode = if Dir.exist?(stepmod_path.join("CVS"))
                require "ptools"
                # ptools provides File.which
                File.which("cvs")
              end

  log "INFO: STEPmod directory set to #{stepmod_dir}."

  if cvs_mode
    log "INFO: STEPmod directory is a \
      CVS repository and will detect revisions."
    log "INFO: [CVS] Detecting file revisions can be slow, \
      please be patient!"
  else
    log "INFO: STEPmod directory is not a CVS repository, \
      skipping revision detection."
  end

  log "INFO: Detecting paths..."

  repo_index = Nokogiri::XML(File.read(stepmod_path.join("repository_index.xml"))).root

  files = []

  # Index element name => [directory under stepmod_dir, document file name].
  {
    "module" => ["modules", "module.xml"],
    "resource_doc" => ["resource_docs", "resource.xml"],
    "business_object_model" => ["business_object_models", "business_object_model.xml"],
    "application_protocol" => ["application_protocols", "application_protocol.xml"],
  }.each do |element, (directory, filename)|
    repo_index.xpath("//#{element}").each do |node|
      next if node["status"] == WITHDRAWN_STATUS

      path = Pathname.new("#{stepmod_dir}/#{directory}/#{node['name']}/#{filename}")
      # `File.exist?` is used because `File.exists?` was removed in Ruby 3.2.
      files << path if File.exist?(path)
    end
  end

  # `uniq!` may return nil, but only the in-place effect on `files` is used.
  files.sort!.uniq!
  process_term_files(files)

  [
    general_concepts,
    resource_concepts,
    parsed_bibliography,
    part_concepts,
    part_resources,
    part_modules,
  ]
end
log(message) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 45
# Writes +message+ to the configured output, prefixed with the tool tag.
#
# @param message text to log (interpolated into the line)
# @return the result of +stdout.puts+
def log(message)
  line = "[stepmod-utils] #{message}"
  stdout.puts(line)
end
term_special_category(bibdata) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 49
# Tells whether the document's part number is one of the "special" parts.
#
# The part is converted with +to_i+ before matching. The previous
# +when [56..112]+ compared an Array against an Integer and therefore never
# matched; a plain Range is used so that parts 56 through 112 are recognised.
#
# @param bibdata object responding to +part+ (whose value responds to +to_i+)
# @return [Boolean] true for parts 41-47, 51 and 56-112; false otherwise
def term_special_category(bibdata)
  case bibdata.part.to_i
  when 41..47, 51, 56..112
    true
  else
    false
  end
end

Private Instance Methods

find_or_initialize_concept(collection, localized_concept) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 371
# Creates a new concept with a random UUID, stores it in +collection+ and
# attaches +localized_concept+ to whatever +collection.store+ returned.
#
# NOTE: despite the name, the visible code never looks up an existing
# concept; a fresh one is created on every call.
#
# @param collection object responding to +store+
# @param localized_concept localization passed to +add_l10n+
# @return the result of +add_l10n+
def find_or_initialize_concept(collection, localized_concept)
  fresh_concept = Glossarist::Concept.new(id: SecureRandom.uuid)
  stored_concept = collection.store(fresh_concept)
  stored_concept.add_l10n(localized_concept)
end
process_term_files(files) click to toggle source
# File lib/stepmod/utils/terms_extractor.rb, line 133
# Parses each given document XML file and fills the instance's accumulators
# (+general_concepts+, +resource_concepts+, +parsed_bibliography+,
# +part_concepts+, +part_resources+, +part_modules+, +encountered_terms+).
#
# For every file it:
# 1. builds a +Stepmod::Utils::Bibdata+ and skips the file when that fails,
#    when the doctype is not in +ACCEPTED_STAGES+, or when +part+ is empty;
# 2. in CVS mode, runs +cvs status+ to build a revision annotation;
# 3. parses every +definition+ node into a concept;
# 4. for +resource.xml+ / +module.xml+ files, also parses the
#    +ext_description+ nodes of the related description files;
# 5. registers the per-part results collected for the file.
#
# +File.exist?+ is used instead of +File.exists?+, which was removed in
# Ruby 3.2.
#
# @param files enumerable of path objects responding to +realpath+
# @return the result of +files.each+
def process_term_files(files)
  # schema name => path of the first document that referenced it
  parsed_schema_names = {}
  files.each do |file_path|
    file_path = file_path.realpath
    fpath = file_path.relative_path_from(stepmod_path)

    log "INFO: Processing XML file #{fpath}"
    current_document = Nokogiri::XML(File.read(file_path)).root

    bibdata = nil
    begin
      bibdata = Stepmod::Utils::Bibdata.new(document: current_document)
    rescue StandardError
      log "WARNING: Unknown file #{fpath}, skipped"
      next
    end

    unless ACCEPTED_STAGES.include? bibdata.doctype
      log "INFO: skipped #{bibdata.docid} as it is not \
        one of (#{ACCEPTED_STAGES.join(', ')})."
      next
    end

    if bibdata.part.to_s.empty?
      log "FATAL: missing `part` attribute: #{fpath}"
      log "INFO: skipped #{bibdata.docid} as it is missing `part` attribute."
      next
    end

    # Default annotation, replaced below when CVS reports a status.
    revision_string = "\n// CVS: revision not detected"
    if cvs_mode
      # Run `cvs status` to find out version

      log "INFO: Detecting CVS revision..."
      Dir.chdir(stepmod_path) do
        status = `cvs status #{fpath}`

        unless status.empty?
          working_rev = status.split(/\n/).grep(/Working revision:/)
            .first.match(/revision:\s+(.+)$/)[1]
          repo_rev = status.split(/\n/).grep(/Repository revision:/)
            .first.match(/revision:\t(.+)\t/)[1]
          log "INFO: CVS working rev (#{working_rev}), \
            repo rev (#{repo_rev})"
          revision_string = "\n// CVS working rev: (#{working_rev}), repo rev (#{repo_rev})\n" +
            "// CVS: revision #{working_rev == repo_rev ? 'up to date' : 'differs'}"
        end
      end
    end

    # read definitions
    current_part_concepts = Glossarist::Collection.new
    definition_index = 0
    current_document.xpath("//definition").each do |definition|
      definition_index += 1
      term_id = definition["id"]
      unless term_id.nil?
        # Duplicates are only reported; the definition is still processed.
        if encountered_terms[term_id]
          log "FATAL: Duplicated term with id: #{term_id}, #{fpath}"
        end
        encountered_terms[term_id] = true
      end

      # Assume that definition is located in clause 3 of the ISO document
      # in order. We really don't have a good reference here.
      ref_clause = "3.#{definition_index}"

      # NOTE(review): `fpath` comes from `relative_path_from`, so `+` below
      # is Pathname#+ (a path join), not String concatenation — confirm this
      # is the value Concept.parse expects.
      concept = Stepmod::Utils::Concept.parse(
        definition,
        reference_anchor: bibdata.anchor,
        reference_clause: ref_clause,
        file_path: fpath + revision_string,
      )
      next unless concept

      if term_special_category(bibdata)
        # log "INFO: this part is special"
        find_or_initialize_concept(current_part_concepts, concept)
      else
        # log "INFO: this part is generic"
        find_or_initialize_concept(general_concepts, concept)
      end

      parsed_bibliography << bibdata
    end

    current_part_resources = Glossarist::Collection.new
    # linkend schema name => collection of concepts
    current_part_modules_arm = {}
    current_part_modules_mim = {}

    log "INFO: FILE PATH IS #{file_path}"
    case file_path.to_s
    when /resource.xml$/
      log "INFO: Processing resource.xml for #{file_path}"
      # Assumption: every schema is only linked by a single resource_docs document.
      current_document.xpath("//schema").each do |schema_node|
        schema_name = schema_node["name"]
        if parsed_schema_names[schema_name]
          log "ERROR: We have encountered this schema before: \
            #{schema_name} from path \
            #{parsed_schema_names[schema_name]}, now at #{file_path}"
          # Skips only this schema node (we are inside the xpath `each` block).
          next
        else
          parsed_schema_names[schema_name] = file_path
        end

        Dir["#{stepmod_path}/resources/#{schema_name}/descriptions.xml"].each do |description_xml_path|
          log "INFO: Processing resources schema #{description_xml_path}"
          description_document = Nokogiri::XML(File.read(description_xml_path)).root
          description_document.xpath("//ext_description").each do |ext_description|
            # log "INFO: Processing linkend[#{ext_description['linkend']}]"

            concept = Stepmod::Utils::Concept.parse(
              ext_description,
              reference_anchor: bibdata.anchor,
              reference_clause: nil,
              file_path: Pathname.new(description_xml_path)
                          .relative_path_from(stepmod_path),
            )
            next unless concept

            if term_special_category(bibdata)
              # log "INFO: this part is special"
              find_or_initialize_concept(current_part_resources, concept)
            else
              # log "INFO: this part is generic"
              find_or_initialize_concept(resource_concepts, concept)
            end

            parsed_bibliography << bibdata
          end
        end
      end

    when /module.xml$/
      log "INFO: Processing module.xml for #{file_path}"
      # Assumption: every schema is only linked by a single module document.
      # puts current_document.xpath('//module').length
      schema_name = current_document.xpath("//module").first["name"]
      if parsed_schema_names[schema_name]
        log "ERROR: We have encountered this schema before: \
          #{schema_name} from path #{parsed_schema_names[schema_name]}, \
            now at #{file_path}"
        # NOTE(review): this `next` belongs to the `files.each` block, so the
        # rest of this file is skipped, including the registration below of
        # definitions already parsed above — confirm that is intended.
        next
      else
        parsed_schema_names[schema_name] = file_path
      end

      description_xml_path = "#{stepmod_path}/modules/#{schema_name}/arm_descriptions.xml"
      log "INFO: Processing modules schema #{description_xml_path}"

      if File.exist?(description_xml_path)
        description_document = Nokogiri::XML(
          File.read(description_xml_path),
        )
          .root
        description_document.xpath("//ext_description").each do |ext_description|
          # The part of `linkend` before the first "." is used as the grouping key.
          linkend_schema = ext_description["linkend"].split(".").first
          concept = Stepmod::Utils::Concept.parse(
            ext_description,
            reference_anchor: bibdata.anchor,
            reference_clause: nil,
            file_path: Pathname.new(description_xml_path)
                        .relative_path_from(stepmod_path),
          )
          next unless concept

          current_part_modules_arm[linkend_schema] ||= Glossarist::Collection.new
          find_or_initialize_concept(
            current_part_modules_arm[linkend_schema], concept
          )
          # puts part_modules_arm.inspect
          parsed_bibliography << bibdata
        end
      end

      description_xml_path = "#{stepmod_path}/modules/#{schema_name}/mim_descriptions.xml"
      log "INFO: Processing modules schema #{description_xml_path}"

      if File.exist?(description_xml_path)
        description_document = Nokogiri::XML(
          File.read(description_xml_path),
        )
          .root
        description_document.xpath("//ext_description").each do |ext_description|
          linkend_schema = ext_description["linkend"].split(".").first

          concept = Stepmod::Utils::Concept.parse(
            ext_description,
            reference_anchor: bibdata.anchor,
            reference_clause: nil,
            file_path: Pathname
                        .new(description_xml_path)
                        .relative_path_from(stepmod_path),
          )
          next unless concept

          current_part_modules_mim[linkend_schema] ||=
            Glossarist::Collection.new
          find_or_initialize_concept(
            current_part_modules_mim[linkend_schema], concept
          )

          parsed_bibliography << bibdata
        end
      end

    end

    log "INFO: Completed processing XML file #{fpath}"
    if current_part_concepts.to_a.empty?
      log "INFO: Skipping #{fpath} (#{bibdata.docid}) \
        because it contains no concepts."
    elsif current_part_concepts.to_a.length < 3
      log "INFO: Skipping #{fpath} (#{bibdata.docid}) \
        because it only has #{current_part_concepts.to_a.length} terms."

      # Parts with fewer than three terms are folded into the general collection.
      current_part_concepts.to_a.each do |x|
        general_concepts.store(x)
      end
    else
      # Reaching this branch already implies at least three concepts.
      part_concepts << [bibdata,
                        current_part_concepts]
    end
    unless current_part_resources.to_a.empty?
      part_resources << [bibdata,
                         current_part_resources]
    end
    if (current_part_modules_arm.to_a.size +
        current_part_modules_mim.to_a.size).positive?
      part_modules << [bibdata, current_part_modules_arm,
                       current_part_modules_mim]
    end
  end
end