class ScientificNameParser

We can use these expressions when we are ready to parse virus names: class VirusParser

def initialize
  @order     = /^\s*[A-Z][a-z]\+virales/i
  @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
  @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
  @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
  @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
                viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
  @parsed    = nil
end

end

Constants

FAILED_RESULT

Public Class Methods

fix_case(name_string) click to toggle source
# File lib/biodiversity/parser.rb, line 119
def self.fix_case(name_string)
  name_ary = name_string.split(/\s+/)
  words_num = name_ary.size
  res = nil
  if words_num == 1
    res = name_ary[0].gsub(/[\(\)\{\}]/, '')
    if res.size > 1
      res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
    else
      res = nil
    end
  else
    if name_ary[0].size > 1
      word1 = UnicodeUtils.upcase(name_ary[0][0]) +
        UnicodeUtils.downcase(name_ary[0][1..-1])
    else
      word1 = name_ary[0]
    end
    if name_ary[1].match(/^\(/)
      word2 = name_ary[1].gsub(/\)$/, '') + ')'
      word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
        UnicodeUtils.downcase(word2[2..-1])
    else
      word2 = UnicodeUtils.downcase(name_ary[1])
    end
    res = word1 + ' ' +
      word2 + ' ' +
      name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
    res.strip!
  end
  res
end
new(opts = {}) click to toggle source
# File lib/biodiversity/parser.rb, line 153
def initialize(opts = {})
  @canonical_with_rank = !!opts[:canonical_with_rank]
  @verbatim = ''
  @clean = ScientificNameCleanParser.new
  @dirty = ScientificNameDirtyParser.new
  @canonical = ScientificNameCanonicalParser.new
  @parsed = nil
end
version() click to toggle source
# File lib/biodiversity/parser.rb, line 115
def self.version
  Biodiversity::VERSION
end

Private Class Methods

add_rank_to_canonical(parsed) click to toggle source
# File lib/biodiversity/parser.rb, line 265
def self.add_rank_to_canonical(parsed)
  parts = parsed[:canonical].split(' ')
  name_ary = parts[0..1]
  parsed[:details][0][:infraspecies].each do |data|
    infrasp = data[:string]
    rank = data[:rank]
    name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
  end
  parsed[:canonical] = name_ary.join(' ')
end
surrogate?(parsed_data) click to toggle source
# File lib/biodiversity/parser.rb, line 247
def self.surrogate?(parsed_data)
  return false unless parsed_data[:parsed]
  name = parsed_data[:verbatim]
  pos = parsed_data[:positions].to_a.flatten
  surrogate1 = /BOLD:|[\d]{5,}/i
  surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
  is_surrogate = false

  ai_index = pos.index('annotation_identification')
  if ai_index
    ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
    is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
  end
  is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
                   name.match(surrogate2))
  is_surrogate
end

Public Instance Methods

parse(a_string) click to toggle source
# File lib/biodiversity/parser.rb, line 178
def parse(a_string)
  @verbatim = a_string.strip
  a_string = PreProcessor::clean(a_string)

  if virus?(a_string)
    @parsed = { verbatim: a_string, virus: true }
  elsif unknown_placement?(a_string)
    @parsed = { verbatim: a_string }
  else
    begin
      @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
      unless @parsed
        index = @dirty.index || @clean.index
        salvage_match = a_string[0..index].split(/\s+/)[0..-2]
        salvage_string = salvage_match ? salvage_match.join(' ') : a_string
        @parsed =  @dirty.parse(salvage_string) ||
                   @canonical.parse(a_string) ||
                   { verbatim: a_string }
      end
    rescue
      @parsed = FAILED_RESULT.(@verbatim)
    end
  end

  def @parsed.verbatim=(a_string)
    @verbatim = a_string
  end

  def @parsed.all(opts = {})
    canonical_with_rank = !!opts[:canonical_with_rank]
    parsed = self.class != Hash
    res = { parsed: parsed, parser_version: ScientificNameParser::version}
    if parsed
      hybrid = self.hybrid rescue false
      res.merge!({
        verbatim: @verbatim,
        normalized: self.value,
        canonical: self.canonical,
        hybrid: hybrid,
        details: self.details,
        parser_run: self.parser_run,
        positions: self.pos
        })
    else
      res.merge!(self)
    end
    if (canonical_with_rank &&
        canonical.count(' ') > 1 &&
        res[:details][0][:infraspecies])
      ScientificNameParser.add_rank_to_canonical(res)
    end
    res[:surrogate] = true if ScientificNameParser.surrogate?(res)
    res = {:scientificName => res}
  end

  def @parsed.pos_json
    self.pos.to_json rescue ''
  end

  def @parsed.all_json
    self.all.to_json rescue ''
  end

  @parsed.verbatim = @verbatim
  @parsed.all(canonical_with_rank: @canonical_with_rank)
end
parsed() click to toggle source
# File lib/biodiversity/parser.rb, line 174
def parsed
  @parsed
end
unknown_placement?(a_string) click to toggle source
# File lib/biodiversity/parser.rb, line 170
def unknown_placement?(a_string)
  !!(a_string.match(/incertae\s+sedis/i) || a_string.match(/inc\.\s*sed\./i))
end
virus?(a_string) click to toggle source
# File lib/biodiversity/parser.rb, line 162
def virus?(a_string)
  !!(a_string.match(/\sICTV\s*$/) ||
     a_string.match(/\b(virus|viruses|
                        phage|phages|viroid|viroids|
                        satellite|satellites|prion|prions)\b/ix) ||
     a_string.match(/[A-Z]?[a-z]+virus\b/))
end