module BioExominer::TextParser

Constants

MAX_SIZE
SKIP_TOKENS

Public Class Methods

add(tokens, word) click to toggle source
# File lib/bio-exominer/textparser.rb, line 25
# Count one occurrence of +word+ in the +tokens+ hash.
#
# Words contained in SKIP_TOKENS are ignored: nothing is stored and nil is
# returned. Otherwise the counter for +word+ is created on first use,
# incremented by one, and the new count is returned.
def TextParser.add(tokens, word)
  return if SKIP_TOKENS.include?(word)

  tokens[word] = (tokens[word] || 0) + 1
end
rm_punctuation(w) click to toggle source
# File lib/bio-exominer/textparser.rb, line 32
# Strip surrounding decoration from a single word and return a new string;
# the argument itself is never modified. Returns nil when +w+ is nil.
#
# At most one occurrence of each of the following is removed, in this order:
# a leading "[digits]" marker, a leading "(", a trailing ")", one trailing
# character out of , : ; ! and finally one leading and one trailing quote
# character (backtick, double quote or single quote).
def TextParser.rm_punctuation(w)
  return nil if w.nil?

  w.sub(/^\[\d+\]/, '')
   .sub(/^\(/, '')
   .sub(/\)$/, '')
   .sub(/[,:;!]$/, '')
   .sub(/^[`"']/, '')
   .sub(/[`"']$/, '')
end
tokenize(buf) click to toggle source

Return a hash that maps each token found in the buffer to its number of occurrences

# File lib/bio-exominer/textparser.rb, line 47
# Split +buf+ on whitespace and return a hash of token => count.
#
# Every word is looked at together with its direct neighbours (all three
# passed through rm_punctuation). A word is skipped when
# * its cleaned form is shorter than two characters,
# * the preceding word contains table/dataset/supplement/figure/chapter/
#   section/paragraph (case-insensitive),
# * it is a capitalized word next to a single character or to a
#   two-character word starting with two capitals,
# * it is itself two capitals next to a capitalized word,
# * its cleaned form is shorter than four characters, all lowercase and
#   without digits.
# For the remaining words the cleaned form, the raw form (when different) and
# the parts obtained by splitting the raw form on "-" or "_" are counted,
# each subject to valid_token? and to the filtering done by add.
def TextParser.tokenize(buf)
  counts = {}
  words = buf.split(/[\r\n\s]+/)
  words.each_with_index do |raw, i|
    cur = rm_punctuation(raw)
    next if cur.size < 2

    prev = i > 0 ? rm_punctuation(words[i - 1]) : nil
    # Indexing one past the end yields nil, which rm_punctuation passes through
    nxt = rm_punctuation(words[i + 1])
    next if prev =~ /table|dataset|supplement|figure|chapter|section|paragraph/i

    neighbours = [nxt, prev].compact
    # Filter out letters+name
    if cur =~ /^[A-Z]/ && cur.capitalize == cur
      next if neighbours.any? { |s| s.size == 1 }
      next if neighbours.any? { |s| s.size == 2 && s =~ /^[A-Z][A-Z]/ }
    end
    if cur.size == 2 && cur =~ /^[A-Z][A-Z]/
      next if neighbours.any? { |s| s =~ /^[A-Z]/ && s.capitalize == s }
    end
    # Filter out all lowercase small names
    next if cur.size < 4 && cur == cur.downcase && cur !~ /\d/

    # Count the cleaned word and, when cleaning changed it, the raw word too
    if TextParser.valid_token?(raw)
      add(counts, cur)
      add(counts, raw) if raw != cur
    end
    # Also count the pieces of dash/underscore separated words
    next unless raw =~ /-|_/

    raw.split(/-|_/).each do |part|
      add(counts, part) if TextParser.valid_token?(part)
    end
  end
  counts
end
tokenize_with_context(buf, context_type = :sentence) click to toggle source

Return two hashes keyed by token: the total count of each token, and the list of contexts (sentences or lines) in which the token was found

# File lib/bio-exominer/textparser.rb, line 86
# Tokenize +buf+ one context at a time and collect, per token, its total
# count and the contexts it occurred in.
#
# buf          - the text to scan
# context_type - :line or 'line' splits +buf+ on newlines; any other value
#                splits it into sentences on a dot followed by whitespace
#
# A context longer than MAX_SIZE+2 characters is shortened to a window of
# about MAX_SIZE/2 characters on either side of the first occurrence of the
# token. A word at the edge of the window is dropped only when the window
# really cut into the sentence on that side; when the window starts or ends
# at the sentence boundary the edge word is complete and is kept (it may be
# the token itself).
#
# Returns the pair [tokens_count, tokens_context]: a hash of token => count
# and a hash of token => array of context strings.
def TextParser.tokenize_with_context(buf, context_type = :sentence)
  tokens_context = {}
  tokens_count = {}
  # Split buf into sentences based on dots or newlines
  sentences =
    if context_type == :line || context_type == 'line'
      buf.split(/\n/)
    else
      buf.split(/\.\s+/)
    end
  sentences.each do |sentence1|
    # Fold embedded line breaks (and the indentation after them) into a space
    sentence = sentence1.strip.gsub(/(\r|\n)\s*/, ' ')
    # remove quotes
    sentence = sentence.gsub(/"/, '')
    tokenize(sentence).each do |token, count|
      # shorten the sentence
      sentence2 =
        if sentence.size > MAX_SIZE + 2
          half_size = MAX_SIZE / 2
          pos = sentence.index(token)
          start = [pos - half_size, 0].max
          stop = pos + half_size
          s2 = sentence[start..stop]
          # Drop a possibly truncated word only on a side that was actually cut
          s2 = s2.sub(/^\w+\s+/, '') if start > 0
          s2 = s2.sub(/\s+\w+$/, '') if stop < sentence.size - 1
          s2
        else
          sentence
        end
      tokens_count[token] ||= 0
      tokens_count[token] += count
      tokens_context[token] ||= []
      tokens_context[token] << sentence2
    end
  end
  return tokens_count, tokens_context
end
valid_token?(token) click to toggle source

L3MBTL

# File lib/bio-exominer/textparser.rb, line 18
# Decide whether +token+ is worth counting.
#
# Returns false for a token that is empty or whitespace only, for a token
# with a line made up solely of digits and commas, and for a token without
# any ASCII letter; returns true otherwise.
def TextParser.valid_token?(token)
  blank = token.strip.empty?
  numeric = !(token =~ /^(\d|[,])+$/).nil?
  has_letter = !(token =~ /[a-zA-Z]/).nil? # at least one word char
  !blank && !numeric && has_letter
end