module BioExominer::TextParser
Constants
- MAX_SIZE
- SKIP_TOKENS
Public Class Methods
add(tokens, word)
click to toggle source
# File lib/bio-exominer/textparser.rb, line 25
# Increment the occurrence count of +word+ in the +tokens+ hash.
#
# Words listed in SKIP_TOKENS are ignored and leave +tokens+ untouched.
#
# tokens:: hash mapping word => count; modified in place
# word::   the word to count
#
# Returns the updated count for +word+, or nil when the word was skipped.
def TextParser.add(tokens, word)
  return if SKIP_TOKENS.include?(word)

  # return if word.size < 2
  tokens[word] = (tokens[word] || 0) + 1
end
rm_punctuation(w)
click to toggle source
# File lib/bio-exominer/textparser.rb, line 32
# Strip surrounding punctuation from a single word.
#
# The patterns below are applied once each, in the listed order, to a copy
# of +w+; the argument itself is never modified.
#
# w:: the word to clean, or nil
#
# Returns a new cleaned string, or nil when +w+ is nil.
def TextParser.rm_punctuation(w)
  return nil if w.nil?

  [
    /^\[\d+\]/, # leading bracketed number, e.g. "[12]"
    /^\(/,      # opening parenthesis at the start
    /\)$/,      # closing parenthesis at the end
    /[,:;!]$/,  # remove punctuation
    /^[`"']/,   # remove starting quotes
    /[`"']$/    # remove ending quotes
  ].reduce(w) { |text, pattern| text.sub(pattern, '') }
end
tokenize(buf)
click to toggle source
Returns a hash that maps each accepted token to the number of times it was counted.
# File lib/bio-exominer/textparser.rb, line 47
# Split +buf+ on whitespace and count the words that pass the filters below.
#
# Each word is examined together with its cleaned (see rm_punctuation)
# predecessor and successor. A word is skipped when:
# * its cleaned form is shorter than two characters;
# * the previous word contains table, dataset, supplement, figure, chapter,
#   section or paragraph (case-insensitive);
# * it is a capitalized word next to a single character or a two-capital
#   word, or a two-capital word next to a capitalized word;
# * its cleaned form is shorter than four characters, all lower case and
#   contains no digit.
#
# For a surviving word the cleaned form, the raw form (when different) and
# every dash/underscore separated part are counted, each subject to
# valid_token?.
#
# buf:: text to tokenize
#
# Returns a hash mapping token => count.
def TextParser.tokenize(buf)
  counts = {}
  words = buf.split(/[\r\n\s]+/)

  # Starts with A-Z and equals its own String#capitalize form.
  name_like = ->(s) { s && s =~ /^[A-Z]/ && s.capitalize == s }
  # Exactly two characters, both A-Z.
  initials = ->(s) { s && s.size == 2 && s =~ /^[A-Z][A-Z]/ }

  words.each_with_index do |raw, i|
    before  = i.zero? ? nil : rm_punctuation(words[i - 1])
    current = rm_punctuation(raw)
    after   = rm_punctuation(words[i + 1]) # nil once past the last word

    next if current.size < 2
    next if before =~ /table|dataset|supplement|figure|chapter|section|paragraph/i

    neighbours = [before, after].compact

    # Filter out letters+name
    if name_like.call(current)
      next if neighbours.any? { |nb| nb.size == 1 || initials.call(nb) }
    end
    if initials.call(current)
      next if neighbours.any? { |nb| name_like.call(nb) }
    end

    # Filter out all lowercase small names
    next if current.size < 4 && current == current.downcase && current !~ /\d/

    # Validity is judged on the raw word; the cleaned form is counted first,
    # then the raw form when cleaning actually changed it.
    if TextParser.valid_token?(raw)
      add(counts, current)
      add(counts, raw) if raw != current
    end

    # split on dash or underscore
    next unless raw =~ /-|_/

    raw.split(/-|_/).each do |part|
      add(counts, part) if TextParser.valid_token?(part)
    end
  end

  counts
end
tokenize_with_context(buf, context_type = :sentence)
click to toggle source
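Returns two hashes: one mapping each token to its count, and one mapping each token to the list of context strings (sentences or lines) in which it was found.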
# File lib/bio-exominer/textparser.rb, line 86
# Tokenize +buf+ piece by piece and record, for every token, its total count
# and the text surrounding it.
#
# buf::          text to process
# context_type:: :line or 'line' splits +buf+ on newlines; any other value
#                splits on a dot followed by whitespace
#
# Each piece is stripped, has CR/LF (plus following whitespace) replaced by a
# single space and has double quotes removed before being passed to tokenize.
# When the piece is longer than MAX_SIZE+2 characters the stored context is a
# window of about MAX_SIZE/2 characters on either side of the token's first
# occurrence, with a leading and a trailing word fragment trimmed off;
# otherwise the whole piece is stored.
#
# Returns a two-element array: [token => count hash, token => contexts hash].
def TextParser.tokenize_with_context(buf, context_type = :sentence)
  counts = {}
  contexts = {}

  # Split buf into sentences based on dots or newlines
  by_line = context_type == :line || context_type == 'line'
  pieces = buf.split(by_line ? /\n/ : /\.\s+/)

  pieces.each do |piece|
    # normalise line breaks, then remove quotes
    text = piece.strip.gsub(/(\r|\n)\s*/, ' ').gsub(/"/, '')
    too_long = text.size > MAX_SIZE + 2

    tokenize(text).each do |token, count|
      snippet =
        if too_long
          # shorten the sentence
          half = MAX_SIZE / 2
          at = text.index(token)
          from = [at - half, 0].max
          text[from..(at + half)].sub(/^\w+\s+/, '').sub(/\s+\w+$/, '')
        else
          text
        end

      counts[token] = (counts[token] || 0) + count
      (contexts[token] ||= []) << snippet
    end
  end

  [counts, contexts]
end
valid_token?(token)
click to toggle source
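Returns true when the token is not blank, does not consist solely of digits and commas, and contains at least one ASCII letter (for example L3MBTL).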
# File lib/bio-exominer/textparser.rb, line 18
# Decide whether +token+ is worth counting.
#
# token:: the string to test
#
# Returns false when the token is blank after stripping, when a line of it
# consists only of digits and commas, or when it contains no ASCII letter;
# returns true otherwise.
def TextParser.valid_token?(token)
  if token.strip.empty? || token =~ /^(\d|[,])+$/
    false
  else
    # at least one word char
    token =~ /[a-zA-Z]/ ? true : false
  end
end