class TAO::RDFizer

Constants

ERB_ANNOTATIONS_TTL

variable: denotations, relations

ERB_PREFIXES_TTL

variable: namespaces

ERB_SPANS_TTL

variable: spans

Public Class Methods

new(mode = nil) click to toggle source

If mode == :spans, it produces span descriptions; if mode == :annotations, it produces annotation descriptions; if mode == nil, it produces both.

# File lib/tao_rdfizer/tao_rdfizer.rb, line 10
# Prepares the ERB templates used for rendering.
#
# mode - :spans selects ERB_SPANS_TTL; every other value (nil and
#        :annotations included) selects ERB_ANNOTATIONS_TTL. The value is
#        also stored in @mode.
def initialize(mode = nil)
        @mode = mode
        template = mode == :spans ? ERB_SPANS_TTL : ERB_ANNOTATIONS_TTL

        # trim_mode is given as a keyword argument: the positional form
        # ERB.new(str, nil, '-') has been deprecated since Ruby 2.6.
        @tao_ttl_erb = ERB.new(template, trim_mode: '-')
        @prefix_ttl_erb = ERB.new(ERB_PREFIXES_TTL, trim_mode: '-')
end

Public Instance Methods

rdfize(annotations_col, options = nil) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 21
# Renders a collection of annotation hashes as Turtle.
#
# annotations_col - collection of annotation hashes; each one must have a
#                   :target key. Namespaces are read from the first element.
# options         - optional Hash:
#                   :only_prefixes - exactly true: return only the prefix
#                                    block (default: false)
#                   :with_prefixes - exactly true: put the prefix block in
#                                    front of the annotations (default: true)
#
# Returns the rendered String.
# Raises RuntimeError when an annotation hash has no :target key.
def rdfize(annotations_col, options = nil)
        options ||= {}
        only_prefixes = options.fetch(:only_prefixes, false) == true
        with_prefixes = options.fetch(:with_prefixes, true) == true

        # check the format
        annotations_col.each { |annotations| raise "'target' is missing" unless annotations.has_key?(:target) }

        # namespaces declared by the first annotation hash
        namespaces = {}
        declared = annotations_col.first[:namespaces]
        declared&.each { |ns| namespaces[ns[:prefix]] = ns[:uri] }

        # The prefix block is rendered here, before get_annotations_ttl is
        # called with the same namespaces hash.
        prefixes_ttl = nil
        if only_prefixes || with_prefixes
                prefixes_ttl = @prefix_ttl_erb.result_with_hash(namespaces: namespaces)
        end
        return prefixes_ttl if only_prefixes

        annotations_ttl = get_annotations_ttl(annotations_col, namespaces)
        with_prefixes ? prefixes_ttl + annotations_ttl : annotations_ttl
end

Private Instance Methods

find_uri(label, namespaces, prefix_for_this) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 214
# Converts an annotation label into a Turtle term.
#
# label           - String: a prefixed name, an http(s) URL, or free text
# namespaces      - Hash whose keys are the declared prefixes; when it has a
#                   '_base' key, unprefixed labels are returned as <label>
# prefix_for_this - prefix put in front of unprefixed labels when there is
#                   no '_base' namespace
#
# Whitespace inside the label is replaced with '_' so that the returned term
# never contains whitespace.
#
# Returns a String: a prefixed name or a term wrapped in '<' and '>'.
def find_uri(label, namespaces, prefix_for_this)
        delimiter_position = label.index(':')
        if !delimiter_position.nil? && namespaces.has_key?(label[0...delimiter_position])
                # label already uses a declared prefix; brackets have to be escaped
                label.gsub(/\s/, '_').gsub('(', '\(').gsub(')', '\)')
        elsif label =~ %r[^https?://]
                "<#{label.gsub(/\s/, '_')}>"
        else
                clabel = if label.match(/^\W+$/)
                        # nothing but non-word characters
                        'SYM'
                else
                        # strip leading/trailing punctuation, then join the
                        # words; \s+ also covers tabs and newlines
                        label.sub(/^\W+/, '').sub(/[^a-zA-Z0-9_)]+$/, '').gsub(/\s+/, '_')
                end
                namespaces.has_key?('_base') ? "<#{clabel}>" : "#{prefix_for_this}:#{clabel.gsub('(', '\(').gsub(')', '\)')}"
        end
end
get_annotations_ttl(annotations_col, namespaces) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 53
# Preprocesses the denotations, relations and spans of every annotation hash
# and renders them with the ERB template chosen in the constructor.
#
# The template is evaluated with this method's binding, so the method-level
# locals (denotations, attributes, relations, spans, namespaces, ...) are
# visible to it and have to keep their names.
#
# annotations_col - collection of annotation hashes; the first one has to
#                   carry :project unless @mode is :spans
# namespaces      - Hash of prefix => uri; unless @mode is :spans, the
#                   project prefix is added to it
#
# Returns the rendered String.
# Raises ArgumentError when the project name is missing or collides with a
# declared prefix, and re-raises ArgumentErrors from find_uri with the
# id of the offending denotation or relation prepended.
#
# NOTE: the denotation, relation and span hashes of the input are extended
# in place (:span_uri, :obj_uri, :class_uris, :children, ...).
def get_annotations_ttl(annotations_col, namespaces)
        anns = annotations_col.first

        unless @mode == :spans
                raise ArgumentError, "A project name has to be specified." unless anns.has_key?(:project)
                prefix_for_this = anns[:project].downcase.gsub(/ /, '_')
                raise ArgumentError, "'#{prefix_for_this}' is a reserved prefix for this project." if namespaces.has_key?(prefix_for_this)
                project_uri = 'http://pubannotation.org/projects/' + anns[:project]
                namespaces[prefix_for_this] = project_uri + '/'
        end

        denotations = []
        attributes = []
        relations = []
        spans = []

        annotations_col.each do |annotations|
                text = annotations[:text]
                text_uri = annotations[:target]
                sourcedb, sourceid, divid = get_target_info(text_uri)
                text_id = divid.nil? ? "#{sourcedb}-#{sourceid}" : "#{sourcedb}-#{sourceid}-#{divid}"

                # denotations, attributes and relations
                _denotations = annotations[:denotations] || []
                _attributes = annotations[:attributes] || []
                _relations = annotations[:relations] || []
                if @mode == :spans && annotations.has_key?(:tracks)
                        annotations[:tracks].each do |track|
                                _denotations += track[:denotations] if track.has_key? :denotations
                                _attributes += track[:attributes] if track.has_key? :attributes
                                _relations += track[:relations] if track.has_key? :relations
                        end
                end

                if @mode == :spans
                        # Only the span URI of each denotation is prepared here:
                        # there is no project prefix in this mode to build
                        # object or class URIs from.
                        _denotations.each do |d|
                                d[:span_uri] = "<#{text_uri}/spans/#{d[:span][:begin]}-#{d[:span][:end]}>"
                        end
                else
                        # index attributes: subject id => objects of the
                        # attributes whose predicate ends with '_id'
                        attributesh = _attributes.each_with_object({}) do |a, h|
                                next unless a[:pred].end_with?('_id')
                                (h[a[:subj]] ||= []) << a[:obj]
                        end

                        # denotations preprocessing
                        _denotations.each do |d|
                                d[:span_uri] = "<#{text_uri}/spans/#{d[:span][:begin]}-#{d[:span][:end]}>"
                                d[:obj_uri] = "#{prefix_for_this}:#{text_id}-#{d[:id]}"
                                class_uris = (attributesh[d[:id]] || []).push(d[:obj])
                                d[:class_uris] = class_uris.map{|uri| find_uri(uri, namespaces, prefix_for_this)}
                        rescue ArgumentError => e
                                raise ArgumentError, "[#{sourcedb}-#{sourceid}-#{d[:id]}] " + e.message
                        end

                        # relations preprocessing
                        _relations.each do |r|
                                r[:subj_uri] = "#{prefix_for_this}:#{text_id}-#{r[:subj]}"
                                r[:obj_uri] = "#{prefix_for_this}:#{text_id}-#{r[:obj]}"
                                r[:pred_uri] = find_uri(r[:pred], namespaces, prefix_for_this)
                        rescue ArgumentError => e
                                raise ArgumentError, "[#{sourcedb}-#{sourceid}-#{r[:id]}] " + e.message
                        end
                end

                unless @mode == :annotations
                        # collect spans
                        _spans = _denotations.map{|d| d[:span]}
                        _spans.uniq!

                        # add information
                        _spans.each do |s|
                                s[:span_uri] = "<#{text_uri}/spans/#{s[:begin]}-#{s[:end]}>"
                                s[:source_uri] = text_uri
                                s[:text] = text[s[:begin] ... s[:end]]
                        end

                        # index spans
                        spanh = _spans.inject({}){|r, s| r[s[:span_uri]] = s; r}

                        # add denotation information
                        _denotations.each do |d|
                                span_uri = d[:span_uri]
                                if spanh[span_uri][:denotations].nil?
                                        spanh[span_uri][:denotations] = [d]
                                else
                                        spanh[span_uri][:denotations] << d
                                end
                        end

                        # order by begin (ascending), then by end (descending)
                        _spans.sort!{|a, b| (a[:begin] <=> b[:begin]).nonzero? || (b[:end] <=> a[:end])}

                        ## begin indexing
                        len = text.length
                        num = _spans.length

                        # initialize the index
                        (0 ... num).each do |i|
                                _spans[i][:followings] = []
                                _spans[i][:children] = []
                        end

                        (0 ... num).each do |i|
                                # find the first following span
                                j = i + 1

                                # spans beginning before span i ends are its children,
                                # unless an already collected child covers them
                                while j < num && _spans[j][:begin] < _spans[i][:end]
                                        unless include_parent?(_spans[i][:children], _spans[j])
                                                _spans[i][:children] << _spans[j]
                                        end
                                        j += 1
                                end

                                # find adjacent positions: skip the whitespace after span i
                                fp = _spans[i][:end]
                                fp += 1 while fp < len && text[fp].match(/\s/)
                                next if fp == len

                                # index adjacent spans
                                while j < num && _spans[j][:begin] == fp
                                        _spans[i][:followings] << _spans[j]
                                        j += 1
                                end
                        end
                end

                denotations.concat(_denotations)
                relations.concat(_relations)
                spans.concat(_spans) unless @mode == :annotations
        end

        @tao_ttl_erb.result(binding)
end
get_target_info(text_uri) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 206
# Extracts the source database, source id and division id from a text URI.
#
# Each value is the path segment following "/sourcedb/", "/sourceid/" and
# "/divs/" respectively, or nil when the URI has no such part.
#
# Returns the three values as [sourcedb, sourceid, divid].
def get_target_info(text_uri)
        patterns = [
                %r|/sourcedb/([^/]+)|,
                %r|/sourceid/([^/]+)|,
                %r|/divs/([^/]+)|
        ]

        patterns.map do |pattern|
                found = pattern.match(text_uri)
                found && found[1]
        end
end
include_parent?(spans, span) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 200
# Tells whether at least one span in +spans+ covers +span+, i.e. begins at or
# before its begin and ends at or after its end (equal bounds count).
def include_parent?(spans, span)
        spans.any? do |candidate|
                candidate[:begin] <= span[:begin] && candidate[:end] >= span[:end]
        end
end
rdf_literal_escape(string) click to toggle source
# File lib/tao_rdfizer/tao_rdfizer.rb, line 234
# Escapes a string for use inside a double-quoted RDF literal: backslash,
# tab, backspace, newline, carriage return, form feed and double quote are
# replaced with their backslash escape sequences.
#
# Returns a new, frozen String; the argument itself is not modified.
def rdf_literal_escape(string)
        # Applied in order; the backslash comes first so that the backslashes
        # inserted by the later steps are not doubled again.
        substitutions = [
                ['\\', '\\\\\\'],
                ["\t", '\\t'],
                ["\b", '\\b'],
                ["\n", '\\n'],
                ["\r", '\\r'],
                ["\f", '\\f'],
                ['"', '\\"']
        ]

        escaped = substitutions.reduce(string) do |current, (raw, replacement)|
                current.gsub(raw, replacement)
        end
        escaped.freeze
end