module Awestruct::ContextHelper

Public Instance Methods

clean_html(str) click to toggle source
# File lib/awestruct/context_helper.rb, line 11
# Replaces every literal "&nbsp;" entity in +str+ with a plain space.
#
# The previous pattern and replacement were both rendered as a single blank,
# which left the input effectively unchanged; the entity is spelled out here
# so the substitution matches the one html_to_text performs.
# NOTE(review): confirm "&nbsp;" => " " is the intended mapping for callers.
#
# str - the HTML String to clean.
#
# Returns a new String; +str+ itself is not modified.
def clean_html(str)
  str.gsub('&nbsp;', ' ')
end
close_tags(s) click to toggle source
# File lib/awestruct/context_helper.rb, line 19
# Element names that never take a closing tag in HTML; close_tags ignores them.
CLOSE_TAGS_VOID_ELEMENTS = %w[
  area base br col embed hr img input link meta param source track wbr
].freeze

# Appends closing tags for every element left open in +s+.
#
# Tags are scanned left to right and tracked on a stack. Void elements
# (<br>, <img>, ...), self-closing tags (<foo/>) and <!...>/<?...>
# declarations are skipped because they never need a closing tag.
#
# s - the HTML fragment String to balance.
#
# Returns +s+ followed by the missing closing tags, innermost first
# (+s+ itself when nothing is left open).
# Raises RuntimeError when a closing tag does not match the innermost
# open element.
def close_tags(s)
  open_tags = []
  s.scan(/<\/?[^>]+>/).each do |tag|
    # Comments, doctype and processing instructions are not elements.
    next if tag.start_with?('<!', '<?')

    closing = tag.start_with?('</')
    name = tag[(closing ? 2 : 1)..-1].scan(/\w+/).first
    next if name.nil?
    next if CLOSE_TAGS_VOID_ELEMENTS.include?(name.downcase)
    next if tag.end_with?('/>')

    if !closing
      open_tags.push(name)
    elsif open_tags.last == name
      open_tags.pop
    else
      # Report the element that should have been closed, not a slice of
      # the offending tag's own name.
      expected = open_tags.last || 'no closing tag'
      raise "Malformed HTML expected #{expected} but got #{name} '#{s}'"
    end
  end
  # Build a new string rather than mutating the caller's argument.
  open_tags.reverse_each.inject(s) { |html, name| html + "</#{name}>" }
end
fix_url(base_url, url) click to toggle source
# File lib/awestruct/context_helper.rb, line 70
# Prefixes a root-relative URL with +base_url+.
#
# Only URLs whose very first character is a single "/" are rewritten.
# Protocol-relative URLs ("//host/path") already name a host and are
# returned untouched, as is anything else (absolute URLs, relative
# paths, nil).
#
# base_url - the site base URL to prepend.
# url      - the URL to qualify; may be nil.
#
# Returns the qualified URL String, or +url+ unchanged when no rewrite applies.
def fix_url(base_url, url)
  path = url.to_s
  # start_with? anchors at the beginning of the whole string, so a "/"
  # following an embedded newline does not trigger a rewrite.
  return url unless path.start_with?('/') && !path.start_with?('//')
  "#{base_url}#{url}"
end
fully_qualify_urls(base_url, text) click to toggle source
# File lib/awestruct/context_helper.rb, line 41
    # Parses +text+ with Oga and passes the href of every <a> and <link>
    # element, and the src of every <img> element, through fix_url together
    # with +base_url+. Elements lacking the attribute are left alone.
    #
    # Returns the re-serialized document, forced to the encoding of +text+
    # when the two differ. If anything raises, the error is logged and the
    # original +text+ is returned unchanged.
    def fully_qualify_urls(base_url, text)
      doc = Oga.parse_xml text

      doc.each_node do |node|
        next unless node.is_a?(Oga::XML::Element)

        # <a> and <link> carry their URL in href, <img> in src.
        attr_name = case node.name
                    when 'a', 'link' then 'href'
                    when 'img' then 'src'
                    end
        next if attr_name.nil?

        current = node.get(attr_name)
        node.set attr_name, fix_url(base_url, current) if current
      end

      xml = doc.to_xml
      xml.force_encoding(text.encoding) if xml.encoding != text.encoding
      xml
    rescue => e
      Awestruct::ExceptionHelper.log_error e
      $LOG.info %Q(If the error has to do with 'end of input' ensure none of the following tags have a closing tag:
#{Oga::XML::HTML_VOID_ELEMENTS.to_a.map(&:downcase).uniq.join(', ')}) if $LOG.info?
      $LOG.warn "Text being parsed:\n#{text}" if $LOG.warn?
      text # hand back the unparsed input so the offending markup can be inspected
    end
html_to_text(str) click to toggle source
# File lib/awestruct/context_helper.rb, line 7
# Strips markup from +str+: every "<...>" tag is removed, then each
# "&nbsp;" entity is turned into a plain space.
#
# Returns a new String; +str+ is not modified.
def html_to_text(str)
  tagless = str.gsub(/<[^>]+>/, '')
  tagless.gsub(/&nbsp;/, ' ')
end
summarize(text, numwords=20, ellipsis='...') click to toggle source
# File lib/awestruct/context_helper.rb, line 37
# Builds a short excerpt of +text+: the input is split on single spaces,
# the first +numwords+ pieces are re-joined with spaces, +ellipsis+ is
# appended, and the result is handed to close_tags.
#
# Returns whatever close_tags returns for the truncated excerpt.
def summarize(text, numwords=20, ellipsis='...')
  leading_words = text.split(/ /)[0, numwords]
  excerpt = leading_words.join(' ') + ellipsis
  close_tags(excerpt)
end
without_images(str) click to toggle source
# File lib/awestruct/context_helper.rb, line 15
# Removes every <img ...> tag from +str+, then replaces each anchor whose
# content contains no further markup with just that content.
#
# Returns a new String; +str+ is not modified.
def without_images(str)
  no_images = str.gsub(/<img[^>]+>/, '')
  # Runs second so that links which only wrapped an image collapse to nothing.
  no_images.gsub(/<a[^>]+>([^<]*)<\/a>/, '\1')
end