module Awestruct::ContextHelper
Public Instance Methods
clean_html(str)
click to toggle source
# File lib/awestruct/context_helper.rb, line 11
#
# Replaces every HTML non-breaking-space entity (&nbsp;) in +str+ with a
# plain space. No other entity or markup is touched.
#
# @param str [String] HTML fragment to clean
# @return [String] a new string; +str+ itself is not modified
def clean_html(str)
  # The entity has to be spelled out literally: a bare space in the pattern
  # would replace spaces with spaces and leave the input unchanged.
  str.gsub(/&nbsp;/, ' ')
end
fix_url(base_url, url)
click to toggle source
# File lib/awestruct/context_helper.rb, line 70
#
# Prefixes a root-relative URL with +base_url+.
#
# Only values that start with a single "/" are rewritten. Everything else is
# returned as given: absolute URLs, document-relative paths, fragments, +nil+,
# and protocol-relative references ("//host/path"), which already name their
# own host and would be corrupted by a prefix.
#
# @param base_url [String] site root to prepend; it is concatenated as-is, so
#   a trailing slash on it is not collapsed
# @param url [String, nil] the link target to qualify
# @return [String, nil] the qualified URL, or +url+ unchanged
def fix_url(base_url, url)
  # \A anchors at the start of the whole string (^ would also match after any
  # newline); the negative lookahead excludes "//host/..." references.
  return url unless url =~ %r{\A/(?!/)}

  "#{base_url}#{url}"
end
fully_qualify_urls(base_url, text)
click to toggle source
# File lib/awestruct/context_helper.rb, line 41
#
# Parses +text+ with Oga's XML parser and passes the link attribute of every
# <a> (href), <link> (href) and <img> (src) element through fix_url, then
# serialises the document back to a string.
#
# The result is re-tagged with the encoding of +text+ when the serialised
# string reports a different one.
#
# If anything raises while parsing or rewriting, the error is logged and the
# original +text+ is returned unchanged.
#
# @param base_url [String] prefix handed to fix_url
# @param text [String] markup to process
# @return [String] the rewritten markup, or +text+ itself on failure
def fully_qualify_urls(base_url, text)
  doc = Oga.parse_xml text

  doc.each_node do |node|
    next unless node.is_a?(Oga::XML::Element)

    # Which attribute carries the URL for this kind of element, if any.
    attr_name = case node.name
                when 'a', 'link' then 'href'
                when 'img' then 'src'
                end
    next unless attr_name

    current = node.get(attr_name)
    node.set attr_name, fix_url(base_url, current) if current
  end

  xml = doc.to_xml
  xml.force_encoding(text.encoding) if xml.encoding != text.encoding
  xml
rescue => e
  Awestruct::ExceptionHelper.log_error e
  $LOG.info %Q(If the error has to do with 'end of input' ensure none of the following tags have a closing tag: #{Oga::XML::HTML_VOID_ELEMENTS.to_a.map(&:downcase).uniq.join(', ')}) if $LOG.info?
  $LOG.warn "Text being parsed:\n#{text}" if $LOG.warn?
  text # returning the bad text, which hopefully will help find the cause
end
html_to_text(str)
click to toggle source
# File lib/awestruct/context_helper.rb, line 7
#
# Reduces an HTML fragment to text: removes everything that looks like a tag
# ("<" followed by one or more non-">" characters and a ">"), then turns
# &nbsp; entities into plain spaces. Other entities are left as they are.
#
# @param str [String] HTML fragment
# @return [String] a new string without tags; +str+ itself is not modified
def html_to_text(str)
  # The entity has to be spelled out literally: a bare space in the second
  # pattern would make that pass a no-op.
  str.gsub(/<[^>]+>/, '').gsub(/&nbsp;/, ' ')
end
summarize(text, numwords=20, ellipsis='...')
click to toggle source
# File lib/awestruct/context_helper.rb, line 37
#
# Builds a short excerpt of +text+: splits it on single spaces, keeps at most
# the first +numwords+ pieces, re-joins them with spaces and appends
# +ellipsis+. The ellipsis is appended unconditionally, including when
# +text+ has fewer than +numwords+ pieces.
#
# The excerpt is passed through close_tags before being returned.
# NOTE(review): close_tags is defined outside this excerpt; presumably it
# closes tags left open by the truncation -- confirm against its definition.
#
# @param text [String] source text, possibly containing markup
# @param numwords [Integer] maximum number of space-separated pieces to keep
# @param ellipsis [String] suffix appended to the excerpt
# @return whatever close_tags returns for the excerpt
def summarize(text, numwords=20, ellipsis='...')
  pieces = text.split(/ /)
  excerpt = pieces[0, numwords].join(' ')
  close_tags(excerpt + ellipsis)
end
without_images(str)
click to toggle source
# File lib/awestruct/context_helper.rb, line 15
#
# Strips <img ...> tags from +str+ and then unwraps anchors whose content
# contains no further tags, keeping only the anchor's inner text. Anchors
# that still contain nested markup after the image pass are left in place.
#
# @param str [String] HTML fragment
# @return [String] a new string; +str+ itself is not modified
def without_images(str)
  image_tag = /<img[^>]+>/
  # An <a ...> whose body has no "<" before the closing tag; group 1 is the body.
  tagless_link = /<a[^>]+>([^<]*)<\/a>/

  imageless = str.gsub(image_tag, '')
  imageless.gsub(tagless_link, '\1')
end