class FeedProcessorUtils::HTMLParser

Public Class Methods

new(config_file = nil) click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 9
# Builds a parser from a YAML configuration file.
#
# When +config_file+ is nil (or false) the path held in the class variable
# @@default_config is used instead; that variable is defined outside this
# method. The file is read in full and its parsed YAML is stored in @config.
#
# @param config_file [String, nil] path to the YAML configuration file
def initialize(config_file = nil)
  path = config_file || @@default_config
  raw_yaml = File.read(path)
  @config = YAML.load(raw_yaml)
end

Public Instance Methods

parse_data(input) click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 14
# Parses an HTML string and extracts every configured field from it.
#
# The markup is handed to Nokogiri::HTML, then each (field name, parsing rules)
# pair yielded by #fields is resolved through #extract_field. If the result has
# a truthy :lazy_image_tags entry, it is post-processed in place by
# #parse_lazy_images!.
#
# @param input [String] HTML markup
# @return [Hash] field name => extracted value, in the order #fields yields them
def parse_data(input)
  document = Nokogiri::HTML(input)

  result = fields.each_with_object({}) do |(name, rules), extracted|
    extracted[name] = extract_field(document, rules)
  end

  lazy_images = result[:lazy_image_tags]
  parse_lazy_images!(lazy_images) if lazy_images
  result
end
parse_url(url) click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 25
# Fetches the document at +url+ and parses it with #parse_data.
#
# The fetch goes through URI.open from the open-uri standard library, which
# must be loaded (the previous bare open(url) already depended on it to handle
# URLs). Since Ruby 3.0 open-uri no longer overrides Kernel#open, so a bare
# open("http://...") raises Errno::ENOENT there. The block form also makes sure
# the underlying IO is closed once the body has been read.
#
# NOTE(review): URI.open still delegates to Kernel#open for strings that are
# not scheme://... URLs (local paths, "|command"), so +url+ must not come from
# untrusted input.
#
# @param url [String] location of the HTML document
# @return [Hash] whatever #parse_data returns for the fetched markup
def parse_url(url)
  markup = URI.open(url, &:read)
  parse_data(markup)
end

Private Instance Methods

extract_field(input_doc, parsing_data) click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 32
# Pulls one configured field out of a parsed document.
#
# +parsing_data+ is read for these keys:
#   :selectors     - CSS selectors, tried in the order given
#   :attribute     - attribute whose value is wanted from a matched element
#   :collection    - truthy to gather every match, falsy to stop at the first hit
#   :fallback_text - truthy to use the element's text when the attribute is absent
#
# @param input_doc [#css, #at_css] parsed document
# @param parsing_data [Hash] extraction rules for this field
# @return [Array, String, nil] all gathered values in collection mode (possibly
#   empty); otherwise the first value found, or nil when nothing matched
def extract_field(input_doc, parsing_data)
  selectors = parsing_data[:selectors]
  attribute = parsing_data[:attribute]
  use_text  = parsing_data[:fallback_text]

  unless parsing_data[:collection]
    # Single-value mode: the first selector yielding a usable value wins.
    selectors.each do |selector|
      node = input_doc.at_css(selector)
      next unless node

      value = node[attribute]
      return value if value
      return node.text if use_text
    end
    return nil
  end

  # Collection mode: accumulate a value for every match of every selector;
  # elements with neither the attribute nor a text fallback are skipped.
  selectors.each_with_object([]) do |selector, values|
    input_doc.css(selector).each do |node|
      value = node[attribute]
      if value
        values << value
      elsif use_text
        values << node.text
      end
    end
  end
end
fields() click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 58
# Returns the configuration stored in @config by #initialize (the parsed YAML).
# #parse_data iterates it as (field name, parsing rules) pairs.
def fields
  @config
end
parse_lazy_images!(lazy_images) click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 62
# Rewrites lazy-image strings in place, unwrapping the first #{whatever}
# placeholder (seen in sky sports articles) to just its inner text.
#
# Both the array and the strings it holds are mutated. A string with no
# placeholder makes sub! return nil, and those nil slots are then removed, so
# such entries disappear from the array.
# NOTE(review): confirm that dropping placeholder-free entries is intended.
#
# @param lazy_images [Array<String>] mutable strings to rewrite
# @return [Array<String>, nil] the same array when entries were dropped, nil
#   when nothing had to be removed (the compact! contract)
def parse_lazy_images!(lazy_images)
  placeholder = /#\{(.+)\}/
  # '\1' re-inserts the captured 'whatever' from inside #{whatever}.
  lazy_images.map! { |image| image.sub!(placeholder, '\1') }
  lazy_images.compact!
end