class FeedProcessorUtils::HTMLParser
Public Class Methods
new(config_file = nil)
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 9
# Builds a parser whose field definitions come from a YAML file.
#
# @param config_file [String, nil] path of the YAML configuration to read;
#   when nil (or false) the class variable @@default_config is used instead.
#   NOTE(review): @@default_config is defined outside this excerpt —
#   presumably a file path; confirm.
def initialize(config_file = nil)
  path = config_file || @@default_config
  raw_yaml = File.read(path)
  # NOTE(review): on Psych versions before 4, YAML.load can instantiate
  # arbitrary objects — only point this at trusted configuration files.
  @config = YAML.load(raw_yaml)
end
Public Instance Methods
parse_data(input)
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 14
# Parses an HTML string and extracts every configured field from it.
#
# @param input [String] HTML markup, handed to Nokogiri::HTML
# @return [Hash] one entry per configured field, mapping the field name to
#   whatever extract_field returns for it
def parse_data(input)
  document = Nokogiri::HTML(input)

  result = fields.each_with_object({}) do |(name, rules), extracted|
    extracted[name] = extract_field(document, rules)
  end

  # The lazy-image entry, when present, is cleaned up in place.
  lazy_tags = result[:lazy_image_tags]
  parse_lazy_images!(lazy_tags) if lazy_tags
  result
end
parse_url(url)
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 25
# Fetches the document at +url+ and runs it through parse_data.
#
# The resource is opened in block form so the handle is closed as soon as its
# content has been read. URI.open is used instead of a bare +open+ because
# Kernel#open no longer opens URLs through open-uri as of Ruby 3.0.
#
# NOTE(review): URI.open comes from open-uri (Ruby 2.5+). Opening a URL with
# +open+ already depended on open-uri being loaded by this file — confirm.
#
# @param url [String] location of the HTML document to fetch
# @return [Hash] the result of parse_data for the fetched content
def parse_url(url)
  parse_data(URI.open(url, &:read))
end
Private Instance Methods
extract_field(input_doc, parsing_data)
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 32
# Pulls the value(s) for a single field out of a parsed document.
#
# For each matched element the value is the attribute named by
# parsing_data[:attribute]; when that attribute is missing and
# parsing_data[:fallback_text] is truthy, the element's text is used instead.
#
# @param input_doc [#css, #at_css] the parsed document to query
# @param parsing_data [Hash] extraction rules, read through the keys
#   :collection, :selectors, :attribute and :fallback_text
# @return [Array, String, nil] with :collection set, an array holding the
#   values of every element matched by every selector (possibly empty);
#   otherwise the value from the first selector that yields one, or nil
def extract_field(input_doc, parsing_data)
  attribute = parsing_data[:attribute]
  use_text = parsing_data[:fallback_text]
  selectors = parsing_data[:selectors]

  unless parsing_data[:collection]
    selectors.each do |selector|
      node = input_doc.at_css(selector)
      next unless node

      value = node[attribute]
      return value if value
      return node.text if use_text
      # No attribute and no text fallback: move on to the next selector.
    end
    return nil
  end

  selectors.each_with_object([]) do |selector, values|
    input_doc.css(selector).each do |node|
      value = node[attribute]
      if value
        values << value
      elsif use_text
        values << node.text
      end
    end
  end
end
fields()
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 58
# The field definitions used for extraction.
#
# @return [Object] the value stored in @config, which the constructor loads
#   from the YAML configuration file
def fields
  @config
end
parse_lazy_images!(lazy_images)
click to toggle source
# File lib/feed_processor_utils/html_parser.rb, line 62
# Cleans up lazy-image values in place.
#
# Each entry containing a `#{whatever}` placeholder has that placeholder
# replaced by the text inside the braces. Entries without a placeholder are
# removed from the array.
#
# @param lazy_images [Array<String>] values to clean; both the array and its
#   strings are mutated
# @return [Array<String>] the same array, after cleaning. The earlier chained
#   `compact!` returned nil whenever no entry was removed.
def parse_lazy_images!(lazy_images)
  # this gets rid of #{whatever} in sky sports articles
  placeholder = /#\{(.+)\}/

  lazy_images.map! do |lazy_image|
    # sub! returns nil when nothing was substituted, which marks entries
    # without a placeholder for removal by compact! below.
    lazy_image.sub!(placeholder) { Regexp.last_match(1) } # the 'whatever' inside #{whatever}
  end

  # compact! itself returns nil when it removes nothing, so it is not chained.
  lazy_images.compact!
  lazy_images
end