class Classifieds::BoatScraper
Public Class Methods
scrape_results_detail_page(detail_doc, item_condition, detail_values)
click to toggle source
Returns detail attributes and values in detail_values hash
# File lib/classifieds/boat_scraper.rb, line 30
# Fills detail_values (a Hash, modified in place) with the attributes
# scraped from a listing's detail page.
#
# detail_doc     - parsed detail page (must respond to #css).
# item_condition - condition value stored under the :Condition key.
# detail_values  - Hash receiving attribute => value entries.
#
# Pages without a '.boat-details' section are handled by
# do_alt_processing; all others by do_normal_processing.
def self.scrape_results_detail_page(detail_doc, item_condition, detail_values)
  boat_section = detail_doc.css('.boat-details')
  return do_alt_processing(detail_doc, item_condition, detail_values) if boat_section.empty?

  do_normal_processing(detail_doc, boat_section, item_condition, detail_values)
end
scrape_results_page(results_url, results_url_file, results_doc, item_class)
click to toggle source
Creates listings from summary web page
# File lib/classifieds/boat_scraper.rb, line 7
# Creates one Classifieds::Listing for every entry of a summary page.
#
# results_url      - URL of the summary page; used to build each detail URL.
# results_url_file - not referenced by this method.
# results_doc      - parsed summary page (must respond to #css).
# item_class       - class instantiated for each listed item.
#
# Entries for which listing_id returns nil are skipped.
def self.scrape_results_page(results_url, results_url_file, results_doc, item_class)
  results_doc.css('.boat-listings li').each do |entry|
    listing_key = listing_id(entry)
    next if listing_key.nil?

    summary = entry.at_css('.description')
    year, make, model = split_title(summary)
    listed_on = ''        # Listing start date is currently not available from the web page.
    price = sale_price(summary)
    vendor_name = seller_name(summary)
    vendor_location = seller_location(summary)
    vendor_phone = ''     # Seller phone is currently not available from the web page.
    link = detail_url(results_url, entry)
    condition = 'Used'    # Condition is currently not available from the web page.

    item = item_class.new(year, make, model, price, condition, link)
    seller = Classifieds::Seller.find_or_create(vendor_name, vendor_location, vendor_phone)
    Classifieds::Listing.new(listing_key, item, seller, listed_on)
  end
end
Private Class Methods
detail_cells(doc)
click to toggle source
Returns the rows (tr elements) of the detail table found under #collapsible-content-areas
# File lib/classifieds/boat_scraper.rb, line 44
# Returns the table rows (tr elements) found under
# '#collapsible-content-areas' in doc.
def self.detail_cells(doc)
  row_selector = '#collapsible-content-areas tr'
  doc.css(row_selector)
end
detail_cells_alt(doc)
click to toggle source
Returns the rows (tr elements) of the detail table found under #ad_detail-table (alternate page layout)
# File lib/classifieds/boat_scraper.rb, line 49
# Returns the table rows (tr elements) found under '#ad_detail-table'
# in doc; used for the alternate detail page layout.
def self.detail_cells_alt(doc)
  row_selector = '#ad_detail-table tr'
  doc.css(row_selector)
end
detail_url(url, doc)
click to toggle source
Returns a summary record's detail page url
# File lib/classifieds/boat_scraper.rb, line 54
# Returns the absolute URL of a summary record's detail page.
#
# url - URL of the summary page the record was scraped from.
# doc - the summary record node; its '.inner a' link supplies the href.
#
# The href is resolved against the summary page URL with URI.join, so a
# non-default port in +url+ is preserved and absolute or protocol-relative
# hrefs are returned correctly instead of being glued onto the host.
#
# Raises NoMethodError if the record has no '.inner a' link, and
# URI::InvalidURIError if either URL cannot be parsed.
def self.detail_url(url, doc)
  # A link without an href attribute is treated as an empty reference.
  href = doc.at_css('.inner a')['href'].to_s
  URI.join(url, href).to_s
end
do_alt_processing(doc, item_condition, detail_values)
click to toggle source
Returns detail attributes and values in detail_values hash
# File lib/classifieds/boat_scraper.rb, line 60
# Fills detail_values (modified in place) from a detail page that uses
# the alternate layout.
#
# doc            - parsed detail page; its '#main-content' node is scraped.
# item_condition - value stored under :Condition.
# detail_values  - Hash receiving attribute => value entries.
#
# Entry order matters for display: :Description is inserted first and
# :Phone last. Returns the phone value that was stored.
def self.do_alt_processing(doc, item_condition, detail_values)
  # Create some entries manually.
  main_content = doc.at_css('#main-content')
  detail_values[:Description] = main_content.at_css('p').text.strip # Description must be the first attribute.
  detail_values[:Condition] = item_condition

  # Create the rest by scraping the html's detail attribute/value table.
  # Each row's children alternate label, value, label, value, ...
  detail_cells_alt(main_content).each do |row|
    row.children.each_slice(2) do |label_node, value_node|
      next if value_node.nil? # dangling label without a value: nothing to record

      detail_values[label_node.text.chomp(':').to_sym] = value_node.text
    end
  end

  detail_values[:Phone] = seller_phone(main_content) # NOTE: keep phone number last in list for display consistency.
end
do_normal_processing(doc, boat_doc, item_condition, detail_values)
click to toggle source
Returns detail attributes and values in detail_values hash
# File lib/classifieds/boat_scraper.rb, line 81
# Fills detail_values (modified in place) from a detail page that uses
# the normal layout.
#
# doc            - parsed detail page; supplies '#main-details' and '.phone'.
# boat_doc       - the page's '.boat-details' section; supplies the table rows.
# item_condition - value stored under :Condition.
# detail_values  - Hash receiving attribute => value entries.
#
# Entry order matters for display: :Description is inserted first and
# :Phone last. Returns the phone text that was stored.
def self.do_normal_processing(doc, boat_doc, item_condition, detail_values)
  # Entries created manually.
  detail_values[:Description] = doc.at_css('#main-details').text.strip
  detail_values[:Condition] = item_condition

  # Remaining entries come from the html's detail attribute/value table.
  detail_cells(boat_doc).each do |row|
    label_node = row.children[1]
    next unless label_node

    nested_lists = label_node.css('dl')
    unless nested_lists.empty?
      # Row holds description lists: alternate normal processing.
      process_detail_list_alt(nested_lists, detail_values)
      next
    end

    label = label_node.text.chomp(':')
    value_node = row.at_css('td')
    if value_node.nil?
      process_detail_list(row.css('dl').children, detail_values)
    elsif label == 'Owner Video'
      detail_values[label.to_sym] = 'Yes' # substitute Yes for the video link.
    else
      detail_values[label.to_sym] = value_node.text
    end
  end

  # NOTE: keep phone number last in list for display consistency.
  detail_values[:Phone] = doc.at_css('.phone').text
end
listing_id(doc)
click to toggle source
Returns the listing's id. If the listing doesn't have an id, it's not really a listing.
# File lib/classifieds/boat_scraper.rb, line 114
# Returns the text of the node's 'data-reporting-impression-product-id'
# attribute, or nil when the attribute is absent (such a node is not
# really a listing).
def self.listing_id(doc)
  id_attribute = doc.attributes['data-reporting-impression-product-id']
  return nil unless id_attribute

  id_attribute.text
end
process_detail_list(doc, detail_values)
click to toggle source
Parse description list data into hash
# File lib/classifieds/boat_scraper.rb, line 120
# Parses description list data into detail_values (modified in place).
#
# doc is read in groups of four nodes: the second node of each group is
# the attribute label (a trailing ':' is removed) and the fourth is its
# value. Groups without a label are skipped; a label without a value
# node stores nothing.
def self.process_detail_list(doc, detail_values)
  (0...doc.size).step(4) do |group_start|
    label_node = doc[group_start + 1]
    next if label_node.nil?

    label = label_node.text.chomp(':')
    value_node = doc[group_start + 3]
    next unless value_node

    detail_values[label.to_sym] = value_node.text
  end
end
process_detail_list_alt(doc, detail_values)
click to toggle source
Parse description list data into hash
# File lib/classifieds/boat_scraper.rb, line 131
# Parses description list data into detail_values (modified in place).
#
# For every list in doc the children are walked starting at index 1.
# The node at the cursor is the attribute label (a trailing ':' is
# removed). Its value is normally two nodes further on; when that node
# is missing or blank, the node immediately after the label is used
# instead and the cursor advances by 3 rather than 4. A '✓' in the
# value is replaced by 'Y'.
def self.process_detail_list_alt(doc, detail_values)
  (0...doc.size).each do |list_index|
    nodes = doc[list_index].children
    cursor = 1
    while cursor < nodes.size
      label = nodes[cursor].text.chomp(':')
      value_node = nodes[cursor + 2]
      if value_node && !value_node.text.strip.empty?
        cursor += 4
      else
        # No usable value two nodes ahead: fall back to the adjacent node.
        value_node = nodes[cursor + 1]
        cursor += 3
      end
      detail_values[label.to_sym] = value_node.text.sub('✓', 'Y') if value_node
    end
  end
end
sale_price(doc)
click to toggle source
Returns the listing's sale price
# File lib/classifieds/boat_scraper.rb, line 150
# Returns the listing's sale price text.
#
# Uses the '.price .data' node when present; otherwise falls back to the
# second child of the '.price .txt' node.
def self.sale_price(doc)
  data_node = doc.at_css('.price .data')
  return data_node.text if data_node

  doc.at_css('.price .txt').children[1].text
end
seller_location(doc)
click to toggle source
Returns the seller's address
# File lib/classifieds/boat_scraper.rb, line 156
# Returns the seller's location text.
#
# Uses the '.location .data' node when present; otherwise falls back to
# the second child of the '.location .txt' node.
def self.seller_location(doc)
  data_node = doc.at_css('.location .data')
  return data_node.text if data_node

  doc.at_css('.location .txt').children[1].text
end
seller_name(doc)
click to toggle source
Returns the seller's name
# File lib/classifieds/boat_scraper.rb, line 162
# Returns the seller's name text.
#
# Uses the '.offered-by .data' node when present; otherwise falls back
# to the second child of the '.offered-by .txt' node.
def self.seller_name(doc)
  data_node = doc.at_css('.offered-by .data')
  return data_node.text if data_node

  doc.at_css('.offered-by .txt').children[1].text
end
seller_phone(doc)
click to toggle source
Returns the Seller's phone number
# File lib/classifieds/boat_scraper.rb, line 168
# Returns the seller's phone number from an alternate-layout detail page.
#
# The alternate listing's phone is styled with css 'direction: rtl', so
# the node text is reversed to restore reading order. Reversal leaves
# the area-code parentheses swapped, so the characters at positions 0
# and 4 are overwritten with '(' and ')'.
#
# The node's text is never mutated; a reversed copy is returned.
# Returns '' when the page has no '.seller-info .phone' node, and the
# plain reversed text when it is too short (fewer than 4 characters)
# to carry an area code.
def self.seller_phone(doc)
  phone_node = doc.at_css('.seller-info .phone')
  return '' if phone_node.nil?

  phone = phone_node.text.reverse
  return phone if phone.length < 4 # too short to patch the parentheses

  phone[0] = '('
  phone[4] = ')'
  phone
end
split_title(doc)
click to toggle source
Returns the year, make, and model from the title string. NOTE: It is assumed that Make will be one word.
Otherwise, likely will need a database of Make names to match against.
# File lib/classifieds/boat_scraper.rb, line 178
# Returns [year, make, model] taken from the summary listing's title.
#
# The title is split on whitespace: the first word is the year, the
# second the make, and all remaining words (joined by single spaces)
# the model. NOTE: Make is assumed to be one word; otherwise a database
# of Make names to match against would likely be needed.
#
# Titles with fewer than two words no longer raise: the missing year
# and/or make are nil and the model is ''.
def self.split_title(doc)
  year, make, *model_words = title(doc).split(' ')
  [year, make, model_words.join(' ')]
end
title(doc)
click to toggle source
Returns the summary listing's title
# File lib/classifieds/boat_scraper.rb, line 187
# Returns the text of the summary listing's '.name' node, e.g.
# '2000 JASON 25 Downeaster' (makes also appear as Grady White,
# Grady-White, Sea Ray, ...).
def self.title(doc)
  name_node = doc.at_css('.name')
  name_node.text
end