class Classifieds::AutoScraper
Constants
- PHONE_PATTERN
Public Class Methods
scrape_results_detail_page(detail_doc, item_condition, detail_values)
click to toggle source
Populates the detail_values hash with the detail page's attributes and values
# File lib/classifieds/auto_scraper.rb, line 33
# Populates +detail_values+ with the attributes found on a listing's detail page.
#
# detail_doc     - parsed detail page; must respond to +at_css+ and is handed
#                  to detail_cells (presumably a Nokogiri document - confirm)
# item_condition - value stored under :Condition
# detail_values  - Hash that is filled in place
#
# Returns detail_values.
def self.scrape_results_detail_page(detail_doc, item_condition, detail_values)
  # Create some entries manually.
  detail_values[:Description] = detail_doc.at_css('.aiDetailsDescription h2').next.text.strip # Description must be first attribute.
  detail_values[:Condition] = item_condition
  detail_values[:Certified] = ''

  # Create the rest from scraping the html's detail attribute/value table.
  cells = detail_cells(detail_doc)
  index = 0
  while index < cells.size
    cell = cells[index]
    if cell.text == "\u00A0" && cell.at_css('span.aiCPOiconDetail')
      # The table row containing the Certified icon does not have a value column.
      detail_values[:Certified] = 'Yes'
      index += 1
    else
      # Grab the attribute and value from the html table.
      value_cell = cells[index + 1]
      # A trailing attribute cell without a value cell is ignored instead of
      # failing on nil.
      break unless value_cell

      detail_values[cell.text.chomp(':').to_sym] = value_cell.text
      index += 2
    end
  end

  detail_values
end
scrape_results_page(results_url, results_url_file, results_doc, item_class)
click to toggle source
Creates a listing for each result on the summary web page
# File lib/classifieds/auto_scraper.rb, line 7
# Creates a listing for every '.aiResultsWrapper' record on a results page.
#
# results_url      - url of the results page; handed to detail_url
# results_url_file - handed to seller_phone for the phone number lookup
# results_doc      - parsed results page; must respond to +css+
#                    (presumably a Nokogiri document - confirm)
# item_class       - class instantiated once per record with
#                    (year, make, model, mileage, sale_price, condition, detail_url)
def self.scrape_results_page(results_url, results_url_file, results_doc, item_class)
  results_doc.css('.aiResultsWrapper').each do |result|
    id = listing_id(result)
    year, make, model = title_parts(result)

    # Start date, mileage and price are all read from the description pod.
    pod = result.at_css('.aiDescriptionPod')
    listed_on = start_date(pod)
    miles = mileage(pod)
    price = sale_price(pod)

    # Seller name and location are derived from the contact block's <strong> elements.
    contact = result.at_css('.contactLinks')
    strong_tags = contact.css('strong')
    name = seller_name(strong_tags)
    location = seller_location(strong_tags)
    phone = seller_phone(contact, id, results_url_file)

    link = detail_url(results_url, result)
    condition = item_condition(link)

    item = item_class.new(year, make, model, miles, price, condition, link)
    seller = Classifieds::Seller.find_or_create(name, location, phone)
    Classifieds::Listing.new(id, item, seller, listed_on)
  end
end
Private Class Methods
detail_cells(doc)
click to toggle source
Returns the detail table
# File lib/classifieds/auto_scraper.rb, line 63
# Selects every <td> inside the '.aiDetailAdDetails' element of +doc+.
#
# doc - object responding to +css+ (presumably a Nokogiri document - confirm)
#
# Returns whatever +doc.css+ returns for that selector.
def self.detail_cells(doc)
  cell_selector = '.aiDetailAdDetails td'
  doc.css(cell_selector)
end
detail_url(url, doc)
click to toggle source
Returns a summary record's detail page url
# File lib/classifieds/auto_scraper.rb, line 68
# Builds the absolute url of a summary record's detail page.
#
# url - url of the summary page; supplies scheme, host and port
# doc - summary record; must respond to +at_css+, and the matched link must
#       respond to ['href'] (presumably a Nokogiri node - confirm)
#
# Returns the detail page url as a String.
def self.detail_url(url, doc)
  # detail link is given as relative to the summary page's domain
  uri = URI.parse(url)
  href = doc.at_css('.aiResultTitle h3 a')['href']

  # Keep an explicit non-default port (e.g. http://localhost:3000); without
  # it the rebuilt link would point at the wrong server.
  authority = uri.host.to_s
  authority += ":#{uri.port}" if uri.port && uri.port != uri.default_port

  "#{uri.scheme}://#{authority}#{href}"
end
item_condition(url)
click to toggle source
Returns the item condition, as encoded in the detail page url
# File lib/classifieds/auto_scraper.rb, line 58
# Extracts the item condition encoded in a detail page url.
#
# url - detail page url containing "-certified-", "-new-" or "-used-"
#       between an alphanumeric character and a digit
#
# Returns the capitalized condition: 'Certified', 'New' or 'Used'.
# Raises ArgumentError when the url carries no recognizable condition.
def self.item_condition(url)
  condition = url[/[a-z0-9]-(certified|new|used)-[0-9]/, 1]
  # Fail with a message naming the url rather than an opaque NoMethodError on nil.
  raise ArgumentError, "no item condition found in detail url: #{url.inspect}" unless condition

  condition.capitalize
end
listing_id(doc)
click to toggle source
Returns the listing's id
# File lib/classifieds/auto_scraper.rb, line 74
# Extracts the listing id from the id attribute of the '.aiResultsMainDiv' element.
#
# doc - summary record responding to +at_css+ (presumably a Nokogiri node - confirm)
#
# Returns the first run of digits as a String, or '' when there is none.
def self.listing_id(doc)
  # e.g. from "aiResultsMainDiv547967889"
  element_id = doc.at_css('.aiResultsMainDiv')['id']
  element_id[/\d+/].to_s
end
mileage(doc)
click to toggle source
Returns the listing's mileage value
# File lib/classifieds/auto_scraper.rb, line 79
# Reads the listing's mileage from the '.listingType' element.
#
# doc - node responding to +at_css+ (presumably a Nokogiri node - confirm)
#
# Returns the mileage digits, including thousands separators, as a String,
# or 'NA' when the text contains 'Available' or holds no mileage figure.
def self.mileage(doc)
  value = doc.at_css('.listingType').text # e.g. 'Mileage: xx,xxx'
  return 'NA' if value.include?('Available')

  # Accept any number of comma groups so 'Mileage: 1,234,567' is not cut off
  # after the first separator; text without a figure yields 'NA'.
  value[/Mileage: ([\d,]*\d)/, 1] || 'NA'
end
sale_price(doc)
click to toggle source
Returns the listing's sale price
# File lib/classifieds/auto_scraper.rb, line 85
# Reads the listing's sale price from the '.price' element.
#
# doc - node responding to +at_css+ (presumably a Nokogiri node - confirm)
#
# Returns 'Call' when the price text contains 'Call', otherwise the price
# text unchanged.
def self.sale_price(doc)
  price_text = doc.at_css('.price').text
  return 'Call' if price_text.include?('Call')

  price_text
end
seller_location(doc)
click to toggle source
Returns the seller's address
# File lib/classifieds/auto_scraper.rb, line 91
# Reads the seller's address from the second element of +doc+.
#
# doc - indexable collection whose elements respond to +text+
#
# Returns the element's text with surrounding whitespace removed.
def self.seller_location(doc)
  location_node = doc[1]
  location_node.text.strip
end
seller_name(doc)
click to toggle source
Returns the seller's name
# File lib/classifieds/auto_scraper.rb, line 96
# Reads the seller's name from the first element of +doc+.
#
# doc - indexable collection whose elements respond to +text+
#
# Returns the element's text with surrounding whitespace removed.
def self.seller_name(doc)
  name_node = doc[0]
  name_node.text.strip
end
seller_phone(doc, id, url_file)
click to toggle source
Returns the seller's phone number, if it exists
# File lib/classifieds/auto_scraper.rb, line 103
# Looks up the seller's phone number for a summary record.
#
# doc      - contact block responding to +at_css+
# id       - listing id, forwarded to seller_phone_private
# url_file - forwarded to seller_phone_private
#
# When the contact block has a <span>, returns the first PHONE_PATTERN match
# in its text, or '' if there is none. Without a <span>, returns the result
# of seller_phone_private.
def self.seller_phone(doc, id, url_file)
  phone_span = doc.at_css('span')
  return seller_phone_private(url_file, id) unless phone_span

  phone_span.text[/#{PHONE_PATTERN}/].to_s
end
seller_phone_private(url_file, id)
click to toggle source
Returns the seller's phone number, if found in the raw HTML text
# File lib/classifieds/auto_scraper.rb, line 114
# Searches the raw page text for the phone number belonging to a listing.
#
# url_file - source handed to +open+ and read line by line
# id       - listing id; only lines containing "aiGetPhoneNumber<id>" are examined
#
# Returns the first PHONE_PATTERN match found on such a line, or '' when
# no examined line contains a phone number.
def self.seller_phone_private(url_file, id)
  marker = "aiGetPhoneNumber#{id}"
  phone_regex = /#{PHONE_PATTERN}/
  match_data = nil

  # The block form closes the handle as soon as the scan finishes.
  # NOTE(review): Kernel#open runs a command when the argument starts with
  # "|"; if url_file is always a local path, File.open would be safer - confirm.
  open(url_file) do |io|
    # detect stops at the first marker line that also contains a phone number.
    io.detect { |line| match_data = line.match(phone_regex) if line.include?(marker) }
  end

  match_data ? match_data.to_s : ''
end
start_date(doc)
click to toggle source
Returns listing start date
# File lib/classifieds/auto_scraper.rb, line 121
# Reads the listing's start date from the '.listingDate' element.
#
# doc - node responding to +at_css+ (presumably a Nokogiri node - confirm)
#
# Returns the element's text unchanged.
def self.start_date(doc)
  date_node = doc.at_css('.listingDate')
  date_node.text
end
title(doc)
click to toggle source
Returns the summary listing's title
# File lib/classifieds/auto_scraper.rb, line 126
# Reads the summary listing's title from the '.aiResultTitle h3' element.
#
# doc - summary record responding to +at_css+ (presumably a Nokogiri node - confirm)
#
# Returns the element's text unchanged.
def self.title(doc)
  heading = doc.at_css('.aiResultTitle h3')
  heading.text # '2010 Ford Explorer XL'
end
title_parts(doc)
click to toggle source
Returns the year, make, and model from the title string. NOTE: It is assumed that the make is a single word.
Otherwise, a database of make names would likely be needed to match against.
# File lib/classifieds/auto_scraper.rb, line 133
# Splits the listing title into year, make and model.
#
# The first word is taken as the year, the second as the make, and all
# remaining words form the model, so the make is assumed to be one word.
#
# doc - summary record handed to +title+
#
# Returns [year, make, model]. For titles shorter than two words the
# missing year/make are nil and the model is ''.
def self.title_parts(doc)
  # Splat destructuring also copes with titles of fewer than two words,
  # where taking the last (size - 2) elements would ask for a negative count.
  year, make, *model_words = title(doc).split(' ')
  [year, make, model_words.join(' ')]
end