class Classifieds::AutoScraper

Constants

PHONE_PATTERN

Public Class Methods

scrape_results_detail_page(detail_doc, item_condition, detail_values) click to toggle source

Stores the scraped detail attributes and values in the detail_values hash passed in by the caller

# File lib/classifieds/auto_scraper.rb, line 33
# Fills +detail_values+ with the attributes found on a listing's detail page.
#
# detail_doc     - parsed detail page; must respond to #at_css and is also
#                  handed to detail_cells to obtain the attribute/value cells.
# item_condition - value stored under :Condition.
# detail_values  - Hash mutated in place; keys are Symbols built from the
#                  attribute cell text with a trailing ':' removed.
#
# Returns nil; the results are delivered through +detail_values+.
def self.scrape_results_detail_page(detail_doc, item_condition, detail_values)
  # Create some entries manually.
  detail_values[:Description] = detail_doc.at_css('.aiDetailsDescription h2').next.text.strip  # Description must be first attribute.
  detail_values[:Condition] = item_condition
  detail_values[:Certified] = ''

  # Create the rest from scraping the html's detail attribute/value table.
  # Cells are consumed from the front: one cell for the Certified icon row,
  # two cells (attribute, value) for every other row.
  remaining = detail_cells(detail_doc).to_a
  until remaining.empty?
    cell = remaining.shift
    if cell.text == "\u00A0" && cell.at_css('span.aiCPOiconDetail')
      # The table row containing the Certified icon does not have a value column.
      detail_values[:Certified] = 'Yes'
    else
      # Grab the attribute and value from the html table. A trailing attribute
      # with no value cell is stored as '' instead of raising on nil.
      value_cell = remaining.shift
      attribute = cell.text.chomp(':')
      detail_values[attribute.to_sym] = value_cell ? value_cell.text : ''
    end
  end
end
scrape_results_page(results_url, results_url_file, results_doc, item_class) click to toggle source

Creates listings from summary web page

# File lib/classifieds/auto_scraper.rb, line 7
# Builds one Classifieds::Listing for every '.aiResultsWrapper' entry on a
# results (summary) page.
#
# results_url      - URL of the results page; used to resolve each detail link.
# results_url_file - forwarded to seller_phone together with the listing id.
# results_doc      - parsed results page; must respond to #css.
# item_class       - class instantiated with .new for each listed item.
#
# Returns whatever #each returns for the matched entries.
def self.scrape_results_page(results_url, results_url_file, results_doc, item_class)
  results_doc.css('.aiResultsWrapper').each do |entry|
    ad_id = listing_id(entry)
    year, make, model = title_parts(entry)

    # Date, mileage and price all come from the description pod.
    pod = entry.at_css('.aiDescriptionPod')
    listed_on = start_date(pod)
    miles = mileage(pod)
    price = sale_price(pod)

    # Seller name and location are read from the <strong> elements of the contact block.
    contact = entry.at_css('.contactLinks')
    contact_fields = contact.css('strong')
    name = seller_name(contact_fields)
    location = seller_location(contact_fields)
    phone = seller_phone(contact, ad_id, results_url_file)

    link = detail_url(results_url, entry)
    condition = item_condition(link)

    vehicle = item_class.new(year, make, model, miles, price, condition, link)
    vendor = Classifieds::Seller.find_or_create(name, location, phone)
    Classifieds::Listing.new(ad_id, vehicle, vendor, listed_on)
  end
end

Private Class Methods

detail_cells(doc) click to toggle source

Returns the detail table

# File lib/classifieds/auto_scraper.rb, line 63
# Selects every <td> nested under an element with class 'aiDetailAdDetails'.
#
# doc - object responding to #css.
#
# Returns the result of the #css query.
def self.detail_cells(doc)
  cell_selector = '.aiDetailAdDetails td'
  doc.css(cell_selector)
end
detail_url(url, doc) click to toggle source

Returns a summary record's detail page url

# File lib/classifieds/auto_scraper.rb, line 68
# Builds the absolute URL of a summary record's detail page.
#
# url - URL of the summary page; only its scheme, host and port are used.
# doc - summary record; must respond to #at_css. The href of its
#       '.aiResultTitle h3 a' link is appended as-is, so it is expected to be
#       a path relative to the summary page's domain.
#
# Returns the detail page URL as a String.
def self.detail_url(url, doc)  # detail link is given as relative to the summary page's domain
  uri = URI.parse(url)
  origin = "#{uri.scheme}://#{uri.host}"
  # Keep an explicit non-default port; otherwise the link would point at a
  # different server than the one the summary page was fetched from.
  origin += ":#{uri.port}" if uri.port && uri.port != uri.default_port
  "#{origin}#{doc.at_css('.aiResultTitle h3 a')['href']}"
end
item_condition(url) click to toggle source

Returns the item condition, as encoded in the detail page url

# File lib/classifieds/auto_scraper.rb, line 58
# Extracts the item condition encoded in a detail page URL.
#
# url - String containing '-certified-', '-new-' or '-used-' between an
#       alphanumeric character and a digit.
#
# Returns the condition with its first letter capitalized, e.g. 'Used'.
# Raises NoMethodError when the URL carries no condition segment.
def self.item_condition(url)
  condition = url[/[a-z0-9]-(certified|new|used)-[0-9]/, 1]
  condition.capitalize
end
listing_id(doc) click to toggle source

Returns the listing's id

# File lib/classifieds/auto_scraper.rb, line 74
# Extracts the numeric listing id from the 'id' attribute of the
# '.aiResultsMainDiv' element, e.g. "aiResultsMainDiv547967889" => "547967889".
#
# doc - object responding to #at_css.
#
# Returns the first run of digits as a String, or '' when there is none.
def self.listing_id(doc)
  element_id = doc.at_css('.aiResultsMainDiv')['id']
  element_id[/\d+/].to_s
end
mileage(doc) click to toggle source

Returns the listing's mileage value

# File lib/classifieds/auto_scraper.rb, line 79
# Reads the listing's mileage from the '.listingType' element.
#
# doc - object responding to #at_css.
#
# Returns the mileage digits (with any thousands separators) as a String,
# e.g. '45,678', or 'NA' when the text contains 'Available' (presumably
# "Not Available" - confirm against the site) or no mileage figure is found.
def self.mileage(doc)
  value = doc.at_css('.listingType').text  # e.g. 'Mileage: xx,xxx'
  return 'NA' if value.include?('Available')

  # Accept any number of comma-separated digit groups so that values such as
  # '1,234,567' are not truncated after the first group.
  found = value.match(/Mileage: (\d+(?:,\d+)*)/)
  found ? found[1] : 'NA'
end
sale_price(doc) click to toggle source

Returns the listing's sale price

# File lib/classifieds/auto_scraper.rb, line 85
# Reads the listing's sale price from the '.price' element.
#
# doc - object responding to #at_css.
#
# Returns 'Call' when the price text contains 'Call'; otherwise the price
# text unchanged.
def self.sale_price(doc)
  price_text = doc.at_css('.price').text
  return 'Call' if price_text.include?('Call')

  price_text
end
seller_location(doc) click to toggle source

Returns the seller's address

# File lib/classifieds/auto_scraper.rb, line 91
# Reads the seller's location from the second element of +doc+.
#
# doc - indexable collection whose elements respond to #text.
#
# Returns the element's text with surrounding whitespace removed.
def self.seller_location(doc)
  location_node = doc[1]
  location_node.text.strip
end
seller_name(doc) click to toggle source

Returns the seller's name

# File lib/classifieds/auto_scraper.rb, line 96
# Reads the seller's name from the first element of +doc+.
#
# doc - indexable collection whose elements respond to #text.
#
# Returns the element's text with surrounding whitespace removed.
def self.seller_name(doc)
  name_node = doc[0]
  name_node.text.strip
end
seller_phone(doc, id, url_file) click to toggle source

Returns the seller's phone number, if it exists

# File lib/classifieds/auto_scraper.rb, line 103
# Finds the seller's phone number for a listing.
#
# doc      - contact block; must respond to #at_css.
# id       - listing id, forwarded to seller_phone_private.
# url_file - forwarded to seller_phone_private.
#
# When the contact block has a <span>, returns the first PHONE_PATTERN match
# in its text ('' when nothing matches). Without a <span>, returns the result
# of seller_phone_private.
def self.seller_phone(doc, id, url_file)
  phone_span = doc.at_css('span')
  return seller_phone_private(url_file, id) unless phone_span

  # nil.to_s is '', which covers the "span present but no number" case.
  phone_span.text.match(/#{PHONE_PATTERN}/).to_s
end
seller_phone_private(url_file, id) click to toggle source

Returns the seller's phone number, if found in the raw HTML text

# File lib/classifieds/auto_scraper.rb, line 114
# Searches the raw HTML text for a listing's phone number.
#
# url_file - anything Kernel#open accepts; read line by line.
#            NOTE(review): Kernel#open treats a leading '|' as a command to
#            run - confirm callers only pass trusted paths.
# id       - listing id; lines are considered only when they contain
#            'aiGetPhoneNumber' immediately followed by exactly this id.
#
# Returns the first PHONE_PATTERN match found on such a line, or '' when no
# line qualifies.
def self.seller_phone_private(url_file, id)
  # The negative lookahead stops id '4' from matching 'aiGetPhoneNumber42'.
  marker = /aiGetPhoneNumber#{Regexp.escape(id.to_s)}(?!\d)/
  phone_regex = /#{PHONE_PATTERN}/  # built once rather than once per line
  # Block form closes the handle even on the early return below.
  open(url_file) do |io|
    io.each_line do |line|
      next unless line =~ marker

      found = line.match(phone_regex)
      return found.to_s if found
    end
  end
  ''
end
start_date(doc) click to toggle source

Returns listing start date

# File lib/classifieds/auto_scraper.rb, line 121
# Reads the listing's start date from the '.listingDate' element.
#
# doc - object responding to #at_css.
#
# Returns the element's text unchanged.
def self.start_date(doc)
  date_node = doc.at_css('.listingDate')
  date_node.text
end
title(doc) click to toggle source

Returns the summary listing's title

# File lib/classifieds/auto_scraper.rb, line 126
# Reads the summary listing's title from the '.aiResultTitle h3' element.
#
# doc - object responding to #at_css.
#
# Returns the title text unchanged, e.g. '2010 Ford Explorer XL'.
def self.title(doc)
  heading = doc.at_css('.aiResultTitle h3')
  heading.text
end
title_parts(doc) click to toggle source

Returns the year, make, and model from the title string. NOTE: It is assumed that the make is a single word.

Otherwise, a database of make names would likely be needed to match against.
# File lib/classifieds/auto_scraper.rb, line 133
# Splits the listing title into year, make and model.
#
# The first whitespace-separated word is taken as the year, the second as the
# make (assumed to be a single word) and everything after that as the model.
#
# doc - passed to title to obtain the title string.
#
# Returns [year, make, model]. For titles with fewer than two words the
# missing year/make are nil and the model is '' (no exception is raised).
def self.title_parts(doc)
  year, make, *model_words = title(doc).split(' ')
  [year, make, model_words.join(' ')]
end