class Classifieds::BoatScraper

Public Class Methods

scrape_results_detail_page(detail_doc, item_condition, detail_values) click to toggle source

Returns detail attributes and values in detail_values hash

# File lib/classifieds/boat_scraper.rb, line 30
# Fills detail_values with the attributes scraped from a listing's detail page.
#
# Pages that contain a '.boat-details' section are handed to the normal
# parser (together with that section); pages without one are handed to the
# alternate parser. The chosen parser's return value is passed through.
def self.scrape_results_detail_page(detail_doc, item_condition, detail_values)
  boat_section = detail_doc.css('.boat-details')
  return do_alt_processing(detail_doc, item_condition, detail_values) if boat_section.empty?

  do_normal_processing(detail_doc, boat_section, item_condition, detail_values)
end
scrape_results_page(results_url, results_url_file, results_doc, item_class) click to toggle source

Creates listings from summary web page

# File lib/classifieds/boat_scraper.rb, line 7
# Builds one Classifieds::Listing per result found on a summary page.
#
# results_url      - URL of the summary page; used to build each detail URL.
# results_url_file - not used by this method.
# results_doc      - parsed summary page; results are its '.boat-listings li' nodes.
# item_class       - class instantiated for each listed item.
#
# Results without a listing id are skipped.
def self.scrape_results_page(results_url, results_url_file, results_doc, item_class)
  results_doc.css('.boat-listings li').each do |result|
    listing_key = listing_id(result)
    next if listing_key.nil?

    summary = result.at_css('.description')
    year, make, model = split_title(summary)
    price = sale_price(summary)
    name = seller_name(summary)
    location = seller_location(summary)
    link = detail_url(results_url, result)

    # The summary page does not expose these, so fixed placeholders are used.
    condition = 'Used'
    phone = ''
    listed_on = ''

    item = item_class.new(year, make, model, price, condition, link)
    seller = Classifieds::Seller.find_or_create(name, location, phone)
    Classifieds::Listing.new(listing_key, item, seller, listed_on)
  end
end

Private Class Methods

detail_cells(doc) click to toggle source

Returns the rows of the detail table

# File lib/classifieds/boat_scraper.rb, line 44
# Returns the <tr> rows found under '#collapsible-content-areas' in doc.
def self.detail_cells(doc)
  row_selector = '#collapsible-content-areas tr'
  doc.css(row_selector)
end
detail_cells_alt(doc) click to toggle source

Returns the rows of the alternate-layout detail table

# File lib/classifieds/boat_scraper.rb, line 49
# Returns the <tr> rows found under '#ad_detail-table' in doc.
def self.detail_cells_alt(doc)
  row_selector = '#ad_detail-table tr'
  doc.css(row_selector)
end
detail_url(url, doc) click to toggle source

Returns a summary record's detail page url

# File lib/classifieds/boat_scraper.rb, line 54
# Returns the absolute URL of a summary record's detail page.
#
# url - URL of the summary page the record was scraped from.
# doc - summary record node; the href of its '.inner a' link is the detail link.
#
# The detail link is normally given relative to the summary page's domain.
# It is resolved against the summary URL with URI.join, so a non-default port
# on the summary URL is kept and an already-absolute href is returned as is.
# If the href cannot be parsed as a URI, the scheme and host of the summary
# URL are simply prepended to it.
def self.detail_url(url, doc)
  href = doc.at_css('.inner a')['href']
  URI.join(url, href).to_s
rescue URI::Error
  # Best effort for hrefs URI cannot parse (e.g. unescaped spaces).
  base = URI.parse(url)
  "#{base.scheme}://#{base.host}#{href}"
end
do_alt_processing(doc, item_condition, detail_values) click to toggle source

Returns detail attributes and values in detail_values hash

# File lib/classifieds/boat_scraper.rb, line 60
# Fills detail_values from an alternate-layout detail page.
#
# Insertion order is significant: Description goes in first and Phone last;
# everything scraped from the attribute/value table lands in between.
# Returns the phone value that was stored last.
def self.do_alt_processing(doc, item_condition, detail_values)
  content = doc.at_css('#main-content')

  # Entries that are not part of the attribute/value table.
  detail_values[:Description] = content.at_css('p').text.strip  # Description must be first attribute.
  detail_values[:Condition] = item_condition

  # The rest comes from the html's detail attribute/value table: each row's
  # children alternate between an attribute node and its value node.
  detail_cells_alt(content).each do |row|
    row.children.each_slice(2) do |label_node, value_node|
      detail_values[label_node.text.chomp(':').to_sym] = value_node.text
    end
  end

  detail_values[:Phone] = seller_phone(content)  # NOTE: keep phone number last in list for display consistency.
end
do_normal_processing(doc, boat_doc, item_condition, detail_values) click to toggle source

Returns detail attributes and values in detail_values hash

# File lib/classifieds/boat_scraper.rb, line 81
# Fills detail_values from a normal-layout detail page.
#
# doc      - the whole detail page.
# boat_doc - the page's boat-details section, which holds the detail table.
#
# Insertion order is significant: Description goes in first and Phone last.
# Returns the phone text that was stored last.
def self.do_normal_processing(doc, boat_doc, item_condition, detail_values)
  # Entries that are not part of the attribute/value table.
  detail_values[:Description] = doc.at_css('#main-details').text.strip
  detail_values[:Condition] = item_condition

  # The rest comes from the html's detail attribute/value table.
  detail_cells(boat_doc).each do |row|
    label_node = row.children[1]
    next unless label_node

    # A label cell that itself contains description lists needs the
    # alternate list parser.
    nested_lists = label_node.css('dl')
    if nested_lists.size > 0
      process_detail_list_alt(nested_lists, detail_values)
      next
    end

    label = label_node.text.chomp(':')
    value_node = row.at_css('td')

    if value_node
      # Substitute Yes for the video link.
      detail_values[label.to_sym] = label == 'Owner Video' ? 'Yes' : value_node.text
    else
      # No value cell: the row carries its data as a description list.
      process_detail_list(row.css('dl').children, detail_values)
    end
  end

  detail_values[:Phone] = doc.at_css('.phone').text  # NOTE: keep phone number last in list for display consistency.
end
listing_id(doc) click to toggle source

Returns the listing's id. If the listing doesn't have an id, it's not really a listing.

# File lib/classifieds/boat_scraper.rb, line 114
# Returns the text of the node's 'data-reporting-impression-product-id'
# attribute, or nil when the node has no such attribute (in which case it is
# not really a listing).
def self.listing_id(doc)
  id_attr = doc.attributes['data-reporting-impression-product-id']
  return unless id_attr

  id_attr.text
end
process_detail_list(doc, detail_values) click to toggle source

Parse description list data into hash

# File lib/classifieds/boat_scraper.rb, line 120
# Copies attribute/value pairs from a flat sequence of description-list nodes
# into detail_values.
#
# Nodes are consumed in groups of four; within each group the attribute is
# the node at offset 1 and its value the node at offset 3. A group with no
# attribute node, or with no value node, contributes nothing.
def self.process_detail_list(doc, detail_values)
  (0...doc.size).step(4) do |group_start|
    label_node = doc[group_start + 1]
    next if label_node.nil?

    label = label_node.text.chomp(':')
    value_node = doc[group_start + 3]
    next unless value_node

    detail_values[label.to_sym] = value_node.text
  end
end
process_detail_list_alt(doc, detail_values) click to toggle source

Parse description list data into hash

# File lib/classifieds/boat_scraper.rb, line 131
# Copies attribute/value pairs from each description list in doc into
# detail_values.
#
# Within a list's children the cursor starts at 1 and always sits on an
# attribute node. The value is normally two nodes further on; when that node
# is missing or blank, the node right after the attribute is used instead and
# the cursor advances by 3 rather than 4. A literal '&check;' in a value is
# replaced by 'Y'.
def self.process_detail_list_alt(doc, detail_values)
  (0...doc.size).each do |list_index|
    nodes = doc[list_index].children
    cursor = 1

    until cursor >= nodes.size
      label = nodes[cursor].text.chomp(':')
      candidate = nodes[cursor + 2]

      if candidate.nil? || candidate.text.strip.empty?
        value_node, advance = nodes[cursor + 1], 3
      else
        value_node, advance = candidate, 4
      end
      cursor += advance

      detail_values[label.to_sym] = value_node.text.sub('&check;', 'Y') if value_node
    end
  end
end
sale_price(doc) click to toggle source

Returns the listing's sale price

# File lib/classifieds/boat_scraper.rb, line 150
# Returns the listing's sale price text.
#
# Uses the '.price .data' node when present; otherwise falls back to the
# second child of the '.price .txt' node.
def self.sale_price(doc)
  data_node = doc.at_css('.price .data')
  return data_node.text if data_node

  doc.at_css('.price .txt').children[1].text
end
seller_location(doc) click to toggle source

Returns the seller's address

# File lib/classifieds/boat_scraper.rb, line 156
# Returns the seller's location text.
#
# Uses the '.location .data' node when present; otherwise falls back to the
# second child of the '.location .txt' node.
def self.seller_location(doc)
  data_node = doc.at_css('.location .data')
  return data_node.text if data_node

  doc.at_css('.location .txt').children[1].text
end
seller_name(doc) click to toggle source

Returns the seller's name

# File lib/classifieds/boat_scraper.rb, line 162
# Returns the seller's name text.
#
# Uses the '.offered-by .data' node when present; otherwise falls back to the
# second child of the '.offered-by .txt' node.
def self.seller_name(doc)
  data_node = doc.at_css('.offered-by .data')
  return data_node.text if data_node

  doc.at_css('.offered-by .txt').children[1].text
end
seller_phone(doc) click to toggle source

Returns the Seller's phone number

# File lib/classifieds/boat_scraper.rb, line 168
# Returns the seller's phone number from an alternate-layout listing.
#
# The alt listing phone has css format 'direction: rtl', so the node text is
# reversed to get the digits in reading order. After reversal the characters
# at positions 0 and 4 are overwritten with '(' and ')' (presumably because
# the parentheses around the area code come out mirrored -- confirm against a
# live page).
#
# Returns '' when the page has no '.seller-info .phone' node. Text shorter
# than five characters cannot hold "(ddd)" and is returned reversed but
# otherwise untouched instead of raising IndexError. The node's own text is
# never modified.
def self.seller_phone(doc)
  phone_node = doc.at_css('.seller-info .phone')
  return '' unless phone_node

  phone = phone_node.text.reverse  # non-destructive: works on frozen text too
  return phone if phone.size < 5

  phone[0] = '('
  phone[4] = ')'
  phone
end
split_title(doc) click to toggle source

Returns the year, make, and model from the title string. NOTE: It is assumed that Make will be one word.

Otherwise, a database of Make names will likely be needed to match against.
# File lib/classifieds/boat_scraper.rb, line 178
# Splits the summary listing's title into [year, make, model].
#
# The first whitespace-separated word is taken as the year, the second as the
# make (assumed to be a single word), and everything after that, re-joined
# with single spaces, as the model.
#
# Titles with fewer than two words no longer raise: the missing year/make
# come back as nil and the model as ''.
def self.split_title(doc)
  year, make, *model_words = title(doc).split
  [year, make, model_words.join(' ')]
end
title(doc) click to toggle source

Returns the summary listing's title

# File lib/classifieds/boat_scraper.rb, line 187
# Returns the text of the summary listing's '.name' node,
# e.g. '2000 JASON 25 Downeaster' (also Grady White, Grady-White, Sea Ray, ...).
def self.title(doc)
  name_node = doc.at_css('.name')
  name_node.text
end