class Amazoned::Parser

Attributes

html_doc[R]
product_hash[RW]
response[R]

Public Class Methods

new(response) click to toggle source
# File lib/amazoned/parser.rb, line 5
def initialize(response)
  @product_hash = Hash.new
  @response = response
  @html_doc = Nokogiri::HTML(response.body)
end

Public Instance Methods

call() click to toggle source
# File lib/amazoned/parser.rb, line 11
def call
  parse_response_for_product_details( response )
end
extract_subcategory_rankings(nokogiri_html) click to toggle source
# File lib/amazoned/parser.rb, line 90
def extract_subcategory_rankings(nokogiri_html)
  # Below is gnarly string manipulation to parse text strings like:
  # "\n    #2\n    in Baby > Baby Care > Health\n    \n    #2\n    in Baby > Baby Care > Pacifiers, Teethers & Teething Relief > Teethers\n    "
  # into:
  # [["2", "Baby > Baby Care > Health"], ["2", "Baby > Baby Care > Pacifiers, Teethers & Teething Relief > Teethers"]]
  nokogiri_html
  .map{|i| i.text}
  .map{|i| i.partition("in")
  .map(&:strip)}
  .map{|i| i - ["in"] }
  .map{|i|
    i.map{|ii|
      ii.gsub("#", "") # remove '#' from '#2'
      .gsub("\u00A0", "") # remove No-Break Space Unicode characters (U+00A0) since Ruby's .strip command won't remove them
    }
  }.each do |i|
    hsh = {}
    hsh[:rank] = i.first.to_i
    hsh[:ladder] = i.last
    product_hash[:best_sellers_rank] << hsh
  end
end
parse_response_for_product_details(response) click to toggle source
# File lib/amazoned/parser.rb, line 15
def parse_response_for_product_details(response)
  product_hash[:best_sellers_rank] = []

  ########
  # # Parent category Seller Rank Parser
  ########
  parsed_parent_category = html_doc.css('#SalesRank').text.partition("(").first.chop.partition("#").last.partition("in").map(&:strip) - ["in"]
  product_hash[:rank] = parsed_parent_category.first.delete(',').to_i # "903,610" -> 903610
  product_hash[:category] = parsed_parent_category.last

  ########
  # # Subcategory Seller Rank Parser
  ########
  extract_subcategory_rankings( html_doc.css('.zg_hrsr_item') )

  ########
  # # Package Dimension Parser
  ########
  # Package Dimension Parsing Strategy 1:
  product_hash[:package_dimensions] = html_doc.css('.size-weight').children.map{|r| r.text}.reject{|r| !r.match?("inches")}.first

  # Package Dimension Parsing Strategy 2:
  if product_hash[:package_dimensions].blank?

    # Find an index for the string "Package Dimensions" within a string text extraction of the page
    str_index = html_doc.inner_text.index("Package Dimensions")

    unless str_index.nil?

      # Reduce string representing the html page down to a smaller target string including "Package Dimensions" and the weights
      str = html_doc.inner_text[str_index .. str_index + 150]

      # Find within target string an index for where the word "inches" appears, then grab characters around it
      product_hash[:package_dimensions] = str[str.index("inches")- 20.. str.index("inches")+8].strip
    end
  end

  # Package Dimension Parsing Strategy 3:
    response.search('.//*[@class="a-color-secondary a-size-base prodDetSectionEntry"]').map{|n| n.parent}.each do |n|

      # Parse html in each row of Amazon's product details table to get back a string. E.g:  "\n    \n        Best Sellers Rank\n    \n    \n         \n              \n #63 in Toys & Games (See Top 100 in Toys & Games)\n        \n              \n #3 in Toys & Games > Baby & Toddler Toys > Teethers\n        \n              \n        \n    \n    "
      str = n.children.inner_text
      if product_hash[:best_sellers_rank].blank?
        str.match("Best Sellers Rank") do |m|
          # Gnarly string manipulation extracts the array: ["63", "in", "Toys & Games"]
          parsed_parent_category = str.partition("(").first.chop.partition("#").last.partition("in").map(&:strip)

          # From ["63", "in", "Toys & Games"] we only care about first & last parts of this array
          product_hash[:rank] = parsed_parent_category.first.delete(',').to_i
          product_hash[:category] = parsed_parent_category.last

          parsed_categories = str.partition(")").last.split("#").map(&:strip)
          parsed_categories.each do |pc|
            next if pc.blank?
            parsed_category = pc.partition("in").map(&:strip).map{|i| i.gsub("#", "")} - ["in"]

            hsh = {}
            hsh[:rank] = parsed_category.first.delete(',').to_i
            hsh[:ladder] = parsed_category.last
            product_hash[:best_sellers_rank] << hsh
         end
        end
      end

      if product_hash[:product_dimensions].blank?

        # Use pattern matching to extract the product details we care about
        str.match("Product Dimensions") do |m|
          product_hash[:package_dimensions] = str[str.index("inches")- 20.. str.index("inches")+8].strip
        end
      end
    end
  product_hash
end