class Onebox::Engine::AmazonOnebox

Public Instance Methods

http_params() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 43
# Extra HTTP request headers derived from the engine options.
#
# Returns a Hash containing a single 'User-Agent' entry when
# +@options[:user_agent]+ is set, and nil otherwise.
def http_params
  user_agent = @options && @options[:user_agent]
  return unless user_agent

  { 'User-Agent' => user_agent }
end
tld() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 39
# The "tld" named capture obtained by matching +@url+ against +@@matcher+.
# The value is computed on first use and then kept in +@tld+.
def tld
  return @tld if @tld

  @tld = @@matcher.match(@url)["tld"]
end
url() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 16
# Resolves the URL this onebox links to.
#
# Order of preference:
# 1. the canonical link declared in the product document, when a document
#    is already loaded or can be loaded from the body cache;
# 2. "https://www.amazon.<tld>/dp/<id>", built from the product id found
#    in the original URL;
# 3. the original URL, unchanged.
def url
  @raw ||= nil

  # When a cached HTML body is available, load it right away so the
  # canonical URL can be read from the document instead of guessing at the
  # best URL structure.
  @raw = Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher) if !@raw && has_cached_body

  canonical_link = @raw && @raw.at('//link[@rel="canonical"]/@href')
  return canonical_link.to_s if canonical_link

  product_id = match && match[:id]
  return @url unless product_id

  encoded_id = Addressable::URI.encode_component(product_id, Addressable::URI::CharacterClasses::PATH)
  "https://www.amazon.#{tld}/dp/#{encoded_id}"
end

Private Instance Methods

data() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 107
# Builds the hash of fields used to render the onebox from the parsed
# product page returned by +raw+.
#
# Three layouts are distinguished by the class on the +#dp+ element:
# +book_mobile+ (printed books), +ebooks_mobile+ (ebooks) and a generic
# fallback for anything else.
#
# Returns a Hash that always carries +:link+, +:title+, +:image+ and
# +:price+. The two book layouts also set +:by_info+, +:description+,
# +:rating+, +:isbn_asin_text+, +:isbn_asin+, +:publisher+ and
# +:published+; the fallback sets +:by_info+ and, when a description is
# found, +:description+. +:price+ is replaced by nil for zero amounts.
def data
  og = ::Onebox::OpenGraph.new(raw)

  # Look the price up once: every call to #price re-queries the document.
  item_price = price
  has_price = !item_price.nil? && !item_price.empty?

  if raw.at_css('#dp.book_mobile') # printed books
    title = raw.at("h1#title")&.inner_text
    authors =
      if raw.at_css('#byline_secondary_view_div')
        multiple_authors("//div[@id='byline_secondary_view_div']//span[@class='a-text-bold']")
      else
        raw.at("#byline")&.inner_text
      end
    rating = raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || raw.at("#cmrsArcLink .a-icon")&.inner_text

    table_xpath = "//div[@id='productDetails_secondary_view_div']//table[@id='productDetails_techSpec_section_1']"
    isbn = raw.xpath("#{table_xpath}//tr[8]//td").inner_text.strip

    # if ISBN is misplaced or absent it's hard to find out which data is
    # available and where to find it so just set it all to nil
    if /^\d(\-?\d){12}$/.match(isbn)
      publisher = raw.xpath("#{table_xpath}//tr[1]//td").inner_text.strip
      published = raw.xpath("#{table_xpath}//tr[2]//td").inner_text.strip
    else
      isbn = publisher = published = nil
    end

    # A nil identifier means "absent"; it must not count as something that
    # follows the rating, otherwise a dangling ", " is emitted.
    has_isbn = !isbn.nil? && !isbn.empty?

    result = {
      link: url,
      title: title,
      by_info: authors,
      image: og.image || image,
      description: raw.at("#productDescription")&.inner_text,
      rating: "#{rating}#{', ' if rating && (has_isbn || has_price)}",
      price: item_price,
      isbn_asin_text: "ISBN",
      isbn_asin: isbn,
      publisher: publisher,
      published: "#{published}#{', ' if published && has_price}"
    }

  elsif raw.at_css('#dp.ebooks_mobile') # ebooks
    title = raw.at("#ebooksTitle")&.inner_text
    authors =
      if raw.at_css('#a-popover-mobile-udp-contributor-popover-id')
        multiple_authors("//div[@id='a-popover-mobile-udp-contributor-popover-id']//span[contains(@class,'a-text-bold')]")
      else
        raw.at("#byline")&.inner_text&.strip || raw.at("#bylineInfo")&.inner_text&.strip
      end
    rating = raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || raw.at("#cmrsArcLink .a-icon")&.inner_text || raw.at("#acrCustomerReviewLink .a-icon")&.inner_text

    table_xpath = "//div[@id='detailBullets_secondary_view_div']//ul"
    asin = raw.xpath("#{table_xpath}//li[4]/span/span[2]").inner_text

    # if ASIN is misplaced or absent it's hard to find out which data is
    # available and where to find it so just set it all to nil
    if /^[0-9A-Z]{10}$/.match(asin)
      publisher = raw.xpath("#{table_xpath}//li[2]/span/span[2]").inner_text
      published = raw.xpath("#{table_xpath}//li[1]/span/span[2]").inner_text
    else
      asin = publisher = published = nil
    end

    # Same rule as for ISBNs: a nil ASIN is absent, not present.
    has_asin = !asin.nil? && !asin.empty?

    result = {
      link: url,
      title: title,
      by_info: authors,
      image: og.image || image,
      description: raw.at("#productDescription")&.inner_text,
      rating: "#{rating}#{', ' if rating && (has_asin || has_price)}",
      price: item_price,
      isbn_asin_text: "ASIN",
      isbn_asin: asin,
      publisher: publisher,
      published: "#{published}#{', ' if published && has_price}"
    }

  else
    title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
    result = {
      link: url,
      title: title,
      image: og.image || image,
      price: item_price
    }

    # :by_info is always present in the result, nil when there is no by-line.
    by_line = raw.at("#by-line")
    result[:by_info] = by_line && Onebox::Helpers.clean(by_line.inner_html)

    summary = raw.at("#productDescription")

    description = og.description || summary&.inner_text
    description ||= raw.css("meta[name=description]").first&.[]("content")
    result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
  end

  # Hide zero amounts such as "$0.00" while keeping genuine sub-dollar
  # prices such as "$0.99", which a bare "$0" prefix test would also drop.
  # Going through to_s keeps a nil price from raising here.
  shown_price = result[:price]
  result[:price] = nil if shown_price == 0 || shown_price.to_s.match(/\A\$0+(?:[.,]0+)?(?![\d.,])/)

  result
end
has_cached_body() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 51
# Tells whether a cached response body can be used for this onebox's URI.
#
# Returns nil when there is no +body_cacher+, false when the cacher does not
# respond to +cache_response_body?+, and otherwise the combined answer of
# +cache_response_body?+ and +cached_response_body_exists?+ for +uri.to_s+.
def has_cached_body
  return nil if body_cacher.nil?
  return false unless body_cacher.respond_to?('cache_response_body?')

  target = uri.to_s
  body_cacher.cache_response_body?(target) && body_cacher.cached_response_body_exists?(target)
end
image() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 61
# Finds the product image URL in the parsed page returned by +raw+.
#
# Elements are tried in this order:
# 1. +#main-image+: the +data-a-hires+ attribute, else the first key of the
#    JSON object stored in +data-a-dynamic-image+;
# 2. +#landingImage+: the +data-old-hires+ attribute, else +src+;
# 3. +#ebooksImgBlkFront+: the first key of the JSON object stored in
#    +data-a-dynamic-image+.
#
# Returns a String, or nil when none of the elements yields an image.
def image
  if (main_image = raw.css("#main-image").first)
    attributes = main_image.attributes
    return attributes["data-a-hires"].to_s if attributes["data-a-hires"]

    if (dynamic_image = attributes["data-a-dynamic-image"])
      return ::JSON.parse(dynamic_image.value).keys.first
    end
  end

  if (landing_image = raw.css("#landingImage").first)
    hires = landing_image.attributes["data-old-hires"]
    return hires ? hires.to_s : landing_image["src"].to_s
  end

  # The ebook cover may lack the dynamic-image attribute; report "no image"
  # in that case instead of calling +value+ on nil.
  ebook_image = raw.css("#ebooksImgBlkFront").first
  dynamic_image = ebook_image && ebook_image.attributes["data-a-dynamic-image"]
  ::JSON.parse(dynamic_image.value).keys.first if dynamic_image
end
match() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 57
# Result of matching +@url+ against the product-path pattern; the named
# capture +id+ holds the product id. A successful match is kept in +@match+
# and reused by later calls. Returns nil when the URL does not match.
def match
  @match = @match || @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
end
multiple_authors(authors_xpath) click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 100
# Joins the stripped text of every node matched by +authors_xpath+ in +raw+
# into one comma-separated string.
#
# authors_xpath - XPath expression selecting the author nodes.
#
# Returns a String; it is empty when nothing matches.
def multiple_authors(authors_xpath)
  author_names = raw.xpath(authors_xpath).map(&:inner_text).map(&:strip)
  author_names.join(", ")
end
price() click to toggle source
# File lib/onebox/engine/amazon_onebox.rb, line 87
# Extracts the item price text from the parsed page returned by +raw+.
# Amazon's markup is inconsistent, so several locations are tried in order:
#
# 1. +#priceblock_ourprice+ split into +.restOfPrice+ / +.buyingPrice+
#    parts, assembled as "<rest[0]><buying>.<rest[1]>";
# 2. the first +span+ inside +#priceblock_dealprice+;
# 3. the whole text of +#priceblock_ourprice+;
# 4. the price of the active +.mediaMatrixListItem+.
#
# Returns a String, empty when no price is found.
def price
  rest_parts = raw.css("#priceblock_ourprice .restOfPrice")
  buying_part = raw.css("#priceblock_ourprice .buyingPrice")[0]

  # Use the split format only when all three pieces exist; otherwise fall
  # through to the other locations instead of calling inner_text on nil.
  if rest_parts[0] && rest_parts[1] && buying_part
    return "#{rest_parts[0].inner_text}#{buying_part.inner_text}.#{rest_parts[1].inner_text}"
  end

  deal_price = raw.css("#priceblock_dealprice span")[0]
  return deal_price.inner_text if deal_price

  our_price = raw.css("#priceblock_ourprice").inner_text
  return our_price unless our_price.empty?

  raw.css(".mediaMatrixListItem.a-active .a-color-price").inner_text
end