class InfoScraper

This is the Goodreads scraper. require_relative "EbookDealInfo"

Public Instance Methods

info_scrape(book) click to toggle source
# File lib/ebookdealinfo/info_scraper.rb, line 6
# Looks +book+ up on Goodreads and copies the details found there onto it.
#
# Builds a search query from the book's title and author, follows the search
# result that has the most ratings (presumably the most legitimate edition),
# and then overwrites +author+ and +title+ and fills in +series+, +rating+,
# +rates+, +blurb+, +genre_one+ and +genre_two+ from the item page.
#
# When no usable search result is found the book is flagged with
# <tt>completable = false</tt> instead of raising, so a caller looping over
# many books is not interrupted and can skip the book when displaying.
#
# book - object exposing +title+ and +author+ readers plus writers for the
#        attributes listed above and for +completable+.
#
# Returns false when the book could not be found; otherwise the return value
# is not meaningful. Network/HTTP errors raised while fetching are not rescued.
def info_scrape(book)
  # Turn "title author" into a Goodreads query: pad initials ("J.R." ->
  # "J. R. "), drop punctuation, drop stand-alone single characters (e.g.
  # initials), then collapse everything that is not alphanumeric or an
  # apostrophe into "+".
  author_words = book.author.gsub(".", ". ").gsub(/[^\w\s]/, "")
  search_string = "#{book.title} #{author_words}".gsub(/(\A|\s)\S\s/, " ").gsub(/[^a-zA-Z0-9']+/, "+")
  search_page = goodreads_page("https://www.goodreads.com/search?q=#{search_string}&search_type=books")

  item_path = search_page.css("table a").empty? ? nil : most_rated_result_path(search_page)
  unless item_path
    # No result (or a result page without the expected markup): flag the book
    # as incomplete rather than raising.
    book.completable = false
    return false
  end

  item_page = goodreads_page("https://goodreads.com/#{item_path}")

  # Full author name, since the incoming record may only carry a last name.
  book.author = item_page.search("div#bookAuthors.stacked span :not(.greyText) :not(.smallText)").text

  # The heading text holds the title on its first non-blank line; fall back to
  # the whole heading when that shape is not present instead of calling
  # strip on nil.
  heading_text = item_page.search("h1#bookTitle.bookTitle").text
  book.title = (heading_text.slice(/^(\n).+(.\n)/) || heading_text).strip

  book.series = item_page.search("h1#bookTitle.bookTitle :first-child").text.strip.gsub(/[()]/, "")
  book.rating = item_page.search("span.average").text # average rating
  book.rates = item_page.search("span.votes.value-title").text.strip # number of ratings

  # Prefer the second "freeText*" span as before; when the page has only one
  # such span use it, and when it has none leave the blurb empty.
  blurb_spans = item_page.xpath('//span[starts-with(@id, "freeText")]')
  blurb_node = blurb_spans[1] || blurb_spans[0]
  book.blurb = blurb_node ? blurb_node.text : ""

  # Only the top two genre entries are kept.
  genre_nodes = item_page.search("div.bigBoxContent div.elementList div.left")
  if genre_nodes.empty?
    book.genre_one = "No genre listed"
    book.genre_two = ""
  else
    book.genre_one = goodreads_genre_label(genre_nodes[0])
    book.genre_two = genre_nodes[1] ? goodreads_genre_label(genre_nodes[1]) : ""
  end
end

# Fetches +url+ with the 'Ruby' User-Agent header and returns the parsed
# Nokogiri document. Uses URI.open (open-uri) in block form so the response
# handle is closed once it has been read; a bare Kernel#open no longer accepts
# URLs on Ruby 3.0+. Assumes open-uri and nokogiri are already loaded by the
# application, as the previous bare open(url) call also required.
private def goodreads_page(url)
  URI.open(url, 'User-Agent' => 'Ruby') { |io| Nokogiri::HTML(io.read) }
end

# Returns the href of the search result with the highest number of ratings,
# or nil when the page has no rating snippets or no matching title link.
# Assumes the "span.minirating" entries and "table a.bookTitle" links appear
# in the same order on the page.
private def most_rated_result_path(search_page)
  rating_counts = search_page.css("span.minirating").map do |node|
    # First whitespace-prefixed run of digits/commas in the snippet; a snippet
    # without one counts as 0 ratings.
    node.text.strip.slice(/\s(\d|,)+/).to_s.strip.delete(",").to_i
  end
  _count, best_index = rating_counts.each_with_index.max_by { |count, _index| count }
  return nil unless best_index

  link = search_page.css("table a.bookTitle")[best_index]
  href = link && link.attribute("href")
  href && href.value
end

# Flattens a genre node's text into its non-blank, stripped lines, each
# followed by a single space (the trailing space matches the format this
# method has always produced).
private def goodreads_genre_label(node)
  node.text.split("\n").map(&:strip).reject(&:empty?).map { |part| "#{part} " }.join
end