class CorpusGenerator::Scraper

Constants

Attributes

html_doc[RW]

Public Class Methods

new()
# File lib/random_poetry_scraper/scraper.rb, line 6
# Fetches the page at BROWSE_LINK, parses it with Nokogiri, and stores the
# parsed document in +html_doc+.
#
# Uses URI.open rather than the bare Kernel#open: since Ruby 3.0 open-uri no
# longer patches Kernel#open, so a URL passed to plain +open+ is treated as
# a local path. The block form closes the underlying stream as soon as
# parsing has finished.
#
# NOTE(review): relies on open-uri already being required by this file —
# confirm against the file's require list.
def initialize
    self.html_doc = URI.open(BROWSE_LINK) { |page| Nokogiri::HTML(page) }
end

Public Instance Methods

scrape_poem_page()
# File lib/random_poetry_scraper/scraper.rb, line 10
# Extracts the poem (and, when present, its poet) from +html_doc+.
#
# @return [Hash] always a Hash with:
#   * +:name+ - text of the "h2" inside ".poem" ("" when absent)
#   * +:text+ - inner HTML of the "p" inside ".poem", with "<br>" turned
#     into newlines and CRLF-plus-tab runs removed
#   * +:poet+ - only when ".poet" has non-blank text: a Hash with +:name+
#     and, if a ".poem a" link exists, +:profile_url+ (ROOT_LINK + href)
def scrape_poem_page
    poem = html_doc.css(".poem")
    poem_attributes = {
        name: poem.css("h2").text,
        text: poem.css("p").inner_html.gsub("<br>", "\n").gsub(/\r\n[\t]+/, "")
    }

    # `text` returns "" (which is truthy in Ruby) when nothing matches, so
    # the presence of a poet has to be tested on the string's content.
    poet_name = html_doc.css(".poet").text
    unless poet_name.strip.empty?
        poet = { name: poet_name }
        # `attr` yields nil when no link matched; skip the URL instead of
        # raising NoMethodError on nil.
        profile_link = html_doc.css(".poem a").attr("href")
        poet[:profile_url] = ROOT_LINK + profile_link.value if profile_link
        poem_attributes[:poet] = poet
    end

    # The hash always carries :name and :text, so it is returned as-is.
    poem_attributes
end