class CorpusGenerator::Scraper
Constants
- BROWSE_LINK
- ROOT_LINK
Attributes
html_doc[RW]
Public Class Methods
new()
click to toggle source
# File lib/random_poetry_scraper/scraper.rb, line 6
# Fetches BROWSE_LINK and stores the parsed HTML document in +html_doc+.
#
# URI.open is used instead of a bare Kernel#open: open-uri's URL support in
# Kernel#open was deprecated in Ruby 2.7 and removed in Ruby 3.0. The block
# form closes the downloaded stream once Nokogiri has parsed it.
#
# NOTE(review): assumes BROWSE_LINK is an HTTP(S) URL and that 'open-uri' is
# required elsewhere in this file (the original open(...) call needed it
# too) -- confirm.
def initialize
  self.html_doc = URI.open(BROWSE_LINK) { |page| Nokogiri::HTML(page) }
end
Public Instance Methods
scrape_poem_page()
click to toggle source
# File lib/random_poetry_scraper/scraper.rb, line 10
# Extracts poem attributes from +html_doc+.
#
# Returns a Hash with:
#   :name - text of the "h2" nodes inside ".poem"
#   :text - inner HTML of the "p" nodes inside ".poem", with "<br>" turned
#           into newlines and "\r\n" + tab runs removed
#   :poet - present only when ".poet" yields non-empty text; holds :name and,
#           when a ".poem a" link with an href exists, :profile_url
#           (ROOT_LINK + href)
#
# The Hash itself is always returned (never nil), matching what the original
# implementation effectively did.
def scrape_poem_page
  poem = html_doc.css(".poem")

  poem_attributes = {
    name: poem.css("h2").text,
    text: poem.css("p").inner_html.gsub("<br>", "\n").gsub(/\r\n[\t]+/, "")
  }

  # The text of an empty match is "" (truthy in Ruby), so emptiness has to be
  # tested explicitly to skip pages without a poet.
  poet_name = html_doc.css(".poet").text
  unless poet_name.empty?
    poet = { name: poet_name }

    # attr("href") is nil when no link matches; guard before calling #value.
    href = html_doc.css(".poem a").attr("href")
    poet[:profile_url] = ROOT_LINK + href.value if href

    poem_attributes[:poet] = poet
  end

  poem_attributes
end