class Scraper
Constants
- BASE_URL
Attributes
story[RW]
title[RW]
url[RW]
Public Class Methods
scrape_article(url)
click to toggle source
# File lib/scraper.rb, line 29
#
# Fetches one article page and extracts its headline, byline and body text.
#
# The body is built from the <p> and <h4> descendants of ".story-body", in
# document order, each followed by a blank line. An element is left out when
# its text is empty, when it is a <p> whose text is exactly "Advertisement",
# or when an element with exactly the same text was already collected.
#
# @param url [String] address of the article page to fetch
# @return [Hash] with keys :title, :author, :url and :story
def self.scrape_article(url)
  article = Mechanize.new.get(url)

  paragraphs = []
  article.search(".story-body *").each do |node|
    text = node.children.text
    next if text.empty?

    wanted = case node.name
             when "p" then text != "Advertisement"
             when "h4" then true
             else false
             end
    paragraphs << text if wanted
  end

  # De-duplicate on whole-element equality. A substring test against the
  # accumulated story would also discard short, legitimate paragraphs whose
  # words merely occur inside an earlier one.
  story = paragraphs.uniq.map { |text| "#{text}\n\n" }.join

  {
    title: article.search("//*[@id='headline']").text,
    author: article.search('.byline-author').text,
    url: url,
    story: story
  }
end
scrape_front_page()
click to toggle source
# File lib/scraper.rb, line 9
#
# Fetches the page at BASE_URL and lists the linked headlines found in its
# ".story-heading" elements.
#
# A heading is skipped when its link text is blank after stripping
# whitespace, or when its link carries no href attribute (calling `.value`
# on the missing attribute would otherwise raise NoMethodError).
#
# @return [Array<Hash>] one hash per headline, in page order, with keys
#   :title (stripped link text) and :url (the href value)
def self.scrape_front_page
  index = Mechanize.new.get(BASE_URL)

  index.css(".story-heading").each_with_object([]) do |heading, articles|
    links = heading.css("a")
    title = links.text.strip
    href = links.attribute("href")
    next if title.empty? || href.nil?

    articles << { title: title, url: href.value }
  end
end