class Scraper

Constants

BASE_URL

Attributes

author[RW]
story[RW]
title[RW]
url[RW]

Public Class Methods

scrape_article(url)
# File lib/scraper.rb, line 29
# Fetches a single article page and extracts its headline, byline and body.
#
# The body is assembled from the text of every <p> and <h4> element found
# under ".story-body", each followed by a blank line. Paragraphs whose text
# is exactly "Advertisement" are left out.
#
# @param url [String] address of the article; passed unchanged to Mechanize#get
# @return [Hash] with the keys :title, :author, :url and :story (all Strings;
#   :url is the argument as given)
def self.scrape_article(url)
  article = Mechanize.new.get(url)

  story_hash = {
    :title => article.search("//*[@id='headline']").text,
    :author => article.search('.byline-author').text,
    :url => url
  }

  article_string = ""

  article.search(".story-body *").each do |node|
    text = node.children.text

    # Skip text that already occurs in the collected story. This is a
    # substring test, so it also drops empty text and any short fragment
    # contained in an earlier paragraph.
    next if article_string.include?(text)

    keep = case node.name
           when "p" then text != "Advertisement"
           when "h4" then true
           else false
           end

    article_string << text << "\n\n" if keep
  end

  story_hash[:story] = article_string
  story_hash
end
scrape_front_page()
# File lib/scraper.rb, line 9
# Fetches the page at BASE_URL and lists the stories linked from it.
#
# Every ".story-heading" element is inspected. A heading is skipped when its
# anchor text is blank (empty or whitespace only) or when no href can be read
# from its anchors, so one malformed heading cannot abort the whole scrape.
#
# @return [Array<Hash>] one Hash per story, with :title (anchor text with
#   surrounding whitespace stripped) and :url (the href value as found in the
#   page), in page order
def self.scrape_front_page
  index = Mechanize.new.get(BASE_URL)

  index.css(".story-heading").each_with_object([]) do |story, front_page_articles|
    links = story.css("a")

    title = links.text.strip
    next if title.empty?

    # attribute returns nil when the href is missing; calling .value on that
    # would raise NoMethodError, so such headings are skipped instead.
    href = links.attribute('href')
    next unless href

    front_page_articles << { :title => title, :url => href.value }
  end
end