class KdnuggetsRoundup::DataWrassler
Constants
- BASE_URL
web scraper class
- TOP_STORIES_PATH
Public Instance Methods
wrassle_article_attributes(article_url)
click to toggle source
# File lib/datawrassler.rb, line 29
# Scrapes a single article page and returns its attributes. Helper method
# meant to be called inside wrassle_top_stories.
#
# @param article_url [String] URL of the article page to fetch
# @return [Hash] with the keys
#   :author  - byline text with digits and punctuation stripped, or nil when
#              the bold post text does not match the expected byline pattern
#   :tags    - Array<String> of the tag link texts
#   :summary - text of the excerpt paragraph(s)
#   :excerpt - Array<String> holding the text of up to six body elements
def wrassle_article_attributes(article_url)
  # URI.open (from open-uri) rather than bare Kernel#open: Kernel#open stopped
  # fetching URLs in Ruby 3.0. The block form closes the stream after parsing.
  doc = URI.open(article_url) { |page| Nokogiri::HTML(page) }

  tags = doc.css('div.tag-data a').map(&:text)
  summary = doc.css('p.excerpt').text

  # First "word word<punct>" run in the bold post text; nil when nothing
  # matches, instead of raising NoMethodError on a nil MatchData.
  byline = doc.css('#post- b').text[/\S*\s\S*[[:punct:]]/]
  author = byline&.gsub(/[0-9[[:punct:]]]/, '')

  # The first two elements are normally bylines or other fluff, so skip them
  # and keep the next six (elements 3 through 8, as the original bounds did).
  excerpt = doc.css('p, ol, ul').drop(2).first(6).map(&:text)

  { author: author, tags: tags, summary: summary, excerpt: excerpt }
end
wrassle_top_stories()
click to toggle source
# File lib/datawrassler.rb, line 11
# Scrapes the top-stories page (BASE_URL + TOP_STORIES_PATH) and files every
# listed story as a KdnuggetsRoundup::Article.
#
# Per the original author's notes, the page shows the 7 most popular and the
# 7 most shared stories each week, and the two lists often overlap. The first
# seven list items are therefore added to "popular" and the rest to "shared".
# A title that is already known reuses its Article instead of scraping again.
#
# @return the collection of story nodes that was iterated
def wrassle_top_stories
  # URI.open (from open-uri) rather than bare Kernel#open: Kernel#open stopped
  # fetching URLs in Ruby 3.0. The block form closes the stream after parsing.
  doc = URI.open(BASE_URL + TOP_STORIES_PATH) { |page| Nokogiri::HTML(page) }
  stories = doc.css('ol.three_ol li')

  stories.each_with_index do |story, index|
    url = BASE_URL + story.css('a').attribute('href').text
    title = story.css('b').text

    # Look the title up once; only unseen titles trigger an article scrape.
    article = KdnuggetsRoundup::Article.find_by_title(title)
    unless article
      article = KdnuggetsRoundup::Article.new(title, url)
      article.assign_attributes(wrassle_article_attributes(url))
    end

    # Zero-based index: items 0..6 are the popular list, the rest are shared.
    index < 7 ? article.add_to_popular : article.add_to_shared
  end
end