class Scraper
Public Class Methods
all_topics()
click to toggle source
Scrapes all main topics from the Portals contents page and creates a Topic instance for each one.
# File lib/scraper.rb, line 32
# Scrapes the main topic headlines from Wikipedia's "Portal:Contents/Portals"
# page, appends each cleaned headline string to @@all_topics, then calls
# Topic.new for every entry currently held in @@all_topics.
#
# NOTE(review): entries are appended on every call and nothing here clears
# @@all_topics, so calling this more than once records each headline again and
# calls Topic.new again for earlier entries — confirm callers invoke it once.
#
# Returns @@all_topics.
def self.all_topics
  # URI#open (provided by open-uri) replaces the bare Kernel#open call, which
  # no longer fetches URLs on Ruby 3.0+.
  html = URI.parse("https://en.wikipedia.org/wiki/Portal:Contents/Portals").open
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end

  # Tag the divs that wrap the main topic headlines. They are recognised by
  # their exact inline style, so any change to the page markup breaks the match.
  # NOTE(review): nothing later in this method looks up "title_container".
  title_style = 'position: relative;border: 0px solid #A3BFB1;background: #CEF2E0;color: black;padding: .1em;text-align: center;font-weight: bold;font-size: 100%;margin-bottom: 0px;border-top: 1px solid #A3BFB1;border-bottom: 1px solid #A3BFB1;'
  doc.search("#mw-content-text div table table div").each do |div|
    next unless div['style'] == title_style
    next if div.text.include?("General reference")

    div['class'] = "title_container"
  end

  # Tag every main topic headline with the "headlines" class, skipping the page
  # title and the "General reference" section.
  doc.search("h2 .mw-headline big").each do |headline|
    text = headline.text
    next if text == "Wikipedia's contents: Portals" || text.include?("General reference")

    headline['class'] = "headlines"
  end

  # Record the cleaned text of each tagged headline.
  doc.search(".headlines").each do |headline|
    topic = headline.text.chomp("(see in all page types)").strip
    # Drops the last three characters of the headline text.
    # NOTE(review): presumably leftover trailing markup text — confirm.
    topic.slice!(-3..-1)
    @@all_topics << topic
  end

  @@all_topics.each { |item| Topic.new(item) }
  @@all_topics
end
get_portal_name(url)
click to toggle source
# File lib/scraper.rb, line 63
# Fetches the page at +url+ and returns the text of its <title> element(s).
#
# url - String URL of the page to fetch.
#
# Prints a progress line to stdout after the page has been fetched.
# Returns a String (empty when the document has no <title>).
def self.get_portal_name(url)
  # URI#open (provided by open-uri) is used instead of the bare Kernel#open:
  # Kernel#open runs a subprocess when given a string starting with "|" and no
  # longer fetches URLs on Ruby 3.0+.
  html = URI.parse(url).open
  puts "***Scraping Portal Name"
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end
  doc.search("title").text
end
scrape_portals_page(name)
click to toggle source
# File lib/scraper.rb, line 4
# Picks a random sub-portal link for the given main topic from Wikipedia's
# "Portal:Contents/Portals" page.
#
# name - an entry of @@all_topics (as filled in by Scraper.all_topics).
#
# Prints a progress line to stdout after the page has been parsed.
# Returns the chosen portal URL as a String prefixed with
# "https://en.wikipedia.org".
# Raises ArgumentError when +name+ is not present in @@all_topics.
# Raises RuntimeError when the topic's link container or its portal links
# cannot be found on the page.
def self.scrape_portals_page(name)
  topic_position = @@all_topics.index(name)
  raise ArgumentError, "unknown topic: #{name.inspect}" unless topic_position

  # Per the original author's note there is one more links container than there
  # are main topics and the first container is skipped, hence the +1.
  choice_index = topic_position + 1

  # URI#open (provided by open-uri) replaces the bare Kernel#open call, which
  # no longer fetches URLs on Ruby 3.0+.
  html = URI.parse("https://en.wikipedia.org/wiki/Portal:Contents/Portals").open
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end
  puts "***Scraping Portals Page"

  # Tag the divs holding each topic's portal links. They are recognised by their
  # exact inline style, so any change to the page markup breaks the match.
  container_style = 'box-sizing: border-box; border: 0px solid #A3BFB1; border-bottom: 0px solid #A3BFB1;; border-top-width: 1px; vertical-align: top;background: #F5FFFA;opacity: 1; color: black; text-align: left; margin: 0 0 10px; padding: 1em;;padding-top: .3em;-moz-border-radius: 0; -webkit-border-radius: 0; border-radius: 0;'
  doc.search("div").each do |div|
    div['class'] = "portals-container" if div['style'] == container_style
  end

  container = doc.search(".portals-container")[choice_index]
  raise "no portal links container found for topic #{name.inspect}" unless container

  # Filter first, then pick from the filtered list. The previous code drew a
  # random index from the number of portal links but applied it to the list of
  # ALL links, so it could return a non-portal link.
  portal_paths = container.search("a")
                          .map { |link| link['href'] }
                          .select { |href| href && href.include?("/wiki/Portal:") }
  raise "no portal links found for topic #{name.inspect}" if portal_paths.empty?

  # Build a new string rather than mutating the parsed attribute value in place.
  "https://en.wikipedia.org#{portal_paths.sample}"
end