class Wikihow::Scraper
Public Class Methods
scrape_for_categories()
click to toggle source
# File lib/wikihow/scraper.rb, line 2
# Fetches the wikiHow main page and collects every link found under the
# "#hp_categories" element.
#
# The page is fetched with URI.open (from open-uri) instead of a bare `open`:
# Kernel#open no longer accepts URLs as of Ruby 3.0.
#
# @return [Array<Hash>] one hash per link, holding the link text under :title
#   and the value of its href attribute under :url
def self.scrape_for_categories
  doc = Nokogiri::HTML(URI.open("https://www.wikihow.com/Main-Page"))

  doc.search("#hp_categories a").map do |category|
    { title: category.text, url: category.attr("href") }
  end
end
scrape_for_topics(category)
click to toggle source
# File lib/wikihow/scraper.rb, line 13
# Fetches a category page and collects the topic links found under
# "#cat_container #cat_all".
#
# The page is fetched with URI.open (from open-uri) instead of a bare `open`:
# Kernel#open no longer accepts URLs as of Ruby 3.0.
#
# @param category [#url] object whose #url is appended to
#   "https://www.wikihow.com" to build the page address
# @return [Array<Hash>] one hash per link whose <span> text is non-blank,
#   holding the stripped span text under :title and the href under :url
def self.scrape_for_topics(category)
  # String#+ is kept on purpose: a nil url raises here rather than silently
  # requesting the site root.
  doc = Nokogiri::HTML(URI.open("https://www.wikihow.com" + category.url))

  doc.search("#cat_container #cat_all a").each_with_object([]) do |topic, topics|
    title = topic.search("span").text.strip
    next if title.empty? # links without a text label are skipped

    topics << { title: title, url: topic.attr("href") }
  end
end
scrape_topic(topic)
click to toggle source
# File lib/wikihow/scraper.rb, line 24
# Fetches a topic page, stores its intro text on the topic, and returns the
# page's sections with their steps.
#
# The page is fetched with URI.open (from open-uri) instead of a bare `open`:
# Kernel#open no longer accepts URLs as of Ruby 3.0.
# NOTE(review): topic.url is passed through unchanged; presumably it is always
# an absolute http(s) URL - confirm against the code that builds topics.
#
# @param topic [#url, #intro=] object providing the page URL; its intro is
#   set to the text of the last "#intro p" element, or "" when there is none
# @return [Array<Hash>] one hash per ".toc_method" entry:
#   :section_title is the entry text; :section_steps is an array of steps.
#   Each step is an array whose first element is the step text, followed by
#   one array per bullet ([bullet_text] or [bullet_text, [sub_bullet, ...]]).
#   A section without steps gets a single placeholder step.
def self.scrape_topic(topic)
  doc = Nokogiri::HTML(URI.open(topic.url))

  # A page without an intro paragraph yields an empty intro instead of raising.
  intro_paragraph = doc.search("#intro p").last
  topic.intro = intro_paragraph ? intro_paragraph.text : ""

  # Looked up once; the i-th ".steps" list belongs to the i-th TOC entry.
  step_lists = doc.search(".steps")

  doc.search("#intro #method_toc .toc_method").each_with_index.map do |method, i|
    # Fewer ".steps" lists than TOC entries: treat the section as step-less.
    step_list = step_lists[i]
    steps = step_list ? step_list.search(".step") : []

    section_steps = steps.map do |step|
      headline = step.search(".whb").text.strip
      detail = step.search("> text()").text.strip

      bullets = step.search("> ul > li").map do |bullet|
        entry = [bullet.search("> text(), a").text.strip]
        sub_bullets = bullet.search("> ul > li").map { |sub| sub.search("> text()").text.strip }
        entry << sub_bullets unless sub_bullets.empty?
        entry
      end

      ["#{headline} #{detail}", *bullets]
    end

    section_steps << ["There is no text description for this section"] if section_steps.empty?

    { section_title: method.text, section_steps: section_steps }
  end
end