class Wordwise::Scraper
Scrapes the web pages containing word lists and the individual word pages with their definitions.
Constants
- BASEPATH
Public Class Methods
scrape_entry_pages()
click to toggle source
Samples four URLs of words' pages and parses the question word, its origin and definition, and three more definitions.
# File lib/scraper.rb, line 46
#
# Fetches and parses the definition page of every question word, then
# extracts the origin text from the first word's page.
#
# The words come from Wordwise::CLI.question_words; each page URL is built
# as "#{BASEPATH}/definition/<word>".
#
# Returns the text of the last paragraph found under '.senseInnerWrapper'
# on the first word's page, or 'Origin not available.' when there is none.
def self.scrape_entry_pages
  question_words = Wordwise::CLI.question_words

  # One URL per question word.
  word_urls = question_words.map { |word| "#{BASEPATH}/definition/#{word}" }

  # URI#open (provided by open-uri) is used instead of Kernel#open, which no
  # longer opens URLs on Ruby 3.0+. The block form closes the downloaded
  # handle as soon as Nokogiri has parsed it.
  # NOTE(review): assumes BASEPATH is an http(s) URL - confirm.
  docs = word_urls.map do |url|
    URI.parse(url).open { |page| Nokogiri::HTML(page) }
  end

  # Sets variable for word origin.
  origin_wrapper = docs[0].css('.senseInnerWrapper p')[-1]
  origin_wrapper ? origin_wrapper.text : 'Origin not available.'
end
scrape_word_list(page_idx)
click to toggle source
Scrapes a page with a word list.
# File lib/scraper.rb, line 26
#
# Scrapes the word list page whose URL is stored at @list_urls[page_idx].
#
# page_idx - Index into @list_urls selecting the page to scrape.
#
# Stores the word => definition pairs in @words_defs and the same pairs as
# an array in @words_defs_ary.
#
# Returns the array of [word, definition] pairs.
def self.scrape_word_list(page_idx)
  # URI#open (provided by open-uri) is used instead of Kernel#open, which no
  # longer opens URLs on Ruby 3.0+. The block form closes the downloaded
  # handle as soon as Nokogiri has parsed it.
  doc = URI.parse(@list_urls[page_idx]).open { |page| Nokogiri::HTML(page) }

  # Creates hash of word-definition pairs. The rows are queried once rather
  # than re-running the CSS search on every iteration.
  @words_defs = {}
  doc.css('tr').each do |row|
    cells = row.css('td')
    # Rows without both a word cell and a definition cell carry no entry.
    next if cells.length < 2

    @words_defs.store(cells[0].text, cells[1].text)
  end

  # Removes invalid entries: the empty word, words containing a non-word
  # character, and words matching /xylene/ or /do/.
  @words_defs.delete('')
  @words_defs.delete_if do |word, _definition|
    word =~ /\W/ || word =~ /xylene/ || word =~ /do/
  end

  # Converts hash to array for use in .scrape_entry_pages.
  @words_defs_ary = @words_defs.to_a
end
scrape_word_lists()
click to toggle source
Scrapes page with list of word lists.
# File lib/scraper.rb, line 8
#
# Scrapes the page at "#{BASEPATH}/explore/word-lists", which lists the
# available word lists.
#
# Stores the URL of each list in @list_urls (BASEPATH followed by the link's
# href), dropping URLs that match /phobias/.
#
# Returns the array of list names, without names matching /Phobias/.
def self.scrape_word_lists
  # URI#open (provided by open-uri) is used instead of Kernel#open, which no
  # longer opens URLs on Ruby 3.0+. The block form closes the downloaded
  # handle as soon as Nokogiri has parsed it.
  # NOTE(review): assumes BASEPATH is an http(s) URL - confirm.
  html = URI.parse("#{BASEPATH}/explore/word-lists").open { |page| Nokogiri::HTML(page) }

  @list_urls = []
  lists = []

  # Populates arrays of word list names and urls. The node sets are queried
  # once; links and headings are paired by position, one per '.record'.
  links = html.css('.record a')
  headings = html.css('.record h2')
  html.css('.record').size.times do |i|
    @list_urls << "#{BASEPATH}#{links[i].attribute('href').value}"
    lists << headings[i].text
  end

  # Removes list not fitting format.
  @list_urls.delete_if { |url| url =~ /phobias/ }
  lists.delete_if { |name| name =~ /Phobias/ }
  lists
end