class WikiReader
Public Class Methods
article(title, language = 'en')
click to toggle source
# File lib/wikipedia.rb, line 29
# Fetches an article, remembers its title in the per-language list and
# records its size in bytes.
#
# @param title [String] article title
# @param language [String] language code; must be a key of +db+
# @return the article content returned by +fetch_article+
# @raise [Errno::ENOENT] when the language is not a key of +db+, or when
#   +fetch_article+ returns nothing
def self.article(title, language = 'en')
  raise Errno::ENOENT unless db.key?(language) # invalid language

  db[language] ||= fetch_vital(language) # not initialized yet

  content = fetch_article(title, language) # read from the internet
  raise Errno::ENOENT unless content

  known_titles = db[language]
  known_titles << title unless known_titles.include?(title) # adds to our list

  size_table = (@@sizes[language] ||= {})
  size_table[title] = content.bytesize
  content
end
articles(language = 'en')
click to toggle source
Returns all articles held in memory for the given language, or nil if the language is unknown.
# File lib/wikipedia.rb, line 18
# Lists the article titles held in memory for a language, loading the list
# through +fetch_vital+ the first time the language is requested.
#
# @param language [String] language code
# @return [Array, nil] the stored list, or nil when the language is not a
#   key of +db+
def self.articles(language = 'en')
  table = db
  return nil unless table.key?(language) # invalid language

  table[language] ||= fetch_vital(language) # not initialized yet
  table[language]
end
db()
click to toggle source
# File lib/wikipedia.rb, line 48
# Returns the language table, building it on first use.
#
# The table maps every code returned by +fetch_languages+ to nil; the other
# class methods replace the nil with that language's article list.
#
# @return [Hash] language code => article list (or nil until loaded)
def self.db
  unless @@db
    @@db = fetch_languages.each_with_object({}) do |code, table|
      table[code] = nil
    end
  end
  @@db
end
languages()
click to toggle source
# File lib/wikipedia.rb, line 25
# Lists the language codes known to +db+.
#
# @return [Array] the keys of the language table
def self.languages
  db.each_key.to_a
end
size(title, language = 'en')
click to toggle source
# File lib/wikipedia.rb, line 43
# Looks up the byte size recorded for an article.
#
# @param title [String] article title
# @param language [String] language code
# @return the recorded size, or 1 when nothing has been recorded for
#   this title
def self.size(title, language = 'en')
  per_language = (@@sizes[language] ||= {})
  per_language[title] || 1
end
Private Class Methods
fetch_article(title, language = 'en')
click to toggle source
# File lib/wikipedia.rb, line 55
# Downloads an article by requesting its wiki page with "action=raw".
#
# @param title [String] article title; percent-escaped before it is placed
#   in the URL path
# @param language [String] language code used as the Wikipedia subdomain
# @return the content returned by +@@http.get_content+, or nil when a
#   StandardError is raised (the error is printed with +puts+)
def self.fetch_article(title, language = 'en')
  # URI.encode was removed in Ruby 3.0; the default parser's #escape is the
  # routine it used to delegate to. That routine leaves '?' untouched, so it
  # is escaped explicitly to stop a title containing '?' from starting the
  # query string ahead of "?action=raw".
  escaped_title = URI::DEFAULT_PARSER.escape(title).gsub('?', '%3F')
  url = "https://#{language}.wikipedia.org/wiki/#{escaped_title}?action=raw"
  @@http.get_content(url)
rescue => e
  puts e
  nil
end
fetch_languages()
click to toggle source
# File lib/wikipedia.rb, line 65
# Builds the list of language codes to support.
#
# Scrapes the "List of Wikipedias" page on meta.wikimedia.org and collects
# the text of the fourth cell of each row found after the two headings
# selected below. Any error while scraping is swallowed; whatever was
# collected before the error is kept. When nothing was collected, a
# hard-coded list is used instead.
#
# @return [Array<String>] at most the first 15 codes
def self.fetch_languages
  # hardcoded list (used as fallback)
  fallback = %w[
    en ceb sv de nl fr ru it es war pl vi ja pt zh uk ca fa ar no sh fi hu
    id cs ko ro sr tr ms eu eo bg da min kk hy sk zh-min-nan he lt hr ce et
    sl be gl nn el uz la simple vo ur hi az th ka
  ]

  # fetch from the internet
  codes = nil
  begin
    page = Nokogiri::HTML(@@http.get_content('https://meta.wikimedia.org/wiki/List_of_Wikipedias'))
    ['[id="1_000_000.2B_articles"]', '[id="100_000.2B_articles"]'].each do |selector|
      heading = page.at_css(selector).xpath('./ancestor::h3')[0]
      column = heading.next.next.children.xpath('td[4]').map(&:text)
      codes = codes ? codes + column : column
    end
  rescue
    # best effort: fall through with whatever was gathered so far
  end

  # uses the fallback list if there are no results
  usable = codes.instance_of?(Array) && codes.any?
  (usable ? codes : fallback).first(15)
end
fetch_vital(language = 'en')
click to toggle source
# File lib/wikipedia.rb, line 90
# Collects article titles linked from the "Vital articles" page.
#
# Loads the English page first. For any other language it follows that
# page's interlanguage link for the requested language and reads the
# linked page instead.
#
# @param language [String] language code
# @return [Array<String>] unique titles taken from the page's links, with
#   percent-escapes decoded and any "#anchor" suffix removed; an empty
#   array when the language has no interlanguage link or when any
#   StandardError is raised while fetching or parsing
def self.fetch_vital(language = 'en')
  url = 'https://en.wikipedia.org/wiki/Wikipedia:Vital_articles'
  doc = Nokogiri::HTML(@@http.get_content(url))

  # not english, follow the link in the "Languages" section (left column)
  if language != 'en'
    link = doc.at_css("a.interlanguage-link-target[lang='#{language}']")
    return [] unless link

    url = link.attribute('href')
    doc = Nokogiri::HTML(@@http.get_content(url))
    return [] unless doc
  end

  # get all links starting with "/wiki/" and with no class
  # (it will get some extra non-vital articles, but it's ok)
  titles = doc.xpath('//li/a[not (@class)][starts-with(@href, "/wiki/")]').map do |anchor|
    href = anchor.attribute('href').text[6..-1] # remove initial '/wiki/'
    # URI.decode was removed in Ruby 3.0; the default parser's #unescape is
    # the routine it used to delegate to.
    href = URI::DEFAULT_PARSER.unescape(href) # convert non-ascii chars
    pos = href.index('#')
    pos ? href[0, pos] : href # remove anchor
  end
  titles.uniq
rescue StandardError
  # best effort: any fetch or parse failure yields an empty list
  []
end