class Datasets::Wikipedia

Constants

Contributor
Page
Revision

Public Class Methods

new(language: :en, type: :articles)
Calls superclass method Datasets::Dataset::new
# File lib/datasets/wikipedia.rb, line 28
# Builds a dataset handle for one Wikipedia dump.
#
# language: wiki language code, interpolated into the metadata fields
#           (default :en).
# type:     dump type, interpolated into the metadata fields
#           (default :articles).
#
# Invokes the superclass initializer first; @metadata is presumably
# created there (not visible in this listing).
def initialize(language: :en,
               type: :articles)
  super()
  @language = language
  @type = type

  # Fill in the descriptive metadata through a local alias.
  metadata = @metadata
  metadata.id = "wikipedia-#{@language}-#{@type}"
  metadata.name = "Wikipedia #{@type} (#{@language})"
  metadata.url = "https://dumps.wikimedia.org/"
  metadata.licenses = ["CC-BY-SA-3.0", "CC-BY-SA-4.0", "GFDL-1.3-or-later"]
  metadata.description = "Wikipedia #{@type} in #{@language}"
end

Public Instance Methods

each(&block)
# File lib/datasets/wikipedia.rb, line 44
# Stream-parses the dump, handing the given block to an ArticlesListener.
#
# Without a block, returns an Enumerator over this method instead of
# touching the data.
def each(&block)
  return enum_for(:each) unless block

  open_data do |stream|
    # The listener receives the caller's block; the stream parser drives it.
    REXML::Parsers::StreamParser.new(stream, ArticlesListener.new(block)).parse
  end
end

Private Instance Methods

open_data(&block)
# File lib/datasets/wikipedia.rb, line 55
# Ensures the bz2 dump for @language and the current type is present
# under cache_dir_path, downloading it from dumps.wikimedia.org when the
# file does not exist yet, then passes it to extract_bz2 together with
# the caller's block.
def open_data(&block)
  file_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
  local_path = cache_dir_path + file_name
  remote_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{file_name}"

  # Only fetch when no cached copy is on disk.
  download(local_path, remote_url) unless local_path.exist?

  extract_bz2(local_path, &block)
end
type_in_path()
# File lib/datasets/wikipedia.rb, line 66
# Returns the dump-type string for @type: :articles becomes
# "pages-articles"; any other value is returned via #to_s.
def type_in_path
  @type == :articles ? "pages-articles" : @type.to_s
end