class Datasets::PennTreebank

Constants

DESCRIPTION
Record

Public Class Methods

new(type: :train)
Calls superclass method Datasets::Dataset::new
# File lib/datasets/penn-treebank.rb, line 16
# Sets up the dataset for a single Penn Treebank split and fills in the
# dataset metadata (id, name, description, url, licenses).
#
# @param type [Symbol] which split to use: :train, :test or :valid
# @raise [ArgumentError] if type is not one of the supported splits
def initialize(type: :train)
  allowed = %i[train test valid]
  unless allowed.include?(type)
    listing = allowed.map(&:inspect).join(", ")
    raise ArgumentError, "Type must be one of [#{listing}]: #{type.inspect}"
  end

  # Remember the split before the superclass initializer runs.
  @type = type
  super()

  @metadata.id = "penn-treebank-#{@type}"
  @metadata.name = "Penn Treebank: #{@type}"
  @metadata.description = DESCRIPTION
  @metadata.url = "https://github.com/wojzaremba/lstm"
  @metadata.licenses = ["Apache-2.0"]
end

Public Instance Methods

each(&block)
# File lib/datasets/penn-treebank.rb, line 34
# Yields every record of the selected split to the block.
#
# The data file "ptb.<type>.txt" is looked up under cache_dir_path; when it
# is not there yet it is fetched with download before being parsed.
#
# @yield [record] each record produced by parse_data
# @return [Enumerator] when called without a block
def each(&block)
  return enum_for(__method__) if block.nil?

  file_name = "ptb.#{@type}.txt"
  local_path = cache_dir_path + file_name
  base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
  # Only fetch when the file is not already present locally.
  download(local_path, [base_url, file_name].join("/")) unless local_path.exist?

  parse_data(local_path, &block)
end

Private Instance Methods

parse_data(data_path) { |record| ... }
# File lib/datasets/penn-treebank.rb, line 48
# Reads the file at data_path line by line, splits every line on
# whitespace and yields one Record per word, in file order.
#
# @param data_path [String, Pathname] path of the text file to read
# @yield [record] a Record built from each stripped word
def parse_data(data_path)
  File.open(data_path) do |input|
    input.each_line do |text|
      # strip is kept after split: it also drops NUL bytes that split leaves in.
      text.split.map(&:strip).each { |token| yield(Record.new(token)) }
    end
  end
end