class TensorStream::Train::AdamOptimizer

High-level implementation of the Adam optimization algorithm (Kingma & Ba).
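Typical usage pairs the optimizer with a loss through minimize. A minimal sketch, assuming the gem's TensorFlow-style top-level helpers (TensorStream.variable, TensorStream.session, TensorStream.global_variables_initializer) and operator overloading on tensors:

  require "tensor_stream"

  ts = TensorStream
  w = ts.variable(0.0, name: "weight")      # single parameter to fit
  loss = (w - 3.0) * (w - 3.0)              # toy quadratic loss, minimum at 3.0
  train_op = TensorStream::Train::AdamOptimizer.new(0.1).minimize(loss)

  sess = ts.session
  sess.run(ts.global_variables_initializer)
  200.times { sess.run(train_op) }          # w moves toward 3.0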

Attributes

learning_rate[RW]
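Since learning_rate is exposed as a read/write attribute, it can be inspected or adjusted after construction. A small sketch; note that prepare() converts the value to a tensor when gradients are applied, so set it before building the training op:

  opt = TensorStream::Train::AdamOptimizer.new(0.001)
  opt.learning_rate             # => 0.001
  opt.learning_rate = 0.0005    # adjust before calling minimize/apply_gradients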

Public Class Methods

new(learning_rate = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, use_locking: false, name: "Adam")

Construct a new Adam optimizer.

Args:

  learning_rate: A Tensor or a floating point value. The learning rate.
  beta1: A float value or a constant float tensor. The exponential decay
    rate for the 1st moment estimates.
  beta2: A float value or a constant float tensor. The exponential decay
    rate for the 2nd moment estimates.
  epsilon: A small constant for numerical stability. This epsilon is
    "epsilon hat" in the Kingma and Ba paper (in the formula just before
    Section 2.1), not the epsilon in Algorithm 1 of the paper.
  use_locking: If true, use locks for update operations.
  name: Optional name for the operations created when applying gradients.
    Defaults to "Adam".

Calls superclass method
# File lib/tensor_stream/train/adam_optimizer.rb, line 24
def initialize(learning_rate = 0.001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8,
  use_locking: false, name: "Adam")
  @learning_rate = learning_rate
  @beta1 = beta1
  @beta2 = beta2
  @epsilon = epsilon

  # Tensor versions of the constructor arguments, created in prepare().
  @lr_t = nil
  @beta1_t = nil
  @beta2_t = nil
  @epsilon_t = nil

  # Created in SparseApply if needed.
  @updated_lr = nil
  super(name: name, use_locking: use_locking)
end
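All hyperparameters are positional except the use_locking: and name: keywords, so overriding beta2 and epsilon looks like this (values here are arbitrary examples):

  opt = TensorStream::Train::AdamOptimizer.new(0.0005, 0.9, 0.99, 1e-7,
    use_locking: false, name: "AdamFineTune")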

Protected Instance Methods

apply_dense(grad, var)
# File lib/tensor_stream/train/adam_optimizer.rb, line 74
def apply_dense(grad, var)
  m = get_slot(var, "m")
  v = get_slot(var, "v")
  beta1_power, beta2_power = get_beta_accumulators
  _op(:apply_adam,
    var, m, v,
    TensorStream.cast(beta1_power, var.data_type),
    TensorStream.cast(beta2_power, var.data_type),
    TensorStream.cast(@lr_t, var.data_type),
    TensorStream.cast(@beta1_t, var.data_type),
    TensorStream.cast(@beta2_t, var.data_type),
    TensorStream.cast(@epsilon_t, var.data_type),
    grad, use_locking: @use_locking)
end
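apply_dense delegates the whole update to the fused :apply_adam op. For reference, the dense Adam update from Kingma & Ba, in the "epsilon hat" form noted above, reduces to the following scalar arithmetic; this is an illustrative sketch of what the op is expected to compute, not the backend implementation:

  def adam_step(var, m, v, grad, lr, beta1, beta2, epsilon, beta1_power, beta2_power)
    lr_t  = lr * Math.sqrt(1 - beta2_power) / (1 - beta1_power)  # bias-corrected step size
    m_t   = beta1 * m + (1 - beta1) * grad                       # 1st moment estimate
    v_t   = beta2 * v + (1 - beta2) * grad * grad                # 2nd moment estimate
    var_t = var - lr_t * m_t / (Math.sqrt(v_t) + epsilon)        # parameter update
    [var_t, m_t, v_t]
  end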
create_slots(var_list)
# File lib/tensor_stream/train/adam_optimizer.rb, line 62
def create_slots(var_list)
  first_var = var_list.min_by(&:name)
  create_non_slot_variable(@beta1, "beta1_power", first_var)
  create_non_slot_variable(@beta2, "beta2_power", first_var)

  # Create slots for the first and second moments.
  var_list.each do |v|
    zeros_slot(v, "m", @name)
    zeros_slot(v, "v", @name)
  end
end
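create_slots allocates two zero-initialized slots, "m" and "v", for each trainable variable, plus two scalar accumulators shared by all of them. In plain Ruby terms the bookkeeping looks roughly like this (arrays standing in for tensors; purely illustrative):

  vars = { "weight" => [0.5, -0.2], "bias" => [0.1] }
  slots = vars.transform_values do |value|
    { "m" => Array.new(value.size, 0.0), "v" => Array.new(value.size, 0.0) }
  end
  accumulators = { "beta1_power" => 0.9, "beta2_power" => 0.999 }  # initialized to beta1, beta2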
finish(update_ops, name_scope)
# File lib/tensor_stream/train/adam_optimizer.rb, line 89
def finish(update_ops, name_scope)
  TensorStream.control_dependencies(update_ops) do
    beta1_power, beta2_power = get_beta_accumulators
    update_beta1 = update_beta2 = nil  # declared here so they are visible after the block
    TensorStream.colocate_with(beta1_power) do
      update_beta1 = beta1_power.assign(beta1_power * @beta1_t, use_locking: @use_locking)
      update_beta2 = beta2_power.assign(beta2_power * @beta2_t, use_locking: @use_locking)
    end
    TensorStream.group(update_ops + [update_beta1, update_beta2], name: name_scope)
  end
end
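Each time finish runs, the accumulators are multiplied by beta1 and beta2, so after t applied steps they hold beta1**t and beta2**t, the quantities the bias correction in the update sketch above divides by. A scalar sketch:

  beta1, beta2 = 0.9, 0.999
  beta1_power, beta2_power = beta1, beta2   # initial values set in create_slots
  2.times do                                # two more applied training steps
    beta1_power *= beta1                    # what update_beta1 assigns
    beta2_power *= beta2                    # what update_beta2 assigns
  end
  # beta1_power is now roughly beta1**3 and beta2_power roughly beta2**3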
get_beta_accumulators()
# File lib/tensor_stream/train/adam_optimizer.rb, line 44
def get_beta_accumulators
  graph = TensorStream.get_default_graph
  [get_non_slot_variable("beta1_power", graph: graph),
   get_non_slot_variable("beta2_power", graph: graph),]
end
prepare()
# File lib/tensor_stream/train/adam_optimizer.rb, line 50
def prepare
  lr = call_if_callable(@learning_rate)
  beta1 = call_if_callable(@beta1)
  beta2 = call_if_callable(@beta2)
  epsilon = call_if_callable(@epsilon)

  @lr_t = TensorStream.convert_to_tensor(lr, name: "learning_rate")
  @beta1_t = TensorStream.convert_to_tensor(beta1, name: "beta1")
  @beta2_t = TensorStream.convert_to_tensor(beta2, name: "beta2")
  @epsilon_t = TensorStream.convert_to_tensor(epsilon, name: "epsilon")
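Because prepare resolves each hyperparameter through call_if_callable before converting it to a tensor, a callable can supply the value lazily. A hedged sketch; whether a Proc is accepted end to end depends on the Optimizer base class, so treat the decay schedule below as hypothetical:

  step = 0
  decayed_lr = -> { 0.001 * (0.96 ** (step / 1000.0)) }  # hypothetical decay schedule
  opt = TensorStream::Train::AdamOptimizer.new(decayed_lr)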
end