class Elasticrawl::Cluster

Configures the cluster settings for the job flow that will be launched. These settings are loaded from ~/.elasticrawl/cluster.yml.

Public Class Methods

new() click to toggle source
# File lib/elasticrawl/cluster.rb, line 5
# Builds the instance groups for the cluster. The master and core groups
# are always created; the task group is only created when
# has_task_group? reports that one is configured.
def initialize
  @master_group, @core_group = %w[master core].map { |role| instance_group(role) }
  return unless has_task_group?

  @task_group = instance_group('task')
end

Public Instance Methods

cluster_desc() click to toggle source

Describes the instances that will be launched. This is used by the job confirmation messages.

# File lib/elasticrawl/cluster.rb, line 33
    # Returns a multi-line, newline-terminated summary of the master, core
    # and task instance groups, one group per line under a heading.
    def cluster_desc
      [
        'Cluster configuration',
        "Master: #{instance_group_desc(@master_group)}",
        "Core:   #{instance_group_desc(@core_group)}",
        "Task:   #{instance_group_desc(@task_group)}"
      ].map { |line| "#{line}\n" }.join
    end
create_job_flow(job, emr_config = nil) click to toggle source

Returns a configured job flow to the calling job.

# File lib/elasticrawl/cluster.rb, line 12
# Builds and returns an Elasticity job flow for the given job.
#
# job        - object providing job_name, job_desc and log_uri.
# emr_config - optional value forwarded to configure_bootstrap_actions.
#
# Elasticity is configured with the access key and secret key read from
# a new Config instance before the job flow is created.
def create_job_flow(job, emr_config = nil)
  credentials = Config.new

  Elasticity.configure do |settings|
    settings.access_key = credentials.access_key_id
    settings.secret_key = credentials.secret_access_key
  end

  Elasticity::JobFlow.new.tap do |flow|
    flow.name = "Job: #{job.job_name} #{job.job_desc}"
    flow.log_uri = job.log_uri

    configure_job_flow(flow)
    configure_instances(flow)
    configure_bootstrap_actions(flow, emr_config)
  end
end

Private Instance Methods

config_for_group(group_name) click to toggle source

Returns the config settings for an instance group.

# File lib/elasticrawl/cluster.rb, line 129
# Looks up the "<group_name>_instance_group" setting and returns whatever
# config_setting yields for that key.
def config_for_group(group_name)
  section_key = "#{group_name}_instance_group"
  config_setting(section_key)
end
config_setting(key_name) click to toggle source

Returns a config setting from cluster.yml.

# File lib/elasticrawl/cluster.rb, line 134
# Loads the 'cluster' config through a new Config instance and returns
# the entry stored under key_name.
def config_setting(key_name)
  cluster_settings = Config.new.load_config('cluster')
  cluster_settings[key_name]
end
configure_bootstrap_actions(job_flow, emr_config = nil) click to toggle source

Configures bootstrap actions that will be run when each instance is launched. EMR config is an XML file of Hadoop settings stored on S3. These settings are applied to each node by a bootstrap action.

# File lib/elasticrawl/cluster.rb, line 71
# Adds bootstrap actions to the job flow.
#
# job_flow   - object that receives add_bootstrap_action calls.
# emr_config - optional value; when present it is wrapped in an
#              Elasticity::HadoopFileBootstrapAction and added last.
#
# Each entry of the 'bootstrap_scripts' setting (when present) becomes an
# Elasticity::BootstrapAction with two empty string arguments.
def configure_bootstrap_actions(job_flow, emr_config = nil)
  script_uris = config_setting('bootstrap_scripts')

  if script_uris.present?
    script_uris.each do |uri|
      job_flow.add_bootstrap_action(Elasticity::BootstrapAction.new(uri, '', ''))
    end
  end

  return unless emr_config.present?

  job_flow.add_bootstrap_action(Elasticity::HadoopFileBootstrapAction.new(emr_config))
end
configure_instances(job_flow) click to toggle source

Configures the instances that will be launched. The master group has a single node. The task group is optional.

# File lib/elasticrawl/cluster.rb, line 62
# Assigns the stored instance groups to the job flow. The master and core
# groups are always set; the task group is set only when @task_group is
# present.
def configure_instances(job_flow)
  job_flow.set_master_instance_group(@master_group)
  job_flow.set_core_instance_group(@core_group)
  return unless @task_group.present?

  job_flow.set_task_instance_group(@task_group)
end
configure_job_flow(job_flow) click to toggle source

Set job flow properties from settings in cluster.yml.

# File lib/elasticrawl/cluster.rb, line 44
# Copies optional settings onto the job flow. Every setting is read via
# config_setting first; a value is assigned only when it is present.
# Assignments happen in a fixed order, starting with the subnet id.
def configure_job_flow(job_flow)
  key_pair, zone, ami, instance_role, emr_role, subnet =
    %w[ec2_key_name placement emr_ami_version job_flow_role service_role ec2_subnet_id]
    .map { |key| config_setting(key) }

  job_flow.ec2_subnet_id = subnet if subnet.present?
  job_flow.ec2_key_name = key_pair if key_pair.present?
  job_flow.placement = zone if zone.present?
  job_flow.ami_version = ami if ami.present?
  job_flow.job_flow_role = instance_role if instance_role.present?
  job_flow.service_role = emr_role if emr_role.present?
end
has_task_group?() click to toggle source

Returns whether cluster.yml specifies a task group.

# File lib/elasticrawl/cluster.rb, line 88
# Returns true when the task group config specifies a positive
# instance_count, false otherwise.
#
# The task group is optional, so a missing task group config (nil) or a
# missing/nil instance_count is treated as "no task group" instead of
# raising NoMethodError.
def has_task_group?
  task_config = config_for_group('task')
  return false if task_config.nil?

  instance_count = task_config['instance_count']
  !instance_count.nil? && instance_count > 0
end
instance_group(group_name) click to toggle source

Configures an instance group with the instance type, # of instances and the bid price if spot instances are to be used.

# File lib/elasticrawl/cluster.rb, line 110
# Builds an Elasticity::InstanceGroup from the config for group_name.
#
# The role is the upcased group name and the type comes from
# 'instance_type'. The count is set only when 'instance_count' is present
# and greater than zero. Spot instances are requested with 'bid_price'
# only when 'use_spot_instances' is exactly true.
def instance_group(group_name)
  settings = config_for_group(group_name)

  Elasticity::InstanceGroup.new.tap do |group|
    group.role = group_name.upcase
    group.type = settings['instance_type']

    if settings.has_key?('instance_count') && settings['instance_count'] > 0
      group.count = settings['instance_count']
    end

    group.set_spot_instances(settings['bid_price']) if settings['use_spot_instances'] == true
  end
end
instance_group_desc(group) click to toggle source

Describes an instance group.

# File lib/elasticrawl/cluster.rb, line 94
# Returns a one-line description of an instance group: its count, type
# and pricing (the bid price for 'SPOT' market groups, otherwise
# "On Demand"). Returns '--' when the group is not present.
def instance_group_desc(group)
  return '--' unless group.present?

  pricing = group.market == 'SPOT' ? "(Spot: #{group.bid_price})" : '(On Demand)'
  "#{group.count} #{group.type}  #{pricing}"
end