class RP::EMR::Step::S3DistCp

Creates an S3DistCp step. See docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/UsingEMR_s3distcp.html

Handles annoying details like argument escaping

Accepts all the input parameters listed in the documentation as of version 1.0.7.

@example

step = S3DistCpStep.new do |s|
  s.src = 's3://bucket/input/prefix/'   # Note this is NOT Hadoop's glob syntax
  s.dest = 's3://bucket/output/path'
  s.srcPattern = 's3://bucket/input/prefix/[foo|bar].*\.eml'     # Input regex - see Java's regex docs
  s.groupBy = '.*([a-z0-9]{2}).tsv'     # Note that you need a capture group
  s.targetSize = 120.megabytes
  s.compression = 'snappy'
  s.deleteOnSuccess = true
end

step.to_hash                            # => Ruby hash ready for use in :steps key of a job

Constants

BOOLEAN_FIELDS
DEFAULT_S3_DISTCP_JAR
HASH_FIELDS

Public Instance Methods

to_hash() click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 57
# Hash representation of this S3DistCp step.
#
# Delegates to the object built by +step+ and returns whatever its
# +to_hash+ returns.
def to_hash
  emr_step = step
  emr_step.to_hash
end

Private Instance Methods

boolean_fields_args() click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 85
# Command-line flags for the boolean options that are switched on.
#
# Each name in BOOLEAN_FIELDS is read through its accessor; names whose
# value is truthy are emitted as a bare "--name" flag, the rest are omitted.
#
# @return [Array<String>] flags in BOOLEAN_FIELDS order
def boolean_fields_args
  enabled = BOOLEAN_FIELDS.select { |field| send(field) }
  enabled.map { |field| "--#{field}" }
end
hash_field_args() click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 74
# Command-line arguments for the options that carry a value.
#
# Each name in HASH_FIELDS is read through its accessor. Fields whose value
# is nil are skipped; every other value is stringified and emitted as the
# pair "--name", "value".
#
# The space check runs against the stringified value, i.e. exactly what is
# emitted. Matching the raw value with =~ breaks for non-String values such
# as an Integer target size, because Object#=~ was removed in Ruby 3.2.
#
# @return [Array<String>] flat list of alternating flags and values, in
#   HASH_FIELDS order
# @raise [ArgumentError] if any emitted value contains a space
def hash_field_args
  HASH_FIELDS.each_with_object([]) do |field, args|
    value = send(field)
    next if value.nil?

    arg = value.to_s
    raise ArgumentError, "I don't know how to handle whitespace" if arg.include?(' ')

    args.push("--#{field}", arg)
  end
end
step() click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 63
# Builds the RP::EMR::Step that wraps this S3DistCp invocation.
#
# The step is given this object's name and action_on_failure, plus a
# hadoop_jar_step hash holding the S3DistCp jar and the combined argument
# list (valued options first, boolean flags after).
def step
  options = { name: name, action_on_failure: action_on_failure }
  options[:hadoop_jar_step] = {
    jar: s3_distcp_jar,
    args: hash_field_args + boolean_fields_args
  }

  RP::EMR::Step.new(**options)
end