class RP::EMR::Step::S3DistCp
Creates an S3DistCp step (see docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/UsingEMR_s3distcp.html).
Handles annoying details like argument escaping.
Accepts all the input parameters listed in the documentation as of version 1.0.7.
@example
step = S3DistCpStep.new do |s|
  s.src = 's3://bucket/input/prefix/' # Note this is NOT Hadoop's glob syntax
  s.dest = 's3://bucket/output/path'
  s.srcPattern = 's3://bucket/input/prefix/[foo|bar].*\.eml' # Input regex - see Java's regex docs
  s.groupBy = '.*([a-z0-9]{2}).tsv' # Note that you need a capture group
  s.targetSize = 120.megabytes
  s.compression = 'snappy'
  s.deleteOnSuccess = true
end
step.to_hash # => Ruby hash ready for use in :steps key of a job
Constants
- BOOLEAN_FIELDS
- DEFAULT_S3_DISTCP_JAR
- HASH_FIELDS
Public Instance Methods
to_hash()
click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 57
# Returns the hash form of this S3DistCp step.
#
# Pure delegation: asks the object returned by #step for its own #to_hash
# and hands that result back unchanged.
#
# @return the value produced by the underlying step's #to_hash
def to_hash
  emr_step = step
  emr_step.to_hash
end
Private Instance Methods
boolean_fields_args()
click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 85
# Builds the bare "--flag" arguments for every enabled entry in BOOLEAN_FIELDS.
#
# A field counts as enabled when calling the accessor of the same name
# returns a truthy value (anything other than nil or false).
#
# @return [Array<String>] one "--field" string per enabled field, in
#   BOOLEAN_FIELDS order
def boolean_fields_args
  # select states the intent directly; the previous reject { !... } was a
  # double negative with the same result.
  BOOLEAN_FIELDS.select { |f| send(f) }.map { |f| "--#{f}" }
end
hash_field_args()
click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 74
# Builds the "--name value" argument pairs for every entry in HASH_FIELDS.
#
# Each field is read by calling the accessor of the same name. Fields whose
# value is nil are omitted; every other value is emitted via #to_s.
#
# @return [Array<String>] flat list of alternating "--field" flags and
#   stringified values, in HASH_FIELDS order
# @raise [ArgumentError] if the string form of any field value contains a
#   space character
def hash_field_args
  # Read each accessor once so validation and output see the same value.
  present = HASH_FIELDS.map { |f| [f, send(f)] }.reject { |_, v| v.nil? }

  # Validate every field before emitting anything. The check runs on the
  # string form because that is what ends up in the argument list, and because
  # non-String values (e.g. an Integer size) do not respond to =~ on Ruby 3.2+.
  if present.any? { |_, v| v.to_s.include?(' ') }
    raise ArgumentError, "I don't know how to handle whitespace"
  end

  present.flat_map { |k, v| ["--#{k}", v.to_s] }
end
step()
click to toggle source
# File lib/rp/emr/step/s3_dist_cp.rb, line 63
# Wraps this S3DistCp configuration in a generic RP::EMR::Step.
#
# The Hadoop jar step uses #s3_distcp_jar as the jar and the value-carrying
# arguments (#hash_field_args) followed by the boolean flags
# (#boolean_fields_args) as its argument list.
#
# @return [RP::EMR::Step] a new step built from this object's name,
#   action_on_failure, jar and arguments
def step
  step_name = name
  failure_action = action_on_failure
  jar_step = {
    jar: s3_distcp_jar,
    args: hash_field_args + boolean_fields_args,
  }

  RP::EMR::Step.new(
    name: step_name,
    action_on_failure: failure_action,
    hadoop_jar_step: jar_step
  )
end