class CADataFrame::MergeFrame

Constants

LEFT_RIGHT_COMBINATIONS

Attributes

indicator[R]
keep_left[R]
keep_right[R]
left[R]
left_key_values[R]
left_keys[R]
merge_key[RW]
on[R]
right[R]
right_key_values[R]
right_keys[R]

Public Class Methods

new(left_df, right_df, opts={}) click to toggle source
# File lib/carray-dataframe/join.rb, line 50
# Prepares both frames for a sorted merge.
#
# Options are parsed and the `on` columns validated first. Each frame is then
# converted to an array of row hashes, sorted in place by its `on` values
# (via safe_compare), and paired with a parallel array of sanitized key
# values. Finally the renaming maps for the non-key columns are computed.
#
# left_df, right_df - the frames to merge.
# opts              - options hash; :on, :how and :indicator are read by init_opts.
def initialize left_df, right_df, opts={} # rubocop:disable Metrics/AbcSize -- quick-fix for issue #171
  init_opts(opts)
  validate_on!(left_df, right_df)

  # Both sides go through exactly the same steps, so share them in a lambda
  # that returns the sorted rows together with their sanitized keys.
  prepare = lambda do |frame|
    records = df_to_a(frame)
    records.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
    [records, records.map { |record| sanitize_merge_keys(record.values_at(*on)) }]
  end

  @left, @left_key_values = prepare.call(left_df)
  @right, @right_key_values = prepare.call(right_df)
  @left_keys, @right_keys = merge_keys(left_df, right_df, on)
end

Public Instance Methods

join() click to toggle source
# File lib/carray-dataframe/join.rb, line 63
# Runs the merge and returns the resulting CADataFrame.
#
# Rows are produced one at a time by #row until both inputs are exhausted;
# nil results (rows dropped by the join type) are skipped. When one of the
# output columns is named "index", it is installed as the frame's index.
def join
  merged_rows = []
  until left.empty? && right.empty?
    merged = row(first_left_key, first_right_key)
    merged_rows << merged if merged
  end

  column_order = dataframe_vector_names
  frame = CADataFrame.new(merged_rows, order: column_order)
  column_order.include?("index") ? frame.set_index("index") : frame
end

Private Instance Methods

add_indicator(row, indicator_value) click to toggle source
# File lib/carray-dataframe/join.rb, line 152
# Stamps the merge indicator value onto a result row.
#
# row             - Hash for the merged row, or nil when the row was dropped
#                   (left_row_missing_right / right_row_missing_left return nil
#                   when the corresponding keep_* flag is off).
# indicator_value - value stored under the indicator column
#                   (:both, :left_only or :right_only in #row).
#
# Returns the same row object (mutated in place), or nil for a dropped row.
def add_indicator(row, indicator_value)
  # A dropped row must stay nil: writing into it would raise NoMethodError
  # whenever an indicator column is requested on a join that discards rows.
  return row unless indicator && row

  row[indicator] = indicator_value
  row
end
cartesian_product() click to toggle source
# File lib/carray-dataframe/join.rb, line 250
# Memoized list of merged rows for every (left, right) pairing of the rows
# that share the current merge key. The cache is cleared by
# end_cartesian_product.
def cartesian_product
  return @cartesian_product if @cartesian_product

  pairings = left_rows_at_merge_key.product(right_rows_at_merge_key)
  @cartesian_product = pairings.map { |lrow, rrow| merge_rows(lrow, rrow) }
end
dataframe_vector_names() click to toggle source
# File lib/carray-dataframe/join.rb, line 96
# Column order of the merged frame: renamed left columns, the `on` columns,
# renamed right columns and, if configured, the indicator column.
def dataframe_vector_names
  [
    left_keys.values,
    on,
    right_keys.values,
    Array(indicator)
  ].inject(:+)
end
df_to_a(df) click to toggle source
# File lib/carray-dataframe/join.rb, line 109
# Converts a frame into an array of row hashes ({column name => value}).
#
# df - object whose #to_h yields {column name => column}; each column must
#      respond to #to_a.
def df_to_a df
  # FIXME: this is much faster than the "native" DataFrame#to_a,
  # which it should not be.
  columns = df.to_h
  names = columns.keys
  records = columns.values.map(&:to_a).transpose
  records.map { |values| names.zip(values).to_h }
end
end_cartesian_product() click to toggle source
# File lib/carray-dataframe/join.rb, line 256
# Finishes a many-to-many block: drops every left and right row (and its
# key) that carries the current merge key, then clears the memoized product.
def end_cartesian_product
  consumed = left_rows_at_merge_key.size
  left.shift(consumed)
  left_key_values.shift(consumed)

  consumed = right_rows_at_merge_key.size
  right.shift(consumed)
  right_key_values.shift(consumed)

  @cartesian_product = nil
end
expand_row(row, renamings) click to toggle source
# File lib/carray-dataframe/join.rb, line 219
# Builds an output row from a row that exists on one side only.
#
# row       - source row hash.
# renamings - {source column => output column} for the non-key columns.
#
# The result holds the renamed columns, then the `on` columns, then (if an
# indicator is configured) the indicator column initialised to nil.
def expand_row row, renamings
  expanded = {}
  renamings.each { |from, to| expanded[to] = row[from] }
  on.each { |col| expanded[col] = row[col] }
  expanded[indicator] = nil if indicator
  expanded
end
extract_left_right(how) click to toggle source
# File lib/carray-dataframe/join.rb, line 100
# Looks up the entry for a join type in LEFT_RIGHT_COMBINATIONS
# (init_opts destructures it into keep_left / keep_right).
#
# Raises ArgumentError when `how` has no (truthy) entry.
def extract_left_right(how)
  combination = LEFT_RIGHT_COMBINATIONS[how]
  raise ArgumentError, "Unrecognized join option: #{how}" unless combination

  combination
end
first_left_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 234
# Key of the first remaining left row, or nil once the left side is exhausted.
def first_left_key
  left_key_values.first
end
first_right_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 226
# Key of the first remaining right row, or nil once the right side is exhausted.
def first_right_key
  right_key_values.first
end
guard_duplicate(val, duplicates, num) click to toggle source
# File lib/carray-dataframe/join.rb, line 131
# Returns the output name for a non-key column.
#
# val        - original column name.
# duplicates - column names that occur in both frames.
# num        - number of the frame the column belongs to (merge_keys passes
#              1 for the left frame and 2 for the right one).
#
# A column that exists in both frames is suffixed with "_<num>" so the two
# sides get distinct names. Without the frame number both sides would map to
# the same "<val>_" name and the right value would overwrite the left one
# when the rows are merged. Unique columns keep their name.
def guard_duplicate val, duplicates, num
  duplicates.include?(val) ? "#{val}_#{num}" : val
end
guard_keys(keys, duplicates, num) click to toggle source
# File lib/carray-dataframe/join.rb, line 127
# Maps every column name in `keys` to its output name as decided by
# guard_duplicate, returning {original name => output name}.
def guard_keys keys, duplicates, num
  keys.each_with_object({}) do |name, renamed|
    renamed[name] = guard_duplicate(name, duplicates, num)
  end
end
init_opts(opts) click to toggle source
# File lib/carray-dataframe/join.rb, line 90
# Reads the merge options into instance state.
#
# opts[:on]        - collection of key column names; stored as strings in @on.
# opts[:how]       - join type, resolved through extract_left_right into
#                    @keep_left / @keep_right.
# opts[:indicator] - indicator column name (or nil), stored in @indicator.
def init_opts(opts)
  on_columns, how, indicator_column = opts.values_at(:on, :how, :indicator)

  @on = on_columns.map(&:to_s)
  @keep_left, @keep_right = extract_left_right(how)
  @indicator = indicator_column
end
left_row_missing_right() click to toggle source
# File lib/carray-dataframe/join.rb, line 197
# Consumes the first left row, which has no partner on the right.
# Returns its expanded form when left-only rows are kept, nil otherwise.
# The row is consumed in both cases.
def left_row_missing_right
  unmatched = one_to_one_left_row
  return unless keep_left

  expand_row(unmatched, left_keys)
end
left_rows_at_merge_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 242
# Leading run of left rows whose sanitized `on` values equal the current
# merge key.
def left_rows_at_merge_key
  left.take_while do |record|
    record_key = sanitize_merge_keys(record.values_at(*on))
    record_key == merge_key
  end
end
lt(k1, k2) click to toggle source
# File lib/carray-dataframe/join.rb, line 207
# True when k1 sorts strictly before k2 according to <=>.
# Incomparable values (<=> returns nil) yield false.
def lt(k1, k2)
  ordering = k1 <=> k2
  ordering == -1
end
merge_keys(df1, df2, on) click to toggle source
# File lib/carray-dataframe/join.rb, line 116
# Computes the column renaming maps for both frames.
#
# df1, df2 - frames responding to #column_names.
# on       - key column names, excluded from renaming.
#
# A non-key column name counts as a duplicate when it occurs exactly twice
# across both frames. Returns [left map, right map] as built by guard_keys
# with frame numbers 1 and 2.
def merge_keys(df1, df2, on)
  left_names = df1.column_names - on
  right_names = df2.column_names - on

  occurrences = Hash.new(0)
  (left_names + right_names).each { |name| occurrences[name] += 1 }
  duplicates = occurrences.select { |_, times| times == 2 }.keys

  [
    guard_keys(left_names, duplicates, 1),
    guard_keys(right_names, duplicates, 2)
  ]
end
merge_matching_rows() click to toggle source
# File lib/carray-dataframe/join.rb, line 158
# Produces the next merged row for the current merge key and advances the
# inputs accordingly.
#
# * one-to-one: both first rows are consumed and merged.
# * one-to-many: the first rows are merged, then one_to_many_shift decides
#   which side advances.
# * otherwise: rows are served from the memoized cartesian product; once it
#   is drained, end_cartesian_product consumes the whole block on both sides.
def merge_matching_rows
  return merge_rows(one_to_one_left_row, one_to_one_right_row) if one_to_one_merge?

  if one_to_many_merge?
    merged = merge_rows(left.first, right.first)
    one_to_many_shift
    return merged
  end

  merged = cartesian_product.shift
  end_cartesian_product if cartesian_product.empty?
  merged
end
merge_rows(lrow, rrow) click to toggle source
# File lib/carray-dataframe/join.rb, line 211
# Combines a matching left and right row into one output row.
#
# Keys are written in this order: renamed left columns, the `on` columns
# (values taken from the left row), the indicator column initialised to nil
# (when configured), and finally the renamed right columns.
def merge_rows lrow, rrow
  merged = {}
  left_keys.each { |from, to| merged[to] = lrow[from] }
  on.each { |col| merged[col] = lrow[col] }
  merged[indicator] = nil if indicator
  right_keys.each { |from, to| merged[to] = rrow[from] }
  merged
end
next_left_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 238
# Key of the second remaining left row, or nil when there is none.
def next_left_key
  left_key_values.at(1)
end
next_right_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 230
# Key of the second remaining right row, or nil when there is none.
def next_right_key
  right_key_values.at(1)
end
one_to_many_merge?() click to toggle source
# File lib/carray-dataframe/join.rb, line 183
# True unless BOTH sides have a further row carrying the current merge key,
# i.e. at least one side is on its last row for that key.
def one_to_many_merge?
  merge_key != next_left_key || merge_key != next_right_key
end
one_to_many_shift() click to toggle source
# File lib/carray-dataframe/join.rb, line 172
# Advances the inputs after a one-to-many merge step.
#
# The left row is consumed once the right side has no further row with the
# same key, and vice versa. Both decisions are taken before anything is
# shifted, because shifting changes the first/next keys.
def one_to_many_shift
  right_run_finished = first_right_key != next_right_key
  left_run_finished = first_left_key != next_left_key

  one_to_one_left_row if right_run_finished
  if left_run_finished
    one_to_one_right_row
  end
end
one_to_one_left_row() click to toggle source
# File lib/carray-dataframe/join.rb, line 187
# Consumes the first left row together with its key and returns the row.
def one_to_one_left_row
  left.shift.tap { left_key_values.shift }
end
one_to_one_merge?() click to toggle source
# File lib/carray-dataframe/join.rb, line 179
# True when neither side has a further row carrying the current merge key.
def one_to_one_merge?
  !(merge_key == next_left_key || merge_key == next_right_key)
end
one_to_one_right_row() click to toggle source
# File lib/carray-dataframe/join.rb, line 192
# Consumes the first right row together with its key and returns the row.
def one_to_one_right_row
  right.shift.tap { right_key_values.shift }
end
right_row_missing_left() click to toggle source
# File lib/carray-dataframe/join.rb, line 202
# Consumes the first right row, which has no partner on the left.
# Returns its expanded form when right-only rows are kept, nil otherwise.
# The row is consumed in both cases.
def right_row_missing_left
  unmatched = one_to_one_right_row
  return unless keep_right

  expand_row(unmatched, right_keys)
end
right_rows_at_merge_key() click to toggle source
# File lib/carray-dataframe/join.rb, line 246
# Leading run of right rows whose sanitized `on` values equal the current
# merge key.
def right_rows_at_merge_key
  right.take_while do |record|
    record_key = sanitize_merge_keys(record.values_at(*on))
    record_key == merge_key
  end
end
row(lkey, rkey) click to toggle source
# File lib/carray-dataframe/join.rb, line 135
# Produces the next output row from the current first keys of both sides.
#
# lkey, rkey - first remaining key of the left / right side (nil when that
#              side is exhausted).
#
# Equal keys are merged and tagged :both. Otherwise the side with the
# smaller key (or the only side left) is consumed and tagged :left_only or
# :right_only. The result may be nil when that row is dropped.
def row(lkey, rkey)
  unless lkey || rkey
    # :nocov:
    # It's just an impossibility handler, can't be covered :)
    raise 'Unexpected condition met during merge'
    # :nocov:
  end

  if lkey == rkey
    self.merge_key = lkey
    return add_indicator(merge_matching_rows, :both)
  end

  left_goes_first = !rkey || lt(lkey, rkey)
  return add_indicator(left_row_missing_right, :left_only) if left_goes_first

  # Remaining case: !lkey || lt(rkey, lkey)
  add_indicator(right_row_missing_left, :right_only)
end
safe_compare(left_array, right_array) click to toggle source
# File lib/carray-dataframe/join.rb, line 273
# Lexicographic, nil-tolerant comparison of two key arrays.
#
# Elements are compared pairwise; nil equals nil and sorts before any other
# value. The result is the first non-zero pairwise comparison, or 0 when all
# pairs are equal. Every pair is evaluated before the result is picked.
def safe_compare(left_array, right_array)
  comparisons = left_array.zip(right_array).map do |l, r|
    if l.nil? && r.nil?
      0
    elsif r.nil?
      1
    elsif l.nil?
      -1
    else
      l <=> r
    end
  end

  decisive = comparisons.reject(&:zero?)
  decisive.empty? ? 0 : decisive.first
end
sanitize_merge_keys(merge_keys) click to toggle source
# File lib/carray-dataframe/join.rb, line 105
# Returns a copy of the key values in which every nil is replaced by a fresh
# NilSorter instance; all other values are passed through unchanged.
def sanitize_merge_keys(merge_keys)
  merge_keys.map do |value|
    next value unless value.nil?

    NilSorter.new
  end
end
validate_on!(left_df, right_df) click to toggle source
# File lib/carray-dataframe/join.rb, line 266
# Ensures every column listed in @on exists in both frames.
#
# Raises ArgumentError naming the first column that is missing from either
# frame.
def validate_on!(left_df, right_df)
  @on.each do |column|
    next if left_df.has_column?(column) && right_df.has_column?(column)

    raise ArgumentError, "Both dataframes expected to have #{column.inspect} field"
  end
end