class CADataFrame::MergeFrame
Constants
- LEFT_RIGHT_COMBINATIONS
Attributes
indicator[R]
keep_left[R]
keep_right[R]
left[R]
left_key_values[R]
left_keys[R]
merge_key[RW]
on[R]
right[R]
right_key_values[R]
right_keys[R]
Public Class Methods
new(left_df, right_df, opts={})
click to toggle source
# File lib/carray-dataframe/join.rb, line 50
# Prepares both frames for a sorted merge.
#
# Each frame is converted to an array of row hashes (df_to_a), ordered by the
# values of the +on+ columns (safe_compare), and paired with a parallel array
# holding the sanitized merge key of every row.
#
# left_df::  left-hand frame
# right_df:: right-hand frame
# opts::     options hash; :on, :how and :indicator are read by init_opts
def initialize(left_df, right_df, opts = {}) # rubocop:disable Metrics/AbcSize -- quick-fix for issue #171
  init_opts(opts)
  validate_on!(left_df, right_df)

  key_of = ->(record) { sanitize_merge_keys(record.values_at(*on)) }
  key_order = ->(a, b) { safe_compare(a.values_at(*on), b.values_at(*on)) }

  @left = df_to_a(left_df).sort { |a, b| key_order.call(a, b) }
  @left_key_values = @left.map { |record| key_of.call(record) }

  @right = df_to_a(right_df).sort { |a, b| key_order.call(a, b) }
  @right_key_values = @right.map { |record| key_of.call(record) }

  # Column renamings (original name => output name) for the non-key columns.
  @left_keys, @right_keys = merge_keys(left_df, right_df, on)
end
Public Instance Methods
join()
click to toggle source
# File lib/carray-dataframe/join.rb, line 63
# Runs the merge and returns the resulting frame.
#
# Rows are consumed from +left+ and +right+ until both are empty; every
# non-nil row produced by #row is collected. The collected rows are passed to
# CADataFrame.new with the column order given by dataframe_vector_names. When
# that order contains a column named "index", the result of
# set_index("index") is returned instead of the frame itself.
def join
  merged_rows = []
  while !left.empty? || !right.empty?
    merged = row(first_left_key, first_right_key)
    merged_rows << merged if merged
  end

  frame = CADataFrame.new(merged_rows, order: dataframe_vector_names)
  return frame unless dataframe_vector_names.include?("index")

  frame.set_index("index")
end
Private Instance Methods
add_indicator(row, indicator_value)
click to toggle source
# File lib/carray-dataframe/join.rb, line 152
# Tags +row+ with the origin marker when an indicator column was requested.
#
# row::             merged row hash, or nil when the row was dropped
#                   (left_row_missing_right / right_row_missing_left return
#                   nil when keep_left / keep_right is false)
# indicator_value:: value stored under the indicator column
#
# Returns +row+ (mutated in place when tagged); nil is passed through
# untouched so that dropped rows do not raise NoMethodError.
def add_indicator(row, indicator_value)
  return row if row.nil? || !indicator

  row[indicator] = indicator_value
  row
end
cartesian_product()
click to toggle source
# File lib/carray-dataframe/join.rb, line 250
# Memoized list of merged rows for every (left, right) pairing of the rows
# that share the current merge key. The cache is cleared by
# end_cartesian_product.
def cartesian_product
  return @cartesian_product if @cartesian_product

  pairings = left_rows_at_merge_key.product(right_rows_at_merge_key)
  @cartesian_product = pairings.map { |lrow, rrow| merge_rows(lrow, rrow) }
end
dataframe_vector_names()
click to toggle source
# File lib/carray-dataframe/join.rb, line 96
# Column order of the merged frame: renamed left columns, the +on+ columns,
# renamed right columns, then the indicator column (only when one is set).
def dataframe_vector_names
  [*left_keys.values, *on, *right_keys.values, *Array(indicator)]
end
df_to_a(df)
click to toggle source
# File lib/carray-dataframe/join.rb, line 109
# Converts a frame into an array of row hashes (column name => value).
#
# df:: any object whose #to_h yields column name => column values, where the
#      column values respond to #to_a
def df_to_a(df)
  # FIXME: much faster than "native" DataFrame#to_a. Should not be
  columns = df.to_h
  names = columns.keys
  records = columns.values.map(&:to_a).transpose
  records.map { |cells| Hash[names.zip(cells)] }
end
end_cartesian_product()
click to toggle source
# File lib/carray-dataframe/join.rb, line 256
# Drops every row (and its key) that took part in the current many-to-many
# match from both sides, then clears the memoized product.
def end_cartesian_product
  # Sizes are measured before anything is shifted: the *_rows_at_merge_key
  # helpers read from the front of the very arrays being shortened.
  consumed_left = left_rows_at_merge_key.size
  consumed_right = right_rows_at_merge_key.size

  [left_key_values, left].each { |queue| queue.shift(consumed_left) }
  [right_key_values, right].each { |queue| queue.shift(consumed_right) }

  @cartesian_product = nil
end
expand_row(row, renamings)
click to toggle source
# File lib/carray-dataframe/join.rb, line 219
# Builds an output row from a row that has no partner on the other side.
#
# row::       source row hash
# renamings:: original column name => output column name
#
# The result holds the renamed columns, then the +on+ columns, then (when an
# indicator is configured) the indicator column initialised to nil.
def expand_row(row, renamings)
  renamed = renamings.map { |from, to| [to, row[from]] }.to_h
  key_columns = on.map { |col| [col, row[col]] }.to_h

  expanded = renamed.merge(key_columns)
  expanded[indicator] = nil if indicator
  expanded
end
extract_left_right(how)
click to toggle source
# File lib/carray-dataframe/join.rb, line 100
# Looks up +how+ in LEFT_RIGHT_COMBINATIONS and returns the stored entry.
#
# Raises ArgumentError when the lookup yields nil or false.
def extract_left_right(how)
  combination = LEFT_RIGHT_COMBINATIONS[how]
  raise ArgumentError, "Unrecognized join option: #{how}" unless combination

  combination
end
first_left_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 234
# Merge key of the next unconsumed left row, or nil when none remain.
def first_left_key
  # Array#first already yields nil for an empty array.
  left_key_values.first
end
first_right_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 226
# Merge key of the next unconsumed right row, or nil when none remain.
def first_right_key
  # Array#first already yields nil for an empty array.
  right_key_values.first
end
guard_duplicate(val, duplicates, num)
click to toggle source
# File lib/carray-dataframe/join.rb, line 131
# Returns the output name for column +val+.
#
# val::        original column name
# duplicates:: names of non-key columns that occur in both frames
# num::        side marker; merge_keys passes 1 for the left frame and 2 for
#              the right frame
#
# A duplicated name is suffixed with "_<num>" so the left and right copies
# get distinct names. Without the side number both copies collapse onto the
# same "name_" key and the right value overwrites the left one in merge_rows.
# Any other name is returned unchanged.
def guard_duplicate(val, duplicates, num)
  duplicates.include?(val) ? "#{val}_#{num}" : val
end
guard_keys(keys, duplicates, num)
click to toggle source
# File lib/carray-dataframe/join.rb, line 127
# Maps every name in +keys+ to its output name as chosen by guard_duplicate.
#
# Returns a hash of original name => output name.
def guard_keys(keys, duplicates, num)
  keys.each_with_object({}) do |name, renamings|
    renamings[name] = guard_duplicate(name, duplicates, num)
  end
end
init_opts(opts)
click to toggle source
# File lib/carray-dataframe/join.rb, line 90
# Reads the merge options into instance state.
#
# opts[:on]::        collection of key column names; each is stored as a
#                    String via #to_s
# opts[:how]::       join type, resolved by extract_left_right into the
#                    keep_left / keep_right pair
# opts[:indicator]:: stored as given
def init_opts(opts)
  requested_keys = opts[:on]
  @on = requested_keys.map { |key| key.to_s }

  sides = extract_left_right(opts[:how])
  @keep_left, @keep_right = sides

  @indicator = opts[:indicator]
end
left_row_missing_right()
click to toggle source
# File lib/carray-dataframe/join.rb, line 197
# Consumes the next left row, which has no match on the right.
#
# The row is always removed from the left side; it is expanded into an output
# row only when keep_left is set, otherwise nil is returned.
def left_row_missing_right
  unmatched = one_to_one_left_row
  return unless keep_left

  expand_row(unmatched, left_keys)
end
left_rows_at_merge_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 242
# Leading run of left rows whose sanitized +on+ values equal merge_key.
def left_rows_at_merge_key
  left.take_while do |candidate|
    candidate_key = sanitize_merge_keys(candidate.values_at(*on))
    candidate_key == merge_key
  end
end
lt(k1, k2)
click to toggle source
# File lib/carray-dataframe/join.rb, line 207
# True only when +k1+ sorts strictly before +k2+ according to <=>.
# An incomparable pair (<=> returns nil) yields false.
def lt(k1, k2)
  ordering = k1 <=> k2
  ordering == -1
end
merge_keys(df1, df2, on)
click to toggle source
# File lib/carray-dataframe/join.rb, line 116
# Computes the column renamings for both frames.
#
# df1, df2:: frames responding to #column_names
# on::       key column names, excluded from renaming
#
# A non-key name counts as duplicated when it occurs exactly twice across the
# two frames' column lists. Returns a two-element array: the renamings for
# df1 (side 1) and for df2 (side 2), each built by guard_keys.
def merge_keys(df1, df2, on)
  left_names = df1.column_names - on
  right_names = df2.column_names - on

  occurrences = (left_names + right_names).group_by { |name| name }
  duplicates = occurrences.select { |_, group| group.size == 2 }.keys

  [
    guard_keys(left_names, duplicates, 1),
    guard_keys(right_names, duplicates, 2)
  ]
end
merge_matching_rows()
click to toggle source
# File lib/carray-dataframe/join.rb, line 158
# Produces the next merged row for the current merge key.
#
# * one-to-one: both front rows are consumed and merged.
# * one-to-many: the front rows are merged, then one_to_many_shift decides
#   which side advances.
# * otherwise: rows are served one at a time from cartesian_product, and the
#   matched runs are discarded once the product is exhausted.
def merge_matching_rows
  if one_to_one_merge?
    return merge_rows(one_to_one_left_row, one_to_one_right_row)
  end

  if one_to_many_merge?
    return merge_rows(left.first, right.first).tap { one_to_many_shift }
  end

  cartesian_product.shift.tap do
    end_cartesian_product if cartesian_product.empty?
  end
end
merge_rows(lrow, rrow)
click to toggle source
# File lib/carray-dataframe/join.rb, line 211
# Combines a matching left and right row into one output row.
#
# Keys are laid out as: renamed left columns, the +on+ columns (values taken
# from +lrow+), the indicator column set to nil (when configured), and the
# renamed right columns.
def merge_rows(lrow, rrow)
  left_part = left_keys.map { |from, to| [to, lrow[from]] }.to_h
  key_part = on.map { |col| [col, lrow[col]] }.to_h
  flag_part = indicator ? { indicator => nil } : {}
  right_part = right_keys.map { |from, to| [to, rrow[from]] }.to_h

  [key_part, flag_part, right_part].inject(left_part) do |merged, part|
    merged.merge(part)
  end
end
next_left_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 238
# Merge key of the second unconsumed left row (nil when there is none).
def next_left_key
  left_key_values.at(1)
end
next_right_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 230
# Merge key of the second unconsumed right row (nil when there is none).
def next_right_key
  right_key_values.at(1)
end
one_to_many_merge?()
click to toggle source
# File lib/carray-dataframe/join.rb, line 183
# True unless BOTH sides repeat the current merge key on their following row,
# i.e. at least one side has no further row with this key.
def one_to_many_merge?
  merge_key != next_left_key || merge_key != next_right_key
end
one_to_many_shift()
click to toggle source
# File lib/carray-dataframe/join.rb, line 172
# Advances the sides after a one-to-many match.
#
# The left row is consumed once the right side's run of the current key ends,
# and the right row once the left side's run ends. Both decisions are taken
# before any row is consumed, because consuming changes the first/next keys.
def one_to_many_shift
  right_run_ends = first_right_key != next_right_key
  left_run_ends = first_left_key != next_left_key

  if right_run_ends
    one_to_one_left_row
  end
  one_to_one_right_row if left_run_ends
end
one_to_one_left_row()
click to toggle source
# File lib/carray-dataframe/join.rb, line 187
# Removes the front left row together with its parallel key entry and
# returns the removed row (nil when the left side is empty).
def one_to_one_left_row
  left.shift.tap { left_key_values.shift }
end
one_to_one_merge?()
click to toggle source
# File lib/carray-dataframe/join.rb, line 179
# True when neither side repeats the current merge key on its following row.
def one_to_one_merge?
  !(merge_key == next_left_key || merge_key == next_right_key)
end
one_to_one_right_row()
click to toggle source
# File lib/carray-dataframe/join.rb, line 192
# Removes the front right row together with its parallel key entry and
# returns the removed row (nil when the right side is empty).
def one_to_one_right_row
  right.shift.tap { right_key_values.shift }
end
right_row_missing_left()
click to toggle source
# File lib/carray-dataframe/join.rb, line 202
# Consumes the next right row, which has no match on the left.
#
# The row is always removed from the right side; it is expanded into an
# output row only when keep_right is set, otherwise nil is returned.
def right_row_missing_left
  unmatched = one_to_one_right_row
  return unless keep_right

  expand_row(unmatched, right_keys)
end
right_rows_at_merge_key()
click to toggle source
# File lib/carray-dataframe/join.rb, line 246
# Leading run of right rows whose sanitized +on+ values equal merge_key.
def right_rows_at_merge_key
  right.take_while do |candidate|
    candidate_key = sanitize_merge_keys(candidate.values_at(*on))
    candidate_key == merge_key
  end
end
row(lkey, rkey)
click to toggle source
# File lib/carray-dataframe/join.rb, line 135
# Produces the next output row from the front keys of both sides.
#
# lkey:: merge key at the front of the left side, or nil when exhausted
# rkey:: merge key at the front of the right side, or nil when exhausted
#
# Equal keys are merged and tagged :both; when the right side is exhausted or
# the left key sorts first, the left row is emitted as :left_only; in every
# remaining case the right row is emitted as :right_only. The result may be
# nil when the unmatched row is not kept.
def row(lkey, rkey)
  if !lkey && !rkey
    # :nocov:
    # It's just an impossibility handler, can't be covered :)
    raise 'Unexpected condition met during merge'
    # :nocov:
  elsif lkey == rkey
    self.merge_key = lkey
    add_indicator(merge_matching_rows, :both)
  elsif !rkey || lt(lkey, rkey)
    add_indicator(left_row_missing_right, :left_only)
  else
    # Remaining case: !lkey || lt(rkey, lkey)
    add_indicator(right_row_missing_left, :right_only)
  end
end
safe_compare(left_array, right_array)
click to toggle source
# File lib/carray-dataframe/join.rb, line 273
# Nil-tolerant, element-wise ordering of two key arrays.
#
# Elements are compared pairwise; nil sorts before any non-nil value and two
# nils are equal. Returns the first non-zero comparison, or 0 when every pair
# compares equal. Every pair is compared before the result is picked.
def safe_compare(left_array, right_array)
  comparisons = left_array.zip(right_array).map do |l, r|
    if l.nil?
      r.nil? ? 0 : -1
    elsif r.nil?
      1
    else
      l <=> r
    end
  end

  differing = comparisons.reject { |outcome| outcome.zero? }
  differing.first || 0
end
sanitize_merge_keys(merge_keys)
click to toggle source
# File lib/carray-dataframe/join.rb, line 105
# Returns a copy of +merge_keys+ in which every nil is replaced by a fresh
# NilSorter instance; all other values (including false) are kept as they are.
def sanitize_merge_keys(merge_keys)
  merge_keys.map do |value|
    if value.nil?
      NilSorter.new
    else
      value
    end
  end
end
validate_on!(left_df, right_df)
click to toggle source
# File lib/carray-dataframe/join.rb, line 266
# Ensures every merge key column is present in both frames.
#
# left_df, right_df:: frames responding to #has_column?
#
# Raises ArgumentError naming the first key that either frame lacks.
def validate_on!(left_df, right_df)
  @on.each do |key|
    next if left_df.has_column?(key) && right_df.has_column?(key)

    raise ArgumentError, "Both dataframes expected to have #{key.inspect} field"
  end
end