class Evoc::Analyze

Attributes

opts[RW]
tx_store[RW]

Public Class Methods

new(opts) click to toggle source
# File lib/evoc/analyze.rb, line 6
# Sets up the analyzer from an options object indexed with symbol keys.
#
# Stores +opts+, passes opts[:logger_level] to Logging.set_level, and uses
# opts[:tx_store] as the transaction store when one is supplied. Otherwise a
# TxStore is built from opts[:transactions], opts[:case_id] and
# opts[:granularity].
#
# @param opts options; keys read here are :logger_level, :tx_store,
#   :transactions, :case_id and :granularity
def initialize(opts)
  self.opts = opts
  Logging.set_level(self.opts[:logger_level])
  supplied_store = opts[:tx_store]
  self.tx_store =
    if supplied_store.nil?
      TxStore.new(path: opts[:transactions],
                  case_id: self.opts[:case_id],
                  granularity: self.opts[:granularity])
    else
      supplied_store
    end
end

Public Instance Methods

aggregator_range() click to toggle source
# File lib/evoc/analyze.rb, line 379
# Prints a CSV table (on $stdout) of aggregator results for randomly generated
# value lists of length 1 through 1000.
#
# For every length two lists, sorted in descending order, are generated:
# * values of the form 1 - rand(0.00001..1), labelled "0..1" in the output and
#   aggregated under the measure name 'm_support';
# * values of the form rand(0..10), labelled inf in the output and aggregated
#   under the measure name 'm_hyper_coefficient'.
# Each list is handed to an Evoc::InterestingnessMeasureAggregator and the
# results of its cg, dcg and dcg2 methods are printed as rows of
# aggregator,length,range,y.
#
# The values come from Kernel#rand, so the output differs between runs unless
# the caller seeds the generator first.
def aggregator_range
  $stdout.puts 'aggregator,length,range,y'
  (1..1000).each do |length|
    list = Array.new(length) { 1 - rand(0.00001..1) }.sort.reverse
    list_inf = Array.new(length) { rand(0..10) }.sort.reverse
    aggregator = Evoc::InterestingnessMeasureAggregator.new('m_support', list)
    aggregator_inf = Evoc::InterestingnessMeasureAggregator.new('m_hyper_coefficient', list_inf)
    $stdout.puts "CG,#{length},\"0..1\",#{aggregator.cg}"
    $stdout.puts "DCG,#{length},\"0..1\",#{aggregator.dcg}"
    $stdout.puts "DCG2,#{length},\"0..1\",#{aggregator.dcg2}"
    $stdout.puts "CG,#{length},inf,#{aggregator_inf.cg}"
    $stdout.puts "DCG,#{length},inf,#{aggregator_inf.dcg}"
    $stdout.puts "DCG2,#{length},inf,#{aggregator_inf.dcg2}"
  end
end
all() click to toggle source

Performs all of the numerical analyses.

Prints to stdout

# File lib/evoc/analyze.rb, line 20
# Runs every numerical analysis listed below and prints the outcome to stdout
# as CSV: one header row with the analysis names, then one row with the value
# each analysis returned (in the same order).
def all
  analysis_names = %w[num_commits percent_method_changes_of_all_changes avg_changes_per_file avg_method_changes_per_parsable_file num_unique_changes average_changes_per_commit average_commits_per_specific_change time_span_in_years average_time_between_commits_in_minutes]

  CSV { |csv| csv << analysis_names }
  # Each name is resolved to the method of the same name on this object.
  values = analysis_names.map { |name| self.method(name).call }
  CSV { |csv| csv << values }
end
average_changes_per_commit() click to toggle source
# File lib/evoc/analyze.rb, line 155
# Mean size of the transactions in the store.
#
# @return [Float] sum of all transaction sizes divided by the number of
#   transactions, rounded to two decimals
def average_changes_per_commit
  commit_sizes = self.tx_store.map(&:size)
  total_changes = commit_sizes.reduce { |acc, size| acc + size }
  average = total_changes.to_f / commit_sizes.size
  average.round(2)
end
average_commits_per_specific_change() click to toggle source
# File lib/evoc/analyze.rb, line 115
# Mean number of transactions per item, computed over tx_store.items
# (an item => transactions mapping).
#
# @return [Float] total number of (item, transaction) occurrences divided by
#   the number of items, rounded to two decimals
def average_commits_per_specific_change
  occurrences = self.tx_store.items.values.inject(0) do |running_total, txes|
    running_total + txes.size
  end
  item_count = self.tx_store.items.size
  (occurrences.to_f / item_count).round(2)
end
average_time_between_commits_in_minutes() click to toggle source
# File lib/evoc/analyze.rb, line 172
# Average time between consecutive commits, expressed in minutes.
#
# @return [Float] the result of average_time_between_commits(unit: 'minutes'),
#   rounded to two decimals
def average_time_between_commits_in_minutes
  minutes = average_time_between_commits(unit: 'minutes')
  minutes.round(2)
end
avg_changes_per_file() click to toggle source
# File lib/evoc/analyze.rb, line 82
# Average number of changed items per changed file.
#
# Within each transaction the item ids are translated to names through
# tx_store.int_2_name and grouped by file, where the file is the part of the
# name in front of the first ':' (or the whole name when it has no ':').
# Every group counts as one changed file and contributes its size to the
# number of changes.
#
# @return [Float] changes divided by changed files, rounded to two decimals
def avg_changes_per_file
  file_count = 0
  change_count = 0
  self.tx_store.each do |tx|
    changes_by_file = tx.items
                        .map { |item| self.tx_store.int_2_name[item] }
                        .group_by { |name| /^(?<file>[^:]+?)(?::|$)/.match(name)[:file] }
    changes_by_file.each_value do |changes|
      file_count += 1
      change_count += changes.size
    end
  end
  (change_count.to_f / file_count).round(2)
end
avg_method_changes_per_parsable_file() click to toggle source
# File lib/evoc/analyze.rb, line 60
# Average number of method-level changes per file that has at least one.
#
# Within each transaction the item names (looked up via tx_store.int_2_name)
# are grouped by file, i.e. by the part of the name in front of the first ':'.
# A change is method-level when its name contains a ':' that is not directly
# followed by "@residuals". Only files with at least one such change are
# counted.
#
# @return [Float] method-level changes divided by the number of counted
#   files, rounded to two decimals
def avg_method_changes_per_parsable_file
  counted_files = 0
  method_level_changes = 0
  self.tx_store.each do |tx|
    names = tx.items.map { |item| self.tx_store.int_2_name[item] }
    by_file = names.group_by { |name| /^(?<file>[^:]+?)(?::|$)/.match(name)[:file] }
    by_file.each_value do |changes|
      # number of method-level changes recorded for this file in this transaction
      in_this_file = changes.count { |change| change =~ /:(?!@residuals)/ }
      next if in_this_file.zero?
      counted_files += 1
      method_level_changes += in_this_file
    end
  end
  (method_level_changes.to_f / counted_files).round(2)
end
commit_size() click to toggle source

Dumps the commit sizes, one commit per line

# File lib/evoc/analyze.rb, line 135
# Dumps the commit sizes as CSV on $stdout.
#
# With opts[:group] set, prints a commit_size,frequency table: one row per
# distinct transaction size, ordered by size. Otherwise prints a single
# commit_size column with one row per transaction, while reporting progress
# on $stderr.
#
# All data rows go through $stdout (not the STDOUT constant) so that the
# header and the rows always end up in the same stream, even when $stdout has
# been redirected.
def commit_size
  if self.opts[:group]
    $stdout.puts 'commit_size,frequency'
    self.tx_store.group_by(&:size).sort.each do |size, txes|
      $stdout.puts "#{size},#{txes.size}"
    end
  else
    $stdout.puts 'commit_size'
    total = self.tx_store.size
    self.tx_store.each_with_index do |tx, index|
      # trailing spaces + carriage return overwrite the previous progress line
      $stderr.print "Dumping commit sizes: #{index+1} of #{total}               \r"
      $stdout.puts tx.size
    end
  end
  $stderr.puts "DONE                                                 "
end
commits() click to toggle source
# File lib/evoc/analyze.rb, line 44
# Prints one JSON object per transaction (one per line, on $stdout) with:
#   sha                  - tx.id
#   index                - tx.index
#   num_changes          - number of items in the transaction
#   items_touched_so_far - number of distinct items seen up to and including
#                          this transaction
#   moving_average       - items processed so far divided by (tx.index + 1),
#                          rounded to two decimals
#
# The moving average is computed with float division; dividing the integers
# first would truncate the result before rounding.
def commits
  unique_items = Set.new
  changes_so_far = 0
  self.tx_store.each do |tx|
    unique_items.merge(tx.items)
    changes_so_far += tx.items.size
    data = {
      'sha' => tx.id,
      'index' => tx.index,
      'num_changes' => tx.items.size,
      'items_touched_so_far' => unique_items.size,
      # NOTE(review): assumes tx.index counts from 0 without gaps - confirm
      'moving_average' => (changes_so_far.to_f / (tx.index + 1)).round(2)
    }
    $stdout.puts data.to_json
  end
end
create_dict() click to toggle source
# File lib/evoc/analyze.rb, line 397
# Prints the id-to-name dictionary held in tx_store.int_2_name as CSV on
# stdout: an id,name header followed by one id,name line per entry.
def create_dict
  $stdout.puts 'id,name'
  self.tx_store.int_2_name.each { |id, name| $stdout.puts "#{id},#{name}" }
end
evolution() click to toggle source
# File lib/evoc/analyze.rb, line 31
# Prints, as CSV on stdout, how much each transaction overlaps with the
# transactions up to and including itself that share items with it.
#
# For every transaction, tx_store.transactions_of_list is asked (non-strict,
# by index) for the transactions touching any of its items. For each returned
# index that is not greater than the transaction's own index a row
# index,relevant_index,overlap is printed, where overlap is the number of
# shared items divided by the size of the current transaction, rounded to two
# decimals.
def evolution
  CSV { |csv| csv << %w(index relevant_index overlap) }
  self.tx_store.each do |tx|
    related = self.tx_store.transactions_of_list(tx.items, strict: false, identifier: :index)
    related.select { |candidate| candidate <= tx.index }.each do |earlier_index|
      earlier_tx = self.tx_store.get_tx(id: earlier_index, id_type: :index)
      shared_items = earlier_tx.items & tx.items
      overlap = (shared_items.size / tx.size.to_f).round(2)
      CSV { |csv| csv << [tx.index, earlier_index, overlap] }
    end
  end
end
file_frequency() click to toggle source

Prints a CSV-formatted table of the top N most frequent files. N is taken from opts[:top].

# File lib/evoc/analyze.rb, line 122
# Prints a CSV table (file,frequency) on stdout listing the opts[:top] items
# that occur in the most transactions, most frequent first. Item ids are
# translated to names through tx_store.int_2_name.
def file_frequency
  CSV { |csv| csv << %w[file frequency] } # header
  ranked = self.tx_store.items
               .map { |item, txes| [item, txes.size] }
               .sort_by { |_item, count| -count }
  ranked.take(self.opts[:top]).each do |item, count|
    name = self.tx_store.int_2_name[item]
    CSV { |csv| csv << [name, count] }
  end
end
measure_ranges() click to toggle source
# File lib/evoc/analyze.rb, line 222
# Prints, as CSV on stdout, the smallest and largest value every rule measure
# takes over a fixed set of hand-picked probability configurations.
#
# Each configuration assigns P(A), P(B) and P(A,B) to an Evoc::Rule with empty
# lhs/rhs. For every measure in Evoc::Rule.p_measures + Evoc::Rule.measures one
# row is printed with: the measure name, its declared range (taken from
# Evoc::InterestingnessMeasures for interestingness measures, "[0,1]"
# otherwise), the observed min and max, and the configurations producing them.
#
# Configurations for which a measure yields nil/false or NaN are ignored. A
# measure without any usable value is skipped with a note on $stderr instead
# of aborting the whole run.
def measure_ranges
  # Disabled exhaustive sweep, kept from the original for reference:
  #rules = []
  #ab_range = ([0.00000001]+(0.0..1).step(0.01).to_a+[0.99999999]).map(&:rationalize)
  #ab_range.each do |ab|
  #  ab_zero = ((ab == 0) ? 0.0000001 : 0)
  #  a_range = (ab+ab_zero..1).step(0.01).map(&:rationalize)
  #  b_range = (ab+ab_zero..1).step(0.01).map(&:rationalize)
  #  # construct all rules starting from A
  #  a_range.each do |a|
  #    new_b_range = (ab..1-a+ab).step(0.01).map(&:rationalize)
  #    new_b_range.each do |b|
  #      if (a == 0) | (b == 0)
  #        # a or b cant be 0
  #        next
  #      else
  #        r = Evoc::Rule.new(lhs: [], rhs: [])
  #        r.set_p('p_A',a)
  #        r.set_p('p_B',b)
  #        r.set_p('p_AB',ab)
  #        rules << r
  #      end
  #    end
  #  end
  #  # construct all rules starting from B
  #  b_range.each do |b|
  #    new_a_range = (ab..1-b+ab).step(0.01).map(&:rationalize)
  #    new_a_range.each do |a|
  #      r = Evoc::Rule.new(lhs: [], rhs: [])
  #      r.set_p('p_A',a)
  #      r.set_p('p_B',b)
  #      r.set_p('p_AB',ab)
  #      rules << r
  #    end
  #  end
  #end

  # Hand-picked configurations, each given as [P(A), P(B), P(A,B)].
  configurations = [
    # A and B never change together, but do change
    [1.to_r/4, 1.to_r/4, 0],
    # A and B never change together, and A almost never changes
    [1.to_r/1000000, 1.to_r/2, 0],
    # A and B never change together, and B almost never changes
    [1.to_r/2, 1.to_r/1000000, 0],
    # A and B never change together, but each changes half of the time
    [1.to_r/2, 1.to_r/2, 0],
    # A and B never change together, and both almost never change
    [1.to_r/1000000, 1.to_r/1000000, 0],
    # A and B always change together
    [1.to_r/3, 1.to_r/3, 1.to_r/3],
    # A and B always change together, but rarely change
    [1.to_r/1000000, 1.to_r/1000000, 1.to_r/1000000],
    # A and B always change together, always
    [1, 1, 1],
    # A always changes, B rarely changes (and never without A)
    [1, 1.to_r/10000000, 1.to_r/10000000],
    # B always changes, A rarely changes (and never without B)
    [1.to_r/10000000, 1, 1.to_r/10000000],
    # B always changes, A half of the time
    [1.to_r/2, 1, 1.to_r/2],
    # A always changes, B half of the time
    [1, 1.to_r/2, 1.to_r/2],
    # A and B always change together, almost always
    [0.99999999.rationalize, 0.99999999.rationalize, 0.99999999.rationalize],
    # A and B always change together, half of the time
    [0.5.rationalize, 0.5.rationalize, 0.5.rationalize],
    # A and B sometimes change together, equally frequent
    [0.4.rationalize, 0.4.rationalize, 0.1.rationalize],
    # A and B sometimes change together, B more frequent than A
    [1.to_r/3, 1.to_r/2, 1.to_r/4]
  ]

  rules = configurations.map do |a, b, ab|
    rule = Evoc::Rule.new(lhs: [], rhs: [])
    rule.set_p('p_A', a)
    rule.set_p('p_B', b)
    rule.set_p('p_AB', ab)
    rule
  end

  measures = (Evoc::Rule.p_measures + Evoc::Rule.measures).sort
  CSV {|csv| csv << [nil,nil,nil,nil,'min_configuration',nil,nil,'max_configuration',nil,nil]}
  CSV {|csv| csv << ['measure','current_range','min','max','P(A)','P(B)','P(A,B)','P(A)','P(B)','P(A,B)']} # print header
  measures.each do |m|
    observed = {}
    rules.each do |rule|
      measure = rule.get_measure(m)
      value = measure.is_a?(Evoc::InterestingnessMeasure) ? measure.value : measure
      next unless value
      numeric = value.to_f
      # NaN compares unequal to everything, itself included, so a check such
      # as `value != Float::NAN` never filters anything; nan? is required.
      next if numeric.nan?
      a = rule.p_A
      b = rule.p_B
      ab = rule.p_AB
      observed["#{a},#{b},#{ab}"] = { m: measure, v: numeric, a: a, b: b, ab: ab }
    end
    if observed.empty?
      $stderr.puts "No usable values for measure #{m}; skipping"
      next
    end
    lowest = observed.values.min_by { |entry| entry[:v] }
    highest = observed.values.max_by { |entry| entry[:v] }
    current_range =
      if lowest[:m].is_a?(Evoc::InterestingnessMeasure)
        "[#{Evoc::InterestingnessMeasures.get_min(m)},#{Evoc::InterestingnessMeasures.get_max(m)}]"
      else
        "[0,1]"
      end
    CSV do |csv| # print row
      csv << [m, current_range, lowest[:v], highest[:v],
              lowest[:a], lowest[:b], lowest[:ab],
              highest[:a], highest[:b], highest[:ab]]
    end
  end
end
measure_values() click to toggle source
# File lib/evoc/analyze.rb, line 199
# Prints, as CSV on stdout, the value of every rule measure for
# opts[:number] randomly generated rules.
#
# Each rule is drawn from a random transaction with more than one item: a
# random, non-empty proper subset of its items becomes the lhs and one of the
# remaining items the rhs. The header row lists the sorted measure names
# (Evoc::Rule.p_measures + Evoc::Rule.measures). A measure without a value is
# written as an empty cell.
def measure_values
  candidates = self.tx_store.select { |tx| tx.size > 1 }
  measure_names = (Evoc::Rule.p_measures + Evoc::Rule.measures).sort
  CSV { |csv| csv << measure_names } # print header
  self.opts[:number].times do
    picked = candidates.sample
    antecedent = picked.items.sample(rand(1..picked.size-1))
    consequent = (picked.items - antecedent).sample
    rule = Evoc::Rule.new(lhs: antecedent, rhs: consequent, tx_store: self.tx_store)
    cells = measure_names.map do |name|
      measure = rule.get_measure(name)
      value = measure.is_a?(Evoc::InterestingnessMeasure) ? measure.value : measure
      value ? value.to_f : nil
    end
    CSV { |csv| csv << cells } # print row
  end
end
median_changes_per_commit() click to toggle source

Added but not used, as the data is not interesting. –LM

# File lib/evoc/analyze.rb, line 161
# Median transaction size.
#
# @return [Float] the middle value of the sorted transaction sizes, or the
#   mean of the two middle values when their number is even
def median_changes_per_commit
  ordered = self.tx_store.map(&:size).sort
  count = ordered.length
  # for an odd count both positions refer to the same element
  lower_middle = ordered[(count - 1) / 2]
  upper_middle = ordered[count / 2]
  (lower_middle + upper_middle) / 2.0
end
num_commits() click to toggle source
# File lib/evoc/analyze.rb, line 111
# Number of transactions in the store, as reported by tx_store.size.
def num_commits
  tx_store.size
end
num_unique_changes() click to toggle source
# File lib/evoc/analyze.rb, line 151
# Number of distinct items appearing in any transaction of the store.
#
# NOTE(review): assumes each transaction's items form a flat list - confirm.
def num_unique_changes
  all_items = self.tx_store.flat_map(&:items)
  all_items.uniq.size
end
percent_method_changes_of_all_changes() click to toggle source

@return [Float] the percentage of changes that are method level

# File lib/evoc/analyze.rb, line 97
# Percentage of all changes that are not file-level changes.
#
# A change is file-level when its name (looked up through
# tx_store.int_2_name) either contains no ':' at all or ends in ":@residuals".
# Every item occurrence in every transaction counts as one change.
#
# @return [Float] the percentage of changes that are method level, rounded
#   to two decimals
def percent_method_changes_of_all_changes
  change_names = self.tx_store.flat_map do |tx|
    tx.items.map { |item| self.tx_store.int_2_name[item] }
  end
  file_level = change_names.count do |name|
    name =~ /^[^:]+?(\.[^:\/]+?)?(?::@residuals$|$)/
  end
  file_level_share = file_level.to_f / change_names.size
  ((1 - file_level_share) * 100).round(2)
end
time_span_in_years() click to toggle source
# File lib/evoc/analyze.rb, line 168
# Convenience wrapper: calls time_span with unit: 'years' and returns its
# result unchanged (no rounding is applied here).
def time_span_in_years
  time_span(unit: 'years')
end
uniqueness() click to toggle source
# File lib/evoc/analyze.rb, line 177
# Builds a histogram of how often "leave one item out" queries are matched by
# the history.
#
# For every transaction, each combination of all but one of its items is used
# as a query against tx_store.clone_with_subset(0, tx.index), and the number
# of transactions returned by transactions_of_list(query, true) is recorded.
# NOTE(review): whether the subset includes the transaction at tx.index, and
# what the positional `true` selects, depends on the store's API - confirm.
#
# @return [Hash] maps a hit count to the number of queries with that many hits
def uniqueness
  hit_histogram = {}

  self.tx_store.each do |tx|
    leave_one_out = tx.items.combination(tx.size - 1).to_a
    history = self.tx_store.clone_with_subset(0, tx.index)

    leave_one_out.each do |query|
      hits = history.transactions_of_list(query, true).size
      hit_histogram[hits] = hit_histogram.fetch(hits, 0) + 1
    end
  end

  hit_histogram
end

Private Instance Methods

average_time_between_commits(unit: 'hours') click to toggle source
# File lib/evoc/analyze.rb, line 412
# Average time between consecutive transactions: the time between the first
# and the last transaction's date, divided by the number of gaps between
# transactions.
#
# @param unit [String] suffix of the TimeDifference "in_<unit>" method used
#   to express the total time (e.g. 'hours', 'minutes')
# @return [Float] the average gap in the requested unit
# @raise [Exception] when the store holds fewer than two transactions
def average_time_between_commits(unit: 'hours')
  unless self.tx_store.size > 1
    raise Exception, "History only contained 1 or 0 transactions"
  end

  first_date = self.tx_store.first.date
  last_date = self.tx_store.last.date
  elapsed = TimeDifference.between(first_date, last_date).method('in_' + unit).call
  gaps = self.tx_store.size - 1
  elapsed.to_f / gaps
end
time_span(unit: 'years') click to toggle source
# File lib/evoc/analyze.rb, line 406
# Time between the first and the last transaction's date.
#
# @param unit [String] suffix of the TimeDifference "in_<unit>" method used
#   to express the result (e.g. 'years')
# @return whatever TimeDifference's in_<unit> method returns
def time_span(unit: 'years')
  first_date = self.tx_store.first.date
  last_date = self.tx_store.last.date
  difference = TimeDifference.between(first_date, last_date)
  difference.method('in_' + unit).call
end