class Vcs2Json::Git

Constants

FIELD_SEP

Generate separators between fields and commits

META_DATA

Attributes

case_id[RW]
fine_grained[RW]
ignore[R]
number[RW]

Public Class Methods

new(opts) click to toggle source
# File lib/vcs2json/git.rb, line 21
# Configures the extractor from an options hash.
#
# Reads the keys :case_id, :ignore, :before, :after, :number, :fine_grained,
# :logger_location, :logger_level, :ignore_comments, :ignore_whitespace and
# :residuals from +opts+.
def initialize(opts)
  # Order matters: case_id has to be assigned before ignore, because the
  # ignore= writer uses the id to look up its entry in the ignore file.
  %i[case_id ignore before after number fine_grained].each do |option|
    send("#{option}=", opts[option])
  end

  # Logger configuration
  Logging.set_location(opts[:logger_location])
  Logging.set_level(opts[:logger_level])

  # SrcML configuration
  SrcML.ignore_comments = opts[:ignore_comments]
  SrcML.ignore_whitespace = opts[:ignore_whitespace]
  SrcML.residuals = opts[:residuals]

  # Nothing more to verify unless fine grained extraction was requested
  return unless fine_grained

  # Probe for the srcml executable; fall back to file level when it is missing
  begin
    Open3.capture3("srcml --version")
  rescue Errno::ENOENT
    $stderr.puts "SrcML is required for fine grained change history extraction, please install from www.srcml.com"
    $stderr.puts "Defaulting to file level"
    self.fine_grained = false
  end
end

Public Instance Methods

after() click to toggle source
# File lib/vcs2json/git.rb, line 63
# Returns the lower date bound formatted as a git command line flag.
#
# @return [String] '' when no date is stored in @after, otherwise
#   the stored value wrapped as --after="<value>"
def after
  return '' if @after.nil?

  "--after=\"#{@after}\""
end
after=(after) click to toggle source
# File lib/vcs2json/git.rb, line 51
# Stores the lower date bound after checking that it parses as a date.
#
# A nil argument leaves the current value untouched. A value that
# Date.parse rejects is reported on STDERR and the bound is reset to nil.
#
# @param after [String, nil] the date to store
def after=(after)
  return if after.nil?

  # Date.parse only validates here; the original string is what gets stored
  Date.parse(after)
  @after = after
rescue StandardError
  STDERR.puts "Invalid date --after=#{after}. Ignoring option."
  @after = nil
end
before() click to toggle source
# File lib/vcs2json/git.rb, line 79
# Returns the upper date bound formatted as a git command line flag.
#
# @return [String] '' when no date is stored in @before, otherwise
#   the stored value wrapped as --before="<value>"
def before
  return '' if @before.nil?

  "--before=\"#{@before}\""
end
before=(before) click to toggle source
# File lib/vcs2json/git.rb, line 67
# Stores the upper date bound after checking that it parses as a date.
#
# A nil argument leaves the current value untouched. A value that
# Date.parse rejects is reported on STDERR and the bound is reset to nil.
#
# @param before [String, nil] the date to store
def before=(before)
  return if before.nil?

  # Date.parse only validates here; the original string is what gets stored
  Date.parse(before)
  @before = before
rescue StandardError
  STDERR.puts "Invalid date --before=#{before}. Ignoring option."
  @before = nil
end
ignore=(path) click to toggle source
# File lib/vcs2json/git.rb, line 232
# Loads the list of files to ignore for the current case id.
#
# Candidate ignore files are tried in this order and the first one that
# exists is used: +path+ (when given), .evocignore in the current working
# directory, .evocignore in the user's home directory. The file is parsed as
# YAML and the entry stored under +case_id+ becomes the ignore list.
# Progress and problems are reported on STDERR; in every failure case the
# ignore list ends up empty.
#
# @param path [String, nil] explicit ignore file location, tried before the defaults
# @return [Array] the ignore list that was stored in @ignore
def ignore=(path)
  # File.exist? does not expand "~", so the home location is resolved explicitly
  home_ignore = begin
    File.expand_path('~/.evocignore')
  rescue ArgumentError
    nil # home directory could not be determined, skip this location
  end
  default_locations = ["#{Dir.pwd}/.evocignore", home_ignore].compact
  paths = (path.nil? ? default_locations : [path] + default_locations)
  ignore = []

  # first existing candidate wins
  found = paths.find { |candidate| File.exist?(candidate) }

  if found.nil?
    STDERR.puts ".evocignore not found. Tried #{paths}. All files will be used."
  else
    STDERR.puts "Loading files to ignore from #{found}"
    if self.case_id.nil?
      STDERR.puts "Id in .evocignore not specified, not ignoring any files."
    else
      # Block form closes the file handle once parsing is done.
      # NOTE(review): YAML.load can instantiate arbitrary objects on older
      # Psych versions; consider YAML.safe_load if this file may be untrusted.
      ignore_file = File.open(found) { |io| YAML.load(io) }

      # An empty or scalar-only file does not parse to a Hash; treat it as
      # "id not found" instead of failing on a missing #key? method.
      if ignore_file.is_a?(Hash) && ignore_file.key?(self.case_id)
        entries = ignore_file[self.case_id]
        unless entries.nil?
          # Array() also accepts a single scalar entry
          ignore = Array(entries)
          STDERR.puts "Ignoring #{ignore.size} files"
        end
      else
        STDERR.puts "The id: '#{self.case_id}' not found in #{found}"
      end
    end
  end

  @ignore = ignore
  return @ignore
end
parse() click to toggle source
# File lib/vcs2json/git.rb, line 83
# Extracts the change history from the git repository in the current
# directory and prints one JSON object per commit to $stdout.
#
# Commit ids are listed with `git rev-list` (merges excluded, restricted by
# the before/after flags). For each commit the changed files are collected,
# ignored files are dropped, the commit metadata is fetched and cleaned to
# UTF-8, and a hash with :sha, :name, :email, :date, :committer_name,
# :committer_email, :committer_date, :message and :changes is emitted.
# With fine_grained enabled, :changes holds the entries reported by SrcML,
# falling back to the file name when SrcML cannot handle the file.
# Commits that end up without changes are skipped and their ids are listed
# on STDERR at the end. Processing stops once +number+ commits were printed.
#
# @raise [EncodingError] when the commit metadata cannot be encoded as UTF-8
def parse

  # keeps track of number of commits successfully parsed
  commit_counter = 0

  # keeps track of empty commits
  empty_commits = []

  ##########################
  # GET LIST OF COMMIT IDS #
  ##########################

  # getting the list of revision ids is cheap, so we get some extra in case we are unable to parse the required amount in the first 'n' commits
  commit_ids = `git rev-list HEAD #{self.before} #{self.after} -n #{self.number*10} --no-merges`.split

  ############################
  # ITERATE OVER EACH COMMIT #
  ############################

  commit_ids.each do |id|
    logger.debug "Parsing commit: #{id}"
    # get the changed files as [status, file_name] pairs
    changed_files = `git log --pretty=format:'' --name-status #{id} -n 1`.split("\n")
                      .map {|line| line.split(/(^[AMD])\s+/).delete_if {|e| e.empty?}}

    # remove ignored files
    changed_files.reject! {|file| 
      if self.ignore.include?(file[1])
       logger.debug "[IGNOREDEBUG] Ignored #{file[1]} in commit #{id}" 
       true
      else
        false
      end
    }

    # add files changed info
    if !changed_files.empty?

      ##################
      # FETCH METADATA #
      ##################

      raw_commit = `git log --pretty=format:'#{META_DATA}' #{id} -n 1`
      commit = ''

      ##################
      # CLEAN RAW DATA #
      ##################

      begin
        # try encoding to utf8
        commit = raw_commit.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        # need to explicitly check if the encoding is valid for ruby <= 2.0
        # utf8 -> utf8 will not do anything even with invalid bytes
        # http://stackoverflow.com/questions/24036821/ruby-2-0-0-stringmatch-argumenterror-invalid-byte-sequence-in-utf-8
        if !commit.valid_encoding?
          # encode to utf16 first and then back to utf8
          commit.encode!("UTF-16be", invalid: :replace, undef: :replace, :replace=>'')
          commit.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
        end
      rescue ArgumentError
        raise EncodingError, "Unable to encode input as UTF-8"
      end

      ##############################
      # CONSTRUCT OUTPUT HASH/JSON #
      ##############################

      output_hash = Hash.new
      fields = commit.split(FIELD_SEP)
      sha = fields[0].delete("\n") #remove astray newlines
      output_hash[:sha]            = sha
      output_hash[:name]           = fields[1]
      output_hash[:email]          = fields[2]
      output_hash[:date]           = Time.parse fields[3]
      output_hash[:committer_name] = fields[4]
      output_hash[:committer_email]= fields[5]
      output_hash[:committer_date] = Time.parse fields[6]
      output_hash[:message]        = fields[7]
      output_hash[:changes] = []

      #######################################
      # PARSE FILES FOR FINEGRAINED CHANGES #
      #######################################

      changed_files.each_with_index do |(status,file_name),index|
        # print progress (carriage return keeps it on a single terminal line)
        STDERR.print "Parsing file #{index+1} Of #{changed_files.size} in commit #{commit_counter+1} of #{self.number}                  \r"
        if (status.nil? || file_name.nil? || status.empty? || file_name.empty?)
            # ignoring entries that did not split into a status and a file name
        else
          # add finer grained change info
          if self.fine_grained
            begin
              # new file, all methods are new, no need to calculate diff
              if status == 'A'
                SrcML.methods(file_name,revision: id).keys.each {|m| output_hash[:changes] << m}
              # calculate diffs
              else
                SrcML.changed_methods_git(file_name,id).each {|m| output_hash[:changes] << m}
              end
            rescue SrcML::UnsupportedLanguageError, SrcML::ParseError
              # fall back to file level for this file
              output_hash[:changes] << file_name
            end
          else
            output_hash[:changes] << file_name
          end
        end
      end # changed_files.each_with_index

      # Only add commits where at least one change was detected
      if !output_hash[:changes].empty?
        ###########################
        # PRINT COMMIT TO $stdout #
        ###########################

        $stdout.puts output_hash.to_json

        # increase counter for number of commits successfully parsed
        commit_counter += 1

        ########################################
        # CHECK IF REQUESTED AMOUNT IS REACHED #
        ########################################

        if commit_counter == self.number
          break # out of loop
        end
      else # no changes detected in commit
        empty_commits << id
      end
    else # no files in commit
      empty_commits << id
    end
  end 

  # we may still lack commits after exhaustive search, notify user
  # (this is only reached without an early break, so every listed id was examined)
  if commit_counter < self.number
    STDERR.puts "Asked for #{self.number} commits, only found #{commit_counter} non-empty commits in the last #{commit_ids.size} commits"
  end
  # print ids of empty commits to stderr
  if !empty_commits.empty?
    STDERR.puts "EMPTY COMMITS"
    STDERR.puts empty_commits
  end
end