class DocDiff

Character String module. To use, include to String, or extend String. 2003- Hisashi MORITA

Data class reduces input for diff and convert alphabet to Integer.

It reduces input by removing common prefix, suffix and unique elements.

So, reduced input has following properties:

Contours

Contours is based on the algorithm which is presented by Claus Rick.

I made two optimizations (for long LCS):

References

Claus2000

Claus Rick,

Simple and Fast Linear Space Computation of Longest Common Subsequences, Information Processing Letters, Vol. 75/6, 275 - 281, Elsevier (2000)

Claus1995

Claus Rick,

A New Flexible Algorithm for the Longest Common Subsequence Problem, Proceedings of the 6th Symposium on Combinatorial Pattern Matching (CPM'95), Lecture Notes in Computer Science, Vol. 937, 340 - 351, Springer Verlag (1995) Also in Nordic Journal of Computing (NJC), Vol. 2, No. 4, Winter 1995, 444 - 461. web.informatik.uni-bonn.de/IV/Mitarbeiter/rick/lcs.dvi.Z

Diff::ShortestPath uses the algorithm described in following paper.

Wu1990

Sun Wu, Udi Manber, Gene Myers and Webb Miller,

An O(NP) Sequence Comparison Algorithm, Information Processing Letters 35, 1990, 317-323

English ASCII encoding module for CharString 2003- Hisashi MORITA

Japanese EUC-JP encoding module for CharString 2003- Hisashi MORITA

Japanese Shift_JIS encoding module for CharString 2003- Hisashi MORITA

Japanese UTF-8 encoding module for CharString 2003- Hisashi MORITA

Constants

AltUserConfigFileName
AppVersion
Author
License
SystemConfigFileName
UserConfigFileName

Attributes

config[RW]

Public Class Methods

new() click to toggle source
# File lib/doc_diff.rb, line 19
def initialize()
  @config = {}
end
parse_config_file_content(content) click to toggle source
# File lib/doc_diff.rb, line 24
def DocDiff.parse_config_file_content(content)
  result = {}
  return result if content.size <= 0
  lines = content.dup.split(/\r\n|\r|\n/).compact
  lines.collect!{|line| line.sub(/#.*$/, '')}
  lines.collect!{|line| line.strip}
  lines.delete_if{|line| line == ""}
  lines.each{|line|
    raise 'line does not include " = ".' unless /[\s]+=[\s]+/.match line
    name_src, value_src = line.split(/[\s]+=[\s]+/)
    raise "Invalid name: #{name_src.inspect}" if (/\s/.match name_src)
    raise "Invalid value: #{value_src.inspect}" unless value_src.kind_of?(String)
    name  = name_src.intern
    value = value_src
    value = true if ['on','yes','true'].include? value_src.downcase
    value = false if ['off','no','false'].include? value_src.downcase
    value = value_src.to_i if /^[0-9]+$/.match value_src
    result[name] = value
  }
  result
end

Public Instance Methods

anatomize_classic(src) click to toggle source
# File lib/viewdiff.rb, line 94
def anatomize_classic(src)
  self.extend ClassicDiff
  diffed = []
  src_encoding = CharString.guess_encoding(src)
  src_eol = CharString.guess_eol(src)
  src.scan(Regexp.new(elements, Regexp::MULTILINE)){|m|
    case
    when /\A[0-9]/.match(m) then # hunk
      diffed.concat(anatomize_classic_hunk(m, src_encoding, src_eol))
    else # not hunk
      diffed.concat(Difference.new(m.split(/^/), m.split(/^/)))
    end
  }
  return diffed
end
anatomize_classic_hunk(a_hunk, src_encoding, src_eol) click to toggle source
# File lib/viewdiff.rb, line 110
def anatomize_classic_hunk(a_hunk, src_encoding, src_eol)
  self.extend ClassicDiff
  diffed = []
  a_hunk.scan(/(#{hunk_header})(#{change}|#{del}+|#{add}+)/){|n|
    head, body = [$1, $2].collect{|e|
      e.extend(CharString)
      e.encoding, e.eol = src_encoding, src_eol
      e
    }
    diffed.concat(Difference.new(head.to_words, head.to_words))
    case
    when /d/.match(head) # del
      diffed.concat(Difference.new(body.to_words, []))
    when /a/.match(head) # add
      diffed.concat(Difference.new([], body.to_words))
    when /c/.match(head) # change (need tweak)
      former, latter = body.split(/#{sep}/).collect{|e|
        e.extend(CharString)
        e.encoding, e.eol = src_encoding, src_eol
        e
      }
      d = Difference.new(former.to_words, latter.to_words)
      diffed_former = d.former_only
      diffed_latter = d.latter_only
      sepstr = /#{sep}/.match(body).to_s.extend(CharString)
      sepstr.encoding, sepstr.eol = src_encoding, src_eol
      sepelm = Difference.new(sepstr.to_words, sepstr.to_words)
      diffed.concat(diffed_former + sepelm + diffed_latter)
    else
      raise "invalid hunk header: #{head}"
    end
  }
  return diffed
end
anatomize_context(src) click to toggle source
# File lib/viewdiff.rb, line 184
def anatomize_context(src)
  self.extend ContextDiff
  diffed = []
  src_encoding = CharString.guess_encoding(src)
  src_eol = CharString.guess_eol(src)
  src.scan(/#{elements}/m){|m|
    case
    when /\A\*{10,}#{eol}^\*{3} /.match(m) then # hunk
      diffed.concat(anatomize_context_hunk(m, src_encoding, src_eol))
    else # not hunk
      m.extend(CharString)
      m.encoding, m.eol = src_encoding, src_eol
      diffed.concat(Difference.new(m.to_words, m.to_words))
    end
  }
  return diffed
end
anatomize_context_hunk(a_hunk, src_encoding, src_eol) click to toggle source
# File lib/viewdiff.rb, line 202
def anatomize_context_hunk(a_hunk, src_encoding, src_eol)
  self.extend ContextDiff
  diffed = []
  h, sh_f, body_f, sh_l, body_l = nil
  a_hunk.scan(/(#{hunk_header})(#{hunk_subheader_former})(.*?)(#{hunk_subheader_latter})(.*?)\z/m){|m|
    h, sh_f, body_f, sh_l, body_l = m[0..4].collect{|he|
      if he
        he.extend(CharString)
        he.encoding, he.eol = src_encoding, src_eol
      end
      he
    }
  }
  diffed_former, diffed_latter = anatomize_context_hunk_scanbodies(body_f, body_l, src_encoding, src_eol)
  diffed.concat(Difference.new(h.to_words, h.to_words) +
                Difference.new(sh_f.to_words, sh_f.to_words) +
                diffed_former +
                Difference.new(sh_l.to_words, sh_l.to_words) +
                diffed_latter)
  return diffed
end
anatomize_context_hunk_scanbodies(body_f, body_l, src_encoding, src_eol) click to toggle source
# File lib/viewdiff.rb, line 224
def anatomize_context_hunk_scanbodies(body_f, body_l, src_encoding, src_eol)
  body_f = '' if body_f.nil?
  body_l = '' if body_l.nil?
  self.extend ContextDiff
  changes_org = [[], []]
  changes_org[0], changes_org[1] = [body_f, body_l].collect{|b|
    b.scan(/#{change}+/).collect{|ch|
      if ch
        ch.extend(CharString)
        ch.encoding, ch.eol = src_encoding, src_eol
      end
      ch
    }
  }
  changes = changes_org.dup
  diffed = [[], []]
  [body_f, body_l].each_with_index{|half, i|
    changes[0], changes[1] = changes_org[0].dup, changes_org[1].dup
    half.scan(/(#{del}+)|(#{add}+)|(#{change}+)|(#{misc}+)/m){|elm|
      elm_d, elm_a, elm_c, elm_cmn = elm[0..3]
      [elm_d, elm_a, elm_c, elm_cmn].collect{|e|
        if e
          e.extend(CharString)
          e.encoding, e.eol = src_encoding, src_eol
        end
        e
      }
      case
      when elm_d then d = Difference.new(elm_d.to_words, [])
      when elm_a then d = Difference.new([], elm_a.to_words)
      when elm_c then d = Difference.new(changes[0].shift.to_words, changes[1].shift.to_words)
        case i
        when 0 then d = d.former_only
        when 1 then d = d.latter_only
        else raise
        end
      when elm_cmn then d = Difference.new(elm_cmn.to_words, elm_cmn.to_words)
      else
        raise "bummers!"
      end
      diffed[i].concat(d)
    } # end half.scan
  } # end each_with_index
  return diffed
end
anatomize_unified(src) click to toggle source
# File lib/viewdiff.rb, line 306
def anatomize_unified(src)
  self.extend UnifiedDiff
  diffed = []
  src_encoding = CharString.guess_encoding(src)
  src_eol = CharString.guess_eol(src)
  src.scan(/#{elements}/m){|m|
    case
    when /\A@@ /.match(m) then # hunk
      diffed.concat(anatomize_unified_hunk(m.to_s, src_encoding, src_eol))
    else # not hunk
      m.extend(CharString)
      m.encoding, m.eol = src_encoding, src_eol
      diffed.concat(Difference.new(m.to_words, m.to_words))
    end
  }
  return diffed
end
anatomize_unified_hunk(a_hunk, src_encoding, src_eol) click to toggle source
# File lib/viewdiff.rb, line 324
def anatomize_unified_hunk(a_hunk, src_encoding, src_eol)
  self.extend UnifiedDiff
  diffed = []
  a_hunk.scan(/(#{hunk_header})(#{any}+#{eol}?)/m){|m|
    head, body = m[0], m[1]
    [head, body].collect{|e|
      e.extend(CharString)
      e.encoding, e.eol = src_encoding, src_eol
    }
    diffed.concat(Difference.new(head.to_words, head.to_words))
    body.scan(/(#{del}+)(#{add}+)|(#{del}+#{eol}?)|(#{add}+)|(#{common}+#{eol}?)|(.*#{eol}?)/m){|m|
      cf, cl, d, a, cmn, msc = m[0..5]
      [cf, cl, d, a, cmn, msc].collect{|e|
        next if e.nil?
        e.extend(CharString)
        e.encoding, e.eol = src_encoding, src_eol
      }
      case
      when (cf and cl) then
        Difference.new(cf.to_words, cl.to_words).each{|e|
          case e.first
          when :change_elt     then diffed << [:change_elt, e[1], nil]
                                    diffed << [:change_elt, nil, e[2]]
          when :del_elt        then diffed << [:change_elt, e[1], nil]
          when :add_elt        then diffed << [:change_elt, nil, e[2]]
          when :common_elt_elt then diffed << e
          else raise "bummers! (#{e.inspect})"
          end
        }
      when d           then diffed.concat(Difference.new(d.to_words, []))
      when a           then diffed.concat(Difference.new([], a.to_words))
      when cmn         then diffed.concat(Difference.new(cmn.to_words, cmn.to_words))
      when msc         then diffed.concat(Difference.new(msc.to_words, msc.to_words))
      else raise "bummers! (#{m.inspect})"
      end
    }
  }
  return diffed
end
compare_by_line(doc1, doc2) click to toggle source
# File lib/doc_diff.rb, line 46
def compare_by_line(doc1, doc2)
  Difference.new(doc1.split_to_line, doc2.split_to_line)
end
compare_by_line_word(doc1, doc2) click to toggle source
# File lib/doc_diff.rb, line 50
def compare_by_line_word(doc1, doc2)
  lines = compare_by_line(doc1, doc2)
  words = Difference.new
  lines.each{|line|
    if line.first == :change_elt
      before_change = Document.new(line[1].join,
                                   doc1.encoding, doc1.eol)
      after_change  = Document.new(line[2].join,
                                   doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_word,
                     after_change.split_to_word).each{|word|
        words << word
      }
    else  # :common_elt_elt, :del_elt, or :add_elt
      words << line
    end
  }
  words
end
compare_by_line_word_char(doc1, doc2) click to toggle source

i know this implementation of recursion is so lame…

# File lib/doc_diff.rb, line 71
def compare_by_line_word_char(doc1, doc2)
  lines = compare_by_line(doc1, doc2)
  lines_and_words = Difference.new
  lines.each{|line|
    if line.first == :change_elt
      before_change = Document.new(line[1].join,
                                   doc1.encoding, doc1.eol)
      after_change  = Document.new(line[2].join,
                                   doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_word,
                     after_change.split_to_word).each{|word|
        lines_and_words << word
      }
    else  # :common_elt_elt, :del_elt, or :add_elt
      lines_and_words << line
    end
  }
  lines_words_and_chars = Difference.new
  lines_and_words.each{|line_or_word|
    if line_or_word.first == :change_elt
      before_change = Document.new(line_or_word[1].join, doc1.encoding, doc1.eol)
      after_change  = Document.new(line_or_word[2].join, doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_char, after_change.split_to_char).each{|char|
        lines_words_and_chars << char
      }
    else  # :common_elt_elt, :del_elt, or :add_elt
      lines_words_and_chars << line_or_word
    end
  }
  lines_words_and_chars
end
process_config_file(filename) click to toggle source
# File lib/doc_diff.rb, line 160
def process_config_file(filename)
  file_content = nil
  begin
    File.open(filename, "r"){|f| file_content = f.read}
  rescue Errno::ENOENT
    message = "config file not found so not read."
  ensure
    if file_content != nil
      self.config.update(DocDiff.parse_config_file_content(file_content))
    end
  end
  message
end
run(doc1, doc2, option) click to toggle source
# File lib/doc_diff.rb, line 103
def run(doc1, doc2, option)
  raise "option is nil" if option.nil?
  raise "option[:resolution] is nil" if option[:resolution].nil?
  raise "option[:format] is nil" if option[:format].nil?
  case
  when doc1.class == Document && doc2.class == Document # OK
  when doc1.encoding != nil && doc2.encoding != nil     # OK
  when doc1.encoding == doc2.encoding && doc1.eol == doc2.eol # OK
  else
    raise("Error!  Blame the author (doc1: #{doc1.encoding}, #{doc1.eol}, doc2: #{doc2.encoding}, #{doc2.eol}).")
  end

  case option[:resolution]
  when "line"; then difference = compare_by_line(doc1, doc2)
  when "word"; then difference = compare_by_line_word(doc1, doc2)
  when "char"; then difference = compare_by_line_word_char(doc1, doc2)
  else
    raise "Unsupported resolution: #{option[:resolution].inspect}"
  end
  view = View.new(difference, doc1.encoding, doc1.eol)
  user_tags = {:start_common        => (@config[:tag_common_start] ||= ''),
               :end_common          => (@config[:tag_common_end] ||= ''),
               :start_del           => (@config[:tag_del_start] ||= ''),
               :end_del             => (@config[:tag_del_end] ||= ''),
               :start_add           => (@config[:tag_add_start] ||= ''),
               :end_add             => (@config[:tag_add_end] ||= ''),
               :start_before_change => (@config[:tag_change_before_start] ||= ''),
               :end_before_change   => (@config[:tag_change_before_end] ||= ''),
               :start_after_change  => (@config[:tag_change_after_start] ||= ''),
               :end_after_change    => (@config[:tag_change_after_end] ||= '')}
  case option[:digest]
  when true
    case option[:format]
    when "tty";      then result = view.to_tty_digest(option)
    when "html";     then result = view.to_html_digest(option)
    when "manued";   then result = view.to_manued_digest(option)
    when "wdiff";    then result = view.to_wdiff_digest(option)
    when "stat";     then result = view.to_stat(option)
    when "user";     then result = view.to_user_digest(user_tags)
    else
      raise "Unsupported output format: #{option[:format].inspect}."
    end
  when false
    case option[:format]
    when "tty";      then result = view.to_tty(option)
    when "html";     then result = view.to_html(option)
    when "manued";   then result = view.to_manued(option)
    when "wdiff";    then result = view.to_wdiff(option)
    when "stat";     then result = view.to_stat(option)
    when "user";     then result = view.to_user(user_tags)
    else
      raise "Unsupported output format: #{option[:format].inspect}."
    end
  end
  result.join
end
scan_text_for_diffs(src) click to toggle source
# File lib/viewdiff.rb, line 20
def scan_text_for_diffs(src)
  eol = "(?:\r\n|\n|\r)"
  pats = {
    :classic => "(?:[0-9]+(?:,[0-9]+)?[dac][0-9]+(?:,[0-9]+)?#{eol}.+?(?=^[^-<>0-9 ]))",
    :context => "(?:^\\*{3} .+?#{eol}--- .+?#{eol}.+?(?=^[^-+! *]|\\z))",
    :unified => "(?:^--- .+?#{eol}^\\+{3} .+?#{eol}.+?(?=^[^-+ @]|\\z))"
  }
  src.scan(/(?:#{pats.values.join("|")})|(?:.*?#{eol}+)/m)
end