module Elasticsearch::Git::Repository

Constants

BLOBS_BATCH
COMMMITS_BATCH

Public Instance Methods

as_indexed_json(options = {})

Representation of the repository as indexed JSON. Attention: the resulting hash can be very, very large.

# File lib/elasticsearch/git/repository.rb, line 271
def as_indexed_json(options = {})
  data = {}
  data[:blobs] = index_blobs_array
  data[:commits] = index_commits_array
  data
end
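
A minimal usage sketch (not part of the gem): Repo is a hypothetical class that includes this module, and the repository path is an assumption.

class Repo
  include Elasticsearch::Git::Repository
end

repo = Repo.new
repo.repository_for_indexing('/path/to/repo.git')

data = repo.as_indexed_json
data[:blobs].length   # blob hashes from index_blobs_array
data[:commits].length # commit hashes from index_commits_array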
can_index_blob?(blob)

Indexes text-like files whose size is under 1 MB.

# File lib/elasticsearch/git/repository.rb, line 167
def can_index_blob?(blob)
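  # Blob must be text-like and smaller than 1 MiB (1_048_576 bytes).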
  blob.text? && (blob.size && blob.size.to_i < 1048576)
end
client_for_indexing()
# File lib/elasticsearch/git/repository.rb, line 389
def client_for_indexing
  @client_for_indexing ||= Elasticsearch::Client.new retry_on_failure: 5
end
delete_blob(blob)
# File lib/elasticsearch/git/repository.rb, line 122
def delete_blob(blob)
  return unless blob.text?
  {
    delete: {
      _index: "#{self.class.index_name}",
      _type: self.class.name.underscore,
      _id: "#{repository_id}_#{blob.path}",
      _parent: project_id
    }
  }
end
index_blob(blob, target_sha)
# File lib/elasticsearch/git/repository.rb, line 134
def index_blob(blob, target_sha)
  return unless can_index_blob?(blob)
  {
    index:  {
      _index: "#{self.class.index_name}",
      _type: self.class.name.underscore,
      _id: "#{repository_id}_#{blob.path}",
      _parent: project_id,
      data: {
        blob: {
          type: "blob",
          oid: blob.id,
          rid: repository_id,
          content: blob.data,
          commit_sha: target_sha,
          path: blob.path,

          # We're duplicating file_name parameter here because
          # we need another analyzer for it.
          # Ideally this should be done with copy_to: 'blob.file_name' option
          # but it does not work in ES v2.3.*. We're doing it so to not make users
          # install newest versions
          # https://github.com/elastic/elasticsearch-mapper-attachments/issues/124
          file_name: blob.path,

          language: blob.language ? blob.language.name : "Text"
        }
      }
    }
  }
end
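
The hashes returned by index_blob and delete_blob are single operations for the Elasticsearch bulk API. A sketch of wiring them up by hand, assuming repo includes this module and changed_blob / removed_blob are LiteBlob instances:

ops = []
ops << repo.index_blob(changed_blob, repo.repository_for_indexing.last_commit.oid)
ops << repo.delete_blob(removed_blob)
repo.perform_bulk(ops) # drops nil entries and issues a single bulk request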
index_blobs(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid) { |slice, length| ... }

Indexes all text-like blobs in the repository. A usage sketch follows the source below.

All data is stored in one global index. A repository's records can be selected by the 'rid' field; if you want, this field can be used to store a 'project' id.

blob {

  id - unique id of the blob across all repositories
  oid - blob id within the repository
  content - blob content
  commit_sha - sha of the latest indexed commit

}

To search blobs, use type 'blob'.

# File lib/elasticsearch/git/repository.rb, line 87
def index_blobs(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
  from, to = parse_revs(from_rev, to_rev)

  diff = repository_for_indexing.diff(from, to)
  deltas = diff.deltas

  deltas.reverse.each_slice(BLOBS_BATCH) do |slice|
    bulk_operations = slice.map do |delta|
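      # A file mode of 0160000 marks a gitlink (submodule) entry; skip those.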
      if delta.status == :deleted
        next if delta.old_file[:mode].to_s(8) == "160000"
        b = LiteBlob.new(repository_for_indexing, delta.old_file)
        delete_blob(b)
      else
        next if delta.new_file[:mode].to_s(8) == "160000"
        b = LiteBlob.new(repository_for_indexing, delta.new_file)
        index_blob(b, to)
      end
    end

    perform_bulk bulk_operations

    yield slice, deltas.length if block_given?
  end

  ObjectSpace.garbage_collect
end
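
A usage sketch, assuming old_sha and new_sha come from a push: the block receives each processed slice of deltas together with the total delta count, which makes progress reporting straightforward.

done = 0
repo.index_blobs(from_rev: old_sha, to_rev: new_sha) do |slice, total|
  done += slice.length
  puts "#{done}/#{total} changed files indexed"
end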
index_blobs_array()

Builds the array of blob hashes from the current git index (or from the HEAD tree for bare repositories).

# File lib/elasticsearch/git/repository.rb, line 279
def index_blobs_array
  result = []

  target_sha = repository_for_indexing.head.target.oid

  if repository_for_indexing.bare?
    tree = repository_for_indexing.lookup(target_sha).tree
    result.push(recurse_blobs_index_hash(tree))
  else
    repository_for_indexing.index.each do |blob|
      b = LiteBlob.new(repository_for_indexing, blob)
      result.push(
        {
          type: 'blob',
          id: "#{target_sha}_#{b.path}",
          rid: repository_id,
          oid: b.id,
          content: b.data,
          commit_sha: target_sha
        }
      ) if b.text?
    end
  end

  result
end
index_commit(commit)
# File lib/elasticsearch/git/repository.rb, line 216
def index_commit(commit)
  author    = commit.author
  committer = commit.committer

  {
    index:  {
      _index: "#{self.class.index_name}",
      _type: self.class.name.underscore,
      _id: "#{repository_id}_#{commit.oid}",
      _parent: project_id,
      data: {
        commit: {
          type: "commit",
          rid: repository_id,
          sha: commit.oid,
          author: {
            name: encode!(author[:name]),
            email: encode!(author[:email]),
            time: author[:time].strftime('%Y%m%dT%H%M%S%z'),
          },
          committer: {
            name: encode!(committer[:name]),
            email: encode!(committer[:email]),
            time: committer[:time].strftime('%Y%m%dT%H%M%S%z'),
          },
          message: encode!(commit.message)
        }
      }
    }
  }
end
index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid) { |batch, length| ... }

Indexes all commits in the repository. A usage sketch follows the source below.

All data is stored in one global index. A repository's records can be filtered by the 'rid' field; if you want, this field can be used to store a 'project' id.

commit {

  sha - commit sha
  author {
    name - commit author name
    email - commit author email
    time - commit time
  }
  committer {
    name - committer name
    email - committer email
    time - commit time
  }
  message - commit message

}

To search commits, use type 'commit'.

# File lib/elasticsearch/git/repository.rb, line 193
def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
  from, to = parse_revs(from_rev, to_rev)
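  # With a nil from, the range collapses to just to_rev, listing the full history.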
  range = [from, to].compact.join('..')
  out, err, status = Open3.capture3("git log #{range} --format=\"%H\"", chdir: repository_for_indexing.path)

  if status.success? && err.blank?
    #TODO use rugged walker!!!
    commit_oids = out.split("\n")

    commit_oids.each_slice(COMMMITS_BATCH) do |batch|
      bulk_operations = batch.map do |commit|
        index_commit(repository_for_indexing.lookup(commit))
      end

      perform_bulk bulk_operations

      yield batch, commit_oids.length if block_given?
    end

    ObjectSpace.garbage_collect
  end
end
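
A usage sketch mirroring index_blobs, assuming the same repo and revisions: the block receives each indexed batch of commit oids and the total number of commits.

repo.index_commits(from_rev: old_sha, to_rev: new_sha) do |batch, total|
  puts "indexed another #{batch.length} of #{total} commits"
end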
index_commits_array()

Walks all object ids in the repository and collects the commit objects.

# File lib/elasticsearch/git/repository.rb, line 332
def index_commits_array
  res = []

  repository_for_indexing.each_id do |oid|
    obj = repository_for_indexing.lookup(oid)
    if obj.type == :commit
      res.push(
        {
          type: 'commit',
          sha: obj.oid,
          author: obj.author,
          committer: obj.committer,
          message: encode!(obj.message)
        }
      )
    end
  end

  res
end
index_new_branch?(from)
# File lib/elasticsearch/git/repository.rb, line 265
def index_new_branch?(from)
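  # Git hooks pass the all-zero sha as the "from" rev of a newly created ref.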
  from == '0000000000000000000000000000000000000000'
end
parse_revs(from_rev, to_rev)
# File lib/elasticsearch/git/repository.rb, line 248
def parse_revs(from_rev, to_rev)
  from = if index_new_branch?(from_rev)
           if to_rev == repository_for_indexing.last_commit.oid
             nil
           else
             repository_for_indexing.merge_base(
               to_rev,
               repository_for_indexing.last_commit.oid
             )
           end
         else
           from_rev
         end

  return from, to_rev
end
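
A sketch of the two cases this handles (shas shortened for illustration):

# Ordinary push: from_rev is returned untouched.
parse_revs('abc123', 'def456')   # => ['abc123', 'def456']

# New branch: index from the merge base with the current HEAD so that only
# commits unique to the branch are picked up; if the branch tip is HEAD
# itself, from is nil and the whole history gets indexed.
parse_revs('0' * 40, 'def456')   # => [<merge base oid>, 'def456']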
path_to_repo()
# File lib/elasticsearch/git/repository.rb, line 370
def path_to_repo
  if @path_to_repo.blank?
    raise NotImplementedError, 'Please, define "path_to_repo" method, or set "path_to_repo" via "repository_for_indexing" method'
  else
    @path_to_repo
  end
end
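
As the error message suggests, either define the method in the including class or pass a path to repository_for_indexing. A sketch of the first option, with an assumed path:

class Repo
  include Elasticsearch::Git::Repository

  def path_to_repo
    @path_to_repo ||= '/path/to/repo.git'
  end
end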
perform_bulk(bulk_operations)
# File lib/elasticsearch/git/repository.rb, line 114
def perform_bulk(bulk_operations)
  bulk_operations.compact!

  return false if bulk_operations.empty?

  client_for_indexing.bulk body: bulk_operations
end
recurse_blobs_index_hash(tree, path = "")
# File lib/elasticsearch/git/repository.rb, line 306
def recurse_blobs_index_hash(tree, path = "")
  result = []

  tree.each_blob do |blob|
    blob[:path] = path + blob[:name]
    b = LiteBlob.new(repository_for_indexing, blob)
    result.push(
      {
        type: 'blob',
        id: "#{repository_for_indexing.head.target.oid}_#{path}#{blob[:name]}",
        rid: repository_id,
        oid: b.id,
        content: b.data,
        commit_sha: repository_for_indexing.head.target.oid
      }
    ) if b.text?
  end

  tree.each_tree do |nested_tree|
    result.push(recurse_blobs_index_hash(repository_for_indexing.lookup(nested_tree[:oid]), "#{nested_tree[:name]}/"))
  end

  result.flatten
end
repository_for_indexing(repo_path = nil)
# File lib/elasticsearch/git/repository.rb, line 379
def repository_for_indexing(repo_path = nil)
  return @rugged_repo_indexer if defined? @rugged_repo_indexer

  @path_to_repo ||= repo_path || path_to_repo

  set_repository_id

  @rugged_repo_indexer = Rugged::Repository.new(@path_to_repo)
end
repository_id()

Intended to be overridden.

# File lib/elasticsearch/git/repository.rb, line 365
def repository_id
  @repository_id
end
set_repository_id(id = nil)

The repository id is used to distinguish data coming from different repositories. Update this value if needed.

# File lib/elasticsearch/git/repository.rb, line 360
def set_repository_id(id = nil)
  @repository_id = id || path_to_repo
end
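
A sketch of swapping the default path-based id for a stable numeric one, where project is a hypothetical record exposing a path and an id:

repo = Repo.new
repo.repository_for_indexing(project.repository_path)
repo.set_repository_id(project.id)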