module PgHero::Methods::SuggestedIndexes

Public Instance Methods

autoindex(create: false) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 78
def autoindex(create: false)
  suggested_indexes.each do |index|
    p index
    if create
      connection.execute("CREATE INDEX CONCURRENTLY ON #{quote_table_name(index[:table])} (#{index[:columns].map { |c| quote_table_name(c) }.join(",")})")
    end
  end
end
best_index(statement) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 87
def best_index(statement)
  best_index_helper([statement])[statement]
end
suggested_indexes(suggested_indexes_by_query: nil, **options) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 64
def suggested_indexes(suggested_indexes_by_query: nil, **options)
  indexes = []

  (suggested_indexes_by_query || self.suggested_indexes_by_query(**options)).select { |_s, i| i[:found] && !i[:covering_index] }.group_by { |_s, i| i[:index] }.each do |index, group|
    details = {}
    group.map(&:second).each do |g|
      details = details.except(:index).deep_merge(g)
    end
    indexes << index.merge(queries: group.map(&:first), details: details)
  end

  indexes.sort_by { |i| [i[:table], i[:columns]] }
end
suggested_indexes_by_query(queries: nil, query_stats: nil, indexes: nil) click to toggle source

TODO clean this mess

# File lib/pghero/methods/suggested_indexes.rb, line 9
def suggested_indexes_by_query(queries: nil, query_stats: nil, indexes: nil)
  best_indexes = {}

  if suggested_indexes_enabled?
    # get most time-consuming queries
    queries ||= (query_stats || self.query_stats(historical: true, start_at: 24.hours.ago)).map { |qs| qs[:query] }

    # get best indexes for queries
    best_indexes = best_index_helper(queries)

    if best_indexes.any?
      existing_columns = Hash.new { |hash, key| hash[key] = Hash.new { |hash2, key2| hash2[key2] = [] } }
      indexes ||= self.indexes
      indexes.group_by { |g| g[:using] }.each do |group, inds|
        inds.each do |i|
          existing_columns[group][i[:table]] << i[:columns]
        end
      end
      indexes_by_table = indexes.group_by { |i| i[:table] }

      best_indexes.each do |_query, best_index|
        if best_index[:found]
          index = best_index[:index]
          best_index[:table_indexes] = indexes_by_table[index[:table]].to_a

          # indexes of same type
          indexes = existing_columns[index[:using] || "btree"][index[:table]]

          if best_index[:structure][:sort].empty?
            # gist indexes without an opclass
            # (opclass is part of column name, so columns won't match if opclass present)
            indexes += existing_columns["gist"][index[:table]]

            # hash indexes work for equality
            indexes += existing_columns["hash"][index[:table]] if best_index[:structure][:where].all? { |v| v[:op] == "=" }

            # brin indexes work for all
            indexes += existing_columns["brin"][index[:table]]
          end

          covering_index = indexes.find { |e| index_covers?(e.map { |v| v.sub(/ inet_ops\z/, "") }, index[:columns]) }
          if covering_index
            best_index[:covering_index] = covering_index
            best_index[:explanation] = "Covered by index on (#{covering_index.join(", ")})"
          end
        end
      end
    end
  else
    raise NotEnabled, "Suggested indexes not enabled"
  end

  best_indexes
end
suggested_indexes_enabled?() click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 4
def suggested_indexes_enabled?
  defined?(PgQuery) && Gem::Version.new(PgQuery::VERSION) >= Gem::Version.new("0.9.0") && query_stats_enabled?
end

Private Instance Methods

best_index_helper(statements) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 93
def best_index_helper(statements)
  indexes = {}

  # see if this is a query we understand and can use
  parts = {}
  statements.each do |statement|
    parts[statement] = best_index_structure(statement)
  end

  # get stats about columns for relevant tables
  tables = parts.values.map { |t| t[:table] }.uniq
  # TODO get schema from query structure, then try search path
  schema = PgHero.connection_config(connection_model)[:schema] || "public"
  if tables.any?
    row_stats = Hash[table_stats(table: tables, schema: schema).map { |i| [i[:table], i[:estimated_rows]] }]
    col_stats = column_stats(table: tables, schema: schema).group_by { |i| i[:table] }
  end

  # find best index based on query structure and column stats
  parts.each do |statement, structure|
    index = {found: false}

    if structure[:error]
      index[:explanation] = structure[:error]
    elsif structure[:table].start_with?("pg_")
      index[:explanation] = "System table"
    else
      index[:structure] = structure

      table = structure[:table]
      where = structure[:where].uniq
      sort = structure[:sort]

      total_rows = row_stats[table].to_i
      index[:rows] = total_rows

      ranks = Hash[col_stats[table].to_a.map { |r| [r[:column], r] }]
      columns = (where + sort).map { |c| c[:column] }.uniq

      if columns.any?
        if columns.all? { |c| ranks[c] }
          first_desc = sort.index { |c| c[:direction] == "desc" }
          sort = sort.first(first_desc + 1) if first_desc
          where = where.sort_by { |c| [row_estimates(ranks[c[:column]], total_rows, total_rows, c[:op]), c[:column]] } + sort

          index[:row_estimates] = Hash[where.map { |c| ["#{c[:column]} (#{c[:op] || "sort"})", row_estimates(ranks[c[:column]], total_rows, total_rows, c[:op]).round] }]

          # no index needed if less than 500 rows
          if total_rows >= 500

            if ["~~", "~~*"].include?(where.first[:op])
              index[:found] = true
              index[:row_progression] = [total_rows, index[:row_estimates].values.first]
              index[:index] = {table: table, columns: ["#{where.first[:column]} gist_trgm_ops"], using: "gist"}
            else
              # if most values are unique, no need to index others
              rows_left = total_rows
              final_where = []
              prev_rows_left = [rows_left]
              where.reject { |c| ["~~", "~~*"].include?(c[:op]) }.each do |c|
                next if final_where.include?(c[:column])
                final_where << c[:column]
                rows_left = row_estimates(ranks[c[:column]], total_rows, rows_left, c[:op])
                prev_rows_left << rows_left
                if rows_left < 50 || final_where.size >= 2 || [">", ">=", "<", "<=", "~~", "~~*", "BETWEEN"].include?(c[:op])
                  break
                end
              end

              index[:row_progression] = prev_rows_left.map(&:round)

              # if the last indexes don't give us much, don't include
              prev_rows_left.reverse!
              (prev_rows_left.size - 1).times do |i|
                if prev_rows_left[i] > prev_rows_left[i + 1] * 0.3
                  final_where.pop
                else
                  break
                end
              end

              if final_where.any?
                index[:found] = true
                index[:index] = {table: table, columns: final_where}
              end
            end
          else
            index[:explanation] = "No index needed if less than 500 rows"
          end
        else
          index[:explanation] = "Stats not found"
        end
      else
        index[:explanation] = "No columns to index"
      end
    end

    indexes[statement] = index
  end

  indexes
end
best_index_structure(statement) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 196
def best_index_structure(statement)
  return {error: "Too large"} if statement.to_s.length > 10000

  begin
    tree = PgQuery.parse(statement).tree
  rescue PgQuery::ParseError
    return {error: "Parse error"}
  end
  return {error: "Unknown structure"} unless tree.size == 1

  tree = tree.first

  # pg_query 1.0.0
  tree = tree["RawStmt"]["stmt"] if tree["RawStmt"]

  table = parse_table(tree) rescue nil
  unless table
    error =
      case tree.keys.first
      when "InsertStmt"
        "INSERT statement"
      when "VariableSetStmt"
        "SET statement"
      when "SelectStmt"
        if (tree["SelectStmt"]["fromClause"].first["JoinExpr"] rescue false)
          "JOIN not supported yet"
        end
      end
    return {error: error || "Unknown structure"}
  end

  select = tree.values.first
  where = (select["whereClause"] ? parse_where(select["whereClause"]) : []) rescue nil
  return {error: "Unknown structure"} unless where

  sort = (select["sortClause"] ? parse_sort(select["sortClause"]) : []) rescue []

  {table: table, where: where, sort: sort}
end
column_stats(schema: nil, table: nil) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 310
      def column_stats(schema: nil, table: nil)
        select_all <<-SQL
          SELECT
            schemaname AS schema,
            tablename AS table,
            attname AS column,
            null_frac,
            n_distinct
          FROM
            pg_stats
          WHERE
            schemaname = #{quote(schema)}
            #{table ? "AND tablename IN (#{Array(table).map { |t| quote(t) }.join(", ")})" : ""}
          ORDER BY
            1, 2, 3
        SQL
      end
parse_sort(sort_clause) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 301
def parse_sort(sort_clause)
  sort_clause.map do |v|
    {
      column: v["SortBy"]["node"]["ColumnRef"]["fields"].last["String"]["str"],
      direction: v["SortBy"]["sortby_dir"] == 2 ? "desc" : "asc"
    }
  end
end
parse_table(tree) click to toggle source
# File lib/pghero/methods/suggested_indexes.rb, line 270
def parse_table(tree)
  case tree.keys.first
  when "SelectStmt"
    tree["SelectStmt"]["fromClause"].first["RangeVar"]["relname"]
  when "DeleteStmt"
    tree["DeleteStmt"]["relation"]["RangeVar"]["relname"]
  when "UpdateStmt"
    tree["UpdateStmt"]["relation"]["RangeVar"]["relname"]
  end
end
parse_where(tree) click to toggle source

TODO capture values

# File lib/pghero/methods/suggested_indexes.rb, line 282
def parse_where(tree)
  aexpr = tree["A_Expr"]

  if tree["BoolExpr"]
    if tree["BoolExpr"]["boolop"] == 0
      tree["BoolExpr"]["args"].flat_map { |v| parse_where(v) }
    else
      raise "Not Implemented"
    end
  elsif aexpr && ["=", "<>", ">", ">=", "<", "<=", "~~", "~~*", "BETWEEN"].include?(aexpr["name"].first["String"]["str"])
    [{column: aexpr["lexpr"]["ColumnRef"]["fields"].last["String"]["str"], op: aexpr["name"].first["String"]["str"]}]
  elsif tree["NullTest"]
    op = tree["NullTest"]["nulltesttype"] == 1 ? "not_null" : "null"
    [{column: tree["NullTest"]["arg"]["ColumnRef"]["fields"].last["String"]["str"], op: op}]
  else
    raise "Not Implemented"
  end
end
row_estimates(stats, total_rows, rows_left, op) click to toggle source

TODO better row estimation www.postgresql.org/docs/current/static/row-estimation-examples.html

# File lib/pghero/methods/suggested_indexes.rb, line 238
def row_estimates(stats, total_rows, rows_left, op)
  case op
  when "null"
    rows_left * stats[:null_frac].to_f
  when "not_null"
    rows_left * (1 - stats[:null_frac].to_f)
  else
    rows_left *= (1 - stats[:null_frac].to_f)
    ret =
      if stats[:n_distinct].to_f == 0
        0
      elsif stats[:n_distinct].to_f < 0
        if total_rows > 0
          (-1 / stats[:n_distinct].to_f) * (rows_left / total_rows.to_f)
        else
          0
        end
      else
        rows_left / stats[:n_distinct].to_f
      end

    case op
    when ">", ">=", "<", "<=", "~~", "~~*", "BETWEEN"
      (rows_left + ret) / 10.0 # TODO better approximation
    when "<>"
      rows_left - ret
    else
      ret
    end
  end
end