module ApacheLogReport

Constants

VERSION

Public Class Methods

analyze_data(db, options = {}) click to toggle source

Take a SQLite3 database and analyze its data.

# File lib/apache_log_report.rb, line 169
  # Run the report queries against the LogLine table of +db+ and keep the
  # results in instance variables (@total_hits, @daily_distribution, ...),
  # which are the ones +emit+ interpolates into the report.
  #
  # db      - object answering +execute(sql)+ with an array of rows.
  # options - Hash of filters: :from_date, :to_date, :only_crawlers,
  #           :ignore_crawlers, :no_selfpoll (the historical spelling
  #           :no_selfpolls is accepted too) and :limit (rows kept in the
  #           "top N" tables; 30 when missing).
  #
  # Returns the rows of the last query (the referers table).
  def self.analyze_data db, options = {}
    # Unfiltered figures describing the whole log.
    @first_day      = db.execute "SELECT datetime from LogLine order by datetime limit 1"
    @last_day       = db.execute "SELECT datetime from LogLine order by datetime desc limit 1"
    @log_size       = db.execute "SELECT count(datetime) from LogLine"
    @crawlers_size  = db.execute "SELECT count(datetime) from LogLine where bot == 1"
    @selfpolls_size = db.execute "SELECT count(datetime) from LogLine where ip == '::1'"

    # A missing limit would otherwise produce "limit " followed by nothing,
    # which is invalid SQL.
    limit = (options[:limit] || 30).to_i

    # date(datetime) yields 'YYYY-MM-DD' and is compared as text, so the
    # bounds must have the same shape. Interpolating a DateTime directly
    # gives 'YYYY-MM-DDThh:mm:ss+00:00', which sorts after the bare date and
    # silently drops the entries of the first day.
    sql_date = lambda { |value| value.respond_to?(:strftime) ? value.strftime("%Y-%m-%d") : value.to_s }

    # The option parser and the report use :no_selfpoll; :no_selfpolls is
    # still honoured for callers that relied on the old key.
    skip_selfpolls = options[:no_selfpoll] || options[:no_selfpolls]

    #
    # generate the where clause corresponding to the command line options to filter data
    #
    @filter = [
      (options[:from_date] ? "date(datetime) >= '#{sql_date.call(options[:from_date])}'" : nil),
      (options[:to_date] ? "date(datetime) <= '#{sql_date.call(options[:to_date])}'" : nil),
      (options[:only_crawlers] ? "bot == 1" : nil),
      (options[:ignore_crawlers] ? "bot == 0" : nil),
      (skip_selfpolls ? "ip != '::1'" : nil),
      "true"
    ].compact.join " and "

    # in alternative to sum(size)
    human_readable_size = <<-EOS
    CASE 
    WHEN sum(size) < 1024 THEN sum(size) || ' B' 
    WHEN sum(size) >=  1024 AND sum(size) < (1024 * 1024) THEN ROUND((CAST(sum(size) AS REAL) / 1024),2) || ' KB' 
    WHEN sum(size) >= (1024 * 1024)  AND sum(size) < (1024 * 1024 * 1024) THEN ROUND((CAST(sum(size) AS REAL) / (1024 * 1024)),2) || ' MB' 
    WHEN sum(size) >= (1024 * 1024 * 1024) AND sum(size) < (1024 * 1024 * 1024 *1024) THEN ROUND((CAST(sum(size) AS REAL) / (1024 * 1024 * 1024)),2) || ' GB' 
    WHEN sum(size) >= (1024 * 1024 * 1024 * 1024) THEN ROUND((CAST(sum(size) AS REAL) / (1024 * 1024 * 1024 * 1024)),2) || ' TB' 
END AS size
    EOS

    @total_hits = db.execute "SELECT count(datetime) from LogLine where #{@filter}"
    @total_unique_visitors = db.execute "SELECT count(distinct(unique_visitor)) from LogLine where #{@filter}"
    @total_size = db.execute "SELECT #{human_readable_size} from LogLine where #{@filter}"

    # An empty table returns no rows at all: report zero days instead of
    # failing on nil.
    first_seen = @first_day.dig(0, 0)
    last_seen  = @last_day.dig(0, 0)
    @total_days = (first_seen && last_seen) ? (Date.parse(last_seen) - Date.parse(first_seen)).to_i : 0

    @daily_distribution = db.execute "SELECT date(datetime), count(datetime), count(distinct(unique_visitor)), #{human_readable_size} from LogLine  where #{@filter} group by date(datetime)"

    @time_distribution = db.execute "SELECT strftime('%H', datetime), count(datetime), count(distinct(unique_visitor)), #{human_readable_size} from LogLine  where #{@filter} group by strftime('%H', datetime)"

    @most_requested_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where extension == '.html' and #{@filter} group by path order by count(path) desc limit #{limit}"

    @most_requested_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)), #{human_readable_size} from LogLine  where #{@filter} group by path order by count(path) desc limit #{limit}"

    @missed_pages = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and extension == '.html' and #{@filter} group by path order by count(path) desc limit #{limit}"

    @missed_resources = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{@filter} group by path order by count(path) desc limit #{limit}"

    # A 404 on anything that does not carry one of these extensions is
    # listed as a possible attack.
    @reasonable_requests_exts = [ ".html", ".css", ".js", ".jpg", ".svg", ".png", ".woff", ".xml", ".ttf", ".ico", ".pdf", ".htm", ".txt", ".org" ].map {  |x|
      "extension != '#{x}'"
    }.join " and "

    @attacks = db.execute "SELECT path, count(path), count(distinct(unique_visitor)) from LogLine where status == '404' and #{@filter} and (#{@reasonable_requests_exts}) group by path order by count(path) desc  limit #{limit}"

    @statuses = db.execute "SELECT status, count(status) from LogLine where #{@filter} group by status order by status"

    @by_day_4xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '4' and #{@filter} group by date(datetime)"
    @by_day_3xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '3' and #{@filter} group by date(datetime)"
    @by_day_2xx = db.execute "SELECT date(datetime), count(datetime) from LogLine where substr(status, 1,1) == '2' and #{@filter} group by date(datetime)"

    # One row per day as [day, 2xx, 3xx, 4xx]. A day with no hits in a class
    # gets an explicit 0, so that the remaining counts stay in their column.
    by_status_class = [@by_day_2xx, @by_day_3xx, @by_day_4xx].map(&:to_h)
    days = by_status_class.flat_map(&:keys).uniq.sort_by(&:to_s)
    @statuses_by_day = days.map { |day|
      [day] + by_status_class.map { |counts| counts.fetch(day, 0) }
    }

    @browsers = db.execute "SELECT browser, count(browser), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{@filter} group by browser order by count(browser) desc"

    @platforms = db.execute "SELECT platform, count(platform), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{@filter} group by platform order by count(platform) desc"

    @ips =  db.execute "SELECT ip, count(ip), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{@filter} group by ip order by count(ip) desc limit #{limit}"

    @referers =  db.execute "SELECT referer, count(referer), count(distinct(unique_visitor)), #{human_readable_size} from LogLine where #{@filter} group by referer order by count(referer) desc limit #{limit}"
  end
emit(options = {}, command, log_file, started_at, ended_at, duration) click to toggle source
# File lib/apache_log_report.rb, line 260
  # Build the Org-mode report from the instance variables filled in by
  # +analyze_data+ and return it as a single String.
  #
  # options    - Hash; :prefix and :suffix are inserted into the names of the
  #              plot files, :code_export goes into the :exports header of
  #              the gnuplot blocks; the filter flags are only echoed in the
  #              "Command Invocation" table.
  # command    - text shown verbatim in the "Command Invocation" example.
  # log_file   - name of the analysed file (the table shows "stdin" when nil).
  # started_at - printed with +to_s+ in the "Performance" table.
  # ended_at   - printed with +to_s+ in the "Performance" table.
  # duration   - elapsed time used for the duration and lines/sec figures.
  #
  # Returns the report text.
  def self.emit options = {}, command, log_file, started_at, ended_at, duration
    @prefix = options[:prefix]
    @suffix = options[:suffix]
    @export = options[:code_export]

    # @total_days is a plain Integer (see analyze_data): indexing it with
    # [0][0] is a bit reference that always yields 0 or 1, so it is printed
    # as is in the Summary table.
<<EOS
#+TITLE: Apache Log Analysis: #{log_file}
#+DATE: <#{Date.today}>
#+STARTUP: showall
#+OPTIONS: ^:{}
#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="ala-style.css" />
#+OPTIONS: html-style:nil

* Summary

| Hits                | #{"%10d" % @total_hits[0][0]} |
| Unique Visitors     | #{"%10d" % @total_unique_visitors[0][0] } |
| Tx                  | #{"%10s" % @total_size[0][0]} |
| Days                | #{"%10d" % @total_days } |

* Daily Distribution

#{ output_table "daily_distribution", ["Day", "Hits", "Visits", "Size"], @daily_distribution }

#+BEGIN_SRC gnuplot :var data = daily_distribution :results output :exports #{@export} :file #{@prefix}daily#{@suffix}.svg
reset 
set grid ytics linestyle 0
set grid xtics linestyle 0
set terminal svg size 1200,800 fname 'Arial'

set xdata time
set timefmt "%Y-%m-%d"
set format x "%a, %b %d"
set xtics rotate by 60 right

set title "Hits and Visitors"
set xlabel "Date"
set ylabel "Hits"
set y2label "Visits"
set y2tics

set style fill transparent solid 0.2 noborder

plot data using 1:2 with linespoints lw 3 lc rgb "#0000AA" pointtype 5 title "Hits" axes x1y2, \\
data using 1:2 with filledcurves below x1 linecolor rgb "#0000AA" notitle axes x1y2, \\
data using 1:3 with linespoints lw 3 lc rgb "#AA0000" pointtype 7 title "Visitors", \\
data using 1:3 with filledcurves below x1 notitle linecolor rgb "#AA0000", \\
data using 1:($3+0.1*$3):3 with labels notitle textcolor rgb "#AA0000", \\
data using 1:($2+0.1*$2):2 with labels notitle textcolor rgb "#0000AA" axes x1y2
#+END_SRC


* Time Distribution

#{ output_table "time_distribution", ["Hour", "Hits", "Visits", "Size"], @time_distribution }


#+BEGIN_SRC gnuplot :var data = time_distribution :results output :exports #{@export} :file #{@prefix}time#{@suffix}.svg
reset 
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Hits and Visitors"
set xlabel "Date"
set ylabel "Hits"
set y2label "Visitors"
set y2tics

set style fill solid 0.25
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#0000AA" title "Hits",     \\
data using 3 lc rgb "#AA0000" title "Visitors" axes x1y2, \\
data using ($0 - 0.2):($2 + 0.1*$2):2 with labels title "" textcolor rgb("#0000AA"), \\
data using ($0 + 0.2):($3 + 0.1*$3):3 with labels title "" textcolor rgb("#AA0000") axes x1y2
#+END_SRC

#+BEGIN_SRC gnuplot :var data = time_distribution :results output :exports #{@export} :file #{@prefix}time-traffic#{@suffix}.svg
reset 
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Traffic"
set xlabel "Date"
set ylabel "Traffic"

set style fill solid 0.50
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#00AA00" title "Traffic", \\
data using ($0):($2 + 0.1*$2):2 with labels title "" textcolor rgb("#00AA00")
#+END_SRC

* Most Requested Pages

#{ output_table "most_requested_pages", ["Path", "Hits", "Visits", "Size"], @most_requested_pages }

* Most Requested URIs

#{ output_table "most_requested_resources", ["Path", "Hits", "Visits", "Size"], @most_requested_resources }

* 404s on HTML files

#{ output_table "pages_404", ["Path", "Hits", "Visitors"], @missed_pages }

* 404s on other resources

#{ output_table "resources_404", ["Path", "Hits", "Visitors"], @missed_resources }

* Possible Attacks

#{ output_table "attacks", ["Path", "Hits", "Visitors"], @attacks }

* Statuses

#{ output_table "statuses", ["Status", "Count"], @statuses }

#+BEGIN_SRC gnuplot :var data = statuses :results output :exports #{@export} :file #{@prefix}statuses#{@suffix}.svg
reset 
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+0.1*$2):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* Daily Statuses

#{ output_table "daily_statuses", ["Status", "2xx", "3xx", "4xx"], @statuses_by_day }

#+BEGIN_SRC gnuplot :var data = daily_statuses :results output :exports #{@export} :file #{@prefix}daily-statuses#{@suffix}.svg
reset 
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0

set title "Daily Statuses"
set xlabel "Date"
set ylabel "Number of Hits"
set xtics rotate by 60 right

set style fill solid 0.25
set boxwidth 0.6

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#00AA00" title "2xx",     \\
data using 3 lc rgb "#0000CC" title "3xx", \\
data using 4 lc rgb "#AA0000" title "4xx", \\
data using ($0 - 1. / 4):($2 + 0.1*$2):2 with labels title "" textcolor rgb("#00AA00"), \\
data using ($0):($3 + 0.1*$3):3 with labels title "" textcolor rgb("#0000CC"), \\
data using ($0 + 1. / 4):($4 + 0.1*$4):4 with labels title "" textcolor rgb("#AA0000")
#+END_SRC

* Browsers

#{ output_table "browsers", ["Browser", "Hits", "Visitors", "Size"], @browsers }

#+BEGIN_SRC gnuplot :var data = browsers :results output :exports #{@export} :file #{@prefix}browser#{@suffix}.svg
reset 
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+0.1*$2):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* Platforms

#{ output_table "platforms", ["Platform", "Hits", "Visitors", "Size"], @platforms }

#+BEGIN_SRC gnuplot :var data = platforms :results output :exports #{@export} :file #{@prefix}platforms#{@suffix}.svg
reset 
set grid ytics linestyle 0
set terminal svg size 1200,800 fname 'Arial' fsize 10

set style fill solid 0.25
set boxwidth 0.6

plot data using 2:xtic(1) with boxes lc rgb "#0000AA" title "Hits", \\
data using ($0):($2+0.1*$2):2 with labels textcolor rgb "#0000AA"
#+END_SRC

* IPs

#{ output_table "ips", ["IPs", "Hits", "Visitors", "Size"], @ips }


* Referers

#{ output_table "referers", ["Referers", "Hits", "Visitors", "Size"], @referers }

#+BEGIN_SRC gnuplot :var data = referers :results output :exports #{@export} :file #{@prefix}referers#{@suffix}.svg
reset 
set terminal svg size 1200,800 fname 'Arial' fsize 10

set grid ytics linestyle 0
set grid xtics linestyle 0

set title "Referers"
set xlabel "Date"
set xtics rotate by 60 right
set ylabel "Hits and Visits"

set style fill solid 0.45
set boxwidth 0.7

set style data histograms
set style histogram clustered gap 1

plot data using 2:xtic(1) lc rgb "#AA00AA" title "Hits",     \\
data using 3 lc rgb "#0AAAA0" title "Visits", \\
data using ($0 - 1. / 3):($2 + 0.1*$2):2 with labels title "" textcolor rgb("#AA00AA"), \\
data using ($0 + 1. / 3):($3 + 0.1*$3):3 with labels title "" textcolor rgb("#0AAAA0")
#+END_SRC

* Command Invocation and Performance

** Command Invocation

#+BEGIN_EXAMPLE shell
  #{command}
#+END_EXAMPLE

| Input file           | #{"%-50s" % (log_file || "stdin")} |
| Ignore crawlers      | #{"%-50s" % options[:ignore_crawlers]} |
| Only crawlers        | #{"%-50s" % options[:only_crawlers]} |
| No selfpoll          | #{"%-50s" % options[:no_selfpoll]} |
| Filter by date       | #{"%-50s" % (!options[:from_date].nil? || !options[:to_date].nil?)} |
| Prefix               | #{"%-50s" % @prefix} |
| Suffix               | #{"%-50s" % @suffix} |

** Log Structure

| Log size                 | #{"%10d" % @log_size[0][0]} |
| Self poll entries        | #{"%10d" % @selfpolls_size[0][0]} |
| Crawlers                 | #{"%10d" % @crawlers_size[0][0]} |
| Entries considered       | #{"%10d" % @total_hits[0][0]} |

** Performance

| Analysis started at | #{started_at.to_s} |
| Analysis ended at   | #{ended_at.to_s} |
| Duration (sec)      | #{"%5.3d" % duration } |
| Duration (min)      | #{"%5.3d" % (duration / 60 )} |
| Log size            | #{"%9d"   % @log_size[0][0]} |
| Lines/sec           | #{"%6.2f" % (@log_size[0][0] / duration)} |

* Local Variables                                                  :noexport:
# Local Variables:
# org-confirm-babel-evaluate: nil
# org-display-inline-images: t
# end:
EOS
  end
options_parse(options) click to toggle source
# File lib/apache_log_report.rb, line 10
# Parse the command line switches found in +options+ (an Array of Strings
# such as ARGV).
#
# The recognised switches are removed from +options+ (parse! works in
# place); whatever is left, e.g. the log file name, stays in the array.
# -v and -h print their text and terminate the process.
#
# Returns a Hash with the keys :limit, :ignore_crawlers, :no_selfpoll,
# :only_crawlers, :prefix, :suffix and :code_export always set (defaults
# applied), plus :from_date / :to_date when the matching switch was given.
def self.options_parse options
  limit = 30
  args = {}

  opt_parser = OptionParser.new do |opts|
    opts.banner = "Usage: apache_log_report [options] [logfile]"

    opts.on("-lN", "--limit=N", Integer, "Number of entries to show (defaults to #{limit})") do |n|
      args[:limit] = n
    end

    opts.on("-bDATE", "--begin=DATE", DateTime, "Consider entries after or on DATE") do |n|
      args[:from_date] = n
    end

    opts.on("-eDATE", "--end=DATE", DateTime, "Consider entries before or on DATE") do |n|
      args[:to_date] = n
    end

    opts.on("-i", "--ignore-crawlers", "Ignore crawlers") do
      args[:ignore_crawlers] = true
    end

    opts.on("-p", "--ignore-selfpoll", "Ignore apaches self poll entries (from ::1)") do
      args[:no_selfpoll] = true
    end

    # "-c" used to be declared here as well as for --code-export below; the
    # later declaration wins, which left this switch without a working short
    # form. "-C" is free, and "-c" keeps meaning --code-export as it
    # effectively did before.
    opts.on("-C", "--only-crawlers", "Perform analysis on crawlers only") do
      args[:only_crawlers] = true
    end

    opts.on("-uPREFIX", "--prefix=PREFIX", String, "Prefix to add to all plots (used to run multiple analyses in the same dir)") do |n|
      args[:prefix] = n
    end

    opts.on("-wSUFFIX", "--suffix=SUFFIX", String, "Suffix to add to all plots (used to run multiple analyses in the same dir)") do |n|
      args[:suffix] = n
    end

    opts.on("-cWHAT", "--code-export=WHAT", String, "Control :export directive in code blocks (code, results, *both*, none)") do |n|
      args[:code_export] = n
    end

    opts.on("-v", "--version", "Prints version information") do
      puts "apache_log_report version #{ApacheLogReport::VERSION}"
      puts "Copyright (C) 2020 Adolfo Villafiorita"
      puts "Distributed under the terms of the MIT license"
      puts ""
      puts "Written by Adolfo Villafiorita"
      exit
    end

    opts.on("-h", "--help", "Prints this help") do
      puts opts
      puts "This is version #{ApacheLogReport::VERSION}"
      exit
    end
  end

  opt_parser.parse!(options)

  # Defaults for everything the user did not set explicitly.
  args[:limit] ||= limit
  args[:ignore_crawlers] ||= false
  args[:no_selfpoll] ||= false
  args[:only_crawlers] ||= false
  args[:prefix] ||= ""
  args[:suffix] ||= ""
  args[:code_export] ||= "both"

  args
end
output_table(name, headings, rows) click to toggle source
# File lib/apache_log_report.rb, line 249
# Render +rows+ under +headings+ with Terminal::Table and put an Org-mode
# "#+NAME:" line carrying +name+ in front of it.
#
# Columns are left with the table's default alignment (the numeric ones are
# not right-aligned).
#
# Returns the name line and the rendered table joined by a newline.
def self.output_table name, headings, rows
  rendered = Terminal::Table.new(
    headings: headings,
    rows: rows,
    style: { border_x: "-", border_i: "|" }
  )

  "#+NAME: #{name}\n#{rendered}"
end
parse(filename, options = {}) click to toggle source
# File lib/apache_log_report.rb, line 91
# Read an Apache log and load it into a fresh in-memory SQLite database.
#
# filename - path of the log file; when nil the lines are read from ARGF.
# options  - Hash; :format is handed to ApacheLog::Parser ('combined' when
#            missing).
#
# Every line is stored as one row of the LogLine table. A line that raises
# while being parsed or inserted is reported on STDERR and skipped.
#
# Returns the SQLite3::Database holding the LogLine table.
def self.parse filename, options = {}
  content = filename ? File.readlines(filename) : ARGF.readlines

  db = SQLite3::Database.new ":memory:"
  db.execute "CREATE TABLE IF NOT EXISTS LogLine(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    datetime TEXT,
    ip TEXT,
    user TEXT,
    unique_visitor TEXT,
    method TEXT,
    path TEXT,
    extension TEXT,
    status TEXT,
    size INTEGER,
    referer TEXT,
    user_agent TEXT,
    bot INTEGER,
    browser TEXT,
    browser_version TEXT,
    platform TEXT,
    platform_version TEXT)"

  parser = ApacheLog::Parser.new(options[:format] || 'combined')

  ins = db.prepare('insert into LogLine (
              datetime, 
              ip,
              user,
              unique_visitor,
              method,
              path, 
              extension,
              status,
              size,
              referer,
              user_agent,
              bot,
              browser,
              browser_version,
              platform,
              platform_version)
            values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')

  begin
    content.each do |line|
      begin
        hash = parser.parse line

        ua = Browser.new(hash[:user_agent], accept_language: "en-us")
        ins.execute(
          hash[:datetime].iso8601,
          hash[:remote_host],
          hash[:user],
          # a visitor is identified by day + host + user agent
          hash[:datetime].strftime("%Y-%m-%d") + " " + hash[:remote_host] + " " + hash[:user_agent],
          hash[:request][:method],
          hash[:request][:path],
          (hash[:request][:path] ? File.extname(hash[:request][:path]) : ""),
          hash[:status],
          hash[:size].to_i,
          hash[:referer],
          hash[:user_agent],
          ua.bot? ? 1 : 0,
          (ua.name || ""),
          (ua.version || ""),
          (ua.platform.name || ""),
          (ua.platform.version || "")
        )
      rescue
        STDERR.puts "Apache Log parser error: could not parse #{line}"
      end
    end
  ensure
    # The prepared statement is only needed while loading: release it
    # explicitly instead of leaving it open for the lifetime of the database.
    ins.close
  end

  db
end