#!/usr/bin/perl
#
#######################################################
###  Calomel.org web_report.pl  BEGIN
#######################################################

use Time::Local;

# Directory that holds the rotated (archived) access logs.
my $logdir = '/var/log/web_server';

# Collect the names of the archived logs.  A lexical directory handle is used
# instead of a bareword, and the dot in the pattern is escaped so that only
# names beginning with the literal text "access.log" are selected.
opendir my $dh, $logdir or die "Could not open $logdir ($!)";
my @logfiles = sort grep { /^access\.log/ } readdir $dh;
closedir $dh;

# Keep only the last 6 names in sort order (these are the most recently
# archived files provided the archive names sort chronologically).
splice @logfiles, 0, @logfiles - 6 if @logfiles > 6;

# Tallies filled in while scanning the logs:
#   %host          number of requests per client host
#   %bytesperhost  number of bytes sent per client host
#   %url           number of requests per URL
#   %status        number of requests per status class ("200", "400", ...)
#   %urlsperhost   number of requests per "host url" pair
my (%host, %bytesperhost, %url, %status, %urlsperhost);

# Earliest and latest request time seen, in epoch seconds.  $mintime starts
# above any realistic timestamp and $maxtime at 0, so $mintime > $maxtime
# afterwards means that no record was parsed.
my ($mintime,$maxtime) = (10_000_000_000, 0);

# Month abbreviation => zero-based month number, as timegm() expects.
my %mon = qw/Jan 0 Feb 1 Mar 2 Apr 3 May  4 Jun  5
             Jul 6 Aug 7 Sep 8 Oct 9 Nov 10 Dec 11/;

# Every file to scan as a [directory, file name] pair: the archived logs from
# $logdir followed by the live log in /var/log/lighttpd.  Pairing each file
# with its own directory keeps an archive that happens to be called
# "access.log" from redirecting all later archives to the live directory.
my @sources = ( (map { [ $logdir, $_ ] } @logfiles),
                [ '/var/log/lighttpd', 'access.log' ] );

foreach my $source (@sources){
  my ($dir, $f) = @$source;
  my $path = "$dir/$f";

  # Three-argument open with a lexical handle: the file name can never be
  # mistaken for an open mode and the handle is private to this iteration.
  open my $fh, '<', $path or die "Could not open $path ($!)";

  while(my $line = <$fh>){
    # Split one combined-log-format line into its fields.  Lines that do not
    # match are skipped.  The comments inside the pattern deliberately avoid
    # sigils, because variables are interpolated even inside regex comments.
    my ($host, $ident_user, $auth_user, $day,$mon,$year, $hour,$min,$sec,
        $time_zone, $method, $url, $protocol, $status,
        $bytes, $referer, $agent) = $line =~
    /                 # regexp begins
    ^               # beginning-of-string anchor
    (\S+)           # client host
    \               # literal space
    (\S+)           # ident user
    \               # literal space
    (\S+)           # authenticated user
    \               # literal space
    \[              # literal left bracket
    (\d\d)          # day of month
    \/              # literal solidus
    ([A-Z][a-z]{2}) # month abbreviation
    \/              # literal solidus
    (\d{4})         # four digit year
    :               # literal colon
    (\d\d)          # hour
    :               # literal colon
    (\d\d)          # minute
    :               # literal colon
    (\d\d)          # second
    \               # literal space
    ([^\]]+)        # time zone offset
    \]\ "           # literal string '] "'
    (\S+)           # request method
    \               # literal space
    (.+?)           # requested URL
    \               # literal space
    (\S+)           # protocol
    "\              # literal string '" '
    (\S+)           # status code
    \               # literal space
    (\S+)           # bytes sent
    \               # literal space
    "([^"]+)"       # referer
    \               # literal space
    "([^"]+)"       # user agent
    $               # end-of-string anchor
    /x              # regexp ends, with x modifier
    or next;

    $host eq '::1' and next; # Ignore Apache generated requests from localhost.

    # The byte count is logged as "-" when no body was sent.
    $bytes =~ /^\d+$/ or $bytes = 0;

    $host{$host}++;
    $bytesperhost{$host} += $bytes;
    $url{$url}++;
    my $status_class = int($status/100) . '00';
    $status{$status_class}++;
    $urlsperhost{"$host $url"}++;

    # Turn the logged offset (e.g. "-0500") into the number of seconds that
    # must be added to the logged local time to obtain UTC.
    my $tz = 0;
    my ($tzs,$tzh,$tzm) = $time_zone =~ /([\-+ ])(\d\d)(\d\d)/;
    if(defined $tzs){
      $tzs = $tzs eq '-' ? 1 : -1;
      $tz = $tzs * (3600*$tzh + 60*$tzm);
    }

    # timegm() croaks on impossible dates, and an unknown month name would
    # silently count as January; such a record keeps its tallies above but
    # does not influence the reported time range.  The four digit year is
    # passed unchanged so it is never subject to two-digit-year guessing.
    my $time = exists $mon{$mon}
      ? eval { timegm($sec,$min,$hour,$day,$mon{$mon},$year) }
      : undef;
    next unless defined $time;
    $time += $tz;

    $mintime = $time if $time < $mintime;
    $maxtime = $time if $time > $maxtime;
  }
  close $fh;
}

# Print the report: covered time range, four "top 10" tables and the number
# of requests per status class.

my $start = localtime $mintime;
my $end   = localtime $maxtime;

# $mintime only drops to or below $maxtime once a record has been parsed;
# otherwise both still hold their initial sentinel values, which would print
# as a meaningless date range.
if($mintime <= $maxtime){
  print "Analysis of log records between:\n$start and\n$end\n\n";
}
else{
  print "No log records were found.\n\n";
}

# Cache of reverse DNS answers; dns() reads and fills it.
my %dns;

# Return up to $limit keys of the hash referenced by $count_for, highest
# count first (equal counts are ordered by key so the output is stable).
# Unlike a [0..9] list slice this never pads the result with undef when
# fewer than $limit keys exist.
my $top = sub {
  my ($count_for, $limit) = @_;
  my @ranked = sort { $count_for->{$b} <=> $count_for->{$a} or $a cmp $b }
               keys %$count_for;
  splice @ranked, $limit if @ranked > $limit;
  return @ranked;
};

# Values taken from the logs are always passed to printf as arguments and
# never interpolated into the format, so a "%" inside a URL (e.g. "%20")
# cannot be mistaken for a conversion.

print "Top 10 requesters:\n";
foreach my $host ($top->(\%host, 10)){
  my $name = dns($host);
  printf "  %-15s %12s requests%s\n",$host,add_commas($host{$host}),$name;
}

print "\n";

print "Top 10 by volume downloaded:\n";
foreach my $host ($top->(\%bytesperhost, 10)){
  my $name = dns($host);
  printf "  %-15s %16s bytes%s\n",$host,add_commas($bytesperhost{$host}),$name;
}

print "\n";

print "Top 10 URLs requested:\n";
foreach my $url ($top->(\%url, 10)){
  printf "  %12s %s\n",add_commas($url{$url}),$url;
}

print "\n";

print "Top 10 URLs per host:\n";
foreach my $hosturl ($top->(\%urlsperhost, 10)){
  # Keys have the form "host url".  The host never contains whitespace but
  # the URL may, so split into at most two fields.
  my ($host,$url) = split ' ',$hosturl,2;
  my $name = dns($host);
  printf "  %4d %-15s %s%s\n",$urlsperhost{$hosturl},$host,$url,$name;
}

print "\n";

print "Number of requests per status class:\n";
foreach my $class (sort {$a <=> $b} keys %status){
  printf "%4d  %16s\n",$class,add_commas($status{$class});
}

# dns($ip)
#
# Look up $ip with /usr/sbin/host and return the last word of its answer
# formatted as " (name)", ready to be appended to a report line.  Returns the
# empty string when $ip is undefined, does not look like a host name or
# address, or the lookup fails.  Every result, including a failure, is stored
# in %dns so the external command runs at most once per address.
sub dns{
  my $ip = shift;
  return '' unless defined $ip;
  return $dns{$ip} if exists $dns{$ip};

  my $name = '';

  # The value comes straight from a log file and ends up on a shell command
  # line, so only plain host name / IPv4 / IPv6 characters are accepted and
  # a leading "-" (which host would read as an option) is refused.
  if($ip =~ /\A[0-9A-Za-z:][0-9A-Za-z.:_-]*\z/){
    my $lookup = `/usr/sbin/host $ip 2>/dev/null`;

    # No output (e.g. the host binary is missing) and any reported failure
    # both count as "no name".
    if(defined $lookup
    and $lookup !~ /NXDOMAIN|SERVFAIL|timed out|not found/
    ){
      my $answer = (split ' ',$lookup)[-1];
      if(defined $answer){
        $answer =~ s/\.$//;   # drop the trailing dot of a fully qualified name
        $name = " ($answer)" if length $answer;
      }
    }
  }

  return $dns{$ip} = $name;
}

# add_commas($num)
#
# Return $num with a comma between each group of three digits of its integer
# part (e.g. 1357924683 => 1,357,924,683).  A leading sign and any fractional
# part are left untouched, so -123456 becomes -123,456 and 1234.5678 becomes
# 1,234.5678.  An undefined argument yields the empty string.
sub add_commas{
  my $num = shift;
  $num = '' unless defined $num;

  # Each pass splits the last three digits off the leading run of digits;
  # repeat until that run is three digits or shorter.
  1 while $num =~ s/^([-+]?\d+)(\d{3})/$1,$2/;

  return $num;
}
#######################################################
###  Calomel.org web_report.pl  END
#######################################################
