WEB LOG TRENDS ANALYZER



#!/usr/bin/perl -w

#########################################################################
##                   WEB LOG TRENDS ANALYZER                           ##
#########################################################################
#
# The purpose of the "Web Log Trends Analyzer" is to provide a
# work-around for a "bug" in some popular commercial Log Analyzer
# products that makes them unable to process multiple log files
# covering overlapping periods of time.  This situation is usually the
# result of using a cluster of web servers for a single web site: every
# server answers requests, yet each maintains a separate log file.  It
# is also known to occur in certain non-clustered environments, such as
# when one web server acts as a front-end for another via ProxyPass
# (e.g., SSL front-ending non-SSL, a light-weight server front-ending
# mod_perl, etc.).
#
# If you like your commercial Log Analyzer (as I do!), and don't feel
# like upgrading just yet (or can't afford to spend 10 times the money),
# this little Perl script will do the trick quite nicely.  Simply
# specify a list of all your log files and the Web Log Trends Analyzer
# will consolidate them into a single log file, sorted by time.  The
# output log will work with your commercial Log Analyzer product without
# a hitch, allowing you to continue using it to generate reports.  This
# script is well-suited to run via cron.
#
# While I am certain that commercial "Enterprise" log analysis products
# are excellent systems worthy of their six-figure price tags, the Web
# Log Trends Analyzer is intended to "fill the gap" for the simple
# sys-admin who already owns a commercial Log Analyzer and just wants
# to find out how their web site is being used.
#
#
# WLTA expects a list of log files.  These files are consolidated
# into a single logfile and output to STDOUT.
#
# Usage:  wlta.pl [list of log files] > consolidated.log
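#
# Example invocation (file names are illustrative):
#
#   ./wlta.pl node1_access.log node2_access.log > consolidated.log
#
# A crontab entry for nightly consolidation might look like this
# (paths are assumptions -- adjust for your own layout):
#
#   5 0 * * * /usr/local/bin/wlta.pl /web/logs/access.*.log > /web/logs/consolidated.log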


use Data::Dumper;

use constant DEBUG => $ENV{DEBUG} || 0;

use constant MONTHS => {
	'Jan' => 1,
	'Feb' => 2,
	'Mar' => 3,
	'Apr' => 4,
	'May' => 5,
	'Jun' => 6,
	'Jul' => 7,
	'Aug' => 8,
	'Sep' => 9,
	'Oct' => 10,
	'Nov' => 11,
	'Dec' => 12,
};
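# (e.g. MONTHS->{'Aug'} == 8; used by update_fh() to build sortable date keys)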


main();
exit(0);


sub main {
	my @filelist = @ARGV;

	# Hash to track time of last request for each file
	my $lastreqtime = {};

	# Hash to track log of last request for each file
	my $lastreqlog = {};

	# Hash to track stats for each file
	my $fhstats = {};
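
	# Illustrative shape of these hashes after one line has been read
	# from each of two files (values are made up for the example):
	#   %$lastreqtime = ( fh00 => '20010827051730', fh01 => '20010827051915' )
	#   %$lastreqlog  = ( fh00 => '<raw log line>', fh01 => '<raw log line>' )
	#   %$fhstats     = ( fh00 => {FIRST_LOG=>'...', LAST_LOG=>'...', COUNT=>42}, ... )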

	# Open all the files
	my $fhcount = 0;
	my @filehandles = ();
	foreach my $file (@filelist) {
		my $fh = sprintf("fh%02d", $fhcount);
		open($fh, '<', $file) || die ("[$fh] Can't open '$file': $!");
		push(@filehandles, $fh);
		print STDERR "$fh => '$file'\n" if (DEBUG>0);
		update_fh($fh, $lastreqtime, $lastreqlog, $fhstats);
		$fhcount++;
	}

	# Merge loop: emit the pending line with the earliest timestamp,
	# refilling that filehandle's slot, until every file reaches EOF.
	while (defined(my $output = get_next_line($lastreqtime, $lastreqlog, $fhstats))) {
		print "$output\n";
	}

	print STDERR "DONE!\n" if (DEBUG>0);

	# Close filehandles
	foreach my $fh (@filehandles) {
		close($fh);

		# Print Stats
		dump_stats($fh, $fhstats) if (DEBUG>0);
	}
	
}


# Return the next line to output, or undef once every file is at EOF.
sub get_next_line {
	my ($lastreqtime, $lastreqlog, $fhstats) = @_;

	print STDERR "FH Lookup: " . Dumper($lastreqtime) if (DEBUG>4);

	# Build reverse lookup hash: lastreqtime => ( filehandles )
	my %timeindex = ();
	while (my ($fh, $time) = each(%$lastreqtime)) {
		$timeindex{$time} = [] unless (exists($timeindex{$time}));
		push(@{$timeindex{$time}}, $fh);  # Add this $fh to the list of those at this time
	}
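
	# Illustrative result (times and handles are made up):
	#   %timeindex = (
	#       '20010827051730' => ['fh00'],
	#       '20010827051742' => ['fh01', 'fh02'],  # two files logged the same second
	#   );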

	print STDERR "Reverse Lookup: " . Dumper(\%timeindex) if (DEBUG>4);

	# Find the lowest time.  Get the first $fh in the list.  Guard value: ($early_time<0) == EOF
	my @times = sort(keys(%timeindex));
	my $early_time = -1;
	for (my $eti=0; ($eti<scalar(@times)) && ($early_time<0); $eti++) {
		$early_time = $times[$eti];
	}

	# All filehandles at EOF?
	return undef if ($early_time < 0);

	# Take the first filehandle waiting at the earliest time
	my $fh = $timeindex{$early_time}->[0];
	my $output = $lastreqlog->{$fh};

	# read another line from this $fh
	update_fh($fh, $lastreqtime, $lastreqlog, $fhstats);

	return $output;
}


# Read another line from this $fh.  Update $lastreqtime and $lastreqlog.
sub update_fh {
	my ($fh, $lastreqtime, $lastreqlog, $fhstats) = @_;

	print STDERR "Reading $fh..." if (DEBUG>2);

	my $rawline = <$fh>;

	# Check for EOF -- get out early if at EOF.
	unless (defined($rawline)) {
		$lastreqtime->{$fh} = -1;
		$lastreqlog->{$fh} = undef;
		return;
	}

	chomp($rawline);

	# Get date string from log line e.g.: "[27/Aug/2001:05:17:30 -0400]"
	# Test the match directly: on failure $1..$6 would keep the values
	# from the previous successful match, masking the bad line.
	unless ($rawline =~ m/\[(\d\d)\/(\w\w\w)\/(\d\d\d\d)\:(\d\d)\:(\d\d)\:(\d\d).*\]/) {
		die ("[$fh] Problem parsing log line:\n\n$rawline\n\n");
	}
	my ($mday, $mon, $year, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);

	my $mon_num = MONTHS->{$mon};
	die ("No such month '$mon'") unless (defined($mon_num));

	my $datekey = sprintf("%4d%02d%02d%02d%02d%02d", $year, $mon_num, $mday, $hour, $min, $sec);
	print STDERR " [$datekey]\n" if (DEBUG>2);

	# Update $lastreqtime
	$lastreqtime->{$fh} = $datekey;

	# Update $lastreqlog
	$lastreqlog->{$fh} = $rawline;

	# Update Stats
	$fhstats->{$fh} = {FIRST_LOG=>$datekey, COUNT=>0} unless (exists($fhstats->{$fh}));
	$fhstats->{$fh}->{LAST_LOG} = $datekey;
	$fhstats->{$fh}->{COUNT}++;	
}


sub dump_stats {
	my ($fh, $fhstats) = @_;

	print STDERR "[$fh] COUNT: ".$fhstats->{$fh}->{COUNT}
		."  FIRST_LOG: ".$fhstats->{$fh}->{FIRST_LOG}
		."  LAST_LOG: ".$fhstats->{$fh}->{LAST_LOG}."\n";
}
