#!/usr/bin/perl
#
#---------------------------------------------------------------------#
#  PROGRAM:	logHBF.pl
#
#  PURPOSE:	This program serves one purpose.  It reads an Apache
#		access_log file in standard ECLF format,
#		and prints out the number of hits that have been
#		recorded for each file and/or directory.  HBF stands 
#		for 'hits by file'.
#
#		The user can control the number of files that are
#		displayed in the output by changing NUM_RECS_TO_PRINT.
#
#		This tool lets you analyze your web site so you can
#		understand what content your viewers are interested in.
#
#  USAGE:
#		logHBF.pl access_log > results
#
#  VERSION:	1.0
#
#---------------------------------------------------------------------#

#----------------------------------------------------------------------------#
# COPYRIGHT:                                                                 #
#                                                                            #
# This example is from Developer's Daily (http://www.DevDaily.com).          #
# Copyright (c) 1998 DevDaily Interactive, Inc.                              #
# This example is provided WITHOUT ANY WARRANTY either expressed or implied. #
# You may study, use, modify, and distribute it for non-commercial purposes  #
# as long as this header is retained in the file.                            #
# For any commercial use, contact our editor (editor@DevDaily.com).          #
#----------------------------------------------------------------------------#

use File::Basename;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.                #
#------------------------------------------------------------------------------#

$NUM_RECS_TO_PRINT     = 50;    # maximum number of entries printed in the report
$IGNORE_GIF_FILES      = 1;     # true: requests whose path ends in .gif (any case) are not counted
$IGNORE_JPG_FILES      = 1;     # true: requests whose path ends in .jpg (any case) are not counted

#---------------------------------------------------------------------#
#  Change this array to include index filenames used on your system.  #
#  A request whose base filename matches one of these names (letter   #
#  case is ignored) is counted under its directory instead, so that   #
#  '/java/index.html' and '/java/' are tallied together.              #
#---------------------------------------------------------------------#

@indexFilenames = ('index.htm', 'index.html', 'index.shtml');


                #----------------------------------#
                #  THE PROGRAM LOGIC STARTS HERE.  #
                #----------------------------------#

#  Print a one-line usage synopsis to STDERR.  Takes no arguments; the
#  caller decides whether to exit afterwards.
sub usage {
   my $helpText = "\n\tUsage:  logHBF.pl access_log_file > output_file\n";
   print {*STDERR} $helpText;
}


#----------------------------------------------------------#
#  These are two helper routines for the 'sort' function.  #
#----------------------------------------------------------#

#  Sort comparator: orders keys of %numFileRequests by hit count,
#  smallest first.  Reads the package globals $a and $b set by 'sort'.
#  Keys with equal counts are ordered by name so the result does not
#  depend on Perl's (randomized) hash iteration order.
sub fileNumericAscending {
   $numFileRequests{$a} <=> $numFileRequests{$b}
      or $a cmp $b;
}


#  Sort comparator: orders keys of %numFileRequests by hit count,
#  largest first.  Reads the package globals $a and $b set by 'sort'.
#  Keys with equal counts are ordered by name; without this tie-break
#  the order of tied entries (and therefore which of them survive a
#  top-N cutoff) follows Perl's randomized hash order and can change
#  from one run to the next.
sub fileNumericDescending {
   $numFileRequests{$b} <=> $numFileRequests{$a}
      or $a cmp $b;
}


#----------------------------<<   main   >>-----------------------------#

   #--------------------------------------------------------------------#
   #  Main program.  Expects exactly one argument, the path of an       #
   #  access_log file.  Each parsable request line is reduced to the    #
   #  requested path, tallied in %numFileRequests, and the              #
   #  $NUM_RECS_TO_PRINT most-requested paths are printed to STDOUT.    #
   #                                                                    #
   #  Exits with status 1 (after printing the usage text) when the      #
   #  argument count is wrong; dies if the log file cannot be opened.   #
   #--------------------------------------------------------------------#

   if (@ARGV != 1) {
      usage();
      exit 1;
   }

   my $logFile = $ARGV[0];

   #--------------------------------------------------------------------#
   #  Three-argument open with an explicit read mode: the filename      #
   #  comes from the command line, and the two-argument form would      #
   #  interpret leading/trailing '>', '<' or '|' characters in it.      #
   #--------------------------------------------------------------------#

   open(my $logHandle, '<', $logFile)
      or die "  Error opening log file $logFile: $!\n";

   #------------------------------------------------------------------#
   #  Start reading and processing the access_log file in this loop.  #
   #------------------------------------------------------------------#

   while (my $line = <$logHandle>) {

      chomp $line;

      #----------------------------------------------#
      #  condense one or more whitespace characters  #
      #  to one single space                         #
      #----------------------------------------------#

      $line =~ s/\s+/ /g;

      #----------------------------------------------------------#
      #  The pattern describes the nine fields of a log entry:   #
      #  client address, RFC 1413 identity, user name, [time],   #
      #  "request", status, bytes sent, "referer", "user agent". #
      #  Only the request field is needed for this report.       #
      #----------------------------------------------------------#

      my ($httpRequest) =
         $line =~ /^\S+ \S+ \S+ \[.+\] \"(.+)\" \S+ \S+ \".*\" \".*\"/;

      #----------------------------------------------------------#
      #  Skip lines that do not look like a log entry at all;    #
      #  otherwise they would be tallied under an empty name.    #
      #----------------------------------------------------------#

      next unless defined $httpRequest;

      #--------------------------------------------------------------------#
      # take care of problem where the $httpRequest may simply be a hyphen #
      #--------------------------------------------------------------------#

      next if $httpRequest eq '-';

      #-----------------------------------------------------------#
      #  Determine the value of $fileRequested: the second word   #
      #  of the request ("METHOD path protocol").  A request      #
      #  with no path is malformed and is skipped.                #
      #-----------------------------------------------------------#

      my (undef, $fileRequested) = split(' ', $httpRequest, 3);

      next unless defined $fileRequested;

      #--------------------------------------------------------#
      #  Ignore hits to GIF or JPG files (normally this isn't  #
      #  the info you're interested in).                       #
      #--------------------------------------------------------#

      next if $IGNORE_GIF_FILES && $fileRequested =~ /\.gif$/i;
      next if $IGNORE_JPG_FILES && $fileRequested =~ /\.jpg$/i;

      #-----------------------------------------------------------------#
      #  if the base filename is something like index.htm, index.html,  #
      #  or index.shtml, interpret this to be the same as the path by   #
      #  itself.  This way, '/java/' is the same as '/java/index.html'. #
      #                                                                 #
      #  The name is compared as a whole string (ignoring case), so     #
      #  files such as 'myindex.html.bak' are not mistaken for an       #
      #  index page.  Any query string is set aside first so that       #
      #  '/java/index.html?x=1' is still recognised.                    #
      #-----------------------------------------------------------------#

      my ($pathOnly) = split(/\?/, $fileRequested, 2);
      $pathOnly = '' unless defined $pathOnly;
      my $baseName = lc(basename($pathOnly));

      foreach my $indexFile (@indexFilenames) {
         if ($baseName eq lc($indexFile)) {
            $fileRequested = dirname($pathOnly);
            last;
         }
      }

      #----------------------------------------------------------------#
      #  If the last character in $fileRequested is a '/', remove it.  #
      #  This makes /perl/ equal to /perl.  A lone '/' is kept as is.  #
      #----------------------------------------------------------------#

      if (length($fileRequested) > 1) {
         $fileRequested =~ s{/\z}{};
      }

      #-----------------------------------------------------#
      #  here's where we count the number of hits per file  #
      #  (%numFileRequests is a package global because the  #
      #  sort helper subroutines read it by name)           #
      #-----------------------------------------------------#

      $numFileRequests{$fileRequested}++;

   }

   close($logHandle);

   #--------------------------------------#
   #  Output the number of hits per file  #
   #--------------------------------------#

   print "TOP $NUM_RECS_TO_PRINT MOST-REQUESTED FILES:\n";
   print "-----------------------------\n\n";
   my $count = 0;
   foreach my $key (sort fileNumericDescending (keys(%numFileRequests))) {
      last if ($count >= $NUM_RECS_TO_PRINT);
      print "$numFileRequests{$key} \t\t $key\n";
      $count++;
   }
   print "\n\n";