#!/usr/bin/perl
#
#---------------------------------------------------------------------#
#  PROGRAM:  logHBF.pl
#
#  PURPOSE:  This program serves one purpose.  It reads an Apache
#            access_log file in standard ECLF format,
#            and prints out the number of hits that have been
#            recorded for each file and/or directory.  HBF stands
#            for 'hits by file'.
#
#            The user can control the number of files that are
#            displayed in the output by changing NUM_RECS_TO_PRINT.
#
#            This tool lets you analyze your web site so you can
#            understand what content your viewers are interested in.
#
#  USAGE:
#            logHBF.pl access_log > results
#
#  VERSION:  1.0
#
#---------------------------------------------------------------------#

#----------------------------------------------------------------------------#
# COPYRIGHT:                                                                 #
#                                                                            #
# This example is from Developer's Daily (http://www.DevDaily.com).          #
# Copyright (c) 1998 DevDaily Interactive, Inc.                              #
# This example is provided WITHOUT ANY WARRANTY either expressed or implied. #
# You may study, use, modify, and distribute it for non-commercial purposes  #
# as long as this header is retained in the file.                            #
# For any commercial use, contact our editor (editor@DevDaily.com).          #
#----------------------------------------------------------------------------#

use File::Basename;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.                #
#------------------------------------------------------------------------------#
# Declared with 'our' so they remain ordinary package variables (readable
# from anywhere in the script) while also being valid under 'use strict'.

our $NUM_RECS_TO_PRINT = 50;   # num output recs to print per section
our $IGNORE_GIF_FILES  = 1;    # set 'true' to ignore GIF files in output
our $IGNORE_JPG_FILES  = 1;    # set 'true' to ignore JPG files in output

#---------------------------------------------------------------------#
#  Change this array to include index filenames used on your system.
#---------------------------------------------------------------------#
# Base filenames treated as "the directory itself" when counting hits.
our @indexFilenames = ('index.htm', 'index.html', 'index.shtml');

#----------------------------------#
#  THE PROGRAM LOGIC STARTS HERE.  #
#----------------------------------#

# Print a short usage reminder to STDERR.
sub usage {
    print STDERR "\n\tUsage: logHBF.pl access_log_file > output_file\n";
}

#----------------------------------------------------------#
#  These are two helper routines for the 'sort' function.  #
#----------------------------------------------------------#
# Both compare two keys of the package hash %numFileRequests by hit count.
# When the counts are equal they fall back to comparing the keys as
# strings, so the resulting order does not depend on hash ordering.

sub fileNumericAscending {
    $numFileRequests{$a} <=> $numFileRequests{$b}
        or $a cmp $b;
}

sub fileNumericDescending {
    $numFileRequests{$b} <=> $numFileRequests{$a}
        or $a cmp $b;
}

#----------------------------<< main >>-----------------------------#

#--------------------------------------------------------------------#
#  Start by making sure the user is invoking this program properly.  #
#--------------------------------------------------------------------#

my $numArgs = scalar @ARGV;
if ($numArgs != 1) {
    usage();
    exit 1;
}

# Three-argument open: the filename is never interpreted as a mode or a
# pipe, and $! tells the user why the open failed.  The LOGFILE handle
# name is kept because the read loop further down uses it.
my $logFile = $ARGV[0];
open(LOGFILE, '<', $logFile)
    or die " Error opening log file $logFile: $!\n";

#------------------------------------------------------------------#
#  Start reading and processing the access_log file in this loop.
#------------------------------------------------------------------#
# Reads the log from the LOGFILE handle one line at a time, tallies hits
# per requested path in the package hash %numFileRequests, then prints
# the most-requested paths.

LINE:
while (my $line = <LOGFILE>) {

    chomp $line;

    #----------------------------------------------#
    #  condense one or more whitespace character   #
    #  to one single space                         #
    #----------------------------------------------#

    $line =~ s/\s+/ /g;

    #----------------------------------------------------------#
    #  the pattern below breaks each line of the access_log    #
    #  into nine fields: client address, RFC 1413 identity,    #
    #  user name, local time, HTTP request, status code,       #
    #  bytes sent, referer and client software.  Only the      #
    #  HTTP request (5th field) is needed here.  Lines that    #
    #  do not match are skipped instead of being counted       #
    #  under an empty name.                                    #
    #----------------------------------------------------------#

    next LINE
        unless $line =~ /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+) \"(.*)\" \"(.*)\"/;
    my $httpRequest = $5;

    #--------------------------------------------------------------------#
    # take care of problem where the $httpRequest may simply be a hyphen #
    #--------------------------------------------------------------------#

    next LINE if $httpRequest eq '-';

    #-----------------------------------------#
    #  Determine the value of $fileRequested  #
    #  (second word of the request line).     #
    #-----------------------------------------#

    my (undef, $fileRequested) = split(' ', $httpRequest, 3);
    next LINE unless defined $fileRequested;   # request had no path field

    #--------------------------------------------------------#
    #  Ignore hits to GIF or JPG files (normally this isn't  #
    #  the info you're interested in).                       #
    #--------------------------------------------------------#

    next LINE if $IGNORE_GIF_FILES && $fileRequested =~ /\.gif$/i;
    next LINE if $IGNORE_JPG_FILES && $fileRequested =~ /\.jpg$/i;

    #-----------------------------------------------------------------#
    #  if the base filename is something like index.htm, index.html,  #
    #  or index.shtml, interpret this to be the same as the path by   #
    #  itself.  This way, '/java/' is the same as '/java/index.html'. #
    #  The comparison is an exact, case-insensitive match so that     #
    #  names merely containing an index filename are left alone.      #
    #-----------------------------------------------------------------#

    my $baseName = lc(basename($fileRequested));
    if (grep { $baseName eq lc($_) } @indexFilenames) {
        $fileRequested = dirname($fileRequested);
    }

    #----------------------------------------------------------------#
    #  If the last character in $fileRequested is a '/', remove it.  #
    #  This makes /perl/ equal to /perl.  A lone '/' is kept as is.  #
    #----------------------------------------------------------------#

    $fileRequested =~ s{/\z}{} if length($fileRequested) > 1;

    #-----------------------------------------------------#
    #  here's where we count the number of hits per file  #
    #-----------------------------------------------------#

    $numFileRequests{$fileRequested}++;
}

close(LOGFILE);

#--------------------------------------#
#  Output the number of hits per file  #
#--------------------------------------#

print "TOP $NUM_RECS_TO_PRINT MOST-REQUESTED FILES:\n";
print "-----------------------------\n\n";

my $count = 0;
foreach my $key (sort fileNumericDescending (keys(%numFileRequests))) {
    last if ($count >= $NUM_RECS_TO_PRINT);
    print "$numFileRequests{$key} \t\t $key\n";
    $count++;
}

print "\n\n";