#!/usr/bin/perl
#
#------------------------------------------------------------------#
#  PROGRAM:  log-reader.cgi
#
#  PURPOSE:  Reads an Apache access_log file in standard ECLF
#            format, determines (1) Hits per file and (2) Hits by
#            IP Address.
#
#            Intended to be used as a CGI program.
#
#  VERSION:  1.0
#------------------------------------------------------------------#
#
#  COPYRIGHT NOTICE:
#
#  Copyright 1998 DevDaily Interactive, Inc.  All Rights Reserved.
#
#  This program is distributed free of charge.  It may be used
#  and modified free of charge as long as this copyright notice
#  remains intact.  By using this program you agree to indemnify
#  DevDaily Interactive from any liability.
#
#  Selling the code for this program without prior written
#  consent is expressly forbidden.
#------------------------------------------------------------------#

#use diagnostics -verbose;
use strict;
use warnings;

use File::Basename;
use CGI;

#------------------------------------------------------------------------------#
#  Global variables that control the program action and output.
#------------------------------------------------------------------------------#
our $PRINT_HITS_BY_ADDRESS = 1;     # set 'true' to see num hits by IP address
our $PRINT_HITS_BY_FILE    = 1;     # set 'true' to see num hits by file
our $IGNORE_GIF_FILES      = 1;     # set 'true' to ignore GIF files in output
our $IGNORE_JPG_FILES      = 1;     # set 'true' to ignore JPG files in output
our $NUM_RECS_TO_PRINT     = 50;    # num output recs to print per section
our @indexFilenames        = ('index.htm', 'index.html', 'index.shtml');

#------------------------------------------------------------------------------#
#  Shared state: the log file to analyze (set from the FILENAME form field)
#  and the two tally hashes filled in by readLogFile().  The sort helpers
#  below read the hashes directly, so they must stay package variables.
#------------------------------------------------------------------------------#
our $LOG_FILE_TO_READ;
our %numHits;             # client address  => number of hits
our %numFileRequests;     # requested file  => number of hits

#--------------------<<  hashNumericAscending  >>----------------------#
#  'sort' helper functions.  Each orders hash keys by the numeric
#  'value' stored in the corresponding tally hash, in ascending order
#  (lowest to highest).  If the 'value' of a hash is not numeric, you
#  should use 'cmp' for the comparison instead.
#-----------------------------------------------------------------------#
sub hashNumericAscending {
    $numHits{$a} <=> $numHits{$b};
}

sub fileNumericAscending {
    $numFileRequests{$a} <=> $numFileRequests{$b};
}

#--------------------<<  hashNumericDescending  >>---------------------#
#  'sort' helper functions.  Same as above, but the output is in
#  descending order (highest to lowest).
#-----------------------------------------------------------------------#
sub fileNumericDescending {
    $numFileRequests{$b} <=> $numFileRequests{$a};
}

sub hashNumericDescending {
    $numHits{$b} <=> $numHits{$a};
}

#--------------------------<<  escapeHtml  >>--------------------------#
#  Private helper.  Returns a copy of the given text with the HTML
#  metacharacters & < > " replaced by entities, or '' when the text is
#  undefined.  Used for everything that comes from the request or from
#  the log file, because both are untrusted and are printed into an
#  HTML page.
#-----------------------------------------------------------------------#
sub escapeHtml {
    my ($text) = @_;
    return '' unless defined $text;

    $text =~ s/&/&amp;/g;    # must run first so later entities survive
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

#--------------------------<<  readLogFile  >>-------------------------#
#  Reads the access log named by $LOG_FILE_TO_READ, tallies hits per
#  client address (%numHits) and per requested file (%numFileRequests),
#  then prints up to $NUM_RECS_TO_PRINT records for each report that is
#  enabled by $PRINT_HITS_BY_FILE / $PRINT_HITS_BY_ADDRESS.
#
#  Takes no arguments and returns nothing useful; all input and output
#  goes through the globals above and STDOUT.  If the file cannot be
#  opened an error message is printed and the sub returns early.
#-----------------------------------------------------------------------#
sub readLogFile {

    # Three-argument open: the name comes from a form field, and with the
    # two-argument form a value such as "cmd |" would be executed.
    my $logHandle;
    unless (defined $LOG_FILE_TO_READ
            && open($logHandle, '<', $LOG_FILE_TO_READ)) {
        print "Error occurred trying to open the data file ";
        print "named ", escapeHtml($LOG_FILE_TO_READ), "\n";
        return;
    }

    #---------------------------------#
    #  initialize the hash variables  #
    #---------------------------------#
    %numHits         = ();
    %numFileRequests = ();

    LINE:
    while (my $line = <$logHandle>) {
        chomp $line;

        # condense one or more whitespace characters to one single space
        $line =~ s/\s+/ /g;

        # Break the line of the access_log into its nine fields:
        #   0 client address   1 rfc1413     2 username
        #   3 local time       4 request     5 status code
        #   6 bytes sent       7 referer     8 client software
        # Only the client address and the request are needed here.
        my @fields = $line =~
            /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+) \"(.*)\" \"(.*)\"/;

        # skip lines that are not in the expected format, so they are
        # not tallied under an undefined key
        next LINE unless @fields;

        my ($clientAddress, $httpRequest) = @fields[0, 4];

        # take care of problem where the request may simply be a hyphen
        next LINE if $httpRequest =~ /^-$/;

        # the request is "METHOD file protocol"; only the file is used
        my (undef, $fileRequested) = split(' ', $httpRequest, 3);
        next LINE unless defined $fileRequested;

        next LINE if $IGNORE_GIF_FILES && $fileRequested =~ /\.gif$/i;
        next LINE if $IGNORE_JPG_FILES && $fileRequested =~ /\.jpg$/i;

        #-------------------------------------------------------------#
        #  If the base filename is exactly index.htm, index.html, or
        #  index.shtml (any case), interpret it to be the same as the
        #  path by itself.  dirname() drops the trailing slash, so it
        #  is put back: this way '/java/index.html' is tallied under
        #  the same key as '/java/'.
        #-------------------------------------------------------------#
        my $baseName = lc basename($fileRequested);
        foreach my $indexFile (@indexFilenames) {
            next unless $baseName eq lc $indexFile;
            $fileRequested = dirname($fileRequested);
            $fileRequested .= '/' unless $fileRequested =~ m{/$};
            last;
        }

        # count the hit by IP address and by file
        $numHits{$clientAddress}++;
        $numFileRequests{$fileRequested}++;
    }
    close($logHandle);

    #--------------------------------------#
    #  Output the number of hits per file  #
    #--------------------------------------#
    if ($PRINT_HITS_BY_FILE) {
        print "NUMBER OF HITS PER FILE:\n";
        print "------------------------\n\n";
        my $count = 0;
        foreach my $key (sort fileNumericDescending (keys(%numFileRequests))) {
            last if ($count >= $NUM_RECS_TO_PRINT);
            print "$numFileRequests{$key} \t\t ", escapeHtml($key), "\n";
            $count++;
        }
        print "\n\n";
    }

    #--------------------------------------------#
    #  Output the number of hits per IP address  #
    #--------------------------------------------#
    if ($PRINT_HITS_BY_ADDRESS) {
        print "NUMBER OF HITS PER IP ADDRESS:\n";
        print "------------------------------\n\n";
        my $count = 0;
        foreach my $key (sort hashNumericDescending (keys(%numHits))) {
            last if ($count >= $NUM_RECS_TO_PRINT);
            print "$numHits{$key} \t\t ", escapeHtml($key), "\n";
            $count++;
        }
        print "\n\n";
    }
    return;
}

#------------------------------------------------------------------------------#
#  Main: with no request parameters show the input form, otherwise run the
#  analysis on the submitted file name.
#
#  The form text and the report are laid out with newlines, tabs and padded
#  labels, so both are wrapped in <pre> ... </pre> to keep that layout in the
#  browser.
#
#  NOTE(review): any file readable by the web server user can be named in
#  FILENAME.  Consider restricting it to a known log directory before
#  exposing this script.
#------------------------------------------------------------------------------#
my $q = CGI->new;

print $q->header;
print $q->start_html('Log file analysis');

if (!$q->param) {
    print $q->h1('Log-Reader (version 1.0)'),
        $q->start_form,
        $q->br,
        '<pre>',
        "This program reads an Apache access_log file in ECLF \n",
        "format, and prints the top URLs that were accessed, \n",
        "as well as the TCP/IP addresses with the most hits \n",
        "registered in your access_log file.\n\n",
        'File to analyze:  ',
        $q->textfield(-name    => 'FILENAME',
                      -default => '/usr/local/etc/apache/logs/access_log.10091998',
                      -size    => 60),
        $q->br,
        'Records to print: ',
        $q->textfield(-name    => 'NUMRECS',
                      -default => '25',
                      -size    => 5),
        "\n</pre>\n",
        ' ',
        $q->submit,
        "\n",
        $q->end_form;
}
else {
    print "<pre>\n";

    $LOG_FILE_TO_READ = $q->param('FILENAME');

    # Accept only a plain non-negative integer; anything else keeps the
    # default, since a non-numeric limit would compare as 0 and suppress
    # the whole report.
    my $requestedRecs = $q->param('NUMRECS');
    if (defined $requestedRecs && $requestedRecs =~ /^\s*(\d+)\s*$/) {
        $NUM_RECS_TO_PRINT = $1;
    }

    readLogFile();

    print "</pre>\n";
}

print $q->end_html;