#!/usr/bin/perl require 5; use strict; my $VERSION = ('Time-stamp: "2005-08-21 21:33:58 ADT"' =~ m/"([0-9-]+)/g)[0]; # desc{ convert Common Log Format files to XML } sburke~cpan.org =for html Download here =head1 NAME clf2xml -- convert Common Log Format files to XML =head1 SYNOPSIS % clf2xml web_server_access.log > hits.xml Or to take a closer look: % tail -1 stuff.log 65.74.65.44 - banjo [25/Jul/2005:14:07:53 -0700] "GET /cgis/ docs/?action=edit&page_id=k%C3%ADl HTTP/1.1" 200 3228 "http: //www.google.com/search?num=30&hl=en&lr=&c2coff=1&q=haida+k% C3%ADl&btnG=Search" "Mozilla/5.0 (Windows; U; Windows NT 5.1 ; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6" % tail -1 stuff.log | clf2xml =head1 DESCRIPTION This is a small Perl utility to convert an Apache(-style) Common Log Format log file into XML. =head1 OPTIONS Run "clf2xml --help" to list all the options. =head1 COPYRIGHT AND DISCLAIMER Copyright (c) 2005 Sean M. Burke. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs and documentation in this file are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =head1 AUTHOR Sean M. Burke, sburkeE<64>cpan.org =cut #--------------------------------------------------------------------------- use warnings; use utf8; use constant DEBUG => 0; my $opt_help; use Getopt::Long; GetOptions( 'help' => \ $opt_help, 'version' => \my $opt_version, 'nowrapper' => \my $opt_nowrapper, 'idprefix=s' => \my $opt_idprefix ) or $opt_help = 1; if($opt_help) { print "clf2xml -- convert Common Log Format hit entries to XML [ version $VERSION sburke\x40cpan.org ] Options: --help Show this message, then quit. --version Report version, then quit. --nowrapper Don't wrap the output in ... --idprefix=abc Prefex xml:id values with 'abc' (default is: 'L-') Specifying "." as a prefix is shorthand for a longish unique string and using that as the prefix. Examples: clf2xml < whatever.log > whatever.xml cat *.log | fgrep thingy.cgi | clf2xml | log_analyzer clf2xml logs/mysite_yesterday.log logs/mysite.log > hits.xml tail -f access.log | clf2xml --nowrapper "; exit; } if($opt_version) { print "clf2xml version $VERSION\n"; exit; } #--------------------------------------------------------------------------- use constant IP => 0; use constant USER_IDENT => 1; use constant USER_HTTP => 2; use constant DAY => 4; use constant MONTH => 5; use constant YEAR => 6; use constant HOUR => 8; use constant MIN => 9; use constant SEC => 10; use constant TZOFFSET => 11; use constant METHOD => 12; use constant PATH_AND_QUERY => 13; use constant PROTOCOL => 14; use constant STATUS_CODE => 15; use constant BYTES_RETURNED => 16; use constant REFER => 17; use constant USERAGENT => 18; my $pref = $opt_idprefix || "L-"; if($pref eq '.') { $pref = sprintf("h%x-%x-%x-", $^T, $$, defined( &Win32::GetTickCount ) ? (Win32::GetTickCount() & 0xff) : int(rand 256) # Under MSWin, $$ values get reused quickly! ); } DEBUG and print "idprefix set to <$pref>\n"; $opt_nowrapper or print "\n\n"; my %m; @m{ qw } = map sprintf("%02d",$_), 1 .. 12; my( @x, $orig_query, $orig_refer_query, $host, $query, $path, $refer_host, $refer_query ); while(<>) { @x = m{^ ([-._a-zA-Z0-9]+) # %h: IP address Col 1 \x20 (\S+) # %l: identd username Col 2 \x20 (\S+) # %u: HTTPAuth username Col 3 \x20 \[ # %t: timestamp Col 4 ( (\d\d) # day-of-month Col 5 / (Jan|Feb|Mar|Apr|May|Jun |Jul|Aug|Sep|Oct|Nov|Dec) # monthname Col 6 / ([12]\d\d\d) # year Col 7 ) : ( # time-of-day Col 8 (\d\d):(\d\d):(\d\d) # hrs, mns, secs Col 9,10,11 ) \x20 ([-+]\d\d\d\d) # TZoffset Col 12 \] \x20 " # %r -- request line ([A-Za-z]+) # request method Col 13 \x20 (\S+) # path + query-string Col 14 \s+ (\S+) # protocol Col 15 " \x20 (\d\d\d) # %>s: status code Col 16 \x20 ([-0-9]+) # %b: bytecount of return Col 17 (?: # Combined Log Format extensions: \x20 "([^\n\r"]*)" # %{Referer}i Col 18 \x20 "([^\n\r"]*)" # %{User-agent}i Col 19 )? [\n\r]* $ }xs; unless( @x ) { DEBUG and print "Rejecting line: $_\n"; next; } $x[5] = $m{$x[5]}; push @x, '', '' unless @x == 19; for(@x[USER_IDENT, USER_HTTP, REFER, USERAGENT ]) { $_ = '' if $_ eq '-'; } $x[BYTES_RETURNED] = '0' if $x[BYTES_RETURNED] eq '-'; ($host, $query, $path) = ('','',''); if($x[PATH_AND_QUERY] =~ m{\A (?: https?:// (?: [^\/\@\?]*\@ )? # userinfo ( [^\/\:\?]+ ) # 1: hostname (?:\:\d+)? # port )? ( / [^\?]* ) # 2: path (?: \? (.*) )? # 3: query }x ) { DEBUG > 10 and print "Zonk <", $1 || '', "> <", $2 || '', "> <", $3 || '', "> on path-and-query $x[PATH_AND_QUERY]\n"; ($host, $path, $query) = map {; defined($_) ? $_ : '' } ($1,$2,$3); } else { DEBUG > 10 and print "Path-and-query <<$x[PATH_AND_QUERY]>> is weird\n"; } ($refer_host, $refer_query) = ('',''); if($x[REFER] =~ m{\A http:// (?: [^\/\@\?]*\@ )? # userinfo ( [^\/\:\?]+ ) # 1: hostname (?:\:\d+)? # port (?: / [^\?]* ) # (?: \? (.*) )? # 2: query }x ) { DEBUG > 10 and print "Refer-zonk: <$1> <$2> on $x[REFER]\n"; $refer_host = $1; $refer_query = defined($2) ? $2 : ''; } else { DEBUG > 10 and print "Refer-fail on <<$x[REFER]>>\n"; } $orig_query = $query; $orig_refer_query = $refer_query; $_ = esc($_) for( $host, $query, $path, $refer_host, $refer_query ); $_ = esc($_) for( @x ); print qq{ }; if(length $orig_query ){query2params('query', $orig_query )} if(length $orig_refer_query){query2params('refer_query', $orig_refer_query)} print "\n\n\n"; } $opt_nowrapper or print "\n"; #---------------------------------------------------------------------- my %char2esc; BEGIN { # Plain escaping for(0 .. 255) { $char2esc{chr $_} = "&#$_;" } # Now Move fishy stuff into U+E000 - U+E00FF (e.g., U+E07f-U+E09f) # ThedFrom XML 1.1 spec says: "[#x1-#x8] | [#xB-#xC] | [#xE-#x1F] # | [#x7F-#x84] | [#x86-#x9F]" for( 0 .. 0x08, 0x0b, 0x0c, 0x0e .. 0x1f, 0x7f .. 0x9f ) { $char2esc{chr $_} = sprintf "à%02x;", $_; } } { my($el, $query, $name, $value, $pair); sub query2params { ($el, $query) = @_; print " <$el>\n"; for $pair (split /[&;]/, $query) { ($name, $value) = split('=', $pair,2); $value = '' unless defined $value; $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg; $name =~ tr/+/ /; $name =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg; utf8::decode($name ); # silently does nothing, if not actually good utf8 utf8::decode($value); $name = esc($name); $value = esc($value); print qq{\t\n}; } print " \n"; return; } } sub e { return sprintf "&#x%x;", ord( $_[0] ) } sub esc { return $_[0] if $_[0] eq '' or $_[0] =~ m/\A[-\:a-zA-Z0-9\/\.]+\z/; # optimization for the most common cases my $x = $_[0]; $x =~ s{([^\x28-\x3b\x3f-\x7e\!\#\$\%\=\'])} # things not needing escaping {$char2esc{$1}||=e($1)}eg and # $x =~ s{([^;]) ([^&])}{$1 $2}g; # common readability optimization #0; 0 and $x =~ s{ (?<= [^;] ) # not (?! ; ) ! &\#32; (?= [^&] ) # not (?! & ) ! }{ }xg; # common readability optimization return $x; } __END__