#!/usr/bin/perl
require 5;
# desc{ convert Common Log Format files to XML } sburke~cpan.org
my $VERSION = ('Time-stamp: "2015-09-26 01:09:14 MDT"' =~ m/\"([0-9-]+)/g)[0];
use 5.8.1; # for the "//" operator
use strict;
use Carp;
=for html Download here
=head1 NAME
clf2xml -- convert Common Log Format files to XML
=head1 SYNOPSIS
% clf2xml web_server_access.log > hits.xml
Or to take a closer look:
% tail -1 stuff.log
65.74.65.44 - banjo [25/Jul/2005:14:07:53 -0700] "GET /cgis/
docs/?action=edit&page_id=k%C3%ADl HTTP/1.1" 200 3228 "http:
//www.google.com/search?num=30&hl=en&lr=&c2coff=1&q=haida+k%
C3%ADl&btnG=Search" "Mozilla/5.0 (Windows; U; Windows NT 5.1
; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6"
% tail -1 stuff.log | clf2xml
=head1 DESCRIPTION
This is a small Perl utility to convert an Apache(-style) Common Log
Format log file into XML.
=head1 OPTIONS
Run "clf2xml --help" to list all the options.
=head1 COPYRIGHT AND DISCLAIMER
Copyright (c) 2005 Sean M. Burke. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
The programs and documentation in this file are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.
=head1 AUTHOR
Sean M. Burke, sburkeE<64>cpan.org
=cut
#---------------------------------------------------------------------------
use warnings;
use utf8;
use constant DEBUG => 0;
my $opt_help;
use Getopt::Long;
GetOptions(
'help' => \ $opt_help,
'version' => \my $opt_version,
'nowrapper' => \my $opt_nowrapper,
'idprefix=s' => \my $opt_idprefix ) or $opt_help = 1;
if($opt_help) {
print
"clf2xml -- convert Common Log Format hit entries to XML
[ version $VERSION sburke\x40cpan.org ]
Options:
--help Show this message, then quit.
--version Report version, then quit.
--nowrapper Don't wrap the output in ...
--idprefix=abc Prefex xml:id values with 'abc' (default is: 'L-')
Specifying "." as a prefix is shorthand for a longish
unique string and using that as the prefix.
Examples:
clf2xml < whatever.log > whatever.xml
cat *.log | fgrep thingy.cgi | clf2xml | log_analyzer
clf2xml logs/mysite_yesterday.log logs/mysite.log > hits.xml
tail -f access.log | clf2xml --nowrapper
";
exit;
}
if($opt_version) { print "clf2xml version $VERSION\n"; exit; }
#---------------------------------------------------------------------------
use constant IP => 0;
use constant USER_IDENT => 1;
use constant USER_HTTP => 2;
use constant DAY => 4;
use constant MONTH => 5;
use constant YEAR => 6;
use constant HOUR => 8;
use constant MIN => 9;
use constant SEC => 10;
use constant TZOFFSET => 11;
use constant METHOD => 12;
use constant PATH_AND_QUERY => 13;
use constant PROTOCOL => 14;
use constant STATUS_CODE => 15;
use constant BYTES_RETURNED => 16;
use constant REFER => 17;
use constant USERAGENT => 18;
my $pref = $opt_idprefix || "L-";
if($pref eq '.') {
$pref = sprintf("h%x-%x-%x-",
$^T,
$$,
defined( &Win32::GetTickCount )
? (Win32::GetTickCount() & 0xff)
: int(rand 256)
# Under MSWin, $$ values get reused quickly!
);
}
DEBUG and print "idprefix set to <$pref>\n";
$opt_nowrapper or print "\n\n";
my %m;
@m{ qw }
= map sprintf("%02d",$_), 1 .. 12;
my( @x, $orig_query, $orig_refer_query,
$host, $query, $path,
$refer_host, $refer_query
);
while(<>) {
@x = m{^
([-._a-zA-Z0-9]+) # %h: IP address Col 1
\x20
(\S+) # %l: identd username Col 2
\x20
(\S+) # %u: HTTPAuth username Col 3
\x20
\[ # %t: timestamp Col 4
(
(\d\d) # day-of-month Col 5
/
(Jan|Feb|Mar|Apr|May|Jun
|Jul|Aug|Sep|Oct|Nov|Dec) # monthname Col 6
/
([12]\d\d\d) # year Col 7
)
:
( # time-of-day Col 8
(\d\d):(\d\d):(\d\d) # hrs, mns, secs Col 9,10,11
)
\x20
([-+]\d\d\d\d) # TZoffset Col 12
\]
\x20
\" # %r -- request line
([A-Za-z]+) # request method Col 13
\x20
(\S+) # path + query-string Col 14
\s+
(\S+) # protocol Col 15
\"
\x20
(\d\d\d) # %>s: status code Col 16
\x20
([-0-9]+) # %b: bytecount of return Col 17
(?: # Combined Log Format extensions:
\x20
\"([^\n\r"]*)" # %{Referer}i Col 18
\x20
\"([^\n\r"]*)" # %{User-agent}i Col 19
)?
[\n\r]*
$
}xs;
unless( @x ) {
DEBUG and print "Rejecting line: $_\n";
next;
}
$x[5] = $m{$x[5]};
push @x, '', '' unless @x == 19;
for(@x[USER_IDENT, USER_HTTP, REFER, USERAGENT ]) { $_ = '' if $_ eq '-'; }
$x[BYTES_RETURNED] = '0' if $x[BYTES_RETURNED] eq '-';
($host, $query, $path) = ('','','');
if($x[PATH_AND_QUERY] =~
m{\A
(?:
https?://
(?: [^\/\@\?]*\@ )? # userinfo
( [^\/\:\?]+ ) # 1: hostname
(?:\:\d+)? # port
)?
( / [^\?]* ) # 2: path
(?: \? (.*) )? # 3: query
}x
) {
DEBUG > 10 and print "Zonk <", $1 || '', "> <", $2 || '', "> <", $3 || '',
"> on path-and-query $x[PATH_AND_QUERY]\n";
($host, $path, $query) = map {; defined($_) ? $_ : '' } ($1,$2,$3);
} else {
DEBUG > 10 and print "Path-and-query <<$x[PATH_AND_QUERY]>> is weird\n";
}
($refer_host, $refer_query) = ('','');
if($x[REFER] =~
m{\A
http://
(?: [^\/\@\?]*\@ )? # userinfo
( [^\/\:\?]+ ) # 1: hostname
(?:\:\d+)? # port
(?: / [^\?]* ) #
(?: \? (.*) )? # 2: query
}x
) {
$refer_host = $1;
$refer_query = $2 // '';
DEBUG > 10 and print "Refer-Hit: <", $1 // "~" ,"> <", $2 // "~",
"> on $x[REFER]" // "norefer", "\n";
DEBUG > 10 and
print "\n",
;
} else {
DEBUG > 10 and print "Refer-fail on <<$x[REFER]>>\n";
}
if(DEBUG > 13 ) {
my($qu,$rq) = ($orig_query, $refer_query);
$qu = "UNDEF" unless defined $qu;
$rq = "UNDEF" unless defined $rq;
#print "{{qu=\"$qu\" | \"$rq\"}}\n";
}
$orig_query = $query;
$orig_refer_query = $refer_query;
$_ = esc($_) foreach(
$host,
$query,
$path,
$refer_host,
$refer_query,
)
;
$_ = esc($_) for( @x );
print qq{
};
if(length $orig_query ){query2params('query', $orig_query )}
if(length $orig_refer_query){query2params('refer_query', $orig_refer_query)}
print "\n\n\n";
}
$opt_nowrapper or print "\n";
#----------------------------------------------------------------------
my %char2esc;
BEGIN {
# Plain escaping
for(0 .. 255) { $char2esc{chr $_} = "$_;" }
# Now Move fishy stuff into U+E000 - U+E00FF (e.g., U+E07f-U+E09f)
# ThedFrom XML 1.1 spec says: "[#x1-#x8] | [#xB-#xC] | [#xE-#x1F]
# | [#x7F-#x84] | [#x86-#x9F]"
for(
0 .. 0x08, 0x0b, 0x0c, 0x0e .. 0x1f, 0x7f .. 0x9f
) { $char2esc{chr $_} = sprintf "à%02x;", $_; }
}
{
my($el, $query, $name, $value, $pair);
sub query2params {
($el, $query) = @_;
print " <$el>\n";
for $pair (split /[&;]/, $query) {
$pair = "=" unless defined $pair; # so ...;zot=thing;=;whatever" has a "" = ""
($name, $value) = split('=', $pair,2);
$name = '' unless defined $name;
$value = '' unless defined $value;
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;
$name =~ tr/+/ /;
$name =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;
utf8::decode($name ); # silently does nothing, if not actually good utf8
utf8::decode($value);
$name = esc($name);
$value = esc($value);
print qq{\t\n};
}
print " $el>\n";
return;
}
}
sub e { return sprintf "%x;", ord( $_[0] ) }
sub esc {
confess "Got undef as arg to esc" unless defined $_[0];
return '' unless defined $_[0];
return $_[0] if $_[0] eq '' or $_[0] =~ m/\A[-\:a-zA-Z0-9\/\.]+\z/;
# optimization for the most common cases
my $x = $_[0];
$x =~ s{([^\x28-\x3b\x3f-\x7e\!\#\$\%\=\'])} # things not needing escaping
{$char2esc{$1}||=e($1)}eg
and
# $x =~ s{([^;]) ([^&])}{$1 $2}g; # common readability optimization
#0; 0 and
$x =~
s{
(?<= [^;] ) # not (?! ; ) !
&\#32;
(?= [^&] ) # not (?! & ) !
}{ }xg; # common readability optimization
# And something we apparently couldn't get above, so doing it here:
$x =~ s<(\S) (\S)><$1 $2>g;
# ...so that some " " can turn back to " ".
# "5.1; rv:"
# => "5.1; rv:"
return $x;
}
__END__