#!/usr/bin/perl
require 5;
# desc{  convert Common Log Format files to XML  } sburke~cpan.org
my $VERSION = ('Time-stamp: "2015-09-26 01:09:14 MDT"' =~ m/\"([0-9-]+)/g)[0];
use 5.8.1;  # for the "//" operator
use strict;
use Carp;

=for html <a href="./clf2xml">Download here</a>

=head1 NAME

clf2xml -- convert Common Log Format files to XML

=head1 SYNOPSIS

 % clf2xml web_server_access.log > hits.xml

Or to take a closer look:

 % tail -1 stuff.log
 
 65.74.65.44 - banjo [25/Jul/2005:14:07:53 -0700] "GET /cgis/
 docs/?action=edit&page_id=k%C3%ADl HTTP/1.1" 200 3228 "http:
 //www.google.com/search?num=30&hl=en&lr=&c2coff=1&q=haida+k%
 C3%ADl&btnG=Search" "Mozilla/5.0 (Windows; U; Windows NT 5.1
 ; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6"
 
 % tail -1 stuff.log | clf2xml
 
 <hits>
 
 <hit
  xml:id = "L-1"
  user_hostname = "65.74.65.44"
  user_ident = ""
  user_http = "banjo"

  date = "2005-07-25T14:07:53-0700"
  date_year   = "2005"
  date_month  = "07"
  date_day    = "25"
  date_hour   = "14"
  date_minute = "07"
  date_second = "53"
  date_timezone_offset = "-0700"
 
  host = ""
  path = "/cgis/docs/"
  querystring = "action=edit&#38;page_id=k%C3%ADl"
  path_and_query = "/cgis/docs/?action=edit&#38;page_id=k%C3%ADl"
 
  method = "GET"
  status = "200"
  protocol = "HTTP/1.1"
  bytes_returned = "3228"
  useragent = "Mozilla/5.0 (Windows;&#32;U;&#32;Windows NT 5.1;
 &#32;en-US;&#32;rv:1.7.10) Gecko/20050716 Firefox/1.0.6"
 
  refer = "http://www.google.com/search?num=30&#38;hl=en&#38;lr
 =&#38;c2coff=1&#38;q=haida+k%C3%ADl&#38;btnG=Search"
  refer_query = "num=30&#38;hl=en&#38;lr=&#38;c2coff=1&#38;q=
 haida+k%C3%ADl&#38;btnG=Search"
  refer_http_host = "www.google.com"
 >
   <query>
 	<param	name = "action"   value = "edit" />
 	<param	name = "page_id"  value = "k&#237;l" />
   </query>
   <refer_query>
 	<param	name = "num"	value = "30" />
 	<param	name = "hl"	value = "en" />
 	<param	name = "lr"	value = "" />
 	<param	name = "c2coff"	value = "1" />
 	<param	name = "q"	value = "haida k&#237;l" />
 	<param	name = "btnG"	value = "Search" />
   </refer_query>
 </hit>
 
 
 </hits>


=head1 DESCRIPTION

This is a small Perl utility to convert an Apache(-style) Common Log
Format log file into XML.

=head1 OPTIONS

Run "clf2xml --help" to list all the options.

=head1 COPYRIGHT AND DISCLAIMER

Copyright (c) 2005 Sean M. Burke. All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

The programs and documentation in this file are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.

=head1 AUTHOR

Sean M. Burke, sburkeE<64>cpan.org

=cut

#---------------------------------------------------------------------------

use warnings;
use utf8;
use constant  DEBUG => 0;
my $opt_help;
use Getopt::Long;
GetOptions(
  'help'       => \   $opt_help,
  'version'    => \my $opt_version,
  'nowrapper'  => \my $opt_nowrapper,
  'idprefix=s' => \my $opt_idprefix       ) or $opt_help = 1;

if($opt_help) {
  print
"clf2xml -- convert Common Log Format hit entries to XML
                    [ version $VERSION  sburke\x40cpan.org ]
Options:
 --help          Show this message, then quit.
 --version       Report version, then quit.
 --nowrapper     Don't wrap the output in <hits>...</hits>
 --idprefix=abc  Prefex xml:id values with 'abc'      (default is: 'L-')
                 Specifying "." as a prefix is shorthand for a longish
                  unique string and using that as the prefix.

Examples:
 clf2xml < whatever.log > whatever.xml
 cat *.log | fgrep thingy.cgi | clf2xml | log_analyzer
 clf2xml logs/mysite_yesterday.log logs/mysite.log > hits.xml
 tail -f access.log | clf2xml --nowrapper
";
  exit;
}
if($opt_version) { print "clf2xml version $VERSION\n"; exit; }

#---------------------------------------------------------------------------

use constant  IP => 0;
use constant  USER_IDENT => 1;
use constant  USER_HTTP => 2;
use constant  DAY => 4;
use constant  MONTH => 5;
use constant  YEAR => 6;
use constant  HOUR => 8;
use constant  MIN  => 9;
use constant  SEC  => 10;
use constant  TZOFFSET => 11;
use constant  METHOD   => 12;
use constant  PATH_AND_QUERY => 13;
use constant  PROTOCOL       => 14;
use constant  STATUS_CODE    => 15;
use constant  BYTES_RETURNED => 16;
use constant  REFER          => 17;
use constant  USERAGENT      => 18;

my $pref = $opt_idprefix || "L-";
if($pref eq '.') {
  $pref = sprintf("h%x-%x-%x-",
    $^T,
    $$,
    defined( &Win32::GetTickCount )
        ? (Win32::GetTickCount() & 0xff)
        : int(rand 256)
     # Under MSWin, $$ values get reused quickly!
  );
}
DEBUG and print "idprefix set to <$pref>\n";

$opt_nowrapper  or  print "<hits>\n\n";


my %m;
@m{ qw<Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec> }
  = map sprintf("%02d",$_), 1 .. 12;

my( @x, $orig_query, $orig_refer_query,
    $host, $query, $path,
    $refer_host, $refer_query
);


while(<>) {

  @x = m{^

  ([-._a-zA-Z0-9]+)  # %h: IP address            Col 1
   \x20
  (\S+) # %l: identd username                    Col 2
   \x20
  (\S+) # %u: HTTPAuth username                  Col 3
   \x20
  \[ # %t: timestamp                             Col 4
    (
     (\d\d) # day-of-month                       Col 5
      /
     (Jan|Feb|Mar|Apr|May|Jun
     |Jul|Aug|Sep|Oct|Nov|Dec) # monthname       Col 6
      /
     ([12]\d\d\d) # year                         Col 7
    )
     :
    ( # time-of-day                              Col 8
     (\d\d):(\d\d):(\d\d)  # hrs, mns, secs      Col 9,10,11
    )
     \x20
    ([-+]\d\d\d\d)     # TZoffset                Col 12
  \]
   \x20
  \" # %r -- request line
   ([A-Za-z]+) # request method                  Col 13
    \x20
   (\S+)  # path + query-string                  Col 14
    \s+
   (\S+)  # protocol                             Col 15
  \"
   \x20 
  (\d\d\d)  # %>s: status code                   Col 16
   \x20
  ([-0-9]+)  # %b: bytecount of return           Col 17

  (?:   # Combined Log Format extensions:
    \x20
    \"([^\n\r"]*)" # %{Referer}i                   Col 18
    \x20
    \"([^\n\r"]*)" # %{User-agent}i                Col 19
  )?

   [\n\r]*
   $
  }xs;

  unless( @x ) {
    DEBUG and print "Rejecting line: $_\n";
    next;
  }

  $x[5] = $m{$x[5]};
  push @x, '', '' unless @x == 19;

  for(@x[USER_IDENT, USER_HTTP, REFER, USERAGENT ]) { $_ = '' if $_ eq '-'; }
  $x[BYTES_RETURNED] = '0'  if  $x[BYTES_RETURNED] eq '-';

  ($host, $query, $path) = ('','','');
  if($x[PATH_AND_QUERY] =~
    m{\A
      (?:
        https?://
        (?:  [^\/\@\?]*\@  )?  # userinfo

        (    [^\/\:\?]+  )     # 1: hostname

        (?:\:\d+)? # port
      )?
      ( / [^\?]* )           # 2: path
      (?: \? (.*) )?         # 3: query
    }x
  ) {
    DEBUG > 10 and print "Zonk <", $1 || '', "> <", $2 || '', "> <", $3 || '',
     "> on path-and-query $x[PATH_AND_QUERY]\n";
    ($host, $path, $query) = map {; defined($_) ? $_ : '' } ($1,$2,$3);
  } else {
    DEBUG > 10 and print "Path-and-query <<$x[PATH_AND_QUERY]>> is weird\n";

  }


  ($refer_host, $refer_query) = ('','');
  if($x[REFER] =~
    m{\A
      http://
      (?:  [^\/\@\?]*\@  )?  # userinfo

      (    [^\/\:\?]+    )   # 1: hostname

      (?:\:\d+)? # port
      (?: / [^\?]* )           #
      (?: \? (.*) )?         # 2: query
    }x
  ) {
    $refer_host  = $1;
    $refer_query = $2 // '';


    DEBUG > 10 and print "Refer-Hit: <", $1 // "~" ,"> <", $2 // "~",
                           "> on $x[REFER]" // "norefer", "\n";
    DEBUG > 10 and
    print "<!-- ODD LINE:\n",
      "\t ref_wholestring = {", $x[REFER] // "UNDEF",    "}\n",
      "\t    ref_hostname = {", $1 // "1=UNDEF",         "}\n",  
      "\t       ref_query = {", $2 // "2=UNDEF",         "}\n",
      "\t      whole_line = {", $_ // "line=UNDEF",      "}\n",
      "-->\n",
    ;
  } else {
    DEBUG > 10 and print "Refer-fail on <<$x[REFER]>>\n";
  }

  if(DEBUG > 13 ) {
    my($qu,$rq) = ($orig_query, $refer_query);
    $qu = "UNDEF" unless defined $qu;
    $rq = "UNDEF" unless defined $rq;
    #print "{{qu=\"$qu\" | \"$rq\"}}\n";
  }

  $orig_query       = $query;
  $orig_refer_query = $refer_query;
  $_ = esc($_) foreach(
     $host,
     $query,
     $path,
     $refer_host,
     $refer_query,
    )
  ;
  $_ = esc($_) for( @x );


  print qq{<hit
 xml:id = "$pref$."
 user_hostname = "$x[IP]"
 user_ident = "$x[USER_IDENT]"
 user_http = "$x[USER_HTTP]"

 date = "$x[YEAR]-$x[MONTH]-$x[DAY]T$x[HOUR]:$x[MIN]:$x[SEC]$x[TZOFFSET]"
 date_year   = "$x[YEAR]"
 date_month  = "$x[MONTH]"
 date_day    = "$x[DAY]"
 date_hour   = "$x[HOUR]"
 date_minute = "$x[MIN]"
 date_second = "$x[SEC]"
 date_timezone_offset = "$x[TZOFFSET]"

 host = "$host"
 path = "$path"
 querystring = "$query"
 path_and_query = "$x[PATH_AND_QUERY]"

 method = "$x[METHOD]"
 status = "$x[STATUS_CODE]"
 protocol = "$x[PROTOCOL]"
 bytes_returned = "$x[BYTES_RETURNED]"
 useragent = "$x[USERAGENT]"

 refer = "$x[REFER]"
 refer_query = "$refer_query"
 refer_http_host = "$refer_host"
>
};

  if(length $orig_query      ){query2params('query',       $orig_query      )}
  if(length $orig_refer_query){query2params('refer_query', $orig_refer_query)}

  print "</hit>\n\n\n";

}

$opt_nowrapper  or  print "</hits>\n";

#----------------------------------------------------------------------
my %char2esc;
BEGIN {
  # Plain escaping
  for(0 .. 255) { $char2esc{chr $_} = "&#$_;" }

  # Now Move fishy stuff into U+E000 - U+E00FF (e.g., U+E07f-U+E09f)
  #  ThedFrom XML 1.1 spec says: "[#x1-#x8] | [#xB-#xC] | [#xE-#x1F]
  #   | [#x7F-#x84] | [#x86-#x9F]"
  for(
   0 .. 0x08,  0x0b,  0x0c,  0x0e .. 0x1f,  0x7f .. 0x9f
  ) { $char2esc{chr $_} = sprintf "&#xE0%02x;", $_; }
}

{
my($el, $query, $name, $value, $pair);
sub query2params {
  ($el, $query) = @_;
  print "  <$el>\n";

  for $pair (split /[&;]/, $query) {
    $pair = "=" unless defined $pair;  # so ...;zot=thing;=;whatever" has a "" = ""
    ($name, $value) = split('=', $pair,2);
    $name  = '' unless defined $name;
    $value = '' unless defined $value;
    $value =~ tr/+/ /;
    $value =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;
    $name  =~ tr/+/ /;
    $name  =~ s/%([a-fA-F0-9]{2})/pack("C", hex($1))/eg;

    utf8::decode($name ); # silently does nothing, if not actually good utf8
    utf8::decode($value);

    $name  = esc($name);
    $value = esc($value);
    print qq{\t<param\tname = "$name"\tvalue = "$value" />\n};
  }

  print "  </$el>\n";
  return;
}
}

sub e { return sprintf "&#x%x;", ord( $_[0] ) }

sub esc {
  confess "Got undef as arg to esc" unless defined $_[0];
  return '' unless defined $_[0];
  return $_[0]  if $_[0] eq '' or  $_[0] =~ m/\A[-\:a-zA-Z0-9\/\.]+\z/;
     # optimization for the most common cases

  my $x = $_[0];
  $x =~ s{([^\x28-\x3b\x3f-\x7e\!\#\$\%\=\'])} # things not needing escaping
         {$char2esc{$1}||=e($1)}eg
  and
# $x =~ s{([^;])&#32;([^&])}{$1 $2}g; # common readability optimization
#0; 0 and
  $x =~
   s{
     (?<= [^;] )   # not (?! ; ) !
              &\#32;
     (?=  [^&] )   # not (?! & ) !
    }{ }xg; # common readability optimization

  # And something we apparently couldn't get above, so doing it here:
  $x =~ s<(\S)&#32;(\S)><$1 $2>g;
   # ...so that some "&#32;" can turn back to " ".
   #      "5.1;&#32;rv:"
   #   => "5.1; rv:"


  return $x;
}

__END__
