#!/usr/bin/perl
# desc{      Convert Common Log Format files to tab-separated-values format }
#
# clf2tsv -- Convert Common Log Format files to tab-separated-values format
#  Actually uses some Combined Log Format extensions
#
#  Time-stamp: "2011-03-14 03:03:38 ADT"
#  sburke@cpan.org

use strict;
use warnings;

# Maps a three-letter month abbreviation to its zero-padded month number
# ("Jan" => "01" ... "Dec" => "12").
my %month_number_of;
@month_number_of{ qw<Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec> }
  = map sprintf("%02d", $_), 1 .. 12;

# Zero-based index, in the captured field list, of the month-name field
# (output column 6) that gets rewritten to a month number.
my $MONTH_INDEX = 5;

# Matches one whole log line.  It has 19 capture groups, one per output
# column; the "Col N" notes below give the 1-based output column of each.
# (Comments inside the pattern must keep curly braces balanced and must not
# contain sigils, since the pattern is brace-delimited and interpolated.)
my $CLF_LINE_RE = qr{
  \A

  ([-._:a-zA-Z0-9]+)  # %h: host name, IPv4 or IPv6 address   Col 1
   \x20
  (\S+) # %l: identd username                    Col 2
   \x20
  (\S+) # %u: HTTPAuth username                  Col 3
   \x20
  \[ # %t: timestamp
    (  # whole date, as dd/Mon/yyyy              Col 4
     (\d\d) # day-of-month                       Col 5
      /
     (Jan|Feb|Mar|Apr|May|Jun
     |Jul|Aug|Sep|Oct|Nov|Dec) # monthname       Col 6
      /
     ([12]\d\d\d) # year                         Col 7
    )
     :
    ( # time-of-day                              Col 8
     (\d\d):(\d\d):(\d\d)  # hrs, mns, secs      Col 9,10,11
    )
     \x20
    ([-+]\d\d\d\d)     # TZoffset                Col 12
  \]
   \x20
  " # %r -- request line
   ([A-Za-z]+) # request method                  Col 13
    \x20
   (\S+)  # path + query-string                  Col 14
    \s+
   (\S+)  # protocol                             Col 15
  "
   \x20
  (\d\d\d)  # %>s: status code                   Col 16
   \x20
  ([-0-9]+)  # %b: bytecount of return           Col 17

  (?:   # Combined Log Format extensions (both present, or neither):
    \x20
    # Each quoted field may contain backslash escape pairs, so that an
    # escaped double-quote inside the value does not end the field early.
    # The surrounding double-quotes are part of the captured value.
   ("(?:[^\n\r"\\]|\\[^\n\r])*") # %{Referer}i    Col 18
    \x20
   ("(?:[^\n\r"\\]|\\[^\n\r])*") # %{User-agent}i Col 19
  )?

   [\n\r]*
   \z
}xs;

# Main filter loop: read log lines from the files named on the command line
# (or from standard input), and print one tab-separated record per line that
# parses.  Lines that do not parse are skipped silently.
while (my $line = <>) {
  my @fields = parse_clf_line($line);
  next unless @fields;
  print join("\t", @fields), "\n";
}

# parse_clf_line($line)
#
# Splits one Common/Combined Log Format line into its 19 output fields.
#
# Returns the list of 19 fields on success, or an empty list when the line
# does not match the expected format.  In the returned list:
#   * the month-name field (column 6) is replaced by its two-digit number;
#     the whole-date field (column 4) keeps the month name;
#   * the Referer and User-agent fields (columns 18, 19) are empty strings
#     for plain Common Log Format lines that lack them;
#   * any tab inside a field is replaced by a space, so a field can never
#     be mistaken for a column separator.
sub parse_clf_line {
  my ($line) = @_;

  # A successful list-context match always yields one element per capture
  # group, with undef for groups that did not take part in the match (the
  # optional Combined extensions), so undef is turned into '' here.
  my @fields = map { defined($_) ? $_ : '' } $line =~ $CLF_LINE_RE;
  return unless @fields;

  $fields[$MONTH_INDEX] = $month_number_of{ $fields[$MONTH_INDEX] };
  tr/\t/ / foreach @fields;

  return @fields;
}
__END__
