#!/usr/bin/perl
require 5 || exit;
use strict;

#  Sean M. Burke, sburke@cpan.org, 2005
#  Modify the source as you like/want/need.
#   desc{    makes an XML document from Unihan.txt    }

@ARGV == 2 or die(
"unihan2xml - makes an XML document from Unihan.txt
Usage:
   unihan2xml Unihan.txt Unihan.xml
 or:
   unihan2xml Unihan.txt -
      To send the XML code to STDOUT
");

my($In, $Out) = @ARGV;
die "What input?" unless $In and -f $In and -r _ and -s _;

my $Root_entity_name        = 'Unihan';
my $Entity_name             = 'Hanzi'   ;  # han number, as decimal
my $Charnum_field_name_dec  = 'dec' ;  # han number, as decimal
my $Charnum_field_name_hex  = 'hex' ;  # han number, as hex
my $Charnum_field_name_char = 'char';  # actual han character value
my $Record_count = 0;

open_input();
open_output();
header();
data2xml();
byebye();
exit;

#--------------------------------------------------------------------
sub byebye {
  print "<!--\n $Record_count records --></$Root_entity_name>\n";
  print STDERR "$0: Done after ", time() - $^T,
    " seconds for $Record_count records.\n",
  ;
}

#--------------------------------------------------------------------------

sub data2xml {
  print STDERR "$0: Generating XML...\n";

  my(@keys, @values); # for this character
  my($charnum, $k, $v);  # for this line
  my $lastcharnum = '';

  while(<IN>) {
    next unless
      ( $charnum, $k, $v) =
      m/^
        U\+([a-fA-F0-9]{4,6})  # character number
        \t
        k(\S+)        # key name
        \t
        ([^\cm\cj\t]+) # data value
       /sx
    ;
    emit(\@keys, \@values, $lastcharnum) unless $charnum eq $lastcharnum;
    $lastcharnum = $charnum;
    push @keys, $k;
    push @values, $v;
  }
  emit(\@keys, \@values, $charnum);
  close(IN);
}

sub emit {
  my($klist, $vlist, $charnumhex) = @_;
  return unless length($charnumhex) and @$klist and @$vlist;

  ++$Record_count;

  printf "<%s %s='&#x%s;' %s='%s' %s='%s' xml:id='x%s'\n",
    $Entity_name,
    $Charnum_field_name_char,    $charnumhex,
    $Charnum_field_name_hex,     $charnumhex,
    $Charnum_field_name_dec, hex($charnumhex),
      $charnumhex,
  ;

  my $k;
  foreach my $v (@$vlist) { # XML-escape
    $v =~ s/&/&amp;/g;
    $v =~ s/"/&quot;/g;
    $v =~ s/</&lt;/g;
    $v =~ s/>/&gt;/g;

    $k = shift @$klist;
    $k =~ tr/-_a-zA-Z0-9//cd;
    $k =~ s/^([^a-zA-Z_])/k$1/s;
    $k =~ s/^(xml)/x$1/is; # avoid validity problems

    print " ", $k, qq[="], $v, qq["\n];
  }
  splice @$vlist;
  print "/>";
  return;
  
}

#--------------------------------------------------------------------

sub header {
  my $table_name = 'unihan';

  print join "\n",
    qq[<?xml version="1.0" encoding="UTF-8" ?><!-- -*-coding:utf-8;-*-],
    "\t XML generated by unihan2xml by sburke\@cpan.org",
    sprintf("\t on %s \n\t from %s (%s b long, mod %s)",
      scalar(localtime),
      $In,
      -s $In,
      scalar(localtime( (stat($In))[9] ))
    ),
    "\n--><$Root_entity_name\n>"
  ;
}

#--------------------------------------------------------------------

sub open_output {
  if($Out eq '-') {
    # OK, fine.
  } else {
    open OUT, ">$Out" or die "Can't write-open $Out: $!";
    select(OUT);
  }
  return;
}

#--------------------------------------------------------------------

sub reopen_input {
  open(IN, $In) or die "Can't read-open $In: $!";
  binmode(IN);
  return;
}

#--------------------------------------------------------------------

sub open_input {  # open to IN and adjust newline format
  my $x;
  reopen_input($In);
  read(IN, $_, 2000) or die "Can't read from $In: $!";
  close(IN);
  $/ =
    m/((?:\cm\cj+)|\cm|\cj)/s
     ? $1   # yay, nice newline format
     : die "What newline format is $In in?!"  # should never happen
  ;
  
  reopen_input();
}

#--------------------------------------------------------------------
__END__
