# Time-stamp: "1999-09-10 21:18:03 MDT"

# Package whose function, maker(), reads a braille.tab file, and from it
#  cooks up a subroutine that braille-encodes individual words.
# Doesn't handle proper encoding of punctuation, or multi-word ligations
#  like "by the".
# Hasn't been tested except on American English.

package Lingua::Braille::Starfish;
require 5.005;  # we need fancy RE things.
use strict;
use vars qw($Debug $VERSION $Prec_block);

$Debug = 0 unless defined $Debug;
$VERSION = "0.31";

$Prec_block =  # the default rules-precedence block
 [
  # R# => RE frame, precedence
  # high number = low precedence
  # in order of decreasing specificity
  [ 4 => "\\B%s\\B", 200],  #4-only in middle
  [10 => "\\B%s\\b", 300],  #10-only at end
     
  [11 => "\\b%s\\B", 400],
     
  [ 2 => "\\b%s\\b", 500],  #2-must be exact match
  [ 3 => "\\b%s",    600],  #3-at beginning or all
  [ 7 => "\\B%s",    700],  #7-not at beginning
   # hack for:
   #11 at beginning but not all and not followed by punctuation or number
   # really just: at beginning but not all
     
  [ 1 => "%s",       950],  #1-use anywhere
  # [ 5 => "%s",       998],  #5-joins with same type  [TODO: fix!]
 ],
;

###########################################################################

sub init {
  use vars '&encode';
  my $routine_r = maker( $Prec_block, @_ );
  *encode = $routine_r;
  return;
}

###########################################################################

sub maker {
  # Either returns a closure that encodes single words, or dies.
  my $prec_block = $_[0];
  die "first param to maker isn't a ref!" unless ref $prec_block;

  my $Rules_re = undef;
  my @Substitutions; # a list of hasherefs (replacement lexicons, from => to)

  # Now read from the prec block.
  my %Rule_num_frames; # RE frames for rule types we know about
  my %Rule_num_precedence; # SMALLER numbers equals HIGHER precedence
  foreach my $r (@$prec_block) {
    $Rule_num_frames{$r->[0]} = $r->[1];
    $Rule_num_precedence{$r->[0]} = $r->[2];
  }
  
  my @files = grep { !ref($_) && length($_) && -f $_ } @_; # TODO: make this smart.
  die "I need rules!" unless @files;

  print "* ", scalar(@files), " rule files to read\n" if $Debug;
  my %Rule_num_seen;  # counter of number of subrules

 Each_file:
  my $sub_rule_count = 0;
  foreach my $f (@files) {
    die "Can't open $f" unless open(IN, "<$f");
    print "* Reading from $f\n" if $Debug;
    while(<IN>) {
      chomp;
      next unless /^-?(\d+)\|([^|]+)\|(\S+)/;

      next if $1 == 1 and length($2) == 1 and $2 eq $3;
       # really no point in saying "replace A with A anywhere"

      print "Subrule of type \xAB$1\xBB : \xAB$2\xBB => \xAB$3\xBB\n"
       if $Debug > 2;

      ++$sub_rule_count;
      my($rule_number, $from, $to) = ($1,$2,$3);

      $rule_number = 1 if $rule_number == 5; # HACK!!
      $rule_number = 3 if $rule_number == 8; # HACK!!
      $rule_number = 2 if $rule_number == 9; # HACK!!

      unless(exists $Rule_num_seen{$rule_number}) {
        $Rule_num_seen{$rule_number} ||= 1;
         $Substitutions[$rule_number] ||= {};
      }

      print "\"$_\" duplicates existing rule; possible ordering paradox\n"
       if $Debug && exists $Substitutions[$rule_number]{$from};

      $Substitutions[$rule_number]{$from} = $to;
    }
    close(IN);
  }
  print "* $sub_rule_count rule statements read.\n\n" if $Debug;
  # end of Each_rule

  #-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  # Cook up the RE.

  # prep for the Schwartzian transform now.
  my @bits;

  print  "Rule frames:\n" if $Debug;

 Each_rule_number:
  foreach my $rule_number (sort { $a <=> $b } keys %Rule_num_seen
  ) {
    my $prec = $Rule_num_precedence{$rule_number};
    print
      " \# $rule_number : \"", 
      $Rule_num_frames{$rule_number} || '?',
      "\" ", scalar(keys %{$Substitutions[$rule_number]} ),
      " subrules (prec ",
      $prec || '?', ")\n"
     if $Debug;

    unless(exists($Rule_num_frames{$rule_number})) {
      print "   * No frame for rule number $rule_number; skipping\n" if $Debug;
      next;
    }
    unless(defined($prec)) {
      print "   * No known precedence for rule number $rule_number; skipping\n" if $Debug;
      next;
    }

    push @bits,
      map
      #   0            1                         2          3
      # literal,     frame,                    rule #,      precedence
       [ $_, $Rule_num_frames{$rule_number}, $rule_number, $prec ],
       keys %{$Substitutions[$rule_number]};
  }

  print scalar(@bits), " bits\n" if $Debug;
  die "No rule \@bits?" unless @bits;

  # TODO:
  # An optimization should be made here -- for a given input string
  # $X across all rule contexts K, is it true that
  # all defined values of $Substitutions[K]{$X} are the same?
  # If so, then we don't need all the context-sensitivity we've
  # fought so hard for, with all this (?{$r=%s}) business.
  # In that degenerate case, just cook up a %Substitutions that
  # maps from input values to output values (regardless of context,
  # since we've established that this doesn't matter), cook up an RE
  # for that (same as below, minus the (?{$r=%s})), and then
  # return a sub as below, but without the thing that messes with $r,
  # and with basically just:
  #   $in =~ s/($Rules_re)/$Substitutions{$1}/oeg;
  #
  # But in any case, that's just a special optimization.  Contextless
  # rule systems will still be treated just as well by the code below,
  # if not quite as quickly.

  $Rules_re =
    join '|',
      map { 
        printf "literal %s ; frame '%s' ; rule# %s ; prec %s ;  => %s\n",
          $_->[0], $_->[1], $_->[2], $_->[3],
          $Substitutions[$_->[2]]{$_->[0]}
         if $Debug > 1;
        sprintf
          $_->[1] . '(?{$r=%s})',   # e.g.: \Bea(?{$r=3})
          quotemeta($_->[0]),
          $_->[2]
      }
      sort {  # Sort first by 
        length($b->[0]) <=> length($a->[0])   # ...Length of the literal
        or $a->[3] <=> $b->[3]  # Then by precedence
        or $a->[2] <=> $b->[2]  # (Then by rule number)
        or $a cmp $b   # Then by alpha order of the literals
      }
      @bits
  ;
  die "Rules_re is 0-length!" unless length $Rules_re;

  print "\nMaster RE (", length($Rules_re), " bytes):\n\xAB$Rules_re\xBB\n\n"
    if $Debug > 1;

  # return this closure:
  sub {
    use re 'eval';  # allow ({?CODE}) blocks in RE code from variables.
    my $in = $_[0];  # !! TO DO -- do caps handling better

    use vars '$r';
    local $r; # variables in RE (?{ CODE }) blocks need to be package vars

    my $case_prefix;
    my $lc = lc($in);
    if($in eq $lc) {                $case_prefix = '';   # all lc
    } elsif($in eq ucfirst($lc)) {  $case_prefix = ',';  # initial cap
    } elsif($in eq uc($lc)) {       $case_prefix = ',,'; # allcaps (and > 1 char)
    } else {                        $case_prefix = ',';
                                      # mixed case -- TODO: deal with correctly
    }
    # printf "<%s><%s><%s><%s>\n", $in, lc($in), ucfirst($in), uc($in); 
    # print "Case pref for \xAB$in\xBB : \xAB$case_prefix\xBB\n";

    $in = uc($in);

    $in =~ s/($Rules_re)/$Substitutions[$r]{$1}/oeg;

    return $case_prefix . $in;
  };
}

###########################################################################

1;

__END__