# Time-stamp: "1999-09-10 21:18:03 MDT" # Package whose function, maker(), reads a braille.tab file, and from it # cooks up a subroutine that braille-encodes individual words. # Doesn't handle proper encoding of punctuation, or multi-word ligations # like "by the". # Hasn't been tested except on American English. package Lingua::Braille::Starfish; require 5.005; # we need fancy RE things. use strict; use vars qw($Debug $VERSION $Prec_block); $Debug = 0 unless defined $Debug; $VERSION = "0.31"; $Prec_block = # the default rules-precedence block [ # R# => RE frame, precedence # high number = low precedence # in order of decreasing specificity [ 4 => "\\B%s\\B", 200], #4-only in middle [10 => "\\B%s\\b", 300], #10-only at end [11 => "\\b%s\\B", 400], [ 2 => "\\b%s\\b", 500], #2-must be exact match [ 3 => "\\b%s", 600], #3-at beginning or all [ 7 => "\\B%s", 700], #7-not at beginning # hack for: #11 at beginning but not all and not followed by punctuation or number # really just: at beginning but not all [ 1 => "%s", 950], #1-use anywhere # [ 5 => "%s", 998], #5-joins with same type [TODO: fix!] ], ; ########################################################################### sub init { use vars '&encode'; my $routine_r = maker( $Prec_block, @_ ); *encode = $routine_r; return; } ########################################################################### sub maker { # Either returns a closure that encodes single words, or dies. my $prec_block = $_[0]; die "first param to maker isn't a ref!" unless ref $prec_block; my $Rules_re = undef; my @Substitutions; # a list of hasherefs (replacement lexicons, from => to) # Now read from the prec block. my %Rule_num_frames; # RE frames for rule types we know about my %Rule_num_precedence; # SMALLER numbers equals HIGHER precedence foreach my $r (@$prec_block) { $Rule_num_frames{$r->[0]} = $r->[1]; $Rule_num_precedence{$r->[0]} = $r->[2]; } my @files = grep { !ref($_) && length($_) && -f $_ } @_; # TODO: make this smart. die "I need rules!" unless @files; print "* ", scalar(@files), " rule files to read\n" if $Debug; my %Rule_num_seen; # counter of number of subrules Each_file: my $sub_rule_count = 0; foreach my $f (@files) { die "Can't open $f" unless open(IN, "<$f"); print "* Reading from $f\n" if $Debug; while() { chomp; next unless /^-?(\d+)\|([^|]+)\|(\S+)/; next if $1 == 1 and length($2) == 1 and $2 eq $3; # really no point in saying "replace A with A anywhere" print "Subrule of type \xAB$1\xBB : \xAB$2\xBB => \xAB$3\xBB\n" if $Debug > 2; ++$sub_rule_count; my($rule_number, $from, $to) = ($1,$2,$3); $rule_number = 1 if $rule_number == 5; # HACK!! $rule_number = 3 if $rule_number == 8; # HACK!! $rule_number = 2 if $rule_number == 9; # HACK!! unless(exists $Rule_num_seen{$rule_number}) { $Rule_num_seen{$rule_number} ||= 1; $Substitutions[$rule_number] ||= {}; } print "\"$_\" duplicates existing rule; possible ordering paradox\n" if $Debug && exists $Substitutions[$rule_number]{$from}; $Substitutions[$rule_number]{$from} = $to; } close(IN); } print "* $sub_rule_count rule statements read.\n\n" if $Debug; # end of Each_rule #-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ # Cook up the RE. # prep for the Schwartzian transform now. my @bits; print "Rule frames:\n" if $Debug; Each_rule_number: foreach my $rule_number (sort { $a <=> $b } keys %Rule_num_seen ) { my $prec = $Rule_num_precedence{$rule_number}; print " \# $rule_number : \"", $Rule_num_frames{$rule_number} || '?', "\" ", scalar(keys %{$Substitutions[$rule_number]} ), " subrules (prec ", $prec || '?', ")\n" if $Debug; unless(exists($Rule_num_frames{$rule_number})) { print " * No frame for rule number $rule_number; skipping\n" if $Debug; next; } unless(defined($prec)) { print " * No known precedence for rule number $rule_number; skipping\n" if $Debug; next; } push @bits, map # 0 1 2 3 # literal, frame, rule #, precedence [ $_, $Rule_num_frames{$rule_number}, $rule_number, $prec ], keys %{$Substitutions[$rule_number]}; } print scalar(@bits), " bits\n" if $Debug; die "No rule \@bits?" unless @bits; # TODO: # An optimization should be made here -- for a given input string # $X across all rule contexts K, is it true that # all defined values of $Substitutions[K]{$X} are the same? # If so, then we don't need all the context-sensitivity we've # fought so hard for, with all this (?{$r=%s}) business. # In that degenerate case, just cook up a %Substitutions that # maps from input values to output values (regardless of context, # since we've established that this doesn't matter), cook up an RE # for that (same as below, minus the (?{$r=%s})), and then # return a sub as below, but without the thing that messes with $r, # and with basically just: # $in =~ s/($Rules_re)/$Substitutions{$1}/oeg; # # But in any case, that's just a special optimization. Contextless # rule systems will still be treated just as well by the code below, # if not quite as quickly. $Rules_re = join '|', map { printf "literal %s ; frame '%s' ; rule# %s ; prec %s ; => %s\n", $_->[0], $_->[1], $_->[2], $_->[3], $Substitutions[$_->[2]]{$_->[0]} if $Debug > 1; sprintf $_->[1] . '(?{$r=%s})', # e.g.: \Bea(?{$r=3}) quotemeta($_->[0]), $_->[2] } sort { # Sort first by length($b->[0]) <=> length($a->[0]) # ...Length of the literal or $a->[3] <=> $b->[3] # Then by precedence or $a->[2] <=> $b->[2] # (Then by rule number) or $a cmp $b # Then by alpha order of the literals } @bits ; die "Rules_re is 0-length!" unless length $Rules_re; print "\nMaster RE (", length($Rules_re), " bytes):\n\xAB$Rules_re\xBB\n\n" if $Debug > 1; # return this closure: sub { use re 'eval'; # allow ({?CODE}) blocks in RE code from variables. my $in = $_[0]; # !! TO DO -- do caps handling better use vars '$r'; local $r; # variables in RE (?{ CODE }) blocks need to be package vars my $case_prefix; my $lc = lc($in); if($in eq $lc) { $case_prefix = ''; # all lc } elsif($in eq ucfirst($lc)) { $case_prefix = ','; # initial cap } elsif($in eq uc($lc)) { $case_prefix = ',,'; # allcaps (and > 1 char) } else { $case_prefix = ','; # mixed case -- TODO: deal with correctly } # printf "<%s><%s><%s><%s>\n", $in, lc($in), ucfirst($in), uc($in); # print "Case pref for \xAB$in\xBB : \xAB$case_prefix\xBB\n"; $in = uc($in); $in =~ s/($Rules_re)/$Substitutions[$r]{$1}/oeg; return $case_prefix . $in; }; } ########################################################################### 1; __END__