#!/usr/bin/perl
#
#: utf8tolerant- gives you always-valid utf8, but forgives input that has non-utf8 binary data
#
# The interface to utf8tolerant is just like cat(1): if you specify
# files, it takes those as input; otherwise it takes from STDIN.  In
# either case, output is to STDOUT.
#
#######################################################################
# sburke@cpan.org ##### Time-stamp: "2008-10-20 00:17:05 AKDT sburke@cpan.org"

use utf8;
use constant DEBUG => 0;
use strict; use warnings;
# Turn on autoflush for the currently selected output handle (STDOUT,
# since nothing in this file calls select), so every print is written
# out immediately instead of sitting in a buffer.
$| = 1;
# All of the work happens in Main(); afterwards we exit explicitly
# rather than just falling off the end of the file.
Main();
exit;

sub Main {
  # With no arguments we behave like cat(1) and filter STDIN, which
  # do_file() knows by the name '-'.
  push @ARGV, '-' if !@ARGV;

  # Everything printed from here on is character data, so STDOUT gets
  # the utf8 output layer.
  binmode STDOUT, ':utf8';

  # The sources are opened one at a time by do_file() rather than via a
  # while(<>) loop: <> opens each file itself, and we would then have to
  # notice every switch to a new file just to set binmode on it.
  for my $source (@ARGV) {
    do_file($source);
  }
}


# do_file($file)
#
# Copies one input source to STDOUT, a line at a time.
#
#   $file - a filename to read, or '-' to read STDIN.  An empty string
#           is silently skipped.
#
# The input is read as raw bytes.  Each line that is well-formed strict
# UTF-8 is decoded and printed as the characters it encodes.  Any other
# line is printed as the bytes it is, which print treats as Latin-1
# characters.  Dies if the source cannot be opened.  Returns nothing.
sub do_file {
  my($file) = @_;
  return if $file eq '';

  my $is_stdin;
  my $IN;
  if($file eq '-') {
    $is_stdin = 1;
    open($IN, '<-'      ) or die "Can't open STDIN?! : $!";
    DEBUG and print "FROM STDIN\n";
  } else {
    open($IN, '<', $file) or die "Can't open $file : $!";
    DEBUG and print "FROM $file\n";
  }

  binmode($IN); # raw bytes in; we do the decoding ourselves below

  # Encode is loaded here, at first use, so that this sub carries its
  # own dependency.
  require Encode;
  # FB_CROAK makes decode() die on malformed input instead of quietly
  # substituting characters; LEAVE_SRC keeps it from modifying the line
  # we hand it, so the raw bytes are still there for the fallback.
  my $strict = Encode::FB_CROAK() | Encode::LEAVE_SRC();

  #
  # A harsher/harder way than the below would be to snare every
  # 8-bit seq and try chopping stuff off of it as a utf8 byte,
  # and interpreting the remaining (possibly intermediate) bytes
  # as Latin-1 bytes to upgrade.
  # But let's assume that a line is the unit of utf8-nonconformity:

  # A lexical holds each line, so the caller's $_ is left alone.
  while( defined( my $line = <$IN> ) ) {
    # Decode with the strict "UTF-8" encoding rather than utf8::decode:
    # the latter also accepts Perl's extended utf8 (encoded surrogates,
    # code points above U+10FFFF), and passing those through would make
    # our output something other than valid UTF-8.
    # $@ is localized so that a failed attempt doesn't leak out of here.
    my $chars = do {
      local $@;
      eval { Encode::decode('UTF-8', $line, $strict) };
    };

    if( defined $chars ) {
      DEBUG and print "*** All-good utf8:\n";
      print $chars;
    } else {
      DEBUG and print "*** Contains non-utf8 sequences:\n";
      # The line is still plain bytes, so print sends each one out as
      # the Latin-1 character with that code.
      # Yes, we leave in \x80-\x9F.  It is indeed valid Unicode!
      print $line;
    }
  }
  #(
  # An aside: a scarier possibility would be to use Encode::Guess...
  # and allow command-line options for specifying encodings that that
  # module should try reading non-utf8 source data as, instead of just
  # Latin-1?
  #)

  # Only handles for named files are closed here; the STDIN one is
  # left open.
  unless($is_stdin) {
    close $IN;
    undef $IN;
  }
  return;
}

__END__

LICENSE:
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

See http://www.perl.com/perl/misc/Artistic.html

AUTHOR:
Sean M. Burke, E<lt>sburke@cpan.orgE<gt>

