#!/usr/local/bin/perl
# Time-stamp: "2005-08-19 01:36:26 ADT"  sburke@cpan.org
#   desc{   scans HTML for images lacking alt attributes   }

# This program looks for files that look like HTML, and
# does a cheap (non)parse on them to look for IMG tags
# that're missing any of ALT, HEIGHT, WIDTH, or SRC.
# This is NOT a real parse, it's just a cheap hack.

no locale;
use strict;
use warnings;
use File::Find;

# Upper bound, in bytes, on the size of any file that will be read and scanned.
my $max_length = 40000; # don't read files larger than this.

# With no files/dirs given there is nothing to recurse into.  Say so and
# stop; a bare "exit" here would make this usage message unreachable.
die "Tell me what files/dirs to recurse.\n" unless @ARGV;

# Paths (as reported by File::Find) of every file whose name looks like HTML.
my @file_list;

print "Looking...\n";
# File::Find chdir()s into each directory before calling the wanted sub,
# setting $_ to the bare filename and $File::Find::name to the path relative
# to the starting directory.  File tests must therefore use $_; testing
# $File::Find::name from inside a subdirectory would look in the wrong place.
find(sub {
  # The alternatives are grouped so the end-of-name anchor applies to all of
  # them (htm/html, tmp/tmpl, inc/incl), not just the last one.
  push(@file_list, $File::Find::name)
    if /(?:html?|tmpl?|incl?)\z/i
       && -f $_;
}, @ARGV);

# Nothing matched: list the roots that were searched, then stop.
if (!@file_list) {
    print "No HTML files to scan under:\n";
    print "  $_\n" for @ARGV;
    print "\n\n";
    exit;
}

# Announce how many files are queued, with the noun pluralised to match.
my $file_count = @file_list;
my $file_noun  = ($file_count == 1) ? 'file' : 'files';
print "About to scan $file_count $file_noun.\n";

# Attributes every IMG tag is expected to carry.
my @required_attrs = qw(ALT HEIGHT WIDTH SRC);

# Scan each candidate file for <IMG ...> tags and report every tag that is
# missing at least one of the required attributes.  This is a regex scan of
# the raw bytes, not an HTML parse.
foreach my $file (@file_list){
    # -s yields undef if the file cannot be stat'ed; treat that as size 0
    # and let open() below report the actual problem.
    if( ((-s $file) || 0) > $max_length ) {
        print "$file is over the $max_length-byte limit. Skipping\n";
        next;
    }

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and the handle cannot leak between iterations.
    my $fh;
    unless( open($fh, '<', $file) ) {
        print "Can't open $file : $!\n";
        next;
    }
    print "Scanning: $file\n";

    # Fresh buffer per file so a failed read can never rescan old content.
    my $in = '';
    unless( defined read($fh, $in, $max_length) ) {
        print "Can't read $file : $!\n";
        close($fh);
        next;
    }
    close($fh);
  # print "Length: ", length($in), "\n";

    # [^>]* (not +) so that a bare <IMG>, which lacks everything, is reported.
    while($in =~ m/<IMG\b([^>]*)>/ig) {
        my $contents = $1;

        # Flatten CR, LF and TAB to spaces *before* deciding whether to
        # report, so a multi-line tag prints on a single line.
        $contents =~ tr<\cm\cj\t><   >;

        # An attribute counts as present only as "name=" not preceded by a
        # word character or hyphen; a bare substring test would let
        # src="default.gif" pass for ALT, or data-width pass for WIDTH.
        my @missing = grep {
            $contents !~ /(?<![\w-])$_\s*=/i
        } @required_attrs;

        # pos($in) is the offset just past the closing ">" of the tag.
        print " at byte ", pos($in), ": <IMG$contents>\n"
            if @missing;
    }
}
print "Done at ", scalar(localtime), ".\n\n";
exit;

__END__
