#!/usr/bin/perl
# Time-stamp: "2008-05-31 22:20:39 ADT"
#
# See http://www.interglacial.com/rss/ for info on RSS feeds
#

require 5;
use strict;
use constant DEBUG => $ENV{'MAILTO'} ? 0 : 2;
use XML::RSS::SimpleGen 12;

# Root of the Microgram site; the bulletin links scraped from the index
# page are appended to this to form absolute URLs.
my $base = 'http://www.usdoj.gov/dea/programs/forensicsci/microgram/';
my $url  = $base . 'bulletins_index.html';

# Feed-level metadata: link, title, description, then the update schedule
# and the feed's own addresses.
rss_new(
  $url,
  "Microgram Bulletins",
  "Published by the Drug Enforcement Administration",
);
rss_daily();
rss_self_url(      "http://interglacial.com/rss/microgram.rss" );
rss_generator_url( "http://interglacial.com/rss/microgram.pl" );
rss_livejournal( 'microgram' );

# Fetch the index page.  get_url is called in void context; everything
# that follows reads the fetched page from $_.
get_url( $url );
print $_ if DEBUG > 5;

# Scan the index page (held in $_) for links to the "HTML version" of each
# bulletin.  Every hit becomes a [ url, title ] pair; scanning stops once
# the first two links have been collected.
my @items;
while( m{
  <a\ href="(mg\d+/mg(\d+)\.html)">\s*HTML\s+version\s*</a>
}xg ) {
  my( $path, $number ) = ( $1, $2 );
  print "$base$path => #$number\n" if DEBUG;

  my $title = "Microgram #$number";
  $title =~ s{#0+}{#};   # drop leading zeroes from the issue number
  push @items, [ "$base$path", $title ];

  last if @items >= 2;
}

# Nothing matched: dump the page we were given and give up.
unless( @items ) {
  die "$0: No items in here?!\n{{\n$_\n}}\nAborting";
}

        
# For every bulletin collected from the index page: fetch the issue, pull
# out its headlines (short, bold, all-capitals paragraphs), retitle the
# item from the issue's "VOL. ..., NO. ..." line when one is present, and
# add the item to the feed with the headlines as its description.
foreach my $item (@items) {
  get_url( $item->[0] );   # void context: the page is read from $_ below
  my @headlines;
  DEBUG and print "\n$$item[0] = $$item[1]\n";

  # Put a </p> in front of every <p> / <table> start-tag, so that each
  # paragraph is terminated, at the latest, where the next one begins.
  s{(<(?:(?:p)|(?:table))\b)}{</p>\n$1}g;

  # [^\e] is "anything but ESC", i.e. effectively any character, newlines
  # included; paragraphs longer than 400 characters are not considered.
  while( m{
    <p(?:\ align="center")?>
      ([^\e]{1,400}?)
    </p>
   }xsg
  ) {
    my $text = $1;
    my $tags;

    # Peel off the run of tags that opens the paragraph.  The WHOLE run is
    # captured: a quantified group such as (<...>)+ retains only its last
    # iteration, which used to hide a <strong>/<b> whenever another tag
    # (e.g. <font>) was nested inside it.
    if( $text =~ s{^\s*((?:<[^<>]+>\s*)+)}{}s ) {
      $tags = $1;
    } else {
      DEBUG > 9 and print "Zonking $text\n";
      next;   # paragraph does not start with markup at all
    }
    next unless $tags =~ m/<(?:strong|b)\b/i;   # headlines are bold

    $text =~ s{\s*(<[^<>]+>\s*)+$}{}g; # closey-taggies
    $text =~ s/^\s*-?\s*INTELLIGENCE\s*(BRIEF|ALERT)\s*-+\s*//s;

    # Only these three entities are handled.
    $text =~ s{&quot;}{"}g;
    $text =~ s{&nbsp;}{ }g;
    $text =~ s{&reg;}{ }g;

    # Line breaks become spaces (any case, attributes and embedded
    # newlines allowed), images become a \xA4 placeholder, and every
    # other tag is dropped.
    $text =~ s{<br\b[^<>]*>}{ }ig;
    $text =~ s{\s+}{ }g;
    $text =~ s{<img\b[^<>]+>}{\xA4}ig;
    $text =~ s{<[^<>]+>}{}g;

#        <p><font face="Times New Roman, Times, serif"><strong>70,000 PSILOCYBIN
#             MUSHROOM/CHOCOLATE <br>
#         CANDIES SEIZED NEAR AMARILLO, TEXAS</strong></font></p>

    # A headline is 5 to 100 characters long, contains at least one
    # capital letter and no lowercase letters.
    next if length $text > 100 or length $text < 5
         or $text !~ m/[A-Z]/ or $text =~ m/[a-z]/;

    DEBUG and print "  [$text]\n";

    $text = lc ($text);
    $text =~ s/^\s+//s;
    $text =~ s/\s+$//s;

    push @headlines, $text;
  }
  # Not fatal: the item is still emitted, just with an empty description.
  warn "$0: No headlines in $$item[1]?!\n{{\n$_\n}}\n" unless @headlines;

  # Prefer a title built from the issue's own volume/number line, with the
  # optional "<word> 20xx" that follows it put in front.  Whitespace in
  # both pieces is normalised: the second capture begins with the
  # whitespace that separated it from the number, and either piece may be
  # wrapped across lines in the page source.
  if( m{\b(VOL\.\s+[MDCLXVI]+,\s+NO\.\s+\d+)\b(\s+\w+\s+20\d\d)?\b} ) {
    my( $volume, $date ) = ( $1, defined $2 ? $2 : '' );
    $volume =~ s/\s+/ /g;
    $date   =~ s/\s+/ /g;
    $date   =~ s/^ //;
    $item->[1] = "Microgram " . ($date ? "$date: " : '') . lc($volume);
  }

  #print $_;

  # Headlines are joined with " \xA0\xA7\xA0 " (in Latin-1: a section sign
  # between two no-break spaces).
  rss_item( $item->[0], $item->[1], join(" \xA0\xA7\xA0 ", @headlines) );
}

# Write the feed out.  NOTE(review): see the XML::RSS::SimpleGen
# documentation for the exact meaning of the second argument.
rss_save('microgram.rss', 150);

__END__

DISCLAIMERS -
1. All material published in either Microgram Bulletin or
Microgram Journal is reviewed prior to publication. However, the
reliability and accuracy of all published information are the
responsibility of the respective contributors, and publication in
Microgram Bulletin implies no endorsement by the United States
Department of Justice or the Drug Enforcement Administration.

2. Due to the ease of scanning, copying, electronic manipulation, and/or
reprinting, only the posted copies of Microgram Bulletin (on
www.dea.gov) are absolutely valid. All other copies, whether electronic
or hard, are necessarily suspect unless verified against the posted
versions. WARNING - Due to the often lengthy time delays between the
actual dates of seizures and their subsequent reporting in Microgram
Bulletin, and also because of the often wide variety of seizure types
with superficially similar physical attributes, published material
cannot be utilized to visually identify controlled substances currently
circulating in clandestine markets. The United States Department of
Justice and the Drug Enforcement Administration assume no liability for
the use or misuse of the information published in Microgram Bulletin.

