#!/home/sburke/bin/perl # Time-stamp: "2005-01-06 14:34:37 AST" # Folks, this is not my best code ever. But it works. -- sburke@cpan.org require 5; use constant DEBUG => $ENV{'MAILTO'} ? 0 : 1; use warnings; use strict; use URI; use HTML::Entities; use Time::Local; use LWP::Simple (); # BookTV Schedule scraper/converter. # sburke@cpan.org my $master_url = 'http://www.booktv.org/schedule/'; my @Times; # {'url' => thaturl}, ... my %Url2id; # url => { ... } my %ID2Time; my $id_counter = '0'; my $xml = 'book_tv_schedule.xml'; my $xml_out_temp = 'book_tv_schedule.xml.tmp'; my $temp_sched = 'schedule.html'; my $Year_Earlier = 1900 + (localtime($^T - 900_000))[5]; my $Year_Later = 1900 + (localtime($^T + 900_000))[5]; init_times_tables(); learn_schedule( parse_schedule() ); write_schedule(); DEBUG and print "\nDone at ", scalar(localtime), "\n\n"; exit; #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ my(%dow_before, @months, %month_before, @hour_before, @dows, @tzs); my($dow_re, $month_re, %month2num); sub init_times_tables { @dows = qw(Sunday Monday Tuesday Wednesday Thursday Friday Saturday); @dow_before{@dows} = ($dows[6], @dows[0..5]); # 0 1 .. 23 @hour_before = [23, 0 .. 22]; @months = qw{January February March April May June July August September October November December}; @month_before{@months} = ($months[11], @months[0..10]); @month2num{@months} = (0..11); $dow_re = re_join(@dows); $month_re = re_join(@months); @tzs = qw( Eastern Central Mountain Pacific Alaska Hawaii); return; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub learn_schedule { @Times = @_; # so be it learnt! # Set up the IDs... foreach my $t (@Times) { $t->{'id'} = ++$id_counter } # Set up the Url2id list... foreach my $t (@Times) { push @{ $Url2id{ $t->{'url'} }{'xids'} } => $t->{'id'}; $Url2id{ $t->{'url'} }{'data'}{'desc'} ||= $t->{'desc'}; $Url2id{ $t->{'url'} }{'data'}{'short_title'} ||= $t->{'short_title'}; } foreach my $t (@Times) { my $id = $t->{'id'}; $ID2Time{$id} = $t; my @others = grep $_ ne $id, @{ $Url2id{ $t->{'url'} }{'xids'} }; $t->{'other_showings'} = join ' ', @others; # TODO: } return; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub icsprint { my @lines = @_; foreach my $line (@lines) { $line =~ s{([,;\\])}{\\$1}g; $line =~ s/^\s+//s; $line =~ s/\s$//s; $line =~ s{\n}{\\n}g; #$line =~ s/(.{70})(?=.)/$1\cm\cj\t/mg; #print ICS $line, "\cm\cj"; #$line =~ s/(.{70})(?=.)/$1\n\t/mg; print ICS $line, "\n"; } return; } #--------------------------------------------------------------------------- sub write_schedule { foreach my $tz (@tzs) { my $html_out = "bt$tz\_long.html"; my $html_out_temp = "bt$tz\_long.html.tmp"; my $rss_out = "bt$tz.rss"; my $rss_out_temp = "bt$tz.rss.tmp"; my $ics_out = "bt$tz.ics"; my $ics_out_temp = "bt$tz.ics.tmp"; open(HTML, ">$html_out_temp") or die "Can't write-open $html_out_temp -- $!"; open(RSS, ">$rss_out_temp") or die "Can't write-open $rss_out_temp -- $!"; open(ICS, ">$ics_out_temp") or die "Can't write-open $ics_out_temp -- $!"; print HTML qq[ BookTV Schedule - $tz

BookTV Schedule - $tz

Alternate versions: RSS, iCalendar, XML
\n
], ; print RSS qq{ BookTV : $tz timezone http://www.interglacial.com/rss/booktv/bt${tz}_long.html BookTV's listings for this weekend, $tz timezone en-us http://interglacial.com/rss/booktv/booktv_tzs.pl http://interglacial.com/rss/booktv/bt$tz.rss http://interglacial.com/rss/booktv/bt$tz\_long.html HTML version of this data HTML version of this data http://interglacial.com/rss/booktv/bt$tz.ics iCalendar version of this data iCalendar version of this data } ; icsprint "BEGIN:VCALENDAR", "VERSION:2.0", "X-WR-CALNAME:Book TV Schedule ($tz timezone)", "PRODID:-//Sean M. Burke//NONSGML BookTVGen//EN", "METHOD:PUBLISH", ; my $last_date = ''; my $heading_count = 0; foreach my $t (@Times) { my @other_showings = ($t->{'other_showings'} =~ m/(\S+)/g); unless( $last_date eq $t->{"$tz\_start_day"} ) { $heading_count++; printf HTML "
%s\n", $heading_count, ($heading_count == 1) ? " class='firsty'" : '', $t->{"$tz\_start_day"}; } $last_date = $t->{"$tz\_start_day"}; my $titledesc = join( ' - ', grep $_, $t->{'short_title'}, $t->{'desc'} ) || "??"; if( $t->{$tz . "_start_time_iso"} ) { icsprint "BEGIN:VEVENT", "CATEGORIES:TVShow", "LOCATION:CSPAN2", "DTSTART:" . $t->{$tz . "_start_time_iso"}, $t->{$tz . "_end_time_iso"} ? ( "DTEND:" . $t->{$tz . "_end_time_iso"} ) : (), "SUMMARY:" . $titledesc, "DESCRIPTION:" . join( "\n", grep $_, $titledesc, $t->{'url'}, @other_showings ? ( join "" => "The " , scalar(@other_showings) , " other showing", (@other_showings == 1 ? '' : 's'), ":\n", map {; " * ", $_->{$tz.'_start_time_civ'} || $_->{$tz.'_start_time_civ_short'}, " on ", $_->{$tz.'_start_day'} || $_->{$tz.'_start_day_short'}, ".\n" } map $ID2Time{$_} || "No segment for ID '$_'!?!", @other_showings, ) : (), #end of Desc ), "URL:". $t->{'url'}, "UID:". $t->{'uid'} . $tz, "END:VEVENT" ; } else { DEBUG and print "No $tz\_start_time_iso in $$t{'uid'}\n"; } printf RSS qq{\n %s\n %s - %s\n %s, %s-%s - %s - %s\n\n\n}, map encode_entities( $t->{$_} || '' ), 'url', ('short_title', 'desc'), "$tz\_start_day_short", "$tz\_start_time_civ_short", "$tz\_end_time_civ_short", ('short_title', 'desc'), ; #icsprint printf HTML qq{
%s-%s - %s %s }, map encode_entities( $t->{$_} || '' ), 'id', "$tz\_start_time_civ_short", "$tz\_end_time_civ_short", qw(url short_title desc), ; if( @other_showings ) { print HTML ". [Other showings: ", join( ' ', map( "*", @other_showings ) ), "]\n" ; } print HTML "\n\n"; } print HTML "\n

[End]

"; print RSS "\n"; icsprint "END:VCALENDAR"; close(HTML); close(RSS); close(ICS); sleep 1; updatey( $html_out_temp => $html_out ); updatey( $rss_out_temp => $rss_out ); updatey( $ics_out_temp => $ics_out ); } open(XMLOUT, ">$xml_out_temp") or die "Can't write-open $xml_out_temp -- $!"; print XMLOUT qq{ } ; print XMLOUT " "; print XMLOUT " \n"; foreach my $t (@Times) { print XMLOUT " \n"; } print XMLOUT " \n"; print XMLOUT " \n"; foreach my $url (sort keys %Url2id) { print XMLOUT " ", ( map "\n" ; } print XMLOUT " \n"; print XMLOUT ""; close(XMLOUT); sleep 0; updatey($xml_out_temp => $xml); return; } #--------------------------------------------------------------------------- sub updatey { my($temp, $file) = @_; use File::Compare; use File::Copy; if(-e $file and -s $file) { if(compare($temp, $file)) { move($temp, $file); DEBUG and print "Moving $temp\n => $file\n"; } else { # they're the same unlink $temp; DEBUG and print "$file doesn't need updating.\n"; } } else { move($temp, $file); } chmod 0644, $file; return; } sub hash_as_attributes { my $x = $_[0]; keys %$x; # reset iterator return map " $_=\"" . xmlesc($x->{$_}). "\"\n", sort grep !ref $x->{$_}, keys %$x; } sub xmlesc { my $it = $_[0]; $it =~ s<([^\x20\x21\x23\x28-\x3b\x3d\x3F-\x5B\x5D-\x7E])> <'&#'.(ord($1)).';'>seg; return $it; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub re_join { join "|", map quotemeta $_, grep length $_, sort {length $a <=> length $b or $a cmp $b} @_ } sub hours_back { my($day, $time, $hours_back) = @_; die "Non-natural hours-back: $hours_back" unless $hours_back == int($hours_back) and $hours_back >= 0; if( $hours_back == 0 ) { return($day, $time) } elsif( $hours_back > 1 ) { return hours_back( hours_back($day,$time,1), $hours_back - 1 ); } else { # just take off one hour die "illegal time <$time>" unless $time =~ m/^(\d\d?):(\d\d)$/s; my($h,$m) = ($1,$2); if($h > 0) { $time = sprintf '%02d:%s', $h - 1, $m; DEBUG > 2 and print "An hour before $_[0] $_[1] is $day $time, obviously\n"; return($day, $time); } else { $time = "23:$m"; $day = long_day_before($day); DEBUG > 2 and print "An hour before $_[0] $_[1] is $day $time, I figure\n"; return($day, $time); } } } { my %cache; sub long_day_before { $cache{ $_[0] } ||= _long_day_before($_[0]); } } sub _long_day_before { # 'Saturday, July 27' => 'Friday, July 26' my $in = $_[0]; if($in =~ m/^\s*($dow_re),\s*($month_re)\s*(\d+)\s*$/os) { if($3 > 1) { # common case my $out = sprintf '%s, %s %d', ($dow_before{ $1 } || return ''), $2, $3 - 1, ; DEBUG > 2 and print "Day before $in is $out, obviously\n"; return $out; } # Otherwise we actually have to think, because it's the 1st my $new_dow = ($dow_before{ $1 } || return ''); my $month_num = $month2num{$2}; my $noon = timelocal(0,0,12 => 1, $month_num, (localtime)[5]); DEBUG > 1 and print "Noon on $in is ", scalar(localtime($noon)), "\n"; $noon -= 24 * 60 * 60; DEBUG > 2 and print "The day before is ", scalar(localtime($noon)), "\n"; my @noon = localtime($noon); my $out = sprintf '%s, %s %d', $new_dow, $months[$noon[4]], $noon[3] ; DEBUG > 2 and print "Day before $in is $out, I figure\n"; return $out; } else { return ''; } } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub end_time_iso { my($y,$m,$d, $iso_time, $runtime_mins) = @_; # A very motley function. $iso_time =~ m/^(\d\d)(\d\d)(\d\d)$/s or die "Can't consider $iso_time as an ISO time!"; my($hour, $min, $sec) = ($1,$2,$3); my $start_time = timegm( $sec, $min, $hour, $d, $m-1, $y-1900 ); my $end_time = $start_time + 60 * $runtime_mins; my(@e) = gmtime($end_time); $e[5] += 1900; $e[4]++; return sprintf "%04d%02d%02dT%02d%02d%02d", @e[5,4,3,2,1,0]; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub add_end_times { my($times_r) = $_[0]; # figure out run-times my $end_time = '8:00 a.m.'; my $end_time_mil = '08:00'; my $prev = { 'Eastern_start_time_mil' => '08:00', 'Eastern_start_time_civ' => '8:00 a.m.', 'Central_start_time_mil' => '07:00', 'Central_start_time_civ' => '7:00 a.m.', 'Mountain_start_time_mil' => '06:00', 'Mountain_start_time_civ' => '6:00 a.m.', 'Pacific_start_time_mil' => '05:00', 'Pacific_start_time_civ' => '5:00 a.m.', 'Alaska_start_time_mil' => '04:00', 'Alaska_start_time_civ' => '4:00 a.m.', 'Hawaii_start_time_mil' => '03:00', 'Hawaii_start_time_civ' => '3:00 a.m.', }; foreach my $t (reverse @$times_r) { $t->{'Eastern_start_time_mil'} = miltime( $t->{'Eastern_start_time_civ'} = delete $t->{'short_time'} ); $t->{'Eastern_start_day'} = delete $t->{'long_day'}; $t->{'runtime_hm'} = min2hm( $t->{'runtime'} = time_diff( $t->{'Eastern_start_time_mil'}, $prev->{'Eastern_start_time_mil'} ) ); my @tz_todo = @tzs; foreach my $tz (@tz_todo) { DEBUG > 3 and print "$tz\_end_time_civ < prev's $tz\_start_time_civ (", ($prev->{$tz . '_start_time_civ'}), ")\n", "$tz\_end_time_mil < prev's $tz\_start_time_mil (", ($prev->{$tz . '_start_time_mil'}), ")\n" ; $t->{$tz . '_end_time_civ'} = $prev->{$tz . '_start_time_civ'}; $t->{$tz . '_end_time_mil'} = $prev->{$tz . '_start_time_mil'}; } while(@tz_todo > 1) { my($from, $to) = @tz_todo; DEBUG > 2 and print "Converting $from -> $to\n"; ( $t->{$to . '_start_day'}, $t->{$to . '_start_time_mil'} ) = hours_back($t->{$from . '_start_day'}, $t->{$from . '_start_time_mil'}, 1) ; $t->{$to . '_start_time_civ'} = civtime( $t->{$to . '_start_time_mil'} ); shift @tz_todo; } # For next iteration: $prev = $t; } return; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub add_start_day_and_isos { my $times_r = $_[0]; foreach my $tz (@tzs) { foreach my $t (@$times_r) { my $today = $t->{$tz . "_start_day"}; if( $today =~ m/^\s*($dow_re),\s*($month_re)\s*(\d+)\s*$/os) { my($m, $d) = (1 + $month2num{$2}, $3); my $y = $m == 1 ? $Year_Later : $Year_Earlier; my($iso_time); if( $t->{$tz . "_start_time_mil"} =~ m/^(\d\d):(\d\d)$/s ) { $iso_time = $1 . $2 . "00"; } else { print "Can't ISO-timify ", $t->{$tz . "_start_time_mil"}, "\n"; next; } $t->{$tz . "_start_time_iso"} = sprintf "%04d%02d%02dT%s", $y, $m, $d, $iso_time ; $t->{$tz . "_end_time_iso"} = end_time_iso( $y,$m,$d, $iso_time, $t->{'runtime'} ) if $t->{'runtime'}; #And incidentally... $t->{$tz . "_start_day_short"} = sprintf "%d/%02d", $m, $d; } else { print "Can't shorten [$today]\n"; } foreach my $f ( "$tz\_start_time_civ", "$tz\_end_time_civ" ) { ($t->{$f . "_short"} = $t->{$f}) =~ s/:00 / /; } $t->{"$tz\_end_time_civ_short"} =~ s/\s*[ap]\.m\.//s; $t->{"$tz\_start_time_civ_short"} =~ s/\s*([ap])\.m\./$1m/s; } } return; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub parse_schedule { my @times = scrape_times( load_schedule() ); add_end_times( \@times ); add_start_day_and_isos( \@times ); #if(DEBUG > 2) { # use SMB::Stuff; print pretty \@times; #} return @times; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sub load_schedule { my $data; if( DEBUG > 4 and -e $temp_sched ) { open(IN, "<$temp_sched") || die $!; local $/; $data = ; close(IN); } else { DEBUG > 1 and print "Getting $master_url\n"; $data = LWP::Simple::get($master_url); die "Couldn't get $master_url" unless $data; DEBUG and print "Got $master_url -- ", scalar(length($data)), " bytes\n"; if(DEBUG > 4) { open(OUT, ">$temp_sched") or die $!; binmode(OUT); print OUT $data; close(OUT); } } DEBUG and print "Loaded ", length($data), " characters from $master_url\n"; return $data; } sub scrape_times { my $data = $_[0]; $data =~ s/(?:\cm*\cj|\cm)/\n/g; # nativize newlines my @times; my $this_day = '???'; my $last_time_m = 'am'; while($data =~ m{(?:
(.*?)
)|(?: (.*?) .*?(.*?)(?:, )?(.*?) )}g ) { if($1) { $this_day = $1; DEBUG and print " This day: $this_day\n"; } else { my %new = ( 'long_day' => $this_day, 'short_time' => $2, # like '8:00 am' 'url' => $3, # like "http://www.booktv.org/Children/index.asp?segID=1708\x26schedID=140" 'short_title' => $4, # like 'Jane Goodall' 'desc' => $5, # like 'The Chimpanzees I Love: Saving Their World and Ours' ); push @times, \%new; foreach my $v (values %new) { $v =~ s/\s+/ /g; # moiderize whitespace decode_entities($v); } $new{'uid'} = join( ".", $new{'url'} =~ m/(\d+)/g, $new{'long_day'} =~ m/(\w+)/g, $new{'short_time'} =~ m/(\w+)/g, substr( join('_', $new{'short_title'} =~ m/(\w+)/g), 0, 20), ) . "\@InterglacialBookTV" ; $new{'url'} = URI->new_abs($new{'url'}, $master_url)->canonical->as_string; for( $new{'short_time'} ) { if( m/(am|pm)/ ) { $last_time_m = $1; } else { $_ .= " $last_time_m"; } } } } DEBUG and print scalar(@times), " items of scraped data\n"; return @times; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Time-handling routines sub civtime { my $in = $_[0]; if( $in =~ m<^\s*(\d\d?):(\d\d)$>si) { my ($hour, $min) = ($1,$2); if($hour eq '12') { return "12:$min p.m."; } elsif($hour eq '00') { return "12:$min a.m."; } elsif($hour > 12) { return sprintf '%d:%02d p.m.', $hour - 12, $min; } else { $in =~ s/^0//s; return $in . ' a.m.'; } } else { die "Invalid miltime format $in\n"; } } sub miltime { my $in = $_[0]; my($hour, $min); if( $in =~ m<^\s*(\d\d?):(\d\d),?\s*a\.?m\.?\s*$>si) { ($hour, $min) = ($1,$2); if($hour eq '12') { # 12am = 0h $hour = '00'; } else { # 1am = 1h } } elsif($in =~ m<^\s*(\d\d?):(\d\d),?\s*p\.?m\.?\s*$>si) { ($hour, $min) = ($1,$2); if($hour eq '12') { # 12pm = 12h (noon) } else { # 1pm = 13h $hour += 12; } } else { die "Unparseable time format in \"$in\""; } DEBUG > 3 and printf " hour xlation % 10s => %02s:%02s\n", $in, $hour, $min; return sprintf '%02s:%02s', $hour, $min; } sub min2hm { # turn 85 => '1h25m' my $m = $_[0]; my $h; return $m . 'm' if $m < 60; $h = int($m/60); $m -= $h * 60; return $h . 'h' unless $m; return sprintf '%sh%02sm', $h, $m; } use constant M_IN_DAY => 24 * 60; sub time_diff { # ('08:50', '09:30') => 40, # ('23:50', '00:30') => 40, # crossing midnight is the one trouble spot my($from, $to) = @_; DEBUG > 2 and print " $from=>$to = "; foreach my $time ($from, $to) { die "Invalid miltime \"$time\"!!" unless $time =~ m/^(\d\d):(\d\d)$/s; $time = $1 * 60 + $2; } my $mins = $to - $from ; $mins = (M_IN_DAY + $to) - $from if $mins < 0; # crossing midnight DEBUG > 2 and print " $from=>$to = $mins minutes\n"; return $mins; } #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ __END__