#!/usr/bin/perl
require 5;
# desc{  report how to fit a directory's items onto a CDR's worth of space  }
use strict;

use constant CDR => 681_700_000;  # may require fudging

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

=head1 NAME

cdfit -- report how to fit a directory's items onto a CDR's worth of space

=head1 SYNOPSIS

  % cdfit ./mp3s
  Considering fitting ./mp3s into 681,700,000b...
  Optimal solution is 7 items:
  [
      58,338,055: Air
      90,221,893: Brian Eno
      18,908,463: Duran Duran
      42,602,438: Laurie Anderson
      14,547,579: Luscious Jackson
      98,023,464: Massive Attack
     359,057,250: Philip Glass
  ]
  filling to 681,699,142b,
  leaving 858b free and 14 items totalling 1,804,083,465b outside.

=head1 DESCRIPTION

This utility says how to fit items from a given directory into a
certain amount of space -- by default, a rule-of-thumb figure for
the amount of space on a blank CDR.

=head1 OPTIONS

Usage: C<cdfit B<[-I<number>]> B<[-tI<seconds>]>> B<[I<filespec>]>>

Examples:

  cdfit
  
  cdfit c:/data

  cdfit -670,000,000

  cdfit -670,000,000 ~/oggs

  cdfit -650,000,000 -t120

  cdfit -1,440,000 -t360 /user/mojojojo/journals /

Optional switch B<-I<number>> sets the size of the CD (or whatever)
that you want to fit items into.  Current default is 681,700,000
bytes.  You may use commas or underscores in this number.

Optional switch B<-t<seconds>> sets a time limit on how long this
program will try searching for a match.  When the time is up, the
program stops searching and reports the best solution found within
that time.  You may use commas or underscores in this number.  Note
that this number is in seconds: B<-t10> is ten seconds, not ten
minutes (which would be B<-t600>, of course).  This switch may
not be supported under MSWin.  

The parameter I<filespec> provides the path to the items that you
want to fit onto a CD.  Otherwise defaults to the first existing
item found from the list C<
    "c:/windows/desktop/to_burn",
    "c:/windows/desktop/mp3",
    "c:/windows/desktop/mp3s",
    "c:/windows/desktop/music",
    "."
> (the latter, current directory, is a guaranteed fallback).

Note that at time of writing, you can only provide look at one
directory -- i.e., you can't do C<cdfit mp3 mp3b>

=head1 CAVEATS

F<cdfit> doesn't take into account per-file or per-directory overhead
when imagining fitting things onto a CDR.  You can work around this by
just claiming that there's less space on a CDR than F<cdfit>'s default of
681,700,000 bytes.

F<cdfit> B<can> take B<exponentially> longer time when given more items
to fit.  (Specifically, each item B<can> make it take twice as long to
run: if it takes 3 seconds to consider 15 items, it I<might> take twice
as long (6s) to consider 16 items, twice as long again (12s) to consider
17 items.

My experience is that F<cdfit> runs best (i.e., in less than a minute on
a decent machine, I<not> taking exponential time with increased number
of elements) when you have fewer than 30 items, where the typical item
is larger than 1/10th the capacity of a CD.  Having lots of little items
(little relative to the size of the CD) will make this take quite a
long time.  When in doubt, you can always just abort the program.

=head2 Implementation Notes

This program has efficiency problems because this is a case of
the "Knapsack Problem", which is NP-complete.

F<cdfit>'s algorithm iterates over all possible solutions, but prunes
the search space when applicable.  I.e., when it's considering fitting
a 500M directory and a 300M directory and others into a 670M disk, it
doesn't bother iterating over all possible solutions that start with
having a 500M item and a 300M item, because it knows that none of those
are going to work.  So taking 2**N time is only a worst case --
bit it's one that we approach as we have many small items, the sum of
whose sizes is just over the capacity of the CD.

F<cdfit>'s algorithm specifically traps the case where all items given
will fit in the CD.  This avoids a complete (and pointless) scan of
the search space.

Mercifully, memory requirements don't increase much with the number
of items to fit.  We don't keep a list of all possible solutions in
memory -- we keep only the best one so far, and the current one being
considered.

=head1 BACKSTORY

As you can probably guess, I wrote this program because I had made
mp3 backups of a lot of CDs of mine, and I wanted to burn those
directories to CDRs, and I wanted to pack the CDRs as tightly as
possible.

=head1 COPYRIGHT AND DISCLAIMER

Copyright (c) 2002 Sean M. Burke. All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

The programs and documentation in this dist are distributed in
the hope that they will be useful, but without any warranty; without
even the implied warranty of merchantability or fitness for a
particular purpose.

=head1 AUTHOR

Sean M. Burke, sburke@cpan.org

=cut

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
use constant DEBUG => 0;
my $alarm;
my $length;

while(@ARGV and $ARGV[0] =~ m/^-/s) {
  last if $ARGV[0] eq '--';
  if($ARGV[0] =~ m/^-([0-9,_]+)$/s) {
    $length = $1;
    $length =~ tr/0-9//cd;
    $length += 0;
    shift @ARGV;
  } elsif($ARGV[0] =~ m/^-[tT]([0-9,_]+)$/) {
    eval {alarm 0};
    die "Can't use -T with this version/build of Perl on this platform.\n"
     if $@;
  
    $alarm = $1;
    $alarm =~ tr/0-9//cd;
    $alarm += 0;
    shift @ARGV;
  } else {
    die "Unknown option $ARGV[0]\n";
  }
}

my $where;
if(@ARGV) {
  if(@ARGV == 1) {
    $where = shift;
  } else {
    die "usage: cdfit [-target_size_in_bytes] [-Tseconds_allowed] [dirspec]\n"
      . "   Fit items in dirspec into target_size_in_bytes or fewer,\n"
      . "   with best solution found within seconds_allowed seconds.\n";
  }
}

my $path;
foreach my $p (
  $where ? $where : (
    "c:/windows/desktop/to_burn",
    "c:/windows/desktop/mp3",
    "c:/windows/desktop/mp3s",
    "c:/windows/desktop/music",
    '.',
  ),
) {
  next unless -e $p;
  $path = $p;
  last;
}
die "Where are the CD contents?" unless defined $path;

$length ||= CDR;
{
  print "Considering fitting $path into ", commulate($length), "b...\n";
  my %in = du1($path);
  my %in_short;
  @in_short{trim_common(keys %in)} = values %in;
  solve_fit( $length, %in_short );
}


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

sub solve_fit {

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Make our recursive closure:
  
  my $capacity;
  my @item_size;
  my %itemnum2name;
  my @last_solution; 

  my $go_deeper;
  $go_deeper = sub {
  
    # We could probably do this iteratively, since the search space is
    # pretty straight: it's 2**N where N is the number of items.
    # However, pruning in that situation is not totally straightforward
    # to implement, whereas it's a snap with recursion.
  
    # $_[0] is how deep we are. (0 = nothing yet)
    # $_[1] is how much we've used so far.
    # rest of @_ is list of items included
    DEBUG > 2 and print '  ' x $_[0], ' ' ,
      $_[0] . 'd', ' ',
      $_[1] . 'c', ' ',
      '[', join(',', @_[2 .. $#_]), ']',
      "\n"
    ;
    if($_[0] == @item_size) {
      # We've got a complete and good solution
      DEBUG > 3 and print '  ' x $_[0], "  Noting solution [@_]\n";
      if($_[1] < $last_solution[1]) {
        DEBUG > 3 and print '  ' x $_[0], "  No better.\n";
      } elsif($_[1] > $last_solution[1]) {
        DEBUG > 1 and print '  ' x $_[0], "  Better!\n";
        @last_solution = @_;
      } else {
        # It's the same.  Favor one with fewer parts.
        if(@_ > @last_solution) {
          DEBUG > 2 and print '  ' x $_[0], "  As good as, and but more complex than.\n";
        } elsif(@_ < @last_solution) {
          DEBUG > 2 and print '  ' x $_[0], "  As good as, and simpler than!.\n";
          @last_solution = @_;
        } else {
          DEBUG > 2 and print '  ' x $_[0], "  As good as, and as simple as.\n";
        }
      }
    } else {
      # We're only partial: keep recursing
      
      {
        DEBUG > 3 and printf "%s Considering adding a %sb item (#%s:%s)\n",
          '  ' x $_[0],
          $item_size[$_[0]], $_[0], $itemnum2name{$_[0]}
        ;
        my $x = $_[1] + $item_size[$_[0]];
        if($x > $capacity) {   # if there's no room.
          DEBUG > 3 and print '  ' x $_[0], "   [prune]\n";
        } else {
          # Consider combinations involving including this item:
          $go_deeper->($_[0] + 1, $x, @_[2 .. $#_], $_[0])
        }
      }
      # Now consider combinations involving skipping this item:
      $go_deeper->($_[0] + 1, @_[1 .. $#_]);
    }
    return;
  };
  # end of closure.
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  $capacity = int(shift);
  
  @item_size    = ();
  %itemnum2name = ();
  while(@_) {
    my($name, $size) = splice @_,0,2;
    push @item_size, $size;
    $itemnum2name{ $#item_size } = $name;
    DEBUG and print "$#item_size => $name of size $item_size[-1]\n";
  }

  my $sum_size = 0;
  foreach my $size (@item_size) { $sum_size += $size }
  if($sum_size <= $capacity) {
    DEBUG and print "They all fit!\n";
    # Pretend we ran...
    @last_solution = ($#item_size, $sum_size, 0 .. $#item_size);
  } else {
    @last_solution = (-1,0);
  
    my $start_time = time();
    my $run_time;

    if($alarm) { # We have a time limit.
      local $SIG{'ALRM'} = sub { die "alarm\n" };
      eval {
        alarm $alarm;
        $go_deeper->(0,0);
        alarm 0;
      };
      if ($@) {
        die unless $@ eq "alarm\n";   # propagate unexpected errors
        print "Ran out of time.  Printing best solution so far...\n"
         if DEBUG >= 0;
      }
    } else {
      # Take as long as we want.
      $go_deeper->(0,0);
    }
    $run_time = time - $start_time;
    DEBUG and print "Excursion time: $run_time s\n";
  }
  undef $go_deeper; # break cyclicity
 
  if($last_solution[0] == -1) {
    DEBUG and print "No solution.\n";
    return();
  } else {
    if(DEBUG < 0) { # just report it.
      splice @last_solution, 0, 2;
    } else {
      my $used;
      (undef,$used) = splice @last_solution, 0, 2;
      my $space_free = $capacity - $used;
      my $outside    = $sum_size - $used;

      my $format = "   % " . length(commulate($capacity)) . "s: %s\n";
      print "Optimal solution is ",
        (@last_solution and @last_solution == @item_size) ? "all " : '',
        scalar(@last_solution),
        " items:\n[\n",
        do {
          use locale;
          map sprintf($format,
               commulate($item_size[$_]), 
               $itemnum2name{$_}
          ),
          sort {$itemnum2name{$a} cmp $itemnum2name{$b}}
               @last_solution
        },
        "]\nfilling to ", commulate($used), "b,\nleaving ", 
        commulate($space_free), "b free and ", 
        (@item_size - @last_solution) ? (
          @item_size - @last_solution,
          " items totalling ", commulate($outside), "b "
        ) : "nothing ",
        "outside.\n"
      ;
    }

    @last_solution = map $itemnum2name{$_}, @last_solution;
    return @last_solution;
  }
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

sub du1 {
  my $in = $_[0];
  opendir(MAIN, $in) || die "Can't opendir $in : $!";
  my @items = map(
    ($_ ne '.' and $_ ne '..') ? "$in/$_" : (),
    readdir(MAIN)
  );
  closedir(MAIN);
  
  my @chunks;
  foreach my $item (@items) {
    push @chunks, $item, _du_item($item);
    DEBUG and print "*** ", $chunks[-2], ' = ', $chunks[-1], "\n"
  }
  return @chunks;
}

sub _du_item {  # recursive
  my $spec = $_[0];
  if(-e $spec) {
    if(-f _) {
      DEBUG > 2 and print "spec $spec\n";
      return -s _;
    } elsif(-d _) {
      my $u = 0;
      opendir(DIR, $spec) || die "Can't opendir $spec : $!";
      my @items = map(
        ($_ ne '.' and $_ ne '..') ? "$spec/$_" : (),
        readdir(DIR)
      );
      closedir(DIR);
      foreach my $i (@items) {
        $u += _du_item($i);
      }
      return $u;
    }
  } else {
    return 0;
  }
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

sub commulate { # add commas to a number
  
  # optimize for most common case:
  return $_[0] unless $_[0] and $_[0] =~ m/^\d{3,}(?:\.\d*)?$/s;
  
  my($integer, $fraction) = ($_[0], '');
  $integer =~ tr/0-9.//cd;
  if($integer =~ /([^.]*)(\..*)/) {
    $integer = $1;
    $fraction = $2;
  }
  while( $integer =~ s/(\d)(\d\d\d)(\,|$)/${1},$2$3/g ) { }
  return $integer . $fraction; 
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

sub trim_common {
  # remove common substring from all elements
  my @in = @_;
  return @in unless @in > 1;

  my $common = $in[0];
  foreach my $k (@in) {
    #print "Comparing $common to $k\n";
    if( scalar($common ^ $k) =~ m/^(\x00+)/s ) {
      $common = substr($common, 0, length $1);
    } else {
      $common = '';
      last;
    }
  }
  
  if(length $common) {
    #print "Removing \"$common\"\n";
    $common = length $common;
    foreach my $in (@in) { substr($in,0,$common) = '' }
  }
  
  return @in;
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

__END__

