#
#Time-stamp: "1999-08-04 15:46:44 MDT"
#
# Word and letter-group frequency survey of a plain-text corpus.
#
# Reads the input line by line, counts every all-lowercase word (plus the
# pronoun "I"), then prints:
#   1. every word seen more than $COMMON_THRESHOLD times, most frequent first;
#   2. the most frequent two-letter sequences ("dyads");
#   3. the most frequent three-letter sequences ("triads").
# Letter sequences are tallied once per distinct word longer than two
# characters, not once per occurrence of that word.
#
# Input files are taken from the command line; when none are given, the
# original hard-coded corpus file is used.

use strict;
use warnings;

# A word must occur more than this many times to be listed as "common".
my $COMMON_THRESHOLD = 100;

@ARGV = ('Slag:babbit.txt') unless @ARGV;

my %word_freq;

while (my $line = <>) {
    # Turn apostrophes into underscores so that a contraction survives the
    # split on non-word characters as a single token.
    $line =~ tr<'><_>;

    foreach my $w (grep { length } split /\W+/, $line) {
        # Skip tokens that begin or end with a (former) apostrophe.
        next if $w =~ /_$/s or $w =~ /^_/s;
        # Skip anything containing capitals, except the pronoun "I".
        next if $w ne lc($w) and $w ne "I";
        ++$word_freq{$w};
    }
}

my (%dyads, %triads);
my @common;

foreach my $w (keys %word_freq) {
    push @common, $w if $word_freq{$w} > $COMMON_THRESHOLD;

    # Only words of three or more characters contribute letter sequences.
    next unless length($w) > 2;

    ++$dyads{  substr($w, $_, 2) } for 0 .. length($w) - 2;
    ++$triads{ substr($w, $_, 3) } for 0 .. length($w) - 3;
}

# Most frequent first; equal counts are ordered alphabetically so that the
# listing does not depend on hash ordering.
foreach my $w (sort { $word_freq{$b} <=> $word_freq{$a} or $a cmp $b } @common) {
    print "$w : $word_freq{$w}\n";
}

print " --\n";
hash_top_third(\%dyads);
print " --\n";
hash_top_third(\%triads);
print " --\n\n";

print "OK\n";

# hash_top_third(\%count_of)
#
# Prints the highest-count keys of a hash of counts to the currently
# selected output handle, one per line, as
#   " KEY : COUNT (PERCENT%)"
# where PERCENT is the key's share of the sum of ALL counts, truncated to
# an integer.  Keys are listed by descending count (ties alphabetically).
#
# When the hash has nine or more keys, only the top third is considered.
# Listing stops early, after printing the current key, as soon as either
#   * the keys printed so far account for more than 85% of the total, or
#   * the current key's count is no more than a tenth of the top count.
#
# Prints nothing for an empty hash or when all counts are zero.
# Returns nothing.
sub hash_top_third {
    my ($count_of) = @_;

    my @ranked = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
                 keys %$count_of;
    return unless @ranked;

    my $total = 0;
    $total += $_ for values %$count_of;
    return unless $total;    # nothing to take a percentage of

    # Keep only the first third of the ranking once it is long enough.
    $#ranked = int($#ranked / 3) if @ranked >= 9;

    my $enough = $total * .85;
    my $weensy = ($count_of->{ $ranked[0] } * .1) || 1;

    my $accounted_for = 0;
    foreach my $key (@ranked) {
        my $count = $count_of->{$key};
        $accounted_for += $count;
        print " $key : ", $count, " (", int($count / $total * 100), "%)\n";
        # The coverage cutoff compares against $enough; comparing against
        # the grand total could never succeed.
        last if $accounted_for > $enough or $count <= $weensy;
    }
    return;
}

__END__

WORDS:
the and to of a he was I in you that his it with for on as had at but be
all her him not they me were out is by have up like about this so do from
one an she get don't or if old when just which their said know been good
what them we some go got man into would think did who going now down there
my more could your right little no these want any are can time how see way
never say here business over back it's much house day too men other than
come home always lot evening two fellow tell went thing only being off
after make wife while room away himself night before real then through well

LETTER PAIRS:
in ed er ng es re te ti en st on le at nt ar ri an or ra ne de ly al
li co is ro it ea se io ou ss ve he ic el la di us me ll ce pe ch nd un ta si ns rs ur as tr il sh lo ie pr th ac ge ni ca et nc to ma ol oo ha bl hi mi ho ec ul pa rt ct ai ke ad om ee ts ck po na ig ab ot mp ow pi im be ci sp fi am em mo ir tt id ap iv pl tu ut bo LETTER TRIPLETS: ing ion ent ess tio ted ter ati ers ate tin red est ous res nes ere con ine led ons ste rin nce lin ive ble men ver ist pro rea nte che les all sti tra ned dis per ant her tor rat ies eri der nde ght int igh ain tic com abl lly str ect pre