#!/usr/bin/perl
#  ** development/test version -- not for release **
# floyd.pl : A Multimorphemic Concordancer
#
# version 0.0.4     Time-stamp: "97/02/03 22:15:21 sburke"
# ------------------------------------------------------
# USAGE:
#  floyd.pl file1 [file2 file3 ... filen]  > concordance.txt
#
# or give the input on STDIN.  E.g.,
#  grep -i '^#' foo.txt | floyd.pl > concordance.txt
#
# ABSTRACT:
#  This program builds a concordance of a text (or texts), but a
#  concordance not simply of words (altho it /can/ do that), but rather
#  of the morphemes, which should be delineated by a "|" character.
#  If this comes across an occurrance of "un|lock|able", for example,
#  it will list this as an instance of "un", of "lock", and of "able",
#  noting the line number and file name each time.
#
# CREDITS:
#  Concept and MSDOS implementation by:
#	Floyd Lounsbury (fglnsb@minerva.cis.yale.edu), 1995.
#  UNIX/Perl implementation by:
#	Sean M. Burke (sburke@cpan.org), 1996.
#
# From the hypertext Webster's, at http://c.gp.cs.cmu.edu:5103/prog/webster
#
#   con.cor.dance \k*n-'ko.rd-*n(t)s, ka:n-\ n [ME, fr. MF, fr. ML
#   concordantia, fr. L concordant-, concordans)] 1: an alphabetical index
#   of the principal words in a book or the works of an author with their
#   immediate contexts 2: CONCORD, AGREEMENT

# To do--  allow for different kinds of sort orders.
# ideas:
#     add a timestamp to the output?
#     report of what the command line was (listing the input filenames)?
#     list a count of the number of times a form occurs?  or is that
#     best left to another program which can smash case?
#     add an option to smash case on input?
# the whole list-of-lists code is working, but funked out.  rewrite
#     using Perl5 stuff.

$nonalph = "[^|A-Za-z:~^'\x80-\xff]";
 # This range defines all the things that are non-alphabetical.
 # Since the range expression starts with a ^, it means that what is
 # what alphabetic is what is NOT in the ranges and characters
 # following the ^
 # In other words, what IS alphabetic is whatever follows the ^
 # [^|A-Za-z:~^'\x80-\xff] means to count as alphabetical all the
 # letters A-Z and a-z, all the characters from ASCII 0x80 (=decimal
 # 128) to ASCII 0xFF (=decimal FF), which is where all the accented
 # characters hide in the Latin-1 character set; and these
 # typographical characters:    ~ : ^ '
 # | is called alphabetical here, but it has special meaning in the
 # program as a morpheme-divider within words.


# VARIABLES ############################################################
# %conc will hold the data in the format:
#   $conc{"morpheme"} holds "filename#line#word".
#    I.e., the first word is the filename and line#, with a #
#    inbetween
#    The second word is the word the morpheme occurrs in, or ~ if it
#    fills the word

#### FOR MAC FOLKS #####################################################
##Remove the initial # from the following few lines, if you're using MacPerl:
#if (require "StandardFile.pl") { # hi MacPerl!
# @ARGV = &StandardFile'GetFile("Input text file:", "TEXT");
# $output_file = 'concordance_1';
# while(-e $output_file) { #get a safe name for outputting.
#     $output_file =~ s/(\d+)$/(1+$1)/ie;
# }
# open(STODUT, "> $output_file"); # hopefully a working kludge
#}
##### end of Mac junk

# Main loop to take in and log all the morphemes.
while ($line = <>) {
    chop $line;
    $line =~ s/\cm//g; # one never knows.

    @words = split(/$nonalph+/, $line);
    # Make a list of words on this line, where a word is a contiguous
    # string of alphabetic characters.

    foreach $word (@words) {  # Now iterate thru that list.
	next if $word eq '';
	$word = "|$word|";
	@morphemes = split(/\|/, $word);
	foreach $morpheme (@morphemes) {
	    next if $morpheme eq '';
	    $morpheme =~ tr/A-ZÁÉÍÓÚÂÊÎÔÛÄËÏÖÜÆÑ/a-záéíóúâêîôûäëïöüæñ/;
	    # Add characters there as necessary to handle downcasing for
	    # this language.
	    $linenumber = sprintf("%06d", $.);
	    # Make the line a 6-digit number, so we can sort on it.
	    $conc{$morpheme} .= "$ARGV$;$linenumber$;$word ";
	}
    }
    $. = 0 if (eof);
}

$~ = "OCCUR1";

# Give a report on all the morpheme-forms we found.
foreach $morpheme (sort(keys(%conc))) {
    &morpheme_report($conc{$morpheme});
}

exit;
# Byebye birdie


## SUBROUTINES ############################################################

sub morpheme_report {
    # Give a /concise/ report of the occurrence of the morphemes in question
    #  This is a subroutine just so we can localize the variables.
    local (%wordinstance, $instance);

    # This builds a report such that multiple instances of the SAME
    # word containing this morpheme, in the SAME file, will be listed
    # on the SAME line.  Across files that's not the case, or in
    # different words, and they'll be listed separately.

    foreach $instance (sort (split(/\s+/, $_[0]))) {
	next if $instance eq '';
	# Iterate thru the sorted list of non-null words-records
	# (having been separated by whitespace) found in the string
	# we were called on. 
	($filename, $linenumber, $occurringform) = split($;, $instance);
	# Split this thing back into the three ports.  The
	# split-character is the magic $; delimiter.
	$wordinstance{$occurringform, $filename} .= " $linenumber";
	# make a 1:many map of word+filename => linenumbers
    }

    foreach $instance (sort(keys(%wordinstance))) {
	# Go thru a sorted (by word, then by filename) list of all
	#  the places the morpheme occurs in.
	$linenumbers = $wordinstance{$instance};
	$linenumbers =~ s/\s+0*/ /g;
	# Collapse whitespace plus any 0's into a single space.
	$linenumbers =~ s/^\s+//g;
	# Kill word-initial whitespace

	# Now start putting things into the right variables for the
	# format to use.
	($occurringform, $filename) = split($;, $instance);
	# Unconflate the word and the filename.
	$occurringform =~ /\|$morpheme\|/i;
	$before = $`;
	$duringafter = $& . $';

	# Now write it all out, in format OCCUR1
	write;
	if ($linenumbers ne "") {  # is there more to output?
	    $~ = "OCCUR2";
	    write;          # write the rest out in the OCCUR2 format
	    $~ = "OCCUR1";
	}
    }
}

# OCCUR1 is the print format for the first line of output for a morpheme.
# OCCUR2 is for continuation lines.

format OCCUR1 =
@>>>>>>>>>>>>>>@<<<<<<<<<<<<<<<<<<<<<<< @>>>>>>>>>>>> ^<<<<<<<<<<<<<<<<<<<<<<<<
$before,       $duringafter,            $filename,    $linenumbers
.

format OCCUR2 =
~~                                                    ^<<<<<<<<<<<<<<<<<<<<<<<<
                                                      $linenumbers
.

# Quaecumque sunt vera
1;
#
## end