#!/usr/bin/perl # ** development/test version -- not for release ** # floyd.pl : A Multimorphemic Concordancer # # version 0.0.4 Time-stamp: "97/02/03 22:15:21 sburke" # ------------------------------------------------------ # USAGE: # floyd.pl file1 [file2 file3 ... filen] > concordance.txt # # or give the input on STDIN. E.g., # grep -i '^#' foo.txt | floyd.pl > concordance.txt # # ABSTRACT: # This program builds a concordance of a text (or texts), but a # concordance not simply of words (altho it /can/ do that), but rather # of the morphemes, which should be delineated by a "|" character. # If this comes across an occurrance of "un|lock|able", for example, # it will list this as an instance of "un", of "lock", and of "able", # noting the line number and file name each time. # # CREDITS: # Concept and MSDOS implementation by: # Floyd Lounsbury (fglnsb@minerva.cis.yale.edu), 1995. # UNIX/Perl implementation by: # Sean M. Burke (sburke@cpan.org), 1996. # # From the hypertext Webster's, at http://c.gp.cs.cmu.edu:5103/prog/webster # # con.cor.dance \k*n-'ko.rd-*n(t)s, ka:n-\ n [ME, fr. MF, fr. ML # concordantia, fr. L concordant-, concordans)] 1: an alphabetical index # of the principal words in a book or the works of an author with their # immediate contexts 2: CONCORD, AGREEMENT # To do-- allow for different kinds of sort orders. # ideas: # add a timestamp to the output? # report of what the command line was (listing the input filenames)? # list a count of the number of times a form occurs? or is that # best left to another program which can smash case? # add an option to smash case on input? # the whole list-of-lists code is working, but funked out. rewrite # using Perl5 stuff. $nonalph = "[^|A-Za-z:~^'\x80-\xff]"; # This range defines all the things that are non-alphabetical. # Since the range expression starts with a ^, it means that what is # what alphabetic is what is NOT in the ranges and characters # following the ^ # In other words, what IS alphabetic is whatever follows the ^ # [^|A-Za-z:~^'\x80-\xff] means to count as alphabetical all the # letters A-Z and a-z, all the characters from ASCII 0x80 (=decimal # 128) to ASCII 0xFF (=decimal FF), which is where all the accented # characters hide in the Latin-1 character set; and these # typographical characters: ~ : ^ ' # | is called alphabetical here, but it has special meaning in the # program as a morpheme-divider within words. # VARIABLES ############################################################ # %conc will hold the data in the format: # $conc{"morpheme"} holds "filename#line#word". # I.e., the first word is the filename and line#, with a # # inbetween # The second word is the word the morpheme occurrs in, or ~ if it # fills the word #### FOR MAC FOLKS ##################################################### ##Remove the initial # from the following few lines, if you're using MacPerl: #if (require "StandardFile.pl") { # hi MacPerl! # @ARGV = &StandardFile'GetFile("Input text file:", "TEXT"); # $output_file = 'concordance_1'; # while(-e $output_file) { #get a safe name for outputting. # $output_file =~ s/(\d+)$/(1+$1)/ie; # } # open(STODUT, "> $output_file"); # hopefully a working kludge #} ##### end of Mac junk # Main loop to take in and log all the morphemes. while ($line = <>) { chop $line; $line =~ s/\cm//g; # one never knows. @words = split(/$nonalph+/, $line); # Make a list of words on this line, where a word is a contiguous # string of alphabetic characters. foreach $word (@words) { # Now iterate thru that list. next if $word eq ''; $word = "|$word|"; @morphemes = split(/\|/, $word); foreach $morpheme (@morphemes) { next if $morpheme eq ''; $morpheme =~ tr/A-ZÁÉÍÓÚÂÊÎÔÛÄËÏÖÜÆÑ/a-záéíóúâêîôûäëïöüæñ/; # Add characters there as necessary to handle downcasing for # this language. $linenumber = sprintf("%06d", $.); # Make the line a 6-digit number, so we can sort on it. $conc{$morpheme} .= "$ARGV$;$linenumber$;$word "; } } $. = 0 if (eof); } $~ = "OCCUR1"; # Give a report on all the morpheme-forms we found. foreach $morpheme (sort(keys(%conc))) { &morpheme_report($conc{$morpheme}); } exit; # Byebye birdie ## SUBROUTINES ############################################################ sub morpheme_report { # Give a /concise/ report of the occurrence of the morphemes in question # This is a subroutine just so we can localize the variables. local (%wordinstance, $instance); # This builds a report such that multiple instances of the SAME # word containing this morpheme, in the SAME file, will be listed # on the SAME line. Across files that's not the case, or in # different words, and they'll be listed separately. foreach $instance (sort (split(/\s+/, $_[0]))) { next if $instance eq ''; # Iterate thru the sorted list of non-null words-records # (having been separated by whitespace) found in the string # we were called on. ($filename, $linenumber, $occurringform) = split($;, $instance); # Split this thing back into the three ports. The # split-character is the magic $; delimiter. $wordinstance{$occurringform, $filename} .= " $linenumber"; # make a 1:many map of word+filename => linenumbers } foreach $instance (sort(keys(%wordinstance))) { # Go thru a sorted (by word, then by filename) list of all # the places the morpheme occurs in. $linenumbers = $wordinstance{$instance}; $linenumbers =~ s/\s+0*/ /g; # Collapse whitespace plus any 0's into a single space. $linenumbers =~ s/^\s+//g; # Kill word-initial whitespace # Now start putting things into the right variables for the # format to use. ($occurringform, $filename) = split($;, $instance); # Unconflate the word and the filename. $occurringform =~ /\|$morpheme\|/i; $before = $`; $duringafter = $& . $'; # Now write it all out, in format OCCUR1 write; if ($linenumbers ne "") { # is there more to output? $~ = "OCCUR2"; write; # write the rest out in the OCCUR2 format $~ = "OCCUR1"; } } } # OCCUR1 is the print format for the first line of output for a morpheme. # OCCUR2 is for continuation lines. format OCCUR1 = @>>>>>>>>>>>>>>@<<<<<<<<<<<<<<<<<<<<<<< @>>>>>>>>>>>> ^<<<<<<<<<<<<<<<<<<<<<<<< $before, $duringafter, $filename, $linenumbers . format OCCUR2 = ~~ ^<<<<<<<<<<<<<<<<<<<<<<<< $linenumbers . # Quaecumque sunt vera 1; # ## end