# striprtf.pl
# desc{ strips RTF codes from an RTF document, not terribly intelligently. }
#
# Purpose:
#  Use this script to view the unformatted content of an RTF document.
#
# Author:
#  Sean M. Burke, sburke@cpan.org, http://www.ling.nwu.edu/~sburke/
#
# Usage:
#  striprtf.pl [inputfile.rtf] [inputfile2.rtf]
#  output is on standard output.
#
# Alternate Usage:
#  Feed the RTF in on standard input, as in this UNIX command:
#  cat s*.rtf | striprtf.pl | less

use strict;
use warnings;

# strip_rtf_line($line)
#
# Converts one line of RTF source into plain text and returns it.
#
#   $line - one input line of RTF; any newline characters in it are dropped,
#           because line breaks in RTF source carry no meaning.
#
# Returns the text with control words and group braces removed.  Newlines in
# the result come only from \par and \row; tabs come from \cell and \tab.
# The space that delimits a control word is left in the output.
sub strip_rtf_line {
    my ($line) = @_;

    # Sentinel used to park escaped characters while control words are being
    # stripped (same value as Perl's default $; subscript separator).
    my $hide = "\x1C";

    # A control word ends where the run of lowercase letters ends; without
    # this guard \par would also fire inside \pard, \pararsid, and so on.
    my $word_end = qr/(?![a-z])/;

    $line =~ s/\n//g;

    # Hide \\, \{, \} and \'hh FIRST, so that an escaped backslash followed
    # by text (e.g. "\\tables") is never mistaken for a control word.  The
    # escaped character is stored as sentinel + two hex digits.
    $line =~ s/\\([\\{}])/$hide . unpack('H2', $1)/eg;
    # Only hide \' when two hex digits really follow; otherwise the sentinel
    # would never be decoded and would leak into the output.
    $line =~ s/\\'(?=[0-9a-fA-F]{2})/$hide/g;

    # Do some extremely basic formatting.  Lookaheads consume nothing, so
    # adjacent control words (\par\par) and a control word at the very end
    # of the line are all honoured.
    $line =~ s/\\par$word_end/\n/g;
    $line =~ s/\\row$word_end/\n/g;
    $line =~ s/\\cell$word_end/\t/g;
    $line =~ s/\\tab$word_end/\t/g;

    # Take care of some character codewords.
    $line =~ s/\\[lr]quote$word_end/'/g;
    $line =~ s/\\[lr]dblquote$word_end/"/g;
    $line =~ s/\\[-~*:]//g;
    $line =~ s/\\_/-/g;
    $line =~ s/\\e[nm]dash$word_end/-/g;
    $line =~ s/\\e[nm]space$word_end/ /g;
    $line =~ s/\\bullet$word_end/*/g;

    # Drop every remaining control word, then any leftover backslashes and
    # group braces.
    $line =~ s/\\[-a-z0-9]+//g;
    $line =~ s/[\\{}]//g;

    # Decode the hidden sequences: both \'e1-style character codes and the
    # escaped \\, \{, \} parked above.
    $line =~ s/\Q$hide\E([0-9a-fA-F]{2})/pack('C', hex $1)/eg;

    return $line;
}

# main()
#
# Filters every line from the files named on the command line (or standard
# input when none are given) through strip_rtf_line and prints the result
# on standard output.
sub main {
    while (my $line = <>) {
        print strip_rtf_line($line);
    }
    return;
}

# Run as a script, but stay quiet when loaded from another file via require.
main() unless caller;

1;