#!/usr/bin/perl -s
# $Id: vdelatex 376 2012-10-29 12:44:54Z vlado $
# Copyright 2001-2012 Vlado Keselj web.cs.dal.ca/~vlado

use strict;

# All of these are package globals rather than lexicals: the -s switch on
# the #! line stores command-line switches (-v, -h, -AllowedWordsFile=...)
# in package variables of the same name, and the subs below share the rest.
our $VERSION = sprintf "1.%d", q$Revision: 376 $ =~ /(\d+)/g;
our ($AllowedWordsFile, $tmpFile, $TmpAFile, $inBraces, $inBrackets,
     @NonSpellCommands, @IgnoreRegs, $VerbatimFlag, $v, $h);

# -v: print the version and stop.
if ($v) {
    print "$VERSION\n";
    exit;
}

# -h: print the help text and stop.
if ($h) {
    help();
    exit;
}

# Print the help text to STDOUT.  The here-document is interpolated, so
# $VERSION is expanded and '\\att' is printed as '\att'.
sub help { print <<"#EOT" }
# vdelatex    Version $VERSION
# Perl script for spell-checking LaTeX and TeX files.
# uses spell, sort, and comm
#
# By default, the file 'allowedWords' is used as a list of additional
# allowed words.
#
# Usage: vdelatex [switches] [files]
#  -h  Print help and exit.
#  -AllowedWordsFile=file use file 'file' for additional allowed words
#                         instead of 'allowedWords'
#  -v  Print version of the program and exit.
#
# Examples:
# ---------
# vdelatex text.tex ...
# vdelatex -AllowedWordsFile=file text.latex t1.tex
#
# Command examples within the file:
# ---------------------------------
# %!vdelatex:allow:some_word_or_regex
# %!vdelatex:allow:list,of,words
# %!vdelatex:define non-spell command:\\att
# %!vdelatex:ignore the rest
# %!vdelatex:ignore this line
# %!vdelatex:ignore this word
# %!vdelatex:ignore begin
# %!vdelatex:ignore end
#EOT

# Pick the additional-words file: an explicit -AllowedWordsFile=... must
# exist; without the switch, 'allowedWords' in the current directory is
# used when present.  Otherwise $AllowedWordsFile stays undefined.
if ( ! defined($AllowedWordsFile) and -f 'allowedWords' ) {
    $AllowedWordsFile = 'allowedWords';
} elsif (defined($AllowedWordsFile) and ! -f $AllowedWordsFile) {
    die "no file: $AllowedWordsFile";
}

# $tmpFile receives the filtered text (written in the current directory);
# $TmpAFile receives the sorted allowed-word list.
$tmpFile = 'tmp.vdelatex';
$TmpAFile = "/tmp/vdelatex.$$.TmpAFile";

# Three-argument open keeps the mode separate from the file name; the
# handle name I is used by the main loop further down.
open(I, '>', $tmpFile) or die "vdelatex:cannot open $tmpFile for writing:$!";

# Regex fragments matching one {...} or [...] argument (no nesting).
$inBraces  ="\\{[^\\}]*\\}";
$inBrackets="\\[[^\\]]*\\]";

# Commands whose single braced argument is removed from the text.
@NonSpellCommands = qw(bibitem bibliography label pageref ref);

# Patterns blanked out of every line; starts with URLs (http, https, ftp).
@IgnoreRegs = ( '\\b(?:https?|ftp):\\/\\/\\S+'
);

# Add a literal word to @IgnoreRegs as a regex.
#   $w - the text to ignore; it is quoted with \Q, so it matches literally.
# A \b anchor is added at each end that is a word character, so that the
# word is not blanked out of the middle of a longer word.
# Empty or undefined input is skipped: an empty pattern in s/// means
# "reuse the last successfully matched pattern", which would blank
# unrelated text.
sub addIgnore {
    my $w = shift;
    return unless defined($w) and length($w);
    $w = "\Q$w";
    $w =~ s/^(\w)/\\b$1/;
    $w =~ s/(\w)$/$1\\b/;
    push @IgnoreRegs, $w;
}
    
# Load the additional allowed words, if a word file is in effect.
# Entries containing any of - { } \ @ . become ignore patterns (see
# addIgnore); all other entries are written, sorted and de-duplicated by
# 'sort -u', to $TmpAFile.  An all-lowercase entry is also written with
# its first letter capitalised.
if ( defined($AllowedWordsFile) ) {
    # Three-argument open: the user-supplied name is never parsed for a mode.
    open(my $words, '<', $AllowedWordsFile)
        or die "vdelatex:cannot open $AllowedWordsFile for reading: $!";
    open(my $sorted, '|-', "sort -u > $TmpAFile")
        or die "vdelatex:cannot open $TmpAFile for writing: $!";
    while (my $entry = <$words>) {
        chomp $entry;
        if ($entry =~ /[-{}\\@.]/) { addIgnore($entry) }
        else {
            if ($entry =~ /^([a-z])([a-z -]*)$/) { print {$sorted} "\U$1\L$2\n" }
            print {$sorted} "$entry\n";
        }
    }
    close($words);
    # Closing a pipe waits for the child; a false result means that
    # 'sort' failed and $TmpAFile cannot be trusted.
    close($sorted) or die "vdelatex:'sort -u > $TmpAFile' failed (status $?)";
}

# Try longer patterns first, so that an e-mail address such as
# vlado(at)cs.dal.ca is blanked as a whole before the shorter pattern
# cs.dal.ca can remove only part of it.  Patterns added later by
# '%!vdelatex:allow:' directives are appended after this sort.
@IgnoreRegs = sort {length($b) <=> length($a)} @IgnoreRegs;

$VerbatimFlag = '';

# Main filter loop: read the input files (or STDIN), honour the
# '%!vdelatex:' directives, remove comments and LaTeX markup, and write
# what is left to the temporary file behind handle I for spell-checking.
MAINLOOP:
while(<>) {
    # A line that ends inside an unclosed '{' is joined with the next
    # line, so that a braced argument broken across two lines can match.
    if (/\{[^}]*$/) { $_ .= <> }

    my $vdelatexflag = ( index($_, "%!vdelatex:") > -1 );

    if ($vdelatexflag) {
        if (/^%!vdelatex:ignore the rest\s*$/) {
            while (<>) { }              # drain the remaining input
            $_ = '';
            last MAINLOOP;
        }
        if (/^%!vdelatex:define non-spell command:\\(\w+)/) {
            push @NonSpellCommands, $1;
            $_ = "\n";
        }
        elsif (/^%!vdelatex:allow:/) {
            my $w = $'; chomp $w;
            if (index($w,',') > -1) {
                local $_;
                foreach (split(/,/,$w)) { addIgnore($_) }
            }
            else {addIgnore($w) }
            $_ = "\n";
        }

        # 'ignore begin' ... 'ignore end' regions may nest and may span
        # lines; the text before the first marker and after the matching
        # end marker is kept.
        if (/%!vdelatex:ignore begin\b/) {
            my $keep = "$`\n";
            my $nestingCount = 1;
            $_ = $';
            while (/%!vdelatex:ignore (begin|end)\b/) {
                if ($1 eq 'begin') { ++$nestingCount }
                else { -- $nestingCount }
                $_ = $';
                last if $nestingCount == 0;
            }
            if ($nestingCount > 0) {
                while (<>) {
                    while (/%!vdelatex:ignore (begin|end)\b/) {
                        if ($1 eq 'begin') { ++$nestingCount }
                        else { -- $nestingCount }
                        $_ = $';
                        last if $nestingCount == 0;
                    }
                    last if $nestingCount == 0;
                }
            }
            die "no %!vdelatex:ignore end" unless $nestingCount == 0;
            $_ = $keep.$_;
        }

        # Replace a line carrying 'ignore this line' with blanks.
        s/^.*%!vdelatex:ignore this line *\r?$/ &clean($&) /mge;
    }

    if ($VerbatimFlag && /\\end\{verbatim\}/) { $VerbatimFlag = '' }
    if (!$VerbatimFlag && /\\begin\{verbatim\}/) { $VerbatimFlag = 1 }

    # Lines inside a verbatim environment (including the \begin line) are
    # skipped: 'next' bypasses the print at LINE_DONE.
    if ($VerbatimFlag) {
        s/\$/ /g;
        next MAINLOOP;
    }

    # Drop everything between \begin{eqnarray*} and \end{eqnarray*},
    # reading further lines until the end marker is found.
    if (/\\begin\{eqnarray\*\}/) {
        my $keep = "$` ";
        $_ = $';
        while (! /\\end\{eqnarray\*\}/) {
            $_ = <>;
            die if $_ eq '';            # end of input inside the environment
        }
        /\\end\{eqnarray\*\}/;
        $_ = "$keep $'";
    }

    $vdelatexflag = ( index($_, "%!vdelatex:") > -1 );

    if ($vdelatexflag) {
        s/\S+ *%!vdelatex:ignore this word *$//;
    }

    # Strip comments.  A '%' preceded by an odd number of backslashes is
    # the literal \% and does not start a comment; an even run of
    # backslashes (e.g. the line break \\) is kept and the comment after
    # it removed.  /mg handles every line of a joined multi-line buffer.
    s/(?<!\\)((?:\\\\)*)%.*$/$1/mg;
    goto LINE_DONE if /^\s*$/;

    s/\\-//g;                   # remove \-

    s/\\[>=_&]/ /g;

    # Blank out URLs and the user's allowed patterns.
    {
        my $w;
        foreach $w (@IgnoreRegs) {
            s/$w/ /g;
        }
    }

    s/^.*\\kill$/ /gim;
    s/\\\"//g;
    s/\\char"[0-9a-fA-F][0-9a-fA-F]/        /g;

    # Literal braces are escaped: an unescaped '{' after a literal is
    # rejected or warned about by several Perl releases.
    s/\\rule\{[^{}]*\}\{[^{}]*\}//g;
    s/\\(raise|m?kern) *[-0-9.]+(pt|mu)//g;

    s/\\begin\{(?:array|tabular)\}(?:\[[^\]]*\])?\{[rcl|]+\}//g;

    # Command taking one non-spell argument
    s/\\(psfig|(?:new)?pagestyle|bibliographystyle|include)$inBraces//g;
    s/\\(?:alph|arabic|include|input)$inBraces//g;
    s/\\(?:includegraphics)(?:$inBrackets)?$inBraces//g;
    s/\\(?:begin|end|vspace|cite|nocite|citentry|roman)$inBraces/ /g;
    s/\\(?:special)$inBraces/ /g;
    s/\\(?:re)?new(?:counter|command|environment)$inBraces/ /g;
    s/\\hspace\*?\{[^\}]*\}//g;

    # Built-in and user-defined commands whose argument is not text.
    {
        my $c;
        foreach $c (@NonSpellCommands) {
            if (index($_, $c) > -1) {
                s/\\$c$inBraces//g;
            }
        }
    }

    s/\\usepackage(?:$inBrackets)?$inBraces//g;

    s/\\makebox(?:$inBrackets){0,2}$inBraces//g;

    # two nonspell arguments
    s/\\(?:fontsize|newtheorem|addtocounter|setcounter|setlength)$inBraces$inBraces/ /g;

    s/ et al\./ /g;
    s/\\documentstyle\[[^\]]+\]/ /g;
    s/\\documentclass($inBrackets)?$inBraces//g;

    s/\\parindent=[0-9]+pt/ /g;

    s/\{[0-9.]+(cm|pt|mm)\}//g;
    s/\\\\(\[[^\]]+\])?/ /g;
    s/\\[a-zA-Z]+/ /g;          # any remaining control word
    s/^\.//g;

    #s/\$.{1,5}\$/ /g;

    #s/~/ /g;
    s/\\'e/'e/g;
    s/\\`e/''BQe/g;
    #s/[{}]/ /g;
    s/\$/ /g;
    s/\\ / /g;

    # Put a space at every word boundary so words are clearly separated.
    s/\b/ /g;
    #s/\s*(\w+)\s*/ $1 /g;
    #s/^ */ /;
    #s/ *$/ /;

    LINE_DONE:
    print I;
}
close(I);

# Run the filtered text through 'spell' and print the unique misspelled
# words; when an allowed-word list is in effect, 'comm -23' removes the
# words that also appear in $TmpAFile.  $tmpFile is left in place.
{
    my $pipeline = "cat $tmpFile | spell | sort -u";
    $pipeline .= " | comm -23 - $TmpAFile" if defined($AllowedWordsFile);
    system($pipeline);
}

# Remove the sorted allowed-word list (a no-op if it was never created).
unlink($TmpAFile);

# program end
# Return a copy of the argument in which every non-whitespace character
# is replaced by a space; whitespace (including newlines) is unchanged.
sub clean {
    my $blanked = shift;
    $blanked =~ s/\S/ /g;
    return $blanked;
}