#!/usr/bin/perl
# 2002-2003 Vlado Keselj www.cs.dal.ca/~vlado
# Version: 1.1
# The newest version can be found at:
# http://vlado.keselj.net/srcperl/
#
# Cleans HTML tags.
# Warning: Follows strict HTML syntax for comments (which may be
# counter-intuitive), e.g., valid comments are:
# <!> <!-- cm --> <!-- comment 1 ---- comment2 -- -- c3 -- >
# and invalid comments are:
# <!-- comment 1 -- ERR --> <!-- comment 1 -- --> NOT FINISHED

use strict;
use warnings;

# Filter: copies the input (files named on the command line, or STDIN) to
# STDOUT with HTML tags and comments removed.
#
# The scanner is a small state machine.  The state is kept across input
# lines, so tags, quoted attribute values and comments may span several
# lines:
#   normal          - text outside any markup; this is the only text printed
#   tag             - inside <...>, outside any quoted attribute value
#   quote           - inside a quoted attribute value; a '>' found here does
#                     not close the tag
#   comment         - inside comment text
#   betweencomments - strict mode only: after the "--" that closes a piece of
#                     comment text; only whitespace, another "--" (opening
#                     the next piece) or the final ">" may follow
my $state = 'normal';
my $quote = '';         # quote character that opened the current value

# Comment syntax selector.
#   false: strict syntax, as described in the header of this file.
#   true : lenient syntax, everything from "<!--" up to the first "-->" is
#          a comment (set to 1 to use <!--anything but--> comments).
my $irrComments = '';

while (<>) {
    # Consume the current line from the left.  Every successful s/// below
    # removes the part of $_ that has just been handled, so the inner loop
    # ends once the whole line has been eaten.
    while ( length($_) > 0 ) {
        if ($state eq 'normal') {
            # Print the text in front of the next '<' and enter the state
            # that matches what the '<' starts.  "<!>" is an empty comment
            # and is dropped without leaving the normal state.
            if    (s/^([^<]*)<!>//)  { print $1; }
            elsif (s/^([^<]*)<!--//) { print $1; $state = 'comment'; }
            elsif (s/^([^<]*)<//)    { print $1; $state = 'tag'; }
            else                     { print; $_ = ''; }
        }
        elsif ($state eq 'comment') {
            # The mode is chosen here, inside the state.  Testing
            # $irrComments in the state condition itself would leave the
            # lenient mode without a handler for the 'comment' state.
            if ($irrComments) {
                if (s/^.*?-->//s) { $state = 'normal'; }
                else              { $_ = ''; }      # comment continues
            }
            else {
                if (s/^.*?--//s) { $state = 'betweencomments'; }
                else             { $_ = ''; }       # comment continues
            }
        }
        elsif ($state eq 'betweencomments') {
            if    (s/^\s*>//)  { $state = 'normal'; }
            elsif (s/^\s*--//) { $state = 'comment'; }
            elsif (/^\s*$/)    { $_ = ''; }         # decided on a later line
            else               { die "IMPROPER HTML COMMENT" }
        }
        elsif ($state eq 'tag') {
            # Skip to whichever comes first: the end of the tag or the
            # start of a quoted attribute value.
            if (s/^[^>"']*([>'"])//) {
                if ($1 eq '>') { $state = 'normal'; }
                else           { $state = 'quote'; $quote = $1; }
            }
            else { $_ = ''; }                       # tag continues
        }
        elsif ($state eq 'quote') {
            # Skip up to and including the matching closing quote.
            if (s/^.*?\Q$quote\E//s) { $state = 'tag'; }
            else                     { $_ = ''; }   # value continues
        }
        else { die "UNKNOWN STATE ($state)" }
    }

    # Bare eof is true at the end of each input file, so markup left
    # unfinished in one file does not swallow the start of the next one.
    if (eof) { $state = 'normal' }
}