#!/usr/bin/perl
# 2002-2003 Vlado Keselj www.cs.dal.ca/~vlado
# Version: 1.1
# The newest version can be found at:
# http://vlado.keselj.net/srcperl/
#
# Cleans HTML tags.
#
# Usage: reads the files named on the command line (or standard input) and
# prints their text with tags and comments removed.
#
# Warning: Follows strict HTML syntax for comments (which may be
# counter-intuitive), e.g., valid comments are:
#   <!-- comment -->    <!---->    <!-- one -- -- two -->
# That is, a comment declaration is "<!", then any number of "-- ... --"
# comments separated only by whitespace, then ">".
#
# NOT FINISHED

use strict;
use warnings;

# Set to 1 to accept "irregular" comments, i.e. treat everything from "<!--"
# up to the next ">" as a comment instead of applying the strict syntax.
my $irrComments = '';

# strip_markup($text, $st)
#
# Removes tags and comments from one chunk of input and returns the text that
# remains.  Constructs may span several chunks, so the parser state is kept in
# the caller-owned hash reference $st and updated in place:
#   state     - one of 'normal', 'comment', 'betweencomments', 'tag', 'quote'
#   quote     - the quote character that opened the current attribute value
#               (only meaningful while state is 'quote')
#   irregular - true to use the irregular comment syntax described above
#
# Dies with "IMPROPER HTML COMMENT" when, in strict mode, something other than
# whitespace, "--" or ">" follows a closed "-- ... --" comment, and with
# "UNKNOWN STATE (...)" when $st->{state} holds an unexpected value.
sub strip_markup {
    my ($text, $st) = @_;
    my $visible = '';

    # Each pass consumes a prefix of $text (or all of it), so the loop always
    # terminates.
    while (length $text) {
        my $state = $st->{state};

        if ($state eq 'normal') {
            # [^<]* stops at the first "<", so the first alternative only
            # fires when that very "<" begins "<!--".
            if ($text =~ s/\A([^<]*)<!--//) {
                $visible .= $1;
                $st->{state} = 'comment';
            }
            elsif ($text =~ s/\A([^<]*)<//) {
                $visible .= $1;
                $st->{state} = 'tag';
            }
            else {
                # No markup in the rest of the chunk: keep it all.
                $visible .= $text;
                $text = '';
            }
        }
        elsif ($state eq 'comment') {
            if ($st->{irregular}) {
                # Irregular comment: ends at the first ">".
                if ($text =~ s/\A.*?>//s) {
                    $st->{state} = 'normal';
                }
                else {
                    $text = '';
                }
            }
            else {
                # Strict comment: the next "--" closes this comment, but the
                # declaration itself is still open until ">".
                if ($text =~ s/\A.*?--//s) {
                    $st->{state} = 'betweencomments';
                }
                else {
                    $text = '';
                }
            }
        }
        elsif ($state eq 'betweencomments') {
            if ($text =~ s/\A\s*>//) {
                $st->{state} = 'normal';
            }
            elsif ($text =~ s/\A\s*--//) {
                # Another "-- ... --" comment inside the same declaration.
                $st->{state} = 'comment';
            }
            elsif ($text =~ /\A\s*\z/) {
                # Only whitespace left; the decision is made on a later chunk.
                $text = '';
            }
            else {
                die "IMPROPER HTML COMMENT";
            }
        }
        elsif ($state eq 'tag') {
            # Skip to whichever comes first: the end of the tag or the start
            # of a quoted attribute value (which may itself contain ">").
            if ($text =~ s/\A[^>"']*([>"'])//) {
                if ($1 eq '>') {
                    $st->{state} = 'normal';
                }
                else {
                    $st->{state} = 'quote';
                    $st->{quote} = $1;
                }
            }
            else {
                $text = '';
            }
        }
        elsif ($state eq 'quote') {
            # Drop everything up to and including the matching quote.
            my $closing = quotemeta $st->{quote};
            if ($text =~ s/\A.*?$closing//s) {
                $st->{state} = 'tag';
            }
            else {
                $text = '';
            }
        }
        else {
            die "UNKNOWN STATE ($state)";
        }
    }

    return $visible;
}

# Filters every input line through strip_markup() and prints the result.
sub main {
    my $st = { state => 'normal', quote => '', irregular => $irrComments };

    while (my $line = <>) {
        print strip_markup($line, $st);

        # A tag or comment left open at the end of one input file must not
        # swallow the beginning of the next one.
        $st->{state} = 'normal' if eof;
    }

    return;
}

# Run the filter only when executed as a script, so the file can also be
# loaded (e.g. by a test) without reading any input.
main() unless caller;