#!/usr/bin/perl -w00

#++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++
#
# Copyright (c) 1997-2002 James Blustein
#
# TITLE:          insert-link-track-calls.pl (insert-calls.pl for short)
# AUTHOR:         J. Blustein <jamie@cs.dal.ca> (see below for credit to 
#                                                basic source)
# CREATED:        17 Nov 1997
# LAST MODIFIED:  15 Sep 2002 (v3.0.2a)
#
# PURPOSE: Replaces HTML HREF anchors (anchors with outgoing links) with
#          calls to the JavaScript function used with the link-tracker.pl
#          script to record which links readers follow.
#
# NOTES: The HTML source code must be valid (according to the HTML 3.2
#       standard or below) and abide by the following additional three
#       restrictions: 
#         (a) no comments inside of anchor elements 
#         (b) no occurance of either `<A' or `<a' (two-character strings)
#             anywhere in the code except in anchors, including comments!
#         (c) no empty lines inside anchors
#
#        The HTML parsing might be more robust (and easier) with the HTML
#        module from CPAN.
#
###########################################################################
# Based on a program by Tom Christiansen:
#
#   HTML Hacking with Regular Expressions
#   by Tom Christiansen
#   in The Perl Journal 
#      vol. 1, no. 1, Spring 1996
#      pages 24 - 29
#
# Original source obtained from 
#   <URL:http://orwant.www.media.mit.edu/tpj/programs/Issue_1_HTMLregexps/
#        finding_links_2>
#   on 17 November 1997
#
#-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

require 5.004;

use CGI qw(escape);
use strict;
use File::Basename;

use constant VERSION => 'insert-calls.pl 3.0.2a (02 Jun 1999)';
use constant DEBUG_OUTPUT => 0; # 0 is off
use constant VERBOSE => 1;
use constant CALL_CGI_DIRECTLY => 0; # call JavaScript iff 0, 
                                     # call CGI otherwise

use constant JSPROG       => "go"; # go(HREF, NAME, TITLE)
 use constant CGIPROG     => "/cgi-bin/tracker";
 use constant DOC         => "1";
 use constant CGI_SEP     => "&";

my($PROG) = basename($0);

BEGIN {
    print STDERR "This is `" . VERSION . "'\n";
}

#++
#  Constants
#    CGIPROG:  The CGI program to call when a link is followed.
#              The NAME, HREF, and TITLE of the referring anchor will be
#             passed to the CGI program
#
#    DOC:      Something that can be resolved to the address (URL) of the
#             document into which the CGI calls are being inserted.  This
#             is needed to resolve all the HREFs.  You could use the entire
#             URL but that would take much longer to transmit than a short
#             identification number.
#
#    CGI_SEP:  Character that separates arguments to CGI program.
#             According to the specification a semicolon or an ampersnad
#             can be used (with a preference for a semicolon).  I have to
#             use ampersand because the CGI.pm module doesn't work with
#             semicolons.
#--


#++
# Global variables
#--
my(%unique_name); # these values are used below to create a name for links
                  # where no name is provided in the anchor

#-- About %pattern and %tag --
#  For my experiment I made several types of hypertext links (all of which
# were coded in HTML).  The NAME of the anchor included a label to identify
# the link type.  

my %pattern; # identifying regexps for the major types of links
my %tag;     # if a link has no name then it will be assigned one
             # composed of its $tag and a unique number

$tag{sem}    = "S"; # S for semantic
$tag{defn}   = "D"; # D for definition
$tag{struct} = "P"; # P for plain
$tag{other}  = "L"; # L for link 

$unique_name{sem}    = 1;
$unique_name{defn}   = 1;
$unique_name{struct} = 1;
$unique_name{other}  = 1;

$pattern{sem} = '(?:' . join ('|', qw{ 
                                      sent2sect
                                      s2sect
                                      sent2sent
                                      s2sent
                                      summary
                                      summ
                                      sent2para
                                      s2p
                                      s2para
                                      wg2sent
                                     }
                             )
                      . ')';

$pattern{defn} = '(?:' . join ('|', qw{
                                       defn:sys
                                       defn:term
                                       defn:abbr
                                       defn
                                      }
                              )
                       . ')';

$pattern{struct} = '(?:' . join ('|', qw{
                                         fn:back-jump
                                         list:LoF
                                         list:LoT
                                         list:ToC
                                         list:ToS
                                         list:jump
                                         list:localC
                                         struct
                                         struct:BottomUp
                                         struct:Footnote
                                         struct:LoF
                                         struct:LoT
                                         struct:Ref
                                         struct:Up2ToC
                                         struct:manual
                                        }
                                )
                         . ')';


# METHOD:
#   If there is an anchor in this paragraph then grab the text before and
#  after the anchor and the parts of the anchor itself (for processing in
#  the while loop).  Otherwise just print out the input line exactly as it
#  appears.
   
while (<>) {
   if ( m{<A} ) {
     while ( m{        # match repeatedly with /g
        \G(.*?)        # text before the anchor
                       #
        < \s* A        # this is an anchor
          \s+          #
          (HREF        # a link spec           \
          |NAME        # or a link destination  + saved as $2 
          |TITLE)      # or a link name        /
          \s* = \s*    # here comes the field described by $2
            ( ["'] )   # either quote, saved in $3 and \3
            ( .*? )    # the whole link, saved in $4
              \3       # the original $3 quote
                       #
          (?:\s*       # perhaps a second field
           (HREF       # a link spec            \
           |NAME       # or a link destination   + saved as $5
           |TITLE)     # or a link name         /
           \s* = \s*   # here comes the field described by $5
             ( ["'] )  # either quote, saved in $6 and \6
             ( .*? )   # the whole link, saved in $7
               \6      # the original $6 quote
          )?           # there might not be a second field
                       #
          (?:\s*       # perhaps a third field
           (HREF       # a link spec            \
           |NAME       # or a link destination   + saved as $8
           |TITLE)     # or a link name         /
           \s* = \s*   # here comes the field described by $8
             ( ["'] )  # either quote, saved in $9 and \9
             ( .*? )   # the whole link, saved in $10
               \9      # the original $9 quote
          )?           # there might not be a third field
                       #
         (.*? >)       # the rest of the tag 
                       #
         (.*?)         # text after the tag up to
         (?=<A|\Z)     # the end of the line or the next tag
              }xsgi)   # /x for expanded patterns
                       # /s so . can match \n
                       # /g to get multiple hits in one paragraph
                       # /i for case insensitivity on A and HREF|NAME|TITLE
     {
         my(@field) = (); # a place to keep the patterns matched above
         my($after);  # text that appeared on the line but before the <A
         my($before); # text that appeared on the line but after  the <A
         my($href);   # value of HREF field
         my($name);   # value of NAME field
         my($title);  # value of TITLE field
         my($rest);   # all of the other parts of the <A .. </A>
     
         #-- copying the patterns matched into declared variables
          # Would split() do this better?
          # Would pointers help here?
          # This works and that's good enough for me now and it can be
          # improved in a future version
         $field[ 1] =  $1;  $field[ 2] =  $2;  $field[ 3] =  $3; 
         $field[ 4] =  $4;  $field[ 5] =  $5;  $field[ 6] =  $6;
         $field[ 7] =  $7;  $field[ 8] =  $8;  $field[ 9] =  $9;
         $field[10] = $10;  $field[11] = $11;  $field[12] = $12;
     
         $before=$field[ 1] if defined $field[ 1];
         $rest  =$field[11] if defined $field[11];
         $after =$field[12] if defined $field[12];
     
         #-- filling in the values of $href, $name, and $title 
         #-- from the patterns matched above
         $href="";
         $href=$field[ 4] if $field[ 2] =~ m/HREF/i;
         $href=$field[ 7] if defined $field[ 5] && $field[ 5] =~ m/HREF/i;
         $href=$field[10] if defined $field[ 8] && $field[ 8] =~ m/HREF/i;
     
         $name="";
         $name=$field[ 4] if $field[ 2] =~ m/NAME/i;
         $name=$field[ 7] if defined $field[ 5] && $field[ 5] =~ m/NAME/i;
         $name=$field[10] if defined $field[ 8] && $field[ 8] =~ m/NAME/i;
     
         $title="";
         $title=$field[ 4] if $field[ 2] =~ m/TITLE/i;
         $title=$field[ 7] if defined $field[ 5] && $field[ 5] =~ m/TITLE/i;
         $title=$field[10] if defined $field[ 8] && $field[ 8] =~ m/TITLE/i;
     
         # if there is an HREF then replace it with our CGI call
         # else print out the anchor unchanged (except possibly reordered)
         my($elements)="";
         $elements .= "TITLE=\"$title\" " if "" ne $title;
         $elements .= "NAME=\"$name\" "   if "" ne $name;
 
         if ("" eq $href) {        
             $elements .= "$rest" if "" ne $rest;
             print "$before<A $elements$after";
         } else { 
             my($cgi_args) = "";
             if ("" eq $name) { # every link needs a NAME, so when there
                                # isn't one provided we make one using a
                                # global count
                                # (this would be a great place for
                                #  an array & loop to make the program
                                #  easier to read and update.)
		   if ($title =~ /^$pattern{sem}\s/) {
		       $name = $tag{sem} . $unique_name{sem}++;
		   }
		   elsif ($title =~ /^$pattern{defn}$/) {
		       $name = $tag{defn} . $unique_name{defn}++;
		   }
		   elsif ($title =~ /^$pattern{struct}$/) {
		       $name = $tag{struct} . $unique_name{struct}++;
		   }
		   else {
		       $name = $tag{other} . $unique_name{other}++;
		   }
                 $elements .= "NAME=\"$name\" ";
	       }

	       if (CALL_CGI_DIRECTLY) {
                $cgi_args .= "n=".&escape($name)  .CGI_SEP; # coming from 
                                                            # named anchor
                                                            # in ...
                $cgi_args .= "d=".DOC             .CGI_SEP; # this document
                $cgi_args .= "h=".&escape($href)  .CGI_SEP; # to here, with 
                $cgi_args .= "t=".&escape($title) .CGI_SEP; # this additional
                                                            # info attached
                $elements .= "HREF=\"" . CGIPROG . "?$cgi_args\" ";
                $elements .= "$rest" if "" ne $rest;
	      } else { # inserting call to JavaScript instead
		  $elements .= "onClick=\"" . JSPROG . 
                             "('" . &escape($href)  . "', '" .
                                    &escape($name)  . "', '" .
		             	      &escape($title) . "')\" ";
                $elements .= "HREF=\"" . $href . "\" ";
                $elements .= "$rest" if "" ne $rest;
	      }
	      print "$before<A $elements$after";
         }
     } # while we're in an anchor
   } else { # no anchors found in the input line
       print;
   }
}