#!/usr/bin/perl -s # (c) 2001 Vlado Keselj http://www.cs.dal.ca/~vlado # # A general script for several operations on syntactic trees. # (Standard tree formats, e.g. Penn TreeBank.) # # WARNING: The script is still being developed, and not all documented # features are implemented. # # See documentation at the end of file. $OpStat = 1 if $stat; $OpRules = 1 if $rules; $OpCats = 1 if $cats; # Efficiency reasons if ($OpStat || ($OpRules && $Sorted)) { $CollectRules=1 } if ($OpStat || ($OpCats && $Sorted)) { $CollectCats=1 } if ($grep || $egrep) { $OpGrep = 1; $RegExp = 1 if $egrep; $SubtreesFlag=1 if $s; $Rule = shift; if ($RegExp) { $Rule =~ s/\['[^\]]*'\]/\Q$&/g; print $Rule; } } $CurrentToken='START'; scan(); while ( $CurrentToken ne '') { $Tree = $Subtree = $Found = ''; doTree(); $Stat{'Number of sentences'} += 1; if ($OpGrep && $Found && !$SubtreesFlag) { print "$Tree\n" } } if ($Sorted) { if ($OpCats) { my $c; foreach $c (sort(keys %CatCount)) { print "$c\n"; } } if ($OpRules) { my $r; foreach $r (sort(keys %RuleCount)) { print "$r /$RuleCount{$r}\n"; } } } if ($OpStat) { my $i; $Stat{'Number of unique cats'} = scalar keys %CatCount; foreach $i ("Number of sentences", "Number of rules", "Number of unique rules", "Number of unique cats") { print "$i: $Stat{$i}\n"; } } sub scan { $Tree .= $CurrentToken; $Subtree .= $CurrentToken; scanBegin: return '' if $CurrentToken eq ''; if (! $CurrentLine) { $CurrentLine = <>; } if (! $CurrentLine) { $CurrentToken=''; return ''; } if ($CurrentLine =~ /^\s+/) { $Tree .= $&; $Subtree .= $&; $CurrentLine = $'; } if (! $CurrentLine ) { goto scanBegin; } if ($CurrentLine =~ /^(\(|\))/) { $CurrentToken = $1; $CurrentLine = $'; } else { $CurrentLine =~ /^([^\s()]+)/ or die; $CurrentToken = $1; $CurrentLine = $'; } return $CurrentToken; } sub doTree { my $ruleHead; my $saveSubtree = $Subtree; $Subtree = ''; if ($CurrentToken eq '(') { my $rule; scan(); die if $CurrentToken eq ''; if ($CurrentToken eq '(') { $ruleHead = '_empty'; } else { $ruleHead = $CurrentToken; scan(); } if ($CleanCats) { my $p = substr($ruleHead,0,1); $ruleHead = substr($ruleHead,1); $ruleHead = $` if $ruleHead =~ /[-=]/; $ruleHead = $p.$ruleHead; if ($p eq '-') { $ruleHead .= $p } } if ($CollectCats) { $CatCount{$ruleHead} += 1 } if ($OpCats && !$Sorted) { print "$ruleHead\n" } $ruleHead = quoteMaybe($ruleHead); $ruleHead = "[$ruleHead]"; $rule = "$ruleHead -> "; { my $mYield = ''; do { my $c = doTree(); if (!$NoEmpties || $Yield) { $rule .= $c.' '; $mYield .= " $Yield"; die if $CurrentToken eq ''; } } while ($CurrentToken ne ')'); $Yield = $mYield; $Yield =~ s/^ +//; } $Yield='' if $ruleHead eq '[\'-NONE-\']'; scan(); $rule .= '.'; if (!$NoEmpties || $Yield) { newRuleFound($rule) } } else { $Yield = $CurrentToken; $ruleHead = quoteMaybe($CurrentToken); scan(); } $Subtree = "$saveSubtree$Subtree"; return $ruleHead; } sub quoteMaybe { local $_; $_ = shift; return $_ if /^\w+$/; s/\\/\\\\/g; s/\'/\\\'/g; return "'$_'"; } sub newRuleFound { my $r = shift; if ($OpGrep && index($r, $Rule)==-1) { return } $Found = 1; $Stat{'Number of rules'} += 1; if ($CollectRules) { $RuleCount{$r} += 1; if ($RuleCount{$r} == 1) { $Stat{'Number of unique rules'} += 1; } } if ($OpRules) { if ($SubtreesFlag) { local $_; $_ =$Subtree; s/^\s+//; s/\s+$//; print "$_\n"; } else { print "$r\n" } } } __END__ ######################################################################## # Documentation =pod =head1 NAME trees-do (or trees-do.pl) - A general script for several operations on syntactic trees. (Standard tree formats, e.g. Penn TreeBank.) =head1 SYNOPSIS trees-do {flags} files... =head1 DESCRIPTION Flags: -stat print statistics about trees at the end -cats print all categories (non-terminals) -rules print rules (with counts) -Sorted sorted output of rules or cats -grep rule print trees containing rule -grep -s rule print subtrees rooted at a node labeled by a rule (instead of rule it can be a substring of rule) -egrep use of regular expressions -CleanCats removes everything after - or =, which is not in the first position -NoEmpties removed empty components (-NONE- produces empty) =head1 AUTHOR Copyright 2001 Vlado Keselj > This script is provided "as is" without express or implied warranty. =cut