#!/bin/perl -s # (c) 2001 Vlado Keselj # # Find a CF rule in a forest (prints all trees containing a rule). # (Standard tree formats, e.g. Penn TreeBank.) # # Example: trees-grepTree.pl '[NP] -> [DT] .' f1 f2 # # By default it will print the whole tree. # Use: -s to print only the subtrees rooted at the nodes labeled with # the rule $SubtreesFlag=1 if $s; $Rule = shift; $CurrentToken='START'; scan(); while ( $CurrentToken ne '') { $Tree = $Subtree = $Found = ''; doTree(); if ($Found && !$SubtreesFlag) { print "$Tree\n" } } sub scan { scanBegin: return '' if $CurrentToken eq ''; if (! $CurrentLine) { $CurrentLine = <>; } if (! $CurrentLine) { $CurrentToken=''; return ''; } if ($CurrentLine =~ /^\s+/) { $Tree .= $&; $Subtree .= $&; $CurrentLine = $'; } if (! $CurrentLine ) { goto scanBegin; } if ($CurrentLine =~ /^(\(|\))/) { $CurrentToken = $1; $CurrentLine = $'; } else { $CurrentLine =~ /^([^\s()]+)/ or die; $CurrentToken = $1; $CurrentLine = $'; } $Tree .= $CurrentToken; $Subtree .= $CurrentToken; return $CurrentToken; } sub doTree { my $ruleHead; my $saveSubtree = $Subtree; $Subtree = ''; if ($CurrentToken eq '(') { my $rule; scan(); die if $CurrentToken eq ''; if ($CurrentToken eq '(') { $ruleHead = '_empty'; } else { $ruleHead = quoteMaybe($CurrentToken); scan(); } $ruleHead = "[$ruleHead]"; $rule = "$ruleHead -> "; do { $rule .= doTree().' '; die if $CurrentToken eq ''; } while ($CurrentToken ne ')'); my $st = $Subtree; scan(); $rule .= '.'; if ($rule eq $Rule) { $Found = 1; if ($SubtreesFlag) { print "($st\n" } } } else { $ruleHead = quoteMaybe($CurrentToken); scan(); } $Subtree = "$saveSubtree$Subtree"; return $ruleHead; } sub quoteMaybe { local $_; $_ = shift; return $_ if /^\w+$/; s/\\/\\\\/g; s/\'/\\\'/g; return "'$_'"; }