#!/usr/bin/perl # (c) 2001 Vlado Keselj # # Extract CF rules from a forest. # (Standard tree formats, e.g. Penn TreeBank.) # `sort -u' on output is sometimes a good idea. # # Example: on input (Johnson 99): # # (S1 (S (NP (N Bill)) # (VP (V thought) # (S (NP (N Hillary)) # (VP (V left)) # (AD yesterday)))) # (. .) ) # # (S1 (S (NP (N Bill)) # (VP (V thought) # (S (NP (N Hillary)) # (VP (V left)))) # (AD yesterday)) # (. .) ) # # produces the output: # [N] -> Bill . # [NP] -> [N] . # [V] -> thought . # [N] -> Hillary . # [NP] -> [N] . # [V] -> left . # [VP] -> [V] . # [AD] -> yesterday . # [S] -> [NP] [VP] [AD] . # [VP] -> [V] [S] . # [S] -> [NP] [VP] . # ['.'] -> '.' . # [S1] -> [S] ['.'] . # [N] -> Bill . # [NP] -> [N] . # [V] -> thought . # [N] -> Hillary . # [NP] -> [N] . # [V] -> left . # [VP] -> [V] . # [S] -> [NP] [VP] . # [VP] -> [V] [S] . # [AD] -> yesterday . # [S] -> [NP] [VP] [AD] . # ['.'] -> '.' . # [S1] -> [S] ['.'] . $CurrentToken='START'; scan(); while ( $CurrentToken ne '') { doTree(); } sub scan { scanBegin: return '' if $CurrentToken eq ''; if (! $CurrentLine) { $CurrentLine = <>; } if (! $CurrentLine) { $CurrentToken=''; return ''; } $CurrentLine =~ s/^\s+//; if (! $CurrentLine ) { goto scanBegin; } if ($CurrentLine =~ /^(\(|\))\s*/) { $CurrentToken = $1; $CurrentLine = $'; } else { $CurrentLine =~ /^([^\s()]+)\s*/ or die; $CurrentToken = $1; $CurrentLine = $'; } return $CurrentToken; } sub doTree { my $ruleHead; if ($CurrentToken eq '(') { my $rule; scan(); die if $CurrentToken eq ''; if ($CurrentToken eq '(') { $ruleHead = '_empty'; } else { $ruleHead = quoteMaybe($CurrentToken); scan(); } $ruleHead = "[$ruleHead]"; $rule = "$ruleHead -> "; do { $rule .= doTree().' '; die if $CurrentToken eq ''; } while ($CurrentToken ne ')'); scan(); print "$rule.\n"; } else { $ruleHead = quoteMaybe($CurrentToken); scan(); } return $ruleHead; } sub quoteMaybe { local $_; $_ = shift; return $_ if /^\w+$/; s/\\/\\\\/g; s/\'/\\\'/g; return "'$_'"; }