chartparse_pa.pm
use strict;
package Chartparse_pa;
use Exporter;
BEGIN {
use Exporter ();
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(&chart_parse &print_chart &print_struct &spanning_edges
&print_spanning &file_to_lexiconref &file_to_rulesref);
}
# Chartparse.pm
# by Gabe Webster
# 4 May 2001
# Bottom-up CFG chart parser, no unification or packing, but with edges
# indexed by start (inactive)/end (active) vertices for efficiency, and with
# words and rules indexed by leftmost daughter.
# This version includes predicate-argument structure interpretation
# The edge data structure: @edge = ($start, $end, $structref, $tofindref,
# $predarg)
# $predarg is the predicate-argument structure of the current edge.
# $tofindref is a reference to an array of edge categories to find.
# $structref is a reference to a structure of the form (mothercat,
# daughtertree1ref, daughtertree2ref, etc.) So mother category is just
# struct[0], while daughter categories are struct[1]->[0], struct[2]->[0], etc.
# Thus, $edgeref->[2][0] is the category of the edge, while $edgeref->[3][0]
# is the first element of the tofind list.
my @agenda; # agenda of edges to be added to chart and processed
my %chart_inacts; # hash of inactive edges in the chart, indexed by left edge
my %chart_acts; # hash of active edges in the chart, indexed by right edge
my $debuglevel;
sub dummy
# This is how to call the chart parser with a hard-coded lexicon and rule set.
{
my $wordsref = ["my", "dog", "has", "fleas"];
my $rulesref;
my $lexiconref;
%$lexiconref = ( "my" => [[["D","my"],"Lx.my(x)"]],
"dog" => [[["N","dog"],"dog"]],
"has" => [[["V", "has"],"Ly.Lx.has(x,y)"]],
"fleas" => [[["N", "fleas"],"fleas"]]
);
%$rulesref = ( "D" => [[["NP","D","N"],["f","a"]]],
"V" => [[["VP","V","N"],["f","a"]]],
"NP" => [[["S","NP","VP"],["a","f"]]]
);
# input to chart_parse is an array of the words to be parsed.
chart_parse($wordsref, $rulesref, $lexiconref, 1);
print_chart(1,1); # two args: whether to print the structure contained within
# edges (T/F), and whether to print active edges along with
# inactive (T/F)
my @spanning_edgerefs = spanning_edges();
print $#spanning_edgerefs+1, " spanning edge";
if ($#spanning_edgerefs != 0) {print "s";}
print " found:\n";
foreach my $edgeref (@spanning_edgerefs) { print_struct($edgeref->[2]); }
}
sub chart_parse {
my ($wordsref, $rulesref, $lexiconref);
($wordsref, $rulesref, $lexiconref, $debuglevel) = @_;
%chart_inacts = (); # clear the hashes
%chart_acts = ();
add_lexentries($wordsref, $lexiconref);
process_agenda($rulesref);
@agenda = ();
return \%chart_inacts;
}
sub spanning_edges {
my @spanning_edgerefs = ();
my $array_ref = $chart_inacts{0};
my @allkeys = keys %chart_inacts;
my $numkeys = $#allkeys; # num - 1
foreach my $edgeref (@$array_ref) {
if ($edgeref->[1] == $numkeys+1) {push @spanning_edgerefs, $edgeref;}
}
return @spanning_edgerefs;
}
sub print_spanning {
my @spanning_edgerefs = @_;
print $#spanning_edgerefs+1, " spanning edge";
if ($#spanning_edgerefs != 0) {print "s";}
print " found:\n";
foreach my $edgeref (@spanning_edgerefs) { print_struct($edgeref->[2],
$edgeref->[4]); }
}
sub print_chart {
my($structp, $activep) = @_;
print "Inactive edges:\n";
my $thiskey;
foreach $thiskey (sort keys %chart_inacts) {
my $array_ref = $chart_inacts{$thiskey};
my $edgeref;
foreach $edgeref (@$array_ref) {
print "Start ", $edgeref->[0], ", end ", $edgeref->[1];
if (!$structp) { print ", cat $edgeref->[2][0]\n"; }
else { print "\n"; print_struct($edgeref->[2], $edgeref->[4]);
print "\n"; }
}
}
if ($activep) {
print "Active edges:\n";
foreach $thiskey (sort keys %chart_acts) {
my $array_ref = $chart_acts{$thiskey};
my $edgeref;
foreach $edgeref (@$array_ref) {
print "Start ", $edgeref->[0], ", end ", $edgeref->[1];
if (!$structp) { print ", cat $edgeref->[2][0], "; }
else { print "\n";
print_struct($edgeref->[2], $edgeref->[4]); }
my $tofind;
print "To find: ";
foreach $tofind (@{$edgeref->[3]}) { print $tofind, " "; }
print "\n";
if ($structp) {print "\n";}
}
}
}
}
sub print_struct2 {
my ($structref, $predarg) = @_;
print "Pred-arg:";
if ($predarg->[0] ne "") { print " value=", $predarg->[0]; }
my $arg;
foreach $arg (@{$predarg->[1]}) { print " arg=", $arg; }
print "\nStruct:"; print_struct_help($structref, 0); print "\n";
}
sub print_struct2_help {
my ($structref, $numspaces) = @_;
if ($#$structref > 0) {
print "\n";
for(my $i = 0; $i<$numspaces; $i++) { print " "; }
}
elsif ($numspaces == 0) { print "\n"; }
else { print ":"; }
print $structref->[0];
if ($#$structref > 0) {
my @restarray = @{$structref}[1..$#$structref];
my $daughterref;
foreach $daughterref (@restarray) {
print_struct_help($daughterref, $numspaces+1);
}
}
}
sub print_struct {
my ($structref, $predarg) = @_;
print "Pred-arg:";
if ($predarg->[0] ne "") { print " value=", $predarg->[0]; }
my $arg;
foreach $arg (@{$predarg->[1]}) { print " arg=", $arg; }
print "Struct:";
print_struct_help($structref, 0, [""], 0); print "\n";
}
sub print_struct_help {
my ($structref, $numspaces, $printline, $daughterno) = @_;
if ($#$structref > 0) { # the current tree has daughters
if ($daughterno > 1) { $daughterno = 1; }
for my $j (0 .. $daughterno) { # don't add extra line for first daughter, but yes
# for other daughters
print "\n";
for my $i (0 .. $numspaces-1) {
print " ";
if ($i == $numspaces-1) { print "|"; } # add a line just before printing the cat
else { print $printline->[$i]; }
}
}
print $structref->[0];
my @daughterarray = @{$structref}[1..$#$structref];
$printline->[$numspaces] = "|";
for my $i (0 .. $#daughterarray) {
if ($i == $#daughterarray) { $printline->[$numspaces] = " "; }
print_struct_help($daughterarray[$i], $numspaces+1, $printline,
$i);
}
}
else { # the current tree has no daughters
print " - $structref->[0]";
}
}
sub add_lexentries {
# Look up words in the lexicon and add all lexical entries to the agenda of
# edges to be processed. A word can have multiple categories, since the value
# of the lexicon hash is a reference to an array of lexical entries.
my ($wordsref, $lexiconref) = @_;
my $word;
my $wordno = 0;
foreach $word (@$wordsref) {
foreach my $lexentryref (@{$lexiconref->{$word}}) {
push @agenda, [$wordno,
$wordno+1,
[$lexentryref->[0][0], [$word]],
[],
[$lexentryref->[1],[]]];
if ($debuglevel > 0) {
print "\nAdding edge to agenda: ";
print "Start $wordno, end ", $wordno+1, "\n";
print_struct([$lexentryref->[0][0], [$word]],
[$lexentryref->[1],[]]);
}
}
$wordno++;
}
}
sub process_agenda {
# Go through all edges in agenda (@agenda) until there are no
# more. Keep in mind that more edges may be added to the end of @agenda
# in the body of the foreach loop.
my ($rulesref) = @_;
my $edgeref;
# In the next line, "shift @agenda" takes the element off the beginning
# of the agenda, resulting in a breadth-first parser. Changing this to
# "pop @agenda" will result in a depth-first parser.
while (defined($edgeref = shift @agenda)) {
if ($debuglevel > 0) {
print "\nAdding edge to chart: ";
print "Start $edgeref->[0], end $edgeref->[1]\n";
print_struct($edgeref->[2],$edgeref->[4]);
}
if ($#{$edgeref->[3]} == -1) { # if tofind is null, edge is inactive,
# so index it by its starting number
process_inactive($edgeref, $rulesref);
}
else { # edge is active, index it by its end number
if ($debuglevel > 0) { print "To find: @{$edgeref->[3]}\n" }
process_active($edgeref, $rulesref);
}
}
}
sub process_inactive {
# This is the processing of a single inactive edge.
my ($edgeref, $rulesref) = @_;
# push the edge onto the end of the correct array in the hash table
push @{$chart_inacts{$edgeref->[0]}}, $edgeref;
# combine inactive edge with any preceding active edges;
my $array_ref = $chart_acts{$edgeref->[0]}; # actives whose end is same
# as current inactive's start
my $activeref;
foreach $activeref (@$array_ref) {
if ($activeref->[3][0] eq $edgeref->[2][0]) { # if first tofind cat =
combine_edges ($activeref, $edgeref);
}
}
# Look for rule that has that edge's category as leftmost daughter.
my $ruleref;
foreach $ruleref (@{$rulesref->{$edgeref->[2][0]}}) {
my @newtofind = ();
my $i = 2;
while (defined($ruleref->[0][$i])) {
push @newtofind, $ruleref->[0][$i];
push @newtofind, $ruleref->[1][$i-1];
$i++;
}
my $predarg;
if ($ruleref->[1][0] eq "f") { $predarg = [$edgeref->[4][0],[]]; }
else { $predarg = ["",[$edgeref->[4][0]]]; }
push @agenda, [$edgeref->[0],
$edgeref->[1],
[$ruleref->[0][0], $edgeref->[2]],
[@newtofind],
$predarg];
if ($debuglevel > 0) {
print "\nAdding ps-rule edge to agenda: ";
print "Start $edgeref->[0], end $edgeref->[1]\n";
print_struct([$ruleref->[0][0], $edgeref->[2]],
$predarg);
if ($#newtofind > -1) { print "To find: @newtofind\n"; }
}
}
}
sub process_active {
# This is the processing of one active edge.
my $edgeref = $_[0];
# push the edge onto the correct array of the active hash table
push @{$chart_acts{$edgeref->[1]}}, $edgeref;
# look for inactive edges in the chart beginning at the end of the active
# edge
my $array_ref = $chart_inacts{$edgeref->[1]}; # inactives whose beginning
# is same as
# current active's end
my $inactref;
foreach $inactref (@$array_ref) {
if ($inactref->[2][0] eq $edgeref->[3][0]) { # if first tofind cat =
combine_edges ($edgeref, $inactref);
}
}
}
sub combine_edges {
# When the first "tofind" category of an active edge matches the category
# of an inactive edge, add a new edge combining the two edges to the processing
# agenda.
my ($activeref, $inactref) = @_;
my @newtofind = ();
my $i = 2;
while (defined($activeref->[3][$i])) {
push @newtofind, $activeref->[3][$i];
$i++;
}
my $predarg;
my $new_funct = "";
my @new_args = @{$activeref->[4][1]};
if ($activeref->[3][1] eq "f") {
$new_funct = $inactref->[4][0];
}
else {
$new_funct = $activeref->[4][0];
push @new_args, $inactref->[4][0];
}
if (($new_funct ne "") && ($#new_args > -1)) {
my $arg;
foreach $arg (@new_args) {
$new_funct =~ /^L(\w)\.(.+)$/;
$new_funct = $2;
my $temp = $1;
while ($new_funct =~ s/([^\w])$temp([^\w])/$1$arg$2/) { 1; }
}
$predarg = [$new_funct,[]];
}
else { $predarg = [$new_funct,[@new_args]]; }
push @agenda, [$activeref->[0],
$inactref->[1],
[@{$activeref->[2]}, $inactref->[2]], # add new daughter
# to struct --
# it's flattened
# automatically
[@newtofind],
$predarg];
if ($debuglevel > 0) {
print "\nAdding combined edge to agenda: ";
print "Start $activeref->[0], end $inactref->[1]\n";
print_struct([@{$activeref->[2]}, $inactref->[2]], $predarg);
if ($#newtofind > -1) { print "To find: @newtofind\n"; }
}
}
sub file_to_rulesref {
my $rulesfile = $_[0];
my %rules;
my $curline = "";
open (DATAFILE, "<$rulesfile") or die "Couldn't open file\n";
while (defined($curline = )) {
chomp($curline);
if ($curline =~ /^\s*(\S+)\s*->\s*(\S+(?:\s+\S+)*)\s*:\s*([fa](?:\s+[fa])*)\s*$/) {
my @daughters = split " ", $2;
my @f_as = split " ", $3;
push @{$rules{$daughters[0]}}, [[$1, @daughters],[@f_as]];
}
}
close (DATAFILE) or die "Couldn't close file\n";
return \%rules;
}
sub file_to_lexiconref {
my $lexiconfile = $_[0];
my %lexicon;
my $curline = "";
open (DATAFILE, "<$lexiconfile") or die "Couldn't open file\n";
while (defined($curline = )) {
chomp($curline);
if ($curline =~ /^\s*(\S+)\s+(\S+)\s*:\s*(\S+)\s*$/) {
push @{$lexicon{$1}}, [[$2, $1], $3];
}
}
close (DATAFILE) or die "Couldn't close file\n";
return \%lexicon;
}
1;