chartparse_pa.pm

use strict;

package Chartparse_pa;
use Exporter;

BEGIN {
   use Exporter ();
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(&chart_parse &print_chart &print_struct &spanning_edges
             &print_spanning &file_to_lexiconref &file_to_rulesref);
}

# Chartparse.pm
# by Gabe Webster
# 4 May 2001

# Bottom-up CFG chart parser, no unification or packing, but with edges
# indexed by start (inactive)/end (active) vertices for efficiency, and with
# words and rules indexed by leftmost daughter.

# This version includes predicate-argument structure interpretation

# The edge data structure: @edge = ($start, $end, $structref, $tofindref,
#                                   $predarg)
# $predarg is the predicate-argument structure of the current edge.
# $tofindref is a reference to an array of edge categories to find.  
# $structref is a reference to a structure of the form (mothercat, 
# daughtertree1ref, daughtertree2ref, etc.)  So mother category is just 
# struct[0], while daughter categories are struct[1]->[0], struct[2]->[0], etc.

# Thus, $edgeref->[2][0] is the category of the edge, while $edgeref->[3][0]
# is the first element of the tofind list.

my @agenda;       # agenda of edges to be added to chart and processed
my %chart_inacts; # hash of inactive edges in the chart, indexed by left edge
my %chart_acts;   # hash of active edges in the chart, indexed by right edge
my $debuglevel;

sub dummy 
# This is how to call the chart parser with a hard-coded lexicon and rule set.
{
my $wordsref = ["my", "dog", "has", "fleas"];

my $rulesref;
my $lexiconref;

%$lexiconref = ( "my" => [[["D","my"],"Lx.my(x)"]],
                 "dog" => [[["N","dog"],"dog"]], 
                 "has" => [[["V", "has"],"Ly.Lx.has(x,y)"]], 
                 "fleas" => [[["N", "fleas"],"fleas"]]
               );
%$rulesref = ( "D" => [[["NP","D","N"],["f","a"]]], 
               "V" => [[["VP","V","N"],["f","a"]]], 
               "NP" => [[["S","NP","VP"],["a","f"]]]
             );

# input to chart_parse is an array of the words to be parsed.

chart_parse($wordsref, $rulesref, $lexiconref, 1);
print_chart(1,1); # two args:  whether to print the structure contained within
                  # edges (T/F), and whether to print active edges along with
                  # inactive (T/F)

my @spanning_edgerefs = spanning_edges();

print $#spanning_edgerefs+1, " spanning edge"; 
if ($#spanning_edgerefs != 0) {print "s";}
print " found:\n";
foreach my $edgeref (@spanning_edgerefs) { print_struct($edgeref->[2]); }

}

sub chart_parse {

    my ($wordsref, $rulesref, $lexiconref);
    ($wordsref, $rulesref, $lexiconref, $debuglevel) = @_;
    %chart_inacts = (); # clear the hashes
    %chart_acts = ();
    add_lexentries($wordsref, $lexiconref);
    process_agenda($rulesref);
    @agenda = ();
    return \%chart_inacts;
}

sub spanning_edges {

  my @spanning_edgerefs = ();
  my $array_ref = $chart_inacts{0};
  my @allkeys = keys %chart_inacts;
  my $numkeys = $#allkeys; # num - 1
  foreach my $edgeref (@$array_ref) {
    if ($edgeref->[1] == $numkeys+1) {push @spanning_edgerefs, $edgeref;}
  }
  return @spanning_edgerefs;
}

sub print_spanning {
    my @spanning_edgerefs = @_;

    print $#spanning_edgerefs+1, " spanning edge";
    if ($#spanning_edgerefs != 0) {print "s";}
    print " found:\n";
    foreach my $edgeref (@spanning_edgerefs) { print_struct($edgeref->[2],
							    $edgeref->[4]); }
}

sub print_chart {
    my($structp, $activep) = @_;
    print "Inactive edges:\n";
    my $thiskey;
    foreach $thiskey (sort keys %chart_inacts) {
	my $array_ref = $chart_inacts{$thiskey};
        my $edgeref;
	foreach $edgeref (@$array_ref) {
	    print "Start ", $edgeref->[0], ", end ", $edgeref->[1];
	    if (!$structp) { print ", cat $edgeref->[2][0]\n"; }
	    else { print "\n"; print_struct($edgeref->[2], $edgeref->[4]); 
		   print "\n"; }
	}
    }
    if ($activep) {
	print "Active edges:\n";
	foreach $thiskey (sort keys %chart_acts) {
	    my $array_ref = $chart_acts{$thiskey};
	    my $edgeref;
	    foreach $edgeref (@$array_ref) {
		print "Start ", $edgeref->[0], ", end ", $edgeref->[1];
		if (!$structp) { print ", cat $edgeref->[2][0], "; }
		else { print "\n"; 
		       print_struct($edgeref->[2], $edgeref->[4]); }
		my $tofind;
		print "To find: ";
		foreach $tofind (@{$edgeref->[3]}) { print $tofind, " "; }
		print "\n";
		if ($structp) {print "\n";}
	    }
	}
    }
}

sub print_struct2 { 
    my ($structref, $predarg) = @_;
    print "Pred-arg:";
    if ($predarg->[0] ne "") { print " value=", $predarg->[0]; }
    my $arg;
    foreach $arg (@{$predarg->[1]}) { print " arg=", $arg; }
    print "\nStruct:"; print_struct_help($structref, 0); print "\n";
}

sub print_struct2_help {
  my ($structref, $numspaces) = @_;
  if ($#$structref > 0) {
    print "\n";
    for(my $i = 0; $i<$numspaces; $i++) { print "  "; }
  }
  elsif ($numspaces == 0) { print "\n"; }
  else { print ":"; }
  print $structref->[0];
  if ($#$structref > 0) { 
    my @restarray = @{$structref}[1..$#$structref];
    my $daughterref;
    foreach $daughterref (@restarray) {
       print_struct_help($daughterref, $numspaces+1);
    }
  }
}

sub print_struct { 
    my ($structref, $predarg) = @_;
    print "Pred-arg:";
    if ($predarg->[0] ne "") { print " value=", $predarg->[0]; }
    my $arg;
    foreach $arg (@{$predarg->[1]}) { print " arg=", $arg; }
    print "Struct:"; 
    print_struct_help($structref, 0, [""], 0); print "\n";
}

sub print_struct_help {
    my ($structref, $numspaces, $printline, $daughterno) = @_;
    if ($#$structref > 0) { # the current tree has daughters
	if ($daughterno > 1) { $daughterno = 1; }
	for my $j (0 .. $daughterno) { # don't add extra line for first daughter, but yes
	     # for other daughters
	    print "\n";
	    for my $i (0 .. $numspaces-1) { 
		print "  ";
		if ($i == $numspaces-1) { print "|"; } # add a line just before printing the cat
		else { print $printline->[$i]; } 
	    }
	}
	print $structref->[0];
	my @daughterarray = @{$structref}[1..$#$structref];
	$printline->[$numspaces] = "|";
	for my $i (0 .. $#daughterarray) {
	    if ($i == $#daughterarray) { $printline->[$numspaces] = " "; }
	    print_struct_help($daughterarray[$i], $numspaces+1, $printline, 
			      $i);
	}
    }
    else { # the current tree has no daughters
	print " - $structref->[0]";
    }
}

sub add_lexentries {
# Look up words in the lexicon and add all lexical entries to the agenda of
# edges to be processed.  A word can have multiple categories, since the value
# of the lexicon hash is a reference to an array of lexical entries.

    my ($wordsref, $lexiconref) = @_;
    my $word;
    my $wordno = 0;
    foreach $word (@$wordsref) {
	foreach my $lexentryref (@{$lexiconref->{$word}}) {
	    push @agenda, [$wordno, 
			   $wordno+1, 
			   [$lexentryref->[0][0], [$word]], 
			   [],
			   [$lexentryref->[1],[]]];
	    if ($debuglevel > 0) {
		print "\nAdding edge to agenda: ";
		print "Start $wordno, end ", $wordno+1, "\n";
		print_struct([$lexentryref->[0][0], [$word]], 
			     [$lexentryref->[1],[]]);
	    }
	}
	$wordno++;
    }
}

sub process_agenda {
# Go through all edges in agenda (@agenda) until there are no 
# more.  Keep in mind that more edges may be added to the end of @agenda 
# in the body of the foreach loop.

    my ($rulesref) = @_;
    my $edgeref;

    # In the next line, "shift @agenda" takes the element off the beginning
    # of the agenda, resulting in a breadth-first parser.  Changing this to 
    # "pop @agenda" will result in a depth-first parser.

	   while (defined($edgeref = shift @agenda)) {

	       if ($debuglevel > 0) {
		   print "\nAdding edge to chart: ";
		   print "Start $edgeref->[0], end $edgeref->[1]\n";
		   print_struct($edgeref->[2],$edgeref->[4]);
	       }

   	  if ($#{$edgeref->[3]} == -1) { # if tofind is null, edge is inactive,
	                                  # so index it by its starting number
	       process_inactive($edgeref, $rulesref);
	     }
	     else { # edge is active, index it by its end number
		 if ($debuglevel > 0) { print "To find: @{$edgeref->[3]}\n" }
	       process_active($edgeref, $rulesref);
	     }
    }
}

sub process_inactive {
# This is the processing of a single inactive edge.

    my ($edgeref, $rulesref) = @_;

    # push the edge onto the end of the correct array in the hash table
    push @{$chart_inacts{$edgeref->[0]}}, $edgeref;

    # combine inactive edge with any preceding active edges;
    my $array_ref = $chart_acts{$edgeref->[0]}; # actives whose end is same 
                                                # as current inactive's start
    my $activeref;
    foreach $activeref (@$array_ref) {
     	if ($activeref->[3][0] eq $edgeref->[2][0]) { # if first tofind cat =
	       combine_edges ($activeref, $edgeref);
	     }
    }

    # Look for rule that has that edge's category as leftmost daughter.
    my $ruleref;
    foreach $ruleref (@{$rulesref->{$edgeref->[2][0]}}) {
	     my @newtofind = ();
	     my $i = 2;
	     while (defined($ruleref->[0][$i])) {
		      push @newtofind, $ruleref->[0][$i];
	       push @newtofind, $ruleref->[1][$i-1];
		      $i++;
	     }
    my $predarg;
	   if ($ruleref->[1][0] eq "f") { $predarg = [$edgeref->[4][0],[]]; }
    else { $predarg = ["",[$edgeref->[4][0]]]; }

	   push @agenda, [$edgeref->[0], 
				$edgeref->[1], 
				[$ruleref->[0][0], $edgeref->[2]], 
				[@newtofind], 
	   $predarg];
	     if ($debuglevel > 0) {
		 print "\nAdding ps-rule edge to agenda: ";
		 print "Start $edgeref->[0], end $edgeref->[1]\n";
		 print_struct([$ruleref->[0][0], $edgeref->[2]], 
			      $predarg);
		 if ($#newtofind > -1) { print "To find: @newtofind\n"; }
	     }

    }
}

sub process_active {
# This is the processing of one active edge.

    my $edgeref = $_[0];
    # push the edge onto the correct array of the active hash table
    push @{$chart_acts{$edgeref->[1]}}, $edgeref;

    # look for inactive edges in the chart beginning at the end of the active 
    # edge
    my $array_ref = $chart_inacts{$edgeref->[1]}; # inactives whose beginning
                                                  # is same as
                                                  # current active's end
    my $inactref;
    foreach $inactref (@$array_ref) {
     	if ($inactref->[2][0] eq $edgeref->[3][0]) { # if first tofind cat =
	       combine_edges ($edgeref, $inactref);
	     }
    }   
}

sub combine_edges {
# When the first "tofind" category of an active edge matches the category
# of an inactive edge, add a new edge combining the two edges to the processing
# agenda.

    my ($activeref, $inactref) = @_;
    my @newtofind = ();
    my $i = 2;
    while (defined($activeref->[3][$i])) {
	push @newtofind, $activeref->[3][$i];
	$i++;
    }

    my $predarg;
    my $new_funct = "";
    my @new_args = @{$activeref->[4][1]};

    if ($activeref->[3][1] eq "f") { 
	$new_funct = $inactref->[4][0];
    }
    else {
	$new_funct = $activeref->[4][0];
	push @new_args, $inactref->[4][0];
    }

    if (($new_funct ne "") && ($#new_args > -1)) { 
      my $arg;
      foreach $arg (@new_args) {
        $new_funct =~ /^L(\w)\.(.+)$/;
        $new_funct = $2;
        my $temp = $1;
        while ($new_funct =~ s/([^\w])$temp([^\w])/$1$arg$2/) { 1; }
      }
    $predarg = [$new_funct,[]];
    }
    else { $predarg = [$new_funct,[@new_args]]; }

    push @agenda, [$activeref->[0], 
		       $inactref->[1],
		       [@{$activeref->[2]}, $inactref->[2]], # add new daughter
		                                             # to struct -- 
                                                             # it's flattened 
		                                             # automatically
		       [@newtofind],
         $predarg];
    if ($debuglevel > 0) {
	print "\nAdding combined edge to agenda: ";
	print "Start $activeref->[0], end $inactref->[1]\n";
	print_struct([@{$activeref->[2]}, $inactref->[2]], $predarg);
	if ($#newtofind > -1) { print "To find: @newtofind\n"; }
    }
}

sub file_to_rulesref {
    
    my $rulesfile = $_[0];
    my %rules;
    
    my $curline = "";
    
    open (DATAFILE, "<$rulesfile") or die "Couldn't open file\n";
    
    while (defined($curline = )) {
	
	chomp($curline);
	if ($curline =~ /^\s*(\S+)\s*->\s*(\S+(?:\s+\S+)*)\s*:\s*([fa](?:\s+[fa])*)\s*$/) {
	    my @daughters = split " ", $2;
	    my @f_as = split " ", $3;
	    push @{$rules{$daughters[0]}}, [[$1, @daughters],[@f_as]];
	}
    }
    close (DATAFILE) or die "Couldn't close file\n";
    return \%rules;
}

sub file_to_lexiconref {

    my $lexiconfile = $_[0];
    my %lexicon;
    
    my $curline = "";
    
    open (DATAFILE, "<$lexiconfile") or die "Couldn't open file\n";
    
    while (defined($curline = )) {
	
	chomp($curline);
	if ($curline =~ /^\s*(\S+)\s+(\S+)\s*:\s*(\S+)\s*$/) {
	    
	    push @{$lexicon{$1}}, [[$2, $1], $3];
	}
    }
    close (DATAFILE) or die "Couldn't close file\n";
    return \%lexicon;
}

1;