Code for Chartparse.pm

chartparse.pm

use strict;

package Chartparse;
use Exporter;

BEGIN {
    use Exporter ();
    use vars qw(@ISA @EXPORT);
    @ISA = qw(Exporter);
@EXPORT = qw(&chart_parse &print_chart &print_struct &spanning_edges
             &file_to_lexiconref &file_to_rulesref &lexiconref_to_file
             &rulesref_to_file &set_pcfg);
}

# Bottom-up PCFG chart parser, with edges indexed by start (inactive)/end (active) 
# vertices for efficiency, and with lexical entries indexed by word, and rules indexed
# by leftmost daughter.  Currently, no packing or support for features (UBCFG).

# NOTE:  Currently, probability calculation assumes that there are no lexical entries
# with phrasal category mothers (like NP -> he ) -- otherwise, the probabilities are 
# wrong.

# The edge data structure: %edge = ( start => $start, 
#           						 end => $end,
#									 struct => $structref,
#        							 tofind => $tofindref,
#									 probs => $probabilityarrayref )
# $structref is a reference to an array of the form (mothercat, daughtertree1ref, 
# daughtertree2ref, etc.).  So $edge{struct}[0] is the category of the edge, 
# $edge{struct}[1] points to the {struct} value of the first daughter, $edge{struct}[2] 
# points to the second daughter's {struct}, and so on.  $tofindref is a reference to an
# array of the edge categories to find for the edge, with $edge{tofind}[0] being the
# next edge to find.  $probabilityarrayref is a reference to an array of the
# probabilities of all the lexical entries and rules used to create the current edge.

# The rule data structure:  %rule = ( mother => $mothercat,
#   								  daughters => $daughterarrayref,
#									  freq => $rule_frequency )
# $rule_frequency is for PCFG use, and during parsing represents 
# P(M -> these daughters | M -> any set of daughters ).

# The lexical entry data structure:  @lexentry = ( $mothercat, $daughtercat, $frequency )

# The parser is currently breadth-first, but it can be changed to a 
# depth-first parser by changing one word (see sub process_queue below).

my @agenda;   # agenda (queue or stack) of edges to be added to chart and processed
my %chart_inacts; # hash of inactive edges in the chart, indexed by left edge
my %chart_acts;   # hash of active edges in the chart, indexed by right edge

my $pcfg = -1; # 0 is CFG; 1 is PCFG.  Set by the file_to_... functions automatically,
               # or by set_pcfg manually.  
               
sub example {
# This is how to call the chart parser with a hard-coded lexicon and rule set.

  my $wordsref = ["my", "dog", "has", "fleas"];

  my $rulesref;
  my $lexiconref;

  %$lexiconref = ( "my" => [["D","my"]],
                   "dog" => [["N","dog"]], 
                   "has" => [["V", "has"]], 
                   "fleas" => [["N", "fleas"]] );
  
  %$rulesref = ( "D" => [["NP","D","N"]], 
                 "V" => [["VP","V","N"]], 
                 "NP" => [["S","NP","VP"]] );
  
  # Input to chart_parse is an array of the words to be parsed.
  
  chart_parse($wordsref, $rulesref, $lexiconref);
  print_chart(1,0); # two args:  whether to print the structure contained within edges 
                    # (T/F), and whether to print active edges along with inactive (T/F)
  my @spanning_edgerefs = spanning_edges(1); # arg of 1 means print spanning edges
}

sub chart_parse {

    my ($wordsref, $rulesref, $lexiconref) = @_;
    %chart_inacts = (); # clear the hashes
    %chart_acts = ();
    add_lexentries($wordsref, $lexiconref);
    process_agenda($rulesref);
    @agenda = ();
    return \%chart_inacts;
}

sub spanning_edges {

    my $printp = $_[0];
    my @spanning_edgerefs = ();
    my $array_ref = $chart_inacts{0};
    my @allkeys = keys %chart_inacts;
    my $numkeys = $#allkeys; # num - 1
    foreach my $edgeref (@$array_ref) {
	  if ($edgeref->{end} == $numkeys+1) {push @spanning_edgerefs, $edgeref;}
    }
    if ($printp == 1) {
        print $#spanning_edgerefs+1, " spanning edge"; 
        if ($#spanning_edgerefs != 0) {print "s";}
        print " found:\n\n";
        foreach my $edgeref (@spanning_edgerefs) { 
          if ($pcfg == 1) {
            my $totalprob = 1;
            foreach my $prob (@{$edgeref->{probs}}) {
              $totalprob *= $prob;
            }
            my $normalprob = $totalprob**(1/($#{$edgeref->{probs}}+1)); # take geometric mean
            print "Edge with normalized probability $normalprob (non-normalized $totalprob), ";
          }
          print_struct($edgeref->{struct}); 
          print "\n";
        }
    }
    return @spanning_edgerefs;
}

sub print_chart {
    my($structp, $activep) = @_;
    print "\nInactive edges:\n\n";
    my $thiskey;
    foreach $thiskey (sort keys %chart_inacts) {
	my $array_ref = $chart_inacts{$thiskey};
	my $edgeref;
	foreach $edgeref (@$array_ref) {
	    print "Start ", $edgeref->{start}, ", end ", $edgeref->{end};
	    if (!$structp) { print ", cat $edgeref->{struct}[0]\n"; }
	    else { print "\n"; print_struct($edgeref->{struct}); print "\n"; }
	}
    }
    if ($activep) {
	print "\nActive edges:\n\n";
	foreach $thiskey (sort keys %chart_acts) {
	    my $array_ref = $chart_acts{$thiskey};
	    my $edgeref;
	    foreach $edgeref (@$array_ref) {
		print "Start ", $edgeref->{start}, ", end ", $edgeref->{end};
		if (!$structp) { print ", cat $edgeref->{struct}[0], "; }
		else { print "\n"; print_struct($edgeref->{struct}); }
		my $tofindcat;
		print "To find: ";
		foreach $tofindcat (@{$edgeref->{tofind}}) { print $tofindcat, " "; }
		print "\n";
		if ($structp) {print "\n";}
	    }
	}
    }
}

sub print_struct { print "Struct:"; print_struct_help(@_, 0, [""], 0); print "\n"; }

sub print_struct_help {
    my ($structref, $numspaces, $printline, $daughterno) = @_;
    if ($#$structref > 0) { # the current tree has daughters
      if ($daughterno > 1) { $daughterno = 1; }
      for my $j (0 .. $daughterno) { # don't add extra line for first daughter, but yes
       								 # for other daughters
        print "\n";
        for my $i (0 .. $numspaces-1) { 
	      print "  ";
	      if ($i == $numspaces-1) { print "|"; } # add a line just before printing the cat
	      else { print $printline->[$i]; } 
        }
      }
      print $structref->[0];
      my @daughterarray = @{$structref}[1..$#$structref];
      $printline->[$numspaces] = "|";
      for my $i (0 .. $#daughterarray) {
	    if ($i == $#daughterarray) { $printline->[$numspaces] = " "; }
        print_struct_help($daughterarray[$i], $numspaces+1, $printline, $i);
      }
    }
    else { # the current tree has no daughters
      print " - $structref->[0]";
    }
}


sub add_lexentries {
# Look up words in the lexicon and add all lexical entries to the queue of
# edges to be processed.  A word can have multiple categories, since the
# value of the $lexiconref hash is a reference to an array of lexical entries.

    my ($wordsref, $lexiconref) = @_;
    my $word;
    my $wordno = 0;
    foreach $word (@$wordsref) {
      # add an edge for the word -- for learning and for rules that have word daughters
      my %edge = ( start => $wordno, end => $wordno+1, struct => [$word], tofind => [] );
	push @agenda, \%edge;
	foreach my $lexentryref (@{$lexiconref->{$word}}) {
        # add an edge for the category of the word
        my %edge = ( start => $wordno, end => $wordno+1, 
         			 struct => [$lexentryref->[0], [$word]], tofind => [],
         			 probs => [$lexentryref->[2]] );
	    push @agenda, \%edge;
	}
	$wordno++;
    }
}

sub process_agenda {
# Go through all edges in processing agenda (@agenda) until there are no more.
# Keep in mind that more edges may be added to the end of @agenda in the body of 
# the foreach loop.

    my $edgeref;

    # In the next line, "shift @agenda" takes the element off the beginning
    # of the agenda, resulting in a queue data structure and a breadth-first parser.  
    # Changing this to "pop @agenda" will result in a stack, thus a depth-first parser.

    while (defined($edgeref = shift @agenda)) {
	if ($#{$edgeref->{tofind}} == -1) { # if tofind is null, edge is inactive, 
	                                      # so index it by its starting number
	    process_inactive($edgeref, @_);
	}
	else { # edge is active, index it by its end number
	    process_active($edgeref, @_);
	}
    }
}

sub process_inactive {
# This is the processing of a single inactive edge.

    my ($edgeref, $rulesref) = @_;
    # push the edge onto the end of the correct array in the hash table
    push @{$chart_inacts{$edgeref->{start}}}, $edgeref;

    # combine inactive edge with any preceding active edges;
    my $array_ref = $chart_acts{$edgeref->{start}}; # actives whose end is same 
                                                # as current inactive's start
    my $activeref;
    foreach $activeref (@$array_ref) {
	if ($activeref->{tofind}[0] eq $edgeref->{struct}[0]) { # if first tofind cat =
															# cat of edge, then
	    combine_edges ($activeref, $edgeref);               # combine the edges
	}
    }   

    # Look for rule that has that edge's category as leftmost daughter.
    my $ruleref;
    if (exists $rulesref->{$edgeref->{struct}[0]}) { # set of rules w/same cat as leftmost D
	foreach $ruleref (@{$rulesref->{$edgeref->{struct}[0]}}) {
	    my @newtofind = @{$ruleref->{daughters}};
	    shift @newtofind; # get rid of the first daughter of ruleref; the rest are to find

	    my %new_edge = ( start => $edgeref->{start}, end => $edgeref->{end},
	                     struct => [$ruleref->{mother}, $edgeref->{struct}],
	                     tofind => [@newtofind],
	                     probs => [@{$edgeref->{probs}}, $ruleref->{freq}]);
	    push @agenda, \%new_edge;
	}
    }
}

sub process_active {
# This is the processing of one active edge.

    my $edgeref = $_[0];
    # push the edge onto the correct array of the active hash table
    push @{$chart_acts{$edgeref->{end}}}, $edgeref;

    # look for inactive edges in the chart beginning at the end of the active 
    # edge
    my $array_ref = $chart_inacts{$edgeref->{end}}; # inactives whose beginning
                                                    # is same as
                                                    # current active's end
    my $inactref;
    foreach $inactref (@$array_ref) {
	if ($edgeref->{tofind}[0] eq $inactref->{struct}[0]) { # if first tofind cat=cat of inact
	    combine_edges ($edgeref, $inactref);
	}
    }   
}

sub combine_edges {
# When the first "tofind" category of an active edge matches the category
# of an inactive edge, add a new edge combining the two edges to the processing 
# queue.

    my ($activeref, $inactref) = @_;
    my @newtofind = ();
    my $i = 1;
    while (defined($activeref->{tofind}[$i])) {
	push @newtofind, $activeref->{tofind}[$i];
	$i++;
    }
    my %new_edge = ( start => $activeref->{start}, end => $inactref->{end}, 
                     struct => [@{$activeref->{struct}}, $inactref->{struct}], 
                                                         # add new daughter to
		                                                 # struct -- it's flattened 
		                                                 # automatically
		             tofind => [@newtofind],
		             probs => [@{$inactref->{probs}}, @{$activeref->{probs}}] );
    push @agenda, \%new_edge;
}
 
sub file_to_rulesref {

    my $rulesfile = $_[0];
    my %rules;

    my %rule_counts;

    my $curline = "";

    open (DATAFILE, "<$rulesfile") or die "Couldn't open file\n";

    while (defined($curline = )) {

	    chomp($curline);
	    if ($curline =~ /^\s*(\S+)\s*->\s*(\S+(?:\s+[^:\s]+)*)\s*(?::\s*(\S+)\s*)?$/) {
	    
          my $freq = -1;
          if (defined $3) { 
            $freq = $3;
            if ($pcfg == 0) { die "Some items have frequencies; others don't\n"; }
            $pcfg = 1; 
            if (exists $rule_counts{$1}) { $rule_counts{$1} += $freq; }
	          else { $rule_counts{$1} = $freq; }
            } # freqs
          elsif ($pcfg == 1) { die "Some items don't have frequencies; others do\n"; }
          else { $pcfg = 0; } # no freqs

	      my @daughters = split " ", $2;
	      my %rule = ( mother => $1, daughters => [@daughters], freq => $freq );
	      push @{$rules{$daughters[0]}}, \%rule;
	    }
    }
    close (DATAFILE) or die "Couldn't close file\n";

    # Now normalize freqs so they sum to 1
    if ($pcfg == 1) {
      my $already_normalized = 1;
      foreach my $rule_count (values %rule_counts) {
        if ($rule_count != 1) { $already_normalized = 0; next; }
      }
      if ($already_normalized == 0) {
        foreach my $rulearrayref (values %rules) {
          foreach my $ruleref (@$rulearrayref) {
            $ruleref->{freq} /= $rule_counts{$ruleref->{mother}}; # divide freq by total count
            										              # of all rules w/same mother
          }
        }
      }
    }

    return \%rules;
}

sub rulesref_to_file {

    my ($rulesfile, $rulesref) = @_;

    open (DATAFILE, ">$rulesfile") or die "Couldn't open output rules file $rulesfile\n";

    foreach my $array_ref  (values %$rulesref) { #for each value in hash
	foreach my $ruleref (@$array_ref) {

	    print DATAFILE $ruleref->{mother}, " ->"; print $ruleref->{mother}, " ->";

	    for my $i (0 .. $#{$ruleref->{daughters}}) {
		print DATAFILE " ", $ruleref->{daughters}[$i];
		print " ", $ruleref->{daughters}[$i];
	    }
	    print DATAFILE "\n"; print "\n";

	}
    }
    close (DATAFILE) or die "Couldn't close output rules file $rulesfile\n";
}

sub file_to_lexiconref {

    my $lexiconfile = $_[0];
    my %lexicon;

    my %lex_counts; # hash, indexed by mother node, to hold frequencies

    my $curline = "";

    open (DATAFILE, "<$lexiconfile") or die "Couldn't open file\n";

    while (defined($curline = )) {
  
	chomp($curline);
	if ($curline =~ /^\s*(\S+)\s+(\S+)\s*(?::\s*(\S+)\s*)?$/) {

        my $freq = -1;
        if (defined $3) { 
          $freq = $3;
          if ($pcfg == 0) { die "Some items have frequencies; others don't\n"; }
          $pcfg = 1; # freqs
          if (exists $lex_counts{$2}) { $lex_counts{$2} += $freq; }
	      else { $lex_counts{$2} = $freq; }
	    }
        elsif ($pcfg == 1) { die "Some items don't have frequencies; others do\n"; }
        else { $pcfg = 0; } # no freqs
        
	    print "$2 $1 $freq\n";	    
	    push @{$lexicon{$1}}, [$2, $1, $freq];
	}
    }
    close (DATAFILE) or die "Couldn't close file\n";

    # Now calculate probability of each mother node being realized as each possible daughter
    if ($pcfg == 1) {
      my $already_normalized = 1;
      foreach my $lex_count (values %lex_counts) {
        if ($lex_count != 1) { $already_normalized = 0; next; }
      }
      if ($already_normalized == 0) {
        foreach my $lexarrayref (values %lexicon) {
          foreach my $lexref (@$lexarrayref) {
            $lexref->[2] /= $lex_counts{$lexref->[0]}; # divide freq by total count of 
            										   # all lexentries w/same mother
          }
        }
      }
    }
    return \%lexicon;
}

sub lexiconref_to_file {

    my ($lexiconfile, $lexiconref) = @_;

    open (DATAFILE, ">$lexiconfile") or die "Couldn't open lexicon output file $lexiconfile\n";

    foreach my $array_ref  (values %$lexiconref) { #for each value in hash
	foreach my $lexiconref (@$array_ref) {

	    print DATAFILE $lexiconref->[1], " ", $lexiconref->[0], "\n";
	    print $lexiconref->[1], " ", $lexiconref->[0], "\n";
	}
    }
    close (DATAFILE) or die "Couldn't close file\n";
}

sub set_pcfg  {

  $pcfg = $_[0];
}

1;