chartparse.pm
use strict;
package Chartparse;
use Exporter;
BEGIN {
use Exporter ();
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(&chart_parse &print_chart &print_struct &spanning_edges
&file_to_lexiconref &file_to_rulesref &lexiconref_to_file
&rulesref_to_file &set_pcfg);
}
# Bottom-up PCFG chart parser, with edges indexed by start (inactive)/end (active)
# vertices for efficiency, and with lexical entries indexed by word, and rules indexed
# by leftmost daughter. Currently, no packing or support for features (UBCFG).
# NOTE: Currently, probability calculation assumes that there are no lexical entries
# with phrasal category mothers (like NP -> he ) -- otherwise, the probabilities are
# wrong.
# The edge data structure: %edge = ( start => $start,
# end => $end,
# struct => $structref,
# tofind => $tofindref,
# probs => $probabilityarrayref )
# $structref is a reference to an array of the form (mothercat, daughtertree1ref,
# daughtertree2ref, etc.). So $edge{struct}[0] is the category of the edge,
# $edge{struct}[1] points to the {struct} value of the first daughter, $edge{struct}[2]
# points to the second daughter's {struct}, and so on. $tofindref is a reference to an
# array of the edge categories to find for the edge, with $edge{tofind}[0] being the
# next edge to find. $probabilityarrayref is a reference to an array of the
# probabilities of all the lexical entries and rules used to create the current edge.
# The rule data structure: %rule = ( mother => $mothercat,
# daughters => $daughterarrayref,
# freq => $rule_frequency )
# $rule_frequency is for PCFG use, and during parsing represents
# P(M -> these daughters | M -> any set of daughters ).
# The lexical entry data structure: @lexentry = ( $mothercat, $daughtercat, $frequency )
# The parser is currently breadth-first, but it can be changed to a
# depth-first parser by changing one word (see sub process_queue below).
my @agenda; # agenda (queue or stack) of edges to be added to chart and processed
my %chart_inacts; # hash of inactive edges in the chart, indexed by left edge
my %chart_acts; # hash of active edges in the chart, indexed by right edge
my $pcfg = -1; # 0 is CFG; 1 is PCFG. Set by the file_to_... functions automatically,
# or by set_pcfg manually.
sub example {
# This is how to call the chart parser with a hard-coded lexicon and rule set.
my $wordsref = ["my", "dog", "has", "fleas"];
my $rulesref;
my $lexiconref;
%$lexiconref = ( "my" => [["D","my"]],
"dog" => [["N","dog"]],
"has" => [["V", "has"]],
"fleas" => [["N", "fleas"]] );
%$rulesref = ( "D" => [["NP","D","N"]],
"V" => [["VP","V","N"]],
"NP" => [["S","NP","VP"]] );
# Input to chart_parse is an array of the words to be parsed.
chart_parse($wordsref, $rulesref, $lexiconref);
print_chart(1,0); # two args: whether to print the structure contained within edges
# (T/F), and whether to print active edges along with inactive (T/F)
my @spanning_edgerefs = spanning_edges(1); # arg of 1 means print spanning edges
}
sub chart_parse {
my ($wordsref, $rulesref, $lexiconref) = @_;
%chart_inacts = (); # clear the hashes
%chart_acts = ();
add_lexentries($wordsref, $lexiconref);
process_agenda($rulesref);
@agenda = ();
return \%chart_inacts;
}
sub spanning_edges {
my $printp = $_[0];
my @spanning_edgerefs = ();
my $array_ref = $chart_inacts{0};
my @allkeys = keys %chart_inacts;
my $numkeys = $#allkeys; # num - 1
foreach my $edgeref (@$array_ref) {
if ($edgeref->{end} == $numkeys+1) {push @spanning_edgerefs, $edgeref;}
}
if ($printp == 1) {
print $#spanning_edgerefs+1, " spanning edge";
if ($#spanning_edgerefs != 0) {print "s";}
print " found:\n\n";
foreach my $edgeref (@spanning_edgerefs) {
if ($pcfg == 1) {
my $totalprob = 1;
foreach my $prob (@{$edgeref->{probs}}) {
$totalprob *= $prob;
}
my $normalprob = $totalprob**(1/($#{$edgeref->{probs}}+1)); # take geometric mean
print "Edge with normalized probability $normalprob (non-normalized $totalprob), ";
}
print_struct($edgeref->{struct});
print "\n";
}
}
return @spanning_edgerefs;
}
sub print_chart {
my($structp, $activep) = @_;
print "\nInactive edges:\n\n";
my $thiskey;
foreach $thiskey (sort keys %chart_inacts) {
my $array_ref = $chart_inacts{$thiskey};
my $edgeref;
foreach $edgeref (@$array_ref) {
print "Start ", $edgeref->{start}, ", end ", $edgeref->{end};
if (!$structp) { print ", cat $edgeref->{struct}[0]\n"; }
else { print "\n"; print_struct($edgeref->{struct}); print "\n"; }
}
}
if ($activep) {
print "\nActive edges:\n\n";
foreach $thiskey (sort keys %chart_acts) {
my $array_ref = $chart_acts{$thiskey};
my $edgeref;
foreach $edgeref (@$array_ref) {
print "Start ", $edgeref->{start}, ", end ", $edgeref->{end};
if (!$structp) { print ", cat $edgeref->{struct}[0], "; }
else { print "\n"; print_struct($edgeref->{struct}); }
my $tofindcat;
print "To find: ";
foreach $tofindcat (@{$edgeref->{tofind}}) { print $tofindcat, " "; }
print "\n";
if ($structp) {print "\n";}
}
}
}
}
sub print_struct { print "Struct:"; print_struct_help(@_, 0, [""], 0); print "\n"; }
sub print_struct_help {
my ($structref, $numspaces, $printline, $daughterno) = @_;
if ($#$structref > 0) { # the current tree has daughters
if ($daughterno > 1) { $daughterno = 1; }
for my $j (0 .. $daughterno) { # don't add extra line for first daughter, but yes
# for other daughters
print "\n";
for my $i (0 .. $numspaces-1) {
print " ";
if ($i == $numspaces-1) { print "|"; } # add a line just before printing the cat
else { print $printline->[$i]; }
}
}
print $structref->[0];
my @daughterarray = @{$structref}[1..$#$structref];
$printline->[$numspaces] = "|";
for my $i (0 .. $#daughterarray) {
if ($i == $#daughterarray) { $printline->[$numspaces] = " "; }
print_struct_help($daughterarray[$i], $numspaces+1, $printline, $i);
}
}
else { # the current tree has no daughters
print " - $structref->[0]";
}
}
sub add_lexentries {
# Look up words in the lexicon and add all lexical entries to the queue of
# edges to be processed. A word can have multiple categories, since the
# value of the $lexiconref hash is a reference to an array of lexical entries.
my ($wordsref, $lexiconref) = @_;
my $word;
my $wordno = 0;
foreach $word (@$wordsref) {
# add an edge for the word -- for learning and for rules that have word daughters
my %edge = ( start => $wordno, end => $wordno+1, struct => [$word], tofind => [] );
push @agenda, \%edge;
foreach my $lexentryref (@{$lexiconref->{$word}}) {
# add an edge for the category of the word
my %edge = ( start => $wordno, end => $wordno+1,
struct => [$lexentryref->[0], [$word]], tofind => [],
probs => [$lexentryref->[2]] );
push @agenda, \%edge;
}
$wordno++;
}
}
sub process_agenda {
# Go through all edges in processing agenda (@agenda) until there are no more.
# Keep in mind that more edges may be added to the end of @agenda in the body of
# the foreach loop.
my $edgeref;
# In the next line, "shift @agenda" takes the element off the beginning
# of the agenda, resulting in a queue data structure and a breadth-first parser.
# Changing this to "pop @agenda" will result in a stack, thus a depth-first parser.
while (defined($edgeref = shift @agenda)) {
if ($#{$edgeref->{tofind}} == -1) { # if tofind is null, edge is inactive,
# so index it by its starting number
process_inactive($edgeref, @_);
}
else { # edge is active, index it by its end number
process_active($edgeref, @_);
}
}
}
sub process_inactive {
# This is the processing of a single inactive edge.
my ($edgeref, $rulesref) = @_;
# push the edge onto the end of the correct array in the hash table
push @{$chart_inacts{$edgeref->{start}}}, $edgeref;
# combine inactive edge with any preceding active edges;
my $array_ref = $chart_acts{$edgeref->{start}}; # actives whose end is same
# as current inactive's start
my $activeref;
foreach $activeref (@$array_ref) {
if ($activeref->{tofind}[0] eq $edgeref->{struct}[0]) { # if first tofind cat =
# cat of edge, then
combine_edges ($activeref, $edgeref); # combine the edges
}
}
# Look for rule that has that edge's category as leftmost daughter.
my $ruleref;
if (exists $rulesref->{$edgeref->{struct}[0]}) { # set of rules w/same cat as leftmost D
foreach $ruleref (@{$rulesref->{$edgeref->{struct}[0]}}) {
my @newtofind = @{$ruleref->{daughters}};
shift @newtofind; # get rid of the first daughter of ruleref; the rest are to find
my %new_edge = ( start => $edgeref->{start}, end => $edgeref->{end},
struct => [$ruleref->{mother}, $edgeref->{struct}],
tofind => [@newtofind],
probs => [@{$edgeref->{probs}}, $ruleref->{freq}]);
push @agenda, \%new_edge;
}
}
}
sub process_active {
# This is the processing of one active edge.
my $edgeref = $_[0];
# push the edge onto the correct array of the active hash table
push @{$chart_acts{$edgeref->{end}}}, $edgeref;
# look for inactive edges in the chart beginning at the end of the active
# edge
my $array_ref = $chart_inacts{$edgeref->{end}}; # inactives whose beginning
# is same as
# current active's end
my $inactref;
foreach $inactref (@$array_ref) {
if ($edgeref->{tofind}[0] eq $inactref->{struct}[0]) { # if first tofind cat=cat of inact
combine_edges ($edgeref, $inactref);
}
}
}
sub combine_edges {
# When the first "tofind" category of an active edge matches the category
# of an inactive edge, add a new edge combining the two edges to the processing
# queue.
my ($activeref, $inactref) = @_;
my @newtofind = ();
my $i = 1;
while (defined($activeref->{tofind}[$i])) {
push @newtofind, $activeref->{tofind}[$i];
$i++;
}
my %new_edge = ( start => $activeref->{start}, end => $inactref->{end},
struct => [@{$activeref->{struct}}, $inactref->{struct}],
# add new daughter to
# struct -- it's flattened
# automatically
tofind => [@newtofind],
probs => [@{$inactref->{probs}}, @{$activeref->{probs}}] );
push @agenda, \%new_edge;
}
sub file_to_rulesref {
my $rulesfile = $_[0];
my %rules;
my %rule_counts;
my $curline = "";
open (DATAFILE, "<$rulesfile") or die "Couldn't open file\n";
while (defined($curline = )) {
chomp($curline);
if ($curline =~ /^\s*(\S+)\s*->\s*(\S+(?:\s+[^:\s]+)*)\s*(?::\s*(\S+)\s*)?$/) {
my $freq = -1;
if (defined $3) {
$freq = $3;
if ($pcfg == 0) { die "Some items have frequencies; others don't\n"; }
$pcfg = 1;
if (exists $rule_counts{$1}) { $rule_counts{$1} += $freq; }
else { $rule_counts{$1} = $freq; }
} # freqs
elsif ($pcfg == 1) { die "Some items don't have frequencies; others do\n"; }
else { $pcfg = 0; } # no freqs
my @daughters = split " ", $2;
my %rule = ( mother => $1, daughters => [@daughters], freq => $freq );
push @{$rules{$daughters[0]}}, \%rule;
}
}
close (DATAFILE) or die "Couldn't close file\n";
# Now normalize freqs so they sum to 1
if ($pcfg == 1) {
my $already_normalized = 1;
foreach my $rule_count (values %rule_counts) {
if ($rule_count != 1) { $already_normalized = 0; next; }
}
if ($already_normalized == 0) {
foreach my $rulearrayref (values %rules) {
foreach my $ruleref (@$rulearrayref) {
$ruleref->{freq} /= $rule_counts{$ruleref->{mother}}; # divide freq by total count
# of all rules w/same mother
}
}
}
}
return \%rules;
}
sub rulesref_to_file {
my ($rulesfile, $rulesref) = @_;
open (DATAFILE, ">$rulesfile") or die "Couldn't open output rules file $rulesfile\n";
foreach my $array_ref (values %$rulesref) { #for each value in hash
foreach my $ruleref (@$array_ref) {
print DATAFILE $ruleref->{mother}, " ->"; print $ruleref->{mother}, " ->";
for my $i (0 .. $#{$ruleref->{daughters}}) {
print DATAFILE " ", $ruleref->{daughters}[$i];
print " ", $ruleref->{daughters}[$i];
}
print DATAFILE "\n"; print "\n";
}
}
close (DATAFILE) or die "Couldn't close output rules file $rulesfile\n";
}
sub file_to_lexiconref {
my $lexiconfile = $_[0];
my %lexicon;
my %lex_counts; # hash, indexed by mother node, to hold frequencies
my $curline = "";
open (DATAFILE, "<$lexiconfile") or die "Couldn't open file\n";
while (defined($curline = )) {
chomp($curline);
if ($curline =~ /^\s*(\S+)\s+(\S+)\s*(?::\s*(\S+)\s*)?$/) {
my $freq = -1;
if (defined $3) {
$freq = $3;
if ($pcfg == 0) { die "Some items have frequencies; others don't\n"; }
$pcfg = 1; # freqs
if (exists $lex_counts{$2}) { $lex_counts{$2} += $freq; }
else { $lex_counts{$2} = $freq; }
}
elsif ($pcfg == 1) { die "Some items don't have frequencies; others do\n"; }
else { $pcfg = 0; } # no freqs
print "$2 $1 $freq\n";
push @{$lexicon{$1}}, [$2, $1, $freq];
}
}
close (DATAFILE) or die "Couldn't close file\n";
# Now calculate probability of each mother node being realized as each possible daughter
if ($pcfg == 1) {
my $already_normalized = 1;
foreach my $lex_count (values %lex_counts) {
if ($lex_count != 1) { $already_normalized = 0; next; }
}
if ($already_normalized == 0) {
foreach my $lexarrayref (values %lexicon) {
foreach my $lexref (@$lexarrayref) {
$lexref->[2] /= $lex_counts{$lexref->[0]}; # divide freq by total count of
# all lexentries w/same mother
}
}
}
}
return \%lexicon;
}
sub lexiconref_to_file {
my ($lexiconfile, $lexiconref) = @_;
open (DATAFILE, ">$lexiconfile") or die "Couldn't open lexicon output file $lexiconfile\n";
foreach my $array_ref (values %$lexiconref) { #for each value in hash
foreach my $lexiconref (@$array_ref) {
print DATAFILE $lexiconref->[1], " ", $lexiconref->[0], "\n";
print $lexiconref->[1], " ", $lexiconref->[0], "\n";
}
}
close (DATAFILE) or die "Couldn't close file\n";
}
sub set_pcfg {
$pcfg = $_[0];
}
1;