Edinburgh Speech Tools  2.1-release
EST_SCFG_Chart.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : June 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A SCFG chart parser */
38 /* */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include "siod.h"
42 #include "EST_math.h"
43 #include "EST_SCFG.h"
44 #include "EST_SCFG_Chart.h"
45 
46 using namespace std;
47 
49 {
50  p_d1 = 0;
51  p_d2 = 0;
52  p_pos = 0;
53  p_prob = 0;
54 }
55 
57  int d1, int d2,
58  int pos)
59 {
60  p_d1 = d1;
61  p_d2 = d2;
62  p_pos = pos;
63  p_prob = prob;
64 }
65 
67 {
68 }
69 
70 LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
72 {
73  // Return a lisp representation of the edge
74 
75  if (e->prob() == 0)
76  {
77  return NIL; // failed
78  }
79  else if (start+1 == end)
80  {
81  // unary rule, preterminal
82  LISP r = cons(rintern(grammar->nonterminal(p)),
83  cons(flocons(e->prob()),
85  cons(flocons(end),
86  cons(rintern(grammar->terminal(e->d1())),
87  NIL)))));
88  return r;
89  }
90  else
91  {
92  //name prob start end daughters
93  EST_SCFG_Chart_Edge *d1, *d2;
94 
95  d1 = edges[start][e->pos()][e->d1()];
96  d2 = edges[e->pos()][end][e->d2()];
97 
98  LISP daughters =
99  cons(print_edge(start,e->pos(),e->d1(),d1),
100  cons(print_edge(e->pos(),end,e->d2(),d2),
101  NIL));
102  LISP r = cons(rintern(grammar->nonterminal(p)),
103  cons(flocons(e->prob()),
104  cons(flocons(start),
105  cons(flocons(end),
106  daughters))));
107  return r;
108  }
109 }
110 
112 {
113  n_vertices = 0;
114  edges = 0;
115  wfst = 0;
116  emptyedge = 0;
117  grammar_local = TRUE;
118  grammar = new EST_SCFG;
119 }
120 
122 {
123  // delete all the vertices
124 
125  delete_edge_table();
126  if (grammar_local)
127  delete grammar;
128 }
129 
131 {
132  if (grammar_local)
133  delete grammar;
134  grammar_local = FALSE;
135  grammar = &imported_grammar;
136 }
137 
139 {
140  grammar->set_rules(r);
141 }
142 
144 {
145  // Set up well formed substring table from feature name in each
146  // stream item in s
147  setup_wfst(s->head(),0,name);
148 }
149 
151 {
152  // Set up well formed substring table from feature name in each
153  // stream item in s
154  EST_Item *p;
155  int n;
156 
157  delete_edge_table();
158  for (n_vertices=1,p=s; p != e; p=p->next())
159  n_vertices++;
160  setup_edge_table();
161 
162  for (n=0,p=s; p != e; p=p->next(),n++)
163  {
164  int term = grammar->terminal(p->f(name).string());
165  if (term == -1)
166  {
167  cerr << "SCFG_Chart: unknown terminal symbol \"" <<
168  p->f(name).string() << "\"" << endl;
169  term = 0; // to avoid crashing
170  }
171  wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
172  }
173 }
174 
175 void EST_SCFG_Chart::delete_edge_table()
176 {
177  int i,j,k;
178 
179  if (wfst == 0) return;
180 
181  for (i=0; i < n_vertices; i++)
182  {
183  delete wfst[i];
184  for (j=0; j < n_vertices; j++)
185  {
186  for (k=0; k < grammar->num_nonterminals(); k++)
187  if (edges[i][j][k] != emptyedge)
188  delete edges[i][j][k];
189  delete [] edges[i][j];
190  }
191  delete [] edges[i];
192  }
193  delete [] wfst;
194  delete [] edges;
195  delete emptyedge;
196 
197  wfst = 0;
198  edges = 0;
199 
200 }
201 
202 void EST_SCFG_Chart::setup_edge_table()
203 {
204  int i,j,k;
205  int nt = grammar->num_nonterminals();
206 
207  wfst = new EST_SCFG_Chart_Edge*[n_vertices];
208  edges = new EST_SCFG_Chart_Edge ***[n_vertices];
209  emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
210 
211  for (i=0; i < n_vertices; i++)
212  {
213  wfst[i] = 0;
214  edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
215  for (j=0; j < n_vertices; j++)
216  {
217  edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
218  for (k=0; k < nt; k++)
219  edges[i][j][k] = 0;
220  }
221  }
222 }
223 
224 double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
225 {
226  // Find the best parse for non/terminal p between start and end
227  int best_j = -1;
228  int best_q = -1, best_r = -1;
229  double best_prob = 0;
230 
231  if (end-1 == start)
232  {
233  best_prob = grammar->prob_U(p,wfst[start]->d1());
234  if (best_prob > 0)
235  edges[start][end][p] =
236  new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
237  wfst[start]->d1(),0,-1);
238  else
239  edges[start][end][p] = emptyedge;
240  return best_prob;
241  }
242  else
243  {
244  // for all rules expanding p find total and best prob
245  double s=0,t_prob,left,right;
246  int j,q,r;
247  int nt = grammar->num_nonterminals();
248 
249  for (q=0; q < nt; q++)
250  for (r=0; r < nt; r++)
251  {
252  double pBpqr = grammar->prob_B(p,q,r);
253  if (pBpqr > 0)
254  {
255  for (j=start+1; j < end; j++)
256  {
257  left=find_best_tree(start,j,q);
258  if (left > 0)
259  {
260  right = find_best_tree(j,end,r);
261  t_prob = pBpqr * left * right;
262  if (t_prob > best_prob)
263  {
264  best_prob = t_prob;
265  best_q = q;
266  best_r = r;
267  best_j = j;
268  }
269  s += t_prob;
270  }
271  }
272  }
273  }
274 
275  if (best_prob > 0)
276  edges[start][end][p] =
277  new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
278  else
279  edges[start][end][p] = emptyedge;
280  return s;
281  }
282 }
283 
285 {
286  // do the parsing, find best parse only
287  if (n_vertices - 1 > 0)
288  find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
289 
290 }
291 
293 {
294  // Extract the parse from the edge table
295  EST_SCFG_Chart_Edge *top;
296 
297  top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
298 
299  if (top == NULL)
300  return NIL; // no parse
301  else
302  return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
303 }
304 
306  EST_Relation *word,
307  int force)
308 {
309  // Build a tree stream in Syn linking the leafs of Syn to those
310  // in word, force guarantees a parse is necessary (though probably
311  // a random one)
312 
313  extract_parse(syn,word->head(),0,force);
314 }
315 
317  EST_Item *s, EST_Item *e, int force)
318 {
319  // Build a tree stream in Syn linking the leafs of Syn to those
320  // in word
321  EST_Item *p;
322  int num_words;
323 
324  for (num_words=0,p=s; p != e; p=p->next())
325  num_words++;
326 
327  if (num_words != (n_vertices-1))
328  {
329  cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
330  " different from those in parse tree" << endl;
331  return;
332  }
333 
334  EST_SCFG_Chart_Edge *top;
335  EST_Item *w = s;
336 
337  top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
338 
339  if (top == NULL)
340  return; // failed to parse so no parse tree
341  else
342  {
343  EST_Item *ss = syn->append();
344 
345  extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
346  ss,&w);
347 
348  if ((force) && (!daughter1(ss))) // no parse found but *need* one
349  extract_forced_parse(0,n_vertices-1,ss,w);
350  return;
351  }
352 }
353 
354 void EST_SCFG_Chart::extract_forced_parse(int start, int end,
355  EST_Item *s, EST_Item *w)
356 {
357  // Extract a parse even though one wasn't found (often happens
358  // with single word or dual word sentences.
359 
360  if (start+1 == end)
361  {
362  s->append_daughter(w);
363  s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
364  s->set("prob",0.0); // maybe should be epsilon
365  }
366  else
367  {
368  extract_forced_parse(start,start+1,s->append_daughter(),w);
369  EST_Item *st = s->append_daughter();
370  st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
371  st->set("prob",0.0); // maybe should be epsilon
372  EST_Item *nw = w->next();
373  extract_forced_parse(start+1,end,st,nw);
374  }
375 }
376 
377 void EST_SCFG_Chart::extract_edge(int start, int end, int p,
379  EST_Item *s,
380  EST_Item **word)
381 {
382  // Build the node for this edge, and all of its daughters
383 
384  if (e->prob() == 0)
385  {
386  return; // failed
387  }
388  else if (start+1 == end)
389  {
390  // unary rule, preterminal
391  s->append_daughter((*word));
392  s->set_name(grammar->nonterminal(p));
393  s->set("prob",(float)e->prob());
394  if (*word ==NULL || **word == NULL) {
395  cerr << "SCFG_Chart: extract_edge, null word. Should not happen." << endl;
396  return; /* error */
397  }
398  *word = (*word)->next(); // increment along "word" stream
399  return;
400  }
401  else
402  {
403  //name prob start end daughters
404  EST_SCFG_Chart_Edge *d1, *d2;
405 
406  d1 = edges[start][e->pos()][e->d1()];
407  d2 = edges[e->pos()][end][e->d2()];
408 
409  // Inserts the new nodes in the tree (and creates new si nodes)
410  s->append_daughter();
411  s->append_daughter();
412 
413  extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
414  extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
415 
416  s->set_name(grammar->nonterminal(p));
417  s->set("prob",(float)e->prob());
418 
419  return;
420  }
421 }
422 
424 {
425  // Set up well formed substring table form lisp list
426  // Setup a relation and call the standard method of set up
427  LISP w,f;
428 
429  for (w=sent; w != NIL; w=cdr(w))
430  {
431  EST_Item *word = s.append();
432 
433  if (consp(car(w)))
434  { // a word with other feature info
435  word->set_name(get_c_string(car(car(w))));
436  if (consp(car(cdr(car(w)))))
437  for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
438  {
439  if (FLONUMP(car(cdr(car(f)))))
440  word->set(get_c_string(car(car(f))),
441  get_c_float(car(cdr(car(f)))));
442  else
443  word->set(get_c_string(car(car(f))),
444  get_c_string(car(cdr(car(f)))));
445  }
446  else // we assume its a POS value, cause they didn't say
447  word->set("name",get_c_string(car(cdr(car(w)))));
448  }
449  else // for easy we set the pos field to the be the name
450  word->set("name",get_c_string(car(w)));
451  }
452 }
453 
454 void scfg_parse(EST_Relation *Word, const EST_String &name,
455  EST_Relation *Syntax, EST_SCFG &grammar)
456 {
457  // Parse feature name in Word to build Syntax relation
458  // The relations names above are *not* the names of the relations
459  // just named to reflect there conceptual usage
460  EST_SCFG_Chart chart;
461 
462  chart.set_grammar_rules(grammar);
463  chart.setup_wfst(Word,name);
464  chart.parse();
465  chart.extract_parse(Syntax,Word,TRUE);
466 
467  return;
468 }
469 
470 LISP scfg_parse(LISP string, LISP grammar)
471 {
472  // Parse and return full parse
473  EST_SCFG_Chart chart;
475  LISP parse;
476 
477  chart.set_grammar_rules(grammar);
478 
479  EST_SCFG_chart_load_relation(words,string);
480  chart.setup_wfst(&words,"name");
481  chart.parse();
482  parse = chart.find_parse();
483 
484  return parse;
485 }
486 
487 LISP scfg_parse(LISP string, EST_SCFG &grammar)
488 {
489  // Parse and return full parse
490  EST_SCFG_Chart chart;
492  LISP parse;
493 
494  chart.set_grammar_rules(grammar);
495 
496  EST_SCFG_chart_load_relation(words,string);
497  chart.setup_wfst(&words,"name");
498  chart.parse();
499  parse = chart.find_parse();
500 
501  return parse;
502 }
503 
504 LISP scfg_bracketing_only(LISP parse)
505 {
506  if (consp(siod_nth(4,parse)))
507  {
508  LISP d,ds;
509 
510  for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
511  ds = cons(scfg_bracketing_only(car(d)),ds);
512  return reverse(ds);
513  }
514  else
515  return siod_nth(4,parse);
516 
517 }
518 
520 {
521  // Compare bracketing of best parse to bracketing on original
522  // For each sentence parse it (unbracketed) and then
523  // find the percentage of valid brackets in parsed version that
524  // are valid in the original one.
525  int c;
526  LISP parse;
527  EST_SuffStats cb;
528  int failed = 0;
529  int fully_contained=0;
530 
531  for (c=0; c < corpus.length(); c++)
532  {
533  LISP flat = siod_flatten(corpus.a_no_check(c).string());
534 
535  parse = scfg_parse(flat,*this);
536  if (parse == NIL)
537  {
538  failed++;
539  continue;
540  }
541 
543  EST_SuffStats vs;
544 
545  count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
546 
547  if (vs.mean() == 1)
548  fully_contained++;
549  cb += vs.mean();
550  }
551 
552  cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
553  " failed " << (float)(100.0*fully_contained)/corpus.length() <<
554  "% fully consistent from " << corpus.length()
555  << " sentences)" << endl;
556 
557 }
558 
560  const EST_bracketed_string &test,
561  EST_SuffStats &vs)
562 {
563  int i,j;
564 
565  if (ref.length() != test.length())
566  {
567  EST_error("bracket_crossing: sentences of different lengths");
568  }
569 
570  for (i=0; i < ref.length(); i++)
571  for (j=i+1; j <= ref.length(); j++)
572  if (test.valid(i,j) == 1)
573  {
574  if (ref.valid(i,j) == 0)
575  vs += 0;
576  else
577  vs += 1;
578  }
579 }
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition: EST_SCFG.h:86
EST_Item * head() const
Definition: EST_Relation.h:121
float end(const EST_Item &item)
Definition: EST_item_aux.cc:96
int d2()
(Non)terminal of daughter 2
float get_c_float(LISP x)
Definition: slib.cc:1858
int pos(void)
Postion, 0 1 or 2, where 0 is empty, 1 is incomplete 2 is complete.
EST_Item * append_daughter(EST_Item *li=0)
Definition: EST_Item.cc:425
void set(const EST_String &name, ssize_t ival)
Definition: EST_Item.h:185
int d1()
(Non)terminal of daughter 1
STATIC void left(STATUS Change)
Definition: editline.c:523
double mean(void) const
mean of currently cummulated values
#define NIL
Definition: siod_defs.h:92
STATIC void right(STATUS Change)
Definition: editline.c:538
void parse()
Parses the loaded WFST with the loaded grammar.
A class representing a stochastic context free grammar (SCFG).
Definition: EST_SCFG.h:182
LISP siod_flatten(LISP tree)
Definition: slib_list.cc:87
This class represents a bracketed string used in training of SCFGs.
Definition: EST_SCFG.h:57
LISP find_parse()
Return the parse in full LISP form.
EST_Item * daughter2(const EST_Item *n)
return second daughter of n
LISP siod_nth(int nth, LISP list)
Definition: siod.cc:214
int length() const
Definition: EST_SCFG.h:79
LISP scfg_bracketing_only(LISP parse)
const EST_Val f(const EST_String &name) const
Definition: EST_Item.h:260
void extract_parse(EST_Relation *syn, EST_Relation *word, int force=0)
Extract parse tree and add it to syn linking leafs to word.
const char * get_c_string(LISP x)
Definition: slib.cc:638
void set_name(const EST_String &name) const
Definition: EST_Item.h:254
LISP cons(LISP x, LISP y)
Definition: slib_list.cc:97
#define FLONUMP(x)
Definition: siod_defs.h:154
#define FALSE
Definition: EST_bool.h:119
section options Options< strong > or ngram_per_line Pseudo words
NULL
Definition: EST_WFST.cc:55
LISP consp(LISP x)
Definition: slib_list.cc:112
#define EST_error
Definition: EST_error.h:104
f
Definition: EST_item_aux.cc:48
void EST_SCFG_chart_load_relation(EST_Relation &s, LISP sent)
void count_bracket_crossing(const EST_bracketed_string &ref, const EST_bracketed_string &test, EST_SuffStats &vs)
const EST_String & string(void) const
Definition: EST_Val.h:161
EST_Item * next() const
Definition: EST_Item.h:348
void setup_wfst(EST_Relation *s, const EST_String &name="name")
float start(const EST_Item &item)
Definition: EST_item_aux.cc:52
double prob(void)
Edge probability.
LISP rintern(const char *name)
Definition: slib.cc:734
void set_grammar_rules(LISP r)
Initialize from LISP rules set.
A class for parsing with a probabilistic grammars.
LISP flocons(double x)
Definition: slib.cc:673
EST_Item * append(EST_Item *si)
Definition: EST_Relation.cc:88
LISP car(LISP x)
Definition: slib_list.cc:115
#define TRUE
Definition: EST_bool.h:118
void reverse(EST_Wave &sig)
void scfg_parse(EST_Relation *Word, const EST_String &name, EST_Relation *Syntax, EST_SCFG &grammar)
An internal class for EST_SCFG_Chart for representing edges in the chart during parsing with SCFGs...
EST_Item * daughter1(const EST_Item *n)
return first daughter of n
LISP cdr(LISP x)
Definition: slib_list.cc:124