Edinburgh Speech Tools  2.1-release
EST_SCFG_inout.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Implementation of an inside-outside reestimation procedure for */
38 /* building a stochastic CFG seeded with a bracket corpus. */
39 /* Based on "Inside-Outside Reestimation from partially bracketed */
40 /* corpora", F Pereira and Y. Schabes. pp 128-135, 30th ACL, Newark, */
41 /* Delaware 1992. */
42 /* */
43 /* This should really be done in the log domain. Addition in the log */
44 /* domain can be done with a formula in Huang, Ariki and Jack */
45 /* (log(a)-log(b)) */
46 /* log(a+b) = log(1 + e ) + log(b) */
47 /* */
48 /*=======================================================================*/
49 #include <cstdlib>
50 #include "EST_SCFG_Chart.h"
51 #include "EST_simplestats.h"
52 #include "EST_math.h"
53 #include "EST_TVector.h"
54 
55 static const EST_bracketed_string def_val_s;
56 static EST_bracketed_string error_return_s;
59 
60 
61 #if defined(INSTANTIATE_TEMPLATES)
62 #include "../base_class/EST_TVector.cc"
63 
65 #endif
66 
67 using namespace std;
68 
69 void set_corpus(EST_Bcorpus &b, LISP examples)
70 {
71  LISP e;
72  int i;
73 
74  b.resize(siod_llength(examples));
75 
76  for (i=0,e=examples; e != NIL; e=cdr(e),i++)
78 }
79 
80 void EST_bracketed_string::init()
81 {
82  bs = NIL;
83  gc_protect(&bs);
84  symbols = 0;
85  valid_spans = 0;
86  p_length = 0;
87 }
88 
90 {
91  init();
92 }
93 
95 {
96  init();
97 
98  set_bracketed_string(string);
99 }
100 
102 {
103  int i;
104  bs=NIL;
105  gc_unprotect(&bs);
106  delete [] symbols;
107  for (i=0; i < p_length; i++)
108  delete [] valid_spans[i];
109  delete [] valid_spans;
110 }
111 
113 {
114 
115  bs=NIL;
116  delete [] symbols;
117 
118  p_length = find_num_nodes(string);
119  symbols = new LISP[p_length];
120 
121  set_leaf_indices(string,0,symbols);
122 
123  bs = string;
124 
125  int i,j;
126  valid_spans = new int*[length()];
127  for (i=0; i < length(); i++)
128  {
129  valid_spans[i] = new int[length()+1];
130  for (j=i+1; j <= length(); j++)
131  valid_spans[i][j] = 0;
132  }
133 
134  // fill in valid table
135  if (p_length > 0)
136  find_valid(0,bs);
137 
138 }
139 
140 int EST_bracketed_string::find_num_nodes(LISP string)
141 {
142  // This wont could nil as an atom
143  if (string == NIL)
144  return 0;
145  else if (CONSP(string))
146  return find_num_nodes(car(string))+
147  find_num_nodes(cdr(string));
148  else
149  return 1;
150 }
151 
152 int EST_bracketed_string::set_leaf_indices(LISP string,int i,LISP *syms)
153 {
154  if (string == NIL)
155  return i;
156  else if (!CONSP(car(string)))
157  {
158  syms[i] = string;
159  return set_leaf_indices(cdr(string),i+1,syms);
160  }
161  else // car is a tree
162  {
163  return set_leaf_indices(cdr(string),
164  set_leaf_indices(car(string),i,syms),
165  syms);
166  }
167 }
168 
169 void EST_bracketed_string::find_valid(int s,LISP t) const
170 {
171  LISP l;
172  int c;
173 
174  if (consp(t))
175  {
176  for (c=s,l=t; l != NIL; l=cdr(l))
177  {
178  c += num_leafs(car(l));
179  valid_spans[s][c] = 1;
180  }
181  find_valid(s,car(t));
182  find_valid(s+num_leafs(car(t)),cdr(t));
183  }
184 }
185 
186 int EST_bracketed_string::num_leafs(LISP t) const
187 {
188  if (t == NIL)
189  return 0;
190  else if (!consp(t))
191  return 1;
192  else
193  return num_leafs(car(t)) + num_leafs(cdr(t));
194 }
195 
197 {
198  inside = 0;
199  outside = 0;
200  n.resize(0);
201  d.resize(0);
202 }
203 
205 {
206 
207 }
208 
210 {
211  set_corpus(corpus,vload(filename,1));
212 }
213 
214 // From the formula in the paper
215 double EST_SCFG_traintest::f_I_cal(int c, int p, int i, int k)
216 {
217  // Find Inside probability
218  double res;
219 
220  if (i == k-1)
221  {
222  res = prob_U(p,terminal(corpus.a_no_check(c).symbol_at(i)));
223 // printf("prob_U p %s (%d) %d m %s (%d) res %g\n",
224 // (const char *)nonterminal(p),p,
225 // i,
226 // (const char *)corpus.a_no_check(c).symbol_at(i),
227 // terminal(corpus.a_no_check(c).symbol_at(i)),
228 // res);
229  }
230  else if (corpus.a_no_check(c).valid(i,k) == TRUE)
231  {
232  int j;
233  double s=0;
234  int q,r;
235 
236  for (q = 0; q < num_nonterminals(); q++)
237  for (r = 0; r < num_nonterminals(); r++)
238  {
239  double pBpqr = prob_B(p,q,r);
240  if (pBpqr > 0)
241  for (j=i+1; j < k; j++)
242  {
243  double in = f_I(c,q,i,j);
244  if (in > 0)
245  s += pBpqr * in * f_I(c,r,j,k);
246  }
247  }
248  res = s;
249  }
250  else
251  res = 0.0;
252 
253  inside[p][i][k] = res;
254 
255 // printf("f_I p %s i %d k %d res %g\n",
256 // (const char *)nonterminal(p),i,k,res);
257 
258  return res;
259 }
260 
261 double EST_SCFG_traintest::f_O_cal(int c, int p, int i, int k)
262 {
263  // Find Outside probability
264  double res;
265 
266  if ((i == 0) && (k == corpus.a_no_check(c).length()))
267  {
268  if (p == distinguished_symbol()) // distinguished non-terminal
269  res = 1.0;
270  else
271  res = 0.0;
272  }
273  else if (corpus.a_no_check(c).valid(i,k) == TRUE)
274  {
275  double s1=0.0;
276  double s2,s3;
277  double pBqrp,pBqpr;
278  int j;
279  int q,r;
280 
281  for (q = 0; q < num_nonterminals(); q++)
282  for (r = 0; r < num_nonterminals(); r++)
283  {
284  pBqrp = prob_B(q,r,p);
285  s2 = s3 = 0.0;
286  if (pBqrp > 0)
287  {
288  for (j=0;j < i; j++)
289  {
290  double out = f_O(c,q,j,k);
291  if (out > 0)
292  s2 += out * f_I(c,r,j,i);
293  }
294  s2 *= pBqrp;
295  }
296  pBqpr = prob_B(q,p,r);
297  if (pBqpr > 0)
298  {
299  for (j=k+1;j <= corpus.a_no_check(c).length(); j++)
300  {
301  double out = f_O(c,q,i,j);
302  if (out > 0)
303  s3 += out * f_I(c,r,k,j);
304  }
305  s3 *= pBqpr;
306  }
307  s1 += s2 + s3;
308  }
309  res = s1;
310  }
311  else // not a valid bracketing
312  res = 0.0;
313 
314  outside[p][i][k] = res;
315 
316  return res;
317 }
318 
319 void EST_SCFG_traintest::reestimate_rule_prob_B(int c, int ri, int p, int q, int r)
320 {
321  // Re-estimate probability for binary rules
322  int i,j,k;
323  double n2=0;
324 
325  double pBpqr = prob_B(p,q,r);
326 
327  if (pBpqr > 0)
328  {
329  for (i=0; i <= corpus.a_no_check(c).length()-2; i++)
330  for (j=i+1; j <= corpus.a_no_check(c).length()-1; j++)
331  {
332  double d1 = f_I(c,q,i,j);
333  if (d1 == 0) continue;
334  for (k=j+1; k <= corpus.a_no_check(c).length(); k++)
335  {
336  double d2 = f_I(c,r,j,k);
337  if (d2 == 0) continue;
338  double d3 = f_O(c,p,i,k);
339  if (d3 == 0) continue;
340  n2 += d1 * d2 * d3;
341  }
342  }
343  n2 *= pBpqr;
344  }
345  // f_P(c) is probably redundant
346  double fp = f_P(c);
347  double n1,d1;
348  n1 = n2 / fp;
349  if (fp == 0) n1=0;
350 
351  d1 = f_P(c,p) / fp;
352  if (fp == 0) d1=0;
353  // printf("n1 %f d1 %f n2 %f fp %f\n",n1,d1,n2,fp);
354  n[ri] += n1;
355  d[ri] += d1;
356 
357 }
358 
359 void EST_SCFG_traintest::reestimate_rule_prob_U(int c,int ri, int p, int m)
360 {
361  // Re-estimate probability for unary rules
362  int i;
363 
364 // printf("reestimate_rule_prob_U: %f p %s m %s\n",
365 // prob_U(ip,im),
366 // (const char *)p,
367 // (const char *)m);
368 
369  double n2=0;
370 
371  for (i=1; i < corpus.a_no_check(c).length(); i++)
372  if (m == terminal(corpus.a_no_check(c).symbol_at(i-1)))
373  n2 += prob_U(p,m) * f_O(c,p,i-1,i);
374 
375  double fP = f_P(c);
376  if (fP != 0)
377  {
378  n[ri] += n2 / fP;
379  d[ri] += f_P(c,p) / fP;
380  }
381 }
382 
383 double EST_SCFG_traintest::f_P(int c)
384 {
385  return f_I(c,distinguished_symbol(),0,corpus.a_no_check(c).length());
386 }
387 
388 double EST_SCFG_traintest::f_P(int c,int p)
389 {
390  int i,j;
391  double db=0;
392 
393  for (i=0; i < corpus.a_no_check(c).length(); i++)
394  for (j=i+1; j <= corpus.a_no_check(c).length(); j++)
395  {
396  double d1 = f_O(c,p,i,j);
397  if (d1 == 0) continue;
398  db += f_I(c,p,i,j)*d1;
399  }
400 
401  return db;
402 }
403 
404 void EST_SCFG_traintest::reestimate_grammar_probs(int passes,
405  int startpass,
406  int checkpoint,
407  int spread,
408  const EST_String &outfile)
409 {
410  // Iterate over the corpus cummulating factors for each rules
411  // This reduces the space requirements and recalculations of
412  // values for each sentences.
413  // Repeat training passes to number specified
414  int pass = 0;
415  double zero=0;
416  double se;
417  int ri,c;
418 
419  n.resize(rules.length());
420  d.resize(rules.length());
421 
422  for (pass = startpass; pass < passes; pass++)
423  {
424  EST_Litem *r;
425  double mC, lPc;
426 
427  d.fill(zero);
428  n.fill(zero);
430 
431  for (mC=0.0,lPc=0.0,c=0; c < corpus.length(); c++)
432  {
433  // For skipping some sentences to speed up convergence
434  if ((spread > 0) && (((c+(pass*spread))%100) >= spread))
435  continue;
436  printf(" %d",c); fflush(stdout);
437  if (corpus.a_no_check(c).length() == 0) continue;
438  init_io_cache(c,num_nonterminals());
439  for (ri=0,r=rules.head(); r != 0; r=r->next(),ri++)
440  {
441  if (rules(r).type() == est_scfg_binary_rule)
442  reestimate_rule_prob_B(c,ri,
443  rules(r).mother(),
444  rules(r).daughter1(),
445  rules(r).daughter2());
446  else
447  reestimate_rule_prob_U(c,
448  ri,
449  rules(r).mother(),
450  rules(r).daughter1());
451  }
452  lPc += safe_log(f_P(c));
453  mC += corpus.a_no_check(c).length();
454  clear_io_cache(c);
455  }
456  printf("\n");
457 
458  for (se=0.0,ri=0,r=rules.head(); r != 0; r=r->next(),ri++)
459  {
460  double n_prob = n[ri]/d[ri];
461  if (d[ri] == 0)
462  n_prob = 0;
463  se += (n_prob-rules(r).prob())*(n_prob-rules(r).prob());
464  rules(r).set_prob(n_prob);
465  }
466  printf("pass %d cross entropy %g RMSE %f %f %d\n",
467  pass,-(lPc/mC),(rules.length()?sqrt(se/rules.length()):INFINITY),
468  se,rules.length());
469 
470  if (checkpoint != -1 && checkpoint != 0)
471  {
472  if ((pass % checkpoint) == checkpoint-1)
473  {
474  char cp[20];
475  sprintf(cp,".%03d",pass);
476  save(outfile+cp);
477  user_gc(NIL); // just to keep things neat
478  }
479  }
480 
481  }
482 }
483 
485  int startpass,
486  int checkpoint,
487  int spread,
488  const EST_String &outfile)
489 {
490  // Train a Stochastic CFG using the inside outside algorithm
491 
492  reestimate_grammar_probs(passes, startpass, checkpoint,
493  spread, outfile);
494 }
495 
496 void EST_SCFG_traintest::init_io_cache(int c,int nt)
497 {
498  // Build an array to cache the in/out values
499  int i,j,k;
500  int mc = corpus.a_no_check(c).length()+1;
501 
502  inside = new double**[nt];
503  outside = new double**[nt];
504  for (i=0; i < nt; i++)
505  {
506  inside[i] = new double*[mc];
507  outside[i] = new double*[mc];
508  for (j=0; j < mc; j++)
509  {
510  inside[i][j] = new double[mc];
511  outside[i][j] = new double[mc];
512  for (k=0; k < mc; k++)
513  {
514  inside[i][j][k] = -1;
515  outside[i][j][k] = -1;
516  }
517  }
518  }
519 }
520 
521 void EST_SCFG_traintest::clear_io_cache(int c)
522 {
523  int mc = corpus.a_no_check(c).length()+1;
524  int i,j;
525 
526  if (inside == 0)
527  return;
528 
529  for (i=0; i < num_nonterminals(); i++)
530  {
531  for (j=0; j < mc; j++)
532  {
533  delete [] inside[i][j];
534  delete [] outside[i][j];
535  }
536  delete [] inside[i];
537  delete [] outside[i];
538  }
539 
540  delete [] inside;
541  delete [] outside;
542 
543  inside = 0;
544  outside = 0;
545 }
546 
547 double EST_SCFG_traintest::cross_entropy()
548 {
549  double lPc=0,mC=0;
550  int c;
551 
552  for (c=0; c < corpus.length(); c++)
553  {
554  lPc += log(f_P(c));
555  mC += corpus.a_no_check(c).length();
556  }
557 
558  return -(lPc/mC);
559 }
560 
562 {
563  // Test corpus against current grammar.
564  double mC,lPc;
565  int c,i;
566  int failed=0;
567  double fP;
568 
569  // Lets try simply finding the cross entropy
570  n.resize(rules.length());
571  d.resize(rules.length());
572  for (i=0; i < rules.length(); i++)
573  d[i] = n[i] = 0.0;
574 
575  for (mC=0.0,lPc=0.0,c=0; c < corpus.length(); c++)
576  {
577  if (corpus.length() > 50)
578  {
579  printf(" %d",c);
580  fflush(stdout);
581  }
582  init_io_cache(c,num_nonterminals());
583  fP = f_P(c);
584  if (fP == 0)
585  failed++;
586  else
587  {
588  lPc += safe_log(fP);
589  mC += corpus.a_no_check(c).length();
590  }
591  clear_io_cache(c);
592  }
593  if (corpus.length() > 50)
594  printf("\n");
595 
596  cout << "cross entropy " << -(lPc/mC) << " (" << failed << " failed out of " <<
597  corpus.length() << " sentences )" << endl;
598 
599 }
600 
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition: EST_SCFG.h:86
void set_bracketed_string(LISP string)
void set_rule_prob_cache()
(re-)set rule probability caches
Definition: EST_SCFG.cc:260
double safe_log(const double x)
Definition: EST_math.h:158
double prob_B(int p, int q, int r) const
The rule probability of given binary rule.
Definition: EST_SCFG.h:230
void fill(const T &v)
Fill entire array will value v.
Definition: EST_TVector.cc:105
#define NIL
Definition: siod_defs.h:92
int siod_llength(LISP list)
Definition: siod.cc:202
INLINE const T & a_no_check(ssize_t n) const
read-only const access operator: without bounds checking
Definition: EST_TVector.h:254
void gc_unprotect(LISP *location)
Definition: slib.cc:759
A class representing a stochastic context free grammar (SCFG).
Definition: EST_SCFG.h:182
This class represents a bracketed string used in training of SCFGs.
Definition: EST_SCFG.h:57
int distinguished_symbol() const
Definition: EST_SCFG.h:212
int num_nonterminals() const
Number of nonterminals.
Definition: EST_SCFG.h:226
LISP user_gc(LISP args)
Definition: slib.cc:1269
EST_write_status save(const EST_String &filename)
Save current grammar to named file.
Definition: EST_SCFG.cc:208
EST_UItem * next()
Definition: EST_UList.h:55
int length() const
Definition: EST_SCFG.h:79
LISP vload(const char *fname, long cflag)
Definition: slib_file.cc:632
EST_String terminal(int m) const
Convert terminal index to string form.
Definition: EST_SCFG.h:220
void resize(ssize_t n, int set=1)
Definition: EST_TVector.cc:196
INLINE ssize_t length() const
number of items in vector.
Definition: EST_TVector.h:249
SCFGRuleList rules
The rules themselves.
Definition: EST_SCFG.h:211
EST_String outfile
LISP consp(LISP x)
Definition: slib_list.cc:112
void load_corpus(const EST_String &filename)
void set_corpus(EST_Bcorpus &b, LISP examples)
const EST_String symbol_at(int i) const
The nth symbol in the string.
Definition: EST_SCFG.h:83
int length() const
Definition: EST_UList.cc:57
Template vector.
Definition: EST_TVector.h:145
EST_UItem * head() const
Definition: EST_UList.h:97
void gc_protect(LISP *location)
Definition: slib.cc:791
void train_inout(int passes, int startpass, int checkpoint, int spread, const EST_String &outfile)
LISP car(LISP x)
Definition: slib_list.cc:115
double prob_U(int p, int m) const
The rule probability of given unary rule.
Definition: EST_SCFG.h:232
LISP fp
Definition: kkcompile.cc:63
#define TRUE
Definition: EST_bool.h:118
#define CONSP(x)
Definition: siod_defs.h:153
EST_Item * daughter1(const EST_Item *n)
return first daughter of n
void resize(int n, int set=1)
resize vector
LISP cdr(LISP x)
Definition: slib_list.cc:124