Edinburgh Speech Tools  2.1-release
ngrammar_aux.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Auxiliary functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <cstring>
42 #include "EST_String.h"
43 #include "EST_Ngrammar.h"
44 
45 using namespace std;
46 
47 static bool
48 ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last)
49 {
50  // fit the function N(r) = e^a * r^b where a,b are constants
51  // to the points N(first)...N(last) inclusive
52  // i.e. a straight line fit in the (natural) log domain
53  // minimising the mean squared error (in log domain), is :
54 
55  // limitation : r must be an integer (it is the index of v)
56 
57 
58  // default
59  if (last == -1)
60  last = N.n()-1;
61 
62  if (first < 0)
63  {
64  cerr << "ExponentialFit : first must be >= 0" << endl;
65  return false;
66  }
67 
68  if (last >= N.n()-1)
69  {
70  cerr << "ExponentialFit : last must be < N.n()-1 = " << N.n()-1 << endl;
71  }
72 
73  if (first == last)
74  {
75  a=log(N(first));
76  b=0;
77  return true;
78  }
79 
80  double ElnNr=0.0,ElnNrlnr=0.0,
81  Elnr=0.0,Elnr2=0.0,
82  R=0.0;
83 
84  for(int r=first;r<=last;r++)
85  {
86  R += 1;
87  if ( N(r) > 0)
88  {
89  ElnNr += log( N(r) );
90  ElnNrlnr += log( N(r) ) * log( (double)r );
91  }
92  Elnr += log( (double)r );
93  Elnr2 += log( (double)r ) * log( (double)r );
94  }
95 
96  // and the answer is :
97  b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
98  a = (ElnNr - (b*Elnr) ) / R;
99 
100  return true;
101 }
102 
103 static bool
104 smooth_ExponentialFit(EST_DVector &N, int first, int last)
105 {
106  double a=0.0,b=0.0;
107 
108  if (!ExponentialFit(N,a,b,first,last))
109  {
110  cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl;
111  return false;
112  }
113 
114  for(int r=first;r<=last;r++)
115  N[r] = exp(a)* pow((double)r, b);
116 
117  return true;
118 }
119 
121 {
122  EST_Litem *k;
123  double freq;
124  EST_String name;
125 
126  EST_DVector *ff = (EST_DVector*)params;
127  int max = ff->n();
128  for (k=s->pdf_const().item_start();
129  !s->pdf_const().item_end(k);
130  k = s->pdf_const().item_next(k))
131  {
132  s->pdf_const().item_freq(k,name,freq);
133  if(freq+0.5 < max)
134  (*ff)[(int)(freq+0.5)] += 1;
135 
136  }
137 
138 
139 }
140 
141 void get_max_f(EST_BackoffNgrammarState *s,void *params)
142 {
143  EST_Litem *k;
144  double freq;
145  EST_String name;
146 
147  double *max = (double*)params;
148  for (k=s->pdf_const().item_start();
149  !s->pdf_const().item_end(k);
150  k = s->pdf_const().item_next(k))
151  {
152  s->pdf_const().item_freq(k,name,freq);
153  if(freq > *max)
154  *max = freq;
155 
156  }
157 
158 
159 }
160 
162 {
163  EST_Litem *k;
164  double freq;
165  EST_String name;
166 
167  //cerr << "map_f_of_f : visited " << *s << endl;
168 
169  EST_DVector *map = (EST_DVector*)params;
170  int max = map->n();
171  for (k=s->pdf_const().item_start();
172  !s->pdf_const().item_end(k);
173  k = s->pdf_const().item_next(k))
174  {
175  s->pdf_const().item_freq(k,name,freq);
176  if (freq+0.5 < max)
177  {
178  double nfreq = (*map)((int)(freq+0.5));
179  s->pdf().set_frequency(name,nfreq);
180 
181  }
182 
183  }
184 
185 }
186 
188 {
189  EST_Litem *k;
190  double freq;
191  EST_String name;
192 
193  double *min = (double*)params;
194  for (k=s->pdf_const().item_start();
195  !s->pdf_const().item_end(k);
196  k = s->pdf_const().item_next(k))
197  {
198  s->pdf_const().item_freq(k,name,freq);
199  if (freq < *min)
200  {
201  //cerr << "zeroing " << freq << " < " << *min << endl;
202  s->pdf().override_frequency(k,0.0);
203  }
204  }
205 }
206 
208 {
209  int i,size;
210  EST_Litem *k;
211  double max=0.0;
212 
213  // if ff has zero size, do complete frequency of frequencies
214  // otherwise just fill in frequencies from 1 to ff.n()-1
215 
216  bool complete = (bool)(ff.n() == 0);
217 
218  switch(n.representation())
219  {
220 
222  case EST_Ngrammar::dense:
223  {
224  size = n.num_states();
225  if (complete)
226  {
227  // find highest frequency in EST_Ngram
228  for(i=0;i<size;i++)
229  if (n.p_states[i].pdf().samples() > max)
230  max = n.p_states[i].pdf().samples();
231 
232  ff.resize((int)(max+1.5));
233  ff.fill(0.0);
234  }
235 
236  // Sum the frequencies
237  for(i=0;i<size;i++)
238  {
239  for (k=n.p_states[i].pdf().item_start();
240  !n.p_states[i].pdf().item_end(k);
241  k = n.p_states[i].pdf().item_next(k))
242  {
243  EST_String name;
244  double freq;
245  n.p_states[i].pdf().item_freq(k,name,freq);
246  ff[(int)(freq+0.5)] += 1;
247  }
248  }
249 
250  if (complete)
251  {
252  // fill in ff(0) - the number of unseen ngrams
253 
254  double total=0;
255  for (i=1;i<ff.n();i++)
256  total += ff(i);
257 
258  ff[0] = pow(float(n.get_vocab_length()),float(n.order())) - total;
259  }
260  }
261  break;
262 
263 
265  {
266  if (complete)
267  {
269  &get_max_f,(void*)(&max),
270  this_order-1);
271  ff.resize((int)(max+1.5));
272  }
273 
274  // backoff grammar has grammars of orders
275  // order, order-1, ..., 1
276  // and we treat each order independently
277 
278  for (i=0;i<ff.n();i++)
279  ff[i] = 0;
280 
282  &make_f_of_f,(void*)(&ff),
283  this_order-1);
284 
285  if (complete)
286  {
287  // fill in ff(0) - the number of unseen ngrams
288  double total=0;
289  for (i=1;i<ff.n();i++)
290  total += ff(i);
291  ff[0] = pow(float(n.get_vocab_length()),float(this_order)) - total;
292 
293 
294 
295  }
296  }
297  break;
298 
299  default:
300  cerr << "unknown representation for EST_Ngrammar" << endl;
301  break;
302  }
303 
304 }
305 
306 void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
307 {
308  int i;
309  EST_Litem *k;
310 
311  switch(n.representation())
312  {
313 
315  case EST_Ngrammar::dense:
316  {
317  int size = n.p_num_states;
318 
319  for(i=0;i<size;i++)
320  {
321  for (k=n.p_states[i].pdf().item_start();
322  !n.p_states[i].pdf().item_end(k);
323  k = n.p_states[i].pdf().item_next(k))
324  {
325  EST_String name;
326  double freq,nfreq;
327  n.p_states[i].pdf().item_freq(k,name,freq);
328  nfreq = map((int)(freq+0.5));
329  n.p_states[i].pdf().set_frequency(name,nfreq);
330 
331 
332  }
333  }
334  }
335  break;
336 
338  {
339 
340  // backoff grammar has grammars of orders
341  // order, order-1, ..., 1
342  // and we treat each order independently
344  &map_f_of_f,(void*)(&map),
345  this_order-1);
346 
347  }
348  break;
349 
350  default:
351  cerr << "unknown representation for EST_Ngrammar" << endl;
352  break;
353 
354  }
355 }
356 
357 static void
358 adjusted_frequencies_BasicGoodTuring(EST_DVector &M,
359  const EST_DVector &N,
360  int maxcount)
361 {
362  // produce a map of frequencies, given a (smoothed) distribution
363  // only map up to a frequency of maxcount; beyond that
364  // map(r) == r
365 
366  if (maxcount > N.n()-2)
367  {
368  maxcount = N.n()-2;
369  cerr << "adjusted_frequencies_BasicGoodTuring :";
370  cerr << " maxcount is too big, reducing it to " << maxcount << endl;
371  }
372 
373  M.resize(N.n());
374  int r;
375  for(r=0; r<=maxcount;r++)
376  {
377  // don't like this bit, but what can you do ?
378  if( (N(r+1) == 0) || (N(r) == 0) )
379  M[r] = r;
380  else
381  M[r] = (r + 1) * N(r+1) / N(r);
382  }
383  // and do not map higher counts
384  for(;r<N.n();r++)
385  M[r]=r;
386 
387  return;
388 
389 }
390 
391 static void
392 smoothed_frequency_distribution_ExponentialFit(EST_DVector &N,int maxcount)
393 {
394  if (maxcount > N.n()-2)
395  {
396  maxcount = N.n()-2;
397  cerr << "smoothed_frequency_distribution_ExponentialFit :"
398  << " maxcount too big, reducing it to " << maxcount << endl;
399  }
400  // the idea to fit this particular fn was by Steve Isard
401  // we ignore r=0 in doing the fit
402 
403  if (!smooth_ExponentialFit(N,1,maxcount+1))
404  cerr << "smooth_ExponentialFit failed !" << endl;
405 
406  return;
407 }
408 
409 bool
410 Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
411 {
412  // works for any N
413  // since it just remaps frequencies
414 
415  // frequencies above maxcount are not changed
416  // for discounting -- see discount routine !
417 
418 
419  if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
420  {
421  cerr << "EST_Ngram: cannot Good-Turing smooth ngram:" <<
422  " entries are not frequencies" << endl;
423  return false;
424  }
425 
426  switch(ngrammar.representation())
427  {
428 
430  case EST_Ngrammar::dense:
431  {
432  EST_DVector freqs,mapped_freqs;
433  // grammar is of a single order - simple
434  // Find frequency distribution
435  frequency_of_frequencies(freqs,ngrammar);
436  // smoothing should be optional - to do
437  smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
438  // Build map of frequencies
439  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
440  // Map all frequencies in grammar to Good Turing Smoothed values
441  map_frequencies(ngrammar,mapped_freqs);
442 
443  }
444  break;
445 
447 {
448  (void)mincount;
449 
450  cerr << "Smoothing of backed of grammars is not available!" << endl;
451  return false;
452 
453  /*
454  // need to smooth for each order independently
455  int i,o;
456  double threshold;
457 
458  //for (o=1;o<=ngrammar.order();o++)
459  for(o=ngrammar.order();o<=ngrammar.order();o++)
460  {
461 
462  EST_DVector freqs,mapped_freqs;
463 
464  frequency_of_frequencies(freqs,ngrammar,o);
465 
466  cerr << "FREQ : " << freqs << endl;
467 
468  if(freqs.n() < 2)
469  {
470  // i.e. only unseen things - zero them all
471  threshold = 2; // 1 ?
472  if(o>1)
473  ngrammar.backoff_traverse(ngrammar.backoff_representation,
474  &zero_small_f,
475  (void*)(&threshold),
476  o-1);
477  continue;
478  }
479 
480  int max = maxcount;
481  if(max > freqs.n() - 2)
482  max = freqs.n() - 2;
483 
484  if(max > 2)
485  // need at least 3 points
486  // smoothing should be optional - to do
487  smoothed_frequency_distribution_ExponentialFit(freqs,max);
488 
489  cerr << "SMOOTHED : " << freqs << endl;
490 
491 
492  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
493 
494  // initial map of frequencies modifies total counts
495  // so pdfs are correct
496 
497  map_frequencies(ngrammar,mapped_freqs,o);
498 
499 
500  cerr << "Map for " << o << " : ";
501  for(i=0;i<max;i++)
502  cerr << mapped_freqs(i) << " ";
503  cerr << endl;
504 
505  cerr << "MAP : " << mapped_freqs << endl;
506 
507  // now go and zero the small frequencies
508  // but not for unigram
509 
510 
511  if(mincount < mapped_freqs.n())
512  threshold = mapped_freqs(mincount);
513  else
514  threshold = mapped_freqs(mapped_freqs.n()-1) + 1; // i.e. zero everything
515 
516  //cerr << "min = " << threshold << endl;
517 
518  if(o>1)
519  ngrammar.backoff_traverse(ngrammar.backoff_representation,
520  &zero_small_f,
521  (void*)(&threshold),
522  o-1);
523  */
524 
525 }
526 break;
527 
528 default:
529 cerr << "unknown representation for EST_Ngrammar" << endl;
530 break;
531 
532 }
533 
534 return true;
535 
536 }
537 
538 
539 void
540 Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
541  const double default_discount)
542 {
543 
544  if(ngrammar.representation() != EST_Ngrammar::backoff)
545  {
546  cerr << "Good_Turing_discount is not appropriate for non backoff grammar !"
547  << endl;
548  return;
549  }
550 
551 
552 
553  // method (for each grammar order):
554  // produce Good-Turing smoothed frequency map
555  // compute difference between unsmoothed map (0,1,2,3...)
556  // and GT map - this is the discount
557  // we do not subtract the discount here since we need to
558  // preserve the raw counts
559 
560  // discounting is applied up to frequency 'maxcount'
561  // beyond which we do not discount
562 
563  // to do : zero low frequencies ???? (prune the tree)
564  /*
565  ngrammar.backoff_traverse(ngrammar.backoff_representation,
566  &zero_small_f,
567  (void*)(&threshold),
568  o-1);
569  */
570 
571  // need to treat for each order independently
572  int i,o;
573 
574  // no discounting for order 1 - YES !?!?!?
575  for (o=1;o<=ngrammar.order();o++)
576  {
577 
578  EST_DVector freqs,mapped_freqs;
579  frequency_of_frequencies(freqs,ngrammar,o);
580 
581  int max = maxcount;
582  if(max > freqs.n() - 2)
583  max = freqs.n() - 2;
584 
585  if(max > 2)
586  {
587  // need at least 3 points
588  // smoothing should be optional - to do
589 
590  // April 97
591  // to overcome problems with N(x)=0, shift data points, fit, and shift back
592 
593  for(i=0;i<=max+1;i++)
594  freqs[i] += 1;
595 
596  smoothed_frequency_distribution_ExponentialFit(freqs,max);
597 
598  for(i=0;i<=max+1;i++)
599  {
600  freqs[i] -= 1;
601  // possibly unnecesary check
602  if ( freqs(i) < 0 )
603  freqs[i] = 0;
604  }
605  }
606 
607  adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
608 
609  // and put it in the discount array
610  ngrammar.backoff_discount[o-1].resize(freqs.n());
611  for(i=(int)ngrammar.backoff_threshold;i<=max;i++)
612  {
613  ngrammar.backoff_discount[o-1][i] = (double)i - mapped_freqs(i);
614 
615  // sanity check
616  if( ngrammar.backoff_discount[o-1][i] < 0)
617  {
618  ngrammar.backoff_discount[o-1][i] = 0;
619  }
620  }
621 
622  for(;i<freqs.n();i++)
623  ngrammar.backoff_discount[o-1][i] = default_discount;
624 
625  }
626 }
627 
629 
EST_DVector * backoff_discount
Definition: EST_Ngrammar.h:269
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
void zero_small_f(EST_BackoffNgrammarState *s, void *params)
double samples(void) const
Total number of example found.
EST_BackoffNgrammarState * backoff_representation
Definition: EST_Ngrammar.h:257
void fill(const T &v)
Fill entire array will value v.
Definition: EST_TVector.cc:105
#define VAL_REGISTER_CLASS(NAME, CLASS)
Definition: EST_Val_defs.h:62
EST_Litem * item_start() const
Used for iterating through members of the distribution.
entry_t entry_type() const
Definition: EST_Ngrammar.h:424
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276
int num_states(void) const
Definition: EST_Ngrammar.h:413
void backoff_traverse(EST_BackoffNgrammarState *start_state, void(*function)(EST_BackoffNgrammarState *s, void *params), void *params)
float max(float a, float b)
Definition: EST_cluster.cc:143
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115
void get_max_f(EST_BackoffNgrammarState *s, void *params)
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
void make_f_of_f(EST_BackoffNgrammarState *s, void *params)
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:167
double backoff_threshold
Definition: EST_Ngrammar.h:259
float min(float a, float b)
Definition: EST_cluster.cc:138
representation_t representation() const
Definition: EST_Ngrammar.h:425
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:166
void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n, int this_order)
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
int get_vocab_length() const
Definition: EST_Ngrammar.h:416
getString int
Definition: EST_item_aux.cc:50
void set_frequency(const EST_String &s, double c)
int order() const
Definition: EST_Ngrammar.h:415
void override_frequency(const EST_String &s, double c)
Sets the frequency of named item, without modifying num\_samples.
void map_f_of_f(EST_BackoffNgrammarState *s, void *params)
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251
#define bool
Definition: EST_bool.h:117
void resize(int n, int set=1)
resize vector