Edinburgh Speech Tools  2.1-release
freqsmooth.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Paul Taylor & Simon King & Alan W Black */
34 /* Date : April 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Yet another method for smoothing an ngram */
38 /* */
39 /* Good_Turing smooth n-grammar so there are no zero occurrences */
40 /* For each ngram */
41 /* if |n-1gram| < threshold */
42 /* F(ngram) = !n-1gram|*(P(n-1gram)) */
43 /* */
44 /*=======================================================================*/
45 #include <iostream>
46 #include <cstring>
47 #include <cmath>
48 #include <climits>
49 #include <cfloat>
50 #include "EST_Ngrammar.h"
51 
52 using namespace std;
53 
54 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
55  int order,const EST_StrVector words,
56  int smooth_thresh);
57 
58 void Ngram_freqsmooth(EST_Ngrammar &ngram,int smooth_thresh1,
59  int smooth_thresh2)
60 {
61  // To assign reasonable frequencies for all ngrams in grammar
62  EST_Ngrammar *backoff_ngrams;
63  backoff_ngrams = new EST_Ngrammar[ngram.order()-1];
64 
65  Good_Turing_smooth(ngram,smooth_thresh1);
66 
67  fs_build_backoff_ngrams(backoff_ngrams,ngram);
68 
69  fs_backoff_smooth(backoff_ngrams,ngram,smooth_thresh2);
70 
71  delete [] backoff_ngrams;
72 
73 }
74 
76  EST_Ngrammar &ngram)
77 {
78  // Build all the backoff grammars back to uni-grams
79  int i,j,l;
80  EST_Litem *k;
81 
82  for (i=0; i < ngram.order()-1; i++)
83  backoff_ngrams[i].init(i+1,EST_Ngrammar::dense,
84  *ngram.vocab,*ngram.pred_vocab);
85 
86  for (i=0; i < ngram.num_states(); i++)
87  {
88  const EST_StrVector words = ngram.make_ngram_from_index(i);
89 
90  for (k=ngram.p_states[i].pdf().item_start();
91  !ngram.p_states[i].pdf().item_end(k);
92  k = ngram.p_states[i].pdf().item_next(k))
93  {
94  double freq;
95  EST_String name;
96  ngram.p_states[i].pdf().item_freq(k,name,freq);
97  // Build all the sub-ngrams and accumulate them
98  for (j=0; j < ngram.order()-1; j++)
99  {
100  EST_StrVector nnn(j+1);
101  nnn[j] = name;
102  for (l=0; l < j; l++)
103  nnn[l] = words(ngram.order()-1-j);
104  backoff_ngrams[j].accumulate(nnn,freq);
105  }
106  }
107  }
108 
109 }
110 
111 int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
112  EST_Ngrammar &ngram, int smooth_thresh)
113 {
114  // For all ngrams which are too infrequent, adjust their
115  // frequencies based on their backoff probabilities
116  int i;
117  EST_Litem *j;
118  double occurs;
119  double backoff_prob;
120 
121  if (ngram.representation() != EST_Ngrammar::dense)
122  {
123  cerr << "Ngrammar: can only ptsmooth dense ngrammars" << endl;
124  return FALSE;
125  }
126  else
127  {
128  for (i=0; i < ngram.num_states(); i++)
129  {
130  if (ngram.p_states[i].pdf().samples() < smooth_thresh)
131  {
132  EST_DiscreteProbDistribution &pdf = ngram.p_states[i].pdf();
133  occurs = ngram.p_states[i].pdf().samples();
135  words.resize(words.n()+1);
136 
137  for (j=pdf.item_start();
138  ! pdf.item_end(j);
139  j = pdf.item_next(j))
140  {
141  EST_String name;
142  double freq;
143  pdf.item_freq(j,name,freq);
144  words[words.n()-1] = name;
145  backoff_prob =
146  fs_find_backoff_prob(backoff_ngrams,
147  ngram.order()-1,
148  words,
149  smooth_thresh);
150  pdf.set_frequency(j,occurs*backoff_prob);
151  }
152  }
153  }
154  }
155 
156  return TRUE;
157 }
158 
159 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
160  int order,const EST_StrVector words,
161  int smooth_thresh)
162 {
163  // Find the backoff prob for n-1gram
164  EST_StrVector nnn;
165  int i;
166 
167  if (order == 0)
168  return TINY_FREQ; // ultimate floor
169 
170  nnn.resize(order);
171 
172  for(i=0; i<order; i++)
173  nnn[order-1-i] = words(words.n()-1-i);
174 
175  if (backoff_ngrams[order-1].frequency(nnn) < smooth_thresh)
176  return fs_find_backoff_prob(backoff_ngrams,
177  order-1,words,smooth_thresh);
178  else
179  return backoff_ngrams[order-1].probability(nnn);
180 }
int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram, int smooth_thresh)
Definition: freqsmooth.cc:111
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
#define TINY_FREQ
Definition: EST_Ngrammar.h:70
double samples(void) const
Total number of example found.
void accumulate(const EST_StrVector &words, const double count=1)
EST_Discrete * vocab
Definition: EST_Ngrammar.h:283
EST_Litem * item_start() const
Used for iterating through members of the distribution.
EST_Discrete * pred_vocab
Definition: EST_Ngrammar.h:284
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276
int num_states(void) const
Definition: EST_Ngrammar.h:413
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
void resize(ssize_t n, int set=1)
Definition: EST_TVector.cc:196
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
representation_t representation() const
Definition: EST_Ngrammar.h:425
#define FALSE
Definition: EST_bool.h:119
section options Options< strong > or ngram_per_line Pseudo words
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
void set_frequency(const EST_String &s, double c)
const EST_StrVector & make_ngram_from_index(const int i) const
int order() const
Definition: EST_Ngrammar.h:415
void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram)
Definition: freqsmooth.cc:75
#define TRUE
Definition: EST_bool.h:118
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)
Definition: freqsmooth.cc:58