Edinburgh Speech Tools  2.1-release
EST_PST.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : July 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A general class for PredictionSuffixTrees */
38 /* */
39 /*=======================================================================*/
40 
41 #include <cstdlib>
42 #include <iostream>
43 #include <fstream>
44 #include <cstring>
45 #include "EST_String.h"
46 #include "EST_types.h"
47 #include "EST_Token.h"
48 #include "EST_PST.h"
49 #include "EST_multistats.h"
50 
51 using namespace std;
52 
54 
55 // Out of vocabulary identifier
58 
59 void slide(EST_IVector &i, const int l);
60 void slide(EST_StrVector &i, const int l);
61 
63 {
64 }
65 
66 void
68 {
69  // Print out path and frequency for each leaf
70 
71  if (p_level == 0)
72  {
73  // Base -- print from pd
74  EST_String s;
75  double freq;
76  EST_Litem *i;
77  for (i = pd.item_start();
78  !pd.item_end(i);
79  i=pd.item_next(i))
80  {
81  pd.item_freq(i,s,freq);
82  os << get_path() << " " << s << " : " << freq << endl;
83  }
84  }
85  else
86  {
88  for (t.begin(nodes); t; t++)
89  pstnode(t->v)->print_freqs(os);
90  }
91 }
92 
93 void
95 {
96  // Print out path and probability distributions for node above leafs
97 
98  if (p_level == 0)
99  {
100  // Base -- print from pd
101  EST_String s;
102  double prob;
103  os << get_path() << " :";
104  for (EST_Litem *i = pd.item_start(); !pd.item_end(i) ; i=pd.item_next(i))
105  {
106  pd.item_prob(i,s,prob);
107  os << " " << s << " " << prob;
108  }
109  os << endl;
110  }
111  else
112  {
114  for (t.begin(nodes); t; t++)
115  pstnode(t->v)->print_probs(os);
116  }
117 }
118 
119 const EST_String &
121 {
122  // Find most probable symbol in this node
123  return pd.most_probable(prob);
124 }
125 
127 {
128  p_order = 0;
129  num_states = 0;
130  nodes = 0;
131  pd = 0;
132 }
133 
134 void
136 {
137  p_order = order;
138  num_states = 0;
140  nodes->set_level(order-1);
142 }
143 
145 {
146  delete nodes;
147  delete pd;
148 }
149 
150 void
152 {
153  delete nodes;
154  delete pd;
155  pd = 0;
156 }
157 
160  const EST_StrVector &words,
161  const int index) const
162 {
163  // Return the probability distribution for this EST_PredictionSuffixTree context
164 
165  if (words.n() == index+1){
166  return node->prob_dist();
167  }else
168  {
171  next = pstnode(node->nodes.f(words(index),est_val(n)));
172  if (next == 0)
173  {
174  //cerr << "EST_PredictionSuffixTree: "
175  //<< "no EST_PredictionSuffixTree probabilities for context \n";
177  }
178  return p_prob_dist(next,words,index+1);
179  }
180 }
181 
182 
183 void
185  const double count,
186  const int index)
187 {
188 
189 /*
190  cerr << "accumulate ";
191  for (int i=0; i < p_order-1; i++)
192  cerr << words(i+index) << " ";
193  cerr << count << endl;
194  */
195 
196  if (words.n()+index < p_order)
197  cerr << "EST_PredictionSuffixTree: accumlating window is wtoo small"
198  << endl;
199  else
200  {
201  pd->cumulate(words(p_order-1+index),count); // a little extra book-keeping
202  p_accumulate(nodes,words,count,index);
203  }
204 }
205 
206 void
208  const EST_StrVector &words,
209  double count,
210  const int index)
211 {
212  /* Expand tree with new gram */
213  if (words.n() == index+1)
214  {
215  if (node->prob_dist().samples() == 0) // A new terminal node
216  node->set_state(num_states++);
217  node->cumulate(words(index),count);
218  }
219  else
220  {
223  next = pstnode(node->nodes.f(words(index),est_val(n)));
224  if (next == 0)
225  {
227  if (node->get_path() == "")
228  next->set_path(words(index));
229  else
230  {
231  next->set_path(node->get_path()+" "+ words(index));
232  }
233  next->set_level(node->get_level()-1);
234  node->nodes.set_val(words(index),est_val(next));
235  }
236  p_accumulate(next,words,count,index+1);
237  }
238 }
239 
240 double
242 {
243  // Reverse probability. What is prob of this context given predictee
244  const EST_DiscreteProbDistribution &pg = prob_dist(words);
245  double d1 = pg.frequency(words(order()-1));
246  double d2 = pd->frequency(words(order()-1));
247  double d3 = d1/d2;
248  return d3;
249 }
250 
251 double
253  const EST_DiscreteProbDistribution &pg) const
254 {
255  // Reverse probability. What is prob of this context given predictee
256  return (double)pg.frequency(words(order()-1)) /
257  pd->frequency(words(order()-1));
258 }
259 
260 const
262 {
263  double p;
264  int state;
265  return ppredict(nodes,words,&p,&state);
266 }
267 
268 const
270 {
271  int state;
272  return ppredict(nodes,words,p,&state);
273 }
274 
275 const
277 {
278  return ppredict(nodes,words,p,state);
279 }
280 
281 const
283  const EST_StrVector &words,
284  double *p,int *state,
285  const int index) const
286 {
287  /* Given the context whats the most probably symbol (or OOV) */
288  if (words.n() == index+1)
289  {
290  *state = node->get_state();
291  return node->most_probable(p);
292  }
293  else
294  {
297  next = pstnode(node->nodes.f(words(index),est_val(n)));
298  if (next == 0)
299  {
300  *p = 0.0;
301  *state = 0; // No not really acceptable
302  return PredictionSuffixTree_oov; /* utterly improbable -- i.e. not in EST_PredictionSuffixTree */
303  }
304  else
305  return ppredict(next,words,p,state,index+1);
306  }
307 }
308 
309 void
311 {
312  // Print a list of all PredictionSuffixTrees with their frequencies
313 
314  os << "EST_PredictionSuffixTree order=" << p_order << endl;
315  nodes->print_freqs(os);
316 
317 }
318 
319 void
321 {
322  // Print a list of all grams(-last) with their probability distributions
323 
324  os << "EST_PredictionSuffixTree " << p_order << endl;
325  nodes->print_probs(os);
326 
327 }
328 
329 int
330 EST_PredictionSuffixTree::save(const EST_String filename, const EST_PredictionSuffixTree::EST_filetype type)
331 {
332  // Save an EST_PredictionSuffixTree to file -- actual the grams and frequencies
333  (void)type;
334 
335  if (filename == "-")
336  print_freqs(cout);
337  else
338  {
339  ofstream os(filename);
340  print_freqs(os);
341  }
342  return 0;
343 }
344 
345 int
347 {
348  // Load an EST_PredictionSuffixTree for given file
349  EST_TokenStream ts;
350  int i,order,freq;
351 
352  clear();
353  if (ts.open(filename) != 0)
354  {
355  cerr << "EST_PredictionSuffixTree: failed to open \"" << filename << "\" for reading\n";
356  return -1;
357  }
358  ts.set_SingleCharSymbols(":");
359 
360  if (ts.get() != "EST_PredictionSuffixTree") // magic number
361  {
362  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" not an EST_PredictionSuffixTree\n";
363  return -1;
364  }
365 
366  order = atoi(ts.get().string());
367  if ((order < 1) || (order > 10))
368  {
369  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" has bad order\n";
370  return -1;
371  }
372  init(order);
373  //EST_String const* *window = new EST_String const*[order+1];
374  EST_StrVector window(order);
375  for (i=0; i<p_order; i++)
376  window[i] = "";
377 
378  while (!ts.eof())
379  {
380  slide(window,-1);
381  window[p_order-1] = ts.get().string();
382  if (ts.get() != ":")
383  {
384  cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" missed parsed line ";
385  cerr << ts.linenum() << " near EST_PredictionSuffixTree\n";
386  for (i=0; i < order; i++)
387  cout << " " << window(i);
388  cout << endl;
389  }
390 
391  freq = atoi(ts.get().string());
392  accumulate(window,freq);
393  }
394 
395  return 0;
396 }
397 
398 void
400  const EST_String prev,
401  const EST_String prev_prev,
402  const EST_String last)
403 {
404  EST_TokenStream ts;
405  int i;
406 
407  if (filename == "-")
408  ts.open(stdin, FALSE);
409  else if (ts.open(filename) == -1)
410  return;
411 
412  //EST_String const* *window = new EST_String const*[p_order+1];
413  EST_StrVector window(p_order);
414  for (i=0; i<p_order-1; i++)
415  window[i] = prev_prev;
416  window[p_order-1] = prev;
417 
418  accumulate(window,1);
419 
420  //window[p_order] = 0; // end of array marker
421 
422  while (!ts.eof())
423  {
424  slide(window,-1);
425  window[p_order-1] = ts.get().string();
426  accumulate(window,1);
427  }
428 
429  // and finish off
430 
431  slide(window,-1);
432  window[p_order-1] = last;
433  accumulate(window,1);
434 
435 }
436 
437 void
439 {
440 
441  // knackered - fix
442 
443 
444  // adapted from build(..) above
445  // but could be made more efficient
446 
447  int i;
448  //EST_String const* *window = new EST_String const *[p_order+1];
449  EST_StrVector window(p_order);
450  for (i=0; i<p_order; i++)
451  window[i] = "";
452 
453  EST_Litem *i_ptr;
454  for(i_ptr=input.head();i_ptr!=NULL;i_ptr=i_ptr->next())
455  {
456  slide(window,-1);
457  window[p_order-1] = input(i_ptr);
458  accumulate(window,1);
459  }
460 }
461 
462 
463 void
465 {
466  // Run the current EST_PredictionSuffixTree over the given file name and generate
467  // statistics of its prediction power for the contents of that
468  // file. Use the confusion function
469  EST_StrStr_KVL pairs;
470  EST_StrList lex;
471  EST_TokenStream ts;
472  int i;
473 
474  if (filename == "-")
475  ts.open(stdin, FALSE);
476  else if (ts.open(filename) == -1)
477  return;
478 
479  /* The top level nodes list will always contain all the tokens */
480  /* Build the lexicon for confusion matrix */
482  for (p.begin(nodes->nodes); p; p++)
483  lex.append(p->k);
484  lex.append("_OOV_");
485 
486  EST_StrVector window(p_order);
487  //EST_String const* *window = new EST_String const*[p_order+1];
488  for (i=0; i<p_order; i++)
489  window[i] = "";
490  double e=0;
491  int num_tsamples = 0;
492 
493  while (!ts.eof())
494  {
495  slide(window,-1);
496  window[p_order-1] = ts.get().string();
497  const EST_DiscreteProbDistribution &pdist = prob_dist(window);
498  e += pdist.entropy();
499  num_tsamples++;
500  // cumulate real and predicted
501  pairs.add_item(window(p_order-1),predict(window),1);
502  }
503 
504  const EST_FMatrix &m = confusion(pairs,lex);
505  print_confusion(m,pairs,lex);
506  cout << "Mean entropy (?) is " << e/num_tsamples << endl;
507 
508 
509 }
510 
const EST_DiscreteProbDistribution & p_prob_dist(EST_PredictionSuffixTree_tree_node *node, const EST_StrVector &words, const int index=0) const
Definition: EST_PST.cc:159
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
void set_val(const EST_String &name, const EST_Val &sval)
Definition: EST_Features.h:217
double samples(void) const
Total number of example found.
EST_Val est_val(const EST_Item_featfunc f)
Definition: item_feats.cc:122
void p_accumulate(EST_PredictionSuffixTree_tree_node *node, const EST_StrVector &words, double count, const int index=0)
Definition: EST_PST.cc:207
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344
int get_level(void) const
Definition: EST_PST.h:72
const EST_DiscreteProbDistribution PSTnullProbDistribution
Definition: EST_PST.cc:57
const EST_String & most_probable(double *p) const
Definition: EST_PST.cc:120
#define VAL_REGISTER_CLASS(NAME, CLASS)
Definition: EST_Val_defs.h:62
int index(EST_TList< T > &l, T &val, bool(*eq)(const EST_UItem *, const EST_UItem *)=NULL)
Definition: EST_TList.h:286
void cumulate(const EST_String &s, double count=1)
Definition: EST_PST.h:73
EST_UItem * next()
Definition: EST_UList.h:55
int get_state(void) const
Definition: EST_PST.h:71
void build(const EST_String filename, const EST_String prev, const EST_String prev_prev, const EST_String last)
Definition: EST_PST.cc:399
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
void print_confusion(const EST_FMatrix &a, EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:77
int eof()
end of file
Definition: EST_Token.h:362
double frequency(const EST_String &s) const
const EST_Val & f(const EST_String &path) const
Definition: EST_Features.h:115
void accumulate(const EST_StrVector &words, const double count=1, const int index=0)
Definition: EST_PST.cc:184
void slide(EST_IVector &i, const int l)
double entropy(void) const
EST_FMatrix confusion(EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:59
int linenum(void) const
returns line number of EST_TokenStream
Definition: EST_Token.h:360
double rev_prob(const EST_StrVector &words) const
Definition: EST_PST.cc:241
const EST_String PredictionSuffixTree_oov("_OOV_")
#define FALSE
Definition: EST_bool.h:119
const EST_String & predict(const EST_StrVector &words) const
Definition: EST_PST.cc:261
section options Options< strong > or ngram_per_line Pseudo words
NULL
Definition: EST_WFST.cc:55
const EST_DiscreteProbDistribution & prob_dist() const
Definition: EST_PST.h:76
const EST_String & ppredict(EST_PredictionSuffixTree_tree_node *node, const EST_StrVector &words, double *prob, int *state, const int index=0) const
Definition: EST_PST.cc:282
int save(const EST_String filename, const EST_PredictionSuffixTree::EST_filetype type=PredictionSuffixTree_ascii)
Definition: EST_PST.cc:330
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
void print_probs(ostream &os)
Definition: EST_PST.cc:94
int load(const EST_String filename)
Definition: EST_PST.cc:346
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition: EST_TKVL.cc:248
void init(const int order)
Definition: EST_PST.cc:135
void print_freqs(ostream &os)
Definition: EST_PST.cc:310
void begin(const Container &over)
Set the iterator ready to run over this container.
Template vector.
Definition: EST_TVector.h:145
EST_UItem * head() const
Definition: EST_UList.h:97
void test(const EST_String filename)
Definition: EST_PST.cc:464
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251
void set_path(const EST_String &s)
Definition: EST_PST.h:68
void print_probs(ostream &os)
Definition: EST_PST.cc:320
void print_freqs(ostream &os)
Definition: EST_PST.cc:67
const EST_String & get_path(void) const
Definition: EST_PST.h:67