Edinburgh Speech Tools 2.1-release — EST_Ngrammar.h (Doxygen source listing; each line below carries its original source line number, and some original lines are absent from this extraction).
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A general class for ngrams (bi-gram, tri-gram etc) */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_NGRAMMAR_H__
41 #define __EST_NGRAMMAR_H__
42 
43 #include <cstdarg>
44 #include <cstdlib>
45 
46 #include "EST_String.h"
47 #include "EST_Val.h"
48 #include "EST_rw_status.h"
49 #include "EST_types.h"
50 #include "EST_FMatrix.h"
51 #include "EST_TList.h"
52 #include "EST_StringTrie.h"
53 #include "EST_simplestats.h"
54 #include "EST_PST.h"
55 #include "EST_string_aux.h"
56 #include "EST_math.h"
57 
58 // HTK style
// Sentence-boundary and out-of-vocabulary pseudo-words, spelled to match
// HTK's conventions so grammars interoperate with HTK tooling.
59 #define SENTENCE_START_MARKER "!ENTER"
60 #define SENTENCE_END_MARKER "!EXIT"
61 #define OOV_MARKER "!OOV"
62 
// Magic number identifying the CSTR binary ngram file format on load/save.
63 #define EST_NGRAMBIN_MAGIC 1315402337
64 
65 // for compressed save/load
// Filename extensions recognised when reading/writing compressed grammars.
66 #define GZIP_FILENAME_EXTENSION "gz"
67 #define COMPRESS_FILENAME_EXTENSION "Z"
68 
69 // Ultimate floor
// Lowest frequency value ever used, so frequencies never reach exactly
// zero.  NOTE(review): value chosen by the original authors — confirm
// before changing.
70 #define TINY_FREQ 1.0e-10
71 
72 // ngram state - represents the N-1 word history and contains
73 // the pdf of the next word
74 
76 
77 private:
78 
79 protected:
81  int p_id; // a 'name'
82 
83 public:
85 
86  p_pdf()
87 
88  {
89  init();
90  };
91  EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};
93  {clear();init(id,pdf);};
95  EST_NgrammarState(const EST_NgrammarState *const s);
97 
98  EST_IVector path; // how we got here
99 
100  // initialise
101  void clear();
102  void init();
103  void init(int id, EST_Discrete *d);
104  void init(int id, const EST_DiscreteProbDistribution &pdf);
105 
106  // build
// Accumulate `count` observations of a next word — identified either by
// vocabulary index or by word string — into this state's distribution.
// Both overloads delegate directly to EST_DiscreteProbDistribution.
107  void cumulate(const int index, const double count=1)
108  {p_pdf.cumulate(index,count);};
109  void cumulate(const EST_String &word, const double count=1)
110  {p_pdf.cumulate(word,count);};
111 
112  // access
// Read-only accessors: the state's numeric 'name' and its next-word pdf.
113  int id() const {return p_id; };
114  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
// Probability / frequency of a candidate next word, by string or by
// vocabulary index; all forward to the underlying distribution.
116  double probability(const EST_String &w) const
117  {return p_pdf.probability(w);}
118  double probability(int w) const {return p_pdf.probability(w);}
119  double frequency(const EST_String &w) const
120  {return p_pdf.frequency(w);}
121  double frequency(int w) const {return p_pdf.frequency(w);}
// Most probable next word (see EST_DiscreteProbDistribution::most_probable
// for the meaning of the optional `prob` out-parameter).
122  const EST_String &most_probable(double *prob = NULL) const
123  {return p_pdf.most_probable(prob);}
124 
125 friend ostream& operator<<(ostream& s, const EST_NgrammarState &a);
126 
127 };
128 
130 
131 private:
132 
133 protected:
134  int p_level; // = 0 for root node
138 
139  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
140  const EST_StrVector &words);
141  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,
142  const EST_IVector &words);
143 public:
145  { init(); };
147  {clear();init(d,level);};
149  {clear();init(pdf,level);};
153 
154  // initialise
155  void clear();
156  void init();
157  void init(const EST_Discrete *d, int level);
158  void init(const EST_DiscreteProbDistribution &pdf, int level);
159 
160  // build
161  bool accumulate(const EST_StrVector &words,
162  const double count=1);
163  bool accumulate(const EST_IVector &words,
164  const double count=1);
165  // access
// Read-only access to this node's pdf over next words.
166  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };
// Probability / frequency / argmax for a candidate word, all delegated
// to the distribution held at this node.
168  double probability(const EST_String &w) const
169  {return p_pdf.probability(w);}
170  double frequency(const EST_String &w) const
171  {return p_pdf.frequency(w);}
172  const EST_String &most_probable(double *prob = NULL) const
173  {return p_pdf.most_probable(prob);}
174 
// Depth of this node in the backoff tree (0 at the root, per p_level).
175  int level() const {return p_level;}
176 
178  {
179  return (EST_BackoffNgrammarState*)children.lookup(word);
180  }
// Child lookup by vocabulary index: the index is mapped to its word name
// via the pdf's discrete vocabulary, then looked up in the `children`
// string trie (lookup returns 0 when there is no such child).
181  EST_BackoffNgrammarState* get_child(const int word) const
182  {
183  return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));
184  }
185 
186  void remove_child(EST_BackoffNgrammarState *child,
187  const EST_String &name);
188 
189  // recursive delete of contents and children
190  void zap();
191 
192  const EST_BackoffNgrammarState * get_state(const EST_StrVector &words) const;
193 
194  bool ngram_exists(const EST_StrVector &words,
195  const double threshold) const;
// Backoff weight stored at this node (argument-less form; the
// EST_StrVector overload below walks the tree first).
196  double get_backoff_weight() const {return backoff_weight; }
197  double get_backoff_weight(const EST_StrVector &words) const;
198  bool set_backoff_weight(const EST_StrVector &words, const double w);
200 
201  void print_freqs(ostream &os,const int order,EST_String followers="");
202 
203 friend ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a);
204 
205 };
206 
207 class EST_Ngrammar;
// Serialisation helpers (HTK ASCII and CSTR ASCII/binary formats); `floor`
// is a minimum value applied to saved frequencies.
208 EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor=0.0);
209 EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0);
210 EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0);
// Smoothing support: collect a frequency-of-frequencies table, remap
// frequencies through it, and Good-Turing smooth/discount a grammar.
// this_order==0 presumably means "the grammar's own order" — TODO confirm
// against the implementation.
211 void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order=0);
212 void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order=0);
213 bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount=0);
214 void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount=0.5);
215 
217 
218 public:
219 
220  // 3 representations : sparse, dense and backed off. User specifies which.
221  enum representation_t {sparse, dense, backoff};
222 
223  // now only keep frequencies (or log frequencies)
224  // probabilities (or log probabilities) can be done
225  // on the fly quickly enough
226  enum entry_t {frequencies, log_frequencies};
227 
228 protected:
229 
230  // each instance of an EST_Ngrammar is a grammar of fixed order
231  // e.g. a bigram (order = 2)
232  int p_order;
234 
235  double p_number_of_sentences; // which were used to build this grammar
236 
237 
240 
241  // only one representation in use at a time
244 
245  // sparse representation is a tree structure
246  // holding only those N-grams which were seen
248  bool init_sparse_representation();
249 
250  // dense representation is just an array of all states
251  bool init_dense_representation();
252 
253  // backoff representation is also a tree structure
254  // but the root state pdf is the most recent word in the
255  // ngram and going down the tree is going back in time....
256  // here is the root node :
258 
260 
261  // need a non-zero unigram floor to enable backing off
263 
264  // instead of simple discounting, we have a (possibly) different
265  // discount per order and per frequency
266  // e.g. backoff_discount[2](4) contains the discount to be
267  // applied to a trigram frequency of 4
268  // backoff_discount[0] is unused (we don't discount unigrams)
270  double get_backoff_discount(const int order, const double freq) const;
271 
272  bool init_backoff_representation();
273  void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches
274  void backoff_restore_unigram_states();
275  int p_num_states; // == p_vocab_size ^ (p_ord-1) if fully dense
276  EST_NgrammarState *p_states; // state id is index into this array
277  int find_dense_state_index(const EST_IVector &words, int index=0) const;
278 
279  // and the reverse
280  const EST_StrVector &make_ngram_from_index(const int i) const;
281 
282  // vocabulary
284  EST_Discrete *pred_vocab; // may be different from state vocab
285  bool init_vocab(const EST_StrList &wordlist);
286  bool init_vocab(const EST_StrList &word_list,
287  const EST_StrList &pred_list);
288 
289  // make sure vocab matches a given wordlist
290  bool check_vocab(const EST_StrList &wordlist);
291 
293 
// The word being predicted: element p_order-1 of an order-length ngram
// window, as a string or as a vocabulary index.
294  const EST_String &lastword(const EST_StrVector &words) const
295  { return words(p_order-1); }
296  int lastword(const EST_IVector &words) const
297  { return words(p_order-1); }
298  // are we allowing out-of-vocabulary words, or is the vocabulary closed?
299  bool allow_oov;
300 
301  bool sparse_to_dense();
302  bool dense_to_sparse();
303 
304  // these aren't sorted yet ...
305  void take_logs();
306  void take_exps();
307  void freqs_to_probs(); // just calls normalise
308 
309  bool build_sparse(const EST_String &filename,
310  const EST_String &prev,
311  const EST_String &prev_prev,
312  const EST_String &last);
313  // for dense and backoff
314  bool build_ngram(const EST_String &filename,
315  const EST_String &prev,
316  const EST_String &prev_prev,
317  const EST_String &last,
318  const EST_String &input_format);
319 
320  // go through all matching ngrams ( *(ngram[i])="" matches anything )
321  void iterate(EST_StrVector &words,
322  void (*function)(EST_Ngrammar *n,
323  EST_StrVector &words,
324  void *params),
325  void *params);
326 
327  // same, but with a constant Ngrammar
328  void const_iterate(EST_StrVector &words,
329  void (*function)(const EST_Ngrammar *const n,
330  EST_StrVector &words,
331  void *params),
332  void *params) const;
333 
334  bool p_init(int o, representation_t r);
335 
336  // new filename returned of we had to copy stdin to a
337  // temporary file - must delete it later !
338  bool oov_preprocess(const EST_String &filename,
339  EST_String &new_filename,
340  const EST_String &what);
341 
342 
343  const EST_NgrammarState &find_state_const(const EST_StrVector &words)const;
344  EST_NgrammarState &find_state(const EST_StrVector &words);
345  const EST_NgrammarState &find_state_const(const EST_IVector &words) const;
346  EST_NgrammarState &find_state(const EST_IVector &words);
347 
348  // special versions for backoff grammars
349  const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;
350  double backoff_reverse_probability_sub(const EST_StrVector &words,
351  const EST_BackoffNgrammarState *root) const;
352  double backoff_probability(const EST_StrVector &words,
353  const bool trace=false) const;
354  double backoff_reverse_probability(const EST_StrVector &words) const;
355  const EST_String & backoff_most_probable(const EST_StrVector &words,
356  double *prob = NULL) const;
357 
358  // backoff representation isn't a nice array of states
359  // so use this to visit every node in the tree
360  // and apply the function to that node
361  void backoff_traverse(EST_BackoffNgrammarState *start_state,
362  void (*function)(EST_BackoffNgrammarState *s,
363  void *params),
364  void *params);
365 
366  // visit every node at a given level
367  void backoff_traverse(EST_BackoffNgrammarState *start_state,
368  void (*function)(EST_BackoffNgrammarState *s,
369  void *params),
370  void *params, const int level);
371 public:
372 
374  default_values();
375  p_order = -1;
376  p_number_of_sentences = 0;
377  backoff_representation = 0;
378  backoff_discount = 0;
379  }
380 
382  const EST_StrList &wordlist)
383  {
384  default_values(); init(o,r,wordlist);
385  }
386 
387  // When state trans vocab differs from prediction vocab
389  const EST_StrList &wordlist,
390  const EST_StrList &predlist)
391  {
392  default_values(); init(o,r,wordlist,predlist);
393  }
394 
396  {
397  default_values(); init(o,r,v);
398  }
399  ~EST_Ngrammar();
400 
401  void default_values();
402  void clear();
403  bool init(int o, representation_t r,
404  const EST_StrList &wordlist);
405  bool init(int o, representation_t r,
406  const EST_StrList &wordlist,
407  const EST_StrList &predlist);
408  bool init(int o, representation_t r, EST_Discrete &v);
409  bool init(int o, representation_t r,
410  EST_Discrete &v,EST_Discrete &pv);
411 
412  // access
// Simple read-only accessors over the grammar's size, order and vocabularies.
413  int num_states(void) const { return p_num_states;}
414  double samples(void) const { return p_num_samples;}
415  int order() const { return p_order; }
// State vocabulary size; guarded against a null `vocab`.
416  int get_vocab_length() const { return vocab?vocab->length():0; }
417  EST_String get_vocab_word(int i) const;
418  int get_vocab_word(const EST_String &s) const;
// NOTE(review): unlike get_vocab_length above, this dereferences
// pred_vocab without a null check — confirm pred_vocab is always set.
419  int get_pred_vocab_length() const { return pred_vocab->length(); }
420  EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }
421  int get_pred_vocab_word(const EST_String &s) const
422  { return pred_vocab->name(s); }
// Vocabulary is closed exactly when out-of-vocabulary words are disallowed.
423  int closed_vocab() const {return !allow_oov; }
424  entry_t entry_type() const {return p_entry_type;}
426  { return p_representation;}
427 
428  // build
429  bool build(const EST_StrList &filenames,
430  const EST_String &prev = SENTENCE_START_MARKER,
431  const EST_String &prev_prev = SENTENCE_END_MARKER,
432  const EST_String &last = SENTENCE_END_MARKER,
433  const EST_String &input_format = "",
434  const EST_String &oov_mode = "",
435  const int mincount=1,
436  const int maxcount=10);
437 
438  // Accumulate ngrams
439  void accumulate(const EST_StrVector &words,
440  const double count=1);
441  //const int index=0);
442  void accumulate(const EST_IVector &words,
443  const double count=1);
444  //const int index=0);
445 
446  // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x
447  void make_htk_compatible();
448 
449  // I/O functions
450  EST_read_status load(const EST_String &filename);
451  EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);
452  EST_write_status save(const EST_String &filename,
453  const EST_String type="cstr_ascii",
454  const bool trace=false,
455  double floor=0.0);
456 
457  int wordlist_index(const EST_String &word, const bool report=true) const;
458  const EST_String &wordlist_index(int i) const;
459  int predlist_index(const EST_String &word) const;
460  const EST_String &predlist_index(int i) const;
461 
462  // set
463  bool set_entry_type(entry_t new_type);
464  bool set_representation(representation_t new_representation);
465 
466  // probability distributions
467  // -------------------------
468  // flag 'force' forces computation of probs on-the-fly if necessary
469  double probability(const EST_StrVector &words, bool force=false,
470  const bool trace=false) const;
471  double frequency(const EST_StrVector &words, bool force=false,
472  const bool trace=false) const;
473 
// Predict the most likely next word after `words` (string or index form).
// The full overloads also report its probability and the state id; the
// convenience overloads below discard whichever out-parameters the caller
// does not supply.
474  const EST_String &predict(const EST_StrVector &words,
475  double *prob,int *state) const;
476  const EST_String &predict(const EST_StrVector &words) const
477  {double p; int state; return predict(words,&p,&state); }
478  const EST_String &predict(const EST_StrVector &words,double *prob) const
479  {int state; return predict(words,prob,&state); }
480 
481  const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;
482  const EST_String &predict(const EST_IVector &words) const
483  {double p; int state; return predict(words,&p,&state); }
484  const EST_String &predict(const EST_IVector &words,double *prob) const
485  {int state; return predict(words,prob,&state); }
486 
487  int find_state_id(const EST_StrVector &words) const;
488  int find_state_id(const EST_IVector &words) const;
489  int find_next_state_id(int state, int word) const;
490  // fast versions for common N
491  //const double probability(const EST_String w1);
492  //const double probability(const EST_String w1,const EST_String w2);
493  //const double probability(const EST_String w1,const EST_String w2,
494  //const EST_String w2);
495 
496  // reverse - probability of words[0..order-2] given word[order-1]
497  double reverse_probability(const EST_StrVector &words,
498  bool force=false) const;
499  double reverse_probability(const EST_IVector &words,
500  bool force=false) const;
501 
502  // predict, where words has 'order' elements and the last one is "" or NULL
503  const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;
504  const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;
505  const EST_DiscreteProbDistribution &prob_dist(int state) const;
506 
507 // bool stats(const EST_String filename,
508 // double &raw_entropy, double &count,
509 // double &entropy, double &perplexity,
510 // const EST_String &prev = SENTENCE_START_MARKER,
511 // const EST_String &prev_prev = SENTENCE_END_MARKER,
512 // const EST_String &last = SENTENCE_END_MARKER,
513 // const EST_String &input_format = "") const;
514 
515  void fill_window_start(EST_IVector &window,
516  const EST_String &prev,
517  const EST_String &prev_prev) const;
518 
519  void fill_window_start(EST_StrVector &window,
520  const EST_String &prev,
521  const EST_String &prev_prev) const;
522 
523  // why anybody would want to do this ....
524  //EST_Ngrammar &operator =(const EST_Ngrammar &a);
525 
526  bool ngram_exists(const EST_StrVector &words) const;
527  bool ngram_exists(const EST_StrVector &words, const double threshold) const;
528  double get_backoff_weight(const EST_StrVector &words) const;
529  bool set_backoff_weight(const EST_StrVector &words, const double w);
530 
531  void print_freqs(ostream &os,double floor=0.0);
532 
533  // i/o functions
534  // -------------
535  friend ostream& operator<<(ostream& s, EST_Ngrammar &n);
536  friend EST_read_status load_ngram_htk_ascii(const EST_String filename,
537  EST_Ngrammar &n);
538  friend EST_read_status load_ngram_htk_binary(const EST_String filename,
539  EST_Ngrammar &n);
540  friend EST_read_status load_ngram_arpa(const EST_String filename,
541  EST_Ngrammar &n,
542  const EST_StrList &vocab);
543  friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,
544  EST_Ngrammar &n);
545  friend EST_read_status load_ngram_cstr_bin(const EST_String filename,
546  EST_Ngrammar &n);
547 
549  ostream *ost,
550  EST_Ngrammar &n,
551  double floor);
552  friend EST_write_status save_ngram_htk_ascii(const EST_String filename,
553  EST_Ngrammar &n,
554  double floor);
555 
556  //friend EST_write_status save_ngram_htk_binary(const EST_String filename,
557  // EST_Ngrammar &n);
558  friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,
559  EST_Ngrammar &n,
560  const bool trace,
561  double floor);
562  friend EST_write_status save_ngram_cstr_bin(const EST_String filename,
563  EST_Ngrammar &n,
564  const bool trace,
565  double floor);
566  friend EST_write_status save_ngram_arpa(const EST_String filename,
567  EST_Ngrammar &n);
568  friend EST_write_status save_ngram_arpa_sub(ostream *ost,
569  EST_Ngrammar &n,
570  const EST_StrVector &words);
571  friend EST_write_status save_ngram_wfst(const EST_String filename,
572  EST_Ngrammar &n);
573 
574  // Auxiliary functions
575 
576  // smoothing
577 friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);
578 friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);
579 friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);
580 friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
581  const double default_discount);
582 
583 friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
584  EST_Ngrammar &ngram);
585 friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
586  EST_Ngrammar &ngram, int smooth_thresh);
587 
588  // frequencies below mincount get backed off
589  // frequencies above maxcount are not smoothed(discounted)
590  bool compute_backoff_weights(const int mincount=1,
591  const int maxcount=10);
592 
593 
594  bool merge(EST_Ngrammar &n,float weight);
595 
596 friend class EST_BackoffNgrammar;
597 
598 };
599 
// Frequency smoothing over a whole grammar (thresholds select which
// frequencies are smoothed; see freqsmooth.cc).
600 void Ngram_freqsmooth(EST_Ngrammar &ngram,
601  int smooth_thresh1,
602  int smooth_thresh2);
603 
604 // utils
// Shift the contents of an ngram window by `l` positions (used when
// advancing the word history).
605 void slide(EST_IVector &i, const int l);
606 void slide(EST_StrVector &i, const int l);
607 
// Evaluate `ngram` against the text in `filename`, returning entropy,
// perplexity and token count through the reference parameters.  The
// `prev`/`prev_prev`/`last` markers pad the sentence boundaries.
608 bool test_stats(EST_Ngrammar &ngram,
609  const EST_String &filename,
610  double &raw_entropy,
611  double &count,
612  double &entropy,
613  double &perplexity,
614  const EST_String &input_format,
615  const EST_String &prev = SENTENCE_START_MARKER,
616  const EST_String &prev_prev = SENTENCE_END_MARKER,
617  const EST_String &last = SENTENCE_END_MARKER);
618 
620 
621 #endif // __EST_NGRAMMAR_H__
friend ostream & operator<<(ostream &s, const EST_NgrammarState &a)
const EST_String & predict(const EST_StrVector &words) const
Definition: EST_Ngrammar.h:476
EST_BackoffNgrammarState(const EST_Discrete *d, int level)
Definition: EST_Ngrammar.h:146
EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist, const EST_StrList &predlist)
Definition: EST_Ngrammar.h:388
EST_DVector * backoff_discount
Definition: EST_Ngrammar.h:269
double probability(int w) const
Definition: EST_Ngrammar.h:118
void slide(EST_IVector &i, const int l)
int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram, int smooth_thresh)
Definition: freqsmooth.cc:111
double samples(void) const
Definition: EST_Ngrammar.h:414
EST_String p_sentence_end_marker
Definition: EST_Ngrammar.h:239
bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER)
const EST_Discrete * get_discrete() const
Returns discrete vocabulary of distribution.
const EST_String & most_probable(double *prob=NULL) const
Definition: EST_Ngrammar.h:172
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
int get_pred_vocab_word(const EST_String &s) const
Definition: EST_Ngrammar.h:421
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount=0.5)
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:227
EST_write_status
int closed_vocab() const
Definition: EST_Ngrammar.h:423
EST_BackoffNgrammarState * get_child(const int word) const
Definition: EST_Ngrammar.h:181
#define SENTENCE_END_MARKER
Definition: EST_Ngrammar.h:60
const EST_String & lastword(const EST_StrVector &words) const
Definition: EST_Ngrammar.h:294
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0)
Definition: ngrammar_io.cc:844
EST_BackoffNgrammarState * backoff_representation
Definition: EST_Ngrammar.h:257
bool save(Lattice &lattice, EST_String filename)
const EST_String & name(const int n) const
The name given the index.
const EST_String & predict(const EST_StrVector &words, double *prob) const
Definition: EST_Ngrammar.h:478
int merge
Definition: rxp.c:21
int length(void) const
The number of members in the discrete.
double frequency(const EST_String &w) const
Definition: EST_Ngrammar.h:170
bool load(Lattice &lattice, EST_String filename)
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
Definition: ngrammar_io.cc:74
EST_Discrete * vocab
Definition: EST_Ngrammar.h:283
EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:424
entry_t entry_type() const
Definition: EST_Ngrammar.h:424
void cumulate(const int index, const double count=1)
Definition: EST_Ngrammar.h:107
EST_Discrete * pred_vocab
Definition: EST_Ngrammar.h:284
EST_IVector path
Definition: EST_Ngrammar.h:98
EST_Item * root(const EST_Item *n)
return root node of treeprevious sibling (sister) of n
double probability(const EST_String &w) const
Definition: EST_Ngrammar.h:116
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276
int check_vocab(EST_Relation &a, EST_StrList &vocab)
double probability(const EST_String &s) const
int num_states(void) const
Definition: EST_Ngrammar.h:413
EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:58
int get_pred_vocab_length() const
Definition: EST_Ngrammar.h:419
int index(EST_TList< T > &l, T &val, bool(*eq)(const EST_UItem *, const EST_UItem *)=NULL)
Definition: EST_TList.h:286
double frequency(const EST_String &w) const
Definition: EST_Ngrammar.h:119
void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n, int this_order=0)
EST_DiscreteProbDistribution vocab_pdf
Definition: EST_Ngrammar.h:292
const EST_String & predict(const EST_IVector &words, double *prob) const
Definition: EST_Ngrammar.h:484
EST_StringTrie children
Definition: EST_Ngrammar.h:137
double p_number_of_sentences
Definition: EST_Ngrammar.h:235
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor=0.0)
Definition: ngrammar_io.cc:565
EST_NgrammarState(int id, const EST_DiscreteProbDistribution &pdf)
Definition: EST_Ngrammar.h:92
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
double frequency(const EST_String &s) const
EST_String get_pred_vocab_word(int i) const
Definition: EST_Ngrammar.h:420
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)
Definition: freqsmooth.cc:58
int take_logs
representation_t p_representation
Definition: EST_Ngrammar.h:242
#define VAL_REGISTER_CLASS_DCLS(NAME, CLASS)
Definition: EST_Val_defs.h:44
double frequency(int w) const
Definition: EST_Ngrammar.h:121
EST_DiscreteProbDistribution p_pdf
Definition: EST_Ngrammar.h:136
EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf, int level)
Definition: EST_Ngrammar.h:148
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:167
double backoff_threshold
Definition: EST_Ngrammar.h:259
void cumulate(const EST_String &word, const double count=1)
Definition: EST_Ngrammar.h:109
double backoff_unigram_floor_freq
Definition: EST_Ngrammar.h:262
representation_t representation() const
Definition: EST_Ngrammar.h:425
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:289
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:166
int id() const
Definition: EST_Ngrammar.h:113
section options Options< strong > or ngram_per_line Pseudo words
NULL
Definition: EST_WFST.cc:55
EST_DiscreteProbDistribution p_pdf
Definition: EST_Ngrammar.h:80
EST_NgrammarState(int id, EST_Discrete *d)
Definition: EST_Ngrammar.h:91
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122
EST_PredictionSuffixTree sparse_representation
Definition: EST_Ngrammar.h:247
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:670
EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:66
int get_vocab_length() const
Definition: EST_Ngrammar.h:416
int lastword(const EST_IVector &words) const
Definition: EST_Ngrammar.h:296
bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount=0)
int order() const
Definition: EST_Ngrammar.h:415
EST_read_status
A string tree index class for indexing arbitrary objects by strings of characters.
EST_BackoffNgrammarState * get_child(const EST_String &word) const
Definition: EST_Ngrammar.h:177
const EST_String & predict(const EST_IVector &words) const
Definition: EST_Ngrammar.h:482
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0)
Definition: ngrammar_io.cc:747
const EST_String & most_probable(double *prob=NULL) const
Definition: EST_Ngrammar.h:122
entry_t p_entry_type
Definition: EST_Ngrammar.h:243
void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order=0)
void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram)
Definition: freqsmooth.cc:75
void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
Definition: ngrammar_io.cc:647
#define SENTENCE_START_MARKER
Definition: EST_Ngrammar.h:59
EST_String p_sentence_start_marker
Definition: EST_Ngrammar.h:238
double probability(const EST_String &w) const
Definition: EST_Ngrammar.h:168
void * lookup(const EST_String &key) const
Find contents index by key, 0 if there is not contents.
EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist)
Definition: EST_Ngrammar.h:381
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:114
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:806
Utility EST_String Functions header file.
double get_backoff_weight() const
Definition: EST_Ngrammar.h:196
EST_Ngrammar(int o, representation_t r, EST_Discrete &v)
Definition: EST_Ngrammar.h:395