Edinburgh Speech Tools  2.1-release
EST_Ngrammar.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* An EST_Ngram class for building and manipulating bigrams trigrams etc */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <fstream>
42 #include <cstring>
43 #include <cmath>
44 #include <climits>
45 #include <cfloat>
46 
47 using namespace std;
48 
49 #include "EST_Ngrammar.h"
50 #include "EST_Pathname.h"
51 #include "EST_Token.h"
52 #include "EST_io_aux.h"
53 
54 
56 static EST_String NOVOCAB("NOVOCAB");
57 
58 // **********************************************************************
59 
61 {
62  clear();
63  init(s.id(),s.pdf_const());
64 }
65 
67 {
68  clear();
69  init(s->id(),s->pdf_const()); // copy
70 }
71 
73 {
74  clear();
75 }
76 
78 {
79  p_id = -1;
80  p_pdf.clear();
81 }
82 
84 {
85  p_id=-1;
86  p_pdf.init();
87 }
88 
90 {
91  p_id = id;
92  p_pdf.init(d);
93 }
94 
95 
97 {
98  p_id = id;
99  p_pdf = pdf; // copy it
100 }
101 
102 
103 ostream& operator<<(ostream& s, const EST_NgrammarState &a)
104 {
105  s << "(" << a.id() << ": " << a.pdf_const() << " )";
106  return s;
107 }
108 
109 // **********************************************************************
110 
112 {
113  p_pdf.clear();
114  children.clear();
115 }
116 
117 
119 {
120  backoff_weight=0;
121  p_pdf.clear();
122 }
123 
125 {
126  backoff_weight=0;
127  p_level = -1; /* It better be sanely initialized somewhere else */
128  p_pdf.init();
129 }
130 
132 {
133  backoff_weight=0;
134  p_level = level;
135  p_pdf.init(d);
136 }
137 
139 {
140  backoff_weight=0;
141  p_level = level;
142  p_pdf = pdf; // copy it
143 }
144 
146  const double count)
147 {
148 
149 // int i;
150 // cerr << "accumulate ";
151 // for(i=0;i<words.n();i++)
152 // {
153 // if(words.n()-1-p_level == i)
154 // cerr <<"*";
155 // cerr << words(i);
156 // if(words.n()-1-p_level == i)
157 // cerr <<"*";
158 // }
159 // cerr << endl;
160 
161  // update pdf at this node
162  p_pdf.cumulate(words(words.n()-1-p_level),count);
163 
164  // then go down
165  if (words.n()-1-p_level > 0) // not at bottom of tree
166  {
167 
169 
170  // have we got the child
171  s = get_child(words(words.n()-1-p_level));
172  if (s==NULL)
173  // have to extend the tree
174  s = add_child(p_pdf.get_discrete(),words);
175  return s->accumulate(words,count);
176 
177  }
178  else
179  {
180  return true;
181  }
182 }
183 
185  const double count)
186 {
187 
188 // int i;
189 // cerr << "accumulate level " << p_level << " : ";
190 // for(i=0;i<words.n();i++)
191 // {
192 // if(words.n()-1-p_level == i)
193 // cerr <<"*";
194 // cerr << p_pdf.item_name(words(i));
195 // if(words.n()-1-p_level == i)
196 // cerr <<"*";
197 // cerr << " ";
198 // }
199 // cerr << endl;
200 
201  // update pdf at this node
202  p_pdf.cumulate(words(words.n()-1-p_level),count);
203 
204  //cerr << *this << endl;
205  //cerr << endl;
206 
207  // then go down
208  if (words.n()-1-p_level > 0) // not at bottom of tree
209  {
210 
212 
213  // have we got the child
214  s = get_child(words(words.n()-1-p_level));
215  if (s==NULL)
216  // have to extend the tree
217  add_child(p_pdf.get_discrete(),words);
218 
219  // get pointer again in case we built more than one level
220  s = get_child(words(words.n()-1-p_level));
221  if (s==NULL)
222  {
223  cerr << "Failed to extend tree - unknown reason !" << endl;
224  return false;
225  }
226  return s->accumulate(words,count);
227  }
228  else
229  {
230  return true;
231  }
232 }
233 
234 
237  const EST_StrVector &words)
238 {
240 
241  if (words.n()-1-p_level > 0) // more history still to go
242  {
243  // see if we can go down the tree
244  s = get_child(words(words.n()-1-p_level));
245  if (s != NULL)
246  return s->add_child(d,words);
247  else
248  {
249  // construct tree as we go
251  new_child->init(d,p_level+1);
252  children.add(words(words.n()-1-p_level), (void*)new_child);
253  return new_child->add_child(d,words);
254  }
255  }
256  else
257  {
258  // this is the node we are trying to add !
259  return this;
260  }
261 }
262 
263 
264 
267  const EST_IVector &words)
268 {
270 
271  if (words.n()-1-p_level > 0) // more history still to go
272  {
273  // see if we can go down the tree
274  s = get_child(words(words.n()-1-p_level));
275  if (s != NULL)
276  return s->add_child(d,words);
277  else
278  {
279  // construct tree as we go
281  new_child->init(d,p_level+1);
282  children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
283  return new_child->add_child(d,words);
284  }
285  }
286  else
287  {
288  // this is the node we are trying to add !
289  return this;
290  }
291 }
292 
294  const EST_String &name)
295 {
296 
297  child->zap();
298  // can't remove from StringTrie, but can set pointer to NULL
299  children.add(name,NULL);
300  delete child;
301 }
302 
304  const int order,
305  EST_String followers)
306 {
307  // not right - just print out, then recurse through children
308  // change to use 'backoff_traverse'
309 
310  EST_Litem *k;
311  double freq;
312  EST_String name;
313  for (k=p_pdf.item_start();
314  !p_pdf.item_end(k);
315  k = p_pdf.item_next(k))
316  {
317  p_pdf.item_freq(k,name,freq);
318  EST_BackoffNgrammarState *s = ((EST_BackoffNgrammarState*)(children.lookup(name)));
319  if (p_level==order-1)
320  {
321  if(freq>0)
322  os << name << " " << followers
323  << ": " << freq << endl;
324  }
325  else if (s!=NULL)
326  s->print_freqs(os,order,name+" "+followers);
327 
328  }
329 }
330 
332  const double threshold) const
333 {
334  const EST_BackoffNgrammarState *s;
335  s = get_state(words);
336  if (s != NULL)
337  {
338  // return true for unigrams (state level 0)
339  // even if freq < threshold
340  return (bool)((s->level()==0) ||
341  ( s->frequency(words(0)) > threshold ));
342  }
343  else
344  return false;
345 }
346 
349 {
351  if (words.n()-1-p_level > 0) // more history still to go
352  {
353  s = get_child(words(words.n()-1-p_level));
354  if (s != NULL)
355  {
356  //cerr << "traversing from " << *this << endl;
357  //cerr << " to " << *s << endl << endl;
358  return s->get_state(words);
359  }
360  else
361  {
362  //cerr << "got NULL" << endl;
363  return NULL;
364  }
365  }
366  else
367  {
368  //cerr << "got " << *this << endl;
369  return this;
370  }
371 }
372 
374 {
375 
376  // recursively delete this state and all its children
377  EST_Litem *k;
378  double freq;
379  EST_String name;
380  for (k=p_pdf.item_start();
381  !p_pdf.item_end(k);
382  k = p_pdf.item_next(k))
383  {
384  p_pdf.item_freq(k,name,freq);
385  EST_BackoffNgrammarState *child = get_child(name);
386  if (child!=NULL)
387  remove_child(child,name);
388  }
389 
390  children.clear();
391  p_pdf.clear();
392 
393 }
394 
395 
397 {
399  if (words.n()-1-p_level >= 0)
400  {
401  s = get_child(words(words.n()-1-p_level));
402  if (s != NULL)
403  return s->get_backoff_weight(words);
404  else
405  {
406  // if there is no node, the weight would have been 1 anyway
407  //cerr << "Couldn't get weight for " << words << endl;
408  return 1; // default
409  }
410  }
411 
412  else
413  {
414  // reached node
415 
416 /*
417  cerr << "got bw for ";
418  for(int i=0;i<words.n();i++)
419  cerr << words(i) << " ";
420  cerr << " at level " << p_level << " = " << backoff_weight << endl;
421 */
422  return backoff_weight;
423  }
424 }
425 
426 
428 {
430  if (words.n()-1-p_level >= 0)
431  {
432  s = get_child(words(words.n()-1-p_level));
433  if (s != NULL)
434  return s->set_backoff_weight(words,w);
435  else
436  {
437  // we can't set the weight because the node
438  // doesn't exist - this is an error if the weight
439  // is not 1
440  if (w != 1)
441  {
442  cerr << "Couldn't set weight for " << words
443  << " to " << w << endl;
444  return false;
445  }
446  else
447  return true; // a weight of 1 does not need storing
448  }
449  }
450  else
451  {
452  // reached node
453  backoff_weight = w;
454  return true;
455  }
456 }
457 
459 {
460  int max=ff.n();
461  EST_Litem *k;
462  double freq;
463  EST_String name;
464  for (k=p_pdf.item_start();
465  !p_pdf.item_end(k);
466  k = p_pdf.item_next(k))
467  {
468  p_pdf.item_freq(k,name,freq);
469  if(freq < max)
470  ff[(int)(freq+0.5)] += 1;
471  }
472 }
473 
474 ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a)
475 {
476  s << "(backoff level:" << a.p_level
477  << " weight:" << a.backoff_weight << " " << a.pdf_const() << " )";
478 
479  return s;
480 }
481 
482 // **********************************************************************
483 
485 {
486  p_representation = EST_Ngrammar::sparse;
487  p_entry_type = EST_Ngrammar::frequencies; // by default
488  sparse_representation.clear();
489  allow_oov=false;
490  p_num_samples = 0;
491  p_num_states = 0;
492  p_states = 0;
493  vocab = 0;
494  pred_vocab = 0;
495  backoff_threshold = 1.0;
496  backoff_unigram_floor_freq = 0.0;
497 }
498 
500 {
501  delete [] p_states;
502 }
503 
505 {
506  // to do
507 }
508 
510  const EST_StrList &wordlist)
511 {
512  return (bool)(init_vocab(wordlist) && p_init(o,r));
513 }
514 
516  const EST_StrList &wordlist,
517  const EST_StrList &predlist)
518 {
519  return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
520 }
521 
523  EST_Discrete &v)
524 {
525  vocab = &v;
526  pred_vocab = vocab;
527  vocab_pdf.init(pred_vocab);
528  return p_init(o,r);
529 }
530 
532  EST_Discrete &v, EST_Discrete &pv)
533 {
534  vocab = &v;
535  pred_vocab = &pv;
536  vocab_pdf.init(pred_vocab);
537  return p_init(o,r);
538 }
539 
541 {
542  if (o <= 0)
543  {
544  cerr << "EST_Ngrammar order must be > 0" << endl;
545  return false;
546  }
547 
548  p_order = o;
549  p_representation = r;
550  p_number_of_sentences = 0;
551 
552  switch(p_representation)
553  {
554 
556  sparse_representation.init(p_order);
557  return true;
558  break;
559 
560  case EST_Ngrammar::dense:
561  return init_dense_representation();
562  break;
563 
565  return init_backoff_representation();
566  break;
567 
568  default:
569  cerr << "Unknown internal representation requested for EST_Ngrammar"
570  << endl;
571  return false;
572  break;
573  }
574 }
575 
577 {
578  // allocate a flattened N-dimensional matrix of states
579  int i;
580 
581  if (vocab->length() <= 0)
582  {
583  cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
584  << endl;
585  return false;
586  }
587 
588  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
589  p_states = new EST_NgrammarState[p_num_states];
590  for (i=0; i < p_num_states; i++)
591  p_states[i].init(i,pred_vocab);
592 
593  return true;
594 }
595 
597 {
598 
599  if (vocab->length() <= 0)
600  {
601  cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
602  << endl;
603  return false;
604  }
605 
606  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
607  p_states = new EST_NgrammarState[p_num_states];
608 
609  return (bool)(p_states != NULL);
610 }
611 
613 {
614 
615  // nothing much to do
616  backoff_representation = new EST_BackoffNgrammarState;
617  backoff_representation->init(vocab,0);
618  return true;
619 }
620 
621 
623 {
624  int i,ind=index;
625  EST_StrVector *ngram = new EST_StrVector;
626  ngram->resize(p_order-1); // exclude last word
627 
628  // matches 'find_dense_state_index' so first word is most significant
629  // also, cannot fill in last word
630 
631  for(i=p_order-2;i>=0;i--)
632  {
633 #if defined(sun) && ! defined(__svr4__)
634 /* SunOS */
635  int rem = ind%vocab->length();
636  int quot = ind/vocab->length();
637  (*ngram)[i] = wordlist_index(rem);
638  ind = quot;
639 #else
640  div_t d = div(ind,vocab->length());
641  (*ngram)[i] = wordlist_index(d.rem);
642  ind = d.quot;
643 #endif
644  }
645 
646  return *ngram;
647 }
648 
649 
650 
651 
652 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list)
653 {
654  vocab = new EST_Discrete();
655  if(!vocab->init(word_list))
656  return false;
657 
658  pred_vocab = vocab; // same thing in this case
659  vocab_pdf.init(pred_vocab);
660 
661  return true;
662 }
663 
664 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list,
665  const EST_StrList &pred_list)
666 {
667  vocab = new EST_Discrete();
668  if(!vocab->init(word_list))
669  return false;
670  pred_vocab = new EST_Discrete();
671  if(!pred_vocab->init(pred_list))
672  return false;
673  vocab_pdf.init(pred_vocab);
674 
675  return true;
676 }
677 
679 {
680  EST_Discrete *comp_vocab = new EST_Discrete();
681 
682  if(!comp_vocab->init(word_list))
683  {
684  delete comp_vocab;
685  return false;
686  }
687 
688  if(*vocab != *comp_vocab)
689  {
690  delete comp_vocab;
691  return false;
692  }
693 
694  delete comp_vocab;
695  return true;
696 }
697 
699 {
700  return vocab->name(i);
701 }
702 
704 {
705 
706  if (word=="") // can't lookup
707  return -1;
708 
709  int i;
710  i = pred_vocab->index(word);
711  if(i >= 0 )
712  return i;
713 
714  cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;
715 
716  if (allow_oov)
717  {
718  i = pred_vocab->index(OOV_MARKER);
719  if(i >= 0)
720  return i;
721 
722  cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
723  }
724 
725  return -1;
726 }
727 
728 
730 {
731  return pred_vocab->name(i);
732 }
733 
734 int EST_Ngrammar::wordlist_index(const EST_String &word, const bool report) const
735 {
736 
737  if (word=="") // can't lookup
738  return -1;
739 
740  int i;
741  i = vocab->index(word);
742  if(i >= 0 )
743  return i;
744 
745  if(report)
746  cerr << "Word \"" << word << "\" is not in the word list" << endl;
747 
748  if (allow_oov)
749  {
750  i = vocab->index(OOV_MARKER);
751  if(i >= 0)
752  return i;
753  if(report)
754  cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
755  }
756 
757  return -1;
758 }
759 
760 bool EST_Ngrammar::build(const EST_StrList &filenames,
761  const EST_String &prev,
762  const EST_String &prev_prev,
763  const EST_String &last,
764  const EST_String &input_format,
765  const EST_String &oov_mode,
766  const int mincount,
767  const int maxcount)
768 {
769 
770  p_sentence_start_marker = prev;
771  p_sentence_end_marker = last;
772 
773 
774  // when backing off, safest modes ...
775  if( (p_representation == EST_Ngrammar::backoff) &&
776  (oov_mode != "skip_file") &&
777  (oov_mode != "skip_sentence"))
778  cerr << "Warning : building a backoff grammar" << endl
779  << " with oov_mode '" << oov_mode
780  << "' is not recommended !" << endl;
781 
782  if( (oov_mode != "skip_ngram") &&
783  (oov_mode != "skip_sentence") &&
784  (oov_mode != "skip_file") &&
785  (oov_mode != "use_oov_marker") )
786  {
787  cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
788  return false;
789  }
790 
791  if( (oov_mode == "skip_sentence") &&
792  (input_format == "ngram_per_line"))
793  {
794  cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
795  << " select oov_mode 'skip_sentence'" << endl;
796  return false;
797  }
798 
799  if(oov_mode == "use_oov_marker")
800  allow_oov = true;
801  else
802  allow_oov = false;
803 
804  bool skip_this;
805  EST_String new_filename;
806  EST_Litem *p;
807  for (p = filenames.head(); p; p = p->next())
808  {
809  cerr << "Building from " << filenames(p) << endl;
810 
811  skip_this=false;
812  if( ((oov_mode == "skip_sentence") &&
813  (input_format == "sentence_per_file")) ||
814  (oov_mode == "skip_file") )
815  skip_this = oov_preprocess(filenames(p),new_filename,
816  "skip if found");
817 
818  else if( ((oov_mode == "skip_sentence") &&
819  (input_format == "sentence_per_line")) ||
820  ((oov_mode == "skip_ngram") &&
821  (input_format == "ngram_per_line")) )
822  oov_preprocess(filenames(p),new_filename,"eliminate lines");
823 
824  else
825  new_filename = filenames(p);
826 
827 
828  if(skip_this)
829  {
830  cerr << "Skipping " << filenames(p)
831  << " (out of vocabulary words found)" << endl;
832  }
833  else
834  {
835 
836  switch(p_representation)
837  {
839  {
840  if (input_format != "")
841  {
842  cerr << "Can't build sparse ngram from '" << input_format;
843  cerr << "' data" << endl;
844  return false;
845  }
846  else if (!build_sparse(new_filename,prev,prev_prev,last))
847  return false;
848  }
849  break;
850 
851  case EST_Ngrammar::dense:
852  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
853  return false;
854  break;
855 
857  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
858  return false;
859  break;
860 
861  default:
862  cerr << "Unknown internal representation set for EST_Ngrammar"
863  << endl;
864  return false;
865  break;
866  }
867  }
868 
869  if((new_filename != filenames(p)) &&
870  (new_filename != "") &&
871  !delete_file(new_filename) )
872  {
873  cerr << "Warning : couldn't remove temporary file : "
874  << new_filename << endl;
875  }
876 
877  } // loop round files
878 
879  if (p_representation == EST_Ngrammar::backoff)
880  return compute_backoff_weights(mincount,maxcount);
881 
882  return true;
883 }
884 
886  const double count)
887 {
888  if (words.n() < p_order)
889  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
890  else
891  {
892  p_num_samples++;
893  const EST_String &w = lastword(words);
894  vocab_pdf.cumulate(w,count);
895 
896  switch(p_representation)
897  {
898  case EST_Ngrammar::dense:
900  find_state(words).cumulate(w,count);
901  break;
902 
904  backoff_representation->accumulate(words,count);
905  break;
906 
907  default:
908  cerr << "EST_Ngrammar::accumulate : invalid representation !"
909  << endl;
910  break;
911  }
912  } // switch
913 }
914 
916  const double count)
917 {
918 
919  /*
920  int i;
921  for(i=0;i<words.n();i++)
922  {
923  cerr << vocab_pdf.item_name(words(i));
924  cerr << " ";
925  }
926  cerr << endl;
927  */
928 
929  if (words.n() < p_order)
930  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
931  else
932  {
933  p_num_samples++;
934  vocab_pdf.cumulate(words(p_order-1),count);
935 
936  switch(p_representation)
937  {
938 
939  case EST_Ngrammar::dense:
941  find_state(words).cumulate(words(p_order-1),count);
942  break;
943 
945  backoff_representation->accumulate(words,count);
946  break;
947 
948  default:
949  cerr << "EST_Ngrammar::accumulate : invalid representation !"
950  << endl;
951  break;
952  } // switch
953  }
954 }
955 
956 
958 {
959  switch(p_representation)
960  {
962  return false;
963  break;
964 
965  case EST_Ngrammar::dense:
966  return true; // always exists !
967  break;
968 
970  {
971  if(words.n()==1)
972  return backoff_representation->ngram_exists(words,0);
973  else
974  return backoff_representation->ngram_exists(words,backoff_threshold);
975  }
976  break;
977 
978  default:
979  cerr << "ngram_exists: unknown ngrammar representation" << endl;
980  break;
981  }
982  return false;
983 }
984 
985 bool EST_Ngrammar::ngram_exists(const EST_StrVector &words, const double threshold) const
986 {
987  if (p_representation != EST_Ngrammar::backoff)
988  {
989  cerr << "Not a backoff grammar !" << endl;
990  return false;
991  }
992 
993  return backoff_representation->ngram_exists(words,threshold);
994 
995 }
996 
997 
999 {
1000  if(p_representation == EST_Ngrammar::backoff)
1001  return backoff_representation->get_backoff_weight(words);
1002  else
1003  {
1004  cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
1005  return 0;
1006  }
1007 }
1008 
1010 {
1011  if(p_representation == EST_Ngrammar::backoff)
1012  return backoff_representation->set_backoff_weight(words,w);
1013  else
1014  {
1015  cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
1016  return false;
1017  }
1018 }
1019 
1020 
1022  const EST_String &prev,
1023  const EST_String &prev_prev,
1024  const EST_String &last)
1025 {
1026  sparse_representation.build(filename,prev,prev_prev,last);
1027  return true;
1028 }
1029 
1030 
1032  const EST_String &prev,
1033  const EST_String &prev_prev) const
1034 {
1035  int i;
1036 
1037  for (i=0; i<window.n()-1; i++)
1038  window[i] = wordlist_index(prev_prev);
1039  window[i++] = wordlist_index(prev);
1040 }
1041 
1043  const EST_String &prev,
1044  const EST_String &prev_prev) const
1045 {
1046  int i;
1047 
1048  for (i=0; i<window.n()-1; i++)
1049  window[i] = prev_prev;
1050  window[i++] = prev;
1051 }
1052 
1054  EST_String &new_filename,
1055  const EST_String &what)
1056 {
1057  ostream *ost=0;
1058  EST_TokenStream ts;
1059  new_filename="";
1060  int bad_line_count=0;
1061  int good_line_count=0;
1062 
1063  // if filename is stdin we have to make a temporary file
1064  // if we are eliminating lines we also need a temporary file
1065 
1066  // what = "skip if found" | "eliminate lines"
1067 
1068  bool write_out = false;
1069  if( (what == "eliminate lines") || (filename == "-") )
1070  write_out = true;
1071 
1072  // open the original files for reading
1073  if (filename == "-")
1074  {
1075  if( ts.open(stdin, FALSE) == -1)
1076  {
1077  cerr << "EST_Ngrammar:: failed to open stdin";
1078  cerr << " for reading" << endl;
1079  return false;
1080  }
1081  }
1082  else if (ts.open(filename) == -1){
1083  cerr << "EST_Ngrammar: failed to open file \"" << filename
1084  << "\" for reading" << endl;
1085  return false; // hmmm ?
1086  }
1087 
1088  // open the output file if necessary
1089  if(write_out)
1090  {
1091  new_filename = make_tmp_filename();
1092  ost = new ofstream(new_filename);
1093 
1094  if(!(*ost))
1095  {
1096  cerr << "Ngrammar: couldn't create temporary file \""
1097  << new_filename << "\"" << endl;
1098  new_filename="";
1099  return false;
1100  }
1101  }
1102  else
1103  new_filename = filename;
1104 
1105  EST_String s,this_line;
1106  bool bad_line=false;
1107  while (!ts.eof())
1108  {
1109  s=ts.get().string();
1110 
1111  if (!bad_line && (s != ""))
1112  {
1113  if(wordlist_index(s,false) < 0)
1114  {
1115 
1116  if(what == "eliminate lines")
1117  {
1118  bad_line=true;
1119  }
1120  else // skip file
1121  {
1122  if(write_out)
1123  {
1124  // input was stdin, but we are now discarding it
1125  delete ost;
1126  if(!delete_file(new_filename))
1127  cerr << "Warning : couldn't delete temporary file '"
1128  << new_filename << "'" << endl;
1129  }
1130  new_filename="";
1131  return false;
1132  }
1133 
1134  }
1135  else
1136  this_line += s + " ";
1137  }
1138 
1139  // write out
1140  if(ts.eoln())
1141  {
1142  if(bad_line)
1143  {
1144  bad_line_count++;
1145  }
1146  else
1147  {
1148  if(write_out)
1149  {
1150  // this_line
1151  *ost << this_line << endl;
1152  good_line_count++;
1153  }
1154  }
1155  bad_line=false;
1156  this_line = "";
1157  }
1158 
1159  }
1160  cerr << "skipped " << bad_line_count << " and kept "
1161  << good_line_count << " lines from file " << filename << endl;
1162  return true;
1163 }
1164 
1166  const EST_String &prev,
1167  const EST_String &prev_prev,
1168  const EST_String &last,
1169  const EST_String &input_format)
1170 {
1171  p_entry_type = EST_Ngrammar::frequencies;
1172  int bad_word=0;
1173  EST_String s;
1174  EST_TokenStream ts;
1175  int eoln_is_eos = FALSE;
1176  int sliding_window = TRUE;
1177  int count=0;
1178  clear();
1179 
1180  if ( (input_format == "") || (input_format == "sentence_per_line") )
1181  {
1182  // may do something here later
1183  eoln_is_eos = TRUE;
1184  sliding_window = TRUE;
1185  }
1186  else if (input_format == "sentence_per_file")
1187  {
1188  eoln_is_eos = FALSE;
1189  sliding_window = TRUE;
1190  p_number_of_sentences = 1;
1191  }
1192  else if(input_format == "ngram_per_line")
1193  {
1194  eoln_is_eos = FALSE;
1195  sliding_window = FALSE;
1196  p_number_of_sentences = 1;
1197  }
1198  else
1199  {
1200  cerr << "Can't build from '" << input_format << "' data" << endl;
1201  return false;
1202  }
1203 
1204  EST_IVector window(p_order);
1205 
1206  if (filename == "-")
1207  {
1208  if( ts.open(stdin, FALSE) == -1)
1209  {
1210  cerr << "EST_Ngrammar:: failed to open stdin";
1211  cerr << " for reading" << endl;
1212  return false;
1213  }
1214  }
1215  else if (ts.open(filename) == -1){
1216  cerr << "EST_Ngrammar: failed to open \"" << filename
1217  << "\" for reading" << endl;
1218  return false;
1219  }
1220 
1221  // default input format is one sentence per line
1222  // full stops and commas etc. are taken literally
1223  // i.e. the user must do the preprocessing
1224 
1225  // we assume that all of prev,prev_prev,last are either
1226  // null or set, not a mixture of the two
1227 
1228  // at start window contains
1229  // [prev_prev, prev_prev, ....., prev_prev, prev]
1230  // This is not added to the ngram model though
1231  if(sliding_window)
1232  {
1233  fill_window_start(window,prev,prev_prev);
1234  if (window(p_order-1) == -1)
1235  bad_word = p_order;
1236  else if( (p_order>1) && (window(p_order-2) == -1))
1237  bad_word = p_order-1;
1238  else
1239  bad_word=0;
1240 
1241  if(bad_word > 0)
1242  cerr << "at start : bad word at " << bad_word << endl;
1243 
1244  }
1245  while (!ts.eof())
1246  {
1247  s=ts.get().string();
1248 
1249  if (s != "")
1250  {
1251  if(sliding_window)
1252  {
1253  slide(window,-1);
1254  if (bad_word > 0)
1255  bad_word--;
1256 
1257  window[p_order-1] = wordlist_index(s);
1258  if (window(p_order-1) < 0)
1259  {
1260  cerr << "EST_Ngrammar::build_ngram " <<
1261  " word \"" << s << "\" is not in vocabulary, skipping"
1262  << endl;
1263  bad_word = p_order;
1264  }
1265  if (bad_word == 0)
1266  accumulate(window);
1267  else
1268  {
1269  cerr << "not accumulating : bad word at " << bad_word;
1270  cerr << " window=" << window; // l<< endl;
1271  bad_word--;
1272  }
1273  }
1274  else
1275  {
1276  // not a sliding window - wait for end of line and accumulate
1277  if(count < p_order)
1278  {
1279  if(count == p_order-1) // last thing in window (predictee)
1280  window[count++] = predlist_index(s);
1281  else // not last thing (predictor)
1282  window[count++] = wordlist_index(s);
1283 
1284  if (window(count-1) < 0)
1285  {
1286  cerr << "EST_Ngrammar::build_ngram " <<
1287  " word \"" << s << "\" is not in vocabulary, skipping"
1288  << endl;
1289  bad_word = 1;
1290  }
1291  }
1292  else
1293  cerr << "Too many items on line - ignoring trailing ones !" << endl;
1294  }
1295  }
1296 
1297  // end of sentence ?
1298  if(ts.eoln())
1299  {
1300 
1301  if(!sliding_window)
1302  {
1303  if((count == p_order) && bad_word == 0)
1304  accumulate(window);
1305  count = 0;
1306  bad_word = 0;
1307  }
1308  else if (eoln_is_eos)
1309  {
1310  // have there been any accumulates since the last increment
1311  if (window(p_order-1) != wordlist_index(last))
1312  p_number_of_sentences += 1;
1313 
1314  slide(window,-1);
1315  window[p_order-1] = wordlist_index(last);
1316 
1317  if(window(p_order-1) == -1)
1318  {
1319  //cerr << "WARNING : skipping over bad word '"
1320  //<< last << "'" << endl;
1321  bad_word = p_order;
1322  }
1323 
1324  if (bad_word == 0)
1325  accumulate(window);
1326 
1327  fill_window_start(window,prev,prev_prev);
1328 
1329  // check for bad tags
1330  if (window(p_order-1) == -1)
1331  bad_word = p_order;
1332  else if( (p_order>1) && (window(p_order-2) == -1) )
1333  bad_word = p_order-1;
1334  else
1335  bad_word=0;
1336  }
1337  }
1338  }
1339 
1340  // if last accumulate was at end of sentence
1341  // we don't need to do the extra accumulate
1342  if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
1343  {
1344 
1345  // and finish with the ngram [.....last few words,last]
1346  slide(window,-1);
1347  window[p_order-1] = wordlist_index(last);
1348 
1349  if (window(p_order-1) == -1)
1350  bad_word=p_order;
1351 
1352  if (bad_word == 0)
1353  {
1354  accumulate(window);
1355  p_number_of_sentences += 1;
1356  }
1357  }
1358 
1359  ts.close();
1360 
1361  cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
1362  return true;
1363 }
1364 
1365 
1367 {
1368  int i,j;
1369  double sum1=0,sum2=0;
1370 
1371 
1372  EST_StrVector new_ngram;
1373  new_ngram.resize(ngram.n()-1);
1374  for(i=0;i<new_ngram.n();i++)
1375  new_ngram[i] = ngram(i);
1376 
1377 
1378  cerr << "computing backoff w for ";
1379  for(i=0;i<new_ngram.n();i++)
1380  cerr << new_ngram(i) << " ";
1381  cerr << " \r";
1382 
1383  cerr << endl;
1384 
1385  // only have backoff weights if the associated n-1 gram exists
1386  if (!n->ngram_exists(new_ngram))
1387  {
1388  cerr << " NONE";
1389 
1390  // if ngram really exists, but was below threshold
1391  // set backoff weight to 1 (always back off)
1392  if (n->ngram_exists(new_ngram,0))
1393  {
1394  if(!n->set_backoff_weight(new_ngram,1))
1395  cerr << "WARNING : couldn't set weight !" << endl;
1396  cerr << " = 1";
1397  }
1398  cerr << endl;
1399  return;
1400  }
1401 
1402  // save
1403  EST_String tmp = ngram(ngram.n()-1);
1404 
1405  // for each possible word in the last position
1406  for(j=0;j<n->get_pred_vocab_length();j++)
1407  {
1408  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1409 
1410  for(i=0;i<ngram.n();i++)
1411  cerr << ngram(i) << " ";
1412 
1413  if (n->ngram_exists(ngram))
1414  {
1415  cerr << n->probability(ngram) << " exists " << endl;
1416  // if we have the complete ngram, add it to sum1
1417  sum1 += n->probability(ngram);
1418  }
1419  else
1420  {
1421 
1422  // make this faster - take out of loop
1423 
1424  // or add the n-1 gram, excluding the first word to sum2
1425  EST_StrVector tmp_ngram;
1426  tmp_ngram.resize(ngram.n()-1);
1427  for(i=0;i<tmp_ngram.n();i++)
1428  tmp_ngram[i] = ngram(i+1);
1429 
1430  cerr << " unseen, P(";
1431  for(i=0;i<tmp_ngram.n();i++)
1432  cerr << tmp_ngram(i) << " ";
1433 
1434  cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
1435  sum2 += n->probability(tmp_ngram);
1436  }
1437  }
1438 
1439  // and fill in the backoff weight
1440 
1441  if (sum2 == 0) // no unseen ngrams, no backing off
1442  {
1443  if(!n->set_backoff_weight(new_ngram,1))
1444  cerr << "WARNING : couldn't set weight !" << endl;
1445  cerr << 0 << endl;
1446  }
1447  else
1448  {
1449  if (sum1 > 1)
1450  {
1451  cerr << "NEGATIVE WEIGHT for ";
1452  for(i=0;i<new_ngram.n();i++)
1453  cerr << new_ngram(i) << " ";
1454  cerr << endl;
1455 
1456  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1457  cerr << " " << (1 - sum1) / sum2 << endl;
1458 
1459  for(j=0;j<n->get_pred_vocab_length();j++)
1460  {
1461  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1462 
1463 
1464  if (n->ngram_exists(ngram))
1465  {
1466 
1467  for(i=0;i<ngram.n();i++)
1468  cerr << ngram(i) << " ";
1469  cerr << " exists, prob = ";
1470  cerr << n->probability(ngram,false,true) << endl;
1471  }
1472 
1473  }
1474  sum1 = 1;
1475  sum2 = 1; // to get a weight of 0
1476  }
1477 
1478  if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
1479  cerr << "WARNING : couldn't set weight !" << endl;
1480 
1481  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1482  cerr << " weight=" << (1 - sum1) / sum2 << endl;
1483  }
1484 
1485  // restore
1486  ngram[ngram.n()-1] = tmp;
1487 
1488 }
1489 
1491  const int maxcount)
1492 {
1493 
1494 
1495  backoff_threshold = mincount;
1496  backoff_discount = new EST_DVector[p_order];
1497 
1498  int i,o;
1499 
1500  // but we must have children from the root node
1501  // for every unigram, since these can not be backed off
1502  // (must therefore be present, even if zero)
1503  // smoothing will fill in a floor for any zero frequencies
1504 
1505  backoff_restore_unigram_states();
1506 
1507  Good_Turing_discount(*this,maxcount);
1508 
1509  // and since some frequencies will have been set to zero
1510  // we have to prune away those branches of the tree
1511  //prune_backoff_representation();
1512 
1513 
1514  // now compute backoff weights, to satisfy the
1515  // 'probs sum to 1' condition
1516 
1517  // condition is (trigram case):
1518  // sum( p(w3|w1,w2) ) = 1, over all w1,w2
1519 
1520  // p(wd3|wd1,wd2)=
1521  // if(trigram exists) p_3(wd1,wd2,wd3)
1522  // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
1523  // else p(wd3|w2)
1524  // and for a given wd3 they all sum to 1
1525 
1526  // so compute three sums :
1527  // sum(p_3(wd1,wd2,wd3)), for all w1,w2 where we have the trigram
1528  // sum(p(wd3|wd2)), for all w1,w2 where we have the bigram(w1,w2)
1529  // but not the trigram
1530  // sum(p(wd3|wd2)), for all w1,w2 where we don't have either
1531 
1532  // could probably do this recursively and more elegantly
1533  // but it makes my head hurt
1534 
1535  for (o=2;o<=order();o++) // for (o=1 .. .????
1536  {
1537 
1538  cerr << "Backing off order " << o << endl;
1539  //cerr << "=======================" << endl;
1540 
1542  words.resize(o);
1543 
1544  // for each possible history (ngram, excluding last word)
1545  // compute the backoff weight
1546  for(i=0;i<o-1;i++)
1547  words[i]="";
1548  words[o-1] = "!FILLED!";
1549  iterate(words,&compute_backoff_weight,NULL);
1550 
1551  //cerr << "=========done==========" << endl;
1552 
1553  }
1554 
1555  return true;
1556 }
1557 
1558 
1560 {
1561  // for every item in the root pdf, look for a child
1562  // and make it if not present
1563 
1564  // short cut is to cumulate some zero freq bigrams
1565  // to force construction of states where necessary
1567  int j;
1568  words.resize(2);
1569  words[0] = "wibble"; // doesn't matter what this is, count is 0
1570  for(j=0;j<get_pred_vocab_length();j++)
1571  {
1572  words[1] = get_pred_vocab_word(j);
1573  backoff_representation->accumulate(words,0);
1574  }
1575 
1576 }
1577 
1578 
1580 {
1581 
1582  if (start_state == NULL)
1583  start_state = backoff_representation;
1584 
1585  //cerr << "pruning state " << start_state << endl << *start_state << endl;
1586 
1587  // remove any branches with zero frequency count
1588 
1589  // find children of this state with zero freq and zap them
1590  EST_Litem *k;
1591  double freq;
1592  EST_String name;
1593  for (k=start_state->pdf_const().item_start();
1594  !start_state->pdf_const().item_end(k);
1595  k = start_state->pdf_const().item_next(k))
1596  {
1597  start_state->pdf_const().item_freq(k,name,freq);
1598  if (freq < TINY_FREQ)
1599  {
1600  EST_BackoffNgrammarState *child = start_state->get_child(name);
1601 
1602  if (child!=NULL)
1603  {
1604  //cerr << "Zapping " << name << " : " << child->level()
1605  //<< " " << child<< endl;
1606  start_state->remove_child(child,name);
1607  }
1608  }
1609 
1610  }
1611 
1612  // then recurse through remaining children
1613  for (k=start_state->pdf_const().item_start();
1614  !start_state->pdf_const().item_end(k);
1615  k = start_state->pdf_const().item_next(k))
1616  {
1617  start_state->pdf_const().item_freq(k,name,freq);
1618  EST_BackoffNgrammarState *child = start_state->get_child(name);
1619  if (child!=NULL)
1620  {
1621  //cerr << "recursing to " << name << " : " << child->level() << endl;
1622  //if((child!=NULL) && (child->level() == 3))
1623  //cerr << *child << endl;
1624  prune_backoff_representation(child);
1625  }
1626  }
1627 }
1628 
1629 
// Stream-insertion operator: print a human-readable summary of the
// ngrammar's internal representation to 's'.
// NOTE(review): in this listing the output statement for the sparse case
// (original line 1635) was elided by the documentation generator; the
// sparse branch presumably prints "Sparse" -- confirm against the
// original source file.
1630 ostream& operator<<(ostream& s, EST_Ngrammar &n)
1631 {
1632  switch(n.p_representation)
1633  {
1634  case EST_Ngrammar::sparse:
1636  break;
1637
1638  case EST_Ngrammar::dense:
 // dense: only the tag is printed; dumping the whole table is disabled
1639  s << "Dense" << endl;
1640  // s << n.dense_representation << endl;
1641  break;
1642
1643  case EST_Ngrammar::backoff:
 // backoff: recursively prints the whole backoff state tree
1644  s << "Backoff" << endl;
1645  s << *(n.backoff_representation) << endl;
1646  break;
1647
1648  default:
 // diagnostic goes to cerr, not to the target stream 's'
1649  cerr << "Unknown internal representation of EST_Ngrammar : can't print"
1650  << endl;
1651  break;
1652  }
1653
1654  return s;
1655 }
1656 
1657 bool
1659 {
1660  if (new_type == p_entry_type)
1661  return true;
1662 
1663  // entry type conversion --- hmmmmm
1664 
1665  cerr << "Couldn't do entry type conversion !" << endl;
1666  return false;
1667 }
1668 
1670 {
1671  cerr << "EST_Ngrammar::sparse_to_dense() "
1672  <<" not implemented" << endl;
1673  return false;
1674 }
1675 
1677 {
1678  cerr << "EST_Ngrammar::dense_to_sparse()"
1679  <<" not implemented" << endl;
1680  return false;
1681 }
1682 
1684  int index) const
1685 {
1686  int i,ind=0;
1687  for(i=0;i<p_order-1;i++)
1688  ind = ind*vocab->length() + words.a_no_check(i+index);
1689 
1690  return ind;
1691 }
1692 
1693 int EST_Ngrammar::find_next_state_id(int state, int word) const
1694 {
1695  // Find a new state index from the current after moving with word
1696  int i,f;
1697 
1698  if (p_order == 1)
1699  return 0;
1700  for (f=1,i=0; i<p_order-2; i++)
1701  f*=vocab->length();
1702  return ((state%f)*vocab->length())+word;
1703 }
1704 
1706 {
1707  switch(p_representation)
1708  {
1709  case EST_Ngrammar::sparse:
1710  // return p_states[sparse_representation.find_state(words)];
1711  return p_states[0];
1712  break;
1713 
1714  case EST_Ngrammar::dense:
1715  {
1716  EST_IVector tmp(words.n());
1717  int i;
1718  for(i=0;i<p_order-1;i++)
1719  {
1720  tmp[i] = wordlist_index(words(i));
1721  if (tmp(i) == -1) break;
1722  }
1723  tmp[i] = pred_vocab->index(words(i));
1724  if (tmp(i) == -1) break;
1725  return p_states[find_dense_state_index(tmp)];
1726  }
1727  break;
1728 
1729  case EST_Ngrammar::backoff:
1730  cerr << "find_state: not valid in backoff mode !" << endl;
1731  break;
1732 
1733  default:
1734  cerr << "find_state: unknown ngrammar representation" << endl;
1735  break;
1736  }
1737 
1738  return p_states[0];
1739 }
1740 
1741 
1742 const EST_NgrammarState &
1744 {
1745  switch(p_representation)
1746  {
1747  case EST_Ngrammar::sparse:
1748  // return p_states[sparse_representation.find_state(words)];
1749  return p_states[0];
1750  break;
1751 
1752  case EST_Ngrammar::dense:
1753  {
1754  EST_IVector tmp(words.n());
1755  int i;
1756  for(i=0;i<p_order-1;i++)
1757  {
1758  tmp[i] = wordlist_index(words(i));
1759  if (tmp(i) == -1) break;
1760  }
1761  tmp[i] = pred_vocab->index(words(i));
1762  if (tmp(i) == -1) break;
1763  return p_states[find_dense_state_index(tmp)];
1764  }
1765  break;
1766 
1767  case EST_Ngrammar::backoff:
1768  cerr << "find_state_const: not valid in backoff mode !" << endl;
1769  break;
1770 
1771  default:
1772  cerr << "find_state: unknown ngrammar representation" << endl;
1773  break;
1774  }
1775  return p_states[0];
1776 }
1777 
1778 
1780 {
1781  switch(p_representation)
1782  {
1783  case EST_Ngrammar::sparse:
1784  // return p_states[sparse_representation.find_state(words)];
1785  return p_states[0];
1786  break;
1787 
1788  case EST_Ngrammar::dense:
1789  return p_states[find_dense_state_index(words)];
1790  break;
1791 
1792  case EST_Ngrammar::backoff:
1793  cerr << "find_state: not valid in backoff mode !" << endl;
1794  break;
1795 
1796  default:
1797  cerr << "find_state: unknown ngrammar representation" << endl;
1798  break;
1799  }
1800  return p_states[0];
1801 }
1802 
1803 
1804 const EST_NgrammarState &
1806 {
1807  switch(p_representation)
1808  {
1809  case EST_Ngrammar::sparse:
1810  // return p_states[sparse_representation.find_state(words)];
1811  return p_states[0];
1812  break;
1813  case EST_Ngrammar::dense:
1814  return p_states[find_dense_state_index(words)];
1815  break;
1816 
1817  case EST_Ngrammar::backoff:
1818  cerr << "find_state_const: not valid in backoff mode !" << endl;
1819  break;
1820 
1821  default:
1822  cerr << "find_state: unknown ngrammar representation" << endl;
1823  break;
1824  }
1825 
1826  return p_states[0];
1827 }
1828 
1829 
1831 {
1832 
1833  if (new_representation == p_representation)
1834  return true;
1835 
1836  if (new_representation == EST_Ngrammar::sparse)
1837  return sparse_to_dense();
1838  else if (new_representation == EST_Ngrammar::dense)
1839  return dense_to_sparse();
1840  else
1841  {
1842  cerr << "set_representation: unknown ngrammar representation" << endl;
1843  return FALSE;
1844  }
1845 }
1846 
1847 double EST_Ngrammar::probability(const EST_StrVector &words, bool force, const bool trace) const
1848 {
1849  // Probability of this ngram
1850  (void)force;
1851 
1852  switch(p_representation)
1853  {
1854  case EST_Ngrammar::sparse:
1855  case EST_Ngrammar::dense:
1856  return find_state_const(words).probability(lastword(words));
1857  break;
1858 
1859  case EST_Ngrammar::backoff:
1860  return backoff_probability(words,trace);
1861  break;
1862 
1863  default:
1864  cerr << "probability: unknown ngrammar representation" << endl;
1865  return -1;
1866  break;
1867  }
1868 }
1869 
1870 double EST_Ngrammar::frequency(const EST_StrVector &words, bool force, const bool trace) const
1871 {
1872  // Frequency of this ngram
1873  (void)force;
1874 
1875  switch(p_representation)
1876  {
1877  case EST_Ngrammar::sparse:
1878  case EST_Ngrammar::dense:
1879  return find_state_const(words).frequency(lastword(words));
1880  break;
1881 
1882  case EST_Ngrammar::backoff:
1883  return backoff_probability(words,trace); // can't do freqs
1884  break;
1885 
1886  default:
1887  cerr << "probability: unknown ngrammar representation" << endl;
1888  return -1;
1889  break;
1890  }
1891 }
1892 
1894  double *prob,int *state) const
1895 {
1896  // What's the most probable word given this list of words
1897 
1898  switch(p_representation)
1899  {
1900  case EST_Ngrammar::sparse:
1901  case EST_Ngrammar::dense:
1902  {
1903  const EST_NgrammarState &s = find_state_const(words);
1904  *state = s.id();
1905  return s.most_probable(prob);
1906  }
1907  break;
1908 
1909  case EST_Ngrammar::backoff:
1910  state=NULL; // there are no states per se
1911  return backoff_most_probable(words,prob);
1912  break;
1913 
1914  default:
1915  cerr << "probability: unknown ngrammar representation" << endl;
1916  return EST_String::Empty;
1917  break;
1918  }
1919 }
1920 
1922  double *prob,int *state) const
1923 {
1924  // What's the most probable word given this list of words
1925 
1926  switch(p_representation)
1927  {
1928  case EST_Ngrammar::sparse:
1929  case EST_Ngrammar::dense:
1930  {
1931  const EST_NgrammarState &s = find_state_const(words);
1932  *state = s.id();
1933  return s.most_probable(prob);
1934  }
1935  break;
1936 
1937  case EST_Ngrammar::backoff:
1938  cerr << "probability: IVector access to backoff not supported" << endl;
1939  return EST_String::Empty;
1940  break;
1941 
1942  default:
1943  cerr << "probability: unknown ngrammar representation" << endl;
1944  return EST_String::Empty;
1945  break;
1946  }
1947 }
1948 
1950 {
1951  switch(p_representation)
1952  {
1953  case EST_Ngrammar::sparse:
1954  case EST_Ngrammar::dense:
1955  {
1956  const EST_NgrammarState &s = find_state_const(words);
1957  return s.id();
1958  }
1959  default:
1960  cerr << "Ngrammar: representation doesn't support states" << endl;
1961  return 0;
1962  break;
1963  }
1964 }
1965 
1967 {
1968  switch(p_representation)
1969  {
1970  case EST_Ngrammar::sparse:
1971  case EST_Ngrammar::dense:
1972  {
1973  const EST_NgrammarState &s = find_state_const(words);
1974  return s.id();
1975  }
1976  default:
1977  cerr << "Ngrammar: representation doesn't support states" << endl;
1978  return 0;
1979  break;
1980  }
1981 }
1982 
1984 {
1985  if (vocab)
1986  return vocab->name(i);
1987  else
1988  return NOVOCAB;
1989 }
1990 
1992 {
1993  int index = vocab->name(s);
1994  return index;
1995 }
1996 
1998  bool force) const
1999 {
2000  // Whats the probability of this ngram-1 given last word in ngram
2001  (void)force;
2002 
2003  switch(p_representation)
2004  {
2005  case EST_Ngrammar::sparse:
2006  case EST_Ngrammar::dense:
2007  {
2008  const EST_NgrammarState &s = find_state_const(words);
2009  // need number of occurrences of words[p_order-1]
2010  return s.frequency(lastword(words))/
2011  vocab_pdf.frequency(lastword(words));
2012  }
2013  break;
2014 
2015  case EST_Ngrammar::backoff:
2016  return backoff_reverse_probability(words);
2017  break;
2018 
2019  default:
2020  cerr << "probability: unknown ngrammar representation" << endl;
2021  return -1;
2022  break;
2023  }
2024 }
2025 
2027  bool force) const
2028 {
2029  // Whats the probability of this ngram-1 given last word in ngram
2030  (void)force;
2031 
2032  switch(p_representation)
2033  {
2034  case EST_Ngrammar::sparse:
2035  case EST_Ngrammar::dense:
2036  {
2037  const EST_NgrammarState &s = find_state_const(words);
2038  // need number of occurrences of words[p_order-1]
2039  return s.frequency(lastword(words))/
2040  vocab_pdf.frequency(lastword(words));
2041  }
2042  break;
2043 
2044  case EST_Ngrammar::backoff:
2045  cerr << "probability: reverse prob unavailable for backoff ngram"
2046  << endl;
2047  return -1;
2048  break;
2049 
2050  default:
2051  cerr << "probability: unknown ngrammar representation" << endl;
2052  return -1;
2053  break;
2054  }
2055 }
2056 
2058 EST_Ngrammar::prob_dist(int state) const
2059 {
2060  return p_states[state].pdf_const();
2061 }
2062 
2065 {
2066 
2067  switch(p_representation)
2068  {
2069  case EST_Ngrammar::sparse:
2070  case EST_Ngrammar::dense:
2071  {
2072  const EST_NgrammarState &s = find_state_const(words);
2073  return s.pdf_const();
2074  }
2075  break;
2076 
2077  case EST_Ngrammar::backoff:
2078  return backoff_prob_dist(words);
2079  break;
2080 
2081  default:
2082  cerr << "probability: unknown ngrammar representation" << endl;
2083  return PSTnullProbDistribution;
2084  break;
2085  }
2086 }
2087 
2090 {
2091 
2092  switch(p_representation)
2093  {
2094  case EST_Ngrammar::sparse:
2095  case EST_Ngrammar::dense:
2096  {
2097  const EST_NgrammarState &s = find_state_const(words);
2098  return s.pdf_const();
2099  }
2100  break;
2101 
2102  case EST_Ngrammar::backoff:
2103  cerr << "probability: unsupport IVector access of backoff ngram" << endl;
2104  return PSTnullProbDistribution;
2105  break;
2106 
2107  default:
2108  cerr << "probability: unknown ngrammar representation" << endl;
2109  return PSTnullProbDistribution;
2110  break;
2111  }
2112 }
2113 
2116 {
2117 
2118  EST_read_status r_val;
2119 
2120  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2121  //return r_val;
2122  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2123  //return r_val;
2124  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2125  return r_val;
2126  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2127  return r_val;
2128 
2129  // maybe the file is compressed ?
2130  EST_Pathname fname(filename);
2131  EST_String tmp_fname("");
2132 
2133  // crude but effective
2134  if (fname.extension() == GZIP_FILENAME_EXTENSION)
2135  tmp_fname = uncompress_file_to_temporary(filename,
2136  "gzip --decompress --stdout");
2137  else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
2138  tmp_fname = uncompress_file_to_temporary(filename,"uncompress -c");
2139 
2140  if(tmp_fname != "")
2141  {
2142  r_val = load(tmp_fname);
2143  delete_file(tmp_fname);
2144  return r_val;
2145  }
2146  else
2147  return misc_read_error;
2148 }
2149 
2151 EST_Ngrammar::load(const EST_String &filename, const EST_StrList &wordlist)
2152 {
2153 
2154  // for backed off grammars
2155  // need a wordlist to load ARPA format
2156 
2157  EST_read_status r_val;
2158 
2159  if ((r_val = load_ngram_arpa(filename, *this, wordlist)) != wrong_format)
2160  return r_val;
2161 
2162  // try other types, then check wordlist fits
2163  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2164  //return r_val;
2165  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2166  //return r_val;
2167  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2168  {
2169  if ((r_val == format_ok) && check_vocab(wordlist))
2170  return r_val;
2171  else
2172  {
2173  cerr << "Wordlist file does not match grammar wordlist !" << endl;
2174  return misc_read_error;
2175  }
2176  }
2177 
2178  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2179  {
2180  if ((r_val == format_ok) && check_vocab(wordlist))
2181  return r_val;
2182  else
2183  {
2184  cerr << "Wordlist does not match grammar !" << endl;
2185  return misc_read_error;
2186  }
2187  }
2188 
2189 
2190  cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2191  return wrong_format;
2192 }
2193 
2194 
2195 void
2197 {
2198 
2199  cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2200  return;
2201 }
2202 
2204 EST_Ngrammar::save(const EST_String &filename, const EST_String type,
2205  const bool trace,double floor)
2206 {
2207 
2208  if (type == "")
2209  return save(filename,"cstr_ascii",false,floor); // choose default type
2210  if (type == "htk_ascii")
2211  return save_ngram_htk_ascii(filename, *this, floor);
2212  //if (type == "htk_binary")
2213  //return save_htk_binary(filename, *this);
2214  if (type == "arpa")
2215  return save_ngram_arpa(filename, *this);
2216  if (type == "cstr_ascii")
2217  return save_ngram_cstr_ascii(filename, *this,trace,floor);
2218  if (type == "cstr_bin")
2219  return save_ngram_cstr_bin(filename, *this, trace,floor);
2220  if (type == "wfst")
2221  return save_ngram_wfst(filename, *this);
2222 
2223  cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
2224  return write_fail;
2225 }
2226 
2228  void (*function)(EST_Ngrammar *n,
2229  EST_StrVector &words,
2230  void *params),
2231  void *params)
2232 {
2233  int i,j=-1;
2234  EST_String tmp;
2235 
2236  // find next item in ngram to fill in
2237  for(i=0;i<words.n();i++)
2238  if (words[i] == "")
2239  {
2240  j=i;
2241  break;
2242  }
2243 
2244  if (j==-1)
2245  {
2246  // all filled in, so do the function
2247  (*function)(this,words,params);
2248 
2249  }
2250  else
2251  {
2252 
2253  tmp = words(j);
2254  if (j==p_order-1) // final position - use pred vocab
2255  {
2256  for(i=0;i<pred_vocab->length();i++){
2257  words[j] = pred_vocab->name(i);
2258  iterate(words,function,params);
2259  }
2260 
2261  }
2262  else
2263  {
2264  for(i=0;i<vocab->length();i++){
2265  words[j] = vocab->name(i);
2266  iterate(words,function,params);
2267  }
2268  }
2269  words[j] = tmp;
2270  }
2271 }
2272 
2274  void (*function)(const EST_Ngrammar *const n,
2275  EST_StrVector &words,
2276  void *params),
2277  void *params) const
2278 {
2279  int i,j=-1;
2280  EST_String tmp;
2281 
2282  // find next item in ngram to fill in
2283  for(i=0;i<words.n();i++)
2284  if (words[i] == "")
2285  {
2286  j=i;
2287  break;
2288  }
2289 
2290  if (j==-1)
2291  {
2292  // all filled in, so do the function
2293  (*function)(this,words,params);
2294 
2295  }
2296  else
2297  {
2298 
2299  tmp = words(j);
2300  if (j==p_order-1) // final position - use pred vocab
2301  {
2302  for(i=0;i<pred_vocab->length();i++){
2303  words[j] = pred_vocab->name(i);
2304  const_iterate(words,function,params);
2305  }
2306 
2307  }
2308  else
2309  {
2310  for(i=0;i<vocab->length();i++){
2311  words[j] = vocab->name(i);
2312  const_iterate(words,function,params);
2313  }
2314  }
2315  words[j] = tmp;
2316  }
2317 }
2318 
// Print every ngram and its frequency to 'os', one "w1 w2 ... wn : freq"
// line per entry. Zero frequencies are replaced by 'floor' (and then only
// printed if the result is positive, so floor <= 0 suppresses them).
2319 void EST_Ngrammar::print_freqs(ostream &os,double floor)
2320 {
2321
2322  if (p_representation == EST_Ngrammar::backoff)
 // backoff tree knows how to print itself
2323  backoff_representation->print_freqs(os,p_order);
2324  else
2325  {
2326  int i,j;
2327  EST_Litem *k;
 // holds the p_order-1 history words decoded from the state index
2328  EST_IVector window(p_order-1);
2329
2330  for (i=0; i < p_num_states; i++)
2331  {
2332  // print out each ngram : freq
2333  for (k=p_states[i].pdf().item_start();
2334  !p_states[i].pdf().item_end(k);
2335  k = p_states[i].pdf().item_next(k))
2336  {
2337  double freq;
2338  EST_String name;
2339  int ind = i;
2340  p_states[i].pdf().item_freq(k,name,freq);
2341  if (freq == 0)
2342  freq = floor;
2343  if (freq > 0)
2344  {
 // decode the dense state index base-V into its history words,
 // least-significant digit = most recent word
2345  for (j = p_order-2; j >= 0; j--)
2346  {
2347  window[j] = ind%vocab->length();
2348  ind /= vocab->length();
2349  }
2350  for (j = 0; j < p_order-1; j++)
2351  os << wordlist_index(window(j)) << " ";
2352  os << name << " : " << freq << endl;
2353  }
2354  }
2355  }
2356  }
2357 }
2358 
2361 {
2362  // need to make this on the fly
2363  // by looking at all possible words in the final
2364  // position
2365 
2366  int i,j;
2367  EST_StrVector ngram;
2368  ngram.resize(words.n()+1);
2369  for(i=0;i<words.n();i++)
2370  ngram[i] = words(i);
2371 
2373 
2374  for(j=0;j<get_pred_vocab_length();j++)
2375  {
2376  ngram[ngram.n()-1] = get_pred_vocab_word(j);
2377  double tmp = backoff_probability(ngram,false);
2378  p->set_frequency(j,tmp); // actually probability
2379  }
2380  p->set_num_samples(1.0); // we can't do it in frequencies
2381 
2382  return *p;
2383 }
2384 
2385 double EST_Ngrammar::get_backoff_discount(const int order, const double freq) const
2386 {
2387  if(order > p_order)
2388  {
2389  cerr << "order too great in EST_Ngrammar::get_backoff_discount" << endl;
2390  return 0;
2391  }
2392 
2393  else if( (int)freq < backoff_discount[order-1].n())
2394  return backoff_discount[order-1]((int)freq);
2395 
2396  else
2397  return 0;
2398 }
2399 
2401  const bool trace) const
2402 {
2403  const EST_BackoffNgrammarState *state;
2404  int i;
2405  EST_StrVector new_ngram;
2406  double f=0,f2=0;
2407 
2408 
2409  if (trace)
2410  {
2411  cerr << "backoff_probability( ";
2412  for(i=0;i<words.n();i++)
2413  cerr << words(i) << " ";
2414  cerr << ") ";
2415  }
2416 
2417  // are we down to the unigram ?
2418  if (words.n() == 1)
2419  {
2420  if (trace)
2421  cerr << "unigram " << backoff_representation->probability(words(0))
2422  << endl;
2423 
2424  f=backoff_representation->frequency(words(0));
2425 
2426  // it seems outrageous, but use a floor because unigram
2427  // probs of zero mess up backing off
2428  if(f>0)
2429  return f / backoff_representation->pdf_const().samples();
2430  else
2431  return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2432  }
2433 
2434  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2435  // should pass separately
2436  new_ngram.resize(words.n()-1);
2437  for(i=0;i<new_ngram.n();i++)
2438  new_ngram[i] = words(i);
2439 
2440  // see if we have the ngram
2441  state=backoff_representation->get_state(words);
2442 
2443  if( (state != NULL) &&
2444  ((f=state->frequency(words(0))) > backoff_threshold) )
2445  {
2446 
2447  //if (trace)
2448  //cerr << "in state " << state << " = " << *state << endl << endl;
2449 
2450 
2451  // because f>0, the freq of new_ngram must be non-zero too
2452 
2453  // special case - we don't have a freq for !ENTER (or whatever it's called)
2454  // so use the no. of sentences used to build this grammar
2455  if((new_ngram(0) == p_sentence_start_marker) ||
2456  (new_ngram(0) == p_sentence_end_marker) )
2457  {
2458  f2 = p_number_of_sentences;
2459  if (trace)
2460  cerr << "special freq used : " << f2 << endl;
2461  }
2462  else
2463  {
2464  state=backoff_representation->get_state(new_ngram);
2465  if (state == NULL)
2466  {
2467  cerr << "Something went horribly wrong !" << endl;
2468  return -1;
2469  }
2470  // state->frequency(new_ngram(0)) is the number of times
2471  // we saw new_ngram
2472 
2473  f2=state->frequency(new_ngram(0));
2474 
2475  if (trace)
2476  {
2477  //cerr << "n-1 state is " << *state << endl;
2478  cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
2479  }
2480  }
2481 
2482  if (trace)
2483  {
2484 
2485  cerr << " ..... got (" << f << " - "
2486  << get_backoff_discount(state->level()+1,f)
2487  << ")/" << f2 << " = "
2488  << (f - get_backoff_discount(state->level()+1,f) ) / f2
2489  << endl;
2490  }
2491  return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2492  }
2493 
2494  else // back off
2495  {
2496 
2497  double bo_wt = get_backoff_weight(new_ngram);
2498 
2499  for(i=0;i<new_ngram.n();i++)
2500  new_ngram[i] = words(i+1);
2501 
2502  if (trace)
2503  {
2504  cerr << "backed off(" << bo_wt << ") to (";
2505  for(i=0;i<new_ngram.n();i++)
2506  cerr << new_ngram(i) << " ";
2507  cerr << ") ";
2508  }
2509 
2510  return bo_wt * backoff_probability(new_ngram,trace);
2511  }
2512 
2513 }
2514 
2515 
2516 double
2518  const EST_BackoffNgrammarState *root) const
2519 {
2520 
2521  // so similar to backoff prob - should combine
2522  // to do ......
2523 
2524  const EST_BackoffNgrammarState *state;
2525  int i;
2526  EST_StrVector new_ngram;
2527  double f=0;
2528 
2529  /*
2530  cerr << "backoff_rev_probability_sub( ";
2531  for(i=0;i<words.n();i++)
2532  cerr << words(i) << " ";
2533  cerr << ") ";
2534  */
2535  // are we down to the unigram ?
2536  if (words.n() == 1)
2537  {
2538  //cerr << "unigram " << root->probability(words(0))
2539  //<< endl;
2540  return root->probability(words(0));
2541  }
2542 
2543  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2544  // should pass separately
2545  new_ngram.resize(words.n()-1);
2546  for(i=0;i<new_ngram.n();i++)
2547  new_ngram[i] = words(i);
2548 
2549  // see if we have the ngram
2550  state=root->get_state(words);
2551 
2552 
2553  if( (state != NULL) &&
2554  ((f=state->frequency(words(0))) > 0) )
2555  {
2556  // because f>0, the freq of new_ngram must be non-zero too
2557  state=root->get_state(new_ngram);
2558  if (state == NULL)
2559  {
2560  cerr << "Something went horribly wrong !" << endl;
2561  return -1;
2562  }
2563  // state->frequency(new_ngram(0)) is the number of times
2564  // we saw new_ngram
2565  //cerr << "got " << f << "/" << state->frequency(new_ngram(0)) << endl;
2566  return f / state->frequency(new_ngram(0));
2567  }
2568 
2569  else // back off
2570  {
2571 
2572  double bo_wt = root->get_backoff_weight(new_ngram);
2573  //double bo_wt = root->get_backoff_weight(words);
2574 
2575  for(i=0;i<new_ngram.n();i++)
2576  new_ngram[i] = words(i+1);
2577 
2578  //cerr << "backed off(" << bo_wt << ") ";
2579  return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
2580  }
2581 }
2582 
2583 double
2585 {
2586 
2587  // probability of words 1...n-1 , given the last word
2588  // easier to compute from the backoff tree structure than
2589  // 'normal' probability
2590 
2591  // find level 1 child corresponding to history before last word
2592  EST_BackoffNgrammarState *state;
2593  state = backoff_representation->get_child(words(words.n()-1));
2594 
2595  if(state == NULL)
2596  {
2597  // predictee isn't there so ......... ???
2598  return 0;
2599  }
2600 
2601  // now find backoff probability of words 0...n-2
2602  // starting from this state
2603  return backoff_reverse_probability_sub(words,state);
2604 
2605 }
2606 
2607 const EST_String &
2609  double *prob) const
2610 {
2611  return backoff_prob_dist(words).most_probable(prob);
2612 }
2613 
2614 void slide(EST_IVector &v, const int l)
2615 {
2616  int i;
2617 
2618  // slide elements by 'l' without wraparound
2619 
2620  if (l==0)
2621  return;
2622 
2623  else if (l<0)
2624  {
2625  // slide left
2626 
2627  for(i=0;i<v.n()+l;i++)
2628  v[i] = v(i-l);
2629 
2630  for(;i<v.n();i++)
2631  v[i] = 0;
2632 
2633  }
2634  else
2635  {
2636  // slide right
2637 
2638  for(i=v.n()-1;i>=l;i--)
2639  v[i] = v(i-l);
2640 
2641  for(;i>=0;i--)
2642  v[i] = 0;
2643  }
2644 }
2645 
2646 void
2648  void (*function)(EST_BackoffNgrammarState *s,
2649  void *params),
2650  void *params)
2651 {
2652 
2653  // apply the function to this node
2654  function(start_state,params);
2655 
2656  // and recurse down the tree
2657  EST_Litem *k;
2658  double freq;
2659  EST_String name;
2660  for (k=start_state->pdf_const().item_start();
2661  !start_state->pdf_const().item_end(k);
2662  k = start_state->pdf_const().item_next(k))
2663  {
2664  start_state->pdf_const().item_freq(k,name,freq);
2665  EST_BackoffNgrammarState *child = start_state->get_child(name);
2666  if (child!=NULL)
2667  backoff_traverse(child,function,params);
2668 
2669  }
2670 }
2671 
2672 void
2674  void (*function)(EST_BackoffNgrammarState *s,
2675  void *params),
2676  void *params,
2677  const int level)
2678 {
2679  // apply the function to this node, if level is correct
2680  if (start_state->level() == level)
2681  {
2682  function(start_state,params);
2683  }
2684  else if (start_state->level() < level)
2685  {
2686  // and recurse down the tree if we haven't
2687  // reached the level yet
2688  EST_Litem *k;
2689  double freq;
2690  EST_String name;
2691 
2692  for (k=start_state->pdf_const().item_start();
2693  !start_state->pdf_const().item_end(k);
2694  k = start_state->pdf_const().item_next(k))
2695  {
2696  start_state->pdf_const().item_freq(k,name,freq);
2697  EST_BackoffNgrammarState *child = start_state->get_child(name);
2698  if (child!=NULL)
2699  backoff_traverse(child,function,params,level);
2700 
2701  }
2702 
2703 
2704  }
2705 }
2706 void
2708 {
2709 
2710  EST_Ngrammar *other_n = (EST_Ngrammar*)((void**)params)[0];
2711  float *weight = (float*)((void**)params)[1];
2712 
2713  if(other_n->ngram_exists(ngram))
2714  n->accumulate(ngram,*weight * other_n->frequency(ngram));
2715 
2716 }
2717 
2718 bool
2720 {
2722  words.resize(p_order);
2723 
2724  void **params = new void*[2];
2725  params[0] = (void*)&n;
2726  params[1] = (void*)&weight;
2727 
2728  iterate(words,&merge_other_grammar,(void*)params);
2729 
2730  delete [] params;
2731  return true;
2732 }
2733 
2734 
2735 void slide(EST_StrVector &v, const int l)
2736 {
2737  // slide elements by 'l' without wraparound
2738  int i;
2739 
2740  if (l==0)
2741  return;
2742 
2743  else if (l<0)
2744  {
2745  // slide left
2746 
2747  for(i=0;i<v.n()+l;i++)
2748  v[i] = v(i-l);
2749 
2750  for(;i<v.n();i++)
2751  v[i] = "";
2752 
2753  }
2754  else
2755  {
2756  // slide right
2757 
2758  for(i=v.n()-1;i>=l;i--)
2759  v[i] = v(i-l);
2760 
2761  for(;i>=0;i--)
2762  v[i] = "";
2763 
2764  }
2765 }
2766 
bool oov_preprocess(const EST_String &filename, EST_String &new_filename, const EST_String &what)
#define GZIP_FILENAME_EXTENSION
Definition: EST_Ngrammar.h:66
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
bool accumulate(const EST_StrVector &words, const double count=1)
const EST_BackoffNgrammarState * get_state(const EST_StrVector &words) const
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void frequency_of_frequencies(EST_DVector &ff)
double backoff_reverse_probability_sub(const EST_StrVector &words, const EST_BackoffNgrammarState *root) const
#define TINY_FREQ
Definition: EST_Ngrammar.h:70
Utility IO Function header file.
int wordlist_index(const EST_String &word, const bool report=true) const
EST_read_status load(const EST_String &filename)
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:227
int predlist_index(const EST_String &word) const
EST_write_status
void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL)
bool compute_backoff_weights(const int mincount=1, const int maxcount=10)
void remove_child(EST_BackoffNgrammarState *child, const EST_String &name)
bool set_backoff_weight(const EST_StrVector &words, const double w)
void accumulate(const EST_StrVector &words, const double count=1)
int find_dense_state_index(const EST_IVector &words, int index=0) const
EST_String get_vocab_word(int i) const
EST_BackoffNgrammarState * backoff_representation
Definition: EST_Ngrammar.h:257
bool save(Lattice &lattice, EST_String filename)
void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void *)
void close(void)
Close stream.
Definition: EST_Token.cc:419
double get_backoff_weight(const EST_StrVector &words) const
bool dense_to_sparse()
const EST_String & predict(const EST_StrVector &words, double *prob, int *state) const
INLINE const T & a_no_check(ssize_t n) const
read-only const access operator: without bounds checking
Definition: EST_TVector.h:254
double reverse_probability(const EST_StrVector &words, bool force=false) const
double frequency(const EST_String &w) const
Definition: EST_Ngrammar.h:170
bool load(Lattice &lattice, EST_String filename)
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
Definition: ngrammar_io.cc:74
int find_next_state_id(int state, int word) const
size_t index(const char *s, ssize_t pos=0) const
Position of substring (starting at pos)
Definition: EST_String.h:352
bool set_representation(representation_t new_representation)
void set_num_samples(const double c)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
bool init_sparse_representation()
EST_String make_tmp_filename()
Make a unique temporary filename.
Definition: util_io.cc:56
void print_freqs(ostream &os, const int order, EST_String followers="")
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_Item * root(const EST_Item *n)
return root node of treeprevious sibling (sister) of n
const EST_NgrammarState & find_state_const(const EST_StrVector &words) const
int check_vocab(EST_Relation &a, EST_StrList &vocab)
double get_backoff_discount(const int order, const double freq) const
int get_pred_vocab_length() const
Definition: EST_Ngrammar.h:419
int index(EST_TList< T > &l, T &val, bool(*eq)(const EST_UItem *, const EST_UItem *)=NULL)
Definition: EST_TList.h:286
bool init_backoff_representation()
double frequency(const EST_String &w) const
Definition: EST_Ngrammar.h:119
bool merge(EST_Ngrammar &n, float weight)
EST_UItem * next()
Definition: EST_UList.h:55
const EST_String & backoff_most_probable(const EST_StrVector &words, double *prob=NULL) const
bool p_init(int o, representation_t r)
void backoff_traverse(EST_BackoffNgrammarState *start_state, void(*function)(EST_BackoffNgrammarState *s, void *params), void *params)
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
float max(float a, float b)
Definition: EST_cluster.cc:143
EST_TVector< EST_String > EST_StrVector
Definition: EST_types.h:52
int find_state_id(const EST_StrVector &words) const
void make_htk_compatible()
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
bool build_ngram(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last, const EST_String &input_format)
const EST_DiscreteProbDistribution & prob_dist(const EST_StrVector &words) const
int eof()
end of file
Definition: EST_Token.h:362
EST_String get_pred_vocab_word(int i) const
Definition: EST_Ngrammar.h:420
void resize(ssize_t n, int set=1)
Definition: EST_TVector.cc:196
representation_t p_representation
Definition: EST_Ngrammar.h:242
bool build_sparse(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last)
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
EST_BackoffNgrammarState * add_child(const EST_Discrete *d, const EST_StrVector &words)
INLINE ssize_t length() const
number of items in vector.
Definition: EST_TVector.h:249
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:844
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:565
bool check_vocab(const EST_StrList &wordlist)
#define COMPRESS_FILENAME_EXTENSION
Definition: EST_Ngrammar.h:67
void fill_window_start(EST_IVector &window, const EST_String &prev, const EST_String &prev_prev) const
void backoff_restore_unigram_states()
#define wrong_format
EST_String uncompress_file_to_temporary(const EST_String &filename, const EST_String &prog_name)
Uncompress file by calling program prog, and write it to new tempoary file. Return name of temporary ...
Definition: util_io.cc:205
bool set_entry_type(entry_t new_type)
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:289
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:166
#define FALSE
Definition: EST_bool.h:119
int id() const
Definition: EST_Ngrammar.h:113
void print_freqs(ostream &os, double floor=0.0)
double backoff_probability(const EST_StrVector &words, const bool trace=false) const
double frequency(const EST_StrVector &words, bool force=false, const bool trace=false) const
#define misc_read_error
section options Options< strong > or ngram_per_line Pseudo words
NULL
Definition: EST_WFST.cc:55
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122
EST_PredictionSuffixTree sparse_representation
Definition: EST_Ngrammar.h:247
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:670
f
Definition: EST_item_aux.cc:48
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:747
bool set_backoff_weight(const EST_StrVector &words, const double w)
ostream & operator<<(ostream &s, const EST_NgrammarState &a)
getString int
Definition: EST_item_aux.cc:50
The file was not written successfully.
void set_frequency(const EST_String &s, double c)
const EST_DiscreteProbDistribution PSTnullProbDistribution
Definition: EST_Ngrammar.cc:55
const EST_StrVector & make_ngram_from_index(const int i) const
bool ngram_exists(const EST_StrVector &words, const double threshold) const
void const_iterate(EST_StrVector &words, void(*function)(const EST_Ngrammar *const n, EST_StrVector &words, void *params), void *params) const
EST_read_status
bool sparse_to_dense()
int delete_file(const EST_String &filename)
OS independent way of removing a file.
Definition: EST_io_aux.h:81
bool init_dense_representation()
void print_freqs(ostream &os)
Definition: EST_PST.cc:310
EST_BackoffNgrammarState * get_child(const EST_String &word) const
Definition: EST_Ngrammar.h:177
bool init(const EST_StrList &vocab)
(re-)initialise
Definition: EST_Discrete.cc:82
#define format_ok
const EST_String & most_probable(double *prob=NULL) const
Definition: EST_Ngrammar.h:122
#define OOV_MARKER
Definition: EST_Ngrammar.h:61
EST_String extension(void) const
double backoff_reverse_probability(const EST_StrVector &words) const
EST_UItem * head() const
Definition: EST_UList.h:97
EST_NgrammarState & find_state(const EST_StrVector &words)
bool build(const EST_StrList &filenames, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER, const EST_String &input_format="", const EST_String &oov_mode="", const int mincount=1, const int maxcount=10)
void iterate(EST_StrVector &words, void(*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params)
bool ngram_exists(const EST_StrVector &words) const
#define TRUE
Definition: EST_bool.h:118
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251
const EST_DiscreteProbDistribution & backoff_prob_dist(const EST_StrVector &words) const
void merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
bool init(int o, representation_t r, const EST_StrList &wordlist)
double probability(const EST_String &w) const
Definition: EST_Ngrammar.h:168
static const EST_String Empty
Constant empty string.
Definition: EST_String.h:110
EST_StrVector vocab
Definition: dp_main.cc:85
EST_write_status save(const EST_String &filename, const EST_String type="cstr_ascii", const bool trace=false, double floor=0.0)
bool init_vocab(const EST_StrList &wordlist)
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:114
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:806
void slide(EST_IVector &v, const int l)
int eoln()
end of line
Definition: EST_Token.cc:832
double get_backoff_weight() const
Definition: EST_Ngrammar.h:196
void default_values()