Edinburgh Speech Tools  2.1-release
ngrammar_io.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* IO functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40 
41 #include <cstdlib>
42 #include <fstream>
43 #include <iostream>
44 #include "EST_unix.h"
45 #include <cstring>
46 #include <climits>
47 #include <cfloat>
48 #include "EST_String.h"
49 #include "EST_Ngrammar.h"
50 #include "EST_Token.h"
51 #include "EST_cutils.h"
52 #include "EST_File.h"
53 
54 
55 using namespace std;
56 
59 {
60  (void)filename;
61  (void)n;
62  return wrong_format;
63 }
64 
67 {
68  (void)filename;
69  (void)n;
70  return wrong_format;
71 }
72 
75 {
76 
77  EST_TokenStream ts;
78  EST_String s;
79  int i,j,k, order=0;
80  //double weight;
81  /*double occur;*/
82  int this_num,this_order;
83 
84  if (ts.open(filename) == -1)
85  return misc_read_error;
86 
87  // find backslash data backslash
88  while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));
89 
90  if (ts.eof())
91  {
92  ts.close();
93  return wrong_format;
94  }
95 
96  // find order and numbers of ngrams
97 
98  // somewhere to keep numbers
99  EST_IVector nums(100); // not going to have anything bigger than a 100-gram !
100 
101  while (!ts.eof())
102  {
103  // have we got to next section
104  if (ts.peek().string().contains("-grams:"))
105  break;
106 
107  s=ts.get_upto_eoln().string();
108 
109  if(s.contains("ngram ") && s.contains("="))
110  {
111 
112  s=s.after("ngram ");
113  this_order=atoi(s.before("="));
114  this_num=atoi(s.after("="));
115 
116  //cerr << "There are " << this_num << " " << this_order
117  //<< "-grams" << endl;
118 
119  nums[this_order] = this_num;
120 
121  if(this_order > order)
122  order = this_order;
123  }
124 
125  }
126 
127 
128  if(order==0)
129  {
130  //cerr << "No ngram ?=? in header !" << endl;
131  ts.close();
132  return wrong_format;
133  }
134 
135  //cerr << "Initialising " << order << "-grammar" << endl;
136  if(!n.init(order,EST_Ngrammar::backoff,vocab))
137  return misc_read_error;
138 
139  // read data
140  for(i=1;i<=order;i++)
141  {
142 
143  EST_StrVector window(i);
144 
145  // find start of data for this order "<order>-grams:"
146  EST_String tmp = "\\" + itoString(i) + "-grams:";
147  while (!ts.eof())
148  {
149  s=ts.get().string();
150  if (s.contains(tmp))
151  break;
152  }
153 
154 
155  if(ts.eof())
156  {
157  cerr << "Unexpected end of grammar file whilst looking for '"
158  << tmp << "'" << endl;
159  return misc_read_error;
160  }
161 
162  //cerr << "Found order " << i << " : " << tmp << endl;
163  //cerr << "Looking for " << nums(i) << " ngrams" << endl;
164  // look for nums(i) ngrams
165 
166  for(j=0;j<nums(i);j++)
167  {
168 
169  for (k=0; ((k<i) && !ts.eof()); k++)
170  window[k] = ts.get().string();
171 
172  if(ts.eof())
173  {
174  cerr << "Unexpected end of file whilst reading " << i
175  << "-grams !" << endl;
176  return misc_read_error;
177  }
178 
179  /*occur = atof(ts.get().string()); unused*/
180  ts.get().string();
181 
182  // can't for backoff grammars, need to set probs directly
183 
184  cerr << "ooooooooops" << endl;
185  return wrong_format;
186  /* BEGIN COMMENT: This code is unreachable
187  //n.accumulate(window,occur);
188 
189  // backoff weight ?
190  if (!ts.eoln())
191  {
192  weight = atof(ts.get().string());
193  n.set_backoff_weight(window,weight);
194  }
195 
196  if (!ts.eoln())
197  {
198  cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
199  << ts.filepos() << endl;
200  ts.close();
201  return misc_read_error;
202  }
203  END COMMENT: This code is unreachable */
204 
205  }
206 
207  } // loop through orders
208 
209 
210  // find backslash end backslash
211  while (!ts.eof())
212  if (ts.get().string() == "\\end\\")
213  {
214  ts.close();
215  return format_ok;
216 
217  }
218 
219  cerr << "Missing \\end\\ !" << endl;
220 
221  ts.close();
222  return misc_read_error;
223 
224 }
225 
228 {
229  EST_TokenStream ts;
230  int i, order;
231  double occur;
232 
233  if (ts.open(filename) == -1)
234  return misc_read_error;
235 
236  if (ts.peek().string() != "Ngram_2")
237  {
238  ts.close();
239  return wrong_format;
240  }
241  ts.get(); // skip magic number
242 
243  order = atoi(ts.get().string());
244  ts.get_upto_eoln(); // skip to next line
246  EST_StrList pred_vocab; // may be different
247 
248  while (!ts.eoln())
249  vocab.append(ts.get().string());
250  ts.get_upto_eoln(); // skip to next line
251  while (!ts.eoln())
252  pred_vocab.append(ts.get().string());
253 
254  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
255  {
256  cerr << "Something may be wrong with the vocab lists in '"
257  << filename << "'" << endl;
258  return misc_read_error;
259  }
260 
261  EST_StrVector window(order);
262 
263  while(!ts.eof())
264  {
265  for (i=0; i < order; i++)
266  window[i] = ts.get().string();
267  if (ts.get().string() != ":")
268  {
269  cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
270  << ts.filepos() << endl;
271  return misc_read_error;
272  }
273  occur = atof(ts.get().string());
274  n.accumulate(window,occur);
275  if (!ts.eoln())
276  {
277  cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
278  << ts.filepos() << endl;
279  return misc_read_error;
280  }
281  }
282 
283  ts.close();
284 
285  return format_ok;
286 }
287 
290 {
291  EST_TokenStream ts;
292  int i,j,order;
293  EST_Litem *k;
294  int num_entries;
295  double approx_num_samples = 0.0;
296  long freq_data_start, freq_data_end;
297  FILE *ifd;
298  int magic = 0;
299  int swap = FALSE;
300 
301  if ((ifd=fopen(filename,"rb")) == NULL)
302  return misc_read_error;
303  if (fread(&magic,sizeof(int),1,ifd) != 1)
304  {
305  cerr << "Could not read integer from " << filename << endl;
306  fclose(ifd);
307  return misc_read_error;
308  }
309  if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
310  swap = TRUE;
311  else if (magic != EST_NGRAMBIN_MAGIC) {
312  fclose(ifd);
313  return wrong_format;
314  }
315  if (ts.open(ifd, FALSE) == -1)
316  return misc_read_error;
317 
318  ts.set_SingleCharSymbols("\n");
319  ts.set_WhiteSpaceChars(" \t\r");
320 
321  if (ts.peek().string() != "mBin_2")
322  {
323  fclose(ifd);
324  ts.close();
325  return wrong_format;
326  }
327  ts.get(); // skip magic number
328 
329  order = atoi(ts.get().string());
330  if (ts.get() != "\n")
331  {
332  fclose(ifd);
333  ts.close();
334  return misc_read_error;
335  }
337  EST_StrList pred_vocab; // may be different
338 
339  while ((ts.peek() != "\n") && (!ts.eof()))
340  vocab.append(ts.get().string());
341  ts.get(); // skip newline
342  while ((ts.peek() != "\n") && (!ts.eof()))
343  pred_vocab.append(ts.get().string());
344 
345  // Need to get to the position one after the newline and
346  // who knows what TokenStream has already read,
347  EST_fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);
348 
349  if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
350  {
351  ts.close();
352  fclose(ifd);
353  return misc_read_error;
354  }
355 
356  EST_StrVector window(order);
357 
358  freq_data_start = EST_ftell(ifd);
359  EST_fseek(ifd,0,SEEK_END);
360  freq_data_end = EST_ftell(ifd);
361  num_entries = (freq_data_end-freq_data_start)/sizeof(double);
362  double *dd = new double[num_entries];
363 
364  // Go back to start of data
365  EST_fseek(ifd,freq_data_start,SEEK_SET);
366 
367  if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
368  {
369  cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
370  ts.close();
371  fclose(ifd);
372  delete[] dd;
373  return misc_read_error;
374  }
375  if (swap)
376  swap_bytes_double(dd,num_entries);
377 
378  for(j=i=0;i<n.num_states();i++)
379  {
380  if (j >= num_entries)
381  {
382  cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
383  ts.close();
384  fclose(ifd);
385  delete[] dd;
386  return misc_read_error;
387  }
388  for (k=n.p_states[i].pdf().item_start();
389  (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
390  k = n.p_states[i].pdf().item_next(k))
391  {
392  n.p_states[i].pdf().set_frequency(k,dd[j]);
393  // Update global info too
394  approx_num_samples += dd[j]; // probably not right
395  n.vocab_pdf.cumulate(k,dd[j]);
396 
397  // Number of consecutive occurrences of this frequency as in
398  // dd[j+1] if its a negative number
399  if (j+1 >= num_entries)
400  j++;
401  else if (dd[j+1] < -1)
402  dd[j+1]++;
403  else if (dd[j+1] == -1)
404  j +=2;
405  else
406  j++;
407  }
408  }
409 
410  // With smoothing num_samples might not be as exact as you like
411  n.p_num_samples = (int)approx_num_samples;
412 
413  delete [] dd;
414 
415  ts.close();
416  fclose(ifd);
417 
418  return format_ok;
419 }
420 
421 // ====================================================================
422 
424 save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
425  EST_Ngrammar &n, double floor)
426 {
427  EST_Litem *k;
428  EST_String name;
429  double freq;
430  EST_StrVector this_ngram(2); // assumes bigram
431  this_ngram[0] = word;
433  this_pdf = n.prob_dist(this_ngram);
434 
435  double lfreq=-1;
436  int lcount=0;
437  double total_freq=0;
438 
439  double floor_prob_total = floor * (n.pred_vocab->length()-1);
440 
441  if (word == n.p_sentence_end_marker)
442  {
443  *ost << word;
444  *ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
445  return write_ok;
446  }
447 
448  if(floor_prob_total > 1)
449  {
450  cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
451  floor = 1.0 / (double)(n.pred_vocab->length()-1);
452  floor_prob_total = 1;
453  }
454 
455  // not efficient but who cares ?
456  for (k=this_pdf.item_start();
457  !this_pdf.item_end(k);
458  k = this_pdf.item_next(k))
459  {
460  this_pdf.item_freq(k,name,freq);
461  if(name != n.p_sentence_start_marker)
462  {
463  total_freq += freq;
464  }
465  }
466 
467 
468  // 0 for prob(word,start marker)
469  *ost << word << " 0 ";
470 
471  if (total_freq <= 0)
472  {
473  *ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
474  *ost << n.pred_vocab->length()-1 << " " << endl;
475  }
476  else
477  {
478  lfreq=-1;
479 
480  for (k=this_pdf.item_start();
481  !this_pdf.item_end(k);
482  k = this_pdf.item_next(k))
483  {
484  this_pdf.item_freq(k,name,freq);
485 
486  if ( (name == n.p_sentence_start_marker) ||
487  (name == n.p_sentence_end_marker) ||
488  (name == OOV_MARKER) )
489  continue;
490 
491  if (freq == lfreq)
492  lcount++;
493  else
494  {
495  if (lcount > 1)
496  *ost << "*" << lcount << " ";
497  else
498  *ost << " ";
499 
500  lcount=1;
501  lfreq = freq;
502 
503  if(freq > 0)
504  {
505  double base_prob = freq / total_freq;
506 
507  // and floor/scale it
508  *ost << floor + ( base_prob * (1-floor_prob_total) );
509 
510  }
511  else
512  *ost << floor;
513 
514  }
515 
516 
517  }
518 
519  } // total_freq > 0
520 
521 
522  if(!n.closed_vocab())
523  {
524 
525  // not fully tested !!!!!!!!
526 
527  *ost << 0 << " ERROR !!!!!!!! ";
528  }
529 
530 
531  if (total_freq > 0)
532  {
533  freq = this_pdf.frequency(n.p_sentence_end_marker);
534 
535  if(freq == lfreq)
536  {
537  lcount++;
538  *ost << "*" << lcount << " " << endl;
539  }
540  else
541  {
542 
543  if (lcount > 1)
544  *ost << "*" << lcount << " ";
545  else
546  *ost << " ";
547 
548  if(freq > 0)
549  {
550  double base_prob = freq / total_freq;
551 
552  // and floor/scale it
553  *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
554 
555  }
556  else
557  *ost << floor << endl;
558  }
559  }
560 
561  return write_ok;
562 }
563 
566  EST_Ngrammar &n, double floor)
567 {
568 
569  ostream *ost;
570 
571  // only for bigram
572  if(n.order() != 2)
573  {
574  cerr << "Can only save bigrams in htk_ascii format" << endl;
575  return misc_write_error;
576  }
577 
578  if (floor < 0)
579  {
580  cerr << "Negative floor probability does not make sense !" << endl;
581  return misc_write_error;
582  }
583 
584  if (filename == "-")
585  ost = &cout;
586  else
587  ost = new ofstream(filename);
588 
589  if(!(*ost))
590  return write_fail;
591 
592  if(floor * (n.pred_vocab->length()-1) > 1)
593  {
594  floor = 1.0 / (double)(n.pred_vocab->length()-1);
595  cerr << "ERROR : floor is impossibly large, scaling it to ";
596  cerr << floor << endl;
597  }
598 
599  int i;
600 
601  if(n.p_sentence_start_marker == "")
602  {
603  cerr << "Can't save in HTK format as no sentence start/end tags"
604  << " were given !" << endl;
605  return misc_write_error;
606  }
607 
608  // need '!ENTER' (or whatever) as first word- that's HTK for you
610 
611  // the real words
612  for(i=0;i<n.vocab->length();i++)
613  {
614  if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
615  (n.vocab->name(i) != n.p_sentence_end_marker) &&
616  (n.vocab->name(i) != OOV_MARKER) )
617  save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
618  }
619 
620  if(!n.closed_vocab())
622 
624 
625  if(ost != &cout)
626  delete ost;
627 
628  return write_ok;
629 }
630 
631 /*
632  EST_write_status
633  save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
634  {
635  return write_ok;
636  }
637  */
638 
639 void
641 {
642  if(n->ngram_exists(ngram))
643  *((double*)count) += 1;
644 }
645 
646 void
648 {
649 
650  int i;
651 
652  if(n->ngram_exists(ngram))
653  {
654  *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
655  for(i=0;i<ngram.n();i++)
656  *((ostream*)(ost)) << ngram(i) << " ";
657 
658  if ((n->representation() == EST_Ngrammar::backoff) &&
659  (n->order() > ngram.n()) )
660  *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
661  //<< " = "
662  //<< n->get_backoff_weight(ngram) << " ";
663 
664  *((ostream*)(ost)) << endl;
665 
666  }
667 }
668 
671 {
672  // ARPA MIT-LL format - see HTK manual !!
673 
674  ostream *ost;
675  int i,o;
676  /*int num_n;*/
677  if (filename == "-")
678  ost = &cout;
679  else
680  ost = new ofstream(filename);
681 
682  if (!(*ost))
683  return write_fail;
684 
685  //n.set_entry_type(EST_Ngrammar::probabilities);
686  //n.make_htk_compatible(); // fix enter/exit probs
687  //*ost << *(n.vocab) << endl;
688 
689  // count number of ngrams
690  /*num_n = (int)n.samples();*/
691  *ost << "\\data\\" << endl;
692 
693  double *count = new double;
694 
696  {
697  for(o=1;o<=n.order();o++)
698  {
699  EST_StrVector ngram(o);
700  for(i=0;i<o;i++)
701  ngram[i] = "";
702  *count =0;
703 
704  // this is a deeply silly way to count them,
705  // we could traverse the tree directly !
706  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
707  *ost << "ngram " << o << "=" << *count << endl;
708  }
709 
710  for(o=1;o<=n.order();o++)
711  {
712  *ost << endl;
713  *ost << "\\" << o << "-grams:" << endl;
714  EST_StrVector ngram(o);
715  for(i=0;i<o;i++)
716  ngram[i] = "";
717  n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
718  }
719 
720  }
721  else
722  {
723  EST_StrVector ngram(n.order());
724  for(i=0;i<n.order();i++)
725  ngram[i] = "";
726  *count =0;
727  n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
728  *ost << "ngram " << n.order() << "=" << *count << endl;
729 
730  *ost << endl;
731  *ost << "\\" << n.order() << "-grams:" << endl;
732 
733  for(i=0;i<n.order();i++)
734  ngram[i] = "";
735  n.iterate(ngram,&save_ngram_arpa_sub,ost);
736 
737  }
738 
739  *ost << "\\end\\" << endl;
740 
741  if (ost != &cout) delete ost;
742  delete count;
743  return write_ok;
744 }
745 
748  const bool trace, double floor)
749 {
750  // awb's format
751  (void)trace;
752  ostream *ost;
753  int i;
754  EST_Litem *k;
755 
756  if (filename == "-")
757  ost = &cout;
758  else
759  ost = new ofstream(filename);
760 
761  if(!(*ost))
762  return write_fail;
763 
764  *ost << "Ngram_2 " << n.order() << endl;
765  for (i=0; i < n.vocab->length(); i++)
766  *ost << n.vocab->name(i) << " ";
767  *ost << endl;
768  for (i=0; i < n.pred_vocab->length(); i++)
769  *ost << n.pred_vocab->name(i) << " ";
770  *ost << endl;
771 
773  n.print_freqs(*ost,floor);
774  else if (n.representation() == EST_Ngrammar::backoff)
775  {
776  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
777 
778  for(i=0;i<total_ngrams;i++)
779  {
781  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
782  this_pdf = n.prob_dist(this_ngram);
783 
784  for (k=this_pdf.item_start();
785  !this_pdf.item_end(k);
786  k = this_pdf.item_next(k))
787  {
788  double freq;
789  EST_String name;
790  this_pdf.item_freq(k,name,freq);
791 
792  for (int jj=0; jj < this_ngram.n(); jj++)
793  *ost << this_ngram(jj) << " ";
794  *ost << name << " : " << freq << endl;
795  }
796  }
797  }
798 
799  if(ost != &cout)
800  delete ost;
801 
802  return write_ok;
803 }
804 
807 {
808  // Save as a WFST
809  FILE *ost;
810  int i;
811 
812  if ((ost = fopen(filename,"wb")) == NULL)
813  {
814  cerr << "Ngrammar save: unable to open \"" << filename <<
815  "\" for writing" << endl;
816  return write_fail;
817  }
818 
819  fprintf(ost,"EST_File fst\n");
820  fprintf(ost,"DataType ascii\n");
821  fprintf(ost,"in \"(");
822  for (i=0; i < n.vocab->length(); i++)
823  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
824  fprintf(ost," )\"\n");
825  fprintf(ost,"out \"(");
826  for (i=0; i < n.vocab->length(); i++)
827  fprintf(ost," %s\n",(const char *)n.vocab->name(i));
828  fprintf(ost," )\"\n");
829  fprintf(ost,"NumStates %d\n",n.num_states());
830  fprintf(ost,"EST_Header_End\n");
831 
832  for (i=0; i<n.num_states(); i++)
833  {
834  fprintf(ost,"((%d nonfinal %d)\n",i,i);
835  fprintf(ost,")\n");
836  }
837 
838  fclose(ost);
839 
840  return write_ok;
841 }
842 
845  const bool trace, double floor)
846 {
847 
849  return misc_write_error;
850 
851  int i;
852  EST_Litem *k;
853  FILE *ofd;
854  double lfreq = -1;
855  double count = -1;
856  int magic = EST_NGRAMBIN_MAGIC;
857 
858  if (filename == "-")
859  {
860  if ((ofd=stdout) == NULL)
861  return misc_write_error;
862  }
863  else
864  {
865  if ((ofd=fopen(filename,"wb")) == NULL)
866  return misc_write_error;
867  }
868 
869  fwrite(&magic,sizeof(int),1,ofd);
870  fprintf(ofd,"mBin_2 %d\n",n.order());
871  for (i=0; i < n.vocab->length(); i++)
872  fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
873  fprintf(ofd,"\n");
874  for (i=0; i < n.pred_vocab->length(); i++)
875  fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
876  fprintf(ofd,"\n");
877 
878  // We use a simple form of run-length encoding, if consecutive
879  // values are equal only a length is printed. lengths are
880  // negative as frequencies (even smoothed ones) can never be -ve
881 
882  if ( trace )
883  cerr << "Saving ..." << endl;
884 
886  {
887  for(i=0;i<n.num_states();i++)
888  {
889 
890  if ( trace )
891  cerr << "\r" << i*100/n.num_states() << "%";
892 
893  for (k=n.p_states[i].pdf().item_start();
894  !n.p_states[i].pdf().item_end(k);
895  k = n.p_states[i].pdf().item_next(k))
896  {
897  double freq;
898  EST_String name;
899  n.p_states[i].pdf().item_freq(k,name,freq);
900  if (freq == 0.0)
901  freq = floor;
902  if (freq == lfreq)
903  count--;
904  else
905  {
906  if (count < -1)
907  fwrite(&count,sizeof(double),1,ofd);
908  fwrite(&freq,sizeof(double),1,ofd);
909  count = -1;
910  }
911  lfreq = freq;
912  }
913  }
914  if (count < -1)
915  fwrite(&count,sizeof(double),1,ofd);
916  }
917  else if (n.representation() == EST_Ngrammar::backoff)
918  {
919  // need to construct pdfs in right order
920  // noting that dense states are indexed s.t. the last
921  // word in the ngram is the least significant 'bit'
922 
923  // number of ngrams, excluding last word, is
924  int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
925 
926  for(i=0;i<total_ngrams;i++)
927  {
928 
929  if ( trace )
930  cerr << "\r" << i*100/total_ngrams << "%";
931 
933  const EST_StrVector this_ngram = n.make_ngram_from_index(i);
934  this_pdf = n.prob_dist(this_ngram);
935 
936  for (k=this_pdf.item_start();
937  !this_pdf.item_end(k);
938  k = this_pdf.item_next(k))
939  {
940 
941  double freq;
942  EST_String name;
943  this_pdf.item_freq(k,name,freq);
944  if (freq == lfreq)
945  count--;
946  else
947  {
948  if (count < -1)
949  fwrite(&count,sizeof(double),1,ofd);
950  fwrite(&freq,sizeof(double),1,ofd);
951  count = -1;
952  }
953  lfreq = freq;
954  }
955 
956 
957  }
958 
959  }
960  if ( trace )
961  cerr << "\r \r" << endl;
962 
963  fclose(ofd);
964 
965  return write_ok;
966 }
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:341
#define EST_NGRAMBIN_MAGIC
Definition: EST_Ngrammar.h:63
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
EST_FilePos filepos(void) const
file position in original EST_TokenStream.
Definition: EST_Token.h:189
EST_String p_sentence_end_marker
Definition: EST_Ngrammar.h:239
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
Definition: EST_String.h:365
void count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
Definition: ngrammar_io.cc:640
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:227
EST_FilePos EST_ftell(FILE *fp)
Definition: EST_File.h:71
EST_write_status
#define SWAPINT(x)
Definition: EST_cutils.h:75
int closed_vocab() const
Definition: EST_Ngrammar.h:423
void accumulate(const EST_StrVector &words, const double count=1)
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344
void close(void)
Close stream.
Definition: EST_Token.cc:419
const EST_String & name(const int n) const
The name given the index.
double get_backoff_weight(const EST_StrVector &words) const
int length(void) const
The number of members in the discrete.
EST_String itoString(int n)
Make a EST_String object from an integer.
Definition: util_io.cc:141
void swap_bytes_double(double *data, int length)
Definition: EST_swapping.cc:68
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
Definition: ngrammar_io.cc:74
EST_Discrete * vocab
Definition: EST_Ngrammar.h:283
EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:424
EST_Litem * item_start() const
Used for iterating through members of the distribution.
EST_Discrete * pred_vocab
Definition: EST_Ngrammar.h:284
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276
int num_states(void) const
Definition: EST_Ngrammar.h:413
double safe_log10(const double x)
Definition: EST_math.h:178
EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:58
EST_DiscreteProbDistribution vocab_pdf
Definition: EST_Ngrammar.h:292
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
const EST_DiscreteProbDistribution & prob_dist(const EST_StrVector &words) const
int eof()
end of file
Definition: EST_Token.h:362
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
double frequency(const EST_String &s) const
#define SEEK_END
Definition: system.h:28
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
#define misc_write_error
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:844
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:565
The file was written successfully.
#define wrong_format
representation_t representation() const
Definition: EST_Ngrammar.h:425
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:289
#define FALSE
Definition: EST_bool.h:119
void print_freqs(ostream &os, double floor=0.0)
#define misc_read_error
NULL
Definition: EST_WFST.cc:55
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:670
int EST_fseek(FILE *fp, EST_FilePos offset, int whence)
Definition: EST_File.h:75
EST_Token & peek(void)
peek at next token
Definition: EST_Token.h:332
EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:66
int get_vocab_length() const
Definition: EST_Ngrammar.h:416
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:747
getString int
Definition: EST_item_aux.cc:50
The file was not written successfully.
void set_frequency(const EST_String &s, double c)
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
const EST_StrVector & make_ngram_from_index(const int i) const
int order() const
Definition: EST_Ngrammar.h:415
EST_read_status
EST_FilePos filepos(void) const
current file position in EST_TokenStream
Definition: EST_Token.h:367
const EST_String & string() const
Definition: EST_Token.h:120
#define format_ok
#define OOV_MARKER
Definition: EST_Ngrammar.h:61
EST_Token get_upto_eoln(void)
Get everything up to the end of the line as a single token.
Definition: EST_Token.cc:529
void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
Definition: ngrammar_io.cc:647
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:308
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:276
void iterate(EST_StrVector &words, void(*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params)
bool ngram_exists(const EST_StrVector &words) const
#define TRUE
Definition: EST_bool.h:118
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251
bool init(int o, representation_t r, const EST_StrList &wordlist)
EST_String p_sentence_start_marker
Definition: EST_Ngrammar.h:238
EST_StrVector vocab
Definition: dp_main.cc:85
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:806
int eoln()
end of line
Definition: EST_Token.cc:832
#define SEEK_SET
Definition: system.h:20