speech-tools/ngrammar__io_8cc_source.html

 /*************************************************************************/
 /*                                                                       */
 /*                Centre for Speech Technology Research                  */
 /*                     University of Edinburgh, UK                       */
 /*                      Copyright (c) 1996,1997                          */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*                     Author :  Simon King & Alan W Black               */
 /*                     Date   :  February 1997                           */
 /*-----------------------------------------------------------------------*/
 /*                                                                       */
 /* IO functions for EST_Ngram class                                      */
 /*                                                                       */
 /*=======================================================================*/

 #include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include "EST_unix.h"
 #include <cstring>
 #include <climits>
 #include <cfloat>
 #include "EST_String.h"
 #include "EST_Ngrammar.h"
 #include "EST_Token.h"
 #include "EST_cutils.h"
 #include "EST_File.h"


 using namespace std;

 /** Load an n-gram from an HTK ASCII file -- not implemented.
  *
  *  Neither argument is used; the function always reports wrong_format.
  */
 EST_read_status
 load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
 {
     // Touch the parameters so they do not raise unused-parameter warnings.
     static_cast<void>(n);
     static_cast<void>(filename);

     return wrong_format;
 }

 /** Load an n-gram from an HTK binary file -- not implemented.
  *
  *  Neither argument is used; the function always reports wrong_format.
  */
 EST_read_status
 load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
 {
     // Touch the parameters so they do not raise unused-parameter warnings.
     static_cast<void>(n);
     static_cast<void>(filename);

     return wrong_format;
 }

 /** Load an ARPA (MIT-LL) format backoff n-gram from a file.
  *
  *  The header that follows the "data" marker is scanned for
  *  "ngram <order>=<count>" lines, n is initialised as a backoff grammar
  *  of the highest order found (using vocab), and the "<order>-grams:"
  *  section for each order is then located in turn.
  *
  *  NOTE: reading the n-gram entries themselves is not implemented.  As
  *  soon as the first entry of any section has been read this function
  *  prints "ooooooooops" and returns wrong_format, so only a file whose
  *  header declares zero entries for every order can reach the final
  *  check for the "end" marker.
  *
  *  @param filename  file to read
  *  @param n         grammar to initialise
  *  @param vocab     word list handed to EST_Ngrammar::init
  *  @return format_ok when the "end" marker is found; wrong_format when
  *          the header is missing or invalid, or when an n-gram entry is
  *          encountered; misc_read_error when the file cannot be opened,
  *          n cannot be initialised, or the file is truncated.
  */
 EST_read_status
 load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
 {

     EST_TokenStream ts;
     EST_String s;
     int i,j,k, order=0;
     //double weight;
     /*double occur;*/
     int this_num,this_order;

     if (ts.open(filename) == -1)
         return misc_read_error;

     // find  backslash data backslash
     while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));

     if (ts.eof())
     {
         ts.close();
         return wrong_format;
     }

     // find order and numbers of ngrams

     // somewhere to keep numbers
     EST_IVector nums(100); // not going to have anything bigger than a 100-gram !

     // Orders that the header does not mention must count as empty, so
     // start every slot at zero rather than leaving it unset.
     for (i=0; i<nums.n(); i++)
         nums[i] = 0;

     while (!ts.eof())
     {
         // have we got to next section
         if (ts.peek().string().contains("-grams:"))
             break;

         s=ts.get_upto_eoln().string();

         if(s.contains("ngram ") && s.contains("="))
         {

             s=s.after("ngram ");
             this_order=atoi(s.before("="));
             this_num=atoi(s.after("="));

             //cerr << "There are " << this_num << " " << this_order
             //<< "-grams" << endl;

             // The order comes straight from the file: refuse anything
             // that would index outside the nums table.
             if ((this_order < 0) || (this_order >= nums.n()))
             {
                 cerr << "EST_Ngrammar:load_ngram_arpa unsupported ngram order "
                      << this_order << " in header" << endl;
                 ts.close();
                 return wrong_format;
             }

             nums[this_order] = this_num;

             if(this_order > order)
                 order = this_order;
         }

     }


     if(order==0)
     {
         //cerr << "No ngram ?=? in header !" << endl;
         ts.close();
         return wrong_format;
     }

     //cerr << "Initialising " << order << "-grammar" << endl;
     if(!n.init(order,EST_Ngrammar::backoff,vocab))
     {
         ts.close();
         return misc_read_error;
     }

     // read data
     for(i=1;i<=order;i++)
     {

         EST_StrVector window(i);

         // find start of data for this order "<order>-grams:"
         EST_String tmp =  "\\" + itoString(i) + "-grams:";
         while (!ts.eof())
         {
             s=ts.get().string();
             if (s.contains(tmp))
                 break;
         }


         if(ts.eof())
         {
             cerr << "Unexpected end of grammar file whilst looking for '"
                  << tmp << "'" << endl;
             ts.close();
             return misc_read_error;
         }

         //cerr << "Found order " << i << " : " << tmp << endl;
         //cerr << "Looking for " << nums(i) << " ngrams" << endl;
         // look for nums(i) ngrams

         for(j=0;j<nums(i);j++)
         {

             for (k=0; ((k<i) && !ts.eof()); k++)
                 window[k] = ts.get().string();

             if(ts.eof())
             {
                 cerr << "Unexpected end of file whilst reading " << i
                      << "-grams !" << endl;
                 ts.close();
                 return misc_read_error;
             }

             /*occur = atof(ts.get().string()); unused*/
             ts.get().string();

             // can't for backoff grammars, need to set probs directly

             cerr << "ooooooooops" << endl;
             ts.close();
             return wrong_format;
             /* BEGIN COMMENT: This code is unreachable
             //n.accumulate(window,occur);

             // backoff weight ?
             if (!ts.eoln())
             {
             weight = atof(ts.get().string());
             n.set_backoff_weight(window,weight);
             }

             if (!ts.eoln())
             {
             cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
                 << ts.filepos() << endl;
             ts.close();
             return misc_read_error;
             }
             END COMMENT: This code is unreachable */

         }

     } // loop through orders


     // find backslash end backslash
     while (!ts.eof())
         if (ts.get().string() == "\\end\\")
         {
             ts.close();
             return format_ok;

         }

     cerr << "Missing \\end\\ !" << endl;

     ts.close();
     return misc_read_error;

 }

 /** Load a CSTR ASCII ("Ngram_2") n-gram file into n.
  *
  *  Layout as read here: the magic word "Ngram_2" followed by the order,
  *  one line holding the vocabulary, one line holding the predictee
  *  vocabulary, then one entry per line of the form
  *  "w1 ... wN : count" which is passed to EST_Ngrammar::accumulate.
  *
  *  @param filename  file to read
  *  @param n         grammar to initialise (dense representation) and fill
  *  @return format_ok on success, wrong_format when the magic word is
  *          absent, misc_read_error for open, init or syntax failures.
  */
 EST_read_status
 load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
 {
     EST_TokenStream stream;

     if (stream.open(filename) == -1)
         return misc_read_error;

     // The file must start with the format's magic word.
     if (stream.peek().string() != "Ngram_2")
     {
         stream.close();
         return wrong_format;
     }
     stream.get();                   // consume the magic word

     const int order = atoi(stream.get().string());
     stream.get_upto_eoln();         // discard the rest of the header line

     // First the vocabulary, then the (possibly different) predictee one,
     // each on a line of its own.
     EST_StrList vocab;
     EST_StrList pred_vocab;

     while (!stream.eoln())
         vocab.append(stream.get().string());
     stream.get_upto_eoln();         // move on to the next line
     while (!stream.eoln())
         pred_vocab.append(stream.get().string());

     const bool initialised =
         n.init(order,EST_Ngrammar::dense,vocab,pred_vocab);
     if (!initialised)
     {
         cerr << "Something may be wrong with the vocab lists in '"
              << filename << "'" << endl;
         return misc_read_error;
     }

     EST_StrVector window(order);

     // One "words : count" entry per line until the stream runs out.
     while (!stream.eof())
     {
         for (int slot = 0; slot < order; slot++)
             window[slot] = stream.get().string();

         if (stream.get().string() != ":")
         {
             cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
                  << stream.filepos() << endl;
             return misc_read_error;
         }

         const double occurrences = atof(stream.get().string());
         n.accumulate(window,occurrences);

         if (!stream.eoln())
         {
             cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
                  << stream.filepos() << endl;
             return misc_read_error;
         }
     }

     stream.close();
     return format_ok;
 }

 /** Load a CSTR binary ("mBin_2") n-gram file into n.
  *
  *  Layout as read here: a native int magic number (byte swapped data is
  *  detected through SWAPINT), the text header "mBin_2 <order>", one
  *  line of vocabulary, one line of predictee vocabulary, then raw
  *  doubles holding the frequencies of every state in order.  The
  *  doubles are run-length encoded: a negative value following a
  *  frequency gives the number of consecutive items sharing it.
  *
  *  @param filename  file to read
  *  @param n         grammar to initialise (dense representation) and fill
  *  @return format_ok on success, wrong_format when either magic marker
  *          does not match, misc_read_error for open, init or data
  *          failures.
  */
 EST_read_status
 load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
 {
     EST_TokenStream ts;
     int i,j,order;
     EST_Litem *k;
     int num_entries;
     double approx_num_samples = 0.0;
     long freq_data_start, freq_data_end;
     FILE *ifd;
     int magic = 0;
     int swap = FALSE;

     if ((ifd=fopen(filename,"rb")) == NULL)
         return misc_read_error;
     if (fread(&magic,sizeof(int),1,ifd) != 1)
     {
         cerr << "Could not read integer from " << filename << endl;
         fclose(ifd);
         return misc_read_error;
     }
     if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
         swap = TRUE;
     else if (magic != EST_NGRAMBIN_MAGIC) {
         fclose(ifd);
         return wrong_format;
     }

     // The stream is opened with FALSE as its second argument and every
     // other exit path closes ifd by hand, so do the same here.
     if (ts.open(ifd, FALSE) == -1)
     {
         fclose(ifd);
         return misc_read_error;
     }

     // Newlines must come back as tokens so the header lines can be told apart.
     ts.set_SingleCharSymbols("\n");
     ts.set_WhiteSpaceChars(" \t\r");

     if (ts.peek().string() != "mBin_2")
     {
         ts.close();
         fclose(ifd);
         return wrong_format;
     }
     ts.get();           // skip magic number

     order = atoi(ts.get().string());
     if (ts.get() != "\n")
     {
         ts.close();
         fclose(ifd);
         return misc_read_error;
     }
     EST_StrList vocab;
     EST_StrList pred_vocab; // may be different

     while ((ts.peek() != "\n") && (!ts.eof()))
         vocab.append(ts.get().string());
     ts.get();           // skip newline
     while ((ts.peek() != "\n") && (!ts.eof()))
         pred_vocab.append(ts.get().string());

     // Need to get to the position one after the newline and
     // who knows what TokenStream has already read,
     // NOTE(review): the +5 offset is taken on trust from the original
     // code; its derivation is not visible here.
     EST_fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);

     if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
     {
         ts.close();
         fclose(ifd);
         return misc_read_error;
     }

     // Everything from here to end of file is frequency data.
     freq_data_start = EST_ftell(ifd);
     EST_fseek(ifd,0,SEEK_END);
     freq_data_end = EST_ftell(ifd);

     // A truncated file can leave the computed start beyond the end;
     // that must not turn into a negative or huge allocation size.
     if ((freq_data_start < 0) || (freq_data_end < freq_data_start))
     {
         cerr << "EST_Ngrammar::load_ngram_cstr_bin could not locate frequency data" << endl;
         ts.close();
         fclose(ifd);
         return misc_read_error;
     }

     num_entries = (freq_data_end-freq_data_start)/sizeof(double);
     double *dd = new double[num_entries];

     // Go back to start of data
     EST_fseek(ifd,freq_data_start,SEEK_SET);

     if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
     {
         cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
         ts.close();
         fclose(ifd);
         delete[] dd;
         return misc_read_error;
     }
     if (swap)
         swap_bytes_double(dd,num_entries);

     for(j=i=0;i<n.num_states();i++)
     {
         if (j >= num_entries)
         {
             cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
             ts.close();
             fclose(ifd);
             delete[] dd;
             return misc_read_error;
         }
         for (k=n.p_states[i].pdf().item_start();
              (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
              k = n.p_states[i].pdf().item_next(k))
         {
             n.p_states[i].pdf().set_frequency(k,dd[j]);
             // Update global info too
             approx_num_samples += dd[j]; // probably not right
             n.vocab_pdf.cumulate(k,dd[j]);

             // Number of consecutive occurrences of this frequency as in
             // dd[j+1] if its a negative number
             if (j+1 >= num_entries)
                 j++;
             else if (dd[j+1] < -1)
                 dd[j+1]++;          // run continues: count it down in place
             else if (dd[j+1] == -1)
                 j +=2;              // run finished: skip value and counter
             else
                 j++;
         }
     }

     // With smoothing num_samples might not be as exact as you like
     n.p_num_samples = (int)approx_num_samples;

     delete [] dd;

     ts.close();
     fclose(ifd);

     return format_ok;
 }

 // ====================================================================

 /** Write one row of an HTK ASCII bigram file, for the history `word`.
  *
  *  The row starts with `word`, followed by probabilities taken from
  *  n.prob_dist() for a two-slot n-gram whose first slot is `word` (the
  *  second slot is left as constructed).  Runs of items with equal
  *  frequency are compressed to "value*count".
  *
  *  @param word   history word for this row
  *  @param ost    stream the row is written to
  *  @param n      bigram grammar supplying the distribution and markers
  *  @param floor  minimum probability given to each item; rescaled here
  *                when floor * (pred_vocab length - 1) exceeds 1
  *  @return always write_ok
  */
 EST_write_status
 save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
              EST_Ngrammar &n, double floor)
 {
     EST_Litem *k;
     EST_String name;
     double freq;
     EST_StrVector this_ngram(2); // assumes bigram
     this_ngram[0] = word;
     EST_DiscreteProbDistribution this_pdf;
     this_pdf = n.prob_dist(this_ngram);

     // lfreq/lcount hold the run-length state: the frequency of the
     // current run and how many items it has covered so far.
     double lfreq=-1;
     int lcount=0;
     double total_freq=0;

     // Probability mass used up if every item (bar one) receives the floor.
     double floor_prob_total = floor * (n.pred_vocab->length()-1);

     // The sentence end marker gets a fixed row and nothing else.
     if (word == n.p_sentence_end_marker)
     {
     *ost << word;
     *ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
     return write_ok;
     }

     if(floor_prob_total > 1)
     {
     cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
     floor = 1.0 / (double)(n.pred_vocab->length()-1);
     floor_prob_total = 1;
     }

     // Total frequency over every item except the sentence start marker.
     // not efficient but who cares ?
     for (k=this_pdf.item_start();
      !this_pdf.item_end(k);
      k = this_pdf.item_next(k))
     {
     this_pdf.item_freq(k,name,freq);
     if(name != n.p_sentence_start_marker)
     {
         total_freq += freq;
     }
     }


     // 0 for prob(word,start marker)
     *ost << word << " 0 ";

     if (total_freq <= 0)
     {
     // Nothing observed after this word: write a uniform distribution.
     *ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
     *ost << n.pred_vocab->length()-1 << " " << endl;
     }
     else
     {
     lfreq=-1;

     for (k=this_pdf.item_start();
          !this_pdf.item_end(k);
          k = this_pdf.item_next(k))
     {
         this_pdf.item_freq(k,name,freq);

         // Start, end and OOV markers are not written by this loop; the
         // end marker is handled after it.
         if ( (name == n.p_sentence_start_marker) ||
         (name == n.p_sentence_end_marker) ||
         (name == OOV_MARKER) )
         continue;

         if (freq == lfreq)
         lcount++;
         else
         {
         // Close the previous run before starting a new one.
         if (lcount > 1)
             *ost << "*" << lcount << " ";
         else
             *ost << " ";

         lcount=1;
         lfreq = freq;

         if(freq > 0)
         {
             double base_prob = freq / total_freq;

             // and floor/scale it
             *ost << floor + ( base_prob * (1-floor_prob_total) );

         }
         else
             *ost << floor;

         }


     }

     }               // total_freq > 0


     if(!n.closed_vocab())
     {

     // not fully tested !!!!!!!!

     *ost << 0 << " ERROR !!!!!!!! ";
     }


     // The sentence end marker goes last; it either extends the run left
     // open by the loop above or closes that run and adds its own value.
     if (total_freq > 0)
     {
     freq = this_pdf.frequency(n.p_sentence_end_marker);

     if(freq == lfreq)
     {
         lcount++;
         *ost << "*" << lcount << " " << endl;
     }
     else
     {

         if (lcount > 1)
         *ost << "*" << lcount << " ";
         else
         *ost << " ";

         if(freq > 0)
         {
         double base_prob = freq / total_freq;

         // and floor/scale it
         *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;

         }
         else
         *ost << floor << endl;
     }
     }

     return write_ok;
 }

 /** Save a bigram in HTK ASCII format.
  *
  *  One row is written per history through save_ngram_htk_ascii_sub: the
  *  sentence start marker first, then every vocabulary word that is not
  *  a start/end/OOV marker, then the OOV marker when the vocabulary is
  *  open, and the sentence end marker last.
  *
  *  @param filename  output file, or "-" for standard output
  *  @param n         grammar to save; must have order 2 and a sentence
  *                   start marker
  *  @param floor     floor probability; must not be negative and is
  *                   scaled down when impossibly large
  *  @return write_ok on success, write_fail when the output cannot be
  *          opened, misc_write_error for an unsuitable grammar or floor.
  */
 EST_write_status
 save_ngram_htk_ascii(const EST_String filename,
              EST_Ngrammar &n, double floor)
 {

     ostream *ost;

     // only for bigram
     if(n.order() != 2)
     {
         cerr << "Can only save bigrams in htk_ascii format" << endl;
         return misc_write_error;
     }

     if (floor < 0)
     {
         cerr << "Negative floor probability does not make sense !" << endl;
         return misc_write_error;
     }

     if (filename == "-")
         ost = &cout;
     else
         ost = new ofstream(filename);

     if(!(*ost))
     {
         // A file stream that failed to open is still ours to free.
         if(ost != &cout)
             delete ost;
         return write_fail;
     }

     if(floor * (n.pred_vocab->length()-1) > 1)
     {
         floor = 1.0 / (double)(n.pred_vocab->length()-1);
         cerr << "ERROR : floor is impossibly large, scaling it to ";
         cerr << floor << endl;
     }

     int i;

     if(n.p_sentence_start_marker == "")
     {
         cerr << "Can't save in HTK format as no sentence start/end tags"
              << " were given !" << endl;
         // Release the stream opened above before giving up.
         if(ost != &cout)
             delete ost;
         return misc_write_error;
     }

     // need '!ENTER' (or whatever) as first word- that's HTK for you
     save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);

     // the real words
     for(i=0;i<n.vocab->length();i++)
     {
         if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
              (n.vocab->name(i) != n.p_sentence_end_marker) &&
              (n.vocab->name(i) != OOV_MARKER) )
             save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
     }

     if(!n.closed_vocab())
         save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);

     save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);

     if(ost != &cout)
         delete ost;

     return write_ok;
 }

 /*
    EST_write_status
    save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
    {
    return write_ok;
    }
    */

 /** EST_Ngrammar::iterate callback that counts existing n-grams.
  *
  *  @param n      grammar being iterated
  *  @param ngram  candidate n-gram
  *  @param count  points at a double that is incremented by one when
  *                n->ngram_exists(ngram) holds
  */
 void
 count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
 {
     if (!n->ngram_exists(ngram))
         return;

     double *tally = (double*)count;
     *tally += 1;
 }

 /** EST_Ngrammar::iterate callback that prints one ARPA entry.
  *
  *  For an n-gram that exists in n, writes its log10 probability and its
  *  words; for a backoff grammar whose order exceeds the n-gram length
  *  the log10 backoff weight follows.  Each entry ends with a newline.
  *
  *  @param n      grammar being iterated
  *  @param ngram  candidate n-gram
  *  @param ost    points at the ostream to write to
  */
 void
 save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
 {
     // Only n-grams actually present in the grammar are listed.
     if (!n->ngram_exists(ngram))
         return;

     ostream &out = *((ostream*)(ost));

     out << safe_log10(n->probability(ngram)) << " ";

     for (int w = 0; w < ngram.n(); w++)
         out << ngram(w) << " ";

     // Only backoff grammars carry a weight, and not at the top order.
     const bool has_backoff_weight =
         (n->representation() == EST_Ngrammar::backoff) &&
         (n->order() > ngram.n());
     if (has_backoff_weight)
         out << safe_log10(n->get_backoff_weight(ngram));

     out << endl;
 }

 /** Save a grammar in ARPA MIT-LL format (see the HTK manual).
  *
  *  Backoff grammars get an "ngram <o>=<count>" header line and an
  *  "<o>-grams:" section for every order from 1 up to n.order(); any
  *  other representation gets a single header line and section for
  *  n.order() only.  Counts and entries come from n.iterate() with
  *  count_ngram_arpa_sub and save_ngram_arpa_sub.
  *
  *  @param filename  output file, or "-" for standard output
  *  @param n         grammar to save
  *  @return write_ok on success, write_fail when the output cannot be
  *          opened.
  */
 EST_write_status
 save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
 {
     // ARPA MIT-LL format - see HTK manual !!

     ostream *ost;
     int i,o;
     /*int num_n;*/
     if (filename == "-")
         ost = &cout;
     else
         ost = new ofstream(filename);

     if (!(*ost))
     {
         // A file stream that failed to open is still ours to free.
         if (ost != &cout)
             delete ost;
         return write_fail;
     }

     //n.set_entry_type(EST_Ngrammar::probabilities);
     //n.make_htk_compatible(); // fix enter/exit probs
     //*ost << *(n.vocab) << endl;

     // count number of ngrams
     /*num_n = (int)n.samples();*/
     *ost << "\\data\\" << endl;

     // Tally filled in by count_ngram_arpa_sub through its void* parameter.
     double count = 0;

     if (n.representation() == EST_Ngrammar::backoff)
     {
         for(o=1;o<=n.order();o++)
         {
             EST_StrVector ngram(o);
             for(i=0;i<o;i++)
                 ngram[i] = "";
             count = 0;

             // this is a deeply silly way to count them,
             // we could traverse the tree directly !
             n.iterate(ngram,&count_ngram_arpa_sub,(void*)&count);
             *ost << "ngram " << o << "=" << count << endl;
         }

         for(o=1;o<=n.order();o++)
         {
             *ost << endl;
             *ost << "\\" << o << "-grams:" << endl;
             EST_StrVector ngram(o);
             for(i=0;i<o;i++)
                 ngram[i] = "";
             n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
         }

     }
     else
     {
         EST_StrVector ngram(n.order());
         for(i=0;i<n.order();i++)
             ngram[i] = "";
         count = 0;
         n.iterate(ngram,&count_ngram_arpa_sub,(void*)&count);
         *ost << "ngram " << n.order() << "=" << count << endl;

         *ost << endl;
         *ost << "\\" << n.order() << "-grams:" << endl;

         // Start the second pass from an empty n-gram again.
         for(i=0;i<n.order();i++)
             ngram[i] = "";
         n.iterate(ngram,&save_ngram_arpa_sub,ost);

     }

     *ost << "\\end\\" << endl;

     if (ost != &cout) delete ost;
     return write_ok;
 }

 /** Save a grammar in CSTR ASCII ("Ngram_2") format (awb's format).
  *
  *  Writes the "Ngram_2 <order>" header, the vocabulary on one line and
  *  the predictee vocabulary on the next.  A dense grammar then prints
  *  its frequencies with n.print_freqs(); a backoff grammar prints one
  *  "w1 ... wN : freq" line for every item of every distribution.  No
  *  frequency data is written for any other representation.
  *
  *  @param filename  output file, or "-" for standard output
  *  @param n         grammar to save
  *  @param trace     unused
  *  @param floor     passed to print_freqs for dense grammars
  *  @return write_ok on success, write_fail when the output cannot be
  *          opened.
  */
 EST_write_status
 save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
               const bool trace, double floor)
 {
     // awb's format
     (void)trace;
     ostream *ost;
     int i;
     EST_Litem *k;

     if (filename == "-")
         ost = &cout;
     else
         ost = new ofstream(filename);

     if(!(*ost))
     {
         // A file stream that failed to open is still ours to free.
         if(ost != &cout)
             delete ost;
         return write_fail;
     }

     *ost << "Ngram_2 " << n.order() << endl;
     for (i=0; i < n.vocab->length(); i++)
         *ost << n.vocab->name(i) << " ";
     *ost << endl;
     for (i=0; i < n.pred_vocab->length(); i++)
         *ost << n.pred_vocab->name(i) << " ";
     *ost << endl;

     if (n.representation() == EST_Ngrammar::dense)
         n.print_freqs(*ost,floor);
     else if (n.representation() == EST_Ngrammar::backoff)
     {
         // number of ngrams, excluding last word
         int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));

         for(i=0;i<total_ngrams;i++)
         {
             EST_DiscreteProbDistribution this_pdf;
             const EST_StrVector this_ngram = n.make_ngram_from_index(i);
             this_pdf = n.prob_dist(this_ngram);

             for (k=this_pdf.item_start();
                  !this_pdf.item_end(k);
                  k = this_pdf.item_next(k))
             {
                 double freq;
                 EST_String name;
                 this_pdf.item_freq(k,name,freq);

                 for (int jj=0; jj < this_ngram.n(); jj++)
                     *ost << this_ngram(jj) << " ";
                 *ost << name << " : " << freq << endl;
             }
         }
     }

     if(ost != &cout)
         delete ost;

     return write_ok;
 }

 /** Save the grammar's vocabulary and state list as an ASCII WFST file.
  *
  *  The header lists n.vocab as both the "in" and the "out" alphabet and
  *  gives the number of states; the body then holds one
  *  "((i nonfinal i)" / ")" pair per state, with no transitions.
  *
  *  @param filename  output file
  *  @param n         grammar to save
  *  @return write_ok on success, write_fail when the file cannot be
  *          opened for writing.
  */
 EST_write_status
 save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
 {
     // Save as a WFST
     FILE *fp = fopen(filename,"wb");
     if (fp == NULL)
     {
         cerr << "Ngrammar save: unable to open \"" << filename <<
             "\" for writing" << endl;
         return write_fail;
     }

     int idx;

     // Header
     fputs("EST_File fst\n", fp);
     fputs("DataType ascii\n", fp);

     // Input alphabet
     fputs("in \"(", fp);
     idx = 0;
     while (idx < n.vocab->length())
     {
         fprintf(fp," %s\n",(const char *)n.vocab->name(idx));
         idx++;
     }
     fputs(" )\"\n", fp);

     // Output alphabet: the same vocabulary again
     fputs("out \"(", fp);
     idx = 0;
     while (idx < n.vocab->length())
     {
         fprintf(fp," %s\n",(const char *)n.vocab->name(idx));
         idx++;
     }
     fputs(" )\"\n", fp);

     fprintf(fp,"NumStates %d\n",n.num_states());
     fputs("EST_Header_End\n", fp);

     // One entry per state
     idx = 0;
     while (idx < n.num_states())
     {
         fprintf(fp,"((%d nonfinal %d)\n",idx,idx);
         fputs(")\n", fp);
         idx++;
     }

     fclose(fp);

     return write_ok;
 }

 // Append one frequency to the run-length encoded stream: a repeat of the
 // previous value only extends the (negative) run counter, anything else
 // flushes a pending counter and then writes the new value.
 static void
 save_ngram_cstr_bin_rle(FILE *ofd, double freq, double &lfreq, double &count)
 {
     if (freq == lfreq)
         count--;
     else
     {
         if (count < -1)
             fwrite(&count,sizeof(double),1,ofd);
         fwrite(&freq,sizeof(double),1,ofd);
         count = -1;
     }
     lfreq = freq;
 }

 /** Save a grammar in CSTR binary ("mBin_2") format.
  *
  *  Writes the int magic number, the text header "mBin_2 <order>", the
  *  vocabulary and predictee vocabulary lines, then the frequencies of
  *  every distribution as raw doubles with simple run-length encoding
  *  (a negative double after a value is the length of its run).
  *
  *  @param filename  output file, or "-" for standard output
  *  @param n         grammar to save; sparse grammars are rejected
  *  @param trace     when true, progress is reported on cerr
  *  @param floor     replaces zero frequencies of dense grammars
  *  @return write_ok on success, misc_write_error for a sparse grammar
  *          or when the output cannot be opened.
  */
 EST_write_status
 save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
             const bool trace, double floor)
 {

     if (n.representation() == EST_Ngrammar::sparse)
         return misc_write_error;

     int i;
     EST_Litem *k;
     FILE *ofd;
     double lfreq = -1;
     double count = -1;
     int magic = EST_NGRAMBIN_MAGIC;
     const bool to_stdout = (filename == "-");

     if (to_stdout)
         ofd = stdout;
     else
     {
         if ((ofd=fopen(filename,"wb")) == NULL)
             return misc_write_error;
     }

     fwrite(&magic,sizeof(int),1,ofd);
     fprintf(ofd,"mBin_2 %d\n",n.order());
     for (i=0; i < n.vocab->length(); i++)
         fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
     fprintf(ofd,"\n");
     for (i=0; i < n.pred_vocab->length(); i++)
         fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
     fprintf(ofd,"\n");

     // We use a simple form of run-length encoding, if consecutive
     // values are equal only a length is printed.  lengths are
     // negative as frequencies (even smoothed ones) can never be -ve

     if ( trace )
         cerr << "Saving ..." << endl;

     if (n.representation() == EST_Ngrammar::dense)
     {
         for(i=0;i<n.num_states();i++)
         {

             if ( trace )
                 cerr << "\r" << i*100/n.num_states() << "%";

             for (k=n.p_states[i].pdf().item_start();
                  !n.p_states[i].pdf().item_end(k);
                  k = n.p_states[i].pdf().item_next(k))
             {
                 double freq;
                 EST_String name;
                 n.p_states[i].pdf().item_freq(k,name,freq);
                 if (freq == 0.0)
                     freq = floor;
                 save_ngram_cstr_bin_rle(ofd,freq,lfreq,count);
             }
         }
     }
     else if (n.representation() == EST_Ngrammar::backoff)
     {
         // need to construct pdfs in right order
         // noting that dense states are indexed s.t. the last
         // word in the ngram is the least significant 'bit'

         // number of ngrams, excluding last word, is
         int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));

         for(i=0;i<total_ngrams;i++)
         {

             if ( trace )
                 cerr << "\r" << i*100/total_ngrams << "%";

             EST_DiscreteProbDistribution this_pdf;
             const EST_StrVector this_ngram = n.make_ngram_from_index(i);
             this_pdf = n.prob_dist(this_ngram);

             for (k=this_pdf.item_start();
                  !this_pdf.item_end(k);
                  k = this_pdf.item_next(k))
             {

                 double freq;
                 EST_String name;
                 this_pdf.item_freq(k,name,freq);
                 save_ngram_cstr_bin_rle(ofd,freq,lfreq,count);
             }


         }

     }

     // Flush the counter of a run that is still open at the end of the
     // data.  This has to happen for both representations, otherwise a
     // file ending in repeated values is missing its final run length.
     if (count < -1)
         fwrite(&count,sizeof(double),1,ofd);

     if ( trace )
         cerr << "\r      \r" << endl;

     // Standard output belongs to the caller: flush it, do not close it.
     if (to_stdout)
         fflush(ofd);
     else
         fclose(ofd);

     return write_ok;
 }
EST_TokenStream::set_WhiteSpaceChars
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:341

EST_NGRAMBIN_MAGIC
#define EST_NGRAMBIN_MAGIC
Definition: EST_Ngrammar.h:63

EST_TokenStream::get
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499

EST_Token::filepos
EST_FilePos filepos(void) const
file position in original EST_TokenStream.
Definition: EST_Token.h:189

EST_Ngrammar::p_sentence_end_marker
EST_String p_sentence_end_marker
Definition: EST_Ngrammar.h:239

EST_DiscreteProbDistribution::item_next
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:388

EST_String::contains
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
Definition: EST_String.h:365

EST_TokenStream
Definition: EST_Token.h:239

count_ngram_arpa_sub
void count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
Definition: ngrammar_io.cc:640

EST_Ngrammar::dense
Definition: EST_Ngrammar.h:221

EST_DiscreteProbDistribution
Definition: EST_simplestats.h:210

load_ngram_cstr_ascii
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:227

EST_ftell
EST_FilePos EST_ftell(FILE *fp)
Definition: EST_File.h:71

EST_write_status
EST_write_status
Definition: EST_rw_status.h:124

SWAPINT
#define SWAPINT(x)
Definition: EST_cutils.h:75

EST_Ngrammar::closed_vocab
int closed_vocab() const
Definition: EST_Ngrammar.h:423

EST_UItem
Definition: EST_UList.h:49

EST_Ngrammar::accumulate
void accumulate(const EST_StrVector &words, const double count=1)
Definition: EST_Ngrammar.cc:885

EST_Ngrammar::p_num_samples
int p_num_samples
Definition: EST_Ngrammar.h:233

EST_TokenStream::set_SingleCharSymbols
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344

EST_TokenStream::close
void close(void)
Close stream.
Definition: EST_Token.cc:419

EST_Discrete::name
const EST_String & name(const int n) const
The name given the index.
Definition: EST_simplestats.h:94

EST_Ngrammar::get_backoff_weight
double get_backoff_weight(const EST_StrVector &words) const
Definition: EST_Ngrammar.cc:998

std

EST_Discrete::length
int length(void) const
The number of members in the discrete.
Definition: EST_simplestats.h:84

itoString
EST_String itoString(int n)
Make a EST_String object from an integer.
Definition: util_io.cc:141

swap_bytes_double
void swap_bytes_double(double *data, int length)
Definition: EST_swapping.cc:68

load_ngram_arpa
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
Definition: ngrammar_io.cc:74

EST_Ngrammar::vocab
EST_Discrete * vocab
Definition: EST_Ngrammar.h:283

save_ngram_htk_ascii_sub
EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:424

EST_DiscreteProbDistribution::item_start
EST_Litem * item_start() const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:372

EST_Ngrammar::pred_vocab
EST_Discrete * pred_vocab
Definition: EST_Ngrammar.h:284

EST_DiscreteProbDistribution::item_end
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:380

EST_Ngrammar::p_states
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276

EST_Ngrammar::num_states
int num_states(void) const
Definition: EST_Ngrammar.h:413

safe_log10
double safe_log10(const double x)
Definition: EST_math.h:178

load_ngram_htk_ascii
EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:58

EST_unix.h

EST_Ngrammar::sparse
Definition: EST_Ngrammar.h:221

EST_Ngrammar
Definition: EST_Ngrammar.h:216

EST_Ngrammar::vocab_pdf
EST_DiscreteProbDistribution vocab_pdf
Definition: EST_Ngrammar.h:292

EST_TokenStream::open
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213

EST_NgrammarState::pdf
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115

EST_Ngrammar::probability
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
Definition: EST_Ngrammar.cc:1847

EST_cutils.h

EST_Ngrammar::prob_dist
const EST_DiscreteProbDistribution & prob_dist(const EST_StrVector &words) const
Definition: EST_Ngrammar.cc:2064

EST_TokenStream::eof
int eof()
end of file
Definition: EST_Token.h:362

EST_DiscreteProbDistribution::cumulate
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
Definition: EST_DProbDist.cc:159

EST_DiscreteProbDistribution::frequency
double frequency(const EST_String &s) const
Definition: EST_DProbDist.cc:251

SEEK_END
#define SEEK_END
Definition: system.h:28

EST_DiscreteProbDistribution::item_freq
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
Definition: EST_DProbDist.cc:404

misc_write_error
#define misc_write_error
Definition: EST_rw_status.h:162

save_ngram_cstr_bin
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:844

save_ngram_htk_ascii
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor)
Definition: ngrammar_io.cc:565

write_ok
The file was written successfully.
Definition: EST_rw_status.h:126

EST_TSimpleVector< int >

wrong_format
#define wrong_format
Definition: EST_rw_status.h:160

EST_Ngrammar::representation
representation_t representation() const
Definition: EST_Ngrammar.h:425

load_ngram_cstr_bin
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:289

FALSE
#define FALSE
Definition: EST_bool.h:119

EST_Ngrammar::print_freqs
void print_freqs(ostream &os, double floor=0.0)
Definition: EST_Ngrammar.cc:2319

misc_read_error
#define misc_read_error
Definition: EST_rw_status.h:161

NULL
NULL
Definition: EST_WFST.cc:55

save_ngram_arpa
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:670

EST_fseek
int EST_fseek(FILE *fp, EST_FilePos offset, int whence)
Definition: EST_File.h:75

EST_TokenStream::peek
EST_Token & peek(void)
peek at next token
Definition: EST_Token.h:332

load_ngram_htk_binary
EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:66

EST_Ngrammar::get_vocab_length
int get_vocab_length() const
Definition: EST_Ngrammar.h:416

save_ngram_cstr_ascii
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
Definition: ngrammar_io.cc:747

int
getString int
Definition: EST_item_aux.cc:50

write_fail
The file was not written successfully.
Definition: EST_rw_status.h:128

EST_DiscreteProbDistribution::set_frequency
void set_frequency(const EST_String &s, double c)
Definition: EST_DProbDist.cc:270

EST_TList::append
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196

EST_Ngrammar::make_ngram_from_index
const EST_StrVector & make_ngram_from_index(const int i) const
Definition: EST_Ngrammar.cc:622

EST_String.h

EST_Ngrammar::order
int order() const
Definition: EST_Ngrammar.h:415

EST_read_status
EST_read_status
Definition: EST_rw_status.h:111

EST_TokenStream::filepos
EST_FilePos filepos(void) const
current file position in EST_TokenStream
Definition: EST_Token.h:367

EST_Token::string
const EST_String & string() const
Definition: EST_Token.h:120

EST_TVector< EST_String >

format_ok
#define format_ok
Definition: EST_rw_status.h:159

EST_Ngrammar::backoff
Definition: EST_Ngrammar.h:221

OOV_MARKER
#define OOV_MARKER
Definition: EST_Ngrammar.h:61

EST_TokenStream::get_upto_eoln
EST_Token get_upto_eoln(void)
Get everything up to the end of the line as a single token.
Definition: EST_Token.cc:529

EST_File.h

save_ngram_arpa_sub
void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
Definition: ngrammar_io.cc:647

EST_String::after
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:308

EST_String::before
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:276

EST_Ngrammar::iterate
void iterate(EST_StrVector &words, void(*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params)
Definition: EST_Ngrammar.cc:2227

EST_Ngrammar::ngram_exists
bool ngram_exists(const EST_StrVector &words) const
Definition: EST_Ngrammar.cc:957

TRUE
#define TRUE
Definition: EST_bool.h:118

EST_TVector::n
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251

EST_Ngrammar::init
bool init(int o, representation_t r, const EST_StrList &wordlist)
Definition: EST_Ngrammar.cc:509

EST_Ngrammar::p_sentence_start_marker
EST_String p_sentence_start_marker
Definition: EST_Ngrammar.h:238

EST_Token.h

vocab
EST_StrVector vocab
Definition: dp_main.cc:85

EST_Ngrammar.h

save_ngram_wfst
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
Definition: ngrammar_io.cc:806

EST_TokenStream::eoln
int eoln()
end of line
Definition: EST_Token.cc:832

EST_String
Definition: EST_String.h:76

EST_TList
Definition: EST_TList.h:61

SEEK_SET
#define SEEK_SET
Definition: system.h:20