82 int this_num,this_order;
84 if (ts.
open(filename) == -1)
88 while ((!ts.
eof()) && !ts.
get().string().contains(
"\\data\\"));
113 this_order=atoi(s.
before(
"="));
114 this_num=atoi(s.
after(
"="));
119 nums[this_order] = this_num;
121 if(this_order > order)
140 for(i=1;i<=order;i++)
157 cerr <<
"Unexpected end of grammar file whilst looking for '" 158 << tmp <<
"'" << endl;
166 for(j=0;j<nums(i);j++)
169 for (k=0; ((k<i) && !ts.
eof()); k++)
170 window[k] = ts.
get().string();
174 cerr <<
"Unexpected end of file whilst reading " << i
175 <<
"-grams !" << endl;
184 cerr <<
"ooooooooops" << endl;
212 if (ts.
get().string() ==
"\\end\\")
219 cerr <<
"Missing \\end\\ !" << endl;
233 if (ts.
open(filename) == -1)
243 order = atoi(ts.
get().string());
256 cerr <<
"Something may be wrong with the vocab lists in '" 257 << filename <<
"'" << endl;
265 for (i=0; i < order; i++)
266 window[i] = ts.
get().string();
267 if (ts.
get().string() !=
":")
269 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos " 273 occur = atof(ts.
get().string());
277 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos " 295 double approx_num_samples = 0.0;
296 long freq_data_start, freq_data_end;
301 if ((ifd=fopen(filename,
"rb")) ==
NULL)
303 if (fread(&magic,
sizeof(
int),1,ifd) != 1)
305 cerr <<
"Could not read integer from " << filename << endl;
329 order = atoi(ts.
get().string());
330 if (ts.
get() !=
"\n")
339 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
342 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
361 num_entries = (freq_data_end-freq_data_start)/
sizeof(
double);
362 double *dd =
new double[num_entries];
367 if (fread(dd,
sizeof(
double),num_entries,ifd) != (
unsigned)num_entries)
369 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
380 if (j >= num_entries)
382 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
394 approx_num_samples += dd[j];
399 if (j+1 >= num_entries)
401 else if (dd[j+1] < -1)
403 else if (dd[j+1] == -1)
431 this_ngram[0] = word;
448 if(floor_prob_total > 1)
450 cerr <<
"ERROR : floor is impossibly large, scaling it !" << endl;
452 floor_prob_total = 1;
469 *ost << word <<
" 0 ";
496 *ost <<
"*" << lcount <<
" ";
505 double base_prob = freq / total_freq;
508 *ost << floor + ( base_prob * (1-floor_prob_total) );
527 *ost << 0 <<
" ERROR !!!!!!!! ";
538 *ost <<
"*" << lcount <<
" " << endl;
544 *ost <<
"*" << lcount <<
" ";
550 double base_prob = freq / total_freq;
553 *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
557 *ost << floor << endl;
574 cerr <<
"Can only save bigrams in htk_ascii format" << endl;
580 cerr <<
"Negative floor probability does not make sense !" << endl;
587 ost =
new ofstream(filename);
595 cerr <<
"ERROR : floor is impossibly large, scaling it to ";
596 cerr << floor << endl;
603 cerr <<
"Can't save in HTK format as no sentence start/end tags" 604 <<
" were given !" << endl;
643 *((
double*)count) += 1;
655 for(i=0;i<ngram.
n();i++)
656 *((ostream*)(ost)) << ngram(i) <<
" ";
659 (n->
order() > ngram.
n()) )
664 *((ostream*)(ost)) << endl;
680 ost =
new ofstream(filename);
691 *ost <<
"\\data\\" << endl;
693 double *count =
new double;
697 for(o=1;o<=n.
order();o++)
707 *ost <<
"ngram " << o <<
"=" << *count << endl;
710 for(o=1;o<=n.
order();o++)
713 *ost <<
"\\" << o <<
"-grams:" << endl;
724 for(i=0;i<n.
order();i++)
728 *ost <<
"ngram " << n.
order() <<
"=" << *count << endl;
731 *ost <<
"\\" << n.
order() <<
"-grams:" << endl;
733 for(i=0;i<n.
order();i++)
739 *ost <<
"\\end\\" << endl;
741 if (ost != &cout)
delete ost;
748 const bool trace,
double floor)
759 ost =
new ofstream(filename);
764 *ost <<
"Ngram_2 " << n.
order() << endl;
778 for(i=0;i<total_ngrams;i++)
792 for (
int jj=0; jj < this_ngram.
n(); jj++)
793 *ost << this_ngram(jj) <<
" ";
794 *ost << name <<
" : " << freq << endl;
812 if ((ost = fopen(filename,
"wb")) ==
NULL)
814 cerr <<
"Ngrammar save: unable to open \"" << filename <<
815 "\" for writing" << endl;
819 fprintf(ost,
"EST_File fst\n");
820 fprintf(ost,
"DataType ascii\n");
821 fprintf(ost,
"in \"(");
823 fprintf(ost,
" %s\n",(
const char *)n.
vocab->
name(i));
824 fprintf(ost,
" )\"\n");
825 fprintf(ost,
"out \"(");
827 fprintf(ost,
" %s\n",(
const char *)n.
vocab->
name(i));
828 fprintf(ost,
" )\"\n");
830 fprintf(ost,
"EST_Header_End\n");
834 fprintf(ost,
"((%d nonfinal %d)\n",i,i);
845 const bool trace,
double floor)
860 if ((ofd=stdout) ==
NULL)
865 if ((ofd=fopen(filename,
"wb")) ==
NULL)
869 fwrite(&magic,
sizeof(
int),1,ofd);
870 fprintf(ofd,
"mBin_2 %d\n",n.
order());
872 fprintf(ofd,
"%s ",(
const char *)n.
vocab->
name(i));
883 cerr <<
"Saving ..." << endl;
907 fwrite(&count,
sizeof(
double),1,ofd);
908 fwrite(&freq,
sizeof(
double),1,ofd);
915 fwrite(&count,
sizeof(
double),1,ofd);
926 for(i=0;i<total_ngrams;i++)
930 cerr <<
"\r" << i*100/total_ngrams <<
"%";
949 fwrite(&count,
sizeof(
double),1,ofd);
950 fwrite(&freq,
sizeof(
double),1,ofd);
961 cerr <<
"\r \r" << endl;
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
#define EST_NGRAMBIN_MAGIC
EST_TokenStream & get(EST_Token &t)
get next token in stream
EST_FilePos filepos(void) const
file position in original EST_TokenStream.
EST_String p_sentence_end_marker
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
void count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
EST_FilePos EST_ftell(FILE *fp)
void accumulate(const EST_StrVector &words, const double count=1)
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
void close(void)
Close stream.
const EST_String & name(const int n) const
The name given the index.
double get_backoff_weight(const EST_StrVector &words) const
int length(void) const
The number of members in the discrete.
EST_String itoString(int n)
Make an EST_String object from an integer.
void swap_bytes_double(double *data, int length)
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
EST_Discrete * pred_vocab
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_NgrammarState * p_states
int num_states(void) const
double safe_log10(const double x)
EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
EST_DiscreteProbDistribution vocab_pdf
int open(const EST_String &filename)
Open an EST_TokenStream for a file.
EST_DiscreteProbDistribution & pdf()
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
const EST_DiscreteProbDistribution & prob_dist(const EST_StrVector &words) const
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
double frequency(const EST_String &s) const
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor)
The file was written successfully.
representation_t representation() const
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
void print_freqs(ostream &os, double floor=0.0)
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
int EST_fseek(FILE *fp, EST_FilePos offset, int whence)
EST_Token & peek(void)
peek at next token
EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
int get_vocab_length() const
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
The file was not written successfully.
void set_frequency(const EST_String &s, double c)
void append(const T &item)
add item onto end of list
const EST_StrVector & make_ngram_from_index(const int i) const
EST_FilePos filepos(void) const
current file position in EST_TokenStream
const EST_String & string() const
EST_Token get_upto_eoln(void)
get everything up to the end of the line as a single token.
void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
EST_String after(int pos, int len=1) const
Part after pos+len.
EST_String before(int pos, int len=0) const
Part before position.
void iterate(EST_StrVector &words, void(*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params)
bool ngram_exists(const EST_StrVector &words) const
INLINE ssize_t n() const
number of items in vector.
bool init(int o, representation_t r, const EST_StrList &wordlist)
EST_String p_sentence_start_marker
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)