105 s << "(" << a.id() << ": " << a.pdf_const() << " )";
162 p_pdf.cumulate(words(words.n()-1-p_level),count);
165 if (words.n()-1-p_level > 0)
171 s = get_child(words(words.n()-1-p_level));
174 s = add_child(p_pdf.get_discrete(),words);
202 p_pdf.cumulate(words(words.n()-1-p_level),count);
208 if (words.n()-1-p_level > 0)
214 s = get_child(words(words.n()-1-p_level));
217 add_child(p_pdf.get_discrete(),words);
220 s = get_child(words(words.n()-1-p_level));
223 cerr << "Failed to extend tree - unknown reason !" << endl;
226 return s->accumulate(words,count);
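The accumulate fragments above (lines 162-226) all index on words(words.n()-1-p_level): a state at depth p_level reads the word p_level positions from the end of the n-gram, counts it, and recurses into the child keyed by that same word. A minimal sketch of that scheme, with std::map standing in for EST's state and distribution classes (all names here are illustrative, not EST's):

    #include <map>
    #include <string>
    #include <vector>

    // Sketch only: a backoff-tree node at depth `level` counts the word
    // `level` positions back from the end of the n-gram, then recurses
    // into the child keyed by that same word, so (w1 .. wn) is stored
    // along the path wn -> w(n-1) -> ...
    struct Node {
        std::map<std::string, double> freq;   // plays the role of p_pdf
        std::map<std::string, Node> children; // get_child / add_child
        int level = 0;

        void accumulate(const std::vector<std::string> &words, double count) {
            const std::string &w = words[words.size() - 1 - level];
            freq[w] += count;                       // p_pdf.cumulate(...)
            if (words.size() - 1 - level > 0) {     // more context to descend
                Node &child = children[w];
                child.level = level + 1;
                child.accumulate(words, count);
            }
        }
    };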
241 if (words.n()-1-p_level > 0)
244 s = get_child(words(words.n()-1-p_level));
251 new_child->init(d,p_level+1);
252 children.add(words(words.n()-1-p_level), (void*)new_child);
271 if (words.n()-1-p_level > 0)
274 s = get_child(words(words.n()-1-p_level));
281 new_child->init(d,p_level+1);
282 children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
299 children.add(name,NULL);
313 for (k=p_pdf.item_start();
315 k = p_pdf.item_next(k))
317 p_pdf.item_freq(k,name,freq);
319 if (p_level==order-1)
322 os << name << " " << followers
323 << ": " << freq << endl;
332 const double threshold) const
335 s = get_state(words);
340 return (bool)((s->level()==0) ||
351 if (words.n()-1-p_level > 0)
353 s = get_child(words(words.n()-1-p_level));
380 for (k=p_pdf.item_start();
382 k = p_pdf.item_next(k))
384 p_pdf.item_freq(k,name,freq);
387 remove_child(child,name);
399 if (words.n()-1-p_level >= 0)
401 s = get_child(words(words.n()-1-p_level));
422 return backoff_weight;
430 if (words.n()-1-p_level >= 0)
432 s = get_child(words(words.n()-1-p_level));
442 cerr << "Couldn't set weight for " << words
443 << " to " << w << endl;
464 for (k=p_pdf.item_start();
466 k = p_pdf.item_next(k))
468 p_pdf.item_freq(k,name,freq);
470 ff[(int)(freq+0.5)] += 1;
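The frequency-of-frequencies fragments (464-470) build the table ff[r] = number of n-gram types seen exactly r times, which is what Good_Turing_discount (declared in the cross-reference list at the end of this section) consumes. A hedged sketch of the discounting rule itself, with an illustrative function name:

    #include <vector>

    // Sketch, not EST's code: with ff[r] = number of types seen r times,
    // the Good-Turing re-estimated count is r* = (r + 1) * ff[r+1] / ff[r],
    // and the discount applied to a raw count r is d = r - r*.
    double good_turing_count(const std::vector<double> &ff, int r)
    {
        if (r + 1 >= (int)ff.size() || ff[r] <= 0)
            return r;        // no evidence: leave the count undiscounted
        return (r + 1) * ff[r + 1] / ff[r];
    }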
476 s << "(backoff level:" << a.p_level
488 sparse_representation.clear();
495 backoff_threshold = 1.0;
496 backoff_unigram_floor_freq = 0.0;
512 return (bool)(init_vocab(wordlist) && p_init(o,r));
519 return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
527 vocab_pdf.init(pred_vocab);
536 vocab_pdf.init(pred_vocab);
544 cerr << "EST_Ngrammar order must be > 0" << endl;
549 p_representation = r;
550 p_number_of_sentences = 0;
552 switch(p_representation)
556 sparse_representation.init(p_order);
561 return init_dense_representation();
565 return init_backoff_representation();
569 cerr << "Unknown internal representation requested for EST_Ngrammar"
583 cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
590 for (i=0; i < p_num_states; i++)
591 p_states[i].init(i,pred_vocab);
601 cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
609 return (bool)(p_states != NULL);
631 for(i=p_order-2;i>=0;i--)
633 #if defined(sun) && ! defined(__svr4__)
637 (*ngram)[i] = wordlist_index(rem);
641 (*ngram)[i] = wordlist_index(d.rem);
655 if(!vocab->init(word_list))
659 vocab_pdf.init(pred_vocab);
668 if(!vocab->init(word_list))
671 if(!pred_vocab->init(pred_list))
673 vocab_pdf.init(pred_vocab);
682 if(!comp_vocab->init(word_list))
688 if(*vocab != *comp_vocab)
700 return vocab->name(i);
710 i = pred_vocab->index(word);
714 cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;
722 cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
731 return pred_vocab->name(i);
741 i = vocab->index(word);
746 cerr << "Word \"" << word << "\" is not in the word list" << endl;
754 cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
770 p_sentence_start_marker = prev;
771 p_sentence_end_marker = last;
776 (oov_mode != "skip_file") &&
777 (oov_mode != "skip_sentence"))
778 cerr << "Warning : building a backoff grammar" << endl
779 << " with oov_mode '" << oov_mode
780 << "' is not recommended !" << endl;
782 if( (oov_mode != "skip_ngram") &&
783 (oov_mode != "skip_sentence") &&
784 (oov_mode != "skip_file") &&
785 (oov_mode != "use_oov_marker") )
787 cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
791 if( (oov_mode == "skip_sentence") &&
792 (input_format == "ngram_per_line"))
794 cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
795 << " select oov_mode 'skip_sentence'" << endl;
799 if(oov_mode == "use_oov_marker")
807 for (p = filenames.head(); p; p = p->next())
809 cerr << "Building from " << filenames(p) << endl;
812 if( ((oov_mode == "skip_sentence") &&
813 (input_format == "sentence_per_file")) ||
814 (oov_mode == "skip_file") )
815 skip_this = oov_preprocess(filenames(p),new_filename,
818 else if( ((oov_mode == "skip_sentence") &&
819 (input_format == "sentence_per_line")) ||
820 ((oov_mode == "skip_ngram") &&
821 (input_format == "ngram_per_line")) )
822 oov_preprocess(filenames(p),new_filename,"eliminate lines");
825 new_filename = filenames(p);
830 cerr << "Skipping " << filenames(p)
831 << " (out of vocabulary words found)" << endl;
836 switch(p_representation)
840 if (input_format != "")
842 cerr << "Can't build sparse ngram from '" << input_format;
843 cerr << "' data" << endl;
846 else if (!build_sparse(new_filename,prev,prev_prev,last))
852 if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
857 if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
862 cerr << "Unknown internal representation set for EST_Ngrammar"
869 if((new_filename != filenames(p)) &&
870 (new_filename != "") &&
873 cerr << "Warning : couldn't remove temporary file : "
874 << new_filename << endl;
880 return compute_backoff_weights(mincount,maxcount);
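For orientation, a hedged usage sketch of the build path these fragments belong to; the init() and build() signatures come from the declaration list at the end of this section, while the header name, the enum spelling EST_Ngrammar::backoff, and the marker strings are assumptions:

    #include "EST_Ngrammar.h"   // assumed header name

    int example_build()
    {
        EST_StrList wordlist, files;
        // ... fill wordlist with the vocabulary, files with corpus files ...
        EST_Ngrammar ng;
        if (!ng.init(3, EST_Ngrammar::backoff, wordlist))  // enum spelling assumed
            return -1;
        // oov_mode "skip_sentence" drops any sentence containing an
        // out-of-vocabulary word, as in the fragments above
        if (!ng.build(files, "<s>", "</s>", "</s>",
                      "sentence_per_line", "skip_sentence", 1, 10))
            return -1;
        return 0;
    }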
888 if (words.n() < p_order)
889 cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
894 vocab_pdf.cumulate(w,count);
896 switch(p_representation)
900 find_state(words).cumulate(w,count);
904 backoff_representation->accumulate(words,count);
908 cerr << "EST_Ngrammar::accumulate : invalid representation !"
929 if (words.n() < p_order)
930 cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
934 vocab_pdf.cumulate(words(p_order-1),count);
936 switch(p_representation)
941 find_state(words).cumulate(words(p_order-1),count);
945 backoff_representation->accumulate(words,count);
949 cerr << "EST_Ngrammar::accumulate : invalid representation !"
959 switch(p_representation)
972 return backoff_representation->ngram_exists(words,0);
974 return backoff_representation->ngram_exists(words,backoff_threshold);
979 cerr << "ngram_exists: unknown ngrammar representation" << endl;
989 cerr << "Not a backoff grammar !" << endl;
993 return backoff_representation->ngram_exists(words,threshold);
1001 return backoff_representation->get_backoff_weight(words);
1004 cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
1012 return backoff_representation->set_backoff_weight(words,w);
1015 cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
1026 sparse_representation.build(filename,prev,prev_prev,last);
1037 for (i=0; i<window.n()-1; i++)
1038 window[i] = wordlist_index(prev_prev);
1039 window[i++] = wordlist_index(prev);
1048 for (i=0; i<window.n()-1; i++)
1049 window[i] = prev_prev;
1060 int bad_line_count=0;
1061 int good_line_count=0;
1068 bool write_out = false;
1069 if( (what == "eliminate lines") || (filename == "-") )
1073 if (filename == "-")
1077 cerr << "EST_Ngrammar:: failed to open stdin";
1078 cerr << " for reading" << endl;
1082 else if (ts.open(filename) == -1){
1083 cerr << "EST_Ngrammar: failed to open file \"" << filename
1084 << "\" for reading" << endl;
1092 ost = new ofstream(new_filename);
1096 cerr << "Ngrammar: couldn't create temporary file \""
1097 << new_filename << "\"" << endl;
1103 new_filename = filename;
1106 bool bad_line=false;
1109 s=ts.get().string();
1111 if (!bad_line && (s != ""))
1113 if(wordlist_index(s,false) < 0)
1116 if(what == "eliminate lines")
1127 cerr << "Warning : couldn't delete temporary file '"
1128 << new_filename << "'" << endl;
1136 this_line += s + " ";
1151 *ost << this_line << endl;
1160 cerr << "skipped " << bad_line_count << " and kept "
1161 << good_line_count << " lines from file " << filename << endl;
1175 int eoln_is_eos = FALSE;
1176 int sliding_window = TRUE;
1180 if ( (input_format == "") || (input_format == "sentence_per_line") )
1184 sliding_window = TRUE;
1186 else if (input_format == "sentence_per_file")
1188 eoln_is_eos = FALSE;
1189 sliding_window = TRUE;
1190 p_number_of_sentences = 1;
1192 else if(input_format == "ngram_per_line")
1194 eoln_is_eos = FALSE;
1195 sliding_window = FALSE;
1196 p_number_of_sentences = 1;
1200 cerr << "Can't build from '" << input_format << "' data" << endl;
1206 if (filename == "-")
1210 cerr << "EST_Ngrammar:: failed to open stdin";
1211 cerr << " for reading" << endl;
1215 else if (ts.open(filename) == -1){
1216 cerr << "EST_Ngrammar: failed to open \"" << filename
1217 << "\" for reading" << endl;
1233 fill_window_start(window,prev,prev_prev);
1234 if (window(p_order-1) == -1)
1236 else if( (p_order>1) && (window(p_order-2) == -1))
1237 bad_word = p_order-1;
1242 cerr << "at start : bad word at " << bad_word << endl;
1247 s=ts.get().string();
1257 window[p_order-1] = wordlist_index(s);
1258 if (window(p_order-1) < 0)
1260 cerr << "EST_Ngrammar::build_ngram " <<
1261 " word \"" << s << "\" is not in vocabulary, skipping"
1269 cerr << "not accumulating : bad word at " << bad_word;
1270 cerr << " window=" << window;
1279 if(count == p_order-1)
1280 window[count++] = predlist_index(s);
1282 window[count++] = wordlist_index(s);
1284 if (window(count-1) < 0)
1286 cerr << "EST_Ngrammar::build_ngram " <<
1287 " word \"" << s << "\" is not in vocabulary, skipping"
1293 cerr << "Too many items on line - ignoring trailing ones !" << endl;
1303 if((count == p_order) && bad_word == 0)
1308 else if (eoln_is_eos)
1311 if (window(p_order-1) != wordlist_index(last))
1312 p_number_of_sentences += 1;
1315 window[p_order-1] = wordlist_index(last);
1317 if(window(p_order-1) == -1)
1327 fill_window_start(window,prev,prev_prev);
1330 if (window(p_order-1) == -1)
1332 else if( (p_order>1) && (window(p_order-2) == -1) )
1333 bad_word = p_order-1;
1342 if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
1347 window[p_order-1] = wordlist_index(last);
1349 if (window(p_order-1) == -1)
1355 p_number_of_sentences += 1;
1361 cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
1369 double sum1=0,sum2=0;
1373 new_ngram.resize(ngram.n()-1);
1374 for(i=0;i<new_ngram.n();i++)
1375 new_ngram[i] = ngram(i);
1378 cerr << "computing backoff w for ";
1379 for(i=0;i<new_ngram.n();i++)
1380 cerr << new_ngram(i) << " ";
1395 cerr << "WARNING : couldn't set weight !" << endl;
1410 for(i=0;i<ngram.n();i++)
1411 cerr << ngram(i) << " ";
1415 cerr << n->probability(ngram) << " exists " << endl;
1426 tmp_ngram.resize(ngram.n()-1);
1427 for(i=0;i<tmp_ngram.n();i++)
1428 tmp_ngram[i] = ngram(i+1);
1430 cerr << " unseen, P(";
1431 for(i=0;i<tmp_ngram.n();i++)
1432 cerr << tmp_ngram(i) << " ";
1434 cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
1444 cerr << "WARNING : couldn't set weight !" << endl;
1451 cerr << "NEGATIVE WEIGHT for ";
1452 for(i=0;i<new_ngram.n();i++)
1453 cerr << new_ngram(i) << " ";
1456 cerr << "sum1=" << sum1 << " sum2=" << sum2;
1457 cerr << " " << (1 - sum1) / sum2 << endl;
1467 for(i=0;i<ngram.n();i++)
1468 cerr << ngram(i) << " ";
1469 cerr << " exists, prob = ";
1479 cerr << "WARNING : couldn't set weight !" << endl;
1481 cerr << "sum1=" << sum1 << " sum2=" << sum2;
1482 cerr << " weight=" << (1 - sum1) / sum2 << endl;
1486 ngram[ngram.n()-1] = tmp;
1495 backoff_threshold = mincount;
1505 backoff_restore_unigram_states();
1535 for (o=2;o<=order();o++)
1538 cerr << "Backing off order " << o << endl;
1548 words[o-1] = "!FILLED!";
1569 words[0] = "wibble";
1570 for(j=0;j<get_pred_vocab_length();j++)
1572 words[1] = get_pred_vocab_word(j);
1573 backoff_representation->accumulate(words,0);
1582 if (start_state == NULL)
1583 start_state = backoff_representation;
1624 prune_backoff_representation(child);
1639 s << "Dense" << endl;
1644 s << "Backoff" << endl;
1649 cerr << "Unknown internal representation of EST_Ngrammar : can't print"
1660 if (new_type == p_entry_type)
1665 cerr << "Couldn't do entry type conversion !" << endl;
1671 cerr << "EST_Ngrammar::sparse_to_dense() "
1672 << " not implemented" << endl;
1678 cerr << "EST_Ngrammar::dense_to_sparse()"
1679 << " not implemented" << endl;
1687 for(i=0;i<p_order-1;i++)
1700 for (f=1,i=0; i<p_order-2; i++)
1707 switch(p_representation)
1718 for(i=0;i<p_order-1;i++)
1720 tmp[i] = wordlist_index(words(i));
1721 if (tmp(i) == -1) break;
1723 tmp[i] = pred_vocab->index(words(i));
1724 if (tmp(i) == -1) break;
1725 return p_states[find_dense_state_index(tmp)];
1730 cerr << "find_state: not valid in backoff mode !" << endl;
1734 cerr << "find_state: unknown ngrammar representation" << endl;
1745 switch(p_representation)
1756 for(i=0;i<p_order-1;i++)
1758 tmp[i] = wordlist_index(words(i));
1759 if (tmp(i) == -1) break;
1761 tmp[i] = pred_vocab->index(words(i));
1762 if (tmp(i) == -1) break;
1763 return p_states[find_dense_state_index(tmp)];
1768 cerr << "find_state_const: not valid in backoff mode !" << endl;
1772 cerr << "find_state: unknown ngrammar representation" << endl;
1781 switch(p_representation)
1789 return p_states[find_dense_state_index(words)];
1793 cerr << "find_state: not valid in backoff mode !" << endl;
1797 cerr << "find_state: unknown ngrammar representation" << endl;
1807 switch(p_representation)
1814 return p_states[find_dense_state_index(words)];
1818 cerr << "find_state_const: not valid in backoff mode !" << endl;
1822 cerr << "find_state: unknown ngrammar representation" << endl;
1833 if (new_representation == p_representation)
1837 return sparse_to_dense();
1839 return dense_to_sparse();
1842 cerr << "set_representation: unknown ngrammar representation" << endl;
1852 switch(p_representation)
1856 return find_state_const(words).probability(lastword(words));
1860 return backoff_probability(words,trace);
1864 cerr << "probability: unknown ngrammar representation" << endl;
1875 switch(p_representation)
1879 return find_state_const(words).frequency(lastword(words));
1883 return backoff_probability(words,trace);
1887 cerr << "probability: unknown ngrammar representation" << endl;
1894 double *prob, int *state) const
1898 switch(p_representation)
1911 return backoff_most_probable(words,prob);
1915 cerr << "probability: unknown ngrammar representation" << endl;
1922 double *prob, int *state) const
1926 switch(p_representation)
1938 cerr << "probability: IVector access to backoff not supported" << endl;
1943 cerr << "probability: unknown ngrammar representation" << endl;
1951 switch(p_representation)
1960 cerr << "Ngrammar: representation doesn't support states" << endl;
1968 switch(p_representation)
1977 cerr << "Ngrammar: representation doesn't support states" << endl;
1986 return vocab->name(i);
2003 switch(p_representation)
2011 vocab_pdf.frequency(lastword(words));
2016 return backoff_reverse_probability(words);
2020 cerr << "probability: unknown ngrammar representation" << endl;
2032 switch(p_representation)
2040 vocab_pdf.frequency(lastword(words));
2045 cerr << "probability: reverse prob unavailable for backoff ngram"
2051 cerr << "probability: unknown ngrammar representation" << endl;
2060 return p_states[state].pdf_const();
2067 switch(p_representation)
2078 return backoff_prob_dist(words);
2082 cerr << "probability: unknown ngrammar representation" << endl;
2092 switch(p_representation)
2103 cerr << "probability: unsupport IVector access of backoff ngram" << endl;
2108 cerr << "probability: unknown ngrammar representation" << endl;
2136 "gzip --decompress --stdout");
2142 r_val = load(tmp_fname);
2173 cerr << "Wordlist file does not match grammar wordlist !" << endl;
2184 cerr << "Wordlist does not match grammar !" << endl;
2190 cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2199 cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2205 const bool trace, double floor)
2209 return save(filename,"cstr_ascii",false,floor);
2210 if (type == "htk_ascii")
2216 if (type == "cstr_ascii")
2218 if (type == "cstr_bin")
2223 cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
2237 for(i=0;i<words.n();i++)
2247 (*function)(this,words,params);
2256 for(i=0;i<pred_vocab->length();i++){
2257 words[j] = pred_vocab->name(i);
2258 iterate(words,function,params);
2265 words[j] = vocab->name(i);
2266 iterate(words,function,params);
2283 for(i=0;i<words.n();i++)
2293 (*function)(this,words,params);
2302 for(i=0;i<pred_vocab->length();i++){
2303 words[j] = pred_vocab->name(i);
2304 const_iterate(words,function,params);
2311 words[j] = vocab->name(i);
2312 const_iterate(words,function,params);
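iterate() and const_iterate() (fragments 2237-2312) enumerate every possible n-gram by filling one slot at a time from the vocabulary and recursing, firing the callback once no empty slot remains. The same shape with std:: types (names are illustrative):

    #include <functional>
    #include <string>
    #include <vector>

    // Fill slot j with each vocabulary word in turn and recurse; when
    // all slots are filled, hand the complete n-gram to the callback.
    void iterate_ngrams(std::vector<std::string> &words, size_t j,
                        const std::vector<std::string> &vocab,
                        const std::function<void(const std::vector<std::string>&)> &fn)
    {
        if (j == words.size()) { fn(words); return; }
        for (const std::string &w : vocab) {
            words[j] = w;
            iterate_ngrams(words, j + 1, vocab, fn);
        }
    }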
2323 backoff_representation->print_freqs(os,p_order);
2330 for (i=0; i < p_num_states; i++)
2333 for (k=p_states[i].pdf().item_start();
2334 !p_states[i].pdf().item_end(k);
2335 k = p_states[i].pdf().item_next(k))
2340 p_states[i].pdf().item_freq(k,name,freq);
2345 for (j = p_order-2; j >= 0; j--)
2350 for (j = 0; j < p_order-1; j++)
2351 os << wordlist_index(window(j)) << " ";
2352 os << name << " : " << freq << endl;
2369 for(i=0;i<words.n();i++)
2370 ngram[i] = words(i);
2374 for(j=0;j<get_pred_vocab_length();j++)
2376 ngram[ngram.n()-1] = get_pred_vocab_word(j);
2377 double tmp = backoff_probability(ngram,false);
2389 cerr << "order too great in EST_Ngrammar::get_backoff_discount" << endl;
2393 else if( (int)freq < backoff_discount[order-1].n())
2394 return backoff_discount[order-1]((int)freq);
2401 const bool trace) const
2411 cerr << "backoff_probability( ";
2412 for(i=0;i<words.n();i++)
2413 cerr << words(i) << " ";
2421 cerr << "unigram " << backoff_representation->probability(words(0))
2424 f=backoff_representation->frequency(words(0));
2429 return f / backoff_representation->pdf_const().samples();
2431 return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2436 new_ngram.resize(words.n()-1);
2437 for(i=0;i<new_ngram.n();i++)
2438 new_ngram[i] = words(i);
2441 state=backoff_representation->get_state(words);
2443 if( (state != NULL) &&
2444 ((f=state->frequency(words(0))) > backoff_threshold) )
2455 if((new_ngram(0) == p_sentence_start_marker) ||
2456 (new_ngram(0) == p_sentence_end_marker) )
2458 f2 = p_number_of_sentences;
2460 cerr << "special freq used : " << f2 << endl;
2464 state=backoff_representation->get_state(new_ngram);
2467 cerr << "Something went horribly wrong !" << endl;
2478 cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
2485 cerr << " ..... got (" << f << " - "
2486 << get_backoff_discount(state->level()+1,f)
2487 << ")/" << f2 << " = "
2488 << (f - get_backoff_discount(state->level()+1,f) ) / f2
2491 return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2497 double bo_wt = get_backoff_weight(new_ngram);
2499 for(i=0;i<new_ngram.n();i++)
2500 new_ngram[i] = words(i+1);
2504 cerr << "backed off(" << bo_wt << ") to (";
2505 for(i=0;i<new_ngram.n();i++)
2506 cerr << new_ngram(i) << " ";
2510 return bo_wt * backoff_probability(new_ngram,trace);
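The control flow of backoff_probability() (fragments 2401-2510) is the standard Katz recursion, sketched here in natural word order with stubbed lookups; the EST code walks its backoff tree with the n-gram effectively reversed and uses per-order Good-Turing discounts, but the recursion is the same:

    #include <string>
    #include <vector>

    // Stubs standing in for the backoff-tree lookups; names illustrative.
    double ngram_freq(const std::vector<std::string> &ng);   // f(ngram)
    double discount(size_t order, double f);                 // D(order, f)
    double bo_weight(const std::vector<std::string> &ctx);   // bo_wt(ctx)
    double unigram_prob(const std::string &w);

    double katz_prob(const std::vector<std::string> &ngram, double threshold)
    {
        if (ngram.size() == 1)
            return unigram_prob(ngram[0]);

        std::vector<std::string> ctx(ngram.begin(), ngram.end() - 1);
        double f = ngram_freq(ngram);
        if (f > threshold)   // seen often enough: discounted relative frequency
            return (f - discount(ngram.size(), f)) / ngram_freq(ctx);

        // unseen: back off to the shorter n-gram, scaled by the context's
        // backoff weight so that probabilities still sum to one
        std::vector<std::string> shorter(ngram.begin() + 1, ngram.end());
        return bo_weight(ctx) * katz_prob(shorter, threshold);
    }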
2545 new_ngram.resize(words.n()-1);
2546 for(i=0;i<new_ngram.n();i++)
2547 new_ngram[i] = words(i);
2553 if( (state != NULL) &&
2560 cerr << "Something went horribly wrong !" << endl;
2566 return f / state->frequency(new_ngram(0));
2575 for(i=0;i<new_ngram.n();i++)
2576 new_ngram[i] = words(i+1);
2579 return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
2603 return backoff_reverse_probability_sub(words,state);
2611 return backoff_prob_dist(words).most_probable(prob);
2627 for(i=0;i<v.n()+l;i++)
2638 for(i=v.n()-1;i>=l;i--)
2654 function(start_state,params);
2667 backoff_traverse(child,function,params);
2680 if (start_state->level() == level)
2682 function(start_state,params);
2684 else if (start_state->level() < level)
2699 backoff_traverse(child,function,params,level);
2711 float *weight = (float*)((void**)params)[1];
2724 void **params = new void*[2];
2725 params[0] = (void*)&n;
2726 params[1] = (void*)&weight;
2747 for(i=0;i<v.n()+l;i++)
2758 for(i=v.n()-1;i>=l;i--)
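slide(v, l) (the loop bounds at 2627/2638 and 2747/2758) shifts the window contents by l positions, towards the front for negative l and towards the back for positive l, leaving the vacated slots as they were. The two loops, reconstructed over std::vector:

    #include <string>
    #include <vector>

    // Sketch matching the loop bounds in the fragments above.
    void slide(std::vector<std::string> &v, int l)
    {
        int n = (int)v.size();
        if (l < 0)                        // slide towards the front
            for (int i = 0; i < n + l; i++)
                v[i] = v[i - l];
        else if (l > 0)                   // slide towards the back
            for (int i = n - 1; i >= l; i--)
                v[i] = v[i - l];
    }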
bool oov_preprocess(const EST_String &filename, EST_String &new_filename, const EST_String &what)
#define GZIP_FILENAME_EXTENSION
EST_TokenStream & get(EST_Token &t)
get next token in stream
bool accumulate(const EST_StrVector &words, const double count=1)
const EST_BackoffNgrammarState * get_state(const EST_StrVector &words) const
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void frequency_of_frequencies(EST_DVector &ff)
double backoff_reverse_probability_sub(const EST_StrVector &words, const EST_BackoffNgrammarState *root) const
Utility IO Function header file.
int wordlist_index(const EST_String &word, const bool report=true) const
EST_read_status load(const EST_String &filename)
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
int predlist_index(const EST_String &word) const
void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL)
bool compute_backoff_weights(const int mincount=1, const int maxcount=10)
void remove_child(EST_BackoffNgrammarState *child, const EST_String &name)
bool set_backoff_weight(const EST_StrVector &words, const double w)
void accumulate(const EST_StrVector &words, const double count=1)
int find_dense_state_index(const EST_IVector &words, int index=0) const
EST_String get_vocab_word(int i) const
EST_BackoffNgrammarState * backoff_representation
bool save(Lattice &lattice, EST_String filename)
void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void *)
void close(void)
Close stream.
double get_backoff_weight(const EST_StrVector &words) const
const EST_String & predict(const EST_StrVector &words, double *prob, int *state) const
INLINE const T & a_no_check(ssize_t n) const
read-only const access operator: without bounds checking
double reverse_probability(const EST_StrVector &words, bool force=false) const
double frequency(const EST_String &w) const
bool load(Lattice &lattice, EST_String filename)
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
int find_next_state_id(int state, int word) const
size_t index(const char *s, ssize_t pos=0) const
Position of substring (starting at pos)
bool set_representation(representation_t new_representation)
void set_num_samples(const double c)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
bool init_sparse_representation()
EST_String make_tmp_filename()
Make a unique temporary filename.
void print_freqs(ostream &os, const int order, EST_String followers="")
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_Item * root(const EST_Item *n)
return root node of tree
const EST_NgrammarState & find_state_const(const EST_StrVector &words) const
int check_vocab(EST_Relation &a, EST_StrList &vocab)
double get_backoff_discount(const int order, const double freq) const
int get_pred_vocab_length() const
int index(EST_TList< T > &l, T &val, bool(*eq)(const EST_UItem *, const EST_UItem *)=NULL)
bool init_backoff_representation()
double frequency(const EST_String &w) const
bool merge(EST_Ngrammar &n, float weight)
const EST_String & backoff_most_probable(const EST_StrVector &words, double *prob=NULL) const
bool p_init(int o, representation_t r)
void backoff_traverse(EST_BackoffNgrammarState *start_state, void(*function)(EST_BackoffNgrammarState *s, void *params), void *params)
int open(const EST_String &filename)
open a EST_TokenStream for a file.
float max(float a, float b)
EST_TVector< EST_String > EST_StrVector
int find_state_id(const EST_StrVector &words) const
void make_htk_compatible()
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
bool build_ngram(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last, const EST_String &input_format)
const EST_DiscreteProbDistribution & prob_dist(const EST_StrVector &words) const
EST_String get_pred_vocab_word(int i) const
void resize(ssize_t n, int set=1)
representation_t p_representation
bool build_sparse(const EST_String &filename, const EST_String &prev, const EST_String &prev_prev, const EST_String &last)
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
EST_BackoffNgrammarState * add_child(const EST_Discrete *d, const EST_StrVector &words)
INLINE ssize_t length() const
number of items in vector.
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor)
bool check_vocab(const EST_StrList &wordlist)
#define COMPRESS_FILENAME_EXTENSION
void fill_window_start(EST_IVector &window, const EST_String &prev, const EST_String &prev_prev) const
void backoff_restore_unigram_states()
EST_String uncompress_file_to_temporary(const EST_String &filename, const EST_String &prog_name)
Uncompress file by calling program prog, and write it to new temporary file. Return name of temporary ...
bool set_entry_type(entry_t new_type)
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
const EST_DiscreteProbDistribution & pdf_const() const
void print_freqs(ostream &os, double floor=0.0)
double backoff_probability(const EST_StrVector &words, const bool trace=false) const
double frequency(const EST_StrVector &words, bool force=false, const bool trace=false) const
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
EST_PredictionSuffixTree sparse_representation
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace, double floor)
bool set_backoff_weight(const EST_StrVector &words, const double w)
ostream & operator<<(ostream &s, const EST_NgrammarState &a)
void set_frequency(const EST_String &s, double c)
const EST_DiscreteProbDistribution PSTnullProbDistribution
const EST_StrVector & make_ngram_from_index(const int i) const
bool ngram_exists(const EST_StrVector &words, const double threshold) const
void const_iterate(EST_StrVector &words, void(*function)(const EST_Ngrammar *const n, EST_StrVector &words, void *params), void *params) const
int delete_file(const EST_String &filename)
OS independent way of removing a file.
bool init_dense_representation()
void print_freqs(ostream &os)
EST_BackoffNgrammarState * get_child(const EST_String &word) const
bool init(const EST_StrList &vocab)
(re-)initialise
const EST_String & most_probable(double *prob=NULL) const
EST_String extension(void) const
double backoff_reverse_probability(const EST_StrVector &words) const
EST_NgrammarState & find_state(const EST_StrVector &words)
~EST_BackoffNgrammarState()
bool build(const EST_StrList &filenames, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER, const EST_String &input_format="", const EST_String &oov_mode="", const int mincount=1, const int maxcount=10)
void iterate(EST_StrVector &words, void(*function)(EST_Ngrammar *n, EST_StrVector &words, void *params), void *params)
bool ngram_exists(const EST_StrVector &words) const
INLINE ssize_t n() const
number of items in vector.
const EST_DiscreteProbDistribution & backoff_prob_dist(const EST_StrVector &words) const
void merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
bool init(int o, representation_t r, const EST_StrList &wordlist)
double probability(const EST_String &w) const
static const EST_String Empty
Constant empty string.
EST_write_status save(const EST_String &filename, const EST_String type="cstr_ascii", const bool trace=false, double floor=0.0)
bool init_vocab(const EST_StrList &wordlist)
const EST_DiscreteProbDistribution & pdf_const() const
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
void slide(EST_IVector &v, const int l)
double get_backoff_weight() const
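Pulling the pieces together, a hedged query sketch against the declarations above (resize() and probability() appear in the list; the header name and the example words are illustrative, and probability() takes the full n-gram with the context first and the predictee last):

    #include "EST_Ngrammar.h"   // assumed header name
    #include <iostream>

    void show_prob(const EST_Ngrammar &ng)   // assumes a trained trigram model
    {
        EST_StrVector ngram;
        ngram.resize(3);
        ngram[0] = "the";      // w1 (oldest context word)
        ngram[1] = "black";    // w2
        ngram[2] = "cat";      // w3 (predicted word)
        std::cout << "P(cat | the black) = "
                  << ng.probability(ngram) << std::endl;
    }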