40 #ifndef __EST_NGRAMMAR_H__ 41 #define __EST_NGRAMMAR_H__ 59 #define SENTENCE_START_MARKER "!ENTER" 60 #define SENTENCE_END_MARKER "!EXIT" 61 #define OOV_MARKER "!OOV" 63 #define EST_NGRAMBIN_MAGIC 1315402337 66 #define GZIP_FILENAME_EXTENSION "gz" 67 #define COMPRESS_FILENAME_EXTENSION "Z" 70 #define TINY_FREQ 1.0e-10 162 const double count=1);
164 const double count=1);
195 const double threshold)
const;
// Declaration of set_backoff_weight: takes a word vector `words` and a
// weight `w` (both const) and returns bool. It has no trailing `const`.
// NOTE(review): presumably stores `w` as the backoff weight for the n-gram
// named by `words`, with the bool reporting success — confirm against the
// implementation, which is not visible here.
198 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Declaration of print_freqs: takes an output stream `os`, an integer
// `order`, and a `followers` string that defaults to the empty string.
// Returns nothing.
// NOTE(review): presumably writes frequency counts to `os`; the role of
// `followers` is not shown here — confirm against the implementation.
201 void print_freqs(ostream &os,
const int order,
EST_String followers=
"");
// Declarations of the sparse and dense representation initialisers.
// Both take no arguments and return bool.
// NOTE(review): the bool presumably signals success — confirm.
248 bool init_sparse_representation();
251 bool init_dense_representation();
// Declaration of get_backoff_discount: a const member taking an integer
// `order` and a double `freq`, returning a double.
// NOTE(review): presumably looks up the discount for a frequency at the
// given n-gram order — confirm against the implementation.
270 double get_backoff_discount(
const int order,
const double freq)
const;
// Declarations for the backoff representation: an initialiser that takes
// no arguments and returns bool, and backoff_restore_unigram_states, which
// takes no arguments and returns nothing.
// NOTE(review): behaviour of both is defined elsewhere — confirm there.
272 bool init_backoff_representation();
274 void backoff_restore_unigram_states();
// Declaration of make_ngram_from_index: a const member that takes an
// integer index `i` and returns a const reference to an EST_StrVector.
// NOTE(review): the result is a reference, so the referenced vector must
// outlive the call; which object owns it is not visible here — confirm
// before holding the reference across other calls.
280 const EST_StrVector &make_ngram_from_index(
const int i)
const;
295 {
return words(p_order-1); }
297 {
return words(p_order-1); }
// Declarations of three no-argument members: sparse_to_dense and
// dense_to_sparse return bool; freqs_to_probs returns nothing.
// NOTE(review): presumably the first two convert between representations
// and the third converts stored frequencies to probabilities — confirm.
301 bool sparse_to_dense();
302 bool dense_to_sparse();
307 void freqs_to_probs();
338 bool oov_preprocess(
const EST_String &filename,
350 double backoff_reverse_probability_sub(
const EST_StrVector &words,
353 const bool trace=
false)
const;
// Declaration of backoff_reverse_probability: a const member taking a
// word vector `words` and returning a double.
// NOTE(review): the meaning of "reverse" probability is defined by the
// implementation, not visible here — confirm before relying on it.
354 double backoff_reverse_probability(
const EST_StrVector &words)
const;
356 double *prob =
NULL)
const;
370 void *params,
const int level);
376 p_number_of_sentences = 0;
377 backoff_representation = 0;
378 backoff_discount = 0;
384 default_values();
init(o,r,wordlist);
392 default_values();
init(o,r,wordlist,predlist);
397 default_values();
init(o,r,v);
// Declaration of default_values: takes no arguments, returns nothing.
// The visible bodies earlier in this listing call default_values()
// immediately before each init(...) call.
// NOTE(review): which members it resets is defined elsewhere — confirm.
401 void default_values();
// Inline const accessor: returns the value of the member p_num_samples.
414 double samples(
void)
const {
return p_num_samples;}
// Inline const accessor: returns the value of the member p_order.
415 int order()
const {
return p_order; }
// Declaration of get_vocab_word (string overload): a const member that
// takes a string `s` and returns an int.
// NOTE(review): presumably the vocabulary index of `s`; the value returned
// for an unknown word is not visible here — confirm.
418 int get_vocab_word(
const EST_String &s)
const;
422 {
return pred_vocab->
name(s); }
426 {
return p_representation;}
435 const int mincount=1,
436 const int maxcount=10);
440 const double count=1);
443 const double count=1);
447 void make_htk_compatible();
454 const bool trace=
false,
// Declaration of wordlist_index (string overload): a const member that
// takes a `word` and a `report` flag defaulting to true, returning an int.
// NOTE(review): presumably the index of `word` in the word list, with
// `report` controlling whether a failed lookup is reported — confirm.
457 int wordlist_index(
const EST_String &word,
const bool report=
true)
const;
// Declaration of wordlist_index (integer overload): a const member that
// takes an index `i` and returns a const reference to an EST_String.
// NOTE(review): behaviour for an out-of-range `i` is not visible — confirm.
458 const EST_String &wordlist_index(
int i)
const;
// Declaration of predlist_index (string overload): a const member that
// takes a `word` and returns an int. Unlike the string overload of
// wordlist_index declared nearby, it has no `report` parameter.
// NOTE(review): presumably the index of `word` in the predictee list —
// confirm, including the value returned for an unknown word.
459 int predlist_index(
const EST_String &word)
const;
// Declaration of predlist_index (integer overload): a const member that
// takes an index `i` and returns a const reference to an EST_String.
// NOTE(review): behaviour for an out-of-range `i` is not visible — confirm.
460 const EST_String &predlist_index(
int i)
const;
// Declaration of set_entry_type: takes an entry_t value `new_type` and
// returns bool. It has no trailing `const`.
// NOTE(review): presumably switches the stored entry type and reports
// whether the change succeeded — confirm against the implementation.
463 bool set_entry_type(
entry_t new_type);
470 const bool trace=
false)
const;
472 const bool trace=
false)
const;
475 double *prob,
int *state)
const;
477 {
double p;
int state;
return predict(words,&p,&state); }
479 {
int state;
return predict(words,prob,&state); }
483 {
double p;
int state;
return predict(words,&p,&state); }
485 {
int state;
return predict(words,prob,&state); }
// Declaration of find_next_state_id: a const member that takes two ints,
// `state` and `word`, and returns an int.
// NOTE(review): presumably the id of the state reached from `state` on
// `word`; the failure value is not visible here — confirm.
489 int find_next_state_id(
int state,
int word)
const;
498 bool force=
false)
const;
499 double reverse_probability(
const EST_IVector &words,
500 bool force=
false)
const;
// Declaration of ngram_exists: a const member that takes a word vector
// `words` and a double `threshold`, returning bool.
// NOTE(review): how `threshold` is compared (strict or inclusive, and
// against what quantity) is not visible here — confirm.
527 bool ngram_exists(
const EST_StrVector &words,
const double threshold)
const;
// Declaration of set_backoff_weight: takes a word vector `words` and a
// weight `w` (both const) and returns bool. It has no trailing `const`.
// NOTE(review): presumably records `w` as the backoff weight for the
// n-gram named by `words` — confirm against the implementation.
529 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Declaration of print_freqs: takes an output stream `os` and a `floor`
// value that defaults to 0.0. Returns nothing.
// NOTE(review): presumably writes frequencies to `os`; how `floor` is
// applied is not visible here — confirm against the implementation.
531 void print_freqs(ostream &os,
double floor=0.0);
581 const double default_discount);
// Declaration of compute_backoff_weights: takes `mincount` (default 1)
// and `maxcount` (default 10), both const ints, and returns bool.
// NOTE(review): the counts presumably bound the frequency range used
// when discounting — confirm against the implementation.
590 bool compute_backoff_weights(
const int mincount=1,
591 const int maxcount=10);
// Friend declaration: gives class EST_BackoffNgrammar access to the
// private and protected members of the enclosing class, whose header is
// outside this view.
596 friend class EST_BackoffNgrammar;
621 #endif // __EST_NGRAMMAR_H__ friend ostream & operator<<(ostream &s, const EST_NgrammarState &a)
const EST_String & predict(const EST_StrVector &words) const
EST_BackoffNgrammarState(const EST_Discrete *d, int level)
EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist, const EST_StrList &predlist)
EST_DVector * backoff_discount
double probability(int w) const
void slide(EST_IVector &i, const int l)
int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram, int smooth_thresh)
double samples(void) const
EST_String p_sentence_end_marker
bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER)
const EST_Discrete * get_discrete() const
Returns discrete vocabulary of distribution.
const EST_String & most_probable(double *prob=NULL) const
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
int get_pred_vocab_word(const EST_String &s) const
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount=0.5)
EST_read_status load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
EST_BackoffNgrammarState * get_child(const int word) const
#define SENTENCE_END_MARKER
const EST_String & lastword(const EST_StrVector &words) const
EST_write_status save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0)
EST_BackoffNgrammarState * backoff_representation
bool save(Lattice &lattice, EST_String filename)
const EST_String & name(const int n) const
The name given the index.
const EST_String & predict(const EST_StrVector &words, double *prob) const
int length(void) const
The number of members in the discrete.
double frequency(const EST_String &w) const
bool load(Lattice &lattice, EST_String filename)
EST_read_status load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
EST_write_status save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost, EST_Ngrammar &n, double floor)
entry_t entry_type() const
void cumulate(const int index, const double count=1)
EST_Discrete * pred_vocab
EST_Item * root(const EST_Item *n)
return root node of tree; previous sibling (sister) of n
double probability(const EST_String &w) const
EST_NgrammarState * p_states
int check_vocab(EST_Relation &a, EST_StrList &vocab)
double probability(const EST_String &s) const
int num_states(void) const
EST_read_status load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
int get_pred_vocab_length() const
int index(EST_TList< T > &l, T &val, bool(*eq)(const EST_UItem *, const EST_UItem *)=NULL)
double frequency(const EST_String &w) const
void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n, int this_order=0)
EST_DiscreteProbDistribution vocab_pdf
const EST_String & predict(const EST_IVector &words, double *prob) const
double p_number_of_sentences
EST_DiscreteProbDistribution & pdf()
EST_write_status save_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n, double floor=0.0)
EST_NgrammarState(int id, const EST_DiscreteProbDistribution &pdf)
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
double frequency(const EST_String &s) const
EST_String get_pred_vocab_word(int i) const
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)
representation_t p_representation
#define VAL_REGISTER_CLASS_DCLS(NAME, CLASS)
double frequency(int w) const
EST_DiscreteProbDistribution p_pdf
EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf, int level)
EST_DiscreteProbDistribution & pdf()
void cumulate(const EST_String &word, const double count=1)
double backoff_unigram_floor_freq
representation_t representation() const
EST_read_status load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
const EST_DiscreteProbDistribution & pdf_const() const
section options Options< strong > or ngram_per_line Pseudo words
EST_DiscreteProbDistribution p_pdf
EST_NgrammarState(int id, EST_Discrete *d)
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
EST_PredictionSuffixTree sparse_representation
EST_write_status save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
EST_read_status load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
int get_vocab_length() const
int lastword(const EST_IVector &words) const
bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount=0)
A string tree index class for indexing arbitrary objects by strings of characters.
EST_BackoffNgrammarState * get_child(const EST_String &word) const
const EST_String & predict(const EST_IVector &words) const
EST_write_status save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n, const bool trace=false, double floor=0.0)
EST_BackoffNgrammarState()
const EST_String & most_probable(double *prob=NULL) const
void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order=0)
void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram)
void save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
#define SENTENCE_START_MARKER
EST_String p_sentence_start_marker
double probability(const EST_String &w) const
void * lookup(const EST_String &key) const
Find contents indexed by key; returns 0 if there are no contents.
EST_Ngrammar(int o, representation_t r, const EST_StrList &wordlist)
const EST_DiscreteProbDistribution & pdf_const() const
EST_write_status save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
Utility EST_String Functions header file.
double get_backoff_weight() const
EST_Ngrammar(int o, representation_t r, EST_Discrete &v)