48 ExponentialFit(
EST_DVector &N,
double &a,
double &b,
int first,
int last)
64 cerr <<
"ExponentialFit : first must be >= 0" << endl;
70 cerr <<
"ExponentialFit : last must be < N.n()-1 = " << N.
n()-1 << endl;
80 double ElnNr=0.0,ElnNrlnr=0.0,
84 for(
int r=first;r<=last;r++)
90 ElnNrlnr += log( N(r) ) * log( (
double)r );
92 Elnr += log( (
double)r );
93 Elnr2 += log( (
double)r ) * log( (
double)r );
97 b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
98 a = (ElnNr - (b*Elnr) ) / R;
104 smooth_ExponentialFit(
EST_DVector &N,
int first,
int last)
108 if (!ExponentialFit(N,a,b,first,last))
110 cerr <<
"smooth_ExponentialFit : ExponentialFit failed !" << endl;
114 for(
int r=first;r<=last;r++)
115 N[r] = exp(a)* pow((
double)r, b);
134 (*ff)[(
int)(freq+0.5)] += 1;
147 double *
max = (
double*)params;
178 double nfreq = (*map)((
int)(freq+0.5));
193 double *
min = (
double*)params;
216 bool complete = (
bool)(ff.
n() == 0);
232 ff.
resize((
int)(max+1.5));
246 ff[(
int)(freq+0.5)] += 1;
255 for (i=1;i<ff.
n();i++)
271 ff.
resize((
int)(max+1.5));
278 for (i=0;i<ff.
n();i++)
289 for (i=1;i<ff.
n();i++)
300 cerr <<
"unknown representation for EST_Ngrammar" << endl;
328 nfreq = map((
int)(freq+0.5));
351 cerr <<
"unknown representation for EST_Ngrammar" << endl;
358 adjusted_frequencies_BasicGoodTuring(
EST_DVector &M,
366 if (maxcount > N.
n()-2)
369 cerr <<
"adjusted_frequencies_BasicGoodTuring :";
370 cerr <<
" maxcount is too big, reducing it to " << maxcount << endl;
375 for(r=0; r<=maxcount;r++)
378 if( (N(r+1) == 0) || (N(r) == 0) )
381 M[r] = (r + 1) * N(r+1) / N(r);
392 smoothed_frequency_distribution_ExponentialFit(
EST_DVector &N,
int maxcount)
394 if (maxcount > N.
n()-2)
397 cerr <<
"smoothed_frequency_distribution_ExponentialFit :" 398 <<
" maxcount too big, reducing it to " << maxcount << endl;
403 if (!smooth_ExponentialFit(N,1,maxcount+1))
404 cerr <<
"smooth_ExponentialFit failed !" << endl;
421 cerr <<
"EST_Ngram: cannot Good-Turing smooth ngram:" <<
422 " entries are not frequencies" << endl;
437 smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
439 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
450 cerr <<
"Smoothing of backed of grammars is not available!" << endl;
529 cerr <<
"unknown representation for EST_Ngrammar" << endl;
541 const double default_discount)
546 cerr <<
"Good_Turing_discount is not appropriate for non backoff grammar !" 575 for (o=1;o<=ngrammar.
order();o++)
582 if(max > freqs.
n() - 2)
593 for(i=0;i<=max+1;i++)
596 smoothed_frequency_distribution_ExponentialFit(freqs,max);
598 for(i=0;i<=max+1;i++)
607 adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);
622 for(;i<freqs.
n();i++)
EST_DVector * backoff_discount
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
void zero_small_f(EST_BackoffNgrammarState *s, void *params)
double samples(void) const
Total number of examples found.
EST_BackoffNgrammarState * backoff_representation
void fill(const T &v)
Fill entire array with value v.
#define VAL_REGISTER_CLASS(NAME, CLASS)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
entry_t entry_type() const
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
EST_NgrammarState * p_states
int num_states(void) const
void backoff_traverse(EST_BackoffNgrammarState *start_state, void(*function)(EST_BackoffNgrammarState *s, void *params), void *params)
float max(float a, float b)
EST_DiscreteProbDistribution & pdf()
void get_max_f(EST_BackoffNgrammarState *s, void *params)
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
void make_f_of_f(EST_BackoffNgrammarState *s, void *params)
EST_DiscreteProbDistribution & pdf()
float min(float a, float b)
representation_t representation() const
const EST_DiscreteProbDistribution & pdf_const() const
void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n, int this_order)
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
int get_vocab_length() const
void set_frequency(const EST_String &s, double c)
void override_frequency(const EST_String &s, double c)
Sets the frequency of the named item, without modifying num_samples.
void map_f_of_f(EST_BackoffNgrammarState *s, void *params)
INLINE ssize_t n() const
number of items in vector.
void resize(int n, int set=1)
resize vector