speech-tools/ngrammar__aux_8cc_source.html

 /*************************************************************************/
 /*                                                                       */
 /*                Centre for Speech Technology Research                  */
 /*                     University of Edinburgh, UK                       */
 /*                         Copyright (c) 1996                            */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*                     Author :  Simon King & Alan W Black               */
 /*                     Date   :  February 1997                           */
 /*-----------------------------------------------------------------------*/
 /*                                                                       */
 /* Auxiliary functions for EST_Ngram class                               */
 /*                                                                       */
 /*=======================================================================*/
 #include <iostream>
 #include <cstring>
 #include "EST_String.h"
 #include "EST_Ngrammar.h"

 using namespace std;

 static bool
 ExponentialFit(EST_DVector &N, double &a, double &b, int first, int last)
 {
   // fit the function  N(r) = e^a * r^b  where a,b are constants
   // to the points N(first)...N(last) inclusive
   // i.e. a straight line fit in the (natural) log domain
   // minimising the mean squared error (in log domain), is :

   // limitation : r must be an integer (it is the index of v)


   // default
   if (last == -1)
     last = N.n()-1;

   if (first < 0)
     {
       cerr << "ExponentialFit : first must be >= 0" << endl;
       return false;
     }

   if (last >= N.n()-1)
     {
       cerr << "ExponentialFit : last must be < N.n()-1 = " << N.n()-1 << endl;
     }

   if (first == last)
     {
       a=log(N(first));
       b=0;
       return true;
     }

   double ElnNr=0.0,ElnNrlnr=0.0,
   Elnr=0.0,Elnr2=0.0,
   R=0.0;

   for(int r=first;r<=last;r++)
     {
       R += 1;
       if ( N(r) > 0)
     {
       ElnNr += log( N(r) );
       ElnNrlnr += log( N(r) ) * log( (double)r );
     }
       Elnr += log( (double)r );
       Elnr2 += log( (double)r ) * log( (double)r );
     }

   // and the answer is :
   b = ( (ElnNr*Elnr/R) - ElnNrlnr ) / ( (Elnr*Elnr/R) - Elnr2);
   a = (ElnNr - (b*Elnr) ) / R;

   return true;
 }

 static bool
 smooth_ExponentialFit(EST_DVector &N, int first, int last)
 {
   double a=0.0,b=0.0;

   if (!ExponentialFit(N,a,b,first,last))
     {
       cerr << "smooth_ExponentialFit : ExponentialFit failed !" << endl;
       return false;
     }

   for(int r=first;r<=last;r++)
     N[r] = exp(a)* pow((double)r, b);

   return true;
 }

 void make_f_of_f(EST_BackoffNgrammarState *s,void *params)
 {
     EST_Litem *k;
     double freq;
     EST_String name;

     EST_DVector *ff = (EST_DVector*)params;
     int max = ff->n();
     for (k=s->pdf_const().item_start();
      !s->pdf_const().item_end(k);
      k = s->pdf_const().item_next(k))
     {
     s->pdf_const().item_freq(k,name,freq);
     if(freq+0.5 < max)
         (*ff)[(int)(freq+0.5)] += 1;

     }


 }

 void get_max_f(EST_BackoffNgrammarState *s,void *params)
 {
     EST_Litem *k;
     double freq;
     EST_String name;

     double *max = (double*)params;
     for (k=s->pdf_const().item_start();
      !s->pdf_const().item_end(k);
      k = s->pdf_const().item_next(k))
       {
       s->pdf_const().item_freq(k,name,freq);
       if(freq > *max)
         *max = freq;

       }


 }

 void map_f_of_f(EST_BackoffNgrammarState *s,void *params)
 {
     EST_Litem *k;
     double freq;
     EST_String name;

     //cerr << "map_f_of_f : visited " << *s << endl;

     EST_DVector *map = (EST_DVector*)params;
     int max = map->n();
     for (k=s->pdf_const().item_start();
      !s->pdf_const().item_end(k);
      k = s->pdf_const().item_next(k))
       {
       s->pdf_const().item_freq(k,name,freq);
       if (freq+0.5 < max)
         {
         double nfreq = (*map)((int)(freq+0.5));
         s->pdf().set_frequency(name,nfreq);

         }

       }

 }

 void zero_small_f(EST_BackoffNgrammarState *s,void *params)
 {
     EST_Litem *k;
     double freq;
     EST_String name;

     double *min = (double*)params;
     for (k=s->pdf_const().item_start();
      !s->pdf_const().item_end(k);
      k = s->pdf_const().item_next(k))
       {
       s->pdf_const().item_freq(k,name,freq);
       if (freq < *min)
       {
           //cerr << "zeroing " << freq << " < " << *min << endl;
           s->pdf().override_frequency(k,0.0);
       }
       }
 }

 void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order)
 {
   int i,size;
   EST_Litem *k;
   double max=0.0;

   // if ff has zero size, do complete frequency of frequencies
   // otherwise just fill in frequencies from 1 to ff.n()-1

   bool complete = (bool)(ff.n() == 0);

   switch(n.representation())
     {

       case EST_Ngrammar::sparse:
       case EST_Ngrammar::dense:
     {
         size = n.num_states();
         if (complete)
           {
           // find highest frequency in EST_Ngram
           for(i=0;i<size;i++)
             if (n.p_states[i].pdf().samples() > max)
               max = n.p_states[i].pdf().samples();

           ff.resize((int)(max+1.5));
           ff.fill(0.0);
           }

         // Sum the frequencies
         for(i=0;i<size;i++)
           {
           for (k=n.p_states[i].pdf().item_start();
                !n.p_states[i].pdf().item_end(k);
                k = n.p_states[i].pdf().item_next(k))
             {
             EST_String name;
             double freq;
             n.p_states[i].pdf().item_freq(k,name,freq);
             ff[(int)(freq+0.5)] += 1;
             }
           }

         if (complete)
           {
           // fill in ff(0) - the number of unseen ngrams

           double total=0;
           for (i=1;i<ff.n();i++)
             total += ff(i);

           ff[0] = pow(float(n.get_vocab_length()),float(n.order())) - total;
           }
     }
     break;


       case EST_Ngrammar::backoff:
     {
         if (complete)
           {
           n.backoff_traverse(n.backoff_representation,
                      &get_max_f,(void*)(&max),
                      this_order-1);
           ff.resize((int)(max+1.5));
           }

         // backoff grammar has grammars of orders
         // order, order-1, ..., 1
         // and we treat each order independently

         for (i=0;i<ff.n();i++)
         ff[i] = 0;

         n.backoff_traverse(n.backoff_representation,
                    &make_f_of_f,(void*)(&ff),
                    this_order-1);

         if (complete)
           {
           // fill in ff(0) - the number of unseen ngrams
           double total=0;
           for (i=1;i<ff.n();i++)
             total += ff(i);
           ff[0] = pow(float(n.get_vocab_length()),float(this_order)) - total;


           }
     }
     break;

       default:
     cerr << "unknown representation for EST_Ngrammar" << endl;
     break;
     }

 }

 void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
 {
     int i;
     EST_Litem *k;

   switch(n.representation())
     {

       case EST_Ngrammar::sparse:
       case EST_Ngrammar::dense:
     {
         int size = n.p_num_states;

         for(i=0;i<size;i++)
           {
           for (k=n.p_states[i].pdf().item_start();
                !n.p_states[i].pdf().item_end(k);
                k = n.p_states[i].pdf().item_next(k))
             {
             EST_String name;
             double freq,nfreq;
             n.p_states[i].pdf().item_freq(k,name,freq);
             nfreq = map((int)(freq+0.5));
             n.p_states[i].pdf().set_frequency(name,nfreq);


             }
           }
     }
     break;

       case EST_Ngrammar::backoff:
     {

         // backoff grammar has grammars of orders
         // order, order-1, ..., 1
         // and we treat each order independently
         n.backoff_traverse(n.backoff_representation,
                    &map_f_of_f,(void*)(&map),
                    this_order-1);

     }
     break;

       default:
     cerr << "unknown representation for EST_Ngrammar" << endl;
     break;

     }
 }

 static void
 adjusted_frequencies_BasicGoodTuring(EST_DVector &M,
                      const EST_DVector &N,
                      int maxcount)
 {
     // produce a map of frequencies, given a (smoothed) distribution
     // only map up to a frequency of maxcount; beyond that
     // map(r) == r

     if (maxcount > N.n()-2)
     {
     maxcount = N.n()-2;
     cerr << "adjusted_frequencies_BasicGoodTuring :";
     cerr << " maxcount is too big, reducing it to " << maxcount << endl;
     }

     M.resize(N.n());
     int r;
     for(r=0; r<=maxcount;r++)
     {
     // don't like this bit, but what can you do ?
     if( (N(r+1) == 0) || (N(r) == 0) )
         M[r] = r;
     else
         M[r] = (r + 1) * N(r+1) / N(r);
     }
     // and do not map higher counts
     for(;r<N.n();r++)
     M[r]=r;

     return;

 }

 static void
 smoothed_frequency_distribution_ExponentialFit(EST_DVector &N,int maxcount)
 {
     if (maxcount > N.n()-2)
     {
     maxcount = N.n()-2;
     cerr << "smoothed_frequency_distribution_ExponentialFit :"
         << " maxcount too big, reducing it to " << maxcount << endl;
     }
     // the idea to fit this particular fn was by Steve Isard
     // we ignore r=0 in doing the fit

     if (!smooth_ExponentialFit(N,1,maxcount+1))
     cerr << "smooth_ExponentialFit failed !" << endl;

     return;
 }

 bool
 Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
 {
     // works for any N
     // since it just remaps frequencies

     // frequencies above maxcount are not changed
     // for discounting -- see discount routine !


     if (ngrammar.entry_type() != EST_Ngrammar::frequencies)
     {
     cerr << "EST_Ngram: cannot Good-Turing smooth ngram:"  <<
         " entries are not frequencies" << endl;
     return false;
     }

     switch(ngrammar.representation())
     {

       case EST_Ngrammar::sparse:
       case EST_Ngrammar::dense:
     {
     EST_DVector freqs,mapped_freqs;
     // grammar is of a single order - simple
     // Find frequency distribution
     frequency_of_frequencies(freqs,ngrammar);
     // smoothing should be optional - to do
     smoothed_frequency_distribution_ExponentialFit(freqs,maxcount-1);
     // Build map of frequencies
     adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,maxcount);
     // Map all frequencies in grammar to Good Turing Smoothed values
     map_frequencies(ngrammar,mapped_freqs);

     }
     break;

   case EST_Ngrammar::backoff:
 {
     (void)mincount;

     cerr << "Smoothing of backed of grammars is not available!" << endl;
     return false;

     /*
        // need to smooth for each order independently
        int i,o;
        double threshold;

        //for (o=1;o<=ngrammar.order();o++)
        for(o=ngrammar.order();o<=ngrammar.order();o++)
        {

        EST_DVector freqs,mapped_freqs;

        frequency_of_frequencies(freqs,ngrammar,o);

        cerr << "FREQ : " << freqs << endl;

        if(freqs.n() < 2)
        {
        // i.e. only unseen things - zero them all
        threshold = 2; // 1 ?
        if(o>1)
        ngrammar.backoff_traverse(ngrammar.backoff_representation,
        &zero_small_f,
        (void*)(&threshold),
        o-1);
        continue;
        }

        int max = maxcount;
        if(max > freqs.n() - 2)
        max = freqs.n() - 2;

        if(max > 2)
        // need at least 3 points
        // smoothing should be optional - to do
        smoothed_frequency_distribution_ExponentialFit(freqs,max);

        cerr << "SMOOTHED : " << freqs << endl;


        adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);

        // initial map of frequencies modifies total counts
        // so pdfs are correct

        map_frequencies(ngrammar,mapped_freqs,o);


        cerr << "Map for " << o << " : ";
        for(i=0;i<max;i++)
        cerr << mapped_freqs(i) << " ";
        cerr << endl;

        cerr << "MAP : " << mapped_freqs << endl;

        // now go and zero the small frequencies
        // but not for unigram


        if(mincount < mapped_freqs.n())
        threshold = mapped_freqs(mincount);
        else
        threshold = mapped_freqs(mapped_freqs.n()-1) + 1; // i.e. zero everything

        //cerr << "min = " << threshold << endl;

        if(o>1)
        ngrammar.backoff_traverse(ngrammar.backoff_representation,
        &zero_small_f,
        (void*)(&threshold),
        o-1);
        */

 }
 break;

 default:
 cerr << "unknown representation for EST_Ngrammar" << endl;
 break;

 }

 return true;

 }


 void
 Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,
              const double default_discount)
 {

     if(ngrammar.representation() != EST_Ngrammar::backoff)
     {
     cerr << "Good_Turing_discount is not appropriate for non backoff grammar !"
         << endl;
     return;
     }


     // method (for each grammar order):
     // produce Good-Turing smoothed frequency map
     // compute difference between unsmoothed map (0,1,2,3...)
     // and GT map  - this is the discount
     // we do not subtract the discount here since we need to
     // preserve the raw counts

     // discounting is applied up to frequency 'maxcount'
     // beyond which we do not discount

     // to do : zero low frequencies ???? (prune the tree)
     /*
        ngrammar.backoff_traverse(ngrammar.backoff_representation,
        &zero_small_f,
        (void*)(&threshold),
        o-1);
        */

     // need to treat for each order independently
     int i,o;

     // no discounting for order 1 - YES !?!?!?
     for (o=1;o<=ngrammar.order();o++)
     {

     EST_DVector freqs,mapped_freqs;
     frequency_of_frequencies(freqs,ngrammar,o);

     int max = maxcount;
     if(max > freqs.n() - 2)
         max = freqs.n() - 2;

     if(max > 2)
     {
         // need at least 3 points
         // smoothing should be optional - to do

         // April 97
         // to overcome problems with N(x)=0, shift data points, fit, and shift back

         for(i=0;i<=max+1;i++)
         freqs[i] += 1;

         smoothed_frequency_distribution_ExponentialFit(freqs,max);

         for(i=0;i<=max+1;i++)
         {
         freqs[i] -= 1;
         // possibly unnecesary check
         if ( freqs(i) < 0 )
             freqs[i] = 0;
         }
     }

     adjusted_frequencies_BasicGoodTuring(mapped_freqs,freqs,max);

     // and put it in the discount array
     ngrammar.backoff_discount[o-1].resize(freqs.n());
     for(i=(int)ngrammar.backoff_threshold;i<=max;i++)
     {
         ngrammar.backoff_discount[o-1][i] =  (double)i - mapped_freqs(i);

         // sanity check
         if( ngrammar.backoff_discount[o-1][i] < 0)
         {
         ngrammar.backoff_discount[o-1][i] = 0;
         }
     }

     for(;i<freqs.n();i++)
         ngrammar.backoff_discount[o-1][i] = default_discount;

     }
 }

 VAL_REGISTER_CLASS(ngrammar,EST_Ngrammar)

EST_Ngrammar::backoff_discount
EST_DVector * backoff_discount
Definition: EST_Ngrammar.h:269

EST_BackoffNgrammarState
Definition: EST_Ngrammar.h:129

Good_Turing_discount
void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount, const double default_discount)
Definition: ngrammar_aux.cc:540

EST_DiscreteProbDistribution::item_next
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:388

EST_Ngrammar::dense
Definition: EST_Ngrammar.h:221

map_frequencies
void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order)
Definition: ngrammar_aux.cc:306

zero_small_f
void zero_small_f(EST_BackoffNgrammarState *s, void *params)
Definition: ngrammar_aux.cc:187

EST_UItem
Definition: EST_UList.h:49

EST_DiscreteProbDistribution::samples
double samples(void) const
Total number of example found.
Definition: EST_simplestats.h:250

EST_Ngrammar::frequencies
Definition: EST_Ngrammar.h:226

EST_Ngrammar::backoff_representation
EST_BackoffNgrammarState * backoff_representation
Definition: EST_Ngrammar.h:257

EST_TVector::fill
void fill(const T &v)
Fill entire array will value v.
Definition: EST_TVector.cc:105

std

VAL_REGISTER_CLASS
#define VAL_REGISTER_CLASS(NAME, CLASS)
Definition: EST_Val_defs.h:62

EST_DiscreteProbDistribution::item_start
EST_Litem * item_start() const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:372

EST_Ngrammar::entry_type
entry_t entry_type() const
Definition: EST_Ngrammar.h:424

EST_DiscreteProbDistribution::item_end
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:380

EST_Ngrammar::p_states
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276

EST_Ngrammar::num_states
int num_states(void) const
Definition: EST_Ngrammar.h:413

EST_Ngrammar::sparse
Definition: EST_Ngrammar.h:221

EST_Ngrammar
Definition: EST_Ngrammar.h:216

EST_Ngrammar::backoff_traverse
void backoff_traverse(EST_BackoffNgrammarState *start_state, void(*function)(EST_BackoffNgrammarState *s, void *params), void *params)
Definition: EST_Ngrammar.cc:2647

max
float max(float a, float b)
Definition: EST_cluster.cc:143

EST_NgrammarState::pdf
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115

get_max_f
void get_max_f(EST_BackoffNgrammarState *s, void *params)
Definition: ngrammar_aux.cc:141

EST_DiscreteProbDistribution::item_freq
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
Definition: EST_DProbDist.cc:404

make_f_of_f
void make_f_of_f(EST_BackoffNgrammarState *s, void *params)
Definition: ngrammar_aux.cc:120

EST_BackoffNgrammarState::pdf
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:167

EST_Ngrammar::backoff_threshold
double backoff_threshold
Definition: EST_Ngrammar.h:259

min
float min(float a, float b)
Definition: EST_cluster.cc:138

EST_Ngrammar::representation
representation_t representation() const
Definition: EST_Ngrammar.h:425

EST_BackoffNgrammarState::pdf_const
const EST_DiscreteProbDistribution & pdf_const() const
Definition: EST_Ngrammar.h:166

frequency_of_frequencies
void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n, int this_order)
Definition: ngrammar_aux.cc:207

EST_DVector
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122

Good_Turing_smooth
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
Definition: ngrammar_aux.cc:410

EST_Ngrammar::get_vocab_length
int get_vocab_length() const
Definition: EST_Ngrammar.h:416

int
getString int
Definition: EST_item_aux.cc:50

EST_DiscreteProbDistribution::set_frequency
void set_frequency(const EST_String &s, double c)
Definition: EST_DProbDist.cc:270

EST_Ngrammar::p_num_states
int p_num_states
Definition: EST_Ngrammar.h:275

EST_String.h

EST_Ngrammar::order
int order() const
Definition: EST_Ngrammar.h:415

EST_Ngrammar::backoff
Definition: EST_Ngrammar.h:221

EST_DiscreteProbDistribution::override_frequency
void override_frequency(const EST_String &s, double c)
Sets the frequency of named item, without modifying num\_samples.
Definition: EST_DProbDist.cc:317

map_f_of_f
void map_f_of_f(EST_BackoffNgrammarState *s, void *params)
Definition: ngrammar_aux.cc:161

EST_TVector::n
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251

bool
#define bool
Definition: EST_bool.h:117

EST_TSimpleVector::resize
void resize(int n, int set=1)
resize vector
Definition: EST_TSimpleVector.cc:67

EST_Ngrammar.h

EST_String
Definition: EST_String.h:76