speech-tools/freqsmooth_8cc_source.html

 /*************************************************************************/
 /*                                                                       */
 /*                Centre for Speech Technology Research                  */
 /*                     University of Edinburgh, UK                       */
 /*                         Copyright (c) 1997                            */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*                  Author :  Paul Taylor & Simon King & Alan W Black    */
 /*                  Date   :  April 1997                                 */
 /*-----------------------------------------------------------------------*/
 /*                                                                       */
 /* Yet another method for smoothing an ngram                             */
 /*                                                                       */
 /* Good_Turing smooth n-grammar so there are no zero occurrences         */
 /* For each ngram                                                        */
 /*    if |n-1gram| < threshold                                           */
 /*        F(ngram) = !n-1gram|*(P(n-1gram))                              */
 /*                                                                       */
 /*=======================================================================*/
 #include <iostream>
 #include <cstring>
 #include <cmath>
 #include <climits>
 #include <cfloat>
 #include "EST_Ngrammar.h"

 using namespace std;

 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
                        int order,const EST_StrVector words,
                        int smooth_thresh);

 void Ngram_freqsmooth(EST_Ngrammar &ngram,int smooth_thresh1,
                int smooth_thresh2)
 {
     //  To assign reasonable frequencies for all ngrams in grammar
     EST_Ngrammar *backoff_ngrams;
     backoff_ngrams = new EST_Ngrammar[ngram.order()-1];

     Good_Turing_smooth(ngram,smooth_thresh1);

     fs_build_backoff_ngrams(backoff_ngrams,ngram);

     fs_backoff_smooth(backoff_ngrams,ngram,smooth_thresh2);

     delete [] backoff_ngrams;

 }

 void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
                  EST_Ngrammar &ngram)
 {
     // Build all the backoff grammars back to uni-grams
     int i,j,l;
     EST_Litem *k;

     for (i=0; i < ngram.order()-1; i++)
     backoff_ngrams[i].init(i+1,EST_Ngrammar::dense,
                    *ngram.vocab,*ngram.pred_vocab);

     for (i=0; i < ngram.num_states(); i++)
     {
     const EST_StrVector words = ngram.make_ngram_from_index(i);

     for (k=ngram.p_states[i].pdf().item_start();
          !ngram.p_states[i].pdf().item_end(k);
          k = ngram.p_states[i].pdf().item_next(k))
     {
         double freq;
         EST_String name;
         ngram.p_states[i].pdf().item_freq(k,name,freq);
         // Build all the sub-ngrams and accumulate them
         for (j=0; j < ngram.order()-1; j++)
         {
         EST_StrVector nnn(j+1);
         nnn[j] = name;
         for (l=0; l < j; l++)
             nnn[l] = words(ngram.order()-1-j);
         backoff_ngrams[j].accumulate(nnn,freq);
         }
     }
     }

 }

 int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
              EST_Ngrammar &ngram, int smooth_thresh)
 {
     // For all ngrams which are too infrequent, adjust their
     // frequencies based on their backoff probabilities
     int i;
     EST_Litem *j;
     double occurs;
     double backoff_prob;

     if (ngram.representation() != EST_Ngrammar::dense)
     {
     cerr << "Ngrammar: can only ptsmooth dense ngrammars" << endl;
     return FALSE;
     }
     else
     {
     for (i=0; i < ngram.num_states(); i++)
     {
         if (ngram.p_states[i].pdf().samples() < smooth_thresh)
         {
         EST_DiscreteProbDistribution &pdf = ngram.p_states[i].pdf();
         occurs = ngram.p_states[i].pdf().samples();
         EST_StrVector words = ngram.make_ngram_from_index(i);
         words.resize(words.n()+1);

         for (j=pdf.item_start();
              ! pdf.item_end(j);
              j = pdf.item_next(j))
         {
             EST_String name;
             double freq;
             pdf.item_freq(j,name,freq);
             words[words.n()-1] = name;
             backoff_prob =
             fs_find_backoff_prob(backoff_ngrams,
                          ngram.order()-1,
                          words,
                          smooth_thresh);
             pdf.set_frequency(j,occurs*backoff_prob);
         }
         }
     }
     }

     return TRUE;
 }

 static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
                    int order,const EST_StrVector words,
                    int smooth_thresh)
 {
     // Find the backoff prob for n-1gram
     EST_StrVector nnn;
     int i;

     if (order == 0)
     return TINY_FREQ;   // ultimate floor

     nnn.resize(order);

     for(i=0; i<order; i++)
     nnn[order-1-i] = words(words.n()-1-i);

     if (backoff_ngrams[order-1].frequency(nnn) < smooth_thresh)
     return fs_find_backoff_prob(backoff_ngrams,
                        order-1,words,smooth_thresh);
     else
     return backoff_ngrams[order-1].probability(nnn);
 }
fs_backoff_smooth
int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram, int smooth_thresh)
Definition: freqsmooth.cc:111

EST_DiscreteProbDistribution::item_next
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:388

TINY_FREQ
#define TINY_FREQ
Definition: EST_Ngrammar.h:70

EST_Ngrammar::dense
Definition: EST_Ngrammar.h:221

EST_DiscreteProbDistribution
Definition: EST_simplestats.h:210

EST_UItem
Definition: EST_UList.h:49

EST_DiscreteProbDistribution::samples
double samples(void) const
Total number of example found.
Definition: EST_simplestats.h:250

EST_Ngrammar::accumulate
void accumulate(const EST_StrVector &words, const double count=1)
Definition: EST_Ngrammar.cc:885

std

EST_Ngrammar::vocab
EST_Discrete * vocab
Definition: EST_Ngrammar.h:283

EST_DiscreteProbDistribution::item_start
EST_Litem * item_start() const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:372

EST_Ngrammar::pred_vocab
EST_Discrete * pred_vocab
Definition: EST_Ngrammar.h:284

EST_DiscreteProbDistribution::item_end
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:380

EST_Ngrammar::p_states
EST_NgrammarState * p_states
Definition: EST_Ngrammar.h:276

EST_Ngrammar::num_states
int num_states(void) const
Definition: EST_Ngrammar.h:413

EST_Ngrammar
Definition: EST_Ngrammar.h:216

EST_NgrammarState::pdf
EST_DiscreteProbDistribution & pdf()
Definition: EST_Ngrammar.h:115

EST_Ngrammar::probability
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
Definition: EST_Ngrammar.cc:1847

EST_TVector::resize
void resize(ssize_t n, int set=1)
Definition: EST_TVector.cc:196

EST_DiscreteProbDistribution::item_freq
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
Definition: EST_DProbDist.cc:404

EST_Ngrammar::representation
representation_t representation() const
Definition: EST_Ngrammar.h:425

FALSE
#define FALSE
Definition: EST_bool.h:119

words
section options Options< strong > or ngram_per_line Pseudo words
Definition: ngram_test_man.dox:64

Good_Turing_smooth
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
Definition: ngrammar_aux.cc:410

EST_DiscreteProbDistribution::set_frequency
void set_frequency(const EST_String &s, double c)
Definition: EST_DProbDist.cc:270

EST_Ngrammar::make_ngram_from_index
const EST_StrVector & make_ngram_from_index(const int i) const
Definition: EST_Ngrammar.cc:622

EST_Ngrammar::order
int order() const
Definition: EST_Ngrammar.h:415

EST_TVector< EST_String >

fs_build_backoff_ngrams
void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams, EST_Ngrammar &ngram)
Definition: freqsmooth.cc:75

TRUE
#define TRUE
Definition: EST_bool.h:118

EST_TVector::n
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251

Ngram_freqsmooth
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)
Definition: freqsmooth.cc:58

EST_Ngrammar.h

EST_String
Definition: EST_String.h:76