Edinburgh Speech Tools  2.1-release
ngrammar_utils.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1999 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : February 1999 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A rationalization of some of the general functions */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <cstring>
42 #include "EST_String.h"
43 #include "EST_Token.h"
44 #include "EST_error.h"
45 #include "EST_Ngrammar.h"
46 
47 using namespace std;
48 
49 static int get_next_window(EST_TokenStream &ts,
50  EST_StrVector &window,
51  const EST_String &input_format,
52  EST_Ngrammar &ngram)
53 {
54  int i;
55  if ((input_format == "sentence_per_line") ||
56  (input_format == "sentence_per_file"))
57  {
58  EST_String t = ts.get().string();
59  slide(window,-1);
60  window[ngram.order()-1] = t;
61  if (ngram.wordlist_index(t) == -1)
62  cerr << "EST_Ngrammar test: skipping bad word \"" <<
63  t << "\"" << endl;
64  }
65  else if (input_format == "ngram_per_line")
66  {
67  for (i=0; i < ngram.order(); i++)
68  {
69  EST_String t = ts.get().string();
70  window[i] = t;
71  if (ngram.wordlist_index(t) == -1)
72  cerr << "EST_Ngrammar test: skipping bad word \"" <<
73  t << "\"" << endl;
74  }
75  }
76  else
77  EST_error("EST_Ngrammar test: unknown input format \"%s\"\n",
78  (const char *)input_format);
79 
80  // Sigh, you pull a little thread and it all falls down
81  // For the time being can only deal in StrVectors rather than
82  // IVectors
83  for (i=0; i < ngram.order(); i++)
84  if (ngram.wordlist_index(window(i)) == -1)
85  return FALSE;
86  return TRUE;
87 }
88 
89 bool test_stats(EST_Ngrammar &ngram,
90  const EST_String &filename,
91  double &raw_entropy,
92  double &count,
93  double &entropy,
94  double &perplexity,
95  const EST_String &input_format,
96  const EST_String &prev,
97  const EST_String &prev_prev,
98  const EST_String &last)
99 {
100  // Apply an ngram to some data and report on its performance
101  // Output entropy and test set perplexity
102  // H = -1/Q . log P(wi | wi-1, wi-2, ... wi-n)
103  // H_p = 2^H
104  // Rabiner and Juang p450
105  EST_TokenStream ts;
106  double H,prob;
107  int Q;
108  EST_StrVector window(ngram.order());
109  (void)last;
110 
111  if (filename == "-")
112  ts.open(stdin,FALSE);
113  else if (ts.open(filename) == -1)
114  EST_error("EST_Ngrammar test: unable to open test file \"%s\"\n",
115  (const char *)filename);
116 
117  Q=0;
118  H=0.0;
119  ngram.fill_window_start(window,prev,prev_prev);
120 
121  while (!ts.eof() &&
122  (get_next_window(ts,window,input_format,ngram) == TRUE))
123  {
124  prob = ngram.probability(window);
125  H += log(prob);
126  Q++;
127  if ((input_format == "sentence_per_line") && (ts.eoln()))
128  ngram.fill_window_start(window,prev,prev_prev);
129  }
130 
131  count = Q;
132  raw_entropy = -1 * H;
133  entropy = -1 * (H/Q);
134  perplexity = pow(2.0,entropy);
135 
136 // printf("count %g entropy %g perplexity %g\n",
137 // count,entropy,perplexity);
138 
139  return true;
140 }
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
int wordlist_index(const EST_String &word, const bool report=true) const
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
double probability(const EST_StrVector &words, bool force=false, const bool trace=false) const
int eof()
end of file
Definition: EST_Token.h:362
STATIC HISTORY H
Definition: editline.c:120
void fill_window_start(EST_IVector &window, const EST_String &prev, const EST_String &prev_prev) const
#define FALSE
Definition: EST_bool.h:119
#define EST_error
Definition: EST_error.h:104
int order() const
Definition: EST_Ngrammar.h:415
#define TRUE
Definition: EST_bool.h:118
void slide(EST_IVector &v, const int l)
int eoln()
end of line
Definition: EST_Token.cc:832
bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev, const EST_String &prev_prev, const EST_String &last)