Edinburgh Speech Tools  2.1-release
ngram_test_main.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar test program */
37 /* */
38 /*=======================================================================*/
39 #include "EST.h"
40 #include "EST_Ngrammar.h"
41 
42 using namespace std;
43 
44 int main(int argc, char **argv)
45 {
46  //int order;
47  EST_StrList files,script;
48  EST_Option al, op;
49  EST_String wordlist_file, script_file, in_file, format;
50  EST_String prev_tag, prev_prev_tag, last_tag;
51  EST_Litem *p;
52  //EST_Ngrammar::representation_t representation =
53  //EST_Ngrammar::dense;
54 
55  EST_StrList wordlist;
56  EST_Ngrammar ngrammar;
57  bool per_file_stats=false;
58  bool raw_stats=false;
59  bool brief=false;
60  EST_String input_format;
61 
62  double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
63  total_count = 0;
64  total_raw_H = 0;
65 
67  (argc, argv,
68  EST_String("[input file0] [input file1] ...\n")+
69  "-g <ifile> grammar file (required)\n"+
70  "-w <ifile> filename containing word list (required for some grammar formats)\n"+
71  "-S <ifile> script file\n"+
72  "-raw_stats print unnormalised entropy and sample count\n"+
73  "-brief print results in brief format\n"+
74  "-f print stats for each file\n"+
75  "\n"+
76  "-input_format <string>\n"+
77  " format of input data (default sentence_per_line)\n"+
78  " may also be sentence_per_file, or ngram_per_line.\n"+
79  "\n"+
80  "Pseudo-words :\n"+
81  "-prev_tag <string>\n"+
82  " tag before sentence start\n"+
83  "-prev_prev_tag <string>\n"+
84  " all words before 'prev_tag'\n"+
85  "-last_tag <string>\n"+
86  " after sentence end\n"+
87  "-default_tags\n"+
88  " use default tags of "+SENTENCE_START_MARKER+
90  " respectively\n",
91  files, al);
92 
93 
94  if (al.present("-w"))
95  wordlist_file = al.val("-w");
96  else{
97  wordlist_file = "";
98  }
99 
100  if (al.present("-f"))
101  per_file_stats = true;
102  if (al.present("-input_format"))
103  input_format = al.val("-input_format");
104  else
105  input_format = "sentence_per_line";
106 
107  if ( al.present("-raw_stats") || al.present("-r"))
108  raw_stats = true;
109 
110  if ( al.present("-brief") || al.present("-b") )
111  brief = true;
112 
113 
114  if (al.present("-default_tags"))
115  {
116  prev_tag = SENTENCE_START_MARKER;
117  prev_prev_tag = SENTENCE_END_MARKER;
118  last_tag = SENTENCE_END_MARKER;
119  }
120 
121  if (al.present("-prev_tag"))
122  {
123  if (al.present("-default_tags"))
124  cerr << "test_ngram: WARNING : -prev_tag overrides -default_tags"
125  << endl;
126  prev_tag = al.val("-prev_tag");
127  }
128 
129  if (al.present("-prev_prev_tag"))
130  {
131  if (al.present("-default_tags"))
132  cerr << "test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
133  prev_prev_tag = al.val("-prev_prev_tag");
134  }
135 
136  if (al.present("-last_tag"))
137  {
138  if (al.present("-default_tags"))
139  cerr << "test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
140  last_tag = al.val("-last_tag");
141  }
142 
143  if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
144  && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
145  {
146  cerr << "test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
147  exit(1);
148  }
149 
150 
151  // script
152  if (al.present("-S"))
153  {
154  script_file = al.val("-S");
155 
156  if(load_StrList(script_file,script) != format_ok)
157  {
158  cerr << "test_ngram: Could not read script from file "
159  << script_file << endl;
160  exit(1);
161  }
162  }
163 
164  if (al.present("-g"))
165  in_file = al.val("-g");
166  else
167  {
168  cerr << "test_ngram: Must give a grammar filename using -g" << endl;
169  exit(1);
170  }
171 
172  // plus any files on command line
173  // except file "-" unless there is no script
174  if(script.head()==NULL)
175  script += files;
176  else
177  for(p=files.head();p!=0;p=p->next())
178  if(files(p) != "-")
179  script.append(files(p));
180 
181  if(script.head() == NULL)
182  {
183  cerr << "test_ngram: No test files given" << endl;
184  exit(1);
185  }
186 
187  if (wordlist_file != "")
188  {
189  // load wordlist
190  if (load_StrList(wordlist_file,wordlist) != format_ok)
191  {
192  cerr << "test_ngram: Could not read wordlist from file " << wordlist_file
193  << endl;
194  exit(1);
195  }
196 
197  // load grammar using wordlist
198  if (ngrammar.load(in_file,wordlist) != format_ok)
199  {
200  cerr << "test_ngram: Failed to load grammar" << endl;
201  exit(1);
202  }
203  }
204  else
205  {
206  if (ngrammar.load(in_file) != format_ok)
207  {
208  cerr << "test_ngram: Failed to load grammar" << endl;
209  exit(1);
210  }
211  }
212 
213  if (!brief)
214  {
215  cout << "Ngram Test Results" << endl;
216  cout << "==================" << endl;
217  }
218 
219  for (p = script.head(); p; p = p->next())
220  {
221  // test each file
222  if (test_stats(ngrammar,
223  script(p),
224  raw_entropy,count,
225  entropy,perplexity,
226  input_format,
227  prev_tag,
228  prev_prev_tag))
229  {
230  total_raw_H += raw_entropy;
231  total_count += count;
232 
233  if(per_file_stats)
234  {
235  if (brief)
236  cout << basename(script(p)) << " \t";
237  else
238  cout << script(p) << endl;
239 
240  if(raw_stats)
241  {
242  if (brief)
243  cout << raw_entropy << " " << count << " ";
244  else
245  {
246  cout << " raw entropy " << raw_entropy << endl;
247  cout << " count " << count << endl;
248  }
249  }
250 
251  if (brief)
252  cout << entropy << " " << perplexity << endl;
253  else
254  {
255  cout << " entropy " << entropy << endl;
256  cout << " perplexity " << perplexity << endl << endl;
257  }
258  }
259  }
260  else
261  {
262  cerr << "test_ngram: WARNING : file '" << script(p)
263  << "' could not be processed" << endl;
264  }
265 
266  }
267  if (total_count > 0)
268  {
269  if (!brief)
270  cout << "Summary for grammar " << in_file << endl;
271  else
272  if (per_file_stats)
273  cout << "summary \t";
274 
275  if(raw_stats)
276  {
277  if (brief)
278  cout << total_raw_H << " " << total_count << " ";
279  else
280  {
281  cout << " raw entropy " << total_raw_H << endl;
282  cout << " count " << total_count << endl;
283  }
284  }
285  if (brief)
286  {
287  cout << total_raw_H / total_count;
288  cout << " " << pow(2.0,total_raw_H / total_count);
289  cout << endl;
290  }
291  else
292  {
293  cout << " entropy " << total_raw_H / total_count << endl;
294  cout << " perplexity " << pow(2.0,total_raw_H / total_count);
295  cout << endl;
296  }
297  }
298  else
299  {
300  cerr << "test_ngram: No data processed" << endl;
301  }
302 
303  // everything went okay
304  return 0;
305 }
306 
307 
309 {
310  (void)a_list;
311  (void)al;
312 }
313 
EST_read_status load(const EST_String &filename)
#define SENTENCE_END_MARKER
Definition: EST_Ngrammar.h:60
EST_UItem * next()
Definition: EST_UList.h:55
int main(int argc, char **argv)
void override_lib_ops(EST_Option &a_list, EST_Option &al)
NULL
Definition: EST_WFST.cc:55
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
EST_String basename(EST_String full, EST_String ext="")
This acts like the bourne shell basename command. By default, it strips any leading path from a strin...
Definition: util_io.cc:167
#define format_ok
int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
EST_UItem * head() const
Definition: EST_UList.h:97
EST_String
#define SENTENCE_START_MARKER
Definition: EST_Ngrammar.h:59
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
Definition: cmd_line.cc:101
bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev, const EST_String &prev_prev, const EST_String &last)