44 int main(
int argc,
char **argv)
49 EST_String wordlist_file, script_file, in_file, format;
57 bool per_file_stats=
false;
62 double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
68 EST_String(
"[input file0] [input file1] ...\n")+
69 "-g <ifile> grammar file (required)\n"+
70 "-w <ifile> filename containing word list (required for some grammar formats)\n"+
71 "-S <ifile> script file\n"+
72 "-raw_stats print unnormalised entropy and sample count\n"+
73 "-brief print results in brief format\n"+
74 "-f print stats for each file\n"+
76 "-input_format <string>\n"+
77 " format of input data (default sentence_per_line)\n"+
78 " may also be sentence_per_file, or ngram_per_line.\n"+
81 "-prev_tag <string>\n"+
82 " tag before sentence start\n"+
83 "-prev_prev_tag <string>\n"+
84 " all words before 'prev_tag'\n"+
85 "-last_tag <string>\n"+
86 " after sentence end\n"+
95 wordlist_file = al.
val(
"-w");
101 per_file_stats =
true;
102 if (al.
present(
"-input_format"))
103 input_format = al.
val(
"-input_format");
105 input_format =
"sentence_per_line";
114 if (al.
present(
"-default_tags"))
123 if (al.
present(
"-default_tags"))
124 cerr <<
"test_ngram: WARNING : -prev_tag overrides -default_tags"
126 prev_tag = al.
val(
"-prev_tag");
129 if (al.
present(
"-prev_prev_tag"))
131 if (al.
present(
"-default_tags"))
132 cerr <<
"test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
133 prev_prev_tag = al.
val(
"-prev_prev_tag");
138 if (al.
present(
"-default_tags"))
139 cerr <<
"test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
140 last_tag = al.
val(
"-last_tag");
143 if ( ( (prev_tag==
"") || (prev_prev_tag==
"") || (last_tag==
"") )
144 && ( (prev_tag!=
"") || (prev_prev_tag!=
"") || (last_tag!=
"") ) )
146 cerr <<
"test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
154 script_file = al.
val(
"-S");
158 cerr <<
"test_ngram: Could not read script from file "
159 << script_file << endl;
165 in_file = al.
val(
"-g");
168 cerr <<
"test_ngram: Must give a grammar filename using -g" << endl;
177 for(p=files.
head();p!=0;p=p->
next())
183 cerr <<
"test_ngram: No test files given" << endl;
187 if (wordlist_file !=
"")
192 cerr <<
"test_ngram: Could not read wordlist from file " << wordlist_file
200 cerr <<
"test_ngram: Failed to load grammar" << endl;
208 cerr <<
"test_ngram: Failed to load grammar" << endl;
215 cout <<
"Ngram Test Results" << endl;
216 cout <<
"==================" << endl;
219 for (p = script.
head(); p; p = p->
next())
230 total_raw_H += raw_entropy;
231 total_count += count;
236 cout <<
basename(script(p)) <<
" \t";
238 cout << script(p) << endl;
243 cout << raw_entropy <<
" " << count <<
" ";
246 cout <<
" raw entropy " << raw_entropy << endl;
247 cout <<
" count " << count << endl;
252 cout << entropy <<
" " << perplexity << endl;
255 cout <<
" entropy " << entropy << endl;
256 cout <<
" perplexity " << perplexity << endl << endl;
262 cerr <<
"test_ngram: WARNING : file '" << script(p)
263 <<
"' could not be processed" << endl;
270 cout <<
"Summary for grammar " << in_file << endl;
273 cout <<
"summary \t";
278 cout << total_raw_H <<
" " << total_count <<
" ";
281 cout <<
" raw entropy " << total_raw_H << endl;
282 cout <<
" count " << total_count << endl;
287 cout << total_raw_H / total_count;
288 cout <<
" " << pow(2.0,total_raw_H / total_count);
293 cout <<
" entropy " << total_raw_H / total_count << endl;
294 cout <<
" perplexity " << pow(2.0,total_raw_H / total_count);
300 cerr <<
"test_ngram: No data processed" << endl;
EST_read_status load(const EST_String &filename)
#define SENTENCE_END_MARKER
int main(int argc, char **argv)
void override_lib_ops(EST_Option &a_list, EST_Option &al)
const V & val(const K &rkey, bool m=0) const
Return the value associated with the given key (const version).
void append(const T &item)
Add an item onto the end of the list.
EST_String basename(EST_String full, EST_String ext="")
This acts like the Bourne shell basename command. By default, it strips any leading path from a string...
int present(const K &rkey) const
Returns true if key is present.
#define SENTENCE_START_MARKER
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
bool test_stats(EST_Ngrammar &ngram, const EST_String &filename, double &raw_entropy, double &count, double &entropy, double &perplexity, const EST_String &input_format, const EST_String &prev, const EST_String &prev_prev, const EST_String &last)