Edinburgh Speech Tools  2.1-release
wagon_test_main.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* A program for testing a CART tree against data, also may be used to */
37 /* predict values using a tree and data */
38 /* */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include <iostream>
42 #include <fstream>
43 #include <cstring>
44 #include "EST_Wagon.h"
45 #include "EST_cutils.h"
46 #include "EST_multistats.h"
47 #include "EST_Token.h"
48 #include "EST_cmd_line.h"
49 
50 using namespace std;
51 
52 static int wagon_test_main(int argc, char **argv);
53 static LISP find_feature_value(const char *feature,
54  LISP vector, LISP description);
55 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
56 static LISP get_data_vector(EST_TokenStream &data, LISP description);
57 static void simple_predict(EST_TokenStream &data, FILE *output,
58  LISP tree, LISP description, int all_info);
59 static void test_tree_class(EST_TokenStream &data, FILE *output,
60  LISP tree, LISP description);
61 static void test_tree_float(EST_TokenStream &data, FILE *output,
62  LISP tree, LISP description);
63 
64 
65 int main(int argc, char **argv)
66 {
67 
68  wagon_test_main(argc,argv);
69 
70  exit(0);
71  return 0;
72 }
73 
74 static int wagon_test_main(int argc, char **argv)
75 {
76  // Top level function sets up data and creates a tree
77  EST_Option al;
78  EST_StrList files;
79  LISP description,tree=NIL;;
80  EST_TokenStream data;
81  FILE *wgn_output;
82 
84  (argc, argv,
85  EST_String("<options>\n")+
86  "Summary: program to test CART models on data\n"+
87  "-desc <ifile> Field description file\n"+
88  "-data <ifile> Datafile, one vector per line\n"+
89  "-tree <ifile> File containing CART tree\n"+
90  "-track <ifile>\n"+
91  " track for vertex indices\n"+
92  "-predict Predict for each vector returning full vector\n"+
93  "-predict_val Predict for each vector returning just value\n"+
94  "-predictee <string>\n"+
95  " name of field to predict (default is first field)\n"+
96  "-heap <int> {210000}\n"+
97  " Set size of Lisp heap, should not normally need\n"+
98  " to be changed from its default\n"+
99  "-o <ofile> File to save output in\n",
100  files, al);
101 
102  siod_init(al.ival("-heap"));
103 
104  if (al.present("-desc"))
105  {
106  gc_protect(&description);
107  description = car(vload(al.val("-desc"),1));
108  }
109  else
110  {
111  cerr << argv[0] << ": no description file specified" << endl;
112  exit(-1);
113  }
114 
115  if (al.present("-tree"))
116  {
117  gc_protect(&tree);
118  tree = car(vload(al.val("-tree"),1));
119  if (tree == NIL)
120  {
121  cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
122  << "\"" << endl;
123  exit(-1);
124  }
125  }
126  else
127  {
128  cerr << argv[0] << ": no tree file specified" << endl;
129  exit(-1);
130  }
131 
132  if (al.present("-data"))
133  {
134  if (data.open(al.val("-data")) != 0)
135  {
136  cerr << argv[0] << ": can't open data file \"" <<
137  al.val("-data") << "\" for input." << endl;
138  exit(-1);
139  }
140  }
141  else
142  {
143  cerr << argv[0] << ": no data file specified" << endl;
144  exit(-1);
145  }
146 
147  if (al.present("-track"))
148  {
149  wgn_VertexTrack.load(al.val("-track"));
150  }
151 
152  if (al.present("-o"))
153  {
154  if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
155  {
156  cerr << argv[0] << ": can't open output file \"" <<
157  al.val("-o") << "\"" << endl;
158  }
159  }
160  else
161  wgn_output = stdout;
162 
163  if (al.present("-predictee"))
164  {
165  LISP l;
166  int i;
167  wgn_predictee_name = al.val("-predictee");
168  for (l=description,i=0; l != NIL; l=cdr(l),i++)
170  {
171  wgn_predictee = i;
172  break;
173  }
174  if (l==NIL)
175  {
176  cerr << argv[0] << ": predictee \"" << wgn_predictee <<
177  "\" not in description\n";
178  }
179  }
180  const char *predict_type =
181  get_c_string(car(cdr(siod_nth(wgn_predictee,description))));
182 
183  if (al.present("-predict"))
184  simple_predict(data,wgn_output,tree,description,FALSE);
185  else if (al.present("-predict_val"))
186  simple_predict(data,wgn_output,tree,description,TRUE);
187  else if (streq(predict_type,"float") ||
188  streq(predict_type,"int"))
189  test_tree_float(data,wgn_output,tree,description);
190 #if 0
191  else if (streq(predict_type,"vector"))
192  test_tree_vector(data,wgn_output,tree,description);
193 #endif
194  else
195  test_tree_class(data,wgn_output,tree,description);
196 
197  if (wgn_output != stdout)
198  fclose(wgn_output);
199  data.close();
200  return 0;
201 }
202 
203 static LISP get_data_vector(EST_TokenStream &data, LISP description)
204 {
205  // read in one vector. Should be terminated with an newline
206  LISP v=NIL,d;
207 
208  if (data.eof())
209  return NIL;
210 
211  for (d=description; d != NIL; d=cdr(d))
212  {
213  EST_Token t = data.get();
214 
215  if ((d != description) && (t.whitespace().contains("\n")))
216  {
217  cerr << "wagon_test: unexpected newline within vector " <<
218  t.row() << " wrong number of features" << endl;
219  siod_error();
220  }
221  if (streq(get_c_string(car(cdr(car(d)))),"float") ||
222  streq(get_c_string(car(cdr(car(d)))),"int"))
223  v = cons(flocons(atof(t.string())),v);
224  else if ((streq(get_c_string(car(cdr(car(d)))),"_other_")) &&
225  (siod_member_str(t.string(),cdr(car(d))) == NIL))
226  v = cons(strintern("_other_"),v);
227  else
228  v = cons(strintern(t.string()),v);
229  }
230 
231  return reverse(v);
232 }
233 
234 static void simple_predict(EST_TokenStream &data, FILE *output,
235  LISP tree, LISP description, int all_info)
236 {
237  LISP vector,predict;
238  EST_String val;
239 
240  for (vector=get_data_vector(data,description);
241  vector != NIL; vector=get_data_vector(data,description))
242  {
243  predict = wagon_vector_predict(tree,vector,description);
244  if (all_info)
245  val = siod_sprint(car(reverse(predict)));
246  else
247  val = siod_sprint(predict);
248  fprintf(output,"%s\n",(const char *)val);
249  }
250 }
251 
252 static void test_tree_float(EST_TokenStream &data, FILE *output,
253  LISP tree, LISP description)
254 {
255  // Test tree against data to get summary of results FLOAT
256  float predict_val,real_val;
257  EST_SuffStats x,y,xx,yy,xy,se,e;
258  double cor,error;
259  LISP vector,predict;
260 
261  for (vector=get_data_vector(data,description);
262  vector != NIL; vector=get_data_vector(data,description))
263  {
264  predict = wagon_vector_predict(tree,vector,description);
265  predict_val = get_c_float(car(reverse(predict)));
266  real_val = get_c_float(siod_nth(wgn_predictee,vector));
267  x += predict_val;
268  y += real_val;
269  error = predict_val-real_val;
270  se += error*error;
271  e += fabs(error);
272  xx += predict_val*predict_val;
273  yy += real_val*real_val;
274  xy += predict_val*real_val;
275  }
276 
277  cor = (xy.mean() - (x.mean()*y.mean()))/
278  (sqrt(xx.mean()-(x.mean()*x.mean())) *
279  sqrt(yy.mean()-(y.mean()*y.mean())));
280 
281  fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
282  sqrt(se.mean()),
283  cor,
284  e.mean(),
285  e.stddev());
286 }
287 
288 static void test_tree_class(EST_TokenStream &data, FILE *output,
289  LISP tree, LISP description)
290 {
291  // Test tree against class data to get summary of results
292  EST_StrStr_KVL pairs;
293  EST_StrList lex;
294  EST_String predict_class,real_class;
295  LISP vector,w,predict;
296  double H=0,Q=0,prob;
297  (void)output;
298 
299  for (vector=get_data_vector(data,description);
300  vector != NIL; vector=get_data_vector(data,description))
301  {
302  predict = wagon_vector_predict(tree,vector,description);
303  predict_class = get_c_string(car(reverse(predict)));
304  real_class = get_c_string(siod_nth(wgn_predictee,vector));
305  prob = get_c_float(car(cdr(siod_assoc_str(real_class,
306  predict))));
307  if (prob == 0)
308  H += log(0.000001);
309  else
310  H += log(prob);
311  Q ++;
312  pairs.add_item(real_class,predict_class,1);
313  }
314  for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
315  lex.append(get_c_string(car(w)));
316 
317  const EST_FMatrix &m = confusion(pairs,lex);
318  print_confusion(m,pairs,lex);
319  fprintf(stdout,";; entropy %g perplexity %g\n",
320  (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
321 }
322 
// NOTE(review): everything down to the matching #endif is compiled out.
// The disabled function currently mirrors the logic of test_tree_class():
// it gathers (actual, predicted) class pairs, prints a confusion matrix
// and an entropy/perplexity line on stdout, and ignores `output`.
// Presumably a placeholder for vector-valued predictees -- confirm before
// enabling.
323 #if 0
324 static void test_tree_vector(EST_TokenStream &data, FILE *output,
325  LISP tree, LISP description)
326 {
327  // Test tree against class data to get summary of results
328  // Note we are talking about predicting vectors (a *bunch* of
329  // numbers, not just a single class here)
330  EST_StrStr_KVL pairs;
331  EST_StrList lex;
332  EST_String predict_class,real_class;
333  LISP vector,w,predict;
334  double H=0,Q=0,prob;
335  (void)output;
336 
337  for (vector=get_data_vector(data,description);
338  vector != NIL; vector=get_data_vector(data,description))
339  {
340  predict = wagon_vector_predict(tree,vector,description);
341  predict_class = get_c_string(car(reverse(predict)));
342  real_class = get_c_string(siod_nth(wgn_predictee,vector));
343  prob = get_c_float(car(cdr(siod_assoc_str(real_class,
344  predict))));
345  if (prob == 0)
346  H += log(0.000001);
347  else
348  H += log(prob);
349  Q ++;
350  pairs.add_item(real_class,predict_class,1);
351  }
352  for (w=cdr(siod_nth(wgn_predictee,description)); w != NIL; w = cdr(w))
353  lex.append(get_c_string(car(w)));
354 
355  const EST_FMatrix &m = confusion(pairs,lex);
356  print_confusion(m,pairs,lex);
357  fprintf(stdout,";; entropy %g perplexity %g\n",
358  (-1*(H/Q)),pow(2.0,(-1*(H/Q))));
359 }
360 #endif
361 
362 static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
363 {
364  // Using the LISP tree, vector and description, do standard prediction
365 
366  if (cdr(tree) == NIL)
367  return car(tree);
368 
369  LISP value = find_feature_value(wgn_ques_feature(car(tree)),
370  vector, description);
371 
372  if (wagon_ask_question(car(tree),value))
373  // Yes answer
374  return wagon_vector_predict(car(cdr(tree)),vector,description);
375  else
376  // No answer
377  return wagon_vector_predict(car(cdr(cdr(tree))),vector,description);
378 }
379 
380 static LISP find_feature_value(const char *feature,
381  LISP vector, LISP description)
382 {
383  LISP v,d;
384 
385  for (v=vector,d=description; v != NIL; v=cdr(v),d=cdr(d))
386  if (streq(feature,get_c_string(car(car(d)))))
387  return car(v);
388 
389  cerr << "wagon_test: can't find feature \"" << feature <<
390  "\" in description" << endl;
391  siod_error();
392  return NIL;
393 
394 }
395 
int row(void) const
Line number in original EST_TokenStream.
Definition: EST_Token.h:185
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
int main(int argc, char **argv)
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
Definition: EST_String.h:365
float get_c_float(LISP x)
Definition: slib.cc:1858
double stddev(void) const
standard deviation of currently accumulated values
#define siod_error()
Definition: siod.h:211
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:82
int wgn_predictee
Definition: wagon.cc:73
double mean(void) const
mean of currently accumulated values
void close(void)
Close stream.
Definition: EST_Token.cc:419
#define NIL
Definition: siod_defs.h:92
LISP strintern(const char *data)
Definition: slib_str.cc:22
const EST_String & whitespace()
Definition: EST_Token.h:112
#define streq(X, Y)
Definition: EST_cutils.h:57
LISP siod_assoc_str(const char *key, LISP alist)
Definition: siod.cc:125
LISP siod_nth(int nth, LISP list)
Definition: siod.cc:214
EST_Track error(EST_Track &ref, EST_Track &test, int relax=0)
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
LISP vload(const char *fname, long cflag)
Definition: slib_file.cc:632
const char * get_c_string(LISP x)
Definition: slib.cc:638
void print_confusion(const EST_FMatrix &a, EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:77
int eof()
end of file
Definition: EST_Token.h:362
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
Definition: EST_Track.cc:1312
STATIC HISTORY H
Definition: editline.c:120
EST_String siod_sprint(LISP exp)
Definition: slib_file.cc:208
LISP cons(LISP x, LISP y)
Definition: slib_list.cc:97
EST_FMatrix confusion(EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:59
#define FALSE
Definition: EST_bool.h:119
NULL
Definition: EST_WFST.cc:55
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
EST_String wgn_predictee_name
Definition: wagon.cc:74
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition: EST_TKVL.cc:248
EST_Track wgn_VertexTrack
Definition: wagon.cc:62
const EST_String & string() const
Definition: EST_Token.h:120
int wagon_ask_question(LISP question, LISP value)
Definition: wagonint.cc:49
int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
LISP flocons(double x)
Definition: slib.cc:673
void gc_protect(LISP *location)
Definition: slib.cc:791
int siod_init(int heap_size=DEFAULT_HEAP_SIZE)
Definition: siod.cc:58
LISP car(LISP x)
Definition: slib_list.cc:115
int tree
Definition: rxp.c:21
EST_String
#define TRUE
Definition: EST_bool.h:118
void reverse(EST_Wave &sig)
LISP siod_member_str(const char *key, LISP list)
Definition: siod.cc:167
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
Definition: cmd_line.cc:101
LISP cdr(LISP x)
Definition: slib_list.cc:124
#define wgn_ques_feature(X)
Definition: EST_Wagon.h:285