Edinburgh Speech Tools  2.1-release
wagon.cc
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Alan W Black                                */
/*                 Date   :  May 1996                                    */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

using namespace std;
Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_held_out = 0;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static void find_best_question(WVectorVector &dset,WQuestion &best_ques);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *, NULL, NULL, WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a filename: one vector per line.
    // Assume all numbers are numbers and non-nums are categorical.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ; !ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that */
                /* has an index into VertexTrack and a number of vertices; */
                /* thus if it's 15, UnitTrack.a(15,0) is the start frame in */
                /* VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains more than the expected " <<
                dataset.width() << " parameters" << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}
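
/* A note on the expected input: each line of the data file is one vector of
 * whitespace-separated tokens, with field i interpreted according to the
 * loaded description.  For illustration only (hypothetical description and
 * data, not from the distribution): given fields (price float) (size float)
 * (colour a b c), the line
 *     1.25 0.3 b
 * becomes a WVector with two float fields and, in the third, the discrete
 * index of "b". */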

float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build initial node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}
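
/* A sketch of the flow above: the root node receives the training
 * partition of the data, wagon_split() grows the tree greedily from it,
 * held_out_prune() then prunes against the held-out partition when one
 * was requested, prune() applies Wagon's standard pruning pass, and the
 * returned score is summary_results() on the test set (or, failing that,
 * the training set). */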

static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // Set data omitting held_out percent if in is true,
    // or only including 100-held_out percent if in is false
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//      else if ((!in) && ((j%100 < held_out)))
//          data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//      if ((in) && (j < held_out))
//          data[i++] = ds(d);
//      else if ((!in) && (j >= held_out))
//          data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}
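
/* Held-out selection works modulo 100, so held_out is effectively a
 * percentage.  For illustration: with held_out = 5, the training pass
 * (in == TRUE) keeps every vector whose position mod 100 is >= 5, i.e.
 * roughly 95% of the data.  Note that with the currently active code the
 * in == FALSE pass returns all vectors, not just the held-out 5%; the
 * commented-out branches above show the stricter alternatives. */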

static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0,bpredicted=0,bactual=0;
    float precision=0,recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1: " << fmeasure << " Prec: " << precision
             << " Rec: " << recall << " B-Pred: " << bpredicted
             << " B-Actual: " << bactual << " B-Correct: " << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}
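
/* For reference, the summary measures printed above are:
 *   entropy    = -(1/N) * sum_v count(v) * ln p(predicted class at leaf)
 *   perplexity = 2 ^ entropy
 *   F1         = 2*P*R / (P+R), where P and R are the precision and recall
 *                of class "B" (only computed when wgn_opt_param is
 *                "B_NB_F1")
 * Since the log above is natural while perplexity is raised base 2, the
 * printed perplexity is not the usual e-based figure. */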

static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    ssize_t i,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (int j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//  cor = (xy.mean() - (x.mean()*y.mean()))/
//      (sqrt(xx.mean()-(x.mean()*x.mean())) *
//       sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}
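
/* For reference, the statistic computed above (here and in the trajectory,
 * float and OLS testers below) is the sample-weighted Pearson correlation
 *   r = (E[xy] - E[x]E[y]) / sqrt((E[x^2]-E[x]^2)(E[y^2]-E[y]^2)),
 * with the product of the two variance terms guarded so that rounding
 * can never push the argument of sqrt() negative. */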

static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    ssize_t i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//  cor = (xy.mean() - (x.mean()*y.mean()))/
//      (sqrt(xx.mean()-(x.mean()*x.mean())) *
//       sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        int rightnumber = 0;
        if (dataset.length() > 0)
            rightnumber = (int)(100.0*(float)right_cluster/(float)dataset.length());
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)  // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                rightnumber <<
                "%) mean ranking " << ranking.mean() << " mean distance " <<
                meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            rightnumber <<
            "%) mean ranking " << ranking.mean() << " mean distance " <<
            meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//  cor = (xy.mean() - (x.mean()*y.mean()))/
//      (sqrt(xx.mean()-(x.mean()*x.mean())) *
//       sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    /* WNode *leaf;  // unused */
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        /* leaf = */ tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//  cor = (xy.mean() - (x.mean()*y.mean()))/
//      (sqrt(xx.mean()-(x.mean()*x.mean())) *
//       sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    find_best_question(node.get_data(), q);

/*  printf("q.score() %f impurity %f\n",
           q.get_score(),
           node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // OK, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}
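
/* Growth stops at a node when find_best_question() returns nothing better
 * than WGN_HUGE_VAL (no askable question, e.g. every candidate violated
 * the stop/balance limits) or when the best question's score (the halved
 * sum of the two child impurities) is not below the node's own impurity
 * measure.  The margin argument is only used to indent the verbose trace. */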

void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    y.resize(q.get_yes());
    n.resize(q.get_no());

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);
}
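
/* The yes/no counts used to size y and n here were cached in the question
 * by score_question_set() when it was scored, so this pass only has to ask
 * the question once more per vector to route it to the correct side. */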

static void find_best_question(WVectorVector &dset,
                               WQuestion &best_ques)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return;
}

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}
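
/* For illustration: for a class feature with values {a, b, c} this scores
 * the three questions "feat is a", "feat is b" and "feat is c" and keeps
 * whichever gives the lowest summed impurity; subsets of values (e.g.
 * "feat in {a, c}") are only attempted by the disabled code below. */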

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the split of the data for each member of
    // the class.  Then order those splits.  Then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l=cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }
    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns sorted list of (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score.
    // Naively does this by partitioning the range into
    // wgn_float_range_split slots.
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the increments, */
    /* because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}
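
/* For illustration: if a float feature ranges over [0,10] and
 * wgn_float_range_split is 10, incr is 1 and the candidate questions are
 * "feat < 1", "feat < 2", ..., each scored with wgn_score_question(); the
 * threshold with the lowest summed impurity wins.  A finer split value
 * trades more question evaluations for better thresholds. */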

static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // Construct a question.  Not sure about this in general:
    // of course continuous/categorical features will require a different
    // rule, and non-binary ones will require some test point.

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // Score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if ((ignorenth < 2) ||
            (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    // printf("awb_debug score_question_set X %f Y %f\n",
    //        y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /* cout << q << endl;
       printf("test question y %f n %f b %f\n",
              ym, nm, bm); */

    return bm/2.0;
}
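
/* The stop/balance interaction, by example: with wgn_balance = 0 a split
 * is only legal if both sides keep at least wgn_min_cluster_size samples.
 * With, say, 1000 samples at the node and wgn_balance = 10, each side must
 * instead keep at least 1000/10 = 100 samples, which forces more balanced
 * trees.  Illegal splits are scored WGN_HUGE_VAL so find_best_question()
 * never picks them. */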

float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature, then incrementally add the features
    // that best improve the result until it stops improving.
    // This is basically to automate what Kurt was doing in building
    // trees; he then automated it in Perl, and as it seemed to work
    // I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // This skips the round not because this has anything to
            // do with this feature being (user specified) ignored
            // but because it indicates there is one less cycle that is
            // necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}
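
/* Stepwise selection is greedy forward selection: each round re-enables
 * one still-ignored feature at a time, rebuilds a full tree for it, and
 * permanently admits the feature whose tree scores best.  With w features
 * that is O(w^2) tree builds in the worst case.  A round must beat the
 * previous best score by more than limit percent of the new score to
 * continue; e.g. with limit = 1, an improvement from 0.800 to 0.805
 * (under 1%) stops the search. */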

static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue;  // user wants me to ignore this completely
        else if (i == wgn_predictee)  // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//              fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
//                      i,
//                      (const char *)wgn_dataset.feat_name(i),
//                      best_score);
//              fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}
end of line
Definition: EST_Token.cc:832