Edinburgh Speech Tools  2.1-release
wagon_main.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-2006 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Breiman et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* */
42 /* Added vector support for Clustergen 2005/2006 */
43 /* */
44 /*=======================================================================*/
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
51 
52 using namespace std;
53 
55 
// Model-building strategies that can be selected on the command line.
enum wn_strategy_type {wn_decision_list, wn_decision_tree};

// Strategy used by wagon_main(); a decision tree unless -dlist is given.
static wn_strategy_type wagon_type = wn_decision_tree;

static int wagon_main(int argc, char **argv);
59 
60 
61 int main(int argc, char **argv)
62 {
63 
64  wagon_main(argc,argv);
65 
66  exit(0);
67  return 0;
68 }
69 
70 static int set_Vertex_Feats(EST_Track &wgn_VertexFeats,
71  EST_String &wagon_track_features)
72 {
73  int i,s=0,e;
74  EST_TokenStream ts;
75 
76  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
77  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 0.0;
78 
79  ts.open_string(wagon_track_features);
80  ts.set_WhiteSpaceChars(",- ");
83  ts.set_SingleCharSymbols("");
84 
85  while (!ts.eof())
86  {
87  EST_Token &token = ts.get();
88  const EST_String ws = (const char *)token.whitespace();
89  if (token == "all")
90  {
91  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
92  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 1.0;
93  break;
94  } else if ((ws == ",") || (ws == ""))
95  {
96  s = atoi(token.string());
97  wgn_VertexFeats.a(static_cast<ssize_t>(0),s) = 1.0;
98  } else if (ws == "-")
99  {
100  if (token == "")
101  e = wgn_VertexFeats.num_channels()-1;
102  else
103  e = atoi(token.string());
104  for (i=s; i<=e && i<wgn_VertexFeats.num_channels(); i++)
105  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 1.0;
106  } else
107  {
108  printf("wagon: track_feats invalid: %s at position %lld\n",
109  (const char *)wagon_track_features,
110  (long long int) ts.filepos());
111  exit(-1);
112  }
113  }
114 
115  return 0;
116 }
117 
118 static int wagon_main(int argc, char **argv)
119 {
120  // Top level function sets up data and creates a tree
121  EST_Option al;
122  EST_StrList files;
123  EST_String wgn_oname;
124  ostream *wgn_coutput = 0;
125  float stepwise_limit = 0;
126  int feats_start=0, feats_end=0;
127  int i;
128 
130  (argc, argv,
131  EST_String("[options]\n") +
132  "Summary: CART building program\n"+
133  "-desc <ifile> Field description file\n"+
134  "-data <ifile> Datafile, one vector per line\n"+
135  "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
136  "-test <ifile> Datafile to test tree on\n"+
137  "-frs <float> {10} Float range split, number of partitions to\n"+
138  " split a float feature range into\n"+
139  "-dlist Build a decision list (rather than tree)\n"+
140  "-dtree Build a decision tree (rather than list) default\n"+
141  "-output <ofile> \n"+
142  "-o <ofile> File to save output tree in\n"+
143  "-distmatrix <ifile>\n"+
144  " A distance matrix for clustering\n"+
145  "-track <ifile>\n"+
146  " track for vertex indices\n"+
147  "-track_start <int>\n"+
148  " start channel vertex indices\n"+
149  "-track_end <int>\n"+
150  " end (inclusive) channel for vertex indices\n"+
151  "-track_feats <string>\n"+
152  " Track features to use, comma separated list\n"+
153  " with feature numbers and/or ranges, 0 start\n"+
154  "-unittrack <ifile>\n"+
155  " track for unit start and length in vertex track\n"+
156  "-quiet No questions printed during building\n"+
157  "-verbose Lost of information printing during build\n"+
158  "-predictee <string>\n"+
159  " name of field to predict (default is first field)\n"+
160  "-ignore <string>\n"+
161  " Filename or bracket list of fields to ignore\n"+
162  "-count_field <string>\n"+
163  " Name of field containing count weight for samples\n"+
164  "-stepwise Incrementally find best features\n"+
165  "-swlimit <float> {0.0}\n"+
166  " Percentage necessary improvement for stepwise,\n"+
167  " may be negative.\n"+
168  "-swopt <string> Parameter to optimize for stepwise, for \n"+
169  " classification options are correct or entropy\n"+
170  " for regression options are rmse or correlation\n"+
171  " correct and correlation are the defaults\n"+
172  "-balance <float> For derived stop size, if dataset at node, divided\n"+
173  " by balance is greater than stop it is used as stop\n"+
174  " if balance is 0 (default) always use stop as is.\n"+
175  "-vertex_output <string> Output <mean> or <best> of cluster\n"+
176  "-held_out <int> Percent to hold out for pruning\n"+
177  "-heap <int> {210000}\n"+
178  " Set size of Lisp heap, should not normally need\n"+
179  " to be changed from its default, only with *very*\n"+
180  " large description files (> 1M)\n"+
181  "-noprune No (same class) pruning required\n",
182  files, al);
183 
184  if (al.present("-held_out"))
185  wgn_held_out = al.ival("-held_out");
186  if (al.present("-balance"))
187  wgn_balance = al.fval("-balance");
188  if ((!al.present("-desc")) || ((!al.present("-data"))))
189  {
190  cerr << argv[0] << ": missing description and/or datafile" << endl;
191  cerr << "use -h for description of arguments" << endl;
192  }
193 
194  if (al.present("-quiet"))
195  wgn_quiet = TRUE;
196  if (al.present("-verbose"))
197  wgn_verbose = TRUE;
198 
199  if (al.present("-stop"))
200  wgn_min_cluster_size = atoi(al.val("-stop"));
201  if (al.present("-noprune"))
202  wgn_prune = FALSE;
203  if (al.present("-predictee"))
204  wgn_predictee_name = al.val("-predictee");
205  if (al.present("-count_field"))
206  wgn_count_field_name = al.val("-count_field");
207  if (al.present("-swlimit"))
208  stepwise_limit = al.fval("-swlimit");
209  if (al.present("-frs")) // number of partitions to try in floats
210  wgn_float_range_split = atof(al.val("-frs"));
211  if (al.present("-swopt"))
212  wgn_opt_param = al.val("-swopt");
213  if (al.present("-vertex_output"))
214  wgn_vertex_output = al.val("-vertex_output");
215  if (al.present("-output") || al.present("-o"))
216  {
217  if (al.present("-o"))
218  wgn_oname = al.val("-o");
219  else
220  wgn_oname = al.val("-output");
221  wgn_coutput = new ofstream(wgn_oname);
222  if (!(*wgn_coutput))
223  {
224  cerr << "Wagon: can't open file \"" << wgn_oname <<
225  "\" for output " << endl;
226  exit(-1);
227  }
228  }
229  else
230  wgn_coutput = &cout;
231  if (al.present("-distmatrix"))
232  {
233  if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
234  {
235  cerr << "Wagon: failed to load Distance Matrix from \"" <<
236  al.val("-distmatrix") << "\"\n" << endl;
237  exit(-1);
238  }
239  }
240  if (al.present("-dlist"))
241  wagon_type = wn_decision_list;
242 
243  WNode *tree;
244  float score;
245  LISP ignores = NIL;
246 
247  siod_init(al.ival("-heap"));
248 
249  if (al.present("-ignore"))
250  {
251  EST_String ig = al.val("-ignore");
252  if (ig[0] == '(')
253  ignores = read_from_string(ig);
254  else
255  ignores = vload(ig,1);
256  }
257  // Load in the data
258  wgn_load_datadescription(al.val("-desc"),ignores);
259  wgn_load_dataset(wgn_dataset,al.val("-data"));
260  if (al.present("-distmatrix") &&
262  {
263  cerr << "wagon: distance matrix is smaller than number of training elements\n";
264  exit(-1);
265  }
266  else if (al.present("-track"))
267  {
268  wgn_VertexTrack.load(al.val("-track"));
269  wgn_VertexFeats.resize(1,wgn_VertexTrack.num_channels());
270  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
271  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 1.0;
272  }
273 
274  if (al.present("-track_start"))
275  {
276  feats_start = al.ival("-track_start");
277  if ((feats_start < 0) ||
278  (feats_start > wgn_VertexTrack.num_channels()))
279  {
280  printf("wagon: track_start invalid: %d out of %d channels\n",
281  feats_start,
283  exit(-1);
284  }
285  for (i=0; i<feats_start; i++)
286  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 0.0; /* don't do feats up to start */
287 
288  }
289 
290  if (al.present("-track_end"))
291  {
292  feats_end = al.ival("-track_end");
293  if ((feats_end < feats_start) ||
294  (feats_end > wgn_VertexTrack.num_channels()))
295  {
296  printf("wagon: track_end invalid: %d between start %d out of %d channels\n",
297  feats_end,
298  feats_start,
300  exit(-1);
301  }
302  for (i=feats_end+1; i<wgn_VertexTrack.num_channels(); i++)
303  wgn_VertexFeats.a(static_cast<ssize_t>(0),i) = 0.0; /* don't do feats after end */
304  }
305  if (al.present("-track_feats"))
306  { /* overrides start and end numbers */
307  EST_String wagon_track_features = al.val("-track_feats");
308  set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
309  }
310 
311  // printf("Track feats\n");
312  // for (i=0; i<wgn_VertexTrack.num_channels(); i++)
313  // if (wgn_VertexFeats.a(static_cast<ssize_t>(0),i) > 0.0)
314  // printf("%d ",i);
315  // printf("\n");
316 
317  if (al.present("-unittrack"))
318  { /* contains two features, a start and length. start indexes */
319  /* into VertexTrack to the first vector in the segment */
320  wgn_UnitTrack.load(al.val("-unittrack"));
321  }
322 
323  if (al.present("-test"))
325 
326  // Build and test the model
327  if (al.present("-stepwise"))
328  tree = wagon_stepwise(stepwise_limit);
329  else if (wagon_type == wn_decision_tree)
330  tree = wgn_build_tree(score); // default operation
331  else if (wagon_type == wn_decision_list)
332  // dlist is printed with build_dlist rather than returned
333  tree = wgn_build_dlist(score,wgn_coutput);
334  else
335  {
336  cerr << "Wagon: unknown operation, not tree or list" << endl;
337  exit(-1);
338  }
339 
340  if (tree != 0)
341  {
342  *wgn_coutput << *tree;
343  summary_results(*tree,wgn_coutput);
344  delete tree;
345  }
346 
347  if (wgn_coutput != &cout)
348  delete wgn_coutput;
349  return 0;
350 }
351 
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:341
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
wn_strategy_type
Definition: wagon_main.cc:54
EST_String wgn_vertex_output
Definition: wagon.cc:78
WDataSet wgn_dataset
Definition: wagon.cc:59
void wgn_load_datadescription(EST_String fname, LISP ignores)
Definition: wagon.cc:111
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:82
int num_channels() const
return number of channels in track
Definition: EST_Track.h:657
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344
#define NIL
Definition: siod_defs.h:92
int wgn_min_cluster_size
Definition: wagon.cc:66
EST_Track wgn_UnitTrack
Definition: wagon.cc:64
const EST_String & whitespace()
Definition: EST_Token.h:112
ssize_t num_rows() const
return number of rows
Definition: EST_TMatrix.h:177
float fval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:104
int wgn_held_out
Definition: wagon.cc:67
void resize(ssize_t num_frames, int num_channels, bool preserve=1)
Definition: EST_Track.cc:214
EST_String wgn_opt_param
Definition: wagon.cc:77
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (pre) punctuation
Definition: EST_Token.h:350
int open_string(const EST_String &newbuffer)
open a EST_TokenStream for string rather than a file
Definition: EST_Token.cc:264
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:347
LISP vload(const char *fname, long cflag)
Definition: slib_file.cc:632
int eof()
end of file
Definition: EST_Token.h:362
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
Definition: EST_Track.cc:1312
int main(int argc, char **argv)
Definition: wagon_main.cc:61
float wgn_balance
Definition: wagon.cc:76
float & a(ssize_t i, int c=0)
Definition: EST_Track.cc:1025
WNode * wagon_stepwise(float limit)
Definition: wagon.cc:1098
WNode * wgn_build_dlist(float &score, ostream *output)
Definition: dlist.cc:60
LISP read_from_string(const char *)
Definition: slib_str.cc:65
#define FALSE
Definition: EST_bool.h:119
EST_String wgn_count_field_name
Definition: wagon.cc:72
EST_read_status load(const EST_String &filename)
Load from file (ascii or binary as defined in file)
Definition: EST_FMatrix.cc:523
EST_FMatrix wgn_DistMatrix
Definition: wagon.cc:61
int wgn_quiet
Definition: wagon.cc:69
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
EST_String wgn_predictee_name
Definition: wagon.cc:74
int length() const
Definition: EST_UList.cc:57
EST_Track wgn_VertexTrack
Definition: wagon.cc:62
float summary_results(WNode &tree, ostream *output)
Definition: wagon.cc:215
EST_FilePos filepos(void) const
current file position in EST_TokenStream
Definition: EST_Token.h:367
const EST_String & string() const
Definition: EST_Token.h:120
int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
void wgn_load_dataset(WDataSet &ds, EST_String fname)
Definition: wagon.cc:118
int siod_init(int heap_size=DEFAULT_HEAP_SIZE)
Definition: siod.cc:58
float wgn_float_range_split
Definition: wagon.cc:75
int tree
Definition: rxp.c:21
EST_Track wgn_VertexFeats
Definition: wagon.cc:63
EST_String
#define TRUE
Definition: EST_bool.h:118
WDataSet wgn_test_dataset
Definition: wagon.cc:60
int wgn_prune
Definition: wagon.cc:68
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
Definition: cmd_line.cc:101
int wgn_verbose
Definition: wagon.cc:70
WNode * wgn_build_tree(float &score)
Definition: wagon.cc:239