88 static int wagon_split(
int margin,
WNode &node);
90 static void construct_binary_ques(
int feat,
WQuestion &test_ques);
94 static WNode *wagon_stepwise_find_next_best(
float &bscore,
int &best_feat);
100 #if defined(INSTANTIATE_TEMPLATES) 102 #include "../base_class/EST_TList.cc" 103 #include "../base_class/EST_TVector.cc" 126 if (ts.
open(fname) == -1)
135 v =
new WVector(dataset.
width());
139 int type = dataset.
ftype(i);
145 float f = atof(ts.
get().string());
150 cout << fname <<
": bad float " << f
182 cout << fname <<
": bad value " << s <<
" in field " <<
193 if (i != dataset.
width())
201 cerr << fname <<
": data vector " << nvec <<
202 " contains too many parameters instead of " 203 << dataset.
width() << endl;
210 cout <<
"Dataset of " << dataset.
samples() <<
" vectors of " <<
211 dataset.
width() <<
" parameters from: " << fname << endl;
217 if (wgn_test_dataset.
samples() != 0)
218 return do_summary(tree,wgn_test_dataset,output);
220 return do_summary(tree,wgn_dataset,output);
226 return test_tree_cluster(tree,ds,output);
228 return test_tree_vector(tree,ds,output);
230 return test_tree_trajectory(tree,ds,output);
232 return test_tree_ols(tree,ds,output);
234 return test_tree_class(tree,ds,output);
236 return test_tree_float(tree,ds,output);
248 wagon_split(margin,*top);
250 if (wgn_held_out > 0)
274 for (j=i=0,d=ds.
head(); d != 0; d=d->
next(),j++)
276 if ((in) && ((j%100) >= held_out))
291 static float test_tree_class(
WNode &tree,
WDataSet &dataset,ostream *output)
301 float correct=0,total=0, count=0;
303 float bcorrect=0, bpredicted=0, bactual=0;
304 float precision=0, recall=0;
306 for (p=dataset.
head(); p != 0; p=p->
next())
315 H += (log(prob))*count;
317 real = wgn_discretes[type].name(dataset(p)->get_int_val(
wgn_predictee));
319 if (wgn_opt_param ==
"B_NB_F1")
345 *output <<
";; entropy " << (-1*(H/total)) <<
" perplexity " <<
346 pow(2.0,(-1*(H/total))) << endl;
351 if (wgn_opt_param ==
"entropy")
352 return -pow(2.0,(-1*(H/total)));
353 else if(wgn_opt_param ==
"B_NB_F1")
358 precision = bcorrect/bpredicted;
362 recall = bcorrect/bactual;
364 if((precision+recall) !=0)
365 fmeasure = 2* (precision*recall)/(precision+recall);
366 cout<<
"F1 :" << fmeasure <<
" Prec:" << precision <<
" Rec:" << recall <<
" B-Pred:" << bpredicted <<
" B-Actual:" << bactual <<
" B-Correct:" << bcorrect << endl;
370 return (
float)correct/(float)total;
373 static float test_tree_vector(
WNode &tree,
WDataSet &dataset,ostream *output)
380 float predict, actual;
388 for (p=dataset.
head(); p != 0; p=p->
next())
393 if (wgn_VertexFeats.
a(static_cast<ssize_t>(0),j) > 0.0)
399 b += wgn_VertexTrack.
a(i,j);
402 actual = wgn_VertexTrack.
a(pos,j);
411 error = predict-actual;
413 error = (predict-actual)/b.
stddev();
444 <<
" Correlation is " <<
ftoString(cor,4,1)
449 <<
" Correlation is " <<
ftoString(cor,4,1)
454 if (wgn_opt_param ==
"rmse")
455 return -sqrt(se.
mean());
460 static float test_tree_trajectory(
WNode &tree,
WDataSet &dataset,ostream *output)
468 float predict, actual;
476 for (p=dataset.
head(); p != 0; p=p->
next())
481 if (wgn_VertexFeats.
a(static_cast<ssize_t>(0),j) > 0.0)
487 b += wgn_VertexTrack.
a(i,j);
490 actual = wgn_VertexTrack.
a(pos,j);
499 error = predict-actual;
501 error = (predict-actual)/b.
stddev();
532 <<
" Correlation is " <<
ftoString(cor,4,1)
537 <<
" Correlation is " <<
ftoString(cor,4,1)
542 if (wgn_opt_param ==
"rmse")
543 return -sqrt(se.
mean());
548 static float test_tree_cluster(
WNode &tree,
WDataSet &dataset,ostream *output)
557 for (p=dataset.
head(); p != 0; p=p->
next())
569 if (dataset.
length() > 0) {
570 rightnumber = (
int)(100.0*(
float)right_cluster/(float)dataset.
length());
574 *output <<
";; Right cluster " << right_cluster <<
" (" <<
576 "%) mean ranking " << ranking.
mean() <<
" mean distance " 577 << meandist.
mean() << endl;
578 cout <<
"Right cluster " << right_cluster <<
" (" <<
580 "%) mean ranking " << ranking.
mean() <<
" mean distance " 581 << meandist.
mean() << endl;
584 return 10000-meandist.
mean();
587 static float test_tree_float(
WNode &tree,
WDataSet &dataset,ostream *output)
596 for (p=dataset.
head(); p != 0; p=p->
next())
598 predict = tree.
predict((*dataset(p)));
606 error = predict-real;
636 <<
" Correlation is " <<
ftoString(cor,4,1)
641 <<
" Correlation is " <<
ftoString(cor,4,1)
646 if (wgn_opt_param ==
"rmse")
647 return -sqrt(se.
mean());
652 static float test_tree_ols(
WNode &tree,
WDataSet &dataset,ostream *output)
662 for (p=dataset.
head(); p != 0; p=p->
next())
674 error = predict-real;
704 <<
" Correlation is " <<
ftoString(cor,4,1)
709 <<
" Correlation is " <<
ftoString(cor,4,1)
714 if (wgn_opt_param ==
"rmse")
715 return -sqrt(se.
mean());
720 static int wagon_split(
int margin,
WNode &node)
727 find_best_question(node.
get_data(), q);
737 (question_score < impurity_measure))
749 for (i=0; i < margin; i++)
754 wagon_split(margin,*l);
756 wagon_split(margin,*r);
765 for (i=0; i < margin; i++)
767 cout <<
"stopped samples: " << node.
samples() <<
" impurity: " 783 for (iy=in=i=0; i < ds.
n(); i++)
802 for (i=0;i < wgn_dataset.
width(); i++)
809 construct_binary_ques(i,test_ques);
814 tscore = construct_float_ques(i,test_ques,dset);
822 wagon_error(
"subset selection temporarily deleted");
823 tscore = construct_class_ques_subset(i,test_ques,dset);
827 tscore = construct_class_ques(i,test_ques,dset);
830 best_ques = test_ques;
850 for (cl=0; cl < wgn_discretes[wgn_dataset.
ftype(feat)].length(); cl++)
865 static float construct_class_ques_subset(
int feat,
WQuestion &ques,
879 float *scores =
new float[wgn_discretes[wgn_dataset.
ftype(feat)].length()];
882 for (cl=0; cl < wgn_discretes[wgn_dataset.
ftype(feat)].length(); cl++)
888 LISP order = sort_class_scores(feat,scores);
894 ques.set_operand(
car(order));
917 ques.set_operand(
car(best_l));
922 ques.set_operand(
car(order));
926 cout <<
"Found a good subset" << endl;
927 ques.set_operand(best_l);
933 static LISP sort_class_scores(
int feat,
float *scores)
940 for (i=0; i < wgn_discretes[wgn_dataset.
ftype(feat)].length(); i++)
948 for (l=items; l !=
NIL; l=
cdr(l))
980 min = max = ds(0)->get_flt_val(feat);
981 for (d=0; d < ds.
n(); d++)
983 val = ds(d)->get_flt_val(feat);
1009 static void construct_binary_ques(
int feat,
WQuestion &test_ques)
1025 int d, num_yes, num_no;
1029 num_yes = num_no = 0;
1032 for (d=0; d < ds.
n(); d++)
1034 if ((ignorenth < 2) ||
1035 (d%ignorenth != ignorenth-1))
1073 if ((y.
samples() < min_cluster) ||
1095 return score_question_set(q,ds,1);
1106 WNode *best = 0,*new_best = 0;
1112 for (i=0; i < wgn_dataset.
width(); i++)
1115 for (i=0; i < wgn_dataset.
width(); i++)
1125 new_best = wagon_stepwise_find_next_best(bscore,best_feat);
1127 if ((bscore - fabs(bscore * (limit/100))) <= best_score)
1135 best_score = bscore;
1141 fprintf(stdout,
"FEATURE %d %s: %2.4f\n",
1143 (
const char *)wgn_dataset.
feat_name(best_feat),
1154 static WNode *wagon_stepwise_find_next_best(
float &bscore,
int &best_feat)
1160 int best_new_feat = -1;
1163 for (i=0; i < wgn_dataset.
width(); i++)
1179 if (score > best_score)
1199 bscore = best_score;
1200 best_feat = best_new_feat;
float wgn_score_question(WQuestion &q, WVectorVector &ds)
EST_TokenStream & get(EST_Token &t)
get next token in stream
EST_Track wgn_VertexFeats
const WVectorVector * data
EST_String wgn_count_field_name
EST_Track wgn_VertexTrack
EST_Val predict(const WVector &w)
double stddev(void) const
standard deviation of currently cummulated values
EST_FMatrix wgn_DistMatrix
void set_fp(const int &fp)
void set_ignore(int i, int value)
float cluster_distance(int i)
void cumulate(double a, double count=1.0)
WImpurity & get_impurity(void)
int ask(const WVector &w) const
void wgn_load_dataset(WDataSet &dataset, EST_String fname)
int num_channels() const
return number of channels in track
void set_question(const WQuestion &q)
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
long int get_c_int(LISP x)
WDataSet wgn_test_dataset
double mean(void) const
mean of currently cummulated values
void close(void)
Close stream.
const EST_String & name(const int n) const
The name given the index.
#define Instantiate_TVector(TYPE)
int siod_llength(LISP list)
EST_String itoString(int n)
Make a EST_String object from an integer.
void wgn_find_split(WQuestion &q, WVectorVector &ds, WVectorVector &y, WVectorVector &n)
WVectorP void wgn_load_datadescription(EST_String fname, LISP ignores)
double probability(const EST_String &s) const
void held_out_prune(void)
EST_String ftoString(float n, int pres=3, int width=0, int l=0)
Make a EST_String object from an float, with variable precision.
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
EST_Track error(EST_Track &ref, EST_Track &test, int relax=0)
int open(const EST_String &filename)
open a EST_TokenStream for a file.
EST_String wgn_vertex_output
float max(float a, float b)
void set_subnodes(WNode *l, WNode *r)
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
WNode * predict_node(const WVector &d)
void print_confusion(const EST_FMatrix &a, EST_StrStr_KVL &list, EST_StrList &lex)
void set_flt_val(int n, float f)
void resize(ssize_t n, int set=1)
void cumulate(const float pv, double count=1.0)
void set_operand1(const EST_Val &a)
float & a(ssize_t i, int c=0)
void set_no(const int &n)
LISP cons(LISP x, LISP y)
void set_yes(const int &y)
EST_FMatrix confusion(EST_StrStr_KVL &list, EST_StrList &lex)
float min(float a, float b)
const EST_String & feat_name(const int &i) const
EST_Discrete & discrete(const int t) const
EST_String wgn_vertex_otype
void set_impurity(const WImpurity &imp)
float wgn_float_range_split
EST_Token & peek(void)
peek at next token
#define wagon_error(WMESS)
void set_score(const float &f)
void append(const T &item)
add item onto end of list
void reset(void)
reset internal values
#define Declare_TVector_Base_T(TYPE, DEFAULT, ERROR, TAG)
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Declare_TList_T(WVector *, WVectorP) Declare_TVector_Base_T(WVector *
WVectorVector & get_data(void)
EST_String wgn_predictee_name
float get_score(void) const
const EST_String & string() const
T & item(const EST_Litem *p)
EST_DiscreteProbDistribution & pd()
WNode * wagon_stepwise(float limit)
#define Instantiate_TList_T(TYPE, TAG)
WNode * wgn_build_tree(float &score)
void set_int_val(int n, int i)
void set_oper(const wn_oper &o)
float summary_results(WNode &tree, ostream *output)
int ftype(const int &i) const
INLINE ssize_t n() const
number of items in vector.
void load_description(const EST_String &descfname, LISP ignores)
float cluster_ranking(int i)