54 return impurity.value();
55 else if (question.ask(d))
56 return left->predict(d);
58 return right->predict(d);
65 else if (question.ask(d))
66 return left->predict_node(d);
68 return right->predict_node(d);
98 (
left->get_impurity().value() ==
right->get_impurity().value()))
119 if (question.get_score() < get_impurity().measure())
124 left->held_out_prune();
125 right->held_out_prune();
135 void WNode::print_out(ostream &s,
int margin)
140 for (i=0;i<margin;i++) s <<
" ";
147 left->print_out(s,margin+1);
148 right->print_out(s,margin+1);
167 for (i=0; i<dlength; i++)
191 p_type.resize(dlength);
192 p_ignore.resize(dlength);
193 p_name.resize(dlength);
200 for (i=0,d=description; d !=
NIL; d=
cdr(d),i++)
223 "\" can't be ignored \n");
234 else if (tname ==
"binary")
236 else if (tname ==
"cluster")
238 else if (tname ==
"vector")
240 else if (tname ==
"trajectory")
242 else if (tname ==
"ols")
244 else if (tname ==
"matrix")
246 else if (tname ==
"float")
252 "/"+p_name[i]+
" in description file \""+fname+
"\"");
259 "\" not found in description ");
269 if (w.
get_flt_val(feature_pos) == operand1.Float())
308 static EST_Regex needquotes(
".*[()'\";., \t\n\r].*");
344 if (name.matches(needquotes))
369 cerr <<
"WImpurity: no value currently set\n";
373 return EST_Val(p.most_probable(&prob));
391 return (
int)p.samples();
393 return members.length();
395 return members.length();
397 return members.length();
399 return members.length();
410 a.reset(); trajectory=0; l=0; width=0;
412 for (i=0; i < ds.
n(); i++)
419 cumulate((*(ds(i)))[wgn_predictee],
427 return a.variance()*a.samples();
429 return vector_impurity();
431 return trajectory_impurity();
433 return a.variance()*a.samples();
435 return p.entropy()*p.samples();
437 return cluster_impurity();
439 return ols_impurity();
442 cerr <<
"WImpurity: can't measure unset object" << endl;
447 float WImpurity::vector_impurity()
466 for (pp=members.head(), countpp=member_counts.head(); pp != 0; pp=pp->
next(), countpp=countpp->
next())
468 i = members.item(pp);
492 for (pp=members.head(); pp != 0; pp=pp->
next())
501 for (pp=members.head(); pp != 0; pp=pp->
next())
503 mmm = members.item(pp);
524 for (pp=members.head(); pp != 0; pp=pp->
next())
526 x = members.item(pp);
528 for (qq=pp->
next(); qq != 0; qq=qq->
next())
530 y = members.item(qq);
544 return a.mean() * count;
554 delete [] trajectory[j];
555 delete [] trajectory;
562 float WImpurity::trajectory_impurity()
575 double n, m, m1, m2, w;
588 for (pp=members.head(); pp != 0; pp=pp->
next())
590 i = members.item(pp);
623 for (pp=members.head(); pp != 0; pp=pp->
next())
625 i = members.item(pp);
628 for (ti=0,n=0.0; ti<l; ti++,n+=m)
641 for (ti=0; ti<l; ti++)
645 stdss += trajectory[ti][j].
stddev();
649 score = stdss.
mean() * members.length();
662 for (pp=members.head(); pp != 0; pp=pp->
next())
664 i = members.item(pp);
674 m1 = (float)(s1l)/(float)l1;
675 m2 = (float)(s2l)/(float)l2;
677 for (ti=0,n=0.0; s1l > 0 && ti<
l1; ti++,n+=m1)
679 ni = s + (((
int)n < s1l) ? (
int)n : s1l - 1);
687 trajectory[ti][j] += -1;
690 for (ti++,n=0.0; s2l > 0 && ti<l-1; ti++,n+=m2)
692 ni = s + (((
int)n < s2l) ? (
int)n : s2l - 1);
699 trajectory[ti][j] += -2;
706 for (w=0.0,ti=0; ti<
l1; ti++,w+=m)
709 stdss += trajectory[ti][j].
stddev() * w;
711 for (w=1.0,ti++; ti<l-1; ti++,w-=m)
714 stdss += trajectory[ti][j].
stddev() * w;
717 score = stdss.
mean() * members.length();
737 feat_names.
append(
"Intercept");
740 for (p=0,pp=members.
head(); pp; p++,pp=pp->
next())
742 n = members.
item(pp);
751 for (m=1,xm=1; m < w; m++)
772 float WImpurity::ols_impurity()
786 part_to_ols_data(X,Y,included,feat_names,members,*data);
804 printf(
"Impurity OLS X(%zd,%zd) Y(%zd,%zd) %f, %f, %f\n",
808 if (fabs(coeffsl[0]) > 10000)
814 return (1-best_score) *members.
length();
817 float WImpurity::cluster_impurity()
828 for (pp=members.
head(); pp != 0; pp=pp->
next())
830 i = members.
item(pp);
831 for (q=pp->
next(); q != 0; q=q->
next())
843 return a.stddev() * a.samples();
852 float dist = cluster_member_mean(i);
853 float mdist = dist-a.mean();
858 return fabs((dist-a.mean())/a.stddev());
866 float dist = cluster_member_mean(i);
869 for (pp=members.
head(); pp != 0; pp=pp->
next())
871 if (dist < cluster_member_mean(members.
item(pp)))
880 float dist = cluster_distance(i);
884 for (pp=members.
head(); pp != 0; pp=pp->
next())
886 if (dist >= cluster_distance(members.
item(pp)))
893 float WImpurity::cluster_member_mean(
int i)
901 for (sum=0.0,n=0,q=members.
head(); q != 0; q=q->
next())
912 return ( n == 0 ? 0.0 : sum/n );
935 member_counts.append((
float)count);
947 p.cumulate((
int)pv,count);
952 a.cumulate((
int)pv,count);
957 a.cumulate(pv,count);
961 wagon_error(
"WImpurity: cannot cumulate EST_Val type");
971 s <<
"(" << imp.a.
stddev() <<
" " << imp.a.
mean() <<
")";
976 imp.vector_impurity();
988 s <<
"(" << b.
mean() <<
" ";
1038 if (isfinite(cs[j].stddev()))
1050 s << imp.a.
mean() <<
")";
1055 imp.trajectory_impurity();
1056 for (i=0; i<imp.
l; i++)
1068 s << imp.a.
mean() <<
")";
1078 imp.cluster_member_mean(imp.
members.
item(p)) <<
")";
1084 s << imp.a.
mean() <<
")";
1099 part_to_ols_data(X,Y,included,feat_names,imp.
members,*(imp.
data));
1102 printf(
"no robust ols\n");
1109 for (i=0; i<coeffsl.
num_rows(); i++)
1112 s << feat_names.
nth(i);
1120 s <<
") " << cor <<
")";
1132 s <<
"(" << name <<
" " << prob <<
") ";
1137 s <<
"([WImpurity unset])";
int ols_test(const EST_FMatrix &real, const EST_FMatrix &predicted, float &correlation, float &rmse)
const WVectorVector * data
EST_String wgn_vertex_output
float wgn_score_question(WQuestion &q, WVectorVector &ds)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
int robust_ols(const EST_FMatrix &X, const EST_FMatrix &Y, EST_FMatrix &coeffs)
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
EST_Val predict(const WVector &w)
double stddev(void) const
standard deviation of currently cumulated values
ssize_t num_columns() const
return number of columns
float cluster_distance(int i)
void cumulate(double a, double count=1.0)
A Regular expression class to go with the CSTR EST_String class.
STATIC void left(STATUS Change)
void wgn_find_split(WQuestion &q, WVectorVector &ds, WVectorVector &y, WVectorVector &n)
int ask(const WVector &w) const
int num_channels() const
return number of channels in track
double mean(void) const
mean of currently cumulated values
STATIC void right(STATUS Change)
int siod_llength(LISP list)
wn_oper get_op(void) const
INLINE const T & a_no_check(ssize_t n) const
read-only const access operator: without bounds checking
EST_String itoString(int n)
Make an EST_String object from an integer.
ssize_t num_rows() const
return number of rows
T & nth(int n)
return the Nth value
EST_SuffStats ** trajectory
ostream & operator<<(ostream &s, WNode &n)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
int def(const EST_StrList &members)
void held_out_prune(void)
WNode * predict_node(const WVector &d)
LISP vload(const char *fname, long cflag)
void siod_list_to_strlist(LISP l, EST_StrList &a)
const char * get_c_string(LISP x)
void cumulate(const float pv, double count=1.0)
float & a(ssize_t i, int c=0)
int ols_apply(const EST_FMatrix &samples, const EST_FMatrix &coeffs, EST_FMatrix &res)
const EST_String & feat_name(const int &i) const
void item_prob(EST_Litem *idx, EST_String &s, double &prob) const
During iteration returns name and probability given index.
EST_String wgn_count_field_name
float mean(EST_FVector &m)
EST_FMatrix wgn_DistMatrix
#define wagon_error(WMESS)
const EST_String & string(void) const
int matches(const char *e, ssize_t pos=0) const
Exactly match this string?
EST_String wgn_predictee_name
void append(const T &item)
add item onto end of list
void reset(void)
reset internal values
void ignore_non_numbers()
EST_Track wgn_VertexTrack
const EST_IList & get_operandl(void) const
double samples(void)
number of samples in set
const EST_Val get_operand1(void) const
T & item(const EST_Litem *p)
int ilist_member(const EST_IList &l, int i)
int get_int_val(int n) const
float get_flt_val(int n) const
INLINE const T & a_no_check(ssize_t row, ssize_t col) const
const access with no bounds check, care recommended
void resize(int rows, int cols, int set=1)
resize matrix
EST_Track wgn_VertexFeats
int ftype(const int &i) const
EST_String quote_string(const EST_String &s, const EST_String &quote, const EST_String &escape, int force)
float sum(const EST_FMatrix &a)
sum of elements
INLINE ssize_t n() const
number of items in vector.
LISP siod_member_str(const char *key, LISP list)
void load_description(const EST_String &descfname, LISP ignores)
float cluster_ranking(int i)
void resize(int n, int set=1)
resize vector