speech-tools/wagon__aux_8cc_source.html

 /*************************************************************************/
 /*                                                                       */
 /*                Centre for Speech Technology Research                  */
 /*                     University of Edinburgh, UK                       */
 /*                      Copyright (c) 1996,1997                          */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*                     Author :  Alan W Black                            */
 /*                     Date   :  May 1996                                */
 /*-----------------------------------------------------------------------*/
 /*                                                                       */
 /*  Various method functions                                             */
 /*=======================================================================*/

 #include <cstdlib>
 #include <iostream>
 #include <cstring>
 #include "EST_unix.h"
 #include "EST_cutils.h"
 #include "EST_Token.h"
 #include "EST_Wagon.h"
 #include "EST_math.h"

 using namespace std;

 // Predict a value for the vector d by descending the tree from this node.
 // A leaf answers with the value of its impurity; any other node hands the
 // vector to its left sub-tree when its question holds for d and to its
 // right sub-tree otherwise.
 EST_Val WNode::predict(const WVector &d)
 {
     if (leaf())
         return impurity.value();

     WNode *branch = question.ask(d) ? left : right;
     return branch->predict(d);
 }

 // Find the leaf that the vector d ends up in.  Starting at this node,
 // follow the left branch whenever the node's question holds for d and
 // the right branch otherwise, until a leaf is reached.
 WNode *WNode::predict_node(const WVector &d)
 {
     WNode *node = this;

     while (!node->leaf())
         node = node->question.ask(d) ? node->left : node->right;

     return node;
 }

 // TRUE when this node has no sub-nodes at all, or when its impurity is
 // not of the class type; FALSE otherwise.
 int WNode::pure(void)
 {
     const bool childless = (left == 0) && (right == 0);

     // The impurity is only consulted when there is at least one sub-node
     if (childless || (get_impurity().type() != wnim_class))
         return TRUE;
     return FALSE;
 }

 // Collapse redundant splits below this node.  The sub-trees are pruned
 // first; then, if both children exist, both are pure and both predict
 // the same value, the children are deleted so this node becomes a leaf.
 // Nothing is returned.
 void WNode::prune(void)
 {
     if (pure() != FALSE)
         return;   // already pure, nothing to collapse

     // Try to make the sub-trees pure first
     if (left != 0)
         left->prune();
     if (right != 0)
         right->prune();

     // Purity is tested as well as the values, so that children which
     // themselves still split are never thrown away
     const bool both_pure = (left  != 0) && (left->pure()  == TRUE) &&
                            (right != 0) && (right->pure() == TRUE);

     if (both_pure &&
         (left->get_impurity().value() == right->get_impurity().value()))
     {
         delete left;
         left = 0;
         delete right;
         right = 0;
     }
 }

 // Prune the tree against the (held out) data currently attached to it.
 // The node's impurity is recomputed from its data; if the node has a
 // left sub-node its question is rescored on that data.  When the score
 // is lower than the node's own impurity the data is split between the
 // children and they are pruned in turn, otherwise both children are
 // deleted.
 void WNode::held_out_prune()
 {
     // Rescore this node with the data it now holds
     set_impurity(WImpurity(get_data()));

     if (left == 0)
         return;   // no sub-nodes to consider

     wgn_score_question(question,get_data());
     const bool worth_splitting =
         question.get_score() < get_impurity().measure();

     if (!worth_splitting)
     {   // the question doesn't help on this data, so drop both sub-nodes
         delete left;
         left = 0;
         delete right;
         right = 0;
         return;
     }

     // It's worth going to the next level
     wgn_find_split(question,get_data(),
                    left->get_data(),
                    right->get_data());
     left->held_out_prune();
     right->held_out_prune();
 }

 // Write this sub-tree to s as a bracketed expression.  Each node starts
 // on a new line indented by `margin` spaces; a node without a left
 // sub-node prints its impurity, any other node prints its question
 // followed by both sub-trees, indented one space further.
 void WNode::print_out(ostream &s, int margin)
 {
     s << endl;
     for (int pad = margin; pad > 0; pad--)
         s << " ";

     s << "(";
     if (left != 0)
     {
         s << question;
         left->print_out(s,margin+1);
         right->print_out(s,margin+1);
     }
     else   // base case
         s << impurity;
     s << ")";
 }

 // Stream the tree rooted at n (the node and all its sub-nodes), starting
 // with no indentation and finishing with a newline.
 ostream & operator <<(ostream &s, WNode &n)
 {
     n.print_out(s,0);
     return s << endl;
 }

 // For ols: flag every field whose type is neither binary nor float as
 // ignored.  Fields that are binary or float keep their current flag.
 void WDataSet::ignore_non_numbers()
 {
     for (int field = 0; field < dlength; field++)
     {
         const bool numeric = (p_type[field] == wndt_binary) ||
                              (p_type[field] == wndt_float);
         if (!numeric)
             p_ignore[field] = TRUE;
     }
 }

 // Set up this dataset's per-field tables (p_name, p_type, p_ignore) and
 // its length from the description found in file `fname`.
 //
 // The description is the first element of whatever vload() returns for
 // fname, and is treated as a list with one entry per field.  The first
 // element of an entry is the field name and the second is read as a type
 // name.  An entry with more than two elements instead defines a discrete
 // class from everything after the name.  Fields whose type is "ignore" or
 // whose name is a member of `ignores` are flagged as ignored, as is the
 // count field.
 //
 // Side effects: updates the globals wgn_predictee and wgn_count_field and
 // may add definitions to wgn_discretes.  Problems (unknown type, ignored
 // or missing predictee) are reported through wagon_error().
 void WDataSet::load_description(const EST_String &fname, LISP ignores)
 {
     // Initialise a dataset with sizes and types
     EST_String tname;
     int i;
     LISP description,d;

     description = car(vload(fname,1));
     dlength = siod_llength(description);

     p_type.resize(dlength);
     p_ignore.resize(dlength);
     p_name.resize(dlength);

     // With no predictee name given the first field is used; otherwise -1
     // marks "not found yet" until the named field turns up in the loop
     if (wgn_predictee_name == "")
     wgn_predictee = 0;  // default predictee is first field
     else
     wgn_predictee = -1;

     for (i=0,d=description; d != NIL; d=cdr(d),i++)
     {
     p_name[i] = get_c_string(car(car(d)));
     tname = get_c_string(car(cdr(car(d))));
     p_ignore[i] = FALSE;
     // Resolve the predictee and count fields by name when names were given
     if ((wgn_predictee_name != "") && (wgn_predictee_name == p_name[i]))
         wgn_predictee = i;
     if ((wgn_count_field_name != "") &&
         (wgn_count_field_name == p_name[i]))
         wgn_count_field = i;
     // The tests below are order dependent: count first, then ignore,
     // then discrete class definitions, then the named built-in types
     if ((tname == "count") || (i == wgn_count_field))
     {
         // The count must be ignored, repeat it if you want it too
         p_type[i] = wndt_ignore;  // the count must be ignored
         p_ignore[i] = TRUE;
         wgn_count_field = i;
     }
     else if ((tname == "ignore") || (siod_member_str(p_name[i],ignores)))
     {
         p_type[i] = wndt_ignore;  // user specified ignore
         p_ignore[i] = TRUE;
         if (i == wgn_predictee)
         wagon_error(EST_String("predictee \"")+p_name[i]+
                 "\" can't be ignored \n");
     }
     else if (siod_llength(car(d)) > 2)
     {
         // More than (name type): everything after the name is defined
         // as a discrete class and its id becomes the field's type
         LISP rest = cdr(car(d));
         EST_StrList sl;
         siod_list_to_strlist(rest,sl);
         p_type[i] = wgn_discretes.def(sl);
         // A leading "_other_" is registered with the class via def_val()
         if (streq(get_c_string(car(rest)),"_other_"))
         wgn_discretes[p_type[i]].def_val("_other_");
     }
     else if (tname == "binary")
         p_type[i] = wndt_binary;
     else if (tname == "cluster")
         p_type[i] = wndt_cluster;
     else if (tname == "vector")
         p_type[i] = wndt_vector;
     else if (tname == "trajectory")
         p_type[i] = wndt_trajectory;
     else if (tname == "ols")
         p_type[i] = wndt_ols;
     else if (tname == "matrix")
         p_type[i] = wndt_matrix;
     else if (tname == "float")
         p_type[i] = wndt_float;
     else
     {
         wagon_error(EST_String("Unknown type \"")+tname+
             "\" for field number "+itoString(i)+
                         "/"+p_name[i]+" in description file \""+fname+"\"");
     }
     }

     // Still -1 means a predictee name was given but no field matched it
     if (wgn_predictee == -1)
     {
     wagon_error(EST_String("predictee field \"")+wgn_predictee_name+
             "\" not found in description ");
     }
 }

 // Apply this question to the vector w.  The value of w at feature_pos is
 // tested according to the question's operator against operand1 (or, for
 // the subset operator, against the list operandl).  Returns TRUE when
 // the test holds and FALSE when it does not; an operator that is not
 // handled here is reported through wagon_error().
 int WQuestion::ask(const WVector &w) const
 {
     int holds = FALSE;

     switch (op)
     {
       case wnop_equal:        // for numbers
         holds = (w.get_flt_val(feature_pos) == operand1.Float()) ? TRUE : FALSE;
         break;
       case wnop_binary:       // for numbers
         holds = (w.get_int_val(feature_pos) == 1) ? TRUE : FALSE;
         break;
       case wnop_greaterthan:
         holds = (w.get_flt_val(feature_pos) > operand1.Float()) ? TRUE : FALSE;
         break;
       case wnop_lessthan:
         holds = (w.get_flt_val(feature_pos) < operand1.Float()) ? TRUE : FALSE;
         break;
       case wnop_is:           // for classes
         holds = (w.get_int_val(feature_pos) == operand1.Int()) ? TRUE : FALSE;
         break;
       case wnop_in:           // for subsets -- note operand is list of ints
         holds = ilist_member(operandl,w.get_int_val(feature_pos)) ? TRUE : FALSE;
         break;
       default:
         wagon_error("Unknown test operator");
     }

     return holds;
 }

 // Print the question q as "(feature-name op operand)".  Class names are
 // looked up in wgn_discretes through the feature's type; for "is" and
 // "in" a name is quoted only when it matches the needquotes pattern,
 // for "matches" it is always quoted.  A binary question prints only the
 // feature name.  No text is written for operators not listed here.
 ostream& operator<<(ostream& s, const WQuestion &q)
 {
     static EST_Regex needquotes(".*[()'\";., \t\n\r].*");
     EST_String label;

     s << "(" << wgn_dataset.feat_name(q.get_fp());

     switch (q.get_op())
     {
       case wnop_binary:
         break;

       case wnop_equal:
         s << " = " << q.get_operand1().string();
         break;

       case wnop_greaterthan:
         s << " > " << q.get_operand1().Float();
         break;

       case wnop_lessthan:
         s << " < " << q.get_operand1().Float();
         break;

       case wnop_is:
         label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
             name(q.get_operand1().Int());
         s << " is ";
         if (!label.matches(needquotes))
             s << label;
         else
             s << quote_string(label,"\"","\\",1);
         break;

       case wnop_matches:
         label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
             name(q.get_operand1().Int());
         s << " matches " << quote_string(label,"\"","\\",1);
         break;

       case wnop_in:
         s << " in (";
         for (int k=0; k < q.get_operandl().length(); k++)
         {
             label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
                 name(q.get_operandl().nth(k));
             if (!label.matches(needquotes))
                 s << label;
             else
                 s << quote_string(label,"\"","\\",1);
             s << " ";
         }
         s << ")";
         break;

       // No default case: SunCC would not accept one here
     }

     s << ")";
     return s;
 }

 // The value this impurity recommends as a prediction: the most probable
 // class for class impurities and the mean of the accumulated statistics
 // for every other set type.  An unset impurity writes a warning to cerr
 // and yields 0.0.
 EST_Val WImpurity::value(void)
 {
     if (t == wnim_unset)
     {
         cerr << "WImpurity: no value currently set\n";
         return EST_Val(0.0);
     }

     if (t == wnim_class)
     {
         double prob;
         return EST_Val(p.most_probable(&prob));
     }

     // cluster, ols (OLS TBA), vector, trajectory (not yet written) and
     // all remaining types answer with the mean
     return EST_Val(a.mean());
 }

 // Number of samples behind this impurity: taken from the statistics for
 // float impurities, from the distribution (truncated to an integer) for
 // class impurities and from the member list for cluster, ols, vector and
 // trajectory impurities.  Any other type reports 0.
 double WImpurity::samples(void)
 {
     if (t == wnim_float)
         return a.samples();

     if (t == wnim_class)
         return static_cast<int>(p.samples());

     if ((t == wnim_cluster) || (t == wnim_ols) ||
         (t == wnim_vector) || (t == wnim_trajectory))
         return members.length();

     return 0;
 }

 // Build an impurity from every vector in ds.  The object starts unset
 // with cleared statistics and no trajectory, and keeps a pointer to ds
 // (used for the ols model calculation).  Each vector then contributes
 // its predictee field, weighted by its count field when one is
 // configured and by 1 otherwise.  Once the type has become ols the row
 // index itself is accumulated rather than the predictee value.
 WImpurity::WImpurity(const WVectorVector &ds)
 {
     score = NAN;
     t = wnim_unset;
     a.reset();
     trajectory = 0;
     l = 0;
     width = 0;
     data = &ds;  // for ols, model calculation

     for (int row = 0; row < ds.n(); row++)
     {
         if (t == wnim_ols)
         {
             cumulate(row,1);
             continue;
         }

         double weight = 1;
         if (wgn_count_field != -1)
             weight = (*(ds(row)))[wgn_count_field];
         cumulate((*(ds(row)))[wgn_predictee],weight);
     }
 }

 // The impurity score for the current set type: variance times samples
 // for float and matrix data, entropy times samples for classes, and the
 // dedicated vector/trajectory/cluster/ols calculations for those types.
 // Any other type (e.g. unset) writes a warning to cerr and scores 0.0.
 float WImpurity::measure(void)
 {
     switch (t)
     {
       case wnim_float:
       case wnim_matrix:
         return a.variance()*a.samples();
       case wnim_vector:
         return vector_impurity();
       case wnim_trajectory:
         return trajectory_impurity();
       case wnim_class:
         return p.entropy()*p.samples();
       case wnim_cluster:
         return cluster_impurity();
       case wnim_ols:
         return ols_impurity();  /* RMSE for OLS model */
       default:
         break;
     }

     cerr << "WImpurity: can't measure unset object" << endl;
     return 0.0;
 }

 // Impurity of a set of vectors.  Only the first "#if 1" section below
 // is compiled; the two "#if 0" sections are disabled alternatives.
 //
 // The member statistics `a` are reset and then receive one standard
 // deviation per selected channel, computed over the wgn_VertexTrack
 // values of all members weighted by their member_counts.  A channel is
 // selected when row 0 of wgn_VertexFeats is > 0.0 for it.  The result is
 // the mean of those standard deviations multiplied by `count`.
 //
 // Side effect: `a` is overwritten, so a.mean() afterwards is the mean
 // per-channel standard deviation.
 float WImpurity::vector_impurity()
 {
     // Find the mean/stddev for all values in all vectors
     // sum the variances and multiply them by the number of members
     EST_Litem *pp;
     EST_Litem *countpp;
     ssize_t i,j;
     EST_SuffStats b;
     // Stays 1 when no channel is selected; otherwise it ends up as the
     // (count weighted) sample total of the last selected channel
     double count = 1;

     a.reset();

 #if 1
     /* simple distance */
     for (j=0; j<wgn_VertexFeats.num_channels(); j++)
     {
         if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
         {
             b.reset();
             // members and member_counts are walked in step, so they are
             // assumed to have the same length
             for (pp=members.head(), countpp=member_counts.head(); pp != 0; pp=pp->next(), countpp=countpp->next())
             {
                 i = members.item(pp);

         // Accumulate the value with count
                 b.cumulate(wgn_VertexTrack.a(i,j), member_counts.item(countpp)) ;
             }
             a += b.stddev();
             count = b.samples();
         }
     }
 #endif

 #if 0
     /* full covariance */
     /* worse in listening experiments */
     EST_SuffStats **cs;
     int mmm;
     cs = new EST_SuffStats *[wgn_VertexTrack.num_channels()+1];
     for (j=0; j<=wgn_VertexTrack.num_channels(); j++)
         cs[j] = new EST_SuffStats[wgn_VertexTrack.num_channels()+1];
     /* Find means for diagonal */
     for (j=0; j<wgn_VertexFeats.num_channels(); j++)
     {
         if (wgn_VertexFeats.a(0,j) > 0.0)
         {
             for (pp=members.head(); pp != 0; pp=pp->next())
                 cs[j][j] += wgn_VertexTrack.a(members.item(pp),j);
         }
     }
     for (j=0; j<wgn_VertexFeats.num_channels(); j++)
     {
         for (i=j+1; i<wgn_VertexFeats.num_channels(); i++)
             if (wgn_VertexFeats.a(0,j) > 0.0)
             {
                 for (pp=members.head(); pp != 0; pp=pp->next())
                 {
                     mmm = members.item(pp);
                     cs[i][j] += (wgn_VertexTrack.a(mmm,i)-cs[j][j].mean())*
                         (wgn_VertexTrack.a(mmm,j)-cs[j][j].mean());
                 }
             }
     }
     for (j=0; j<wgn_VertexFeats.num_channels(); j++)
     {
         for (i=j+1; i<wgn_VertexFeats.num_channels(); i++)
             if (wgn_VertexFeats.a(0,j) > 0.0)
                 a += cs[i][j].stddev();
     }
     count = cs[0][0].samples();
 #endif

 #if 0
     // look at mean euclidean distance between vectors
     EST_Litem *qq;
     int x,y;
     double d,q;
     count = 0;
     for (pp=members.head(); pp != 0; pp=pp->next())
     {
         x = members.item(pp);
         count++;
         for (qq=pp->next(); qq != 0; qq=qq->next())
         {
             y = members.item(qq);
             for (q=0.0,j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(0,j) > 0.0)
                 {
                     d = wgn_VertexTrack(x,j)-wgn_VertexTrack(y,j);
                     q += d*d;
                 }
             a += sqrt(q);
         }

     }
 #endif

     // This is sum of stddev*samples
     return a.mean() * count;
 }

 // Release the trajectory table, if one was built: each of its l rows is
 // freed, then the row table itself, and the bookkeeping is cleared.
 WImpurity::~WImpurity()
 {
     if (trajectory == 0)
         return;

     for (int row = 0; row < l; row++)
         delete [] trajectory[row];
     delete [] trajectory;

     trajectory = 0;
     l = 0;
 }


 // Impurity of a cluster of trajectories (units).
 //
 // Each member is a row index into wgn_UnitTrack, where column 0 is used
 // as the unit's start frame in wgn_VertexTrack and column 1 as its
 // length in frames.  Every unit is resampled onto a common number of
 // points; for each point and each selected channel (row 0 of
 // wgn_VertexFeats > 0.0) statistics are collected in `trajectory`.  The
 // score is the mean of the resulting standard deviations multiplied by
 // the number of members.
 //
 // Two layouts exist.  If any unit contains a frame whose channel 0 value
 // is -1.0 the cluster is treated as an "OLA" cluster: the parts before
 // and after that frame are resampled separately (at least 10 points
 // each) with a marker point after each part, and the standard deviations
 // are weighted with a triangular window.  Otherwise all units are
 // resampled to the mean unit length (at least 7 points).
 //
 // Side effects: allocates `trajectory` (l rows of `width` entries) and
 // sets `l`, `width` and `score`.  `width` is recorded for both layouts
 // so it always describes the allocated rows.  If a trajectory has
 // already been built the stored score is returned without recomputation.
 float WImpurity::trajectory_impurity()
 {
     EST_Litem *pp;
     ssize_t i, j;
     int ti;
     ssize_t s, q, ni;
     int s1l, s2l;
     double n, m, m1, m2, w;
     EST_SuffStats lss, stdss;
     EST_SuffStats l1ss, l2ss;
     int l1, l2;
     int ola=0;

     if (trajectory != 0)
     {   /* already done this */
         return score;
     }

     /* First pass: gather unit lengths and look for -1 centre points */
     lss.reset();
     l = 0;
     for (pp=members.head(); pp != 0; pp=pp->next())
     {
         i = members.item(pp);
         for (q=0; q<wgn_UnitTrack.a(i,1); q++)
         {
             ni = wgn_UnitTrack.a(i,0)+q;
             if (wgn_VertexTrack.a(ni,0) == -1.0)
             {
                 l1ss += q;
                 ola = 1;
                 break;
             }
         }
         if (q==wgn_UnitTrack.a(i,1))
         {   /* can't find -1 center point, so put all in l2 */
             l1ss += 0;
             l2ss += q;
         }
         else
             l2ss += wgn_UnitTrack.a(i,1) - (q+1) - 1;
         lss += wgn_UnitTrack.a(i,1); /* length of each unit in the cluster */
         if (wgn_UnitTrack.a(i,1) > l)
             l = (int)wgn_UnitTrack.a(i,1);
     }

     /* Every trajectory row has this many entries, whichever layout is */
     /* used below, so record it once for both                          */
     width = wgn_VertexTrack.num_channels()+1;

     if (ola==0)  /* no -1's so its not an ola type cluster */
     {
         l = ((int)lss.mean() < 7) ? 7 : (int)lss.mean();

         /* a list of SuffStats, one for each point in the trajectory */
         trajectory = new EST_SuffStats *[l];
         for (j=0; j<l; j++)
             trajectory[j] = new EST_SuffStats[width];

         for (pp=members.head(); pp != 0; pp=pp->next())
         {   /* for each unit */
             i = members.item(pp);
             m = (float)wgn_UnitTrack.a(i,1L)/(float)l; /* find interpolation */
             s = wgn_UnitTrack.a(i,0); /* start point */
             for (ti=0,n=0.0; ti<l; ti++,n+=m)
             {
                 ni = (int)n;  // hmm floor or nint ??
                 for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 {
                     if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                         trajectory[ti][j] += wgn_VertexTrack.a(s+ni,j);
                 }
             }
         }

         /* find sum of sum of stddev for all coefs of all traj points */
         stdss.reset();
         for (ti=0; ti<l; ti++)
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
             {
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     stdss += trajectory[ti][j].stddev();
             }

         // This is sum of all stddev * samples
         score = stdss.mean() * members.length();
     }
     else
     {   /* OLA model */
         l1 = (l1ss.mean() < 10.0) ? 10 : (int)l1ss.mean();
         l2 = (l2ss.mean() < 10.0) ? 10 : (int)l2ss.mean();
         /* l1 points, a marker, the second half, and a final marker */
         l = l1 + l2 + 1 + 1;

         /* a list of SuffStats, one for each point in the trajectory */
         trajectory = new EST_SuffStats *[l];
         for (j=0; j<l; j++)
             trajectory[j] = new EST_SuffStats[width];

         for (pp=members.head(); pp != 0; pp=pp->next())
         {   /* for each unit */
             i = members.item(pp);
             s1l = 0;
             s = wgn_UnitTrack.a(i,0); /* start point */
             for (q=0; q<wgn_UnitTrack.a(i,1); q++)
                 if (wgn_VertexTrack.a(s+q,0) == -1.0)
                 {
                     s1l = q; /* printf("awb q is -1 at %d\n",q); */
                     break;
                 }
             s2l = (int)wgn_UnitTrack.a(i,1) - (s1l + 2);
             m1 = (float)(s1l)/(float)l1; /* find interpolation step */
             m2 = (float)(s2l)/(float)l2; /* find interpolation step */
             /* First half */
             for (ti=0,n=0.0; s1l > 0 && ti<l1; ti++,n+=m1)
             {
                 ni = s + (((int)n < s1l) ? (int)n : s1l - 1);
                 for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                     if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                         trajectory[ti][j] += wgn_VertexTrack.a(ni,j);
             }
             ti = l1; /* do it explicitly in case s1l < 1 */
             /* centre marker point */
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     trajectory[ti][j] += -1;
             /* Second half */
             s += s1l+1;
             for (ti++,n=0.0; s2l > 0 && ti<l-1; ti++,n+=m2)
             {
                 ni = s + (((int)n < s2l) ? (int)n : s2l - 1);
                 for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                     if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                         trajectory[ti][j] += wgn_VertexTrack.a(ni,j);
             }
             /* end marker point */
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     trajectory[ti][j] += -2;
         }

         /* find sum of sum of stddev for all coefs of all traj points */
         /* windowing the sums with a triangular weight window         */
         stdss.reset();
         m = 1.0/(float)l1;
         for (w=0.0,ti=0; ti<l1; ti++,w+=m)
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     stdss += trajectory[ti][j].stddev() * w;
         m = 1.0/(float)l2;
         /* ti++ steps over the centre marker; the end marker is excluded */
         for (w=1.0,ti++; ti<l-1; ti++,w-=m)
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     stdss += trajectory[ti][j].stddev() * w;

         // This is sum of all stddev * samples
         score = stdss.mean() * members.length();
     }
     return score;
 }

 // Copy the members of a partition into the matrices used by ols.
 //
 // For every non-negative id in `members` the vector d(id) supplies one
 // row: Y gets field 0 of the vector, X gets a constant 1 in column 0
 // followed by the values of every field (from field 1 on) whose dataset
 // type is float.  `feat_names` receives "Intercept" plus the names of
 // those float fields, and `included` is set TRUE for column 0 and every
 // float column.  Negative ids are skipped and take no row.  On return
 // X, Y and `included` are shrunk to the rows and columns actually used.
 static void part_to_ols_data(EST_FMatrix &X, EST_FMatrix &Y,
                              EST_IVector &included,
                              EST_StrList &feat_names,
                              const EST_IList &members,
                              const WVectorVector &d)
 {
     const int n_fields = wgn_dataset.width();
     int row = 0;    // next free row in X and Y
     int col = 0;    // columns of X in use after the last row written

     included.resize(n_fields);
     X.resize(members.length(),n_fields);
     Y.resize(members.length(),1);
     feat_names.append("Intercept");
     included[0] = TRUE;

     for (EST_Litem *pp = members.head(); pp; pp = pp->next())
     {
         const int id = members.item(pp);
         if (id < 0)
             continue;

         WVector *wv = d(id);
         Y.a_no_check(row,0) = (*wv)[0];
         X.a_no_check(row,0) = 1;

         col = 1;
         for (int field = 1; field < n_fields; field++)
         {
             if (wgn_dataset.ftype(field) != wndt_float)
                 continue;

             if (row == 0) // names are only collected for the first row
                 feat_names.append(wgn_dataset.feat_name(field));
             X.a_no_check(row,col) = (*wv)[field];
             included.a_no_check(col) = TRUE;
             col++;
         }
         row++;
     }

     included.resize(col);
     X.resize(row,col);
     Y.resize(row,1);
 }

 // Impurity for ols data.  An OLS model is fitted to the members with
 // robust_ols(), applied back to the same data and tested against it;
 // the impurity is (1 - correlation) times the number of members.  A
 // line describing the fit is printed to stdout.  WGN_HUGE_VAL is
 // returned when no model can be fitted or when the magnitude of the
 // first coefficient exceeds 10000.
 float WImpurity::ols_impurity()
 {
     EST_FMatrix X,Y;
     EST_IVector included;
     EST_StrList feat_names;
     EST_FMatrix coeffsl;
     EST_FMatrix pred;
     float cor,rmse;

     // Load the sample members into matrices for ols
     part_to_ols_data(X,Y,included,feat_names,members,*data);

     // A stepwise search for the best model (stepwise_ols) is far too
     // computationally expensive here, so a single non-stepwise model
     // is fitted instead
     if (!robust_ols(X,Y,included,coeffsl))
         return WGN_HUGE_VAL;   // couldn't find a model

     ols_apply(X,coeffsl,pred);
     ols_test(Y,pred,cor,rmse);
     const float lack_of_fit = 1-cor;

     printf("Impurity OLS X(%zd,%zd) Y(%zd,%zd) %f, %f, %f\n",
            X.num_rows(),X.num_columns(),Y.num_rows(),Y.num_columns(),
            rmse,cor,
            lack_of_fit);

     // Reject models with a weird sized intercept
     if (fabs(coeffsl[0]) > 10000)
         return WGN_HUGE_VAL;

     return lack_of_fit * members.length();
 }

 float WImpurity::cluster_impurity()
 {
     // Find the mean distance between all members of the dataset
     // Uses the global DistMatrix for distances between members of
     // the cluster set.  Distances are assumed to be symmetric thus only
     // the bottom half of the distance matrix is filled
     EST_Litem *pp, *q;
     int i,j;
     double dist;

     a.reset();
     for (pp=members.head(); pp != 0; pp=pp->next())
     {
     i = members.item(pp);
     for (q=pp->next(); q != 0; q=q->next())
     {
         j = members.item(q);
         dist = (j < i ? wgn_DistMatrix.a_no_check(i,j) :
                     wgn_DistMatrix.a_no_check(j,i));
         a+=dist;  // cumulate for whole cluster
     }
     }

     // This is sum distance between cross product of members
 //    return a.sum();
     if (a.samples() > 1)
         return a.stddev() * a.samples();
     else
         return 0.0;
 }

 // How far unit i lies from the rest of this cluster: the absolute
 // difference between its mean distance to the members and the mean of
 // the cluster statistics, expressed in standard deviations.  A unit
 // whose mean distance equals the cluster mean scores 0.0.
 float WImpurity::cluster_distance(int i)
 {
     const float dist = cluster_member_mean(i);
     const float offset = dist-a.mean();

     if (offset != 0.0)
         return fabs((dist-a.mean())/a.stddev());

     return 0.0;
 }

 // Would unit i fit in this cluster?  Returns 1 as soon as some member
 // has a larger mean distance to the cluster than unit i does, and 0
 // when no member has.
 int WImpurity::in_cluster(int i)
 {
     const float dist = cluster_member_mean(i);
     EST_Litem *pp = members.head();

     while (pp != 0)
     {
         if (cluster_member_mean(members.item(pp)) > dist)
             return 1;
         pp = pp->next();
     }

     return 0;
 }

 // Rank of unit i by closeness to the centre of the cluster: 1 plus the
 // number of members whose cluster distance is no greater than that of
 // unit i.
 float WImpurity::cluster_ranking(int i)
 {
     const float dist = cluster_distance(i);
     int ranking = 1;

     for (EST_Litem *pp = members.head(); pp != 0; pp = pp->next())
         if (cluster_distance(members.item(pp)) <= dist)
             ranking++;

     return ranking;
 }

 // Mean distance between unit i and every member of the cluster other
 // than i itself, read from wgn_DistMatrix with the larger id as the
 // row.  Returns 0.0 when there is no other member.
 float WImpurity::cluster_member_mean(int i)
 {
     double total = 0.0;
     int others = 0;

     for (EST_Litem *q = members.head(); q != 0; q = q->next())
     {
         const int j = members.item(q);
         if (j == i)
             continue;

         const double dist = (j < i ? wgn_DistMatrix(i,j) : wgn_DistMatrix(j,i));
         total += dist;
         others++;
     }

     if (others == 0)
         return 0.0;
     return total/others;
 }

 // Add one observation to this impurity.  The dataset type of the
 // predictee field decides both the impurity type that gets set and
 // where the observation goes: cluster, ols, vector and trajectory
 // predictees append pv (as an int) to the member list (vectors also
 // record the count); class predictees add to the distribution; binary
 // and float predictees add to the statistics.  Any other type is
 // reported through wagon_error().
 void WImpurity::cumulate(const float pv,double count)
 {
     const int ptype = wgn_dataset.ftype(wgn_predictee);

     if (ptype == wndt_cluster)
     {
         t = wnim_cluster;
         members.append(static_cast<int>(pv));
         return;
     }
     if (ptype == wndt_ols)
     {
         t = wnim_ols;
         members.append(static_cast<int>(pv));
         return;
     }
     if (ptype == wndt_vector)
     {
         t = wnim_vector;
         // AUP: counts are kept alongside the members for vectors
         members.append(static_cast<int>(pv));
         member_counts.append(static_cast<float>(count));
         return;
     }
     if (ptype == wndt_trajectory)
     {
         t = wnim_trajectory;
         members.append(static_cast<int>(pv));
         return;
     }
     if (ptype >= wndt_class)
     {
         // The distribution is tied to its discrete on first use only
         if (t == wnim_unset)
             p.init(&wgn_discretes[ptype]);
         t = wnim_class;
         p.cumulate(static_cast<int>(pv),count);
         return;
     }
     if (ptype == wndt_binary)
     {
         t = wnim_float;
         a.cumulate(static_cast<int>(pv),count);
         return;
     }
     if (ptype == wndt_float)
     {
         t = wnim_float;
         a.cumulate(pv,count);
         return;
     }

     wagon_error("WImpurity: cannot cumulate EST_Val type");
 }

 // Print the impurity `imp` as a bracketed expression whose layout
 // depends on the impurity type:
 //   float      (stddev mean)
 //   vector     (((v sd) ...) mean)  one pair per wgn_VertexTrack channel
 //   trajectory (((...)...) mean)    one line of (mean stddev) pairs per
 //                                   trajectory point
 //   cluster    (((member mean-distance) ...) mean)
 //   ols        (((name coeff) ...) correlation)
 //   class      ((name prob) ... most-probable-name)
 //   otherwise  ([WImpurity unset])
 //
 // imp is not const: the vector and trajectory cases call the impurity
 // calculations, which update imp, and the ols case fits a model and may
 // print to stdout.
 ostream & operator <<(ostream &s, WImpurity &imp)
 {
     ssize_t j,i;
     EST_SuffStats b;

     if (imp.t == wnim_float)
     s << "(" << imp.a.stddev() << " " << imp.a.mean() << ")";
     else if (imp.t == wnim_vector)
     {
       EST_Litem *p, *countp;
     s << "((";
         // Called for its side effect: it refills imp.a, whose mean is
         // printed at the end of this branch
         imp.vector_impurity();
         if (wgn_vertex_output == "mean")  //output means
         {
             for (j=0; j<wgn_VertexTrack.num_channels(); j++)
             {
                 b.reset();
                 for (p=imp.members.head(), countp=imp.member_counts.head(); p != 0; p=p->next(), countp=countp->next())
                 {
           // Accumulate the members with their counts
           b.cumulate(wgn_VertexTrack.a((ssize_t)imp.members.item(p),j), imp.member_counts.item(countp));
           //b += wgn_VertexTrack.a(imp.members.item(p),j);
                 }
                 s << "(" << b.mean() << " ";
                 // A non-finite stddev is replaced by a fixed 0.001
                 if (isfinite(b.stddev()))
                     s << b.stddev() << ")";
                 else
                     s << "0.001" << ")";
                 if (j+1<wgn_VertexTrack.num_channels())
                     s << " ";
             }
         }
         else /* output best in the cluster */
         {
             /* print out vector closest to center, rather than average */
             double best = WGN_HUGE_VAL;
             double x,d;
             ssize_t bestp = 0;
             EST_SuffStats *cs;

             cs = new EST_SuffStats [wgn_VertexTrack.num_channels()+1];

             // Per-channel statistics over all members (unweighted), for
             // the channels selected by row 0 of wgn_VertexFeats
             for (j=0; j<wgn_VertexFeats.num_channels(); j++)
                 if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                 {
                     cs[j].reset();
                     for (p=imp.members.head(); p != 0; p=p->next())
                     {
                         cs[j] += wgn_VertexTrack.a((ssize_t)imp.members.item(p),j);
                     }
                 }

             // Pick the member with the smallest squared distance from
             // the per-channel means
             for (p=imp.members.head(); p != 0; p=p->next())
             {
                 for (x=0.0,j=0; j<wgn_VertexFeats.num_channels(); j++)
                     if (wgn_VertexFeats.a(static_cast<ssize_t>(0),j) > 0.0)
                     {
                         d = (wgn_VertexTrack.a((ssize_t)imp.members.item(p),j)-cs[j].mean())
                             /* / cs[j].stddev() */ ; /* seems worse 061218 */
                         x += d*d;
                     }
                 if (x < best)
                 {
                     bestp = imp.members.item(p);
                     best = x;
                 }
             }
             // Output the chosen member's values with the cluster stddevs
             for (j=0; j<wgn_VertexTrack.num_channels(); j++)
             {
                 s << "( ";
                 s << wgn_VertexTrack.a(bestp,j);
                 //                s << " 0 "; // fake stddev
                 s << " ";
                 if (isfinite(cs[j].stddev()))
                     s << cs[j].stddev();
                 else
                     s << "0";
                 s << " ) ";
                 if (j+1<wgn_VertexTrack.num_channels())
                     s << " ";
             }

             delete [] cs;
         }
     s << ") ";
     s << imp.a.mean() << ")";
     }
     else if (imp.t == wnim_trajectory)
     {
     s << "((";
         // Makes sure imp.trajectory and imp.l have been computed
         imp.trajectory_impurity();
         for (i=0; i<imp.l; i++)
         {
             s << "(";
             for (j=0; j<wgn_VertexTrack.num_channels(); j++)
             {
                 s << "(" << imp.trajectory[i][j].mean() << " "
                   << imp.trajectory[i][j].stddev() << " " << ")";
             }
             s << ")\n";
         }
     s << ") ";
     // Trailing value is the mean of imp.a.
     // NOTE(review): trajectory_impurity() stores its result in score
     // and does not update a -- confirm this is the intended value
     s << imp.a.mean() << ")";
     }
     else if (imp.t == wnim_cluster)
     {
     EST_Litem *p;
     s << "((";
     for (p=imp.members.head(); p != 0; p=p->next())
     {
         // Output cluster member and its mean distance to others
         s << "(" << imp.members.item(p) << " " <<
         imp.cluster_member_mean(imp.members.item(p)) << ")";
         if (p->next() != 0)
         s << " ";
     }
     s << ") ";
     // Mean of cross product of distances (cluster score)
     s << imp.a.mean() << ")";
     }
     else if (imp.t == wnim_ols)
     {
         /* Output intercept, feature names and coefficients for ols model */
         EST_FMatrix X,Y;
         EST_IVector included;
         EST_FMatrix coeffs;
         EST_StrList feat_names;
         EST_FMatrix coeffsl;
         EST_FMatrix pred;
         float cor=0.0,rmse;

         s << "((";
         // Load the sample members into matrices for ols
         part_to_ols_data(X,Y,included,feat_names,imp.members,*(imp.data));
         if (!robust_ols(X,Y,included,coeffsl))
         {
             printf("no robust ols\n");
             // shouldn't happen
         }
         else
         {
             ols_apply(X,coeffsl,pred);
             ols_test(Y,pred,cor,rmse);
             for (i=0; i<coeffsl.num_rows(); i++)
             {
                 s << "(";
                 s << feat_names.nth(i);
                 s << " ";
                 s << coeffsl[i];
                 s << ") ";
             }
         }

     // Trailing value is the correlation from ols_test(), or the initial
     // 0.0 when no model could be fitted
     s << ") " << cor << ")";
     }
     else if (imp.t == wnim_class)
     {
     // Note: this `i` hides the ssize_t i declared at the top
     EST_Litem *i;
     EST_String name;
     double prob;

     s << "(";
     for (i=imp.p.item_start(); !imp.p.item_end(i); i=imp.p.item_next(i))
     {
         imp.p.item_prob(i,name,prob);
         s << "(" << name << " " << prob << ") ";
     }
     s << imp.p.most_probable(&prob) << ")";
     }
     else
     s << "([WImpurity unset])";

     return s;
 }


wnop_matches
Definition: EST_Wagon.h:97

ols_test
int ols_test(const EST_FMatrix &real, const EST_FMatrix &predicted, float &correlation, float &rmse)
Definition: EST_ols.cc:288

WDataSet::width
int width(void) const
Definition: EST_Wagon.h:94

EST_Val::Int
int Int(void) const
Definition: EST_Val.h:141

wnim_vector
Definition: EST_Wagon.h:140

wgn_count_field
int wgn_count_field
Definition: wagon.cc:71

WImpurity::data
const WVectorVector * data
Definition: EST_Wagon.h:159

wgn_vertex_output
EST_String wgn_vertex_output
Definition: wagon.cc:78

wgn_score_question
float wgn_score_question(WQuestion &q, WVectorVector &ds)
Definition: wagon.cc:1091

EST_DiscreteProbDistribution::item_next
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:388

wnop_in
Definition: EST_Wagon.h:97

wnim_float
Definition: EST_Wagon.h:139

robust_ols
int robust_ols(const EST_FMatrix &X, const EST_FMatrix &Y, EST_FMatrix &coeffs)
Definition: EST_ols.cc:73

EST_DiscreteProbDistribution::most_probable
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
Definition: EST_DProbDist.cc:184

wgn_dataset
WDataSet wgn_dataset
Definition: wagon.cc:59

wnim_cluster
Definition: EST_Wagon.h:140

WNode::predict
EST_Val predict(const WVector &w)
Definition: wagon_aux.cc:51

EST_SuffStats::stddev
double stddev(void) const
standard deviation of currently cumulated values
Definition: EST_simplestats.h:168

wndt_float
Definition: EST_Wagon.h:71

wnim_ols
Definition: EST_Wagon.h:140

EST_UItem
Definition: EST_UList.h:49

EST_TMatrix::num_columns
ssize_t num_columns() const
return number of columns
Definition: EST_TMatrix.h:179

WImpurity::cluster_distance
float cluster_distance(int i)
Definition: wagon_aux.cc:848

EST_SuffStats::cumulate
void cumulate(double a, double count=1.0)
Definition: EST_simplestats.h:170

wgn_predictee
int wgn_predictee
Definition: wagon.cc:73

EST_Regex
A Regular expression class to go with the CSTR EST_String class.
Definition: EST_Regex.h:56

left
STATIC void left(STATUS Change)
Definition: editline.c:523

wgn_find_split
void wgn_find_split(WQuestion &q, WVectorVector &ds, WVectorVector &y, WVectorVector &n)
Definition: wagon.cc:775

WQuestion::ask
int ask(const WVector &w) const
Definition: wagon_aux.cc:263

WImpurity::measure
float measure(void)
Definition: wagon_aux.cc:424

wgn_discretes
Discretes wgn_discretes
Definition: wagon.cc:57

EST_Track::num_channels
int num_channels() const
return number of channels in track
Definition: EST_Track.h:657

EST_SuffStats::mean
double mean(void) const
mean of currently cumulated values
Definition: EST_simplestats.h:163

EST_Wagon.h

NIL
#define NIL
Definition: siod_defs.h:92

right
STATIC void right(STATUS Change)
Definition: editline.c:538

siod_llength
int siod_llength(LISP list)
Definition: siod.cc:202

std

WQuestion::get_op
wn_oper get_op(void) const
Definition: EST_Wagon.h:130

EST_TVector::a_no_check
INLINE const T & a_no_check(ssize_t n) const
read-only const access operator: without bounds checking
Definition: EST_TVector.h:254

EST_math.h

wgn_UnitTrack
EST_Track wgn_UnitTrack
Definition: wagon.cc:64

itoString
EST_String itoString(int n)
Make an EST_String object from an integer.
Definition: util_io.cc:141

EST_TMatrix::num_rows
ssize_t num_rows() const
return number of rows
Definition: EST_TMatrix.h:177

EST_TList::nth
T & nth(int n)
return the Nth value
Definition: EST_TList.h:145

WImpurity::trajectory
EST_SuffStats ** trajectory
Definition: EST_Wagon.h:158

operator<<
ostream & operator<<(ostream &s, WNode &n)
Definition: wagon_aux.cc:153

ssize_t
int ssize_t
Definition: EST_socket_win32.h:48

streq
#define streq(X, Y)
Definition: EST_cutils.h:57

EST_DiscreteProbDistribution::item_start
EST_Litem * item_start() const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:372

WImpurity::members
EST_IList members
Definition: EST_Wagon.h:156

EST_DiscreteProbDistribution::item_end
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
Definition: EST_DProbDist.cc:380

Discretes::def
int def(const EST_StrList &members)
Definition: EST_Discrete.cc:160

WImpurity::member_counts
EST_FList member_counts
Definition: EST_Wagon.h:157

WNode::held_out_prune
void held_out_prune(void)
Definition: wagon_aux.cc:107

wndt_ols
Definition: EST_Wagon.h:74

EST_unix.h

EST_UItem::next
EST_UItem * next()
Definition: EST_UList.h:55

WImpurity::in_cluster
int in_cluster(int i)
Definition: wagon_aux.cc:862

WQuestion::get_fp
int get_fp(void) const
Definition: EST_Wagon.h:129

WNode::predict_node
WNode * predict_node(const WVector &d)
Definition: wagon_aux.cc:61

vload
LISP vload(const char *fname, long cflag)
Definition: slib_file.cc:632

EST_cutils.h

wnop_lessthan
Definition: EST_Wagon.h:97

siod_list_to_strlist
void siod_list_to_strlist(LISP l, EST_StrList &a)
Definition: siod.cc:520

get_c_string
const char * get_c_string(LISP x)
Definition: slib.cc:638

wnop_greaterthan
Definition: EST_Wagon.h:96

wndt_class
Definition: EST_Wagon.h:71

WImpurity::cumulate
void cumulate(const float pv, double count=1.0)
Definition: wagon_aux.cc:915

EST_Track::a
float & a(ssize_t i, int c=0)
Definition: EST_Track.cc:1025

WImpurity::WImpurity
WImpurity()
Definition: EST_Wagon.h:163

EST_TSimpleVector< int >

l2
#define l2

wnop_is
Definition: EST_Wagon.h:97

ols_apply
int ols_apply(const EST_FMatrix &samples, const EST_FMatrix &coeffs, EST_FMatrix &res)
Definition: EST_ols.cc:185

WDataSet::feat_name
const EST_String & feat_name(const int &i) const
Definition: EST_Wagon.h:92

WImpurity::l
int l
Definition: EST_Wagon.h:161

FALSE
#define FALSE
Definition: EST_bool.h:119

WNode::prune
void prune(void)
Definition: wagon_aux.cc:83

WNode
Definition: EST_Wagon.h:225

WImpurity::value
EST_Val value(void)
Definition: wagon_aux.cc:361

EST_DiscreteProbDistribution::item_prob
void item_prob(EST_Litem *idx, EST_String &s, double &prob) const
During iteration returns name and probability given index.
Definition: EST_DProbDist.cc:418

wgn_count_field_name
EST_String wgn_count_field_name
Definition: wagon.cc:72

wndt_trajectory
Definition: EST_Wagon.h:73

mean
float mean(EST_FVector &m)
Definition: EST_multistats.cc:44

EST_Val
Definition: EST_Val.h:75

wgn_DistMatrix
EST_FMatrix wgn_DistMatrix
Definition: wagon.cc:61

wagon_error
#define wagon_error(WMESS)
Definition: EST_Wagon.h:50

EST_Val::string
const EST_String & string(void) const
Definition: EST_Val.h:161

EST_String::matches
int matches(const char *e, ssize_t pos=0) const
Exactly match this string?
Definition: EST_String.cc:651

int
getString int
Definition: EST_item_aux.cc:50

wgn_predictee_name
EST_String wgn_predictee_name
Definition: wagon.cc:74

EST_TList::append
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196

WImpurity
Definition: EST_Wagon.h:144

EST_SuffStats::reset
void reset(void)
reset internal values
Definition: EST_simplestats.h:153

EST_UList::length
int length() const
Definition: EST_UList.cc:57

wndt_ignore
Definition: EST_Wagon.h:76

WDataSet::ignore_non_numbers
void ignore_non_numbers()
Definition: wagon_aux.cc:162

wgn_VertexTrack
EST_Track wgn_VertexTrack
Definition: wagon.cc:62

WQuestion::get_operandl
const EST_IList & get_operandl(void) const
Definition: EST_Wagon.h:132

EST_SuffStats::samples
double samples(void)
number of samples in set
Definition: EST_simplestats.h:157

WQuestion::get_operand1
const EST_Val get_operand1(void) const
Definition: EST_Wagon.h:131

wnop_equal
Definition: EST_Wagon.h:96

wnop_binary
Definition: EST_Wagon.h:96

WImpurity::~WImpurity
~WImpurity()
Definition: wagon_aux.cc:547

X
#define X

wnim_matrix
Definition: EST_Wagon.h:140

EST_TVector< WVector * >

wndt_vector
Definition: EST_Wagon.h:73

EST_TList::item
T & item(const EST_Litem *p)
Definition: EST_TList.h:139

ilist_member
int ilist_member(const EST_IList &l, int i)
Definition: EST_ilist_aux.cc:53

wndt_binary
Definition: EST_Wagon.h:71

WVector::get_int_val
int get_int_val(int n) const
Definition: EST_Wagon.h:60

wnim_class
Definition: EST_Wagon.h:139

EST_UList::head
EST_UItem * head() const
Definition: EST_UList.h:97

EST_SuffStats
Definition: EST_simplestats.h:136

WVector::get_flt_val
float get_flt_val(int n) const
Definition: EST_Wagon.h:61

wndt_matrix
Definition: EST_Wagon.h:73

EST_TMatrix::a_no_check
INLINE const T & a_no_check(ssize_t row, ssize_t col) const
const access with no bounds check; care recommended
Definition: EST_TMatrix.h:182

l1
#define l1

EST_TSimpleMatrix::resize
void resize(int rows, int cols, int set=1)
resize matrix
Definition: EST_TSimpleMatrix.cc:83

car
LISP car(LISP x)
Definition: slib_list.cc:115

wgn_VertexFeats
EST_Track wgn_VertexFeats
Definition: wagon.cc:63

WGN_HUGE_VAL
#define WGN_HUGE_VAL
Definition: EST_Wagon.h:54

wnim_trajectory
Definition: EST_Wagon.h:141

EST_String
EST_String
Definition: EST_features_aux.cc:50

WDataSet::ftype
int ftype(const int &i) const
Definition: EST_Wagon.h:89

quote_string
EST_String quote_string(const EST_String &s, const EST_String &quote, const EST_String &escape, int force)
Definition: EST_Token.cc:844

sum
float sum(const EST_FMatrix &a)
sum of elements
Definition: vec_mat_aux.cc:147

TRUE
#define TRUE
Definition: EST_bool.h:118

EST_TVector::n
INLINE ssize_t n() const
number of items in vector.
Definition: EST_TVector.h:251

siod_member_str
LISP siod_member_str(const char *key, LISP list)
Definition: siod.cc:167

EST_Token.h

WDataSet::load_description
void load_description(const EST_String &descfname, LISP ignores)
Definition: wagon_aux.cc:181

WQuestion
Definition: EST_Wagon.h:99

WImpurity::cluster_ranking
float cluster_ranking(int i)
Definition: wagon_aux.cc:877

EST_TSimpleVector::resize
void resize(int n, int set=1)
resize vector
Definition: EST_TSimpleVector.cc:67

wnim_unset
Definition: EST_Wagon.h:139

cdr
LISP cdr(LISP x)
Definition: slib_list.cc:124

wndt_cluster
Definition: EST_Wagon.h:73

EST_FMatrix
Definition: EST_FMatrix.h:59

WImpurity::samples
double samples(void)
Definition: wagon_aux.cc:386

WVector
Definition: EST_Wagon.h:56

EST_String
Definition: EST_String.h:76

EST_Val::Float
float Float(void) const
Definition: EST_Val.h:149

EST_TList
Definition: EST_TList.h:61