Edinburgh Speech Tools  2.1-release
dlist.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Decision lists */
37 /* These are like a right branching decision tree, yes answers are not */
38 /* partitioned any further. These are used for the Yarowsky-style */
39 /* homograph disambiguators */
40 /* */
41 /* Yarowsky, D. "Homograph Disambiguation in Text-to_Speech Synthesis" */
42 /* in "Progress in Speech Synthesis" eds van Santen, J. et al. Springer */
43 /* pp 157-172, 1996 */
44 /* Rivest, R.L. "Learning decision lists", Machine Learning 2:229-246 */
45 /* 1987 */
46 /* */
47 /*=======================================================================*/
48 #include <iostream>
49 #include <cstring>
50 #include "EST_Wagon.h"
51 #include "EST_FMatrix.h"
52 #include "EST_multistats.h"
53 
54 using namespace std;
55 
56 static WDlist *wagon_decision_list();
57 static void do_dlist_summary(WDlist *dlist, WDataSet &dataset);
58 static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds);
59 
60 WNode *wgn_build_dlist(float &score,ostream *output)
61 {
62  WDlist *dlist;
63 
64  dlist = wagon_decision_list();
65 
66  *output << *dlist;
67 
68  if (wgn_test_dataset.width() > 0) // load in test data
69  do_dlist_summary(dlist,wgn_test_dataset);
70  else
71  do_dlist_summary(dlist,wgn_dataset);
72 
73  score = 0;
74  return 0;
75 }
76 
77 static void do_dlist_summary(WDlist *dlist, WDataSet &dataset)
78 {
79  // Test dlist against data to get summary of results
80  EST_StrStr_KVL pairs;
81  EST_StrList lex;
82  EST_Litem *p;
83  EST_String predict,real;
84  int i,type;
85 
86  for (p=dataset.head(); p != 0; p=p->next())
87  {
88  predict = (EST_String)dlist->predict(*dataset(p));
89  type = dataset.ftype(0);
90  real = wgn_discretes[type].name(dataset(p)->get_int_val(0));
91  pairs.add_item(real,predict,1);
92  }
93  for (i=0; i<wgn_discretes[dataset.ftype(0)].length(); i++)
94  lex.append(wgn_discretes[dataset.ftype(0)].name(i));
95 
96  const EST_FMatrix &m = confusion(pairs,lex);
97  print_confusion(m,pairs,lex);
98 
99 }
100 
101 static WDlist *wagon_decision_list()
102 {
103  // For each possible question, measure it using
104  // yarowsky's loglikelihood ratio
105  // Abs(log(P(class_1|question_j)/P(class_2|question_j)))
106  // Output it for external sorting
107  // Hmm this implies binary classes
108  int i,cl;
109  WQuestion ques;
110  WDlist *dlist, *d;
111 
112  dlist=0;
113  for (i=1;i < wgn_dataset.width(); i++)
114  {
115  if (wgn_dataset.ftype(i) == wndt_ignore)
116  continue;
117  else if (wgn_dataset.ftype(i) >= wndt_class)
118  {
119  ques.set_fp(i);
120  ques.set_oper(wnop_is);
121  for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(i)].length(); cl++)
122  {
123  ques.set_operand1(EST_Val(cl));
124  d = dlist_score_question(ques,wgn_dataset);
125  if (d != 0)
126  dlist = add_to_dlist(dlist,d);
127  }
128  }
129  }
130 
131  return dlist;
132 }
133 
134 static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds)
135 {
136  // score this question for decision lists
137  // the sum of the impurities when ds is split with this question
138  WImpurity y;
139  EST_Litem *d;
140  WVector *wv;
141  int i;
142 
143  for (i=0,d=ds.head(); d != 0; d=d->next(),i++)
144  {
145  wv = ds(d);
146  if (q.ask(*wv) == TRUE)
147  y.cumulate((*wv)[0]);
148  }
149 
150  if (y.samples() > wgn_min_cluster_size)
151  {
152  q.set_yes((int)y.samples());
153 
155 
156  // Generalizing the formula in Yarowsky (pp157-172) we modify it
157  // to take absolute log-likelihood ration of the most probability
158  // of the most probable over the rest
159  EST_String t;
160  double p;
161  double f;
162  WDlist *n = new WDlist;
163  n->set_question(q);
164 
165  t = pd.most_probable(&p);
166  f = pd.frequency(t);
167  n->set_score(fabs(log((f+0.0001)/(pd.samples()-f+0.0001))));
168  n->set_best(t,(int)f,(int)pd.samples());
169 
170 #if 0 // original two-case code
171  int freqa, freqb;
172  pd.item_freq(0,s,freqa);
173  pd.item_freq(1,s,freqb);
174 
175  n->set_score(fabs(log((0.0001+freqa)/(0.0001+freqb))));
176  n->set_freqs(freqa,freqb);
177 #endif
178  return n;
179  }
180 
181  return 0;
182 }
183 
185 {
186  if (p_question.ask(d))
187  return p_token;
188  else if (next == 0)
189  return "guess"; // should be a priori most probable as dlist can't help
190  else
191  return next->predict(d);
192 }
193 
195 {
196  // Add a to l at appropriate place in score order
197  WDlist *p,*lp;
198 
199  if (l == 0)
200  return a;
201  else
202  {
203  for (lp=0,p=l; p != 0; lp=p,p=p->next)
204  {
205  if (a->score() > p->score())
206  {
207  a->next = p;
208  if (lp == 0)
209  return a;
210  else
211  {
212  lp->next = a;
213  return l;
214  }
215  }
216  }
217  // add to end
218  lp->next = a;
219  }
220  return l;
221 }
222 
223 ostream &operator <<(ostream &s, WDlist &dlist)
224 {
225  s << endl;
226  s << "(";
227  s << dlist.p_question;
228  s << " ((";
229  s << dlist.p_score;
230  s << " " << dlist.p_freq << " " << dlist.p_samples <<
231  " " << dlist.p_token << "))";
232  if (dlist.next != 0)
233  s << *dlist.next;
234  else
235  s << endl;
236  s << ")";
237 
238  return s;
239 }
int width(void) const
Definition: EST_Wagon.h:94
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
WDataSet wgn_dataset
Definition: wagon.cc:59
void set_fp(const int &fp)
Definition: EST_Wagon.h:122
double samples(void) const
Total number of example found.
ostream & operator<<(ostream &s, WDlist &dlist)
Definition: dlist.cc:223
int ask(const WVector &w) const
Definition: wagon_aux.cc:263
Discretes wgn_discretes
Definition: wagon.cc:57
int wgn_min_cluster_size
Definition: wagon.cc:66
EST_UItem * next()
Definition: EST_UList.h:55
EST_Val predict(const WVector &w)
Definition: dlist.cc:184
void print_confusion(const EST_FMatrix &a, EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:77
double frequency(const EST_String &s) const
void cumulate(const float pv, double count=1.0)
Definition: wagon_aux.cc:915
void set_operand1(const EST_Val &a)
Definition: EST_Wagon.h:124
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
void set_yes(const int &y)
Definition: EST_Wagon.h:125
EST_FMatrix confusion(EST_StrStr_KVL &list, EST_StrList &lex)
Definition: confusion.cc:59
WNode * wgn_build_dlist(float &score, ostream *output)
Definition: dlist.cc:60
float score() const
Definition: EST_Wagon.h:217
f
Definition: EST_item_aux.cc:48
WDlist * add_to_dlist(WDlist *l, WDlist *a)
Definition: dlist.cc:194
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
int add_item(const K &rkey, const V &rval, int no_search=0)
add key-val pair to list
Definition: EST_TKVL.cc:248
void set_question(const WQuestion &q)
Definition: EST_Wagon.h:214
EST_DiscreteProbDistribution & pd()
Definition: EST_Wagon.h:190
EST_UItem * head() const
Definition: EST_UList.h:97
void set_oper(const wn_oper &o)
Definition: EST_Wagon.h:123
EST_String
int ftype(const int &i) const
Definition: EST_Wagon.h:89
#define TRUE
Definition: EST_bool.h:118
void set_best(const EST_String &t, int freq, int samples)
Definition: EST_Wagon.h:215
WDataSet wgn_test_dataset
Definition: wagon.cc:60
void set_score(float s)
Definition: EST_Wagon.h:213
double samples(void)
Definition: wagon_aux.cc:386