Edinburgh Speech Tools  2.1-release
EST_DProbDist.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : July 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Simple statistics (for discrete probability distributions */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <fstream>
42 #include <cstdlib>
43 #include <cstdio>
44 #include <cstring>
45 #include "EST_String.h"
46 #include "EST_TKVL.h"
47 #include "EST_simplestats.h"
48 
49 using namespace std;
50 
51 /* We share ints and pointers for two types of probability distributions */
52 /* The know discrete sets can be indexed by ints which is *much* faster */
53 /* the indices pass around a pointers but the lower part contain ints in */
54 /* the discrete case */
55 /* On 64bit architectures this is a issue so we need have some macros */
56 /* to help us here. */
57 
58 int est_64to32(void *c)
59 { /* this returns the bottom end of the pointer as an unsigned int */
60  /* I believe this is a safe way to do it, we check the bits in the */
61  /* 64 bit int and multiply them out in the 32 bit one */
62  /* there might be better ways, but I think you'd need to think about */
63  /* byte order then */
64  long long l;
65  int d;
66  int i,x;
67 
68  l = (long long)c;
69 
70  for (i=0,d=0,x=1; i<24; i++)
71  {
72  if (l & 1)
73  d += x;
74  l = l >> 1;
75  x += x;
76  }
77 
78  return d;
79 }
80 /* #define tprob_int(X) ((sizeof(void *) != 8) ? est_64to32(X) : (int)X) */
81 #define tprob_int(X) (est_64to32(X))
82 
83 
85  const double n_samples, const EST_DVector &counts)
86 {
87  type = tprob_discrete;
88  discrete = d;
89  num_samples = n_samples;
90 
91  icounts = counts;
92 }
93 
95 {
96  copy(b);
97 }
98 
100 {
101  type = b.type;
102  num_samples = b.num_samples;
103  discrete = b.discrete;
104  icounts = b.icounts;
105  scounts = b.scounts;
106 }
107 
109 {
110  icounts.resize(0);
111 }
112 
114 {
115  type = tprob_string;
116  num_samples = 0;
117  discrete = 0;
118 }
119 
121 {
122  int i;
123  clear();
124  type = tprob_discrete;
125  num_samples = 0;
126  discrete = new EST_Discrete(vocab);
127 
128  icounts.resize(vocab.length());
129  for (i=0; i<icounts.length(); i++)
130  icounts.a_no_check(i) = 0;
131 
132  return true;
133 }
134 
136 {
137  int i;
138  clear();
139  type = tprob_discrete;
140  num_samples = 0;
141  discrete = d;
142  icounts.resize(d->length());
143  for (i=0; i<icounts.length(); i++)
144  icounts.a_no_check(i) = 0;
145 }
146 
148 {
149  icounts[tprob_int(i)] += count;
150  num_samples += count;
151 }
152 
154 {
155  icounts[i] += count;
156  num_samples += count;
157 }
158 
160 {
161  EST_Litem *p;
162 
163  if (type == tprob_discrete)
164  {
165  int idx = discrete->index(s);
166  icounts[idx] += count;
167  }
168  else // its a (slow) string type
169  {
170  for (p=scounts.list.head(); p != 0; p=p->next())
171  {
172  if (scounts.list(p).k == s)
173  {
174  scounts.list(p).v += count;
175  break;
176  }
177  }
178  if (p == 0) // first occurrence
179  scounts.add_item(s,count,1); // add without search
180  }
181  num_samples += count;
182 }
183 
185 {
186  EST_Litem *p,*t;
187  double max = 0;
188 
189  if (type == tprob_discrete)
190  {
191  int i,pt=-1;
192  for (i=0; i < icounts.length(); i++)
193  if (icounts.a_no_check(i) > max)
194  {
195  pt = i;
196  max = icounts.a_no_check(i);
197  }
198  if (max == 0)
199  {
200  if(prob != NULL)
201  *prob = 0.0;
202  return EST_String::Empty;
203  }
204  else
205  {
206  if(prob != NULL)
207  *prob = probability(pt);
208  return discrete->name(pt);
209  }
210  }
211  else
212  {
213  t = 0;
214  for (p=scounts.list.head(); p != 0; p=p->next())
215  if (scounts.list(p).v > max)
216  {
217  t = p;
218  max = scounts.list(p).v;
219  }
220  if (max == 0)
221  {
222  if(prob != NULL)
223  *prob = 0.0;
224  return EST_String::Empty;
225  }
226  else
227  {
228  if(prob != NULL)
229  *prob = (double)max/num_samples;
230  return scounts.list(t).k;
231  }
232  }
233 }
234 
236 {
237  if (frequency(s) == 0.0)
238  return 0.0;
239  else
240  return (double)frequency(s)/num_samples;
241 }
242 
244 {
245  if (frequency(i) == 0.0)
246  return 0.0;
247  else
248  return (double)frequency(i)/num_samples;
249 }
250 
252 {
253  if (type == tprob_discrete)
254  return icounts.a_no_check(discrete->index(s));
255  else
256  return scounts.val_def(s,0);
257 }
258 
260 {
261  if (type == tprob_discrete)
262  return icounts(i);
263  else
264  {
265  cerr << "ProbDistribution: can't access string type pd with int\n";
266  return 0;
267  }
268 }
269 
271 {
272  if (type == tprob_discrete)
273  {
274  num_samples -= icounts.a_no_check(discrete->index(s));
275  num_samples += c;
276  icounts.a_no_check(discrete->index(s)) = c;
277  }
278  else
279  {
280  num_samples -= scounts.val_def(s,0);
281  num_samples += c;
282  scounts.add_item(s,c);
283  }
284 }
285 
287 {
288  if (type == tprob_discrete)
289  {
290  num_samples -= icounts[i];
291  num_samples += c;
292  icounts[i] = c;
293  }
294  else
295  {
296  cerr << "ProbDistribution: can't access string type pd with int\n";
297  }
298 
299 }
300 
302 {
303  if (type == tprob_discrete)
304  {
305  num_samples -= icounts[tprob_int(i)];
306  num_samples += c;
307  icounts[tprob_int(i)] = c;
308  }
309  else
310  {
311  cerr << "ProbDistribution: can't access string type pd with int\n";
312  }
313 
314 }
315 
316 
318 {
319  if (type == tprob_discrete)
320  icounts.a_no_check(discrete->index(s)) = c;
321  else
322  scounts.add_item(s,c);
323 }
324 
326 {
327  if (type == tprob_discrete)
328  icounts[i] = c;
329  else
330  cerr << "ProbDistribution: can't access string type pd with int\n";
331 }
332 
334 {
335  if (type == tprob_discrete)
336  icounts[tprob_int(i)] = c;
337  else
338  cerr << "ProbDistribution: can't access string type pd with int\n";
339 }
340 
342 {
343  // Returns the entropy of the current distribution
344  double e=0.0;
345  EST_Litem *p;
346  int i;
347 
348  if (type == tprob_discrete)
349  {
350  for (i=0; i < icounts.length(); i++)
351  {
352  double prob = icounts.a_no_check(i)/num_samples;
353  if (prob != 0.0)
354  e += prob * log(prob); /* log10(prob)/log10(2) */
355  }
356  }
357  else
358  {
359  for (p=scounts.list.head(); p != 0; p=p->next())
360  {
361  double prob = scounts.list(p).v/num_samples;
362  if (prob != 0.0)
363  e += prob * log(prob); /* log10(prob)/log10(2) */
364  }
365  }
366 
367  return -e;
368 
369 }
370 
371 // For iterating through members of a probability distribution
373 {
374  if (type == tprob_discrete)
375  return NULL;
376  else
377  return scounts.list.head();
378 }
379 
381 {
382  if (type == tprob_discrete)
383  return (tprob_int(idx) >= icounts.length());
384  else
385  return (idx == 0);
386 }
387 
389 {
390  if (type == tprob_discrete)
391  return (EST_Litem *)(((unsigned char *)idx)+1);
392  else
393  return idx->next();
394 }
395 
397 {
398  if (type == tprob_discrete)
399  return discrete->name(tprob_int(idx));
400  else
401  return scounts.list(idx).k;
402 }
403 
405 {
406  if (type == tprob_discrete)
407  {
408  s = discrete->name(tprob_int(idx));
409  freq = icounts(tprob_int(idx));
410  }
411  else
412  {
413  s = scounts.list(idx).k;
414  freq = scounts.list(idx).v;
415  }
416 }
417 
419 {
420  if (type == tprob_discrete)
421  {
422  prob = probability(tprob_int(idx));
423  s = discrete->name(tprob_int(idx));
424  }
425  else
426  {
427  s = scounts.list(idx).k;
428  prob = (double)scounts.list(idx).v/num_samples;
429  }
430 }
431 
432 ostream & operator<<(ostream &s, const EST_DiscreteProbDistribution &pd)
433 {
434  // Output best with probabilities
435  EST_Litem *i;
436  double prob;
437  double sum=0;
438  EST_String name;
439 
440  s << "(";
441  for (i=pd.item_start(); !pd.item_end(i); i=pd.item_next(i))
442  {
443  pd.item_prob(i,name,prob);
444  s << "(" << name << "=" << prob << ") ";
445  sum+=prob;
446  }
447  s << "best=" << pd.most_probable(&prob) << " samples="
448  << pd.samples() << " sum=" << sum << ")";
449  return s;
450 }
451 
453 {
454  // I'd much rather this was never called
455  copy(a);
456  return *this;
457 }
458 
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
double samples(void) const
Total number of example found.
int est_64to32(void *c)
int length(void) const
The number of members in the discrete.
EST_Litem * item_start() const
Used for iterating through members of the distribution.
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
double probability(const EST_String &s) const
EST_UItem * next()
Definition: EST_UList.h:55
float max(float a, float b)
Definition: EST_cluster.cc:143
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
double frequency(const EST_String &s) const
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
ostream & operator<<(ostream &s, const EST_DiscreteProbDistribution &pd)
double entropy(void) const
void copy(const EST_DiscreteProbDistribution &b)
Copy all data from another DPD to this.
void item_prob(EST_Litem *idx, EST_String &s, double &prob) const
During iteration returns name and probability given index.
#define tprob_int(X)
const EST_String & item_name(EST_Litem *idx) const
During iteration returns name given index.
EST_TVector< T > & copy(EST_TVector< T > a, const EST_TList< T > &in)
NULL
Definition: EST_WFST.cc:55
void clear(void)
Reset, clearing all counts and vocabulary.
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122
void set_frequency(const EST_String &s, double c)
int length() const
Definition: EST_UList.cc:57
EST_DiscreteProbDistribution & operator=(const EST_DiscreteProbDistribution &a)
void override_frequency(const EST_String &s, double c)
Sets the frequency of named item, without modifying num\_samples.
float sum(const EST_FMatrix &a)
sum of elements
Definition: vec_mat_aux.cc:147
static const EST_String Empty
Constant empty string.
Definition: EST_String.h:110
EST_StrVector vocab
Definition: dp_main.cc:85