58 static LISP *find_state_usage(
EST_WFST &wfst, LISP data);
60 static LISP *find_state_entropies(const
EST_WFST &wfst, LISP *data);
64 static LISP find_best_split(
EST_WFST &wfst,
67 static
double find_score_if_split(
EST_WFST &wfst,
71 static LISP find_split_pdfs(
EST_WFST &wfst,
81 static void split_state(
EST_WFST &wfst, LISP trans_list,
int ostate);
92 if (ts.
open(filename) == -1)
93 EST_error(
"wfst_train: failed to read data from \"%s\"",
94 (
const char *)filename);
107 cerr <<
"wfst_train: data contains unknown symbol \"" <<
113 while (!ts.
eoln() && !ts.
eof());
118 printf(
"wfst_train: loaded %d lines of %d tokens\n",
124 static LISP *find_state_usage(
EST_WFST &wfst, LISP data)
127 LISP *state_data =
new LISP[wfst.
num_states()];
128 static LISP ddd =
NIL;
144 ddd =
cons(state_data[i],ddd);
150 for (i=0,d=data; d; d=
cdr(d),i++)
153 for (w=
car(d); w; w=
cdr(w))
155 state_data[s] =
cons(w,state_data[s]);
160 printf(
"sentence %d not in language, skipping\n",i);
183 sentropy += w * log(w);
185 return -1 * sentropy;
191 LISP *state_entropies;
192 LISP best_trans_list =
NIL;
193 int c=0,i, max_entropy_state;
199 state_data = find_state_usage(wfst,data);
202 state_entropies = find_state_entropies(wfst,state_data);
204 max_entropy_state = -1;
213 best_trans_list = find_best_split(wfst,max_entropy_state,
215 if (best_trans_list !=
NIL)
220 delete [] state_entropies;
222 if (max_entropy_state == -1)
224 printf(
"No new max_entropy state\n");
225 delete [] state_data;
228 if (best_trans_list ==
NIL)
230 printf(
"No best_trans in max_entropy state\n");
231 delete [] state_data;
242 printf(
"c is %d\n",c);
245 printf(
"reached cycle end %d\n",c);
246 delete [] state_data;
250 split_state(wfst, best_trans_list, max_entropy_state);
256 sprintf(bbb,
"%03d",c);
257 wfst.
save(chkpntname+bbb+
".wfst");
260 delete [] state_data;
265 static int me_compare_function(
const void *a,
const void *b)
283 static LISP *find_state_entropies(
const EST_WFST &wfst, LISP *data)
285 double all_entropy = 0;
289 static LISP ddd =
NIL;
298 sentropy = entropy(s);
302 ddd =
cons(slist[i],ddd);
304 printf(
"average entropy is %g\n",all_entropy/i);
311 static LISP find_best_split(
EST_WFST &wfst,
312 int split_state_name,
326 double best_score, score, sfreq;
328 for (dd = data[split_state_name]; dd; dd =
cdr(dd))
330 splits = find_split_pdfs(wfst,split_state_name,data,pdf_all);
334 for (num_pdfs=0,s=splits; s !=
NIL; s=
cdr(s),num_pdfs++)
335 ssplits[num_pdfs] =
car(s);
337 qsort(ssplits,num_pdfs,
sizeof(LISP),me_compare_function);
345 for (b=1; b < num_pdfs; b++)
347 if (ssplits[b] ==
NIL)
349 score = score_pdf_combine(*a_pdf,*pdf(
car(
cdr(
cdr(ssplits[b])))),
351 if (score < best_score)
367 car(
cdr(ssplits[best_b]))));
370 b_pdf = pdf(
car(
cdr(
cdr(ssplits[best_b]))));
371 for (i=b_pdf->item_start(); !b_pdf->item_end(i);
372 i = b_pdf->item_next(i))
374 b_pdf->item_freq(i,sname,sfreq);
375 a_pdf->cumulate(i,sfreq);
377 ssplits[best_b] =
NIL;
383 for (dd=
car(
cdr(ssplits[0])); dd; dd=
cdr(dd))
384 printf(
"%s ",(
const char *)wfst.
in_symbol(trans(
car(dd))->in_symbol()));
423 static LISP find_split_pdfs(
EST_WFST &wfst,
424 int split_state_name,
432 LISP pdfs =
NIL,dd,ttt,p,t;
441 if ((s->
transitions(tp)->state() == split_state_name)
447 for (dd = data[i]; dd; dd =
cdr(dd))
457 value = score_pdf_combine(*pdf,empty,pdf_all);
466 pdfs =
cons(ttt,pdfs);
477 int split_state_name,
483 double best_score,bb;
486 best_score = entropy(split_state)*
siod_llength(data[split_state_name]);
498 bb = find_score_if_split(wfst,i,s->
transitions(tp),data);
515 << best_trans->
weight() <<
" " 516 << best_trans->
state() <<
" " << best_score << endl;
520 static double find_score_if_split(
EST_WFST &wfst,
535 ent_split = ent_remain = 32*32*32*32;
547 for (dd = data[fromstate]; dd; dd =
cdr(dd))
556 if (pdf_split.samples() > 0)
557 ent_split = pdf_split.entropy();
559 tostate = trans->
state();
561 for (dd = data[tostate]; dd; dd =
cdr(dd))
564 for (i=pdf_split.item_start(); !pdf_split.item_end(i);
565 i = pdf_split.item_next(i))
567 pdf_split.item_freq(i,sname,sfreq);
568 pdf_remain.cumulate(i,-1*sfreq);
570 if (pdf_remain.samples() > 0)
571 ent_remain = pdf_remain.entropy();
573 if ((pdf_remain.samples() == 0) ||
574 (pdf_split.samples() == 0))
577 score = (ent_remain * pdf_remain.samples()) +
578 (ent_split * pdf_split.samples());
594 int ostate = trans->
state();
615 static void split_state(
EST_WFST &wfst, LISP trans_list,
int ostate)
625 for (t=trans_list; t; t=
cdr(t))
626 trans(
car(t))->set_state(nstate);
LISP siod(const class EST_Val v)
EST_TokenStream & get(EST_Token &t)
get next token in stream
EST_WFST_Transition * find_best_trans_split(EST_WFST &wfst, int split_state, LISP *data)
void qsort(EST_TList< T > &a)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
#define SIOD_REGISTER_CLASS(NAME, CLASS)
float get_c_float(LISP x)
int add_state(enum wfst_state_type state_type)
Add a new state, returns new name.
an internal class for EST_WFST for representing transitions in an WFST
a call representing a weighted finite-state transducer
double samples(void) const
Total number of example found.
void wfst_train(EST_WFST &wfst, LISP data)
#define VAL_REGISTER_TYPE_NODEL(NAME, CLASS)
long int get_c_int(LISP x)
int siod_llength(LISP list)
LISP append(LISP l1, LISP l2)
void gc_unprotect(LISP *location)
LISP load_string_data(EST_WFST &wfst, EST_String &filename)
#define VAL_REGISTER_CLASS(NAME, CLASS)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
int in_symbol(const EST_String &s) const
Map input symbol to input alphabet index.
void start_cumulate()
Clear and start cumulation.
int open(const EST_String &filename)
open a EST_TokenStream for a file.
void stop_cumulate()
Stop cumulation and calculate probabilities on transitions.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
const EST_Discrete & in_symbols() const
Accessing the input alphabet.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index.
an internal class for EST_WFST used to represent a state in a WFST
double entropy(void) const
LISP cons(LISP x, LISP y)
const EST_WFST_State * state(int i) const
Return internal state information.
EST_write_status save(const EST_String &filename, const EST_String type="ascii")
?
LISP setcar(LISP cell, LISP value)
EST_WFST_Transition * find_transition(int state, int in, int out) const
Find (first) transition given in and out symbols.
EST_WFST_State * state_non_const(int i)
Return internal state information (non-const)
void gc_protect(LISP *location)
void reverse(EST_Wave &sig)
wfst_translist transitions