Edinburgh Speech Tools  2.1-release
kkcompile.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-1998 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : December 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A Koskenniemi/Kay/Kaplan rule compiler to WFST using the techniques */
38 /* Ritchie et al.'s "Computational Morphology" (but followed through to */
39 /* make real WFSTs). */
40 /* */
41 /*=======================================================================*/
42 #include <iostream>
43 #include "EST_WFST.h"
44 #include "EST_cutils.h"
45 
46 using namespace std;
47 
48 ostream &operator << (ostream &s, const EST_WFST &w)
49 {
50  (void)w;
51  return s << "<<EST_WFST>>";
52 }
53 
55 
56 #if defined(INSTANTIATE_TEMPLATES)
57 #include "../base_class/EST_TList.cc"
58 
60 
61 #endif
62 
63 static LISP expand_fp(const EST_String p,LISP fp);
64 static LISP find_feasible_pairs(LISP rules);
65 static LISP all_but(LISP rulepair,LISP fp);
66 static LISP expand_sets(LISP sets,LISP fp);
67 static LISP inline_sets(LISP l, LISP sets);
68 static void full_kkcompile(LISP inalpha,LISP outalpha,
69  LISP fp, LISP rules, LISP sets,
70  EST_WFST &all_wfst);
71 
72 void kkcompile(LISP ruleset, EST_WFST &all_wfst)
73 {
74  // Build a transducer from given kkrule (Kay/Kaplan/Koskenniemi)
75  // Rules are of the form LeftContext Map RightContext
76 
77  // The WFST is recognizing all string except the rulepair unless
78  // its in the proper context.
79  LISP fp; // feasible pairs, those pairs with rules (rather than IxO)
80  LISP inalpha = siod_nth(1,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
81  LISP outalpha = siod_nth(2,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
82  LISP sets = cdr(siod_assoc_str("Sets",ruleset));
83  LISP rules = cdr(siod_assoc_str("Rules",ruleset));
84 
85  fp = find_feasible_pairs(rules);
86  sets = expand_sets(sets,fp);
87 
88  full_kkcompile(inalpha,outalpha,fp,rules,sets,all_wfst);
89 }
90 
91 static void full_kkcompile(LISP inalpha,LISP outalpha,
92  LISP fp, LISP rules, LISP sets,
93  EST_WFST &all_wfst)
94 {
95  wfst_list rulelist;
96  LISP r;
97 
98  for (r=rules; r != NIL; r=cdr(r))
99  {
100  EST_WFST r_wfst,base_wfst,det_wfst;
101  rulelist.append(r_wfst);
102  EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
103  cout << "Rule: " << siod_llength(rules)-siod_llength(r) << endl;
104  pprint(car(r));
105  base_wfst.kkrule_compile(inalpha,outalpha,fp,car(r),sets);
106  cout << " base " << base_wfst.summary() << endl;
107  det_wfst.determinize(base_wfst);
108  cout << " determinized " << det_wfst.summary() << endl;
109  rr_wfst.minimize(det_wfst);
110  cout << " minimized " << rr_wfst.summary() << endl;
111  }
112 
113  cout << "WFST: intersecting " << rulelist.length() << " rules" << endl;
114  EST_Litem *p,*nnp;
115  int i;
116  for (i=0,p=rulelist.head(); p->next() != 0; p=nnp)
117  {
118  EST_WFST r_wfst,base_wfst,det_wfst;
119  EST_WFST mmm;
120  rulelist.append(r_wfst);
121  EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
122  cout << "intersecting " << i << " and " << i+1 << " " <<
123  rulelist.length()-2 << " left" << endl;
124  cout << " " << rulelist(p).summary() << " and " << endl;
125  cout << " " << rulelist(p->next()).summary() << " becomes " << endl;
126  mmm.intersection(rulelist(p),rulelist(p->next()));
127  cout << " " << mmm.summary() << " minimizes to " << endl;
128  rr_wfst.minimize(mmm);
129  cout << " " << rr_wfst.summary() << endl;
130  nnp=p->next()->next();
131  i+=2;
132  rulelist.remove(p->next());
133  rulelist.remove(p);
134  }
135 
136  all_wfst = rulelist.first();
137 
138 }
139 
140 static LISP expand_sets(LISP sets,LISP fp)
141 {
142  // Expand sets into regexes that represent them. Single
143  // char values are converted to disjunctions of feasible pairs
144  // that have the same surface character
145  LISP s,es=NIL,e,ne;
146 
147  for (s=sets; s != NIL; s=cdr(s))
148  {
149  for (ne=NIL,e=cdr(car(s)); e != NIL; e=cdr(e))
150  {
151  EST_String ss = get_c_string(car(e));
152  if (ss.contains("/"))
153  ne = cons(car(e),ne);
154  else
155  ne = append(expand_fp(ss,fp),ne);
156  }
157  if (ne == NIL)
158  {
159  cerr << "WFST: kkcompile: set " << get_c_string(car(car(s))) <<
160  " has no feasible pairs" << endl;
161  }
162 
163  else if (siod_llength(ne) == 1)
164  es = cons(cons(car(car(s)),ne),es);
165  else
166  es = cons(cons(car(car(s)),
167  cons(cons(rintern("or"),reverse(ne)),
168  NIL)),es);
169  }
170 
171  return reverse(es);
172 }
173 
174 static LISP expand_fp(const EST_String p,LISP fp)
175 {
176  // Find all fp's that have this p as their surface char
177  LISP m=NIL,f;
178  EST_Regex rg(EST_String("^")+p+"/.*");
179 
180  for (f=fp; f != NIL; f=cdr(f))
181  {
182  EST_String ss = get_c_string(car(f));
183  if ((p == ss) || (ss.matches(rg)))
184  m = cons(car(f),m);
185  }
186  return m;
187 }
188 
189 static LISP find_feasible_pairs(LISP rules)
190 {
191  // Find the set of pairs that have rules associated with them
192  // This effectively defines the transducer alphabet.
193  LISP fp = NIL;
194  LISP r;
195 
196  for (r=rules; r != NIL; r=cdr(r))
197  {
198  if (siod_member_str(get_c_string(siod_nth(0,car(r))),fp) == NIL)
199  fp = cons(siod_nth(0,car(r)),fp);
200  }
201  return fp;
202 }
203 
204 static int surface_coercion(LISP rt)
205 {
206  return (streq("<=",get_c_string(rt)));
207 }
208 
209 static int context_restriction(LISP rt)
210 {
211  return (streq("=>",get_c_string(rt)));
212 }
213 
214 static int composite(LISP rt)
215 {
216  return (streq("<=>",get_c_string(rt)));
217 }
218 
219 static LISP inline_sets(LISP l, LISP sets)
220 {
221  // Replace any set name with the regex equivalent
222  LISP s;
223  if (l == NIL)
224  return NIL;
225  else if (consp(l))
226  return cons(inline_sets(car(l),sets),inline_sets(cdr(l),sets));
227  else if ((s=siod_assoc_str(get_c_string(l),sets)) != NIL)
228  return car(cdr(s));
229  else
230  return l;
231 }
232 
233 void EST_WFST::kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
234  LISP rule,LISP sets)
235 {
236  // Build a WFST to transduce this particular rule
237  // Accepts any other combination of feasible pairs too
238  LISP leftcontext = inline_sets(siod_nth(2,rule),sets);
239  LISP rulepair = siod_nth(0,rule);
240  LISP ruletype = siod_nth(1,rule);
241  LISP rightcontext = inline_sets(siod_nth(4,rule),sets);
242  LISP p;
243  int i;
244  int end_LC,end_RP,end_NOTRP,end_RC,err_state;
245 
246  // Initialize alphabets
247  init(inalpha,outalpha); // should be passed as discretes
248 
249  p_start_state = add_state(wfst_final); // empty WFST
250  // Add transitions for all pairs except rulepair
251  for (p=fp; p != NIL; p=cdr(p))
252  if ((!equal(rulepair,car(p))) ||
253  (surface_coercion(ruletype)))
254  build_wfst(p_start_state,p_start_state,car(p));
255 
256  // build for LC
257  if (leftcontext)
258  {
259  end_LC = add_state(wfst_final);
260  build_wfst(p_start_state,end_LC,leftcontext);
261  // for all states in LC mark final & add epsilon to p_start_state
262  for (i=end_LC; i < p_num_states; i++)
263  {
264  build_wfst(i,p_start_state,epsilon_label());
265  p_states[i]->set_type(wfst_final);
266  }
267  }
268  else // no LC
269  end_LC = p_start_state;
270 
271  // build for RP and RC from end_LC
272  if (composite(ruletype) || context_restriction(ruletype))
273  {
274  if (rightcontext)
275  {
276  end_RP = add_state(wfst_nonfinal);
277  build_wfst(end_LC,end_RP,rulepair);
278  // build for RC from end map to p_start_state
279  build_wfst(end_RP,p_start_state,rightcontext);
280  err_state = add_state(wfst_error);
281  for (i=end_RP; i < err_state; i++)
282  { // for everything other that the correct path go to err_state
283  // without this explicit error state the epsilon to start
284  // allows almost everything
285  if (transition(i,get_c_string(epsilon_label()))
286  != WFST_ERROR_STATE)
287  break; // not a state require extra transitions
288  for (p=fp; p != NIL; p=cdr(p))
289  if (transition(i,get_c_string(car(p))) == WFST_ERROR_STATE)
290  build_wfst(i,err_state,car(p));
291  build_wfst(i,p_start_state,epsilon_label());
292  p_states[i]->set_type(wfst_licence);
293  }
294  }
295  else // no RC, so end back at start
296  build_wfst(end_LC,p_start_state,rulepair);
297  }
298 
299  // Build for notRP and RC from end_LC
300  if (composite(ruletype) || surface_coercion(ruletype))
301  {
302  LISP abrp = all_but(rulepair,fp);
303  if (abrp)
304  {
305  if (rightcontext)
306  {
307  end_RC = add_state(wfst_error);
308  end_NOTRP = add_state(wfst_nonfinal);
309  build_wfst(end_LC,end_NOTRP,abrp);
310  // build for RC from end RP to error state
311  build_wfst(end_NOTRP,end_RC,rightcontext);
312  // for all states in RC except final one mark final & add
313  // epsilon to p_start_state
314  for (i=end_NOTRP; i < p_num_states; i++)
315  {
316  build_wfst(i,p_start_state,epsilon_label());
317  p_states[i]->set_type(wfst_final);
318  }
319  }
320  else // no RC,
321  {
322  end_RC = add_state(wfst_error);
323  build_wfst(end_LC,end_RC,abrp);
324  }
325  }
326  }
327 }
328 
329 static LISP all_but(LISP rulepair,LISP fp)
330 {
331  // Returns pairs that have the same surface symbol as rulepair
332  // but different lexical symbol
333  LISP r,notrp=NIL;
334  EST_String s,l,p,sr,lr,rr;
335 
336  p = get_c_string(rulepair);
337  if (p.contains("/"))
338  {
339  s = p.before("/");
340  l = p.after("/");
341  }
342  else
343  {
344  s = p;
345  l = p;
346  }
347 
348  for (r=fp; r != NIL; r = cdr(r))
349  {
350  rr = get_c_string(car(r));
351  if (rr.contains("/"))
352  {
353  sr = rr.before("/");
354  lr = rr.after("/");
355  }
356  else
357  {
358  sr = rr;
359  lr = rr;
360  }
361  if ((l != lr) && (s == sr))
362  notrp = cons(car(r),notrp);
363  }
364 
365  if (siod_llength(notrp) > 1)
366  notrp = cons(strintern("or"),notrp);
367  return notrp;
368 }
369 
370 void intersect(wfst_list &wl, EST_WFST &all)
371 {
372  // Intersect the wfst's in wl into all
373 
374  all.intersection(wl);
375 
376 }
377 
void minimize(const EST_WFST &a)
Build minimized form of a.
Definition: wfst_ops.cc:485
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
Definition: EST_String.h:365
a call representing a weighted finite-state transducer
Definition: EST_WFST.h:154
A Regular expression class to go with the CSTR EST_String class.
Definition: EST_Regex.h:56
#define NIL
Definition: siod_defs.h:92
#define Instantiate_TList(TYPE)
Definition: EST_TListI.h:61
int siod_llength(LISP list)
Definition: siod.cc:202
const T & last() const
return const reference to last item in list
Definition: EST_TList.h:155
LISP strintern(const char *data)
Definition: slib_str.cc:22
LISP append(LISP l1, LISP l2)
Definition: slib_list.cc:177
void intersect(wfst_list &wl, EST_WFST &all)
Definition: kkcompile.cc:370
void intersection(EST_TList< EST_WFST > &wl)
Definition: wfst_ops.cc:357
#define streq(X, Y)
Definition: EST_cutils.h:57
#define WFST_ERROR_STATE
Definition: EST_WFST.h:52
LISP siod_assoc_str(const char *key, LISP alist)
Definition: siod.cc:125
void determinize(const EST_WFST &a)
Build determinized form of a.
Definition: wfst_ops.cc:166
Declare_TList(EST_WFST) static LISP expand_fp(const EST_String p
LISP siod_nth(int nth, LISP list)
Definition: siod.cc:214
LISP equal(LISP, LISP)
Definition: slib_list.cc:133
EST_UItem * next()
Definition: EST_UList.h:55
void pprint(LISP exp)
Definition: slib_file.cc:95
const char * get_c_string(LISP x)
Definition: slib.cc:638
LISP cons(LISP x, LISP y)
Definition: slib_list.cc:97
void kkcompile(LISP ruleset, EST_WFST &all_wfst)
Definition: kkcompile.cc:72
void kkrule_compile(LISP inalpha, LISP outalpha, LISP fp, LISP rule, LISP sets)
Definition: kkcompile.cc:233
LISP consp(LISP x)
Definition: slib_list.cc:112
const T & first() const
return const reference to first item in list
Definition: EST_TList.h:152
f
Definition: EST_item_aux.cc:48
int matches(const char *e, ssize_t pos=0) const
Exactly match this string?
Definition: EST_String.cc:651
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
int length() const
Definition: EST_UList.cc:57
LISP rintern(const char *name)
Definition: slib.cc:734
ostream & operator<<(ostream &s, const EST_WFST &w)
Definition: kkcompile.cc:48
EST_String summary() const
Definition: EST_WFST.cc:647
EST_UItem * head() const
Definition: EST_UList.h:97
LISP car(LISP x)
Definition: slib_list.cc:115
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:308
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:276
LISP fp
Definition: kkcompile.cc:63
EST_String
void reverse(EST_Wave &sig)
LISP siod_member_str(const char *key, LISP list)
Definition: siod.cc:167
EST_Litem * remove(EST_Litem *ptr)
Definition: EST_TList.h:180
LISP cdr(LISP x)
Definition: slib_list.cc:124