Edinburgh Speech Tools  2.1-release
apml.cc
Go to the documentation of this file.
1  /************************************************************************/
2  /* */
3  /* Centre for Speech Technology Research */
4  /* University of Edinburgh, UK */
5  /* Copyright (c) 2002 */
6  /* All Rights Reserved. */
7  /* */
8  /* Permission is hereby granted, free of charge, to use and distribute */
9  /* this software and its documentation without restriction, including */
10  /* without limitation the rights to use, copy, modify, merge, publish, */
11  /* distribute, sublicense, and/or sell copies of this work, and to */
12  /* permit persons to whom this work is furnished to do so, subject to */
13  /* the following conditions: */
14  /* 1. The code must retain the above copyright notice, this list of */
15  /* conditions and the following disclaimer. */
16  /* 2. Any modifications must be clearly marked as such. */
17  /* 3. Original authors' names are not deleted. */
18  /* 4. The authors' names are not used to endorse or promote products */
19  /* derived from this software without specific prior written */
20  /* permission. */
21  /* */
22  /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23  /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24  /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25  /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26  /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27  /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28  /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29  /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30  /* THIS SOFTWARE. */
31  /* */
32  /*************************************************************************/
33  /* */
34  /* Author: Rob Clark (robert@cstr.ed.ac.uk) */
35  /* -------------------------------------------------------------------- */
36  /* Code to read APML format XML as utterances. */
37  /* */
38  /*************************************************************************/
39 
40 #include <cstdlib>
41 #include <cstdio>
42 #include "EST_THash.h"
43 #include "EST_error.h"
44 #include "apml.h"
45 #include "rxp/XML_Parser.h"
46 
47 using namespace std;
48 
49 static EST_Regex simpleIDRegex(".*#id(w\\([0-9]+\\))");
50 static EST_Regex rangeIDRegex(".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
51 static EST_Regex RXpunc("[\\.,\\?\\!\"]+");
52 
53 class Parse_State
54  {
55 public:
56  int depth;
57  int maxid;
58  EST_Utterance *utt;
59  EST_Relation *tokens;
60  EST_Relation *perf;
61  EST_Relation *com;
62  EST_Relation *semstruct;
63  EST_Relation *emphasis;
64  EST_Relation *boundary;
65  EST_Relation *pause;
67  EST_Item *pending;
68  EST_Item *last_token;
69  };
70 
71 class Apml_Parser_Class : public XML_Parser_Class
72 {
73 protected:
74  virtual void document_open(XML_Parser_Class &c,
75  XML_Parser &p,
76  void *data);
77  virtual void document_close(XML_Parser_Class &c,
78  XML_Parser &p,
79  void *data);
80 
81  virtual void element_open(XML_Parser_Class &c,
82  XML_Parser &p,
83  void *data,
84  const char *name,
85  XML_Attribute_List &attributes);
86  virtual void element(XML_Parser_Class &c,
87  XML_Parser &p,
88  void *data,
89  const char *name,
90  XML_Attribute_List &attributes);
91  virtual void element_close(XML_Parser_Class &c,
92  XML_Parser &p,
93  void *data,
94  const char *name);
95 
96  virtual void pcdata(XML_Parser_Class &c,
97  XML_Parser &p,
98  void *data,
99  const char *chars);
100  virtual void cdata(XML_Parser_Class &c,
101  XML_Parser &p,
102  void *data,
103  const char *chars);
104 
105  virtual void processing(XML_Parser_Class &c,
106  XML_Parser &p,
107  void *data,
108  const char *instruction);
109  virtual void error(XML_Parser_Class &c,
110  XML_Parser &p,
111  void *data);
112 };
113 
114 static void print_attributes(XML_Attribute_List &attributes)
115 {
117 
118  for(them.begin(attributes); them ; them++)
119  printf(" %s='%s'",
120  (const char *)them->k,
121  (const char *)them->v);
122 }
123 
125  const EST_String &name,
126  EST_Utterance &u,
127  int &max_id)
128 {
129  (void)max_id;
130  (void)print_attributes; // just to shut -Wall up.
131  Apml_Parser_Class pclass;
132  Parse_State state;
133 
134  u.clear();
135 
136  state.utt=&u;
137 
138  XML_Parser *parser = pclass.make_parser(file, name, &state);
139  parser->track_context(TRUE);
140 
141  CATCH_ERRORS()
142  return read_format_error;
143 
144  parser->go();
145 
147 
148  return read_ok;
149 }
150 
151 
152 
153 /** Now we define the callbacks.
154  */
155 
156 void Apml_Parser_Class::document_open(XML_Parser_Class &c,
157  XML_Parser &p,
158  void *data)
159 {
160  (void)c; (void)p;
161  Parse_State *state = (Parse_State *)data;
162 
163  state->maxid=0;
164 
165  state->depth=1;
166  state->parent=NULL;
167  state->pending=NULL;
168  state->last_token=NULL;
169 
170  // create relations:
171  state->perf = state->utt->create_relation("Perfomative");
172  state->com = state->utt->create_relation("Communicative");
173  state->tokens = state->utt->create_relation("Token");
174  state->semstruct = state->utt->create_relation("SemStructure");
175  state->emphasis = state->utt->create_relation("Emphasis");
176  state->boundary = state->utt->create_relation("Boundary");
177  state->pause = state->utt->create_relation("Pause");
178 
179 
180 }
181 
182 void Apml_Parser_Class::document_close(XML_Parser_Class &c,
183  XML_Parser &p,
184  void *data)
185 {
186  (void)c; (void)p; (void)data;
187 }
188 
189 
190 void Apml_Parser_Class::element_open(XML_Parser_Class &c,
191  XML_Parser &p,
192  void *data,
193  const char *name,
194  XML_Attribute_List &attributes)
195 {
196  (void)c; (void)p; (void)attributes;
197  Parse_State *state = (Parse_State *)data;
198 
199  //cout << " In element_open: " << name << "\n";
200 
201  if (strcmp(name, "turnallocation")==0)
202  {
203  // currently ignore
204  return;
205  }
206 
207  if (strcmp(name, "apml")==0)
208  return; // ignore
209 
210  state->depth++;
211 
212  if( strcmp(name, "performative")==0
213  || strcmp(name, "rheme")==0
214  || strcmp(name, "theme")==0
215  || strcmp(name, "emphasis")==0
216  || strcmp(name, "boundary")==0
217  || strcmp(name, "pause")==0)
218  {
219 
220  // create new item content
221  EST_Item_Content *cont = new EST_Item_Content();
222  cont->set_name(name);
223 
225  for(them.begin(attributes); them ; them++)
226  {
227  EST_String k = them->k;
228  EST_String v = them->v;
229  cont->f.set(k,v);
230  }
231 
232  EST_Item *item;
233 
234  if( strcmp(name, "emphasis")==0 )
235  {
236  item = state->emphasis->append();
237  state->pending = item;
238  }
239  else if(strcmp(name, "boundary")==0 )
240  {
241  item = state->boundary->append();
242  if(state->last_token)
243  item->append_daughter(state->last_token);
244  }
245  else if(strcmp(name, "pause")==0 )
246  {
247  item = state->pause->append();
248  if(state->last_token)
249  item->append_daughter(state->last_token);
250  }
251  else
252  {
253  if (state->parent == NULL)
254  item = state->semstruct->append();
255  else
256  item = state->parent->append_daughter();
257  state->parent=item;
258  }
259 
260  item->set_contents(cont);
261 
262 
263  }
264  else
265  EST_warning("APML Parser: unknown element %s", name);
266 }
267 
268 
269 void Apml_Parser_Class::element(XML_Parser_Class &c,
270  XML_Parser &p,
271  void *data,
272  const char *name,
273  XML_Attribute_List &attributes)
274 {
275  (void)c; (void)p; (void)attributes;
276 
277  element_open(c, p, data, name, attributes);
278  element_close(c, p, data, name);
279 }
280 
281 
282 void Apml_Parser_Class::element_close(XML_Parser_Class &c,
283  XML_Parser &p,
284  void *data,
285  const char *name)
286 {
287  (void)c; (void)p; (void)name;
288  Parse_State *state = (Parse_State *)data;
289 
290  if ( strcmp(name, "emphasis")==0
291  || strcmp(name, "boundary")==0
292  || strcmp(name, "pause")==0 )
293  {
294  state->depth--;
295  state->pending=NULL;
296  }
297 
298 
299  if (strcmp(name, "performative")==0
300  || strcmp(name, "theme")==0
301  || strcmp(name, "rheme")==0)
302  {
303  state->depth--;
304  state->pending = NULL;
305  state->parent=state->parent->up();
306  }
307 }
308 
309 
310 void Apml_Parser_Class::pcdata(XML_Parser_Class &c,
311  XML_Parser &p,
312  void *data,
313  const char *chars)
314 {
315  (void)c;
316  (void)p;
317 
318  Parse_State *state = (Parse_State *)data;
319  EST_String strings[255];
320 
321  split(chars,strings,255,RXwhite);
322 
323  // for(int cc=0 ; cc < 20 ; ++cc)
324  // cout << cc << ": \"" << strings[cc] << "\" (" << strings[cc].length() << ")\n";
325 
326  int s=0;
327 
328  while( s < 1 || strings[s].length() > 0 )
329  {
330  if(strings[s].length() > 0 )
331  {
332  // Just Punctuation
333  if(strings[s].matches(RXpunc))
334  {
335  state->last_token->set("punc",strings[s]);
336  }
337  // Text and possibly punc
338  else
339  {
340  EST_Item_Content *cont = new EST_Item_Content();
341  EST_Item *item;
342 
343  if (state->parent == NULL)
344  item = state->semstruct->append();
345  else
346  item = state->parent->append_daughter();
347  item->set_contents(cont);
348 
349  // strip pre-punc here.
350  int i = strings[s].index(RXpunc);
351  EST_String ps = strings[s].at(RXpunc);
352  EST_String intermediate;
353  if( ps.length() > 0 && i == 0)
354  {
355  cout << "Got pre punc: " << ps << endl;
356  intermediate = strings[s].after(RXpunc);
357  // cont->set_name(strings[s].before(RXpunc));
358  item->set("prepunctuation",ps);
359  }
360  else
361  {
362  intermediate = strings[s];
363  item->set("prepunctuation","");
364  }
365  // now strip punc
366  ps = intermediate.at(RXpunc);
367  if( ps.length() > 0 )
368  {
369  cout << "Got punc: " << ps << endl;
370  cont->set_name(intermediate.before(RXpunc));
371  item->set("punc",ps);
372  }
373  else
374  {
375  cont->set_name(intermediate);
376  item->set("punc","");
377  }
378 
379  state->tokens->append(item);
380  state->last_token = item;
381 
382  if(state->pending)
383  {
384  state->pending->append_daughter(item);
385  }
386 
387  // if (state->parent != NULL && p.context(0) == "w")
388  // state->parent->set(EST_String("token"), chars);
389 
390  //cout << " got token: " << item->name() << "\n";
391  }
392  }
393  ++s;
394  }
395 }
396 
397 
398 void Apml_Parser_Class::cdata(XML_Parser_Class &c,
399  XML_Parser &p,
400  void *data,
401  const char *chars)
402 {
403  (void)c; (void)p; (void)data; (void)chars;
404  // Parse_State *state = (Parse_State *)data;
405 
406  // printf("APML XML Parser [cdata[%s]] %d\n", chars, state->depth);
407 }
408 
409 
410 void Apml_Parser_Class::processing(XML_Parser_Class &c,
411  XML_Parser &p,
412  void *data,
413  const char *instruction)
414 {
415  (void)c; (void)p;
416  Parse_State *state = (Parse_State *)data;
417 
418  printf("APML XML Parser [proc[%s]] %d\n", instruction, state->depth);
419 }
420 
421 
423  XML_Parser &p,
424  void *data)
425 {
426  (void)c; (void)p; (void)data;
427  // Parse_State *state = (Parse_State *)data;
428 
429  EST_error("APML Parser %s", get_error(p));
430 
431  est_error_throw();
432 }
433 
434 
435 
436 
437 
438 
439 
EST_read_status apml_read(FILE *file, const EST_String &name, EST_Utterance &u, int &max_id)
Definition: apml.cc:124
void set_contents(EST_Item_Content *li)
Definition: EST_Item.cc:202
#define END_CATCH_ERRORS()
Definition: EST_error.h:144
void clear()
remove everything in utterance
The file was read in successfully.
EST_Item * append_daughter(EST_Item *li=0)
Definition: EST_Item.cc:425
void set(const EST_String &name, ssize_t ival)
Definition: EST_Item.h:185
A Regular expression class to go with the CSTR EST_String class.
Definition: EST_Regex.h:56
void set(const EST_String &name, int ival)
Definition: EST_Features.h:186
A specialised hash table for when the key is an EST_String.
Definition: EST_THash.h:304
size_t index(const char *s, ssize_t pos=0) const
Position of substring (starting at pos)
Definition: EST_String.h:352
EST_Track error(EST_Track &ref, EST_Track &test, int relax=0)
EST_Features f
General features for this item.
void go()
Run the parser.
Definition: XML_Parser.cc:282
#define est_error_throw()
Definition: EST_error.h:112
EST_Regex RXwhite("[ \n\t\r]+")
White space.
void track_context(bool flag)
Definition: XML_Parser.cc:391
#define CATCH_ERRORS()
Definition: EST_error.h:126
The file exists but is not in the format specified.
NULL
Definition: EST_WFST.cc:55
#define EST_error
Definition: EST_error.h:104
void set_name(const EST_String &s)
set name
EST_read_status
size_t length(void) const
Length of string ({not} length of underlying chunk)
Definition: EST_String.h:231
void begin(const Container &over)
Set the iterator ready to run over this container.
#define EST_warning
Definition: EST_error.h:106
EST_String after(int pos, int len=1) const
Part after pos+len.
Definition: EST_String.h:308
EST_String before(int pos, int len=0) const
Part before position.
Definition: EST_String.h:276
EST_String at(int from, int len=0) const
Return part at position.
Definition: EST_String.h:292
#define TRUE
Definition: EST_bool.h:118
EST_Item * parent(const EST_Item *n)
return parent of n