Edinburgh Speech Tools  2.1-release
EST_Regex.cc
Go to the documentation of this file.
1  /************************************************************************/
2  /* */
3  /* Centre for Speech Technology Research */
4  /* University of Edinburgh, UK */
5  /* Copyright (c) 1997 */
6  /* All Rights Reserved. */
7  /* */
8  /* Permission is hereby granted, free of charge, to use and distribute */
9  /* this software and its documentation without restriction, including */
10  /* without limitation the rights to use, copy, modify, merge, publish, */
11  /* distribute, sublicense, and/or sell copies of this work, and to */
12  /* permit persons to whom this work is furnished to do so, subject to */
13  /* the following conditions: */
14  /* 1. The code must retain the above copyright notice, this list of */
15  /* conditions and the following disclaimer. */
16  /* 2. Any modifications must be clearly marked as such. */
17  /* 3. Original authors' names are not deleted. */
18  /* 4. The authors' names are not used to endorse or promote products */
19  /* derived from this software without specific prior written */
20  /* permission. */
21  /* */
22  /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23  /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24  /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25  /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26  /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27  /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28  /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29  /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30  /* THIS SOFTWARE. */
31  /* */
32  /************************************************************************/
33  /* Author : Richard Caley (rjc@cstr.ed.ac.uk) */
34  /* Date : February 1997 */
35  /* -------------------------------------------------------------------- */
36  /* */
37  /* A Regular expression class to go with the CSTR EST_String class. Uses*/
38  /* Henry Spencer's regexp routines which allocate space dynamically */
39  /* using malloc, so we use free in here rather than wfree because */
40  /* wfree might at some time start doing something more than just be a */
41  /* safe wrapper around free. If you try and use another regexp */
42  /* package, beware of changes to how memory is allocated. */
43  /* */
44  /* We maintain two compiled versions, one for substring matches and */
45  /* one for whole string matches (because sometimes the regexp */
46  /* compiler can special case the latter). These are compiled when */
47  /* first used. */
48  /* */
49  /************************************************************************/
50 
51 #ifdef NO_EST
52 # include <unistd.h>
53 #else
54 # include "EST_unix.h"
55 #endif
56 #include <cstdlib>
57 #include <cstdio>
58 #include <cstring>
59 #include "EST_String.h"
60 #include "EST_Regex.h"
61 
62 using namespace std;
63 
64 #ifdef sun
65 #ifndef __svr4__
66 /* SunOS */
67 #include <cstring>
68 #endif
69 #endif
70 
71 // extern "C" {
72 #include "regexp.h"
73 
74 /*
75 void *t_regcomp(void *v)
76 {
77  return v;
78 }
79 
80 void *cpp_regcomp(void *v)
81 {
82  return v;
83 }
84 */
85 // #define wfree(P) (1==1)
86 
87 // These define the different escape conventions for the FSF's
88 // regexp code and Henry Spencer's
89 
90 static const char *fsf_magic="^$*+?[].\\";
91 static const char *fsf_magic_backslashed="()|<>";
92 static const char *spencer_magic="^$*+?[].()|\\\n";
93 static const char *spencer_magic_backslashed="<>";
94 
// Predefined regular expressions, written in the external (FSF-style)
// syntax in which grouping and alternation are spelt \( \) and \|.

// White space: one or more space, newline, tab or carriage return characters.
95 EST_Regex RXwhite("[ \n\t\r]+");
// Sequence of alphabetic characters.
96 EST_Regex RXalpha("[A-Za-z]+");
// Sequence of lower case alphabetic characters.
97 EST_Regex RXlowercase("[a-z]+");
// Sequence of upper case alphabetic characters.
98 EST_Regex RXuppercase("[A-Z]+");
// Sequence of letters and/or digits.
99 EST_Regex RXalphanum("[0-9A-Za-z]+");
// Initial letter or underscore followed by letters, underscores or digits.
// The trailing `+` demands at least one following character, so the
// pattern as written needs two characters or more.
100 EST_Regex RXidentifier("[A-Za-z_][0-9A-Za-z_]+");
// Integer: an optional leading minus sign followed by digits.
101 EST_Regex RXint("-?[0-9]+");
// Floating point number: optional minus sign, then digits with a decimal
// point, plain digits, or a leading decimal point followed by digits,
// and finally an optional exponent introduced by `e` or `E`.
102 EST_Regex RXdouble("-?\\(\\([0-9]+\\.[0-9]*\\)\\|\\([0-9]+\\)\\|\\(\\.[0-9]+\\)\\)\\([eE][---+]?[0-9]+\\)?");
103 
104 // use this to free compiled regex since the regexp package uses malloc
105 // and walloc might end up doing something clever.
106 
107 /* extern "C" void free(void *p); */
108 
109 #if NSUBEXP != EST_Regex_max_subexpressions
110 # error "EST_Regex_max_subexpressions must be equal to NSUBEXP"
111 #endif
112 
114 {
115  compiled = NULL;
116  compiled_match = NULL;
117 }
118 
119 EST_Regex::EST_Regex(const char *s) : EST_String(s)
120 
121 {
122 
123  compiled = NULL;
124  compiled_match = NULL;
125 
126 }
127 
129 
130 {
131  compiled = NULL;
132  compiled_match = NULL;
133 }
134 
136 {
137  compiled = NULL;
138  compiled_match = NULL;
139 }
140 
141 
143 {
144  if (compiled_match)
145  free(compiled_match);
146  if (compiled)
147  free(compiled);
148 }
149 
150 // Convert a regular expression from the external syntax (defined by
151 // the FSF library) to the one expected by the regexp routines (which
152 // say it's V8 syntax).
153 
154 char *EST_Regex::regularize(int match) const
155 {
156  char *reg = walloc(char, size()*2+3);
157  char *r=reg;
158  const char *e;
159  int magic=0,last_was_bs=0;
160  const char * in_brackets=NULL;
161  const char *ex = (size()==0)?"":str();
162 
163  if (match && *ex != '^')
164  *(r++) = '^';
165 
166  for(e=ex; *e ; e++)
167  {
168  if (*e == '\\' && !last_was_bs)
169  {
170  last_was_bs=1;
171  continue;
172  }
173 
174  magic=strchr((last_was_bs?fsf_magic_backslashed:fsf_magic), *e)!=NULL;
175 
176  if (in_brackets)
177  {
178  *(r++) = *e;
179  if (*e == ']' && (e-in_brackets)>1)
180  in_brackets=0;
181  }
182  else if (magic)
183  {
184  if (strchr(spencer_magic_backslashed, *e))
185  *(r++) = '\\';
186 
187  *(r++) = *e;
188  if (*e == '[')
189  in_brackets=e;
190  }
191  else
192  {
193  if (strchr(spencer_magic, *e))
194  *(r++) = '\\';
195 
196  *(r++) = *e;
197  }
198  last_was_bs=0;
199  }
200 
201  if (match && (e==ex || *(e-1) != '$'))
202  {
203  if (last_was_bs)
204  *(r++) = '\\';
205  *(r++) = '$';
206  }
207 
208  *r='\0';
209 
210  // cerr<<"reg||"<<ex<<"||"<<reg<<"\n";
211 
212  return reg;
213 }
214 
216 {
217  if (!compiled)
218  {
219  char *reg=regularize(0);
220  void * t =(void *)hs_regcomp(reg);
221  compiled=t;
222  wfree(reg);
223  }
224 
225  if (!compiled)
226  cerr << "EST_Regex: can't compile '" << str() << "'\n";
227 }
228 
230 {
231  if (!compiled_match)
232  {
233  char *reg=regularize(1);
234 
235  void * t =(void *)hs_regcomp(reg);
236  compiled_match=t;
237  wfree(reg);
238  }
239 
240  if (!compiled_match)
241  cerr << "EST_Regex: can't compile '" << str() << "'\n";
242 }
243 
244 int EST_Regex::run(const char *on, size_t from, size_t &start, size_t &end, size_t *starts, size_t *ends)
245 {
246 
247  compile();
248 
249  if (compiled && from <= strlen(on))
250  {
251  if (hs_regexec((hs_regexp *)compiled, on+from))
252  {
253  hs_regexp *re = (hs_regexp *)compiled;
254 
255  start = re->startp[0] - on;
256  end = re->endp[0]- on;
257 
258  if (starts)
259  {
260  int i;
261  for (i=0; i<EST_Regex_max_subexpressions; i++)
262  starts[i] = re->startp[i]?(re->startp[i] - on):EST_STRING_ERR_IDX;
263  }
264  if (ends)
265  {
266  int i;
267  for (i=0; i<EST_Regex_max_subexpressions; i++)
268  ends[i] = re->endp[i]?(re->endp[i] - on):EST_STRING_ERR_IDX;
269  }
270 
271  return 1;
272  }
273  }
274  return 0;
275 }
276 
277 int EST_Regex::run_match(const char *on, size_t from, size_t *starts, size_t *ends)
278 {
279 
280  compile_match();
281 
282  hs_regexp *re = (hs_regexp *)compiled_match;
283 
284  if (compiled_match && from <= strlen(on))
285  if (hs_regexec(re, on+from))
286  {
287  if (starts)
288  {
289  size_t i;
290  for (i=0; i<EST_Regex_max_subexpressions; i++)
291  starts[i] = re->startp[i]?(re->startp[i] - on):EST_STRING_ERR_IDX;
292  }
293  if (ends)
294  {
295  size_t i;
296  for (i=0; i<EST_Regex_max_subexpressions; i++)
297  ends[i] = re->endp[i]?(re->endp[i] - on):EST_STRING_ERR_IDX;
298  }
299  return 1;
300  }
301 
302  return 0;
303 }
304 
306 {
307  ((EST_String &)(*this)) = (EST_String)ex;
308  compiled = NULL;
309  compiled_match = NULL;
310 
311  return *this;
312 }
313 
315 {
316  ((EST_String &)(*this)) = s;
317  compiled = NULL;
318  compiled_match = NULL;
319 
320  return *this;
321 }
322 
324 {
325  ((EST_String &)(*this)) = s;
326  compiled = NULL;
327  compiled_match = NULL;
328 
329  return *this;
330 }
331 
332 ostream &operator << (ostream &s, const EST_Regex &str)
333 {
334  return s << (EST_String)str;
335 }
336 
float end(const EST_Item &item)
Definition: EST_item_aux.cc:96
#define walloc(TYPE, SIZE)
Definition: EST_walloc.h:52
EST_String(void)
Construct an empty string.
Definition: EST_String.h:201
char * endp[NSUBEXP]
Definition: regexp.h:52
A Regular expression class to go with the CSTR EST_String class.
Definition: EST_Regex.h:56
friend ostream & operator<<(ostream &s, const EST_Regex &str)
Stream output of regular expression.
Definition: EST_Regex.cc:332
#define EST_Regex_max_subexpressions
Definition: EST_Regex.h:150
EST_Regex RXdouble("-?\\(\\([0-9]+\\.[0-9]*\\)\\|\\([0-9]+\\)\\|\\(\\.[0-9]+\\)\\)\\([eE][---+]?[0-9]+\\)?")
Floating point number.
EST_Regex(void)
Empty constructor, just for form.
Definition: EST_Regex.cc:113
STATIC char * reg(int paren, int *flagp)
Definition: regexp.cc:293
EST_Regex RXint("-?[0-9]+")
Integer.
EST_Regex RXuppercase("[A-Z]+")
Sequence of upper case alphabetic characters.
void compile_match()
Compile expression in a form which only matches whole string.
Definition: EST_Regex.cc:229
EST_Regex & operator=(const EST_Regex ex)
Definition: EST_Regex.cc:305
EST_Regex RXwhite("[ \n\t\r]+")
White space.
~EST_Regex()
Destructor.
Definition: EST_Regex.cc:142
int run_match(const char *on, size_t from=0, size_t *starts=NULL, size_t *ends=NULL)
Run to see if it matches the entire string.
Definition: EST_Regex.cc:277
#define EST_STRING_ERR_IDX
Definition: EST_String.h:116
hs_regexp * hs_regcomp(const char *exp)
Definition: regexp.cc:203
EST_Regex RXidentifier("[A-Za-z_][0-9A-Za-z_]+")
Initial letter or underscore followed by letters underscores or digits.
EST_Regex RXlowercase("[a-z]+")
Sequence of lower case alphabetic characters.
EST_Regex RXalpha("[A-Za-z]+")
Sequence of alphabetic characters.
NULL
Definition: EST_WFST.cc:55
char * regularize(int match) const
Translate the expression into the internally used syntax.
Definition: EST_Regex.cc:154
int hs_regexec(const hs_regexp *prog, const char *string)
Definition: regexp.cc:776
void compile()
Compile expression.
Definition: EST_Regex.cc:215
float start(const EST_Item &item)
Definition: EST_item_aux.cc:52
const char * str(void) const
Get a const-pointer to the actual memory.
Definition: EST_String.h:235
EST_Regex RXalphanum("[0-9A-Za-z]+")
Sequence of letters and/or digits.
void wfree(void *p)
Definition: walloc.c:131
int run(const char *on, size_t from, size_t &start, size_t &end, size_t *starts=NULL, size_t *ends=NULL)
Run to find a matching substring.
Definition: EST_Regex.cc:244
int size() const
Size of the expression.
Definition: EST_Regex.h:89
char * startp[NSUBEXP]
Definition: regexp.h:51