Edinburgh Speech Tools  2.1-release
regexp.cc
Go to the documentation of this file.
1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  * Copyright (c) 1986 by University of Toronto.
5  * Written by Henry Spencer. Not derived from licensed software.
6  *
7  * Permission is granted to anyone to use this software for any
8  * purpose on any computer system, and to redistribute it freely,
9  * subject to the following restrictions:
10  *
11  * 1. The author is not responsible for the consequences of use of
12  * this software, no matter how awful, even if they arise
13  * from defects in it.
14  *
15  * 2. The origin of this software must not be misrepresented, either
16  * by explicit claim or by omission.
17  *
18  * 3. Altered versions must be plainly marked as such, and must not
19  * be misrepresented as being the original software.
20  *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22  *** to assist in implementing egrep.
23  *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24  *** hoptoad!gnu, on 27 Dec 1986, to add < and > for word-matching
25  *** as in BSD grep and ex.
26  *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28  *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30  *
31  * Beware that some of this code is subtly aware of the way operator
32  * precedence is structured in regular expressions. Serious changes in
33  * regular-expression syntax might require a total rethink.
34  */
35 #include <cstdio>
36 #include <cctype>
37 #include <cstdlib>
38 #include <cstring>
39 #include "regexp.h"
40 #include "regmagic.h"
41 
42 /*
43  * The "internal use only" fields in regexp.h are present to pass info from
44  * compile to execute that permits the execute phase to run lots faster on
45  * simple cases. They are:
46  *
47  * regstart char that must begin a match; '\0' if none obvious
48  * reganch is the match anchored (at beginning-of-line only)?
49  * regmust string (pointer into program) that match must include, or NULL
50  * regmlen length of regmust string
51  *
52  * Regstart and reganch permit very fast decisions on suitable starting points
53  * for a match, cutting down the work a lot. Regmust permits fast rejection
54  * of lines that cannot possibly match. The regmust tests are costly enough
55  * that regcomp() supplies a regmust only if the r.e. contains something
56  * potentially expensive (at present, the only such thing detected is * or +
57  * at the start of the r.e., which can involve a lot of backup). Regmlen is
58  * supplied because the test in regexec() needs it and regcomp() is computing
59  * it anyway.
60  */
61 
62 /*
63  * Structure for regexp "program". This is essentially a linear encoding
64  * of a nondeterministic finite-state machine (aka syntax charts or
65  * "railroad normal form" in parsing technology). Each node is an opcode
66  * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67  * all nodes except BRANCH implement concatenation; a "next" pointer with
68  * a BRANCH on both ends of it is connecting two alternatives. (Here we
69  * have one of the subtle syntax dependencies: an individual BRANCH (as
70  * opposed to a collection of them) is never concatenated with anything
71  * because of operator precedence.) The operand of some types of node is
72  * a literal string; for others, it is a node leading into a sub-FSM. In
73  * particular, the operand of a BRANCH node is the first node of the branch.
74  * (NB this is *not* a tree structure: the tail of the branch connects
75  * to the thing following the set of BRANCHes.) The opcodes are:
76  */
77 
78 /* definition number opnd? meaning */
79 #define END 0 /* no End of program. */
80 #define BOL 1 /* no Match "" at beginning of line. */
81 #define EOL 2 /* no Match "" at end of line. */
82 #define ANY 3 /* no Match any one character. */
83 #define ANYOF 4 /* str Match any character in this string. */
84 #define ANYBUT 5 /* str Match any character not in this string. */
85 #define BRANCH 6 /* node Match this alternative, or the next... */
86 #define BACK 7 /* no Match "", "next" ptr points backward. */
87 #define EXACTLY 8 /* str Match this string. */
88 #define NOTHING 9 /* no Match empty string. */
89 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
90 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
91 #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
92 #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
93 #define OPEN 20 /* no Mark this point in input as start of #n. */
94  /* OPEN+1 is number 1, etc. */
95 #define CLOSE 30 /* no Analogous to OPEN. */
96 
97 /*
98  * Opcode notes:
99  *
100  * BRANCH The set of branches constituting a single choice are hooked
101  * together with their "next" pointers, since precedence prevents
102  * anything being concatenated to any individual branch. The
103  * "next" pointer of the last BRANCH in a choice points to the
104  * thing following the whole choice. This is also where the
105  * final "next" pointer of each individual branch points; each
106  * branch starts with the operand node of a BRANCH node.
107  *
108  * BACK Normal "next" pointers all implicitly point forward; BACK
109  * exists to make loop structures possible.
110  *
111  * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
112  * BRANCH structures using BACK. Simple cases (one character
113  * per match) are implemented with STAR and PLUS for speed
114  * and to minimize recursive plunges.
115  *
116  * OPEN,CLOSE ...are numbered at compile time.
117  */
118 
119 /*
120  * A node is one char of opcode followed by two chars of "next" pointer.
121  * "Next" pointers are stored as two 8-bit pieces, high order first. The
122  * value is a positive offset from the opcode of the node containing it.
123  * An operand, if any, simply follows the node. (Note that much of the
124  * code generation knows about this implicit relationship.)
125  *
126  * Using two bytes for the "next" pointer is vast overkill for most things,
127  * but allows patterns to get big without disasters.
128  */
129 #define OP(p) (*(p))
130 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
131 #define OPERAND(p) ((p) + 3)
132 
133 /*
134  * See regmagic.h for one further detail of program structure.
135  */
136 
137 
138 /*
139  * Utility definitions.
140  */
141 #ifndef CHARBITS
142 #define UCHARAT(p) ((int)*(unsigned char *)(p))
143 #else
144 #define UCHARAT(p) ((int)*(p)&CHARBITS)
145 #endif
146 
147 #define FAIL(m) { hs_regerror(m); return(NULL); }
148 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
149 
150 /*
151  * Flags to be passed up and down.
152  */
153 #define HASWIDTH 01 /* Known never to match null string. */
154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155 #define SPSTART 04 /* Starts with * or +. */
156 #define WORST 0 /* Worst case. */
157 
158 /*
159  * Global work variables for regcomp().
160  */
161 static const char *regparse; /* Input-scan pointer. */
162 static int regnpar; /* () count. */
163 static char regdummy;
164 static char *regcode; /* Code-emit pointer; &regdummy = don't. */
165 static long regsize; /* Code size. */
166 
167 /*
168  * Forward declarations for regcomp()'s friends.
169  */
170 #ifndef STATIC
171 #define STATIC static
172 #endif
173 STATIC char *reg(int paren, int *flagp);
174 STATIC char *regbranch(int *flagp);
175 STATIC char *regpiece(int *flagp);
176 STATIC char *regatom(int *flagp);
177 STATIC char *regnode(char op);
178 STATIC char *regnext(register char *p);
179 STATIC void regc(unsigned char b);
180 STATIC void reginsert(char op, char *opnd);
181 STATIC void regtail(char *p, char *val);
182 STATIC void regoptail(char *p, char *val);
183 #ifdef STRCSPN
184 STATIC int strcspn();
185 #endif
186 
187 /*
188  - regcomp - compile a regular expression into internal code
189  *
190  * We can't allocate space until we know how big the compiled form will be,
191  * but we can't compile it (and thus know how big it is) until we've got a
192  * place to put the code. So we cheat: we compile it twice, once with code
193  * generation turned off and size counting turned on, and once "for real".
194  * This also means that we don't allocate space until we are sure that the
195  * thing really will compile successfully, and we never have to move the
196  * code and thus invalidate pointers into it. (Note that it has to be in
197  * one piece because free() must be able to free it all.)
198  *
199  * Beware that the optimization-preparation code in here knows about some
200  * of the structure of the compiled regexp.
201  */
202 hs_regexp *
203 hs_regcomp(const char *exp)
204 {
205  hs_regexp *r;
206  char *scan;
207  char *longest;
208  unsigned int len;
209  int flags;
210  if (exp == NULL)
211  FAIL("NULL argument");
212 
213  /* First pass: determine size, legality. */
214 #ifdef notdef
215  if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
216 #endif
217  regparse = exp;
218  regnpar = 1;
219  regsize = 0L;
220  regcode = &regdummy;
221  regc(MAGIC);
222  if (reg(0, &flags) == NULL)
223  return(NULL);
224 
225  /* Small enough for pointer-storage convention? */
226  if (regsize >= 32767L) /* Probably could be 65535L. */
227  FAIL("regexp too big");
228 
229  /* Allocate space. */
230  r = (hs_regexp *)malloc(sizeof(hs_regexp) + (unsigned)regsize);
231  if (r == NULL)
232  FAIL("out of space");
233 
234  /* Second pass: emit code. */
235  regparse = exp;
236  regnpar = 1;
237  regcode = r->program;
238  regc(MAGIC);
239  if (reg(0, &flags) == NULL)
240  return(NULL);
241 
242  /* Dig out information for optimizations. */
243  r->regstart = '\0'; /* Worst-case defaults. */
244  r->reganch = 0;
245  r->regmust = NULL;
246  r->regmlen = 0;
247  scan = r->program+1; /* First BRANCH. */
248 
249  if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
250  scan = OPERAND(scan);
251 
252  /* Starting-point info. */
253  if (OP(scan) == EXACTLY)
254  r->regstart = *OPERAND(scan);
255  else if (OP(scan) == BOL)
256  r->reganch++;
257 
258  /*
259  * If there's something expensive in the r.e., find the
260  * longest literal string that must appear and make it the
261  * regmust. Resolve ties in favor of later strings, since
262  * the regstart check works with the beginning of the r.e.
263  * and avoiding duplication strengthens checking. Not a
264  * strong reason, but sufficient in the absence of others.
265  */
266  if (flags&SPSTART) {
267  longest = NULL;
268  len = 0;
269  for (; scan != NULL; scan = regnext(scan))
270  if ((OP(scan) == EXACTLY) &&
271  (strlen(OPERAND(scan)) >= len)) {
272  longest = OPERAND(scan);
273  len = strlen(OPERAND(scan));
274  }
275  r->regmust = longest;
276  r->regmlen = len;
277  }
278  }
279  return(r);
280 
281 }
282 
283 /*
284  - reg - regular expression, i.e. main body or parenthesized thing
285  *
286  * Caller must absorb opening parenthesis.
287  *
288  * Combining parenthesis handling with the base level of regular expression
289  * is a trifle forced, but the need to tie the tails of the branches to what
290  * follows makes it hard to avoid.
291  */
292 static char *
293 reg(int paren, int *flagp)
294  /* Parenthesized? */
295 
296 {
297  char *ret;
298  char *br;
299  char *ender;
300  int parno=0;
301  int flags;
302 
303  *flagp = HASWIDTH; /* Tentatively. */
304 
305  /* Make an OPEN node, if parenthesized. */
306  if (paren) {
307  if (regnpar >= NSUBEXP)
308  FAIL("too many ()");
309  parno = regnpar;
310  regnpar++;
311  ret = regnode(OPEN+parno);
312  } else
313  ret = NULL;
314 
315  /* Pick up the branches, linking them together. */
316  br = regbranch(&flags);
317  if (br == NULL)
318  return(NULL);
319  if (ret != NULL)
320  regtail(ret, br); /* OPEN -> first. */
321  else
322  ret = br;
323  if (!(flags&HASWIDTH))
324  *flagp &= ~HASWIDTH;
325  *flagp |= flags&SPSTART;
326  while (*regparse == '|' || *regparse == '\n') {
327  regparse++;
328  br = regbranch(&flags);
329  if (br == NULL)
330  return(NULL);
331  regtail(ret, br); /* BRANCH -> BRANCH. */
332  if (!(flags&HASWIDTH))
333  *flagp &= ~HASWIDTH;
334  *flagp |= flags&SPSTART;
335  }
336 
337  /* Make a closing node, and hook it on the end. */
338  ender = regnode((paren) ? CLOSE+parno : END);
339  regtail(ret, ender);
340 
341  /* Hook the tails of the branches to the closing node. */
342  for (br = ret; br != NULL; br = regnext(br))
343  regoptail(br, ender);
344 
345  /* Check for proper termination. */
346  if (paren && *regparse++ != ')') {
347  FAIL("unmatched ()");
348  } else if (!paren && *regparse != '\0') {
349  if (*regparse == ')') {
350  FAIL("unmatched ()");
351  } else
352  FAIL("junk on end"); /* "Can't happen". */
353  /* NOTREACHED */
354  }
355 
356  return(ret);
357 }
358 
359 /*
360  - regbranch - one alternative of an | operator
361  *
362  * Implements the concatenation operator.
363  */
364 static char *
365 regbranch(int *flagp)
366 {
367  char *ret;
368  char *chain;
369  char *latest;
370  int flags;
371 
372  *flagp = WORST; /* Tentatively. */
373 
374  ret = regnode(BRANCH);
375  chain = NULL;
376  while (*regparse != '\0' && *regparse != ')' &&
377  *regparse != '\n' && *regparse != '|') {
378  latest = regpiece(&flags);
379  if (latest == NULL)
380  return(NULL);
381  *flagp |= flags&HASWIDTH;
382  if (chain == NULL) /* First piece. */
383  *flagp |= flags&SPSTART;
384  else
385  regtail(chain, latest);
386  chain = latest;
387  }
388  if (chain == NULL) /* Loop ran zero times. */
389  (void) regnode(NOTHING);
390 
391  return(ret);
392 }
393 
394 /*
395  - regpiece - something followed by possible [*+?]
396  *
397  * Note that the branching code sequences used for ? and the general cases
398  * of * and + are somewhat optimized: they use the same NOTHING node as
399  * both the endmarker for their branch list and the body of the last branch.
400  * It might seem that this node could be dispensed with entirely, but the
401  * endmarker role is not redundant.
402  */
403 static char *
404 regpiece(int *flagp)
405 {
406  char *ret;
407  char op;
408  char *next;
409  int flags;
410 
411  ret = regatom(&flags);
412  if (ret == NULL)
413  return(NULL);
414 
415  op = *regparse;
416  if (!ISMULT(op)) {
417  *flagp = flags;
418  return(ret);
419  }
420 
421  if (!(flags&HASWIDTH) && op != '?')
422  FAIL("*+ operand could be empty");
423  *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
424 
425  if (op == '*' && (flags&SIMPLE))
426  reginsert(STAR, ret);
427  else if (op == '*') {
428  /* Emit x* as (x&|), where & means "self". */
429  reginsert(BRANCH, ret); /* Either x */
430  regoptail(ret, regnode(BACK)); /* and loop */
431  regoptail(ret, ret); /* back */
432  regtail(ret, regnode(BRANCH)); /* or */
433  regtail(ret, regnode(NOTHING)); /* null. */
434  } else if (op == '+' && (flags&SIMPLE))
435  reginsert(PLUS, ret);
436  else if (op == '+') {
437  /* Emit x+ as x(&|), where & means "self". */
438  next = regnode(BRANCH); /* Either */
439  regtail(ret, next);
440  regtail(regnode(BACK), ret); /* loop back */
441  regtail(next, regnode(BRANCH)); /* or */
442  regtail(ret, regnode(NOTHING)); /* null. */
443  } else if (op == '?') {
444  /* Emit x? as (x|) */
445  reginsert(BRANCH, ret); /* Either x */
446  regtail(ret, regnode(BRANCH)); /* or */
447  next = regnode(NOTHING); /* null. */
448  regtail(ret, next);
449  regoptail(ret, next);
450  }
451  regparse++;
452  if (ISMULT(*regparse))
453  FAIL("nested *?+");
454 
455  return(ret);
456 }
457 
458 /*
459  - regatom - the lowest level
460  *
461  * Optimization: gobbles an entire sequence of ordinary characters so that
462  * it can turn them into a single node, which is smaller to store and
463  * faster to run. Backslashed characters are exceptions, each becoming a
464  * separate node; the code is simpler that way and it's not worth fixing.
465  */
466 static char *
467 regatom(int *flagp)
468 {
469  char *ret;
470  int flags;
471 
472  *flagp = WORST; /* Tentatively. */
473 
474  switch (*regparse++) {
475  /* FIXME: these chars only have meaning at beg/end of pat? */
476  case '^':
477  ret = regnode(BOL);
478  break;
479  case '$':
480  ret = regnode(EOL);
481  break;
482  case '.':
483  ret = regnode(ANY);
484  *flagp |= HASWIDTH|SIMPLE;
485  break;
486  case '[': {
487  int class1;
488  int classend;
489 
490  if (*regparse == '^') { /* Complement of range. */
491  ret = regnode(ANYBUT);
492  regparse++;
493  } else
494  ret = regnode(ANYOF);
495  if (*regparse == ']' || *regparse == '-')
496  regc(*regparse++);
497  while (*regparse != '\0' && *regparse != ']') {
498  if (*regparse == '-') {
499  regparse++;
500  if (*regparse == ']' || *regparse == '\0')
501  regc('-');
502  else {
503  class1 = UCHARAT(regparse-2)+1;
504  classend = UCHARAT(regparse);
505  if (class1 > classend+1)
506  FAIL("invalid [] range");
507  for (; class1 <= classend; class1++)
508  regc(class1);
509  regparse++;
510  }
511  } else
512  regc(*regparse++);
513  }
514  regc('\0');
515  if (*regparse != ']')
516  FAIL("unmatched []");
517  regparse++;
518  *flagp |= HASWIDTH|SIMPLE;
519  }
520  break;
521  case '(':
522  ret = reg(1, &flags);
523  if (ret == NULL)
524  return(NULL);
525  *flagp |= flags&(HASWIDTH|SPSTART);
526  break;
527  case '\0':
528  case '|':
529  case '\n':
530  case ')':
531  FAIL("internal urp"); /* Supposed to be caught earlier. */
532  break;
533  case '?':
534  case '+':
535  case '*':
536  FAIL("?+* follows nothing");
537  break;
538  case '\\':
539  switch (*regparse++) {
540  case '\0':
541  FAIL("trailing \\");
542  break;
543  case '<':
544  ret = regnode(WORDA);
545  break;
546  case '>':
547  ret = regnode(WORDZ);
548  break;
549  /* FIXME: Someday handle \1, \2, ... */
550  default:
551  /* Handle general quoted chars in exact-match routine */
552  goto de_fault;
553  }
554  break;
555  de_fault:
556  default:
557  /*
558  * Encode a string of characters to be matched exactly.
559  *
560  * This is a bit tricky due to quoted chars and due to
561  * '*', '+', and '?' taking the SINGLE char previous
562  * as their operand.
563  *
564  * On entry, the char at regparse[-1] is going to go
565  * into the string, no matter what it is. (It could be
566  * following a \ if we are entered from the '\' case.)
567  *
568  * Basic idea is to pick up a good char in ch and
569  * examine the next char. If it's *+? then we twiddle.
570  * If it's \ then we frozzle. If it's other magic char
571  * we push ch and terminate the string. If none of the
572  * above, we push ch on the string and go around again.
573  *
574  * regprev is used to remember where "the current char"
575  * starts in the string, if due to a *+? we need to back
576  * up and put the current char in a separate, 1-char, string.
577  * When regprev is NULL, ch is the only char in the
578  * string; this is used in *+? handling, and in setting
579  * flags |= SIMPLE at the end.
580  */
581  {
582  const char *regprev;
583  char ch = 0;
584 
585  regparse--; /* Look at cur char */
586  ret = regnode(EXACTLY);
587  for ( regprev = 0 ; ; ) {
588  ch = *regparse++; /* Get current char */
589  switch (*regparse) { /* look at next one */
590 
591  default:
592  regc(ch); /* Add cur to string */
593  break;
594 
595  case '.': case '[': case '(':
596  case ')': case '|': case '\n':
597  case '$': case '^':
598  case '\0':
599  /* FIXME, $ and ^ should not always be magic */
600  magic:
601  regc(ch); /* dump cur char */
602  goto done; /* and we are done */
603 
604  case '?': case '+': case '*':
605  if (!regprev) /* If just ch in str, */
606  goto magic; /* use it */
607  /* End mult-char string one early */
608  regparse = regprev; /* Back up parse */
609  goto done;
610 
611  case '\\':
612  regc(ch); /* Cur char OK */
613  switch (regparse[1]){ /* Look after \ */
614  case '\0':
615  case '<':
616  case '>':
617  /* FIXME: Someday handle \1, \2, ... */
618  goto done; /* Not quoted */
619  default:
620  /* Backup point is \, scan * point is after it. */
621  regprev = regparse;
622  regparse++;
623  continue; /* NOT break; */
624  }
625  }
626  regprev = regparse; /* Set backup point */
627  }
628  done:
629  regc('\0');
630  *flagp |= HASWIDTH;
631  if (!regprev) /* One char? */
632  *flagp |= SIMPLE;
633  }
634  break;
635  }
636 
637  return(ret);
638 }
639 
640 /*
641  - regnode - emit a node
642  */
643 static char * /* Location. */
644 regnode(char op)
645 {
646  char *ret;
647  char *ptr;
648 
649  ret = regcode;
650  if (ret == &regdummy) {
651  regsize += 3;
652  return(ret);
653  }
654 
655  ptr = ret;
656  *ptr++ = op;
657  *ptr++ = '\0'; /* Null "next" pointer. */
658  *ptr++ = '\0';
659  regcode = ptr;
660 
661  return(ret);
662 }
663 
664 /*
665  - regc - emit (if appropriate) a byte of code
666  */
667 static void
668 regc(unsigned char b)
669 {
670  if (regcode != &regdummy)
671  *regcode++ = b;
672  else
673  regsize++;
674 }
675 
676 /*
677  - reginsert - insert an operator in front of already-emitted operand
678  *
679  * Means relocating the operand.
680  */
681 static void
682 reginsert(char op, char *opnd)
683 {
684  char *src;
685  char *dst;
686  char *place;
687 
688  if (regcode == &regdummy) {
689  regsize += 3;
690  return;
691  }
692 
693  src = regcode;
694  regcode += 3;
695  dst = regcode;
696  while (src > opnd)
697  *--dst = *--src;
698 
699  place = opnd; /* Op node, where operand used to be. */
700  *place++ = op;
701  *place++ = '\0';
702  *place++ = '\0';
703 }
704 
705 /*
706  - regtail - set the next-pointer at the end of a node chain
707  */
708 static void
709 regtail(char *p, char *val)
710 {
711  char *scan;
712  char *temp;
713  int offset;
714 
715  if (p == &regdummy)
716  return;
717 
718  /* Find last node. */
719  scan = p;
720  for (;;) {
721  temp = regnext(scan);
722  if (temp == NULL)
723  break;
724  scan = temp;
725  }
726 
727  if (OP(scan) == BACK)
728  offset = scan - val;
729  else
730  offset = val - scan;
731  *(scan+1) = (offset>>8)&0377;
732  *(scan+2) = offset&0377;
733 }
734 
735 /*
736  - regoptail - regtail on operand of first argument; nop if operandless
737  */
738 static void
739 regoptail(char *p, char *val)
740 {
741  /* "Operandless" and "op != BRANCH" are synonymous in practice. */
742  if (p == NULL || p == &regdummy || OP(p) != BRANCH)
743  return;
744  regtail(OPERAND(p), val);
745 }
746 
747 /*
748  * regexec and friends
749  */
750 
751 /*
752  * Global work variables for regexec().
753  */
754 static const char *reginput; /* EST_String-input pointer. */
755 static const char *regbol; /* Beginning of input, for ^ check. */
756 static const char **regstartp; /* Pointer to startp array. */
757 static const char **regendp; /* Ditto for endp. */
758 
759 /*
760  * Forwards.
761  */
762 STATIC int regtry(hs_regexp *prog, const char *string);
763 STATIC int regmatch(char *prog);
764 STATIC int regrepeat(char *p);
765 
766 #ifdef DEBUG
767 FILE *regnarrate = stdout;
768 void regdump();
769 STATIC char *regprop( char *scan);
770 #endif
771 
772 /*
773  - regexec - match a regexp against a string
774  */
775 int
776 hs_regexec(const hs_regexp *prog, const char *string)
777 {
778  char *s;
779 
780  /* Be paranoid... */
781  if (prog == NULL || string == NULL) {
782  hs_regerror("NULL parameter");
783  return(0);
784  }
785 
786  /* Check validity of program. */
787  if (UCHARAT(prog->program) != MAGIC) {
788  hs_regerror("corrupted program");
789  return(0);
790  }
791 
792  /* If there is a "must appear" string, look for it. */
793  if (prog->regmust != NULL) {
794  s = (char *)string;
795  while ((s = strchr(s, prog->regmust[0])) != NULL) {
796  if (strncmp(s, prog->regmust, prog->regmlen) == 0)
797  break; /* Found it. */
798  s++;
799  }
800  if (s == NULL) /* Not present. */
801  return(0);
802  }
803 
804  /* Mark beginning of line for ^ . */
805  regbol = (char *)string;
806 
807  /* Simplest case: anchored match need be tried only once. */
808  if (prog->reganch)
809  return(regtry((hs_regexp *)prog, string));
810 
811  /* Messy cases: unanchored match. */
812  s = (char *)string;
813  if (prog->regstart != '\0')
814  /* We know what char it must start with. */
815  while ((s = strchr(s, prog->regstart)) != NULL) {
816  if (regtry((hs_regexp *)prog, s))
817  return(1);
818  s++;
819  }
820  else
821  /* We don't -- general case. */
822  do {
823  if (regtry((hs_regexp *)prog, s))
824  return(1);
825  } while (*s++ != '\0');
826 
827  /* Failure. */
828  return(0);
829 }
830 
831 /*
832  - regtry - try match at specific point
833  */
834 static int /* 0 failure, 1 success */
835 regtry(hs_regexp *prog, const char *string)
836 {
837  int i;
838  char **sp;
839  char **ep;
840 
841  reginput = string;
842  regstartp = (const char **)prog->startp;
843  regendp = (const char **)prog->endp;
844 
845  sp = prog->startp;
846  ep = prog->endp;
847  for (i = NSUBEXP; i > 0; i--) {
848  *sp++ = NULL;
849  *ep++ = NULL;
850  }
851  if (regmatch(prog->program + 1)) {
852  prog->startp[0] = (char *)string;
853  prog->endp[0] = (char *)reginput;
854  return(1);
855  } else
856  return(0);
857 }
858 
859 /*
860  - regmatch - main matching routine
861  *
862  * Conceptually the strategy is simple: check to see whether the current
863  * node matches, call self recursively to see whether the rest matches,
864  * and then act accordingly. In practice we make some effort to avoid
865  * recursion, in particular by going through "ordinary" nodes (that don't
866  * need to know whether the rest of the match failed) by a loop instead of
867  * by recursion.
868  */
869 static int /* 0 failure, 1 success */
870 regmatch(char *prog)
871 {
872  char *scan; /* Current node. */
873  char *next; /* Next node. */
874 
875  scan = prog;
876 #ifdef DEBUG
877  if (scan != NULL && regnarrate)
878  fprintf(regnarrate, "%s(\n", regprop(scan));
879 #endif
880  while (scan != NULL) {
881 #ifdef DEBUG
882  if (regnarrate)
883  fprintf(regnarrate, "%s...\n", regprop(scan));
884 #endif
885  next = regnext(scan);
886 
887  switch (OP(scan)) {
888  case BOL:
889  if (reginput != regbol)
890  return(0);
891  break;
892  case EOL:
893  if (*reginput != '\0')
894  return(0);
895  break;
896  case WORDA:
897  /* Must be looking at a letter, digit, or _ */
898  if ((!isalnum((int)*reginput)) && *reginput != '_')
899  return(0);
900  /* Prev must be BOL or nonword */
901  if (reginput > regbol &&
902  (isalnum((int)reginput[-1]) || reginput[-1] == '_'))
903  return(0);
904  break;
905  case WORDZ:
906  /* Must be looking at non letter, digit, or _ */
907  if (isalnum((int)*reginput) || *reginput == '_')
908  return(0);
909  /* We don't care what the previous char was */
910  break;
911  case ANY:
912  if (*reginput == '\0')
913  return(0);
914  reginput++;
915  break;
916  case EXACTLY: {
917  int len;
918  char *opnd;
919 
920  opnd = OPERAND(scan);
921  /* Inline the first character, for speed. */
922  if (*opnd != *reginput)
923  return(0);
924  len = strlen(opnd);
925  if (len > 1 && strncmp(opnd, reginput, len) != 0)
926  return(0);
927  reginput += len;
928  }
929  break;
930  case ANYOF:
931  if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
932  return(0);
933  reginput++;
934  break;
935  case ANYBUT:
936  if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
937  return(0);
938  reginput++;
939  break;
940  case NOTHING:
941  break;
942  case BACK:
943  break;
944  case OPEN+1:
945  case OPEN+2:
946  case OPEN+3:
947  case OPEN+4:
948  case OPEN+5:
949  case OPEN+6:
950  case OPEN+7:
951  case OPEN+8:
952  case OPEN+9: {
953  int no;
954  const char *save;
955 
956  no = OP(scan) - OPEN;
957  save = reginput;
958 
959  if (regmatch(next)) {
960  /*
961  * Don't set startp if some later
962  * invocation of the same parentheses
963  * already has.
964  */
965  if (regstartp[no] == NULL)
966  regstartp[no] = save;
967  return(1);
968  } else
969  return(0);
970  }
971  break;
972  case CLOSE+1:
973  case CLOSE+2:
974  case CLOSE+3:
975  case CLOSE+4:
976  case CLOSE+5:
977  case CLOSE+6:
978  case CLOSE+7:
979  case CLOSE+8:
980  case CLOSE+9: {
981  int no;
982  const char *save;
983 
984  no = OP(scan) - CLOSE;
985  save = reginput;
986 
987  if (regmatch(next)) {
988  /*
989  * Don't set endp if some later
990  * invocation of the same parentheses
991  * already has.
992  */
993  if (regendp[no] == NULL)
994  regendp[no] = save;
995  return(1);
996  } else
997  return(0);
998  }
999  break;
1000  case BRANCH: {
1001  const char *save;
1002 
1003  if (OP(next) != BRANCH) /* No choice. */
1004  next = OPERAND(scan); /* Avoid recursion. */
1005  else {
1006  do {
1007  save = reginput;
1008  if (regmatch(OPERAND(scan)))
1009  return(1);
1010  reginput = save;
1011  scan = regnext(scan);
1012  } while (scan != NULL && OP(scan) == BRANCH);
1013  return(0);
1014  /* NOTREACHED */
1015  }
1016  }
1017  break;
1018  case STAR:
1019  case PLUS: {
1020  char nextch;
1021  int no;
1022  const char *save;
1023  int min;
1024 
1025  /*
1026  * Lookahead to avoid useless match attempts
1027  * when we know what character comes next.
1028  */
1029  nextch = '\0';
1030  if (OP(next) == EXACTLY)
1031  nextch = *OPERAND(next);
1032  min = (OP(scan) == STAR) ? 0 : 1;
1033  save = reginput;
1034 
1035  no = regrepeat(OPERAND(scan));
1036  while (no >= min) {
1037  /* If it could work, try it. */
1038  if (nextch == '\0' || *reginput == nextch)
1039  {
1040  if (regmatch(next))
1041  return(1);
1042  }
1043  /* Couldn't or didn't -- back up. */
1044  no--;
1045  reginput = save + no;
1046  }
1047  return(0);
1048  }
1049  break;
1050  case END:
1051  return(1); /* Success! */
1052  break;
1053  default:
1054  hs_regerror("memory corruption");
1055  return(0);
1056  break;
1057  }
1058 
1059  scan = next;
1060  }
1061 
1062  /*
1063  * We get here only if there's trouble -- normally "case END" is
1064  * the terminating point.
1065  */
1066  hs_regerror("corrupted pointers");
1067  return(0);
1068 }
1069 
1070 /*
1071  - regrepeat - repeatedly match something simple, report how many
1072  */
1073 static int
1074 regrepeat(char *p)
1075 {
1076  int count = 0;
1077  const char *scan;
1078  char *opnd;
1079 
1080  scan = reginput;
1081  opnd = OPERAND(p);
1082  switch (OP(p)) {
1083  case ANY:
1084  count = strlen(scan);
1085  scan += count;
1086  break;
1087  case EXACTLY:
1088  while (*opnd == *scan) {
1089  count++;
1090  scan++;
1091  }
1092  break;
1093  case ANYOF:
1094  while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1095  count++;
1096  scan++;
1097  }
1098  break;
1099  case ANYBUT:
1100  while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1101  count++;
1102  scan++;
1103  }
1104  break;
1105  default: /* Oh dear. Called inappropriately. */
1106  hs_regerror("internal foulup");
1107  count = 0; /* Best compromise. */
1108  break;
1109  }
1110  reginput = scan;
1111 
1112  return(count);
1113 }
1114 
1115 /*
1116  - regnext - dig the "next" pointer out of a node
1117  */
1118 static char *
1119 regnext(char *p)
1120 {
1121  int offset;
1122 
1123  if (p == &regdummy)
1124  return(NULL);
1125 
1126  offset = NEXT(p);
1127  if (offset == 0)
1128  return(NULL);
1129 
1130  if (OP(p) == BACK)
1131  return(p-offset);
1132  else
1133  return(p+offset);
1134 }
1135 
1136 #ifdef DEBUG
1137 
1138 STATIC char *regprop(char *scan);
1139 
1140 /*
1141  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1142  */
1143 void
1144 regdump(hs_regexp *r)
1145 {
1146  char *s;
1147  char op = EXACTLY; /* Arbitrary non-END op. */
1148  char *next;
1149 
1150  s = r->program + 1;
1151  while (op != END) { /* While that wasn't END last time... */
1152  op = OP(s);
1153  printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1154  next = regnext(s);
1155  if (next == NULL) /* Next ptr. */
1156  printf("(0)");
1157  else
1158  printf("(%d)", (s-r->program)+(next-s));
1159  s += 3;
1160  if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1161  /* Literal string, where present. */
1162  while (*s != '\0') {
1163  putchar(*s);
1164  s++;
1165  }
1166  s++;
1167  }
1168  putchar('\n');
1169  }
1170 
1171  /* Header fields of interest. */
1172  if (r->regstart != '\0')
1173  printf("start `%c' ", r->regstart);
1174  if (r->reganch)
1175  printf("anchored ");
1176  if (r->regmust != NULL)
1177  printf("must have \"%s\"", r->regmust);
1178  printf("\n");
1179 }
1180 
1181 /*
1182  - regprop - printable representation of opcode
1183  */
1184 static char *
1185 regprop(char *op)
1186 {
1187  char *p=NULL;
1188  static char buf[50];
1189 
1190  (void) strcpy(buf, ":");
1191 
1192  switch (OP(op)) {
1193  case BOL:
1194  p = "BOL";
1195  break;
1196  case EOL:
1197  p = "EOL";
1198  break;
1199  case ANY:
1200  p = "ANY";
1201  break;
1202  case ANYOF:
1203  p = "ANYOF";
1204  break;
1205  case ANYBUT:
1206  p = "ANYBUT";
1207  break;
1208  case BRANCH:
1209  p = "BRANCH";
1210  break;
1211  case EXACTLY:
1212  p = "EXACTLY";
1213  break;
1214  case NOTHING:
1215  p = "NOTHING";
1216  break;
1217  case BACK:
1218  p = "BACK";
1219  break;
1220  case END:
1221  p = "END";
1222  break;
1223  case OPEN+1:
1224  case OPEN+2:
1225  case OPEN+3:
1226  case OPEN+4:
1227  case OPEN+5:
1228  case OPEN+6:
1229  case OPEN+7:
1230  case OPEN+8:
1231  case OPEN+9:
1232  sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1233  p = NULL;
1234  break;
1235  case CLOSE+1:
1236  case CLOSE+2:
1237  case CLOSE+3:
1238  case CLOSE+4:
1239  case CLOSE+5:
1240  case CLOSE+6:
1241  case CLOSE+7:
1242  case CLOSE+8:
1243  case CLOSE+9:
1244  sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1245  p = NULL;
1246  break;
1247  case STAR:
1248  p = "STAR";
1249  break;
1250  case PLUS:
1251  p = "PLUS";
1252  break;
1253  case WORDA:
1254  p = "WORDA";
1255  break;
1256  case WORDZ:
1257  p = "WORDZ";
1258  break;
1259  default:
1260  hs_regerror("corrupted opcode");
1261  break;
1262  }
1263  if (p != NULL)
1264  (void) strcat(buf, p);
1265  return(buf);
1266 }
1267 #endif
1268 
1269 /*
1270  * The following is provided for those people who do not have strcspn() in
1271  * their C libraries. They should get off their butts and do something
1272  * about it; at least one public-domain implementation of those (highly
1273  * useful) string routines has been published on Usenet.
1274  */
1275 #ifdef STRCSPN
1276 /*
1277  * strcspn - find length of initial segment of s1 consisting entirely
1278  * of characters not from s2
1279  */
1280 
1281 static int
1282 strcspn(s1, s2)
1283 char *s1;
1284 char *s2;
1285 {
1286  char *scan1;
1287  char *scan2;
1288  int count;
1289 
1290  count = 0;
1291  for (scan1 = s1; *scan1 != '\0'; scan1++) {
1292  for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1293  if (*scan1 == *scan2++)
1294  return(count);
1295  count++;
1296  }
1297  return(count);
1298 }
1299 #endif
int regmlen
Definition: regexp.h:56
#define CLOSE
Definition: regexp.cc:95
STATIC char * regnode(char op)
Definition: regexp.cc:644
#define OP(p)
Definition: regexp.cc:129
STATIC void regc(unsigned char b)
Definition: regexp.cc:668
#define BOL
Definition: regexp.cc:80
STATIC char * regbranch(int *flagp)
Definition: regexp.cc:365
void hs_regerror(char *s) const
Definition: regerror.c:6
#define END
Definition: regexp.cc:79
#define EOL
Definition: regexp.cc:81
#define NSUBEXP
Definition: regexp.h:49
STATIC char * regpiece(int *flagp)
Definition: regexp.cc:404
char * endp[NSUBEXP]
Definition: regexp.h:52
#define NEXT(p)
Definition: regexp.cc:130
bool save(Lattice &lattice, EST_String filename)
char * regmust
Definition: regexp.h:55
STATIC int regrepeat(char *p)
Definition: regexp.cc:1074
STATIC char * reg(int paren, int *flagp)
Definition: regexp.cc:293
STATIC int regmatch(char *prog)
Definition: regexp.cc:870
#define NOTHING
Definition: regexp.cc:88
#define PLUS
Definition: regexp.cc:90
#define FAIL(m)
Definition: regexp.cc:147
#define WORDZ
Definition: regexp.cc:92
#define HASWIDTH
Definition: regexp.cc:153
#define STATIC
Definition: regexp.cc:171
#define OPERAND(p)
Definition: regexp.cc:131
hs_regexp * hs_regcomp(const char *exp)
Definition: regexp.cc:203
#define ISMULT(c)
Definition: regexp.cc:148
float min(float a, float b)
Definition: EST_cluster.cc:138
NULL
Definition: EST_WFST.cc:55
#define WORDA
Definition: regexp.cc:91
int hs_regexec(const hs_regexp *prog, const char *string)
Definition: regexp.cc:776
#define EXACTLY
Definition: regexp.cc:87
#define BRANCH
Definition: regexp.cc:85
#define SIMPLE
Definition: regexp.cc:154
STATIC void regoptail(char *p, char *val)
Definition: regexp.cc:739
#define ANYBUT
Definition: regexp.cc:84
#define ANYOF
Definition: regexp.cc:83
#define OPEN
Definition: regexp.cc:93
#define STAR
Definition: regexp.cc:89
STATIC char * regnext(register char *p)
char reganch
Definition: regexp.h:54
STATIC int regtry(hs_regexp *prog, const char *string)
Definition: regexp.cc:835
char regstart
Definition: regexp.h:53
#define BACK
Definition: regexp.cc:86
STATIC void regtail(char *p, char *val)
Definition: regexp.cc:709
#define ANY
Definition: regexp.cc:82
STATIC char * regatom(int *flagp)
Definition: regexp.cc:467
#define WORST
Definition: regexp.cc:156
char program[1]
Definition: regexp.h:57
#define UCHARAT(p)
Definition: regexp.cc:142
#define MAGIC
Definition: regmagic.h:5
#define SPSTART
Definition: regexp.cc:155
STATIC void reginsert(char op, char *opnd)
Definition: regexp.cc:682
char * startp[NSUBEXP]
Definition: regexp.h:51