github.com/afumu/libc@v0.0.6/musl/src/regex/regcomp.c (about)

     1  /*
     2    regcomp.c - TRE POSIX compatible regex compilation functions.
     3  
     4    Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
     5    All rights reserved.
     6  
     7    Redistribution and use in source and binary forms, with or without
     8    modification, are permitted provided that the following conditions
     9    are met:
    10  
    11      1. Redistributions of source code must retain the above copyright
    12         notice, this list of conditions and the following disclaimer.
    13  
    14      2. Redistributions in binary form must reproduce the above copyright
    15         notice, this list of conditions and the following disclaimer in the
    16         documentation and/or other materials provided with the distribution.
    17  
    18    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
    19    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    20    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    21    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
    22    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    23    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    24    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    25    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    26    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    27    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    28    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29  
    30  */
    31  
    32  #include <string.h>
    33  #include <stdlib.h>
    34  #include <regex.h>
    35  #include <limits.h>
    36  #include <stdint.h>
    37  #include <ctype.h>
    38  
    39  #include "tre.h"
    40  
    41  #include <assert.h>
    42  
    43  /***********************************************************************
    44   from tre-compile.h
    45  ***********************************************************************/
    46  
/* One entry of a position set; tre_ast_node_t keeps its `firstpos' and
   `lastpos' sets as pointers to these.  The code that fills in and
   consumes the entries is not part of this section, so the notes below
   are limited to what the declarations themselves show. */
typedef struct {
  int position;
  int code_min;
  int code_max;
  int *tags;
  int assertions;
  /* `class' and `neg_classes' have the same names and types as the
     corresponding tre_literal_t fields -- presumably copied from the
     literal; TODO confirm against the code that builds these sets. */
  tre_ctype_t class;
  tre_ctype_t *neg_classes;
  int backref;
} tre_pos_and_tags_t;
    57  
    58  
    59  /***********************************************************************
    60   from tre-ast.c and tre-ast.h
    61  ***********************************************************************/
    62  
/* The different AST node types.  The value is stored in
   tre_ast_node_t.type and tells which structure `obj' points to. */
typedef enum {
  LITERAL,      /* obj is a tre_literal_t */
  CATENATION,   /* obj is a tre_catenation_t */
  ITERATION,    /* obj is a tre_iteration_t */
  UNION         /* obj is a tre_union_t */
} tre_ast_type_t;
    70  
/* Special subtypes of LITERAL nodes.  They are stored as negative values
   in the `code_min' field of the literal, which is how the IS_*
   predicates below recognise them. */
#define EMPTY	  -1   /* Empty leaf (denotes empty string). */
#define ASSERTION -2   /* Assertion leaf. */
#define TAG	  -3   /* Tag leaf. */
#define BACKREF	  -4   /* Back reference leaf. */

/* Predicates taking a pointer to a structure with a `code_min' member
   (for example a tre_literal_t *). */
#define IS_SPECIAL(x)	((x)->code_min < 0)
#define IS_EMPTY(x)	((x)->code_min == EMPTY)
#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
#define IS_TAG(x)	((x)->code_min == TAG)
#define IS_BACKREF(x)	((x)->code_min == BACKREF)
    82  
    83  
/* A generic AST node.  All AST nodes consist of this node on the top
   level with `obj' pointing to the actual content. */
typedef struct {
  tre_ast_type_t type;   /* Type of the node. */
  void *obj;             /* Pointer to actual node. */
  int nullable;          /* Set to -1 by tre_ast_new_node(). */
  int submatch_id;       /* -1 until marksub() assigns a submatch ID. */
  int num_submatches;    /* The node constructors sum the children's
			    counts; marksub() adds one. */
  int num_tags;
  tre_pos_and_tags_t *firstpos;
  tre_pos_and_tags_t *lastpos;
} tre_ast_node_t;
    96  
    97  
/* A "literal" node.  These are created for assertions, back references,
   tags, matching parameter settings, and all expressions that match one
   character. */
typedef struct {
  /* For an ordinary literal, the inclusive range of character codes it
     matches.  A negative `code_min' selects one of the special subtypes
     (EMPTY, ASSERTION, TAG, BACKREF); `code_max' then carries that
     subtype's argument, e.g. the assertion flag or the back reference
     number passed by parse_atom(). */
  long code_min;
  long code_max;
  /* Position number taken from the parse context, or -1 for the leaves
     that parse_atom() creates without one (assertions, EMPTY). */
  int position;
  /* Character class and zero-terminated list of negated classes, filled
     in while parsing bracket expressions. */
  tre_ctype_t class;
  tre_ctype_t *neg_classes;
} tre_literal_t;
   108  
/* A "catenation" node.	 These are created when two regexps are concatenated.
   If there is more than one subexpression in sequence, the `left' part
   holds all but the last, and the `right' part holds the last subexpression
   (catenation is left associative). */
typedef struct {
  tre_ast_node_t *left;    /* Everything before the last subexpression. */
  tre_ast_node_t *right;   /* The last subexpression. */
} tre_catenation_t;
   117  
/* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}"
   operators. */
typedef struct {
  /* Subexpression to match. */
  tre_ast_node_t *arg;
  /* Minimum number of consecutive matches. */
  int min;
  /* Maximum number of consecutive matches.  parse_dup() reports an
     omitted upper bound as -1; presumably that value ends up here to
     mean "unbounded" -- TODO confirm in tre_parse(). */
  int max;
  /* If 0, match as many characters as possible, if 1 match as few as
     possible.	Note that this does not always mean the same thing as
     matching as many/few repetitions as possible. */
  unsigned int minimal:1;
} tre_iteration_t;
   132  
/* A "union" node.  These are created for the "|" operator; the bracket
   and case-insensitive literal parsers also use unions to combine
   alternative literals. */
typedef struct {
  tre_ast_node_t *left;
  tre_ast_node_t *right;
} tre_union_t;
   138  
   139  
   140  static tre_ast_node_t *
   141  tre_ast_new_node(tre_mem_t mem, int type, void *obj)
   142  {
   143  	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
   144  	if (!node || !obj)
   145  		return 0;
   146  	node->obj = obj;
   147  	node->type = type;
   148  	node->nullable = -1;
   149  	node->submatch_id = -1;
   150  	return node;
   151  }
   152  
   153  static tre_ast_node_t *
   154  tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
   155  {
   156  	tre_ast_node_t *node;
   157  	tre_literal_t *lit;
   158  
   159  	lit = tre_mem_calloc(mem, sizeof *lit);
   160  	node = tre_ast_new_node(mem, LITERAL, lit);
   161  	if (!node)
   162  		return 0;
   163  	lit->code_min = code_min;
   164  	lit->code_max = code_max;
   165  	lit->position = position;
   166  	return node;
   167  }
   168  
   169  static tre_ast_node_t *
   170  tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
   171  {
   172  	tre_ast_node_t *node;
   173  	tre_iteration_t *iter;
   174  
   175  	iter = tre_mem_calloc(mem, sizeof *iter);
   176  	node = tre_ast_new_node(mem, ITERATION, iter);
   177  	if (!node)
   178  		return 0;
   179  	iter->arg = arg;
   180  	iter->min = min;
   181  	iter->max = max;
   182  	iter->minimal = minimal;
   183  	node->num_submatches = arg->num_submatches;
   184  	return node;
   185  }
   186  
   187  static tre_ast_node_t *
   188  tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
   189  {
   190  	tre_ast_node_t *node;
   191  	tre_union_t *un;
   192  
   193  	if (!left)
   194  		return right;
   195  	un = tre_mem_calloc(mem, sizeof *un);
   196  	node = tre_ast_new_node(mem, UNION, un);
   197  	if (!node || !right)
   198  		return 0;
   199  	un->left = left;
   200  	un->right = right;
   201  	node->num_submatches = left->num_submatches + right->num_submatches;
   202  	return node;
   203  }
   204  
   205  static tre_ast_node_t *
   206  tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
   207  {
   208  	tre_ast_node_t *node;
   209  	tre_catenation_t *cat;
   210  
   211  	if (!left)
   212  		return right;
   213  	cat = tre_mem_calloc(mem, sizeof *cat);
   214  	node = tre_ast_new_node(mem, CATENATION, cat);
   215  	if (!node)
   216  		return 0;
   217  	cat->left = left;
   218  	cat->right = right;
   219  	node->num_submatches = left->num_submatches + right->num_submatches;
   220  	return node;
   221  }
   222  
   223  
   224  /***********************************************************************
   225   from tre-stack.c and tre-stack.h
   226  ***********************************************************************/
   227  
typedef struct tre_stack_rec tre_stack_t;

/* Creates a new stack object.	`size' is the initial capacity, `max_size'
   is the maximum capacity, and `increment' specifies how much capacity is
   added with realloc() whenever all space gets used up.  All three are
   counted in stack items, not bytes.  Returns the stack object or NULL
   if out of memory. */
static tre_stack_t *
tre_stack_new(int size, int max_size, int increment);

/* Frees the stack object. */
static void
tre_stack_destroy(tre_stack_t *s);

/* Returns the current number of objects in the stack. */
static int
tre_stack_num_objects(tre_stack_t *s);

/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
   `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
   This tries to realloc() more space before failing if maximum size
   has not yet been reached.  Returns REG_OK if successful. */
#define declare_pushf(typetag, type)					      \
  static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)

declare_pushf(voidptr, void *);
declare_pushf(int, int);

/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
   element off of stack `s' and returns it.  The stack must not be
   empty, and the element must have been pushed with the matching
   tre_stack_push_* function. */
#define declare_popf(typetag, type)		  \
  static type tre_stack_pop_ ## typetag(tre_stack_t *s)

declare_popf(voidptr, void *);
declare_popf(int, int);
   263  
/* Just to save some typing.  All three macros push `value' with
   tre_stack_push_<typetag>(); they differ in how a failure is reported. */

/* Stores the result in a variable named `status' that must exist in the
   calling scope; the caller checks it. */
#define STACK_PUSH(s, typetag, value)					      \
  do									      \
    {									      \
      status = tre_stack_push_ ## typetag(s, value);			      \
    }									      \
  while (/*CONSTCOND*/0)

/* Like STACK_PUSH, but on failure also executes `break', leaving the
   enclosing loop or switch.  Expands to a bare brace block rather than
   do/while(0), so take care when using it as the body of an if/else. */
#define STACK_PUSHX(s, typetag, value)					      \
  {									      \
    status = tre_stack_push_ ## typetag(s, value);			      \
    if (status != REG_OK)						      \
      break;								      \
  }

/* On failure returns the error code from the enclosing function, which
   must therefore return reg_errcode_t.  Uses its own local variable and
   does not touch `status'. */
#define STACK_PUSHR(s, typetag, value)					      \
  {									      \
    reg_errcode_t _status;						      \
    _status = tre_stack_push_ ## typetag(s, value);			      \
    if (_status != REG_OK)						      \
      return _status;							      \
  }
   286  
/* One stack slot: holds either a pointer or an int, depending on which
   push function stored it.  The slot does not record which member is
   valid, so pops must mirror the pushes. */
union tre_stack_item {
  void *voidptr_value;
  int int_value;
};

/* The stack object behind tre_stack_t. */
struct tre_stack_rec {
  int size;       /* Number of slots currently allocated in `stack'. */
  int max_size;   /* Upper bound for `size'; pushes beyond it fail. */
  int increment;  /* Number of slots added each time the array grows. */
  int ptr;        /* Number of items stored; index of the next free slot. */
  union tre_stack_item *stack;  /* Slot array, allocated with xmalloc(). */
};
   299  
   300  
   301  static tre_stack_t *
   302  tre_stack_new(int size, int max_size, int increment)
   303  {
   304    tre_stack_t *s;
   305  
   306    s = xmalloc(sizeof(*s));
   307    if (s != NULL)
   308      {
   309        s->stack = xmalloc(sizeof(*s->stack) * size);
   310        if (s->stack == NULL)
   311  	{
   312  	  xfree(s);
   313  	  return NULL;
   314  	}
   315        s->size = size;
   316        s->max_size = max_size;
   317        s->increment = increment;
   318        s->ptr = 0;
   319      }
   320    return s;
   321  }
   322  
   323  static void
   324  tre_stack_destroy(tre_stack_t *s)
   325  {
   326    xfree(s->stack);
   327    xfree(s);
   328  }
   329  
   330  static int
   331  tre_stack_num_objects(tre_stack_t *s)
   332  {
   333    return s->ptr;
   334  }
   335  
   336  static reg_errcode_t
   337  tre_stack_push(tre_stack_t *s, union tre_stack_item value)
   338  {
   339    if (s->ptr < s->size)
   340      {
   341        s->stack[s->ptr] = value;
   342        s->ptr++;
   343      }
   344    else
   345      {
   346        if (s->size >= s->max_size)
   347  	{
   348  	  return REG_ESPACE;
   349  	}
   350        else
   351  	{
   352  	  union tre_stack_item *new_buffer;
   353  	  int new_size;
   354  	  new_size = s->size + s->increment;
   355  	  if (new_size > s->max_size)
   356  	    new_size = s->max_size;
   357  	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
   358  	  if (new_buffer == NULL)
   359  	    {
   360  	      return REG_ESPACE;
   361  	    }
   362  	  assert(new_size > s->size);
   363  	  s->size = new_size;
   364  	  s->stack = new_buffer;
   365  	  tre_stack_push(s, value);
   366  	}
   367      }
   368    return REG_OK;
   369  }
   370  
/* Defines tre_stack_push_<typetag>(): wraps `value' in a
   union tre_stack_item through the `<typetag>_value' member and hands it
   to tre_stack_push(), returning that function's status. */
#define define_pushf(typetag, type)  \
  declare_pushf(typetag, type) {     \
    union tre_stack_item item;	     \
    item.typetag ## _value = value;  \
    return tre_stack_push(s, item);  \
}

/* Instantiate the push functions for the two members of
   union tre_stack_item. */
define_pushf(int, int)
define_pushf(voidptr, void *)
   380  
/* Defines tre_stack_pop_<typetag>(): decrements the stack pointer and
   returns the item that was on top, read through the `<typetag>_value'
   member.  There is no underflow check; the caller must make sure the
   stack is not empty. */
#define define_popf(typetag, type)		    \
  declare_popf(typetag, type) {			    \
    return s->stack[--s->ptr].typetag ## _value;    \
  }

/* Instantiate the pop functions for the two members of
   union tre_stack_item. */
define_popf(int, int)
define_popf(voidptr, void *)
   388  
   389  
   390  /***********************************************************************
   391   from tre-parse.c and tre-parse.h
   392  ***********************************************************************/
   393  
/* Parse context.  Shared by tre_parse() and the parse_* helpers; the
   helpers report their results through `n' and `s'. */
typedef struct {
	/* Memory allocator. The AST is allocated using this. */
	tre_mem_t mem;
	/* Stack used for keeping track of regexp syntax. */
	tre_stack_t *stack;
	/* The parsed node after a parse function returns. */
	tre_ast_node_t *n;
	/* Position in the regexp pattern after a parse function returns. */
	const char *s;
	/* The first character of the last subexpression parsed.
	   parse_atom() compares against it to decide whether a '^' in a
	   BRE is an anchor or a literal. */
	const char *start;
	/* Current submatch ID. */
	int submatch_id;
	/* Current position (number of literal).  Incremented as the
	   parse functions hand out positions to new literals. */
	int position;
	/* The highest back reference or -1 if none seen so far. */
	int max_backref;
	/* Compilation flags. */
	int cflags;
} tre_parse_ctx_t;
   415  
   416  /* Some macros for expanding \w, \s, etc. */
   417  static const struct {
   418  	char c;
   419  	const char *expansion;
   420  } tre_macros[] = {
   421  	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
   422  	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
   423  	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
   424  	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
   425  	{ 0, 0 }
   426  };
   427  
   428  /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
   429     must have at least `len' items.  Sets buf[0] to zero if the there
   430     is no match in `tre_macros'. */
   431  static const char *tre_expand_macro(const char *s)
   432  {
   433  	int i;
   434  	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
   435  	return tre_macros[i].expansion;
   436  }
   437  
   438  static int
   439  tre_compare_lit(const void *a, const void *b)
   440  {
   441  	const tre_literal_t *const *la = a;
   442  	const tre_literal_t *const *lb = b;
   443  	/* assumes the range of valid code_min is < INT_MAX */
   444  	return la[0]->code_min - lb[0]->code_min;
   445  }
   446  
/* Growable array of literal pointers collected while parsing one
   bracket expression. */
struct literals {
	tre_mem_t mem;       /* Allocator used for the literals themselves. */
	tre_literal_t **a;   /* Pointer array; tre_new_lit() grows it with xrealloc(). */
	int len;             /* Number of entries in use. */
	int cap;             /* Number of entries allocated. */
};
   453  
   454  static tre_literal_t *tre_new_lit(struct literals *p)
   455  {
   456  	tre_literal_t **a;
   457  	if (p->len >= p->cap) {
   458  		if (p->cap >= 1<<15)
   459  			return 0;
   460  		p->cap *= 2;
   461  		a = xrealloc(p->a, p->cap * sizeof *p->a);
   462  		if (!a)
   463  			return 0;
   464  		p->a = a;
   465  	}
   466  	a = p->a + p->len++;
   467  	*a = tre_mem_calloc(p->mem, sizeof **a);
   468  	return *a;
   469  }
   470  
   471  static int add_icase_literals(struct literals *ls, int min, int max)
   472  {
   473  	tre_literal_t *lit;
   474  	int b, e, c;
   475  	for (c=min; c<=max; ) {
   476  		/* assumes islower(c) and isupper(c) are exclusive
   477  		   and toupper(c)!=c if islower(c).
   478  		   multiple opposite case characters are not supported */
   479  		if (tre_islower(c)) {
   480  			b = e = tre_toupper(c);
   481  			for (c++, e++; c<=max; c++, e++)
   482  				if (tre_toupper(c) != e) break;
   483  		} else if (tre_isupper(c)) {
   484  			b = e = tre_tolower(c);
   485  			for (c++, e++; c<=max; c++, e++)
   486  				if (tre_tolower(c) != e) break;
   487  		} else {
   488  			c++;
   489  			continue;
   490  		}
   491  		lit = tre_new_lit(ls);
   492  		if (!lit)
   493  			return -1;
   494  		lit->code_min = b;
   495  		lit->code_max = e-1;
   496  		lit->position = -1;
   497  	}
   498  	return 0;
   499  }
   500  
   501  
/* Maximum number of character classes in a negated bracket expression. */
#define MAX_NEG_CLASSES 64

/* Character classes collected from a bracket expression that starts
   with '^'. */
struct neg {
	int negate;   /* Non-zero when the bracket expression is negated. */
	int len;      /* Number of classes stored in `a'. */
	tre_ctype_t a[MAX_NEG_CLASSES];
};
   510  
   511  // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
   512  
   513  /*
   514  bracket grammar:
   515  Bracket  =  '[' List ']'  |  '[^' List ']'
   516  List     =  Term  |  List Term
   517  Term     =  Char  |  Range  |  Chclass  |  Eqclass
   518  Range    =  Char '-' Char  |  Char '-' '-'
   519  Char     =  Coll  |  coll_single
   520  Meta     =  ']'  |  '-'
   521  Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
   522  Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
   523  Chclass  =  '[:' class ':]'
   524  
   525  coll_single is a single char collating element but it can be
   526   '-' only at the beginning or end of a List and
   527   ']' only at the beginning of a List and
   528   '^' anywhere except after the opening '['
   529  */
   530  
/* Parses the terms of a bracket expression.  `s' points at the first
   term (just past the opening '[' and any '^'); parsing runs up to and
   including the closing ']'.

   Every term is appended to `ls' as a literal: a code point range, or
   for a character class the full range 0..TRE_CHAR_MAX with `class'
   set.  The exception is a character class inside a negated bracket,
   which is recorded in `neg' instead.  With REG_ICASE the opposite-case
   ranges of non-class terms are added as well.

   On success sets ctx->s to the character after ']' and returns REG_OK.
   Otherwise returns REG_EBRACK (pattern ended before ']'), REG_BADPAT
   (undecodable multibyte character), REG_ERANGE, REG_ECOLLATE,
   REG_ECTYPE or REG_ESPACE. */
static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
{
	const char *start = s;
	tre_ctype_t class;
	int min, max;
	wchar_t wc;
	int len;

	for (;;) {
		class = 0;
		len = mbtowc(&wc, s, -1);
		/* len == 0 means the pattern's NUL terminator was reached. */
		if (len <= 0)
			return *s ? REG_BADPAT : REG_EBRACK;
		/* ']' closes the list unless it is the very first term. */
		if (*s == ']' && s != start) {
			ctx->s = s+1;
			return REG_OK;
		}
		/* A '-' may only start a term at the beginning of the list,
		   right before the closing ']', or as the start point of a
		   "--X" range. */
		if (*s == '-' && s != start && s[1] != ']' &&
		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
		    (s[1] != '-' || s[2] == ']'))
			return REG_ERANGE;
		if (*s == '[' && (s[1] == '.' || s[1] == '='))
			/* collating symbols and equivalence classes are not supported */
			return REG_ECOLLATE;
		if (*s == '[' && s[1] == ':') {
			/* Character class: copy the name up to the next ':'
			   and look it up. */
			char tmp[CHARCLASS_NAME_MAX+1];
			s += 2;
			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
				if (s[len] == ':') {
					memcpy(tmp, s, len);
					tmp[len] = 0;
					class = tre_ctype(tmp);
					break;
				}
			}
			/* Unknown or unterminated name, or ':' not followed
			   by ']'. */
			if (!class || s[len+1] != ']')
				return REG_ECTYPE;
			min = 0;
			max = TRE_CHAR_MAX;
			s += len+2;
		} else {
			/* Single character, possibly the start of a range. */
			min = max = wc;
			s += len;
			if (*s == '-' && s[1] != ']') {
				s++;
				len = mbtowc(&wc, s, -1);
				max = wc;
				/* XXX - Should use collation order instead of
				   encoding values in character ranges. */
				if (len <= 0 || min > max)
					return REG_ERANGE;
				s += len;
			}
		}

		if (class && neg->negate) {
			/* Classes of a negated bracket are kept apart from
			   the range literals. */
			if (neg->len >= MAX_NEG_CLASSES)
				return REG_ESPACE;
			neg->a[neg->len++] = class;
		} else  {
			tre_literal_t *lit = tre_new_lit(ls);
			if (!lit)
				return REG_ESPACE;
			lit->code_min = min;
			lit->code_max = max;
			lit->class = class;
			lit->position = -1;

			/* Add opposite-case codepoints if REG_ICASE is present.
			   It seems that POSIX requires that bracket negation
			   should happen before case-folding, but most practical
			   implementations do it the other way around. Changing
			   the order would need efficient representation of
			   case-fold ranges and bracket range sets even with
			   simple patterns so this is ok for now. */
			if (ctx->cflags & REG_ICASE && !class)
				if (add_icase_literals(ls, min, max))
					return REG_ESPACE;
		}
	}
}
   612  
/* Parses a bracket expression; `s' points just past the opening '['.

   The terms are collected with parse_bracket_terms() and turned into a
   union of LITERAL nodes that all share the current ctx->position.  For
   a negated bracket the collected ranges are sorted and replaced by the
   gaps between them, and the negated character classes are attached to
   every resulting literal.

   The result is stored in ctx->n (null on failure) and ctx->position is
   advanced by one, also on failure.  ctx->s is set by
   parse_bracket_terms() on success.  Returns REG_OK, REG_ESPACE, or the
   error from parse_bracket_terms(). */
static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
{
	int i, max, min, negmax, negmin;
	tre_ast_node_t *node = 0, *n;
	tre_ctype_t *nc = 0;
	tre_literal_t *lit;
	struct literals ls;
	struct neg neg;
	reg_errcode_t err;

	/* Temporary pointer array; the literals themselves are allocated
	   from ctx->mem. */
	ls.mem = ctx->mem;
	ls.len = 0;
	ls.cap = 32;
	ls.a = xmalloc(ls.cap * sizeof *ls.a);
	if (!ls.a)
		return REG_ESPACE;
	neg.len = 0;
	neg.negate = *s == '^';
	if (neg.negate)
		s++;

	err = parse_bracket_terms(ctx, s, &ls, &neg);
	if (err != REG_OK)
		goto parse_bracket_done;

	if (neg.negate) {
		/*
		 * With REG_NEWLINE, POSIX requires that newlines are not matched by
		 * any form of a non-matching list.
		 */
		if (ctx->cflags & REG_NEWLINE) {
			lit = tre_new_lit(&ls);
			if (!lit) {
				err = REG_ESPACE;
				goto parse_bracket_done;
			}
			lit->code_min = '\n';
			lit->code_max = '\n';
			lit->position = -1;
		}
		/* Sort the array if we need to negate it. */
		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
		/* extra lit for the last negated range */
		/* It is appended after sorting and starts above every valid
		   code point, so it stays the last entry. */
		lit = tre_new_lit(&ls);
		if (!lit) {
			err = REG_ESPACE;
			goto parse_bracket_done;
		}
		lit->code_min = TRE_CHAR_MAX+1;
		lit->code_max = TRE_CHAR_MAX+1;
		lit->position = -1;
		/* negated classes */
		if (neg.len) {
			/* Zero-terminated copy that lives as long as the AST. */
			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
			if (!nc) {
				err = REG_ESPACE;
				goto parse_bracket_done;
			}
			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
			nc[neg.len] = 0;
		}
	}

	/* Build a union of the items in the array, negated if necessary. */
	/* When negating, `negmin' is the lowest code point not covered by
	   the ranges seen so far.  Each literal is rewritten in place to
	   describe the gap in front of it. */
	negmax = negmin = 0;
	for (i = 0; i < ls.len; i++) {
		lit = ls.a[i];
		min = lit->code_min;
		max = lit->code_max;
		if (neg.negate) {
			if (min <= negmin) {
				/* Overlap. */
				/* No gap in front of this range; only extend
				   the covered part. */
				negmin = MAX(max + 1, negmin);
				continue;
			}
			negmax = min - 1;
			lit->code_min = negmin;
			lit->code_max = negmax;
			negmin = max + 1;
		}
		lit->position = ctx->position;
		lit->neg_classes = nc;
		/* A failed node allocation surfaces as a null `node' below. */
		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
		node = tre_ast_new_union(ctx->mem, node, n);
		if (!node) {
			err = REG_ESPACE;
			break;
		}
	}

parse_bracket_done:
	/* Reached on success and on every error after `ls.a' was allocated. */
	xfree(ls.a);
	ctx->position++;
	ctx->n = node;
	return err;
}
   709  
   710  static const char *parse_dup_count(const char *s, int *n)
   711  {
   712  	*n = -1;
   713  	if (!isdigit(*s))
   714  		return s;
   715  	*n = 0;
   716  	for (;;) {
   717  		*n = 10 * *n + (*s - '0');
   718  		s++;
   719  		if (!isdigit(*s) || *n > RE_DUP_MAX)
   720  			break;
   721  	}
   722  	return s;
   723  }
   724  
   725  static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
   726  {
   727  	int min, max;
   728  
   729  	s = parse_dup_count(s, &min);
   730  	if (*s == ',')
   731  		s = parse_dup_count(s+1, &max);
   732  	else
   733  		max = min;
   734  
   735  	if (
   736  		(max < min && max >= 0) ||
   737  		max > RE_DUP_MAX ||
   738  		min > RE_DUP_MAX ||
   739  		min < 0 ||
   740  		(!ere && *s++ != '\\') ||
   741  		*s++ != '}'
   742  	)
   743  		return 0;
   744  	*pmin = min;
   745  	*pmax = max;
   746  	return s;
   747  }
   748  
   749  static int hexval(unsigned c)
   750  {
   751  	if (c-'0'<10) return c-'0';
   752  	c |= 32;
   753  	if (c-'a'<6) return c-'a'+10;
   754  	return -1;
   755  }
   756  
   757  static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
   758  {
   759  	if (node->submatch_id >= 0) {
   760  		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
   761  		if (!n)
   762  			return REG_ESPACE;
   763  		n = tre_ast_new_catenation(ctx->mem, n, node);
   764  		if (!n)
   765  			return REG_ESPACE;
   766  		n->num_submatches = node->num_submatches;
   767  		node = n;
   768  	}
   769  	node->submatch_id = subid;
   770  	node->num_submatches++;
   771  	ctx->n = node;
   772  	return REG_OK;
   773  }
   774  
   775  /*
   776  BRE grammar:
   777  Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
   778  Branch =  Atom  |  Branch Atom
   779  Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
   780  Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
   781  
   782  (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
   783  
   784  ERE grammar:
   785  Regex  =  Branch  |  Regex '|' Branch
   786  Branch =  Atom  |  Branch Atom
   787  Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
   788  Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
   789  
   790  (a*+?, ^*, $+, \X, {, (|a) are unspecified)
   791  */
   792  
/* Parses one atom of the pattern starting at `s'.

   Handles bracket expressions, backslash escapes (the shorthands in
   tre_macros, the \b \B \< \> assertions, \xHH and \x{H...} code points,
   BRE back references \1..\9, and unknown escapes as literals), '.',
   the '^' and '$' anchors, and ordinary, possibly multibyte, characters.
   When `s' is at a repetition operator, at '|', or at the end of the
   pattern, an EMPTY literal is produced without consuming input, or
   REG_BADRPT is returned.

   On success the node is stored in ctx->n and the position after the
   atom in ctx->s.  Returns REG_OK, REG_EESCAPE, REG_EBRACE, REG_BADRPT,
   REG_BADPAT, REG_ESPACE, or the error from parse_bracket(). */
static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
{
	int len, ere = ctx->cflags & REG_EXTENDED;
	const char *p;
	tre_ast_node_t *node;
	wchar_t wc;
	switch (*s) {
	case '[':
		return parse_bracket(ctx, s+1);
	case '\\':
		p = tre_expand_macro(s+1);
		if (p) {
			/* assume \X expansion is a single atom */
			/* The expansion is parsed recursively; afterwards
			   ctx->s is pointed back into the real pattern. */
			reg_errcode_t err = parse_atom(ctx, p);
			ctx->s = s+2;
			return err;
		}
		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
		switch (*++s) {
		case 0:
			return REG_EESCAPE;
		case 'b':
			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
			break;
		case 'B':
			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
			break;
		case '<':
			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
			break;
		case '>':
			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
			break;
		case 'x':
			s++;
			int i, v = 0, c;
			/* Up to 2 hex digits for \xHH, up to 8 for the
			   braced form. */
			len = 2;
			if (*s == '{') {
				len = 8;
				s++;
			}
			for (i=0; i<len && v<0x110000; i++) {
				c = hexval(s[i]);
				if (c < 0) break;
				v = 16*v + c;
			}
			s += i;
			if (len == 8) {
				if (*s != '}')
					return REG_EBRACE;
				s++;
			}
			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
			/* Compensates for the s++ after this inner switch. */
			s--;
			break;
		case '{':
		case '+':
		case '?':
			/* extension: treat \+, \? as repetitions in BRE */
			/* reject repetitions after empty expression in BRE */
			if (!ere)
				return REG_BADRPT;
			/* In an ERE these fall through and end up as
			   escaped literals. */
		case '|':
			/* extension: treat \| as alternation in BRE */
			if (!ere) {
				/* Empty branch; step back so that `s' points
				   at the backslash again. */
				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
				s--;
				goto end;
			}
			/* fallthrough */
		default:
			if (!ere && (unsigned)*s-'1' < 9) {
				/* back reference */
				int val = *s - '0';
				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
				ctx->max_backref = MAX(val, ctx->max_backref);
			} else {
				/* extension: accept unknown escaped char
				   as a literal */
				goto parse_literal;
			}
		}
		/* Skip the character after the backslash. */
		s++;
		break;
	case '.':
		if (ctx->cflags & REG_NEWLINE) {
			/* Any character except newline: a union of the two
			   ranges around '\n', each with its own position. */
			tre_ast_node_t *tmp1, *tmp2;
			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
			if (tmp1 && tmp2)
				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
			else
				node = 0;
		} else {
			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
		}
		s++;
		break;
	case '^':
		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
		if (!ere && s != ctx->start)
			goto parse_literal;
		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
		s++;
		break;
	case '$':
		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
		/* In a BRE that means: at the end of the pattern, or
		   directly before "\)" or "\|". */
		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
			goto parse_literal;
		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
		s++;
		break;
	case '*':
	case '{':
	case '+':
	case '?':
		/* reject repetitions after empty expression in ERE */
		if (ere)
			return REG_BADRPT;
		/* In a BRE, fall through: these are ordinary characters here. */
	case '|':
		if (!ere)
			goto parse_literal;
		/* ERE '|': fall through to produce an empty branch. */
	case 0:
		/* Nothing is consumed; `s' stays where it is. */
		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
		break;
	default:
parse_literal:
		len = mbtowc(&wc, s, -1);
		if (len < 0)
			return REG_BADPAT;
		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
			tre_ast_node_t *tmp1, *tmp2;
			/* multiple opposite case characters are not supported */
			/* Both case variants share one position. */
			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
			if (tmp1 && tmp2)
				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
			else
				node = 0;
		} else {
			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
		}
		ctx->position++;
		s += len;
		break;
	}
end:
	/* Every allocation failure above ends up here as a null `node'. */
	if (!node)
		return REG_ESPACE;
	ctx->n = node;
	ctx->s = s;
	return REG_OK;
}
   946  
/* Pushes pointer `v' on stack `s'.  The status is stored in `err'; if
   it is not REG_OK the enclosing function returns it immediately. */
#define PUSHPTR(err, s, v) do { \
	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
		return err; \
} while(0)

/* Same as PUSHPTR, for an int value. */
#define PUSHINT(err, s, v) do { \
	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
		return err; \
} while(0)
   956  
   957  static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
   958  {
   959  	tre_ast_node_t *nbranch=0, *nunion=0;
   960  	int ere = ctx->cflags & REG_EXTENDED;
   961  	const char *s = ctx->start;
   962  	int subid = 0;
   963  	int depth = 0;
   964  	reg_errcode_t err;
   965  	tre_stack_t *stack = ctx->stack;
   966  
   967  	PUSHINT(err, stack, subid++);
   968  	for (;;) {
   969  		if ((!ere && *s == '\\' && s[1] == '(') ||
   970  		    (ere && *s == '(')) {
   971  			PUSHPTR(err, stack, nunion);
   972  			PUSHPTR(err, stack, nbranch);
   973  			PUSHINT(err, stack, subid++);
   974  			s++;
   975  			if (!ere)
   976  				s++;
   977  			depth++;
   978  			nbranch = nunion = 0;
   979  			ctx->start = s;
   980  			continue;
   981  		}
   982  		if ((!ere && *s == '\\' && s[1] == ')') ||
   983  		    (ere && *s == ')' && depth)) {
   984  			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
   985  			if (!ctx->n)
   986  				return REG_ESPACE;
   987  		} else {
   988  			err = parse_atom(ctx, s);
   989  			if (err != REG_OK)
   990  				return err;
   991  			s = ctx->s;
   992  		}
   993  
   994  	parse_iter:
   995  		for (;;) {
   996  			int min, max;
   997  
   998  			if (*s!='\\' && *s!='*') {
   999  				if (!ere)
  1000  					break;
  1001  				if (*s!='+' && *s!='?' && *s!='{')
  1002  					break;
  1003  			}
  1004  			if (*s=='\\' && ere)
  1005  				break;
  1006  			/* extension: treat \+, \? as repetitions in BRE */
  1007  			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
  1008  				break;
  1009  			if (*s=='\\')
  1010  				s++;
  1011  
  1012  			/* handle ^* at the start of a BRE. */
  1013  			if (!ere && s==ctx->start+1 && s[-1]=='^')
  1014  				break;
  1015  
  1016  			/* extension: multiple consecutive *+?{,} is unspecified,
  1017  			   but (a+)+ has to be supported so accepting a++ makes
  1018  			   sense, note however that the RE_DUP_MAX limit can be
  1019  			   circumvented: (a{255}){255} uses a lot of memory.. */
  1020  			if (*s=='{') {
  1021  				s = parse_dup(s+1, ere, &min, &max);
  1022  				if (!s)
  1023  					return REG_BADBR;
  1024  			} else {
  1025  				min=0;
  1026  				max=-1;
  1027  				if (*s == '+')
  1028  					min = 1;
  1029  				if (*s == '?')
  1030  					max = 1;
  1031  				s++;
  1032  			}
  1033  			if (max == 0)
  1034  				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
  1035  			else
  1036  				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
  1037  			if (!ctx->n)
  1038  				return REG_ESPACE;
  1039  		}
  1040  
  1041  		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
  1042  		if ((ere && *s == '|') ||
  1043  		    (ere && *s == ')' && depth) ||
  1044  		    (!ere && *s == '\\' && s[1] == ')') ||
  1045  		    /* extension: treat \| as alternation in BRE */
  1046  		    (!ere && *s == '\\' && s[1] == '|') ||
  1047  		    !*s) {
  1048  			/* extension: empty branch is unspecified (), (|a), (a|)
  1049  			   here they are not rejected but match on empty string */
  1050  			int c = *s;
  1051  			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
  1052  			nbranch = 0;
  1053  
  1054  			if (c == '\\' && s[1] == '|') {
  1055  				s+=2;
  1056  				ctx->start = s;
  1057  			} else if (c == '|') {
  1058  				s++;
  1059  				ctx->start = s;
  1060  			} else {
  1061  				if (c == '\\') {
  1062  					if (!depth) return REG_EPAREN;
  1063  					s+=2;
  1064  				} else if (c == ')')
  1065  					s++;
  1066  				depth--;
  1067  				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
  1068  				if (err != REG_OK)
  1069  					return err;
  1070  				if (!c && depth<0) {
  1071  					ctx->submatch_id = subid;
  1072  					return REG_OK;
  1073  				}
  1074  				if (!c || depth<0)
  1075  					return REG_EPAREN;
  1076  				nbranch = tre_stack_pop_voidptr(stack);
  1077  				nunion = tre_stack_pop_voidptr(stack);
  1078  				goto parse_iter;
  1079  			}
  1080  		}
  1081  	}
  1082  }
  1083  
  1084  
  1085  /***********************************************************************
  1086   from tre-compile.c
  1087  ***********************************************************************/
  1088  
  1089  
  1090  /*
  1091    TODO:
  1092     - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
  1093       function calls.
  1094  */
  1095  
  1096  /*
  1097    Algorithms to setup tags so that submatch addressing can be done.
  1098  */
  1099  
  1100  
  1101  /* Inserts a catenation node to the root of the tree given in `node'.
  1102     As the left child a new tag with number `tag_id' to `node' is added,
  1103     and the right child is the old root. */
  1104  static reg_errcode_t
  1105  tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
  1106  {
  1107    tre_catenation_t *c;
  1108  
  1109    c = tre_mem_alloc(mem, sizeof(*c));
  1110    if (c == NULL)
  1111      return REG_ESPACE;
  1112    c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
  1113    if (c->left == NULL)
  1114      return REG_ESPACE;
  1115    c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
  1116    if (c->right == NULL)
  1117      return REG_ESPACE;
  1118  
  1119    c->right->obj = node->obj;
  1120    c->right->type = node->type;
  1121    c->right->nullable = -1;
  1122    c->right->submatch_id = -1;
  1123    c->right->firstpos = NULL;
  1124    c->right->lastpos = NULL;
  1125    c->right->num_tags = 0;
  1126    c->right->num_submatches = 0;
  1127    node->obj = c;
  1128    node->type = CATENATION;
  1129    return REG_OK;
  1130  }
  1131  
  1132  /* Inserts a catenation node to the root of the tree given in `node'.
  1133     As the right child a new tag with number `tag_id' to `node' is added,
  1134     and the left child is the old root. */
  1135  static reg_errcode_t
  1136  tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
  1137  {
  1138    tre_catenation_t *c;
  1139  
  1140    c = tre_mem_alloc(mem, sizeof(*c));
  1141    if (c == NULL)
  1142      return REG_ESPACE;
  1143    c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
  1144    if (c->right == NULL)
  1145      return REG_ESPACE;
  1146    c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
  1147    if (c->left == NULL)
  1148      return REG_ESPACE;
  1149  
  1150    c->left->obj = node->obj;
  1151    c->left->type = node->type;
  1152    c->left->nullable = -1;
  1153    c->left->submatch_id = -1;
  1154    c->left->firstpos = NULL;
  1155    c->left->lastpos = NULL;
  1156    c->left->num_tags = 0;
  1157    c->left->num_submatches = 0;
  1158    node->obj = c;
  1159    node->type = CATENATION;
  1160    return REG_OK;
  1161  }
  1162  
  1163  typedef enum {
  1164    ADDTAGS_RECURSE,
  1165    ADDTAGS_AFTER_ITERATION,
  1166    ADDTAGS_AFTER_UNION_LEFT,
  1167    ADDTAGS_AFTER_UNION_RIGHT,
  1168    ADDTAGS_AFTER_CAT_LEFT,
  1169    ADDTAGS_AFTER_CAT_RIGHT,
  1170    ADDTAGS_SET_SUBMATCH_END
  1171  } tre_addtags_symbol_t;
  1172  
  1173  
  1174  typedef struct {
  1175    int tag;
  1176    int next_tag;
  1177  } tre_tag_states_t;
  1178  
  1179  
  1180  /* Go through `regset' and set submatch data for submatches that are
  1181     using this tag. */
  1182  static void
  1183  tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
  1184  {
  1185    int i;
  1186  
  1187    for (i = 0; regset[i] >= 0; i++)
  1188      {
  1189        int id = regset[i] / 2;
  1190        int start = !(regset[i] % 2);
  1191        if (start)
  1192  	tnfa->submatch_data[id].so_tag = tag;
  1193        else
  1194  	tnfa->submatch_data[id].eo_tag = tag;
  1195      }
  1196    regset[0] = -1;
  1197  }
  1198  
  1199  
  1200  /* Adds tags to appropriate locations in the parse tree in `tree', so that
  1201     subexpressions marked for submatch addressing can be traced. */
  1202  static reg_errcode_t
  1203  tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
  1204  	     tre_tnfa_t *tnfa)
  1205  {
  1206    reg_errcode_t status = REG_OK;
  1207    tre_addtags_symbol_t symbol;
  1208    tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
  1209    int bottom = tre_stack_num_objects(stack);
  1210    /* True for first pass (counting number of needed tags) */
  1211    int first_pass = (mem == NULL || tnfa == NULL);
  1212    int *regset, *orig_regset;
  1213    int num_tags = 0; /* Total number of tags. */
  1214    int num_minimals = 0;	 /* Number of special minimal tags. */
  1215    int tag = 0;	    /* The tag that is to be added next. */
  1216    int next_tag = 1; /* Next tag to use after this one. */
  1217    int *parents;	    /* Stack of submatches the current submatch is
  1218  		       contained in. */
  1219    int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
  1220    tre_tag_states_t *saved_states;
  1221  
  1222    tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
  1223    if (!first_pass)
  1224      {
  1225        tnfa->end_tag = 0;
  1226        tnfa->minimal_tags[0] = -1;
  1227      }
  1228  
  1229    regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
  1230    if (regset == NULL)
  1231      return REG_ESPACE;
  1232    regset[0] = -1;
  1233    orig_regset = regset;
  1234  
  1235    parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
  1236    if (parents == NULL)
  1237      {
  1238        xfree(regset);
  1239        return REG_ESPACE;
  1240      }
  1241    parents[0] = -1;
  1242  
  1243    saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
  1244    if (saved_states == NULL)
  1245      {
  1246        xfree(regset);
  1247        xfree(parents);
  1248        return REG_ESPACE;
  1249      }
  1250    else
  1251      {
  1252        unsigned int i;
  1253        for (i = 0; i <= tnfa->num_submatches; i++)
  1254  	saved_states[i].tag = -1;
  1255      }
  1256  
  1257    STACK_PUSH(stack, voidptr, node);
  1258    STACK_PUSH(stack, int, ADDTAGS_RECURSE);
  1259  
  1260    while (tre_stack_num_objects(stack) > bottom)
  1261      {
  1262        if (status != REG_OK)
  1263  	break;
  1264  
  1265        symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
  1266        switch (symbol)
  1267  	{
  1268  
  1269  	case ADDTAGS_SET_SUBMATCH_END:
  1270  	  {
  1271  	    int id = tre_stack_pop_int(stack);
  1272  	    int i;
  1273  
  1274  	    /* Add end of this submatch to regset. */
  1275  	    for (i = 0; regset[i] >= 0; i++);
  1276  	    regset[i] = id * 2 + 1;
  1277  	    regset[i + 1] = -1;
  1278  
  1279  	    /* Pop this submatch from the parents stack. */
  1280  	    for (i = 0; parents[i] >= 0; i++);
  1281  	    parents[i - 1] = -1;
  1282  	    break;
  1283  	  }
  1284  
  1285  	case ADDTAGS_RECURSE:
  1286  	  node = tre_stack_pop_voidptr(stack);
  1287  
  1288  	  if (node->submatch_id >= 0)
  1289  	    {
  1290  	      int id = node->submatch_id;
  1291  	      int i;
  1292  
  1293  
  1294  	      /* Add start of this submatch to regset. */
  1295  	      for (i = 0; regset[i] >= 0; i++);
  1296  	      regset[i] = id * 2;
  1297  	      regset[i + 1] = -1;
  1298  
  1299  	      if (!first_pass)
  1300  		{
  1301  		  for (i = 0; parents[i] >= 0; i++);
  1302  		  tnfa->submatch_data[id].parents = NULL;
  1303  		  if (i > 0)
  1304  		    {
  1305  		      int *p = xmalloc(sizeof(*p) * (i + 1));
  1306  		      if (p == NULL)
  1307  			{
  1308  			  status = REG_ESPACE;
  1309  			  break;
  1310  			}
  1311  		      assert(tnfa->submatch_data[id].parents == NULL);
  1312  		      tnfa->submatch_data[id].parents = p;
  1313  		      for (i = 0; parents[i] >= 0; i++)
  1314  			p[i] = parents[i];
  1315  		      p[i] = -1;
  1316  		    }
  1317  		}
  1318  
  1319  	      /* Add end of this submatch to regset after processing this
  1320  		 node. */
  1321  	      STACK_PUSHX(stack, int, node->submatch_id);
  1322  	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
  1323  	    }
  1324  
  1325  	  switch (node->type)
  1326  	    {
  1327  	    case LITERAL:
  1328  	      {
  1329  		tre_literal_t *lit = node->obj;
  1330  
  1331  		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
  1332  		  {
  1333  		    int i;
  1334  		    if (regset[0] >= 0)
  1335  		      {
  1336  			/* Regset is not empty, so add a tag before the
  1337  			   literal or backref. */
  1338  			if (!first_pass)
  1339  			  {
  1340  			    status = tre_add_tag_left(mem, node, tag);
  1341  			    tnfa->tag_directions[tag] = direction;
  1342  			    if (minimal_tag >= 0)
  1343  			      {
  1344  				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
  1345  				tnfa->minimal_tags[i] = tag;
  1346  				tnfa->minimal_tags[i + 1] = minimal_tag;
  1347  				tnfa->minimal_tags[i + 2] = -1;
  1348  				minimal_tag = -1;
  1349  				num_minimals++;
  1350  			      }
  1351  			    tre_purge_regset(regset, tnfa, tag);
  1352  			  }
  1353  			else
  1354  			  {
  1355  			    node->num_tags = 1;
  1356  			  }
  1357  
  1358  			regset[0] = -1;
  1359  			tag = next_tag;
  1360  			num_tags++;
  1361  			next_tag++;
  1362  		      }
  1363  		  }
  1364  		else
  1365  		  {
  1366  		    assert(!IS_TAG(lit));
  1367  		  }
  1368  		break;
  1369  	      }
  1370  	    case CATENATION:
  1371  	      {
  1372  		tre_catenation_t *cat = node->obj;
  1373  		tre_ast_node_t *left = cat->left;
  1374  		tre_ast_node_t *right = cat->right;
  1375  		int reserved_tag = -1;
  1376  
  1377  
  1378  		/* After processing right child. */
  1379  		STACK_PUSHX(stack, voidptr, node);
  1380  		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
  1381  
  1382  		/* Process right child. */
  1383  		STACK_PUSHX(stack, voidptr, right);
  1384  		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
  1385  
  1386  		/* After processing left child. */
  1387  		STACK_PUSHX(stack, int, next_tag + left->num_tags);
  1388  		if (left->num_tags > 0 && right->num_tags > 0)
  1389  		  {
  1390  		    /* Reserve the next tag to the right child. */
  1391  		    reserved_tag = next_tag;
  1392  		    next_tag++;
  1393  		  }
  1394  		STACK_PUSHX(stack, int, reserved_tag);
  1395  		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
  1396  
  1397  		/* Process left child. */
  1398  		STACK_PUSHX(stack, voidptr, left);
  1399  		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
  1400  
  1401  		}
  1402  	      break;
  1403  	    case ITERATION:
  1404  	      {
  1405  		tre_iteration_t *iter = node->obj;
  1406  
  1407  		if (first_pass)
  1408  		  {
  1409  		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
  1410  		  }
  1411  		else
  1412  		  {
  1413  		    STACK_PUSHX(stack, int, tag);
  1414  		    STACK_PUSHX(stack, int, iter->minimal);
  1415  		  }
  1416  		STACK_PUSHX(stack, voidptr, node);
  1417  		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
  1418  
  1419  		STACK_PUSHX(stack, voidptr, iter->arg);
  1420  		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
  1421  
  1422  		/* Regset is not empty, so add a tag here. */
  1423  		if (regset[0] >= 0 || iter->minimal)
  1424  		  {
  1425  		    if (!first_pass)
  1426  		      {
  1427  			int i;
  1428  			status = tre_add_tag_left(mem, node, tag);
  1429  			if (iter->minimal)
  1430  			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
  1431  			else
  1432  			  tnfa->tag_directions[tag] = direction;
  1433  			if (minimal_tag >= 0)
  1434  			  {
  1435  			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
  1436  			    tnfa->minimal_tags[i] = tag;
  1437  			    tnfa->minimal_tags[i + 1] = minimal_tag;
  1438  			    tnfa->minimal_tags[i + 2] = -1;
  1439  			    minimal_tag = -1;
  1440  			    num_minimals++;
  1441  			  }
  1442  			tre_purge_regset(regset, tnfa, tag);
  1443  		      }
  1444  
  1445  		    regset[0] = -1;
  1446  		    tag = next_tag;
  1447  		    num_tags++;
  1448  		    next_tag++;
  1449  		  }
  1450  		direction = TRE_TAG_MINIMIZE;
  1451  	      }
  1452  	      break;
  1453  	    case UNION:
  1454  	      {
  1455  		tre_union_t *uni = node->obj;
  1456  		tre_ast_node_t *left = uni->left;
  1457  		tre_ast_node_t *right = uni->right;
  1458  		int left_tag;
  1459  		int right_tag;
  1460  
  1461  		if (regset[0] >= 0)
  1462  		  {
  1463  		    left_tag = next_tag;
  1464  		    right_tag = next_tag + 1;
  1465  		  }
  1466  		else
  1467  		  {
  1468  		    left_tag = tag;
  1469  		    right_tag = next_tag;
  1470  		  }
  1471  
  1472  		/* After processing right child. */
  1473  		STACK_PUSHX(stack, int, right_tag);
  1474  		STACK_PUSHX(stack, int, left_tag);
  1475  		STACK_PUSHX(stack, voidptr, regset);
  1476  		STACK_PUSHX(stack, int, regset[0] >= 0);
  1477  		STACK_PUSHX(stack, voidptr, node);
  1478  		STACK_PUSHX(stack, voidptr, right);
  1479  		STACK_PUSHX(stack, voidptr, left);
  1480  		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
  1481  
  1482  		/* Process right child. */
  1483  		STACK_PUSHX(stack, voidptr, right);
  1484  		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
  1485  
  1486  		/* After processing left child. */
  1487  		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
  1488  
  1489  		/* Process left child. */
  1490  		STACK_PUSHX(stack, voidptr, left);
  1491  		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
  1492  
  1493  		/* Regset is not empty, so add a tag here. */
  1494  		if (regset[0] >= 0)
  1495  		  {
  1496  		    if (!first_pass)
  1497  		      {
  1498  			int i;
  1499  			status = tre_add_tag_left(mem, node, tag);
  1500  			tnfa->tag_directions[tag] = direction;
  1501  			if (minimal_tag >= 0)
  1502  			  {
  1503  			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
  1504  			    tnfa->minimal_tags[i] = tag;
  1505  			    tnfa->minimal_tags[i + 1] = minimal_tag;
  1506  			    tnfa->minimal_tags[i + 2] = -1;
  1507  			    minimal_tag = -1;
  1508  			    num_minimals++;
  1509  			  }
  1510  			tre_purge_regset(regset, tnfa, tag);
  1511  		      }
  1512  
  1513  		    regset[0] = -1;
  1514  		    tag = next_tag;
  1515  		    num_tags++;
  1516  		    next_tag++;
  1517  		  }
  1518  
  1519  		if (node->num_submatches > 0)
  1520  		  {
  1521  		    /* The next two tags are reserved for markers. */
  1522  		    next_tag++;
  1523  		    tag = next_tag;
  1524  		    next_tag++;
  1525  		  }
  1526  
  1527  		break;
  1528  	      }
  1529  	    }
  1530  
  1531  	  if (node->submatch_id >= 0)
  1532  	    {
  1533  	      int i;
  1534  	      /* Push this submatch on the parents stack. */
  1535  	      for (i = 0; parents[i] >= 0; i++);
  1536  	      parents[i] = node->submatch_id;
  1537  	      parents[i + 1] = -1;
  1538  	    }
  1539  
  1540  	  break; /* end case: ADDTAGS_RECURSE */
  1541  
  1542  	case ADDTAGS_AFTER_ITERATION:
  1543  	  {
  1544  	    int minimal = 0;
  1545  	    int enter_tag;
  1546  	    node = tre_stack_pop_voidptr(stack);
  1547  	    if (first_pass)
  1548  	      {
  1549  		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
  1550  		  + tre_stack_pop_int(stack);
  1551  		minimal_tag = -1;
  1552  	      }
  1553  	    else
  1554  	      {
  1555  		minimal = tre_stack_pop_int(stack);
  1556  		enter_tag = tre_stack_pop_int(stack);
  1557  		if (minimal)
  1558  		  minimal_tag = enter_tag;
  1559  	      }
  1560  
  1561  	    if (!first_pass)
  1562  	      {
  1563  		if (minimal)
  1564  		  direction = TRE_TAG_MINIMIZE;
  1565  		else
  1566  		  direction = TRE_TAG_MAXIMIZE;
  1567  	      }
  1568  	    break;
  1569  	  }
  1570  
  1571  	case ADDTAGS_AFTER_CAT_LEFT:
  1572  	  {
  1573  	    int new_tag = tre_stack_pop_int(stack);
  1574  	    next_tag = tre_stack_pop_int(stack);
  1575  	    if (new_tag >= 0)
  1576  	      {
  1577  		tag = new_tag;
  1578  	      }
  1579  	    break;
  1580  	  }
  1581  
  1582  	case ADDTAGS_AFTER_CAT_RIGHT:
  1583  	  node = tre_stack_pop_voidptr(stack);
  1584  	  if (first_pass)
  1585  	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
  1586  	      + ((tre_catenation_t *)node->obj)->right->num_tags;
  1587  	  break;
  1588  
  1589  	case ADDTAGS_AFTER_UNION_LEFT:
  1590  	  /* Lift the bottom of the `regset' array so that when processing
  1591  	     the right operand the items currently in the array are
  1592  	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
  1593  	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
  1594  	  while (*regset >= 0)
  1595  	    regset++;
  1596  	  break;
  1597  
  1598  	case ADDTAGS_AFTER_UNION_RIGHT:
  1599  	  {
  1600  	    int added_tags, tag_left, tag_right;
  1601  	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
  1602  	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
  1603  	    node = tre_stack_pop_voidptr(stack);
  1604  	    added_tags = tre_stack_pop_int(stack);
  1605  	    if (first_pass)
  1606  	      {
  1607  		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
  1608  		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
  1609  		  + ((node->num_submatches > 0) ? 2 : 0);
  1610  	      }
  1611  	    regset = tre_stack_pop_voidptr(stack);
  1612  	    tag_left = tre_stack_pop_int(stack);
  1613  	    tag_right = tre_stack_pop_int(stack);
  1614  
  1615  	    /* Add tags after both children, the left child gets a smaller
  1616  	       tag than the right child.  This guarantees that we prefer
  1617  	       the left child over the right child. */
  1618  	    /* XXX - This is not always necessary (if the children have
  1619  	       tags which must be seen for every match of that child). */
  1620  	    /* XXX - Check if this is the only place where tre_add_tag_right
  1621  	       is used.	 If so, use tre_add_tag_left (putting the tag before
  1622  	       the child as opposed after the child) and throw away
  1623  	       tre_add_tag_right. */
  1624  	    if (node->num_submatches > 0)
  1625  	      {
  1626  		if (!first_pass)
  1627  		  {
  1628  		    status = tre_add_tag_right(mem, left, tag_left);
  1629  		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
  1630  		    if (status == REG_OK)
  1631  		      status = tre_add_tag_right(mem, right, tag_right);
  1632  		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
  1633  		  }
  1634  		num_tags += 2;
  1635  	      }
  1636  	    direction = TRE_TAG_MAXIMIZE;
  1637  	    break;
  1638  	  }
  1639  
  1640  	default:
  1641  	  assert(0);
  1642  	  break;
  1643  
  1644  	} /* end switch(symbol) */
  1645      } /* end while(tre_stack_num_objects(stack) > bottom) */
  1646  
  1647    if (!first_pass)
  1648      tre_purge_regset(regset, tnfa, tag);
  1649  
  1650    if (!first_pass && minimal_tag >= 0)
  1651      {
  1652        int i;
  1653        for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
  1654        tnfa->minimal_tags[i] = tag;
  1655        tnfa->minimal_tags[i + 1] = minimal_tag;
  1656        tnfa->minimal_tags[i + 2] = -1;
  1657        minimal_tag = -1;
  1658        num_minimals++;
  1659      }
  1660  
  1661    assert(tree->num_tags == num_tags);
  1662    tnfa->end_tag = num_tags;
  1663    tnfa->num_tags = num_tags;
  1664    tnfa->num_minimals = num_minimals;
  1665    xfree(orig_regset);
  1666    xfree(parents);
  1667    xfree(saved_states);
  1668    return status;
  1669  }
  1670  
  1671  
  1672  
  1673  /*
  1674    AST to TNFA compilation routines.
  1675  */
  1676  
  1677  typedef enum {
  1678    COPY_RECURSE,
  1679    COPY_SET_RESULT_PTR
  1680  } tre_copyast_symbol_t;
  1681  
  1682  /* Flags for tre_copy_ast(). */
  1683  #define COPY_REMOVE_TAGS	 1
  1684  #define COPY_MAXIMIZE_FIRST_TAG	 2
  1685  
  1686  static reg_errcode_t
  1687  tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
  1688  	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
  1689  	     tre_ast_node_t **copy, int *max_pos)
  1690  {
  1691    reg_errcode_t status = REG_OK;
  1692    int bottom = tre_stack_num_objects(stack);
  1693    int num_copied = 0;
  1694    int first_tag = 1;
  1695    tre_ast_node_t **result = copy;
  1696    tre_copyast_symbol_t symbol;
  1697  
  1698    STACK_PUSH(stack, voidptr, ast);
  1699    STACK_PUSH(stack, int, COPY_RECURSE);
  1700  
  1701    while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
  1702      {
  1703        tre_ast_node_t *node;
  1704        if (status != REG_OK)
  1705  	break;
  1706  
  1707        symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
  1708        switch (symbol)
  1709  	{
  1710  	case COPY_SET_RESULT_PTR:
  1711  	  result = tre_stack_pop_voidptr(stack);
  1712  	  break;
  1713  	case COPY_RECURSE:
  1714  	  node = tre_stack_pop_voidptr(stack);
  1715  	  switch (node->type)
  1716  	    {
  1717  	    case LITERAL:
  1718  	      {
  1719  		tre_literal_t *lit = node->obj;
  1720  		int pos = lit->position;
  1721  		int min = lit->code_min;
  1722  		int max = lit->code_max;
  1723  		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
  1724  		  {
  1725  		    /* XXX - e.g. [ab] has only one position but two
  1726  		       nodes, so we are creating holes in the state space
  1727  		       here.  Not fatal, just wastes memory. */
  1728  		    pos += *pos_add;
  1729  		    num_copied++;
  1730  		  }
  1731  		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
  1732  		  {
  1733  		    /* Change this tag to empty. */
  1734  		    min = EMPTY;
  1735  		    max = pos = -1;
  1736  		  }
  1737  		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
  1738  			 && first_tag)
  1739  		  {
  1740  		    /* Maximize the first tag. */
  1741  		    tag_directions[max] = TRE_TAG_MAXIMIZE;
  1742  		    first_tag = 0;
  1743  		  }
  1744  		*result = tre_ast_new_literal(mem, min, max, pos);
  1745  		if (*result == NULL)
  1746  		  status = REG_ESPACE;
  1747  		else {
  1748  		  tre_literal_t *p = (*result)->obj;
  1749  		  p->class = lit->class;
  1750  		  p->neg_classes = lit->neg_classes;
  1751  		}
  1752  
  1753  		if (pos > *max_pos)
  1754  		  *max_pos = pos;
  1755  		break;
  1756  	      }
  1757  	    case UNION:
  1758  	      {
  1759  		tre_union_t *uni = node->obj;
  1760  		tre_union_t *tmp;
  1761  		*result = tre_ast_new_union(mem, uni->left, uni->right);
  1762  		if (*result == NULL)
  1763  		  {
  1764  		    status = REG_ESPACE;
  1765  		    break;
  1766  		  }
  1767  		tmp = (*result)->obj;
  1768  		result = &tmp->left;
  1769  		STACK_PUSHX(stack, voidptr, uni->right);
  1770  		STACK_PUSHX(stack, int, COPY_RECURSE);
  1771  		STACK_PUSHX(stack, voidptr, &tmp->right);
  1772  		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
  1773  		STACK_PUSHX(stack, voidptr, uni->left);
  1774  		STACK_PUSHX(stack, int, COPY_RECURSE);
  1775  		break;
  1776  	      }
  1777  	    case CATENATION:
  1778  	      {
  1779  		tre_catenation_t *cat = node->obj;
  1780  		tre_catenation_t *tmp;
  1781  		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
  1782  		if (*result == NULL)
  1783  		  {
  1784  		    status = REG_ESPACE;
  1785  		    break;
  1786  		  }
  1787  		tmp = (*result)->obj;
  1788  		tmp->left = NULL;
  1789  		tmp->right = NULL;
  1790  		result = &tmp->left;
  1791  
  1792  		STACK_PUSHX(stack, voidptr, cat->right);
  1793  		STACK_PUSHX(stack, int, COPY_RECURSE);
  1794  		STACK_PUSHX(stack, voidptr, &tmp->right);
  1795  		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
  1796  		STACK_PUSHX(stack, voidptr, cat->left);
  1797  		STACK_PUSHX(stack, int, COPY_RECURSE);
  1798  		break;
  1799  	      }
  1800  	    case ITERATION:
  1801  	      {
  1802  		tre_iteration_t *iter = node->obj;
  1803  		STACK_PUSHX(stack, voidptr, iter->arg);
  1804  		STACK_PUSHX(stack, int, COPY_RECURSE);
  1805  		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
  1806  					   iter->max, iter->minimal);
  1807  		if (*result == NULL)
  1808  		  {
  1809  		    status = REG_ESPACE;
  1810  		    break;
  1811  		  }
  1812  		iter = (*result)->obj;
  1813  		result = &iter->arg;
  1814  		break;
  1815  	      }
  1816  	    default:
  1817  	      assert(0);
  1818  	      break;
  1819  	    }
  1820  	  break;
  1821  	}
  1822      }
  1823    *pos_add += num_copied;
  1824    return status;
  1825  }
  1826  
  1827  typedef enum {
  1828    EXPAND_RECURSE,
  1829    EXPAND_AFTER_ITER
  1830  } tre_expand_ast_symbol_t;
  1831  
  1832  /* Expands each iteration node that has a finite nonzero minimum or maximum
  1833     iteration count to a catenated sequence of copies of the node. */
  1834  static reg_errcode_t
  1835  tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
  1836  	       int *position, tre_tag_direction_t *tag_directions)
  1837  {
  1838    reg_errcode_t status = REG_OK;
  1839    int bottom = tre_stack_num_objects(stack);
  1840    int pos_add = 0;
  1841    int pos_add_total = 0;
  1842    int max_pos = 0;
  1843    int iter_depth = 0;
  1844  
  1845    STACK_PUSHR(stack, voidptr, ast);
  1846    STACK_PUSHR(stack, int, EXPAND_RECURSE);
  1847    while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
  1848      {
  1849        tre_ast_node_t *node;
  1850        tre_expand_ast_symbol_t symbol;
  1851  
  1852        if (status != REG_OK)
  1853  	break;
  1854  
  1855        symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
  1856        node = tre_stack_pop_voidptr(stack);
  1857        switch (symbol)
  1858  	{
  1859  	case EXPAND_RECURSE:
  1860  	  switch (node->type)
  1861  	    {
  1862  	    case LITERAL:
  1863  	      {
  1864  		tre_literal_t *lit= node->obj;
  1865  		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
  1866  		  {
  1867  		    lit->position += pos_add;
  1868  		    if (lit->position > max_pos)
  1869  		      max_pos = lit->position;
  1870  		  }
  1871  		break;
  1872  	      }
  1873  	    case UNION:
  1874  	      {
  1875  		tre_union_t *uni = node->obj;
  1876  		STACK_PUSHX(stack, voidptr, uni->right);
  1877  		STACK_PUSHX(stack, int, EXPAND_RECURSE);
  1878  		STACK_PUSHX(stack, voidptr, uni->left);
  1879  		STACK_PUSHX(stack, int, EXPAND_RECURSE);
  1880  		break;
  1881  	      }
  1882  	    case CATENATION:
  1883  	      {
  1884  		tre_catenation_t *cat = node->obj;
  1885  		STACK_PUSHX(stack, voidptr, cat->right);
  1886  		STACK_PUSHX(stack, int, EXPAND_RECURSE);
  1887  		STACK_PUSHX(stack, voidptr, cat->left);
  1888  		STACK_PUSHX(stack, int, EXPAND_RECURSE);
  1889  		break;
  1890  	      }
  1891  	    case ITERATION:
  1892  	      {
  1893  		tre_iteration_t *iter = node->obj;
  1894  		STACK_PUSHX(stack, int, pos_add);
  1895  		STACK_PUSHX(stack, voidptr, node);
  1896  		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
  1897  		STACK_PUSHX(stack, voidptr, iter->arg);
  1898  		STACK_PUSHX(stack, int, EXPAND_RECURSE);
  1899  		/* If we are going to expand this node at EXPAND_AFTER_ITER
  1900  		   then don't increase the `pos' fields of the nodes now, it
  1901  		   will get done when expanding. */
  1902  		if (iter->min > 1 || iter->max > 1)
  1903  		  pos_add = 0;
  1904  		iter_depth++;
  1905  		break;
  1906  	      }
  1907  	    default:
  1908  	      assert(0);
  1909  	      break;
  1910  	    }
  1911  	  break;
  1912  	case EXPAND_AFTER_ITER:
  1913  	  {
  1914  	    tre_iteration_t *iter = node->obj;
  1915  	    int pos_add_last;
  1916  	    pos_add = tre_stack_pop_int(stack);
  1917  	    pos_add_last = pos_add;
  1918  	    if (iter->min > 1 || iter->max > 1)
  1919  	      {
  1920  		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
  1921  		int j;
  1922  		int pos_add_save = pos_add;
  1923  
  1924  		/* Create a catenated sequence of copies of the node. */
  1925  		for (j = 0; j < iter->min; j++)
  1926  		  {
  1927  		    tre_ast_node_t *copy;
  1928  		    /* Remove tags from all but the last copy. */
  1929  		    int flags = ((j + 1 < iter->min)
  1930  				 ? COPY_REMOVE_TAGS
  1931  				 : COPY_MAXIMIZE_FIRST_TAG);
  1932  		    pos_add_save = pos_add;
  1933  		    status = tre_copy_ast(mem, stack, iter->arg, flags,
  1934  					  &pos_add, tag_directions, &copy,
  1935  					  &max_pos);
  1936  		    if (status != REG_OK)
  1937  		      return status;
  1938  		    if (seq1 != NULL)
  1939  		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
  1940  		    else
  1941  		      seq1 = copy;
  1942  		    if (seq1 == NULL)
  1943  		      return REG_ESPACE;
  1944  		  }
  1945  
  1946  		if (iter->max == -1)
  1947  		  {
  1948  		    /* No upper limit. */
  1949  		    pos_add_save = pos_add;
  1950  		    status = tre_copy_ast(mem, stack, iter->arg, 0,
  1951  					  &pos_add, NULL, &seq2, &max_pos);
  1952  		    if (status != REG_OK)
  1953  		      return status;
  1954  		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
  1955  		    if (seq2 == NULL)
  1956  		      return REG_ESPACE;
  1957  		  }
  1958  		else
  1959  		  {
  1960  		    for (j = iter->min; j < iter->max; j++)
  1961  		      {
  1962  			tre_ast_node_t *tmp, *copy;
  1963  			pos_add_save = pos_add;
  1964  			status = tre_copy_ast(mem, stack, iter->arg, 0,
  1965  					      &pos_add, NULL, &copy, &max_pos);
  1966  			if (status != REG_OK)
  1967  			  return status;
  1968  			if (seq2 != NULL)
  1969  			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
  1970  			else
  1971  			  seq2 = copy;
  1972  			if (seq2 == NULL)
  1973  			  return REG_ESPACE;
  1974  			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
  1975  			if (tmp == NULL)
  1976  			  return REG_ESPACE;
  1977  			seq2 = tre_ast_new_union(mem, tmp, seq2);
  1978  			if (seq2 == NULL)
  1979  			  return REG_ESPACE;
  1980  		      }
  1981  		  }
  1982  
  1983  		pos_add = pos_add_save;
  1984  		if (seq1 == NULL)
  1985  		  seq1 = seq2;
  1986  		else if (seq2 != NULL)
  1987  		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
  1988  		if (seq1 == NULL)
  1989  		  return REG_ESPACE;
  1990  		node->obj = seq1->obj;
  1991  		node->type = seq1->type;
  1992  	      }
  1993  
  1994  	    iter_depth--;
  1995  	    pos_add_total += pos_add - pos_add_last;
  1996  	    if (iter_depth == 0)
  1997  	      pos_add = pos_add_total;
  1998  
  1999  	    break;
  2000  	  }
  2001  	default:
  2002  	  assert(0);
  2003  	  break;
  2004  	}
  2005      }
  2006  
  2007    *position += pos_add_total;
  2008  
  2009    /* `max_pos' should never be larger than `*position' if the above
  2010       code works, but just an extra safeguard let's make sure
  2011       `*position' is set large enough so enough memory will be
  2012       allocated for the transition table. */
  2013    if (max_pos > *position)
  2014      *position = max_pos;
  2015  
  2016    return status;
  2017  }
  2018  
  2019  static tre_pos_and_tags_t *
  2020  tre_set_empty(tre_mem_t mem)
  2021  {
  2022    tre_pos_and_tags_t *new_set;
  2023  
  2024    new_set = tre_mem_calloc(mem, sizeof(*new_set));
  2025    if (new_set == NULL)
  2026      return NULL;
  2027  
  2028    new_set[0].position = -1;
  2029    new_set[0].code_min = -1;
  2030    new_set[0].code_max = -1;
  2031  
  2032    return new_set;
  2033  }
  2034  
  2035  static tre_pos_and_tags_t *
  2036  tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
  2037  	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
  2038  {
  2039    tre_pos_and_tags_t *new_set;
  2040  
  2041    new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
  2042    if (new_set == NULL)
  2043      return NULL;
  2044  
  2045    new_set[0].position = position;
  2046    new_set[0].code_min = code_min;
  2047    new_set[0].code_max = code_max;
  2048    new_set[0].class = class;
  2049    new_set[0].neg_classes = neg_classes;
  2050    new_set[0].backref = backref;
  2051    new_set[1].position = -1;
  2052    new_set[1].code_min = -1;
  2053    new_set[1].code_max = -1;
  2054  
  2055    return new_set;
  2056  }
  2057  
  2058  static tre_pos_and_tags_t *
  2059  tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
  2060  	      int *tags, int assertions)
  2061  {
  2062    int s1, s2, i, j;
  2063    tre_pos_and_tags_t *new_set;
  2064    int *new_tags;
  2065    int num_tags;
  2066  
  2067    for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
  2068    for (s1 = 0; set1[s1].position >= 0; s1++);
  2069    for (s2 = 0; set2[s2].position >= 0; s2++);
  2070    new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
  2071    if (!new_set )
  2072      return NULL;
  2073  
  2074    for (s1 = 0; set1[s1].position >= 0; s1++)
  2075      {
  2076        new_set[s1].position = set1[s1].position;
  2077        new_set[s1].code_min = set1[s1].code_min;
  2078        new_set[s1].code_max = set1[s1].code_max;
  2079        new_set[s1].assertions = set1[s1].assertions | assertions;
  2080        new_set[s1].class = set1[s1].class;
  2081        new_set[s1].neg_classes = set1[s1].neg_classes;
  2082        new_set[s1].backref = set1[s1].backref;
  2083        if (set1[s1].tags == NULL && tags == NULL)
  2084  	new_set[s1].tags = NULL;
  2085        else
  2086  	{
  2087  	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
  2088  	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
  2089  					 * (i + num_tags + 1)));
  2090  	  if (new_tags == NULL)
  2091  	    return NULL;
  2092  	  for (j = 0; j < i; j++)
  2093  	    new_tags[j] = set1[s1].tags[j];
  2094  	  for (i = 0; i < num_tags; i++)
  2095  	    new_tags[j + i] = tags[i];
  2096  	  new_tags[j + i] = -1;
  2097  	  new_set[s1].tags = new_tags;
  2098  	}
  2099      }
  2100  
  2101    for (s2 = 0; set2[s2].position >= 0; s2++)
  2102      {
  2103        new_set[s1 + s2].position = set2[s2].position;
  2104        new_set[s1 + s2].code_min = set2[s2].code_min;
  2105        new_set[s1 + s2].code_max = set2[s2].code_max;
  2106        /* XXX - why not | assertions here as well? */
  2107        new_set[s1 + s2].assertions = set2[s2].assertions;
  2108        new_set[s1 + s2].class = set2[s2].class;
  2109        new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
  2110        new_set[s1 + s2].backref = set2[s2].backref;
  2111        if (set2[s2].tags == NULL)
  2112  	new_set[s1 + s2].tags = NULL;
  2113        else
  2114  	{
  2115  	  for (i = 0; set2[s2].tags[i] >= 0; i++);
  2116  	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
  2117  	  if (new_tags == NULL)
  2118  	    return NULL;
  2119  	  for (j = 0; j < i; j++)
  2120  	    new_tags[j] = set2[s2].tags[j];
  2121  	  new_tags[j] = -1;
  2122  	  new_set[s1 + s2].tags = new_tags;
  2123  	}
  2124      }
  2125    new_set[s1 + s2].position = -1;
  2126    return new_set;
  2127  }
  2128  
  2129  /* Finds the empty path through `node' which is the one that should be
  2130     taken according to POSIX.2 rules, and adds the tags on that path to
  2131     `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
  2132     set to the number of tags seen on the path. */
  2133  static reg_errcode_t
  2134  tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
  2135  		int *assertions, int *num_tags_seen)
  2136  {
  2137    tre_literal_t *lit;
  2138    tre_union_t *uni;
  2139    tre_catenation_t *cat;
  2140    tre_iteration_t *iter;
  2141    int i;
  2142    int bottom = tre_stack_num_objects(stack);
  2143    reg_errcode_t status = REG_OK;
  2144    if (num_tags_seen)
  2145      *num_tags_seen = 0;
  2146  
  2147    status = tre_stack_push_voidptr(stack, node);
  2148  
  2149    /* Walk through the tree recursively. */
  2150    while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
  2151      {
  2152        node = tre_stack_pop_voidptr(stack);
  2153  
  2154        switch (node->type)
  2155  	{
  2156  	case LITERAL:
  2157  	  lit = (tre_literal_t *)node->obj;
  2158  	  switch (lit->code_min)
  2159  	    {
  2160  	    case TAG:
  2161  	      if (lit->code_max >= 0)
  2162  		{
  2163  		  if (tags != NULL)
  2164  		    {
  2165  		      /* Add the tag to `tags'. */
  2166  		      for (i = 0; tags[i] >= 0; i++)
  2167  			if (tags[i] == lit->code_max)
  2168  			  break;
  2169  		      if (tags[i] < 0)
  2170  			{
  2171  			  tags[i] = lit->code_max;
  2172  			  tags[i + 1] = -1;
  2173  			}
  2174  		    }
  2175  		  if (num_tags_seen)
  2176  		    (*num_tags_seen)++;
  2177  		}
  2178  	      break;
  2179  	    case ASSERTION:
  2180  	      assert(lit->code_max >= 1
  2181  		     || lit->code_max <= ASSERT_LAST);
  2182  	      if (assertions != NULL)
  2183  		*assertions |= lit->code_max;
  2184  	      break;
  2185  	    case EMPTY:
  2186  	      break;
  2187  	    default:
  2188  	      assert(0);
  2189  	      break;
  2190  	    }
  2191  	  break;
  2192  
  2193  	case UNION:
  2194  	  /* Subexpressions starting earlier take priority over ones
  2195  	     starting later, so we prefer the left subexpression over the
  2196  	     right subexpression. */
  2197  	  uni = (tre_union_t *)node->obj;
  2198  	  if (uni->left->nullable)
  2199  	    STACK_PUSHX(stack, voidptr, uni->left)
  2200  	  else if (uni->right->nullable)
  2201  	    STACK_PUSHX(stack, voidptr, uni->right)
  2202  	  else
  2203  	    assert(0);
  2204  	  break;
  2205  
  2206  	case CATENATION:
  2207  	  /* The path must go through both children. */
  2208  	  cat = (tre_catenation_t *)node->obj;
  2209  	  assert(cat->left->nullable);
  2210  	  assert(cat->right->nullable);
  2211  	  STACK_PUSHX(stack, voidptr, cat->left);
  2212  	  STACK_PUSHX(stack, voidptr, cat->right);
  2213  	  break;
  2214  
  2215  	case ITERATION:
  2216  	  /* A match with an empty string is preferred over no match at
  2217  	     all, so we go through the argument if possible. */
  2218  	  iter = (tre_iteration_t *)node->obj;
  2219  	  if (iter->arg->nullable)
  2220  	    STACK_PUSHX(stack, voidptr, iter->arg);
  2221  	  break;
  2222  
  2223  	default:
  2224  	  assert(0);
  2225  	  break;
  2226  	}
  2227      }
  2228  
  2229    return status;
  2230  }
  2231  
  2232  
  /* Work-item kinds kept on the explicit stack of tre_compute_nfl().
     Each item is a node pointer plus one of these symbols. */
  typedef enum {
    NFL_RECURSE = 0,          /* Node not visited yet: descend into it. */
    NFL_POST_UNION = 1,       /* Both union children done: finish the union. */
    NFL_POST_CATENATION = 2,  /* Both catenation children done. */
    NFL_POST_ITERATION = 3    /* Iteration argument done. */
  } tre_nfl_stack_symbol_t;
  2239  
  2240  
  2241  /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
  2242     the nodes of the AST `tree'. */
  2243  static reg_errcode_t
  2244  tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
  2245  {
  2246    int bottom = tre_stack_num_objects(stack);
  2247  
  2248    STACK_PUSHR(stack, voidptr, tree);
  2249    STACK_PUSHR(stack, int, NFL_RECURSE);
  2250  
  2251    while (tre_stack_num_objects(stack) > bottom)
  2252      {
  2253        tre_nfl_stack_symbol_t symbol;
  2254        tre_ast_node_t *node;
  2255  
  2256        symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
  2257        node = tre_stack_pop_voidptr(stack);
  2258        switch (symbol)
  2259  	{
  2260  	case NFL_RECURSE:
  2261  	  switch (node->type)
  2262  	    {
  2263  	    case LITERAL:
  2264  	      {
  2265  		tre_literal_t *lit = (tre_literal_t *)node->obj;
  2266  		if (IS_BACKREF(lit))
  2267  		  {
  2268  		    /* Back references: nullable = false, firstpos = {i},
  2269  		       lastpos = {i}. */
  2270  		    node->nullable = 0;
  2271  		    node->firstpos = tre_set_one(mem, lit->position, 0,
  2272  					     TRE_CHAR_MAX, 0, NULL, -1);
  2273  		    if (!node->firstpos)
  2274  		      return REG_ESPACE;
  2275  		    node->lastpos = tre_set_one(mem, lit->position, 0,
  2276  						TRE_CHAR_MAX, 0, NULL,
  2277  						(int)lit->code_max);
  2278  		    if (!node->lastpos)
  2279  		      return REG_ESPACE;
  2280  		  }
  2281  		else if (lit->code_min < 0)
  2282  		  {
  2283  		    /* Tags, empty strings, params, and zero width assertions:
  2284  		       nullable = true, firstpos = {}, and lastpos = {}. */
  2285  		    node->nullable = 1;
  2286  		    node->firstpos = tre_set_empty(mem);
  2287  		    if (!node->firstpos)
  2288  		      return REG_ESPACE;
  2289  		    node->lastpos = tre_set_empty(mem);
  2290  		    if (!node->lastpos)
  2291  		      return REG_ESPACE;
  2292  		  }
  2293  		else
  2294  		  {
  2295  		    /* Literal at position i: nullable = false, firstpos = {i},
  2296  		       lastpos = {i}. */
  2297  		    node->nullable = 0;
  2298  		    node->firstpos =
  2299  		      tre_set_one(mem, lit->position, (int)lit->code_min,
  2300  				  (int)lit->code_max, 0, NULL, -1);
  2301  		    if (!node->firstpos)
  2302  		      return REG_ESPACE;
  2303  		    node->lastpos = tre_set_one(mem, lit->position,
  2304  						(int)lit->code_min,
  2305  						(int)lit->code_max,
  2306  						lit->class, lit->neg_classes,
  2307  						-1);
  2308  		    if (!node->lastpos)
  2309  		      return REG_ESPACE;
  2310  		  }
  2311  		break;
  2312  	      }
  2313  
  2314  	    case UNION:
  2315  	      /* Compute the attributes for the two subtrees, and after that
  2316  		 for this node. */
  2317  	      STACK_PUSHR(stack, voidptr, node);
  2318  	      STACK_PUSHR(stack, int, NFL_POST_UNION);
  2319  	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
  2320  	      STACK_PUSHR(stack, int, NFL_RECURSE);
  2321  	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
  2322  	      STACK_PUSHR(stack, int, NFL_RECURSE);
  2323  	      break;
  2324  
  2325  	    case CATENATION:
  2326  	      /* Compute the attributes for the two subtrees, and after that
  2327  		 for this node. */
  2328  	      STACK_PUSHR(stack, voidptr, node);
  2329  	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
  2330  	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
  2331  	      STACK_PUSHR(stack, int, NFL_RECURSE);
  2332  	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
  2333  	      STACK_PUSHR(stack, int, NFL_RECURSE);
  2334  	      break;
  2335  
  2336  	    case ITERATION:
  2337  	      /* Compute the attributes for the subtree, and after that for
  2338  		 this node. */
  2339  	      STACK_PUSHR(stack, voidptr, node);
  2340  	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
  2341  	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
  2342  	      STACK_PUSHR(stack, int, NFL_RECURSE);
  2343  	      break;
  2344  	    }
  2345  	  break; /* end case: NFL_RECURSE */
  2346  
  2347  	case NFL_POST_UNION:
  2348  	  {
  2349  	    tre_union_t *uni = (tre_union_t *)node->obj;
  2350  	    node->nullable = uni->left->nullable || uni->right->nullable;
  2351  	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
  2352  					   uni->right->firstpos, NULL, 0);
  2353  	    if (!node->firstpos)
  2354  	      return REG_ESPACE;
  2355  	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
  2356  					  uni->right->lastpos, NULL, 0);
  2357  	    if (!node->lastpos)
  2358  	      return REG_ESPACE;
  2359  	    break;
  2360  	  }
  2361  
  2362  	case NFL_POST_ITERATION:
  2363  	  {
  2364  	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
  2365  
  2366  	    if (iter->min == 0 || iter->arg->nullable)
  2367  	      node->nullable = 1;
  2368  	    else
  2369  	      node->nullable = 0;
  2370  	    node->firstpos = iter->arg->firstpos;
  2371  	    node->lastpos = iter->arg->lastpos;
  2372  	    break;
  2373  	  }
  2374  
  2375  	case NFL_POST_CATENATION:
  2376  	  {
  2377  	    int num_tags, *tags, assertions;
  2378  	    reg_errcode_t status;
  2379  	    tre_catenation_t *cat = node->obj;
  2380  	    node->nullable = cat->left->nullable && cat->right->nullable;
  2381  
  2382  	    /* Compute firstpos. */
  2383  	    if (cat->left->nullable)
  2384  	      {
  2385  		/* The left side matches the empty string.  Make a first pass
  2386  		   with tre_match_empty() to get the number of tags and
  2387  		   parameters. */
  2388  		status = tre_match_empty(stack, cat->left,
  2389  					 NULL, NULL, &num_tags);
  2390  		if (status != REG_OK)
  2391  		  return status;
  2392  		/* Allocate arrays for the tags and parameters. */
  2393  		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
  2394  		if (!tags)
  2395  		  return REG_ESPACE;
  2396  		tags[0] = -1;
  2397  		assertions = 0;
  2398  		/* Second pass with tre_mach_empty() to get the list of
  2399  		   tags and parameters. */
  2400  		status = tre_match_empty(stack, cat->left, tags,
  2401  					 &assertions, NULL);
  2402  		if (status != REG_OK)
  2403  		  {
  2404  		    xfree(tags);
  2405  		    return status;
  2406  		  }
  2407  		node->firstpos =
  2408  		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
  2409  				tags, assertions);
  2410  		xfree(tags);
  2411  		if (!node->firstpos)
  2412  		  return REG_ESPACE;
  2413  	      }
  2414  	    else
  2415  	      {
  2416  		node->firstpos = cat->left->firstpos;
  2417  	      }
  2418  
  2419  	    /* Compute lastpos. */
  2420  	    if (cat->right->nullable)
  2421  	      {
  2422  		/* The right side matches the empty string.  Make a first pass
  2423  		   with tre_match_empty() to get the number of tags and
  2424  		   parameters. */
  2425  		status = tre_match_empty(stack, cat->right,
  2426  					 NULL, NULL, &num_tags);
  2427  		if (status != REG_OK)
  2428  		  return status;
  2429  		/* Allocate arrays for the tags and parameters. */
  2430  		tags = xmalloc(sizeof(int) * (num_tags + 1));
  2431  		if (!tags)
  2432  		  return REG_ESPACE;
  2433  		tags[0] = -1;
  2434  		assertions = 0;
  2435  		/* Second pass with tre_mach_empty() to get the list of
  2436  		   tags and parameters. */
  2437  		status = tre_match_empty(stack, cat->right, tags,
  2438  					 &assertions, NULL);
  2439  		if (status != REG_OK)
  2440  		  {
  2441  		    xfree(tags);
  2442  		    return status;
  2443  		  }
  2444  		node->lastpos =
  2445  		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
  2446  				tags, assertions);
  2447  		xfree(tags);
  2448  		if (!node->lastpos)
  2449  		  return REG_ESPACE;
  2450  	      }
  2451  	    else
  2452  	      {
  2453  		node->lastpos = cat->right->lastpos;
  2454  	      }
  2455  	    break;
  2456  	  }
  2457  
  2458  	default:
  2459  	  assert(0);
  2460  	  break;
  2461  	}
  2462      }
  2463  
  2464    return REG_OK;
  2465  }
  2466  
  2467  
  2468  /* Adds a transition from each position in `p1' to each position in `p2'. */
  2469  static reg_errcode_t
  2470  tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
  2471  	       tre_tnfa_transition_t *transitions,
  2472  	       int *counts, int *offs)
  2473  {
  2474    tre_pos_and_tags_t *orig_p2 = p2;
  2475    tre_tnfa_transition_t *trans;
  2476    int i, j, k, l, dup, prev_p2_pos;
  2477  
  2478    if (transitions != NULL)
  2479      while (p1->position >= 0)
  2480        {
  2481  	p2 = orig_p2;
  2482  	prev_p2_pos = -1;
  2483  	while (p2->position >= 0)
  2484  	  {
  2485  	    /* Optimization: if this position was already handled, skip it. */
  2486  	    if (p2->position == prev_p2_pos)
  2487  	      {
  2488  		p2++;
  2489  		continue;
  2490  	      }
  2491  	    prev_p2_pos = p2->position;
  2492  	    /* Set `trans' to point to the next unused transition from
  2493  	       position `p1->position'. */
  2494  	    trans = transitions + offs[p1->position];
  2495  	    while (trans->state != NULL)
  2496  	      {
  2497  #if 0
  2498  		/* If we find a previous transition from `p1->position' to
  2499  		   `p2->position', it is overwritten.  This can happen only
  2500  		   if there are nested loops in the regexp, like in "((a)*)*".
  2501  		   In POSIX.2 repetition using the outer loop is always
  2502  		   preferred over using the inner loop.	 Therefore the
  2503  		   transition for the inner loop is useless and can be thrown
  2504  		   away. */
  2505  		/* XXX - The same position is used for all nodes in a bracket
  2506  		   expression, so this optimization cannot be used (it will
  2507  		   break bracket expressions) unless I figure out a way to
  2508  		   detect it here. */
  2509  		if (trans->state_id == p2->position)
  2510  		  {
  2511  		    break;
  2512  		  }
  2513  #endif
  2514  		trans++;
  2515  	      }
  2516  
  2517  	    if (trans->state == NULL)
  2518  	      (trans + 1)->state = NULL;
  2519  	    /* Use the character ranges, assertions, etc. from `p1' for
  2520  	       the transition from `p1' to `p2'. */
  2521  	    trans->code_min = p1->code_min;
  2522  	    trans->code_max = p1->code_max;
  2523  	    trans->state = transitions + offs[p2->position];
  2524  	    trans->state_id = p2->position;
  2525  	    trans->assertions = p1->assertions | p2->assertions
  2526  	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
  2527  	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
  2528  	    if (p1->backref >= 0)
  2529  	      {
  2530  		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
  2531  		assert(p2->backref < 0);
  2532  		trans->u.backref = p1->backref;
  2533  		trans->assertions |= ASSERT_BACKREF;
  2534  	      }
  2535  	    else
  2536  	      trans->u.class = p1->class;
  2537  	    if (p1->neg_classes != NULL)
  2538  	      {
  2539  		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
  2540  		trans->neg_classes =
  2541  		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
  2542  		if (trans->neg_classes == NULL)
  2543  		  return REG_ESPACE;
  2544  		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
  2545  		  trans->neg_classes[i] = p1->neg_classes[i];
  2546  		trans->neg_classes[i] = (tre_ctype_t)0;
  2547  	      }
  2548  	    else
  2549  	      trans->neg_classes = NULL;
  2550  
  2551  	    /* Find out how many tags this transition has. */
  2552  	    i = 0;
  2553  	    if (p1->tags != NULL)
  2554  	      while(p1->tags[i] >= 0)
  2555  		i++;
  2556  	    j = 0;
  2557  	    if (p2->tags != NULL)
  2558  	      while(p2->tags[j] >= 0)
  2559  		j++;
  2560  
  2561  	    /* If we are overwriting a transition, free the old tag array. */
  2562  	    if (trans->tags != NULL)
  2563  	      xfree(trans->tags);
  2564  	    trans->tags = NULL;
  2565  
  2566  	    /* If there were any tags, allocate an array and fill it. */
  2567  	    if (i + j > 0)
  2568  	      {
  2569  		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
  2570  		if (!trans->tags)
  2571  		  return REG_ESPACE;
  2572  		i = 0;
  2573  		if (p1->tags != NULL)
  2574  		  while(p1->tags[i] >= 0)
  2575  		    {
  2576  		      trans->tags[i] = p1->tags[i];
  2577  		      i++;
  2578  		    }
  2579  		l = i;
  2580  		j = 0;
  2581  		if (p2->tags != NULL)
  2582  		  while (p2->tags[j] >= 0)
  2583  		    {
  2584  		      /* Don't add duplicates. */
  2585  		      dup = 0;
  2586  		      for (k = 0; k < i; k++)
  2587  			if (trans->tags[k] == p2->tags[j])
  2588  			  {
  2589  			    dup = 1;
  2590  			    break;
  2591  			  }
  2592  		      if (!dup)
  2593  			trans->tags[l++] = p2->tags[j];
  2594  		      j++;
  2595  		    }
  2596  		trans->tags[l] = -1;
  2597  	      }
  2598  
  2599  	    p2++;
  2600  	  }
  2601  	p1++;
  2602        }
  2603    else
  2604      /* Compute a maximum limit for the number of transitions leaving
  2605         from each state. */
  2606      while (p1->position >= 0)
  2607        {
  2608  	p2 = orig_p2;
  2609  	while (p2->position >= 0)
  2610  	  {
  2611  	    counts[p1->position]++;
  2612  	    p2++;
  2613  	  }
  2614  	p1++;
  2615        }
  2616    return REG_OK;
  2617  }
  2618  
  2619  /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
  2620     labelled with one character range (there are no transitions on empty
  2621     strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
  2622     the regexp. */
  2623  static reg_errcode_t
  2624  tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
  2625  		int *counts, int *offs)
  2626  {
  2627    tre_union_t *uni;
  2628    tre_catenation_t *cat;
  2629    tre_iteration_t *iter;
  2630    reg_errcode_t errcode = REG_OK;
  2631  
  2632    /* XXX - recurse using a stack!. */
  2633    switch (node->type)
  2634      {
  2635      case LITERAL:
  2636        break;
  2637      case UNION:
  2638        uni = (tre_union_t *)node->obj;
  2639        errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
  2640        if (errcode != REG_OK)
  2641  	return errcode;
  2642        errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
  2643        break;
  2644  
  2645      case CATENATION:
  2646        cat = (tre_catenation_t *)node->obj;
  2647        /* Add a transition from each position in cat->left->lastpos
  2648  	 to each position in cat->right->firstpos. */
  2649        errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
  2650  			       transitions, counts, offs);
  2651        if (errcode != REG_OK)
  2652  	return errcode;
  2653        errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
  2654        if (errcode != REG_OK)
  2655  	return errcode;
  2656        errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
  2657        break;
  2658  
  2659      case ITERATION:
  2660        iter = (tre_iteration_t *)node->obj;
  2661        assert(iter->max == -1 || iter->max == 1);
  2662  
  2663        if (iter->max == -1)
  2664  	{
  2665  	  assert(iter->min == 0 || iter->min == 1);
  2666  	  /* Add a transition from each last position in the iterated
  2667  	     expression to each first position. */
  2668  	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
  2669  				   transitions, counts, offs);
  2670  	  if (errcode != REG_OK)
  2671  	    return errcode;
  2672  	}
  2673        errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
  2674        break;
  2675      }
  2676    return errcode;
  2677  }
  2678  
  2679  
  /* Stores `err' in the variable `errcode' and jumps to the label
     `error_exit'; both must exist in the function that uses the macro.
     The do/while (0) wrapper makes the macro behave as one statement.
     NOTE(review): the always-true `if' with its CONSTCOND marker is
     presumably there to keep lint-style tools from reporting the code
     after the goto as unreachable - confirm before simplifying. */
  #define ERROR_EXIT(err)               \
    do                                  \
      {                                 \
        errcode = (err);                \
        if (/*CONSTCOND*/1)             \
          goto error_exit;              \
      }                                 \
    while (/*CONSTCOND*/0)
  2688  
  2689  
  2690  int
  2691  regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
  2692  {
  2693    tre_stack_t *stack;
  2694    tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
  2695    tre_pos_and_tags_t *p;
  2696    int *counts = NULL, *offs = NULL;
  2697    int i, add = 0;
  2698    tre_tnfa_transition_t *transitions, *initial;
  2699    tre_tnfa_t *tnfa = NULL;
  2700    tre_submatch_data_t *submatch_data;
  2701    tre_tag_direction_t *tag_directions = NULL;
  2702    reg_errcode_t errcode;
  2703    tre_mem_t mem;
  2704  
  2705    /* Parse context. */
  2706    tre_parse_ctx_t parse_ctx;
  2707  
  2708    /* Allocate a stack used throughout the compilation process for various
  2709       purposes. */
  2710    stack = tre_stack_new(512, 1024000, 128);
  2711    if (!stack)
  2712      return REG_ESPACE;
  2713    /* Allocate a fast memory allocator. */
  2714    mem = tre_mem_new();
  2715    if (!mem)
  2716      {
  2717        tre_stack_destroy(stack);
  2718        return REG_ESPACE;
  2719      }
  2720  
  2721    /* Parse the regexp. */
  2722    memset(&parse_ctx, 0, sizeof(parse_ctx));
  2723    parse_ctx.mem = mem;
  2724    parse_ctx.stack = stack;
  2725    parse_ctx.start = regex;
  2726    parse_ctx.cflags = cflags;
  2727    parse_ctx.max_backref = -1;
  2728    errcode = tre_parse(&parse_ctx);
  2729    if (errcode != REG_OK)
  2730      ERROR_EXIT(errcode);
  2731    preg->re_nsub = parse_ctx.submatch_id - 1;
  2732    tree = parse_ctx.n;
  2733  
  2734  #ifdef TRE_DEBUG
  2735    tre_ast_print(tree);
  2736  #endif /* TRE_DEBUG */
  2737  
  2738    /* Referring to nonexistent subexpressions is illegal. */
  2739    if (parse_ctx.max_backref > (int)preg->re_nsub)
  2740      ERROR_EXIT(REG_ESUBREG);
  2741  
  2742    /* Allocate the TNFA struct. */
  2743    tnfa = xcalloc(1, sizeof(tre_tnfa_t));
  2744    if (tnfa == NULL)
  2745      ERROR_EXIT(REG_ESPACE);
  2746    tnfa->have_backrefs = parse_ctx.max_backref >= 0;
  2747    tnfa->have_approx = 0;
  2748    tnfa->num_submatches = parse_ctx.submatch_id;
  2749  
  2750    /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
  2751       regexp does not have back references, this can be skipped. */
  2752    if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
  2753      {
  2754  
  2755        /* Figure out how many tags we will need. */
  2756        errcode = tre_add_tags(NULL, stack, tree, tnfa);
  2757        if (errcode != REG_OK)
  2758  	ERROR_EXIT(errcode);
  2759  
  2760        if (tnfa->num_tags > 0)
  2761  	{
  2762  	  tag_directions = xmalloc(sizeof(*tag_directions)
  2763  				   * (tnfa->num_tags + 1));
  2764  	  if (tag_directions == NULL)
  2765  	    ERROR_EXIT(REG_ESPACE);
  2766  	  tnfa->tag_directions = tag_directions;
  2767  	  memset(tag_directions, -1,
  2768  		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
  2769  	}
  2770        tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
  2771  				   sizeof(*tnfa->minimal_tags));
  2772        if (tnfa->minimal_tags == NULL)
  2773  	ERROR_EXIT(REG_ESPACE);
  2774  
  2775        submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
  2776  			      sizeof(*submatch_data));
  2777        if (submatch_data == NULL)
  2778  	ERROR_EXIT(REG_ESPACE);
  2779        tnfa->submatch_data = submatch_data;
  2780  
  2781        errcode = tre_add_tags(mem, stack, tree, tnfa);
  2782        if (errcode != REG_OK)
  2783  	ERROR_EXIT(errcode);
  2784  
  2785      }
  2786  
  2787    /* Expand iteration nodes. */
  2788    errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
  2789  			   tag_directions);
  2790    if (errcode != REG_OK)
  2791      ERROR_EXIT(errcode);
  2792  
  2793    /* Add a dummy node for the final state.
  2794       XXX - For certain patterns this dummy node can be optimized away,
  2795  	   for example "a*" or "ab*".	Figure out a simple way to detect
  2796  	   this possibility. */
  2797    tmp_ast_l = tree;
  2798    tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
  2799    if (tmp_ast_r == NULL)
  2800      ERROR_EXIT(REG_ESPACE);
  2801  
  2802    tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
  2803    if (tree == NULL)
  2804      ERROR_EXIT(REG_ESPACE);
  2805  
  2806    errcode = tre_compute_nfl(mem, stack, tree);
  2807    if (errcode != REG_OK)
  2808      ERROR_EXIT(errcode);
  2809  
  2810    counts = xmalloc(sizeof(int) * parse_ctx.position);
  2811    if (counts == NULL)
  2812      ERROR_EXIT(REG_ESPACE);
  2813  
  2814    offs = xmalloc(sizeof(int) * parse_ctx.position);
  2815    if (offs == NULL)
  2816      ERROR_EXIT(REG_ESPACE);
  2817  
  2818    for (i = 0; i < parse_ctx.position; i++)
  2819      counts[i] = 0;
  2820    tre_ast_to_tnfa(tree, NULL, counts, NULL);
  2821  
  2822    add = 0;
  2823    for (i = 0; i < parse_ctx.position; i++)
  2824      {
  2825        offs[i] = add;
  2826        add += counts[i] + 1;
  2827        counts[i] = 0;
  2828      }
  2829    transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
  2830    if (transitions == NULL)
  2831      ERROR_EXIT(REG_ESPACE);
  2832    tnfa->transitions = transitions;
  2833    tnfa->num_transitions = add;
  2834  
  2835    errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
  2836    if (errcode != REG_OK)
  2837      ERROR_EXIT(errcode);
  2838  
  2839    tnfa->firstpos_chars = NULL;
  2840  
  2841    p = tree->firstpos;
  2842    i = 0;
  2843    while (p->position >= 0)
  2844      {
  2845        i++;
  2846        p++;
  2847      }
  2848  
  2849    initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
  2850    if (initial == NULL)
  2851      ERROR_EXIT(REG_ESPACE);
  2852    tnfa->initial = initial;
  2853  
  2854    i = 0;
  2855    for (p = tree->firstpos; p->position >= 0; p++)
  2856      {
  2857        initial[i].state = transitions + offs[p->position];
  2858        initial[i].state_id = p->position;
  2859        initial[i].tags = NULL;
  2860        /* Copy the arrays p->tags, and p->params, they are allocated
  2861  	 from a tre_mem object. */
  2862        if (p->tags)
  2863  	{
  2864  	  int j;
  2865  	  for (j = 0; p->tags[j] >= 0; j++);
  2866  	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
  2867  	  if (!initial[i].tags)
  2868  	    ERROR_EXIT(REG_ESPACE);
  2869  	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
  2870  	}
  2871        initial[i].assertions = p->assertions;
  2872        i++;
  2873      }
  2874    initial[i].state = NULL;
  2875  
  2876    tnfa->num_transitions = add;
  2877    tnfa->final = transitions + offs[tree->lastpos[0].position];
  2878    tnfa->num_states = parse_ctx.position;
  2879    tnfa->cflags = cflags;
  2880  
  2881    tre_mem_destroy(mem);
  2882    tre_stack_destroy(stack);
  2883    xfree(counts);
  2884    xfree(offs);
  2885  
  2886    preg->TRE_REGEX_T_FIELD = (void *)tnfa;
  2887    return REG_OK;
  2888  
  2889   error_exit:
  2890    /* Free everything that was allocated and return the error code. */
  2891    tre_mem_destroy(mem);
  2892    if (stack != NULL)
  2893      tre_stack_destroy(stack);
  2894    if (counts != NULL)
  2895      xfree(counts);
  2896    if (offs != NULL)
  2897      xfree(offs);
  2898    preg->TRE_REGEX_T_FIELD = (void *)tnfa;
  2899    regfree(preg);
  2900    return errcode;
  2901  }
  2902  
  2903  
  2904  
  2905  
  2906  void
  2907  regfree(regex_t *preg)
  2908  {
  2909    tre_tnfa_t *tnfa;
  2910    unsigned int i;
  2911    tre_tnfa_transition_t *trans;
  2912  
  2913    tnfa = (void *)preg->TRE_REGEX_T_FIELD;
  2914    if (!tnfa)
  2915      return;
  2916  
  2917    for (i = 0; i < tnfa->num_transitions; i++)
  2918      if (tnfa->transitions[i].state)
  2919        {
  2920  	if (tnfa->transitions[i].tags)
  2921  	  xfree(tnfa->transitions[i].tags);
  2922  	if (tnfa->transitions[i].neg_classes)
  2923  	  xfree(tnfa->transitions[i].neg_classes);
  2924        }
  2925    if (tnfa->transitions)
  2926      xfree(tnfa->transitions);
  2927  
  2928    if (tnfa->initial)
  2929      {
  2930        for (trans = tnfa->initial; trans->state; trans++)
  2931  	{
  2932  	  if (trans->tags)
  2933  	    xfree(trans->tags);
  2934  	}
  2935        xfree(tnfa->initial);
  2936      }
  2937  
  2938    if (tnfa->submatch_data)
  2939      {
  2940        for (i = 0; i < tnfa->num_submatches; i++)
  2941  	if (tnfa->submatch_data[i].parents)
  2942  	  xfree(tnfa->submatch_data[i].parents);
  2943        xfree(tnfa->submatch_data);
  2944      }
  2945  
  2946    if (tnfa->tag_directions)
  2947      xfree(tnfa->tag_directions);
  2948    if (tnfa->firstpos_chars)
  2949      xfree(tnfa->firstpos_chars);
  2950    if (tnfa->minimal_tags)
  2951      xfree(tnfa->minimal_tags);
  2952    xfree(tnfa);
  2953  }