about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--src/regex/regcomp.c1715
1 files changed, 634 insertions, 1081 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c
index d9076275..4cdaa1ea 100644
--- a/src/regex/regcomp.c
+++ b/src/regex/regcomp.c
@@ -34,6 +34,7 @@
 #include <regex.h>
 #include <limits.h>
 #include <stdint.h>
+#include <ctype.h>
 
 #include "tre.h"
 
@@ -135,108 +136,88 @@ typedef struct {
   tre_ast_node_t *right;
 } tre_union_t;
 
-static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
-
-static tre_ast_node_t *
-tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
-
-static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
-		 int minimal);
-
-static tre_ast_node_t *
-tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
-
-static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
-		       tre_ast_node_t *right);
-
 
 static tre_ast_node_t *
-tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
+tre_ast_new_node(tre_mem_t mem, int type, void *obj)
 {
-  tre_ast_node_t *node;
-
-  node = tre_mem_calloc(mem, sizeof(*node));
-  if (!node)
-    return NULL;
-  node->obj = tre_mem_calloc(mem, size);
-  if (!node->obj)
-    return NULL;
-  node->type = type;
-  node->nullable = -1;
-  node->submatch_id = -1;
-
-  return node;
+	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
+	if (!node || !obj)
+		return 0;
+	node->obj = obj;
+	node->type = type;
+	node->nullable = -1;
+	node->submatch_id = -1;
+	return node;
 }
 
 static tre_ast_node_t *
 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
 {
-  tre_ast_node_t *node;
-  tre_literal_t *lit;
-
-  node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
-  if (!node)
-    return NULL;
-  lit = node->obj;
-  lit->code_min = code_min;
-  lit->code_max = code_max;
-  lit->position = position;
-
-  return node;
+	tre_ast_node_t *node;
+	tre_literal_t *lit;
+
+	lit = tre_mem_calloc(mem, sizeof *lit);
+	node = tre_ast_new_node(mem, LITERAL, lit);
+	if (!node)
+		return 0;
+	lit->code_min = code_min;
+	lit->code_max = code_max;
+	lit->position = position;
+	return node;
 }
 
 static tre_ast_node_t *
-tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
-		 int minimal)
+tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
 {
-  tre_ast_node_t *node;
-  tre_iteration_t *iter;
-
-  node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
-  if (!node)
-    return NULL;
-  iter = node->obj;
-  iter->arg = arg;
-  iter->min = min;
-  iter->max = max;
-  iter->minimal = minimal;
-  node->num_submatches = arg->num_submatches;
-
-  return node;
+	tre_ast_node_t *node;
+	tre_iteration_t *iter;
+
+	iter = tre_mem_calloc(mem, sizeof *iter);
+	node = tre_ast_new_node(mem, ITERATION, iter);
+	if (!node)
+		return 0;
+	iter->arg = arg;
+	iter->min = min;
+	iter->max = max;
+	iter->minimal = minimal;
+	node->num_submatches = arg->num_submatches;
+	return node;
 }
 
 static tre_ast_node_t *
 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 {
-  tre_ast_node_t *node;
-
-  node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
-  if (node == NULL)
-    return NULL;
-  ((tre_union_t *)node->obj)->left = left;
-  ((tre_union_t *)node->obj)->right = right;
-  node->num_submatches = left->num_submatches + right->num_submatches;
-
-  return node;
+	tre_ast_node_t *node;
+	tre_union_t *un;
+
+	if (!left)
+		return right;
+	un = tre_mem_calloc(mem, sizeof *un);
+	node = tre_ast_new_node(mem, UNION, un);
+	if (!node || !right)
+		return 0;
+	un->left = left;
+	un->right = right;
+	node->num_submatches = left->num_submatches + right->num_submatches;
+	return node;
 }
 
 static tre_ast_node_t *
-tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
-		       tre_ast_node_t *right)
+tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
 {
-  tre_ast_node_t *node;
-
-  node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
-  if (node == NULL)
-    return NULL;
-  ((tre_catenation_t *)node->obj)->left = left;
-  ((tre_catenation_t *)node->obj)->right = right;
-  node->num_submatches = left->num_submatches + right->num_submatches;
-
-  return node;
+	tre_ast_node_t *node;
+	tre_catenation_t *cat;
+
+	if (!left)
+		return right;
+	cat = tre_mem_calloc(mem, sizeof *cat);
+	node = tre_ast_new_node(mem, CATENATION, cat);
+	if (!node)
+		return 0;
+	cat->left = left;
+	cat->right = right;
+	node->num_submatches = left->num_submatches + right->num_submatches;
+	return node;
 }
 
 
@@ -412,1077 +393,654 @@ define_popf(voidptr, void *)
 
 /* Parse context. */
 typedef struct {
-  /* Memory allocator.	The AST is allocated using this. */
-  tre_mem_t mem;
-  /* Stack used for keeping track of regexp syntax. */
-  tre_stack_t *stack;
-  /* The parse result. */
-  tre_ast_node_t *result;
-  /* The regexp to parse and its length. */
-  const char *re;
-  /* The first character of the entire regexp. */
-  const char *re_start;
-  /* Current submatch ID. */
-  int submatch_id;
-  /* Current position (number of literal). */
-  int position;
-  /* The highest back reference or -1 if none seen so far. */
-  int max_backref;
-  /* This flag is set if the regexp uses approximate matching. */
-  int have_approx;
-  /* Compilation flags. */
-  int cflags;
-  /* If this flag is set the top-level submatch is not captured. */
-  int nofirstsub;
+	/* Memory allocator. The AST is allocated using this. */
+	tre_mem_t mem;
+	/* Stack used for keeping track of regexp syntax. */
+	tre_stack_t *stack;
+	/* The parsed node after a parse function returns. */
+	tre_ast_node_t *n;
+	/* Position in the regexp pattern after a parse function returns. */
+	const char *s;
+	/* The first character of the regexp. */
+	const char *re;
+	/* Current submatch ID. */
+	int submatch_id;
+	/* Current position (number of literal). */
+	int position;
+	/* The highest back reference or -1 if none seen so far. */
+	int max_backref;
+	/* Compilation flags. */
+	int cflags;
 } tre_parse_ctx_t;
 
-/* Parses a wide character regexp pattern into a syntax tree.  This parser
-   handles both syntaxes (BRE and ERE), including the TRE extensions. */
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx);
-
-
-/*
-  This parser is just a simple recursive descent parser for POSIX.2
-  regexps.  The parser supports both the obsolete default syntax and
-  the "extended" syntax, and some nonstandard extensions.
-*/
-
-/* Characters with special meanings in regexp syntax. */
-#define CHAR_PIPE	   '|'
-#define CHAR_LPAREN	   '('
-#define CHAR_RPAREN	   ')'
-#define CHAR_LBRACE	   '{'
-#define CHAR_RBRACE	   '}'
-#define CHAR_LBRACKET	   '['
-#define CHAR_RBRACKET	   ']'
-#define CHAR_MINUS	   '-'
-#define CHAR_STAR	   '*'
-#define CHAR_QUESTIONMARK  '?'
-#define CHAR_PLUS	   '+'
-#define CHAR_PERIOD	   '.'
-#define CHAR_COLON	   ':'
-#define CHAR_EQUAL	   '='
-#define CHAR_COMMA	   ','
-#define CHAR_CARET	   '^'
-#define CHAR_DOLLAR	   '$'
-#define CHAR_BACKSLASH	   '\\'
-#define CHAR_HASH	   '#'
-#define CHAR_TILDE	   '~'
-
-
 /* Some macros for expanding \w, \s, etc. */
-static const struct tre_macro_struct {
-  const char c;
-  const char *expansion;
-} tre_macros[] =
-  { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
-    {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
-    {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
-    {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
-    { 0, NULL }
-  };
-
+static const struct {
+	char c;
+	const char *expansion;
+} tre_macros[] = {
+	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
+	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
+	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
+	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
+	{ 0, 0 }
+};
 
 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
    must have at least `len' items.  Sets buf[0] to zero if the there
    is no match in `tre_macros'. */
-static const char *
-tre_expand_macro(const char *regex)
+static const char *tre_expand_macro(const char *s)
 {
-  int i;
-
-  if (!*regex)
-    return 0;
-
-  for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++);
-  return tre_macros[i].expansion;
+	int i;
+	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
+	return tre_macros[i].expansion;
 }
 
-static reg_errcode_t
-tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
-	 tre_ast_node_t ***items)
+static int
+tre_compare_lit(const void *a, const void *b)
 {
-  reg_errcode_t status;
-  tre_ast_node_t **array = *items;
-  /* Allocate more space if necessary. */
-  if (*i >= *max_i)
-    {
-      tre_ast_node_t **new_items;
-      /* If the array is already 1024 items large, give up -- there's
-	 probably an error in the regexp (e.g. not a '\0' terminated
-	 string and missing ']') */
-      if (*max_i > 1024)
-	return REG_ESPACE;
-      *max_i *= 2;
-      new_items = xrealloc(array, sizeof(*array) * *max_i);
-      if (new_items == NULL)
-	return REG_ESPACE;
-      *items = array = new_items;
-    }
-  array[*i] = tre_ast_new_literal(mem, min, max, -1);
-  status = array[*i] == NULL ? REG_ESPACE : REG_OK;
-  (*i)++;
-  return status;
+	const tre_literal_t *const *la = a;
+	const tre_literal_t *const *lb = b;
+	/* assumes the range of valid code_min is < INT_MAX */
+	return la[0]->code_min - lb[0]->code_min;
 }
 
+struct literals {
+	tre_mem_t mem;
+	tre_literal_t **a;
+	int len;
+	int cap;
+};
 
-static int
-tre_compare_items(const void *a, const void *b)
+static tre_literal_t *tre_new_lit(struct literals *p)
 {
-  const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
-  const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
-  tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
-  int a_min = l_a->code_min, b_min = l_b->code_min;
-
-  if (a_min < b_min)
-    return -1;
-  else if (a_min > b_min)
-    return 1;
-  else
-    return 0;
+	tre_literal_t **a;
+	if (p->len >= p->cap) {
+		if (p->cap >= 1<<15)
+			return 0;
+		p->cap *= 2;
+		a = xrealloc(p->a, p->cap * sizeof *p->a);
+		if (!a)
+			return 0;
+		p->a = a;
+	}
+	a = p->a + p->len++;
+	*a = tre_mem_calloc(p->mem, sizeof **a);
+	return *a;
 }
 
-/* Maximum number of character classes that can occur in a negated bracket
-   expression.	*/
-#define MAX_NEG_CLASSES 64
-
-/* Maximum length of character class names. */
-#define MAX_CLASS_NAME
-
-static reg_errcode_t
-tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
-			tre_ctype_t neg_classes[], int *num_neg_classes,
-			tre_ast_node_t ***items, int *num_items,
-			int *items_size)
+static int add_icase_literals(struct literals *ls, int min, int max)
 {
-  const char *re = ctx->re;
-  reg_errcode_t status = REG_OK;
-  tre_ctype_t class = (tre_ctype_t)0;
-  int i = *num_items;
-  int max_i = *items_size;
-  int skip;
-
-  /* Build an array of the items in the bracket expression. */
-  while (status == REG_OK)
-    {
-      skip = 0;
-      if (!*re)
-	{
-	  status = REG_EBRACK;
-	}
-      else if (*re == CHAR_RBRACKET && re > ctx->re)
-	{
-	  re++;
-	  break;
+	tre_literal_t *lit;
+	int b, e, c;
+	for (c=min; c<=max; ) {
+		/* assumes islower(c) and isupper(c) are exclusive
+		   and toupper(c)!=c if islower(c).
+		   multiple opposite case characters are not supported */
+		if (tre_islower(c)) {
+			b = e = tre_toupper(c);
+			for (c++, e++; c<=max; c++, e++)
+				if (tre_toupper(c) != e) break;
+		} else if (tre_isupper(c)) {
+			b = e = tre_tolower(c);
+			for (c++, e++; c<=max; c++, e++)
+				if (tre_tolower(c) != e) break;
+		} else {
+			c++;
+			continue;
+		}
+		lit = tre_new_lit(ls);
+		if (!lit)
+			return -1;
+		lit->code_min = b;
+		lit->code_max = e-1;
+		lit->position = -1;
 	}
-      else
-	{
-	  tre_cint_t min = 0, max = 0;
-	  wchar_t wc;
-	  int clen = mbtowc(&wc, re, -1);
+	return 0;
+}
 
-	  if (clen<0) clen=1, wc=WEOF;
 
-	  class = (tre_ctype_t)0;
-	  if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET)
-	    {
-	      min = wc;
-	      re += clen+1;
-	      clen = mbtowc(&wc, re, -1);
-	      if (clen<0) clen=1, wc=WEOF;
-	      max = wc;
-	      re += clen;
-	      /* XXX - Should use collation order instead of encoding values
-		 in character ranges. */
-	      if (min > max)
-		status = REG_ERANGE;
-	    }
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
-	    status = REG_ECOLLATE;
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
-	    status = REG_ECOLLATE;
-	  else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
-	    {
-	      char tmp_str[64];
-	      const char *endptr = re + 2;
-	      int len;
-	      while (*endptr && *endptr != CHAR_COLON)
-		endptr++;
-	      if (*endptr)
-		{
-		  len = MIN(endptr - re - 2, 63);
-		  strncpy(tmp_str, re + 2, len);
-		  tmp_str[len] = '\0';
-		  class = tre_ctype(tmp_str);
-		  if (!class)
-		    status = REG_ECTYPE;
-		  re = endptr + 2;
-		}
-	      else
-		status = REG_ECTYPE;
-	      min = 0;
-	      max = TRE_CHAR_MAX;
-	    }
-	  else
-	    {
-	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
-		  && ctx->re != re)
-		/* Two ranges are not allowed to share and endpoint. */
-		status = REG_ERANGE;
-	      min = max = wc;
-	      re += clen;
-	    }
+/* Maximum number of character classes in a negated bracket expression. */
+#define MAX_NEG_CLASSES 64
 
-	  if (status != REG_OK)
-	    break;
+struct neg {
+	int negate;
+	int len;
+	tre_ctype_t a[MAX_NEG_CLASSES];
+};
 
-	  if (class && negate)
-	    if (*num_neg_classes >= MAX_NEG_CLASSES)
-	      status = REG_ESPACE;
-	    else
-	      neg_classes[(*num_neg_classes)++] = class;
-	  else if (!skip)
-	    {
-	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
-	      if (status != REG_OK)
-		break;
-	      ((tre_literal_t*)((*items)[i-1])->obj)->class = class;
-	    }
+// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
 
-	  /* Add opposite-case counterpoints if REG_ICASE is present.
-	     This is broken if there are more than two "same" characters. */
-	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
-	    {
-	      tre_cint_t cmin, ccurr;
+/*
+bracket grammar:
+Bracket  =  '[' List ']'  |  '[^' List ']'
+List     =  Term  |  List Term
+Term     =  Char  |  Range  |  Chclass  |  Eqclass
+Range    =  Char '-' Char  |  Char '-' '-'
+Char     =  Coll  |  coll_single
+Meta     =  ']'  |  '-'
+Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
+Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
+Chclass  =  '[:' class ':]'
+
+coll_single is a single char collating element but it can be
+ '-' only at the beginning or end of a List and
+ ']' only at the beginning of a List and
+ '^' anywhere except after the openning '['
+*/
 
-	      while (min <= max)
-		{
-		  if (tre_islower(min))
-		    {
-		      cmin = ccurr = tre_toupper(min++);
-		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1
-			     && min <= max)
-			ccurr = tre_toupper(min++);
-		      status = tre_new_item(ctx->mem, cmin, ccurr,
-					    &i, &max_i, items);
-		    }
-		  else if (tre_isupper(min))
-		    {
-		      cmin = ccurr = tre_tolower(min++);
-		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
-			     && min <= max)
-			ccurr = tre_tolower(min++);
-		      status = tre_new_item(ctx->mem, cmin, ccurr,
-					    &i, &max_i, items);
-		    }
-		  else min++;
-		  if (status != REG_OK)
-		    break;
+static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
+{
+	const char *start = s;
+	tre_ctype_t class;
+	int min, max;
+	wchar_t wc;
+	int len;
+
+	for (;;) {
+		class = 0;
+		len = mbtowc(&wc, s, -1);
+		if (len <= 0)
+			return *s ? REG_BADPAT : REG_EBRACK;
+		if (*s == ']' && s != start) {
+			ctx->s = s+1;
+			return REG_OK;
+		}
+		if (*s == '-' && s != start && s[1] != ']' &&
+		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
+		    (s[1] != '-' || s[2] == ']'))
+			return REG_ERANGE;
+		if (*s == '[' && (s[1] == '.' || s[1] == '='))
+			/* collating symbols and equivalence classes are not supported */
+			return REG_ECOLLATE;
+		if (*s == '[' && s[1] == ':') {
+			char tmp[CHARCLASS_NAME_MAX+1];
+			s += 2;
+			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
+				if (s[len] == ':') {
+					memcpy(tmp, s, len);
+					tmp[len] = 0;
+					class = tre_ctype(tmp);
+					break;
+				}
+			}
+			if (!class || s[len+1] != ']')
+				return REG_ECTYPE;
+			min = 0;
+			max = TRE_CHAR_MAX;
+			s += len+2;
+		} else {
+			min = max = wc;
+			s += len;
+			if (*s == '-' && s[1] != ']') {
+				s++;
+				len = mbtowc(&wc, s, -1);
+				max = wc;
+				/* XXX - Should use collation order instead of
+				   encoding values in character ranges. */
+				if (len <= 0 || min > max)
+					return REG_ERANGE;
+				s += len;
+			}
+		}
+
+		if (class && neg->negate) {
+			if (neg->len >= MAX_NEG_CLASSES)
+				return REG_ESPACE;
+			neg->a[neg->len++] = class;
+		} else  {
+			tre_literal_t *lit = tre_new_lit(ls);
+			if (!lit)
+				return REG_ESPACE;
+			lit->code_min = min;
+			lit->code_max = max;
+			lit->class = class;
+			lit->position = -1;
+
+			/* Add opposite-case codepoints if REG_ICASE is present.
+			   It seems that POSIX requires that bracket negation
+			   should happen before case-folding, but most practical
+			   implementations do it the other way around. Changing
+			   the order would need efficient representation of
+			   case-fold ranges and bracket range sets even with
+			   simple patterns so this is ok for now. */
+			if (ctx->cflags & REG_ICASE && !class)
+				if (add_icase_literals(ls, min, max))
+					return REG_ESPACE;
 		}
-	      if (status != REG_OK)
-		break;
-	    }
 	}
-    }
-  *num_items = i;
-  *items_size = max_i;
-  ctx->re = re;
-  return status;
 }
 
-static reg_errcode_t
-tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
 {
-  tre_ast_node_t *node = NULL;
-  int negate = 0;
-  reg_errcode_t status = REG_OK;
-  tre_ast_node_t **items, *u, *n;
-  int i = 0, j, max_i = 32, curr_max, curr_min;
-  tre_ctype_t neg_classes[MAX_NEG_CLASSES];
-  int num_neg_classes = 0;
-
-  /* Start off with an array of `max_i' elements. */
-  items = xmalloc(sizeof(*items) * max_i);
-  if (items == NULL)
-    return REG_ESPACE;
-
-  if (*ctx->re == CHAR_CARET)
-    {
-      negate = 1;
-      ctx->re++;
-    }
-
-  status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
-				   &items, &i, &max_i);
-
-  if (status != REG_OK)
-    goto parse_bracket_done;
-
-  /* Sort the array if we need to negate it. */
-  if (negate)
-    qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
-
-  curr_max = curr_min = 0;
-  /* Build a union of the items in the array, negated if necessary. */
-  for (j = 0; j < i && status == REG_OK; j++)
-    {
-      int min, max;
-      tre_literal_t *l = items[j]->obj;
-      min = l->code_min;
-      max = l->code_max;
-
-      if (negate)
-	{
-	  if (min < curr_max)
-	    {
-	      /* Overlap. */
-	      curr_max = MAX(max + 1, curr_max);
-	      l = NULL;
-	    }
-	  else
-	    {
-	      /* No overlap. */
-	      curr_max = min - 1;
-	      if (curr_max >= curr_min)
-		{
-		  l->code_min = curr_min;
-		  l->code_max = curr_max;
+	int i, max, min, negmax, negmin;
+	tre_ast_node_t *node = 0, *n;
+	tre_ctype_t *nc = 0;
+	tre_literal_t *lit;
+	struct literals ls;
+	struct neg neg;
+	reg_errcode_t err;
+
+	ls.mem = ctx->mem;
+	ls.len = 0;
+	ls.cap = 32;
+	ls.a = xmalloc(ls.cap * sizeof *ls.a);
+	if (!ls.a)
+		return REG_ESPACE;
+	neg.len = 0;
+	neg.negate = *s == '^';
+	if (neg.negate)
+		s++;
+
+	err = parse_bracket_terms(ctx, s, &ls, &neg);
+	if (err != REG_OK)
+		goto parse_bracket_done;
+
+	if (neg.negate) {
+		/* Sort the array if we need to negate it. */
+		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
+		/* extra lit for the last negated range */
+		lit = tre_new_lit(&ls);
+		if (!lit) {
+			err = REG_ESPACE;
+			goto parse_bracket_done;
 		}
-	      else
-		{
-		  l = NULL;
+		lit->code_min = TRE_CHAR_MAX+1;
+		lit->code_max = TRE_CHAR_MAX+1;
+		lit->position = -1;
+		/* negated classes */
+		if (neg.len) {
+			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
+			if (!nc) {
+				err = REG_ESPACE;
+				goto parse_bracket_done;
+			}
+			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
+			nc[neg.len] = 0;
 		}
-	      curr_min = curr_max = max + 1;
-	    }
 	}
 
-      if (l != NULL)
-	{
-	  int k;
-	  l->position = ctx->position;
-	  if (num_neg_classes > 0)
-	    {
-	      l->neg_classes = tre_mem_alloc(ctx->mem,
-					     (sizeof(*l->neg_classes)
-					      * (num_neg_classes + 1)));
-	      if (l->neg_classes == NULL)
-		{
-		  status = REG_ESPACE;
-		  break;
+	/* Build a union of the items in the array, negated if necessary. */
+	negmax = negmin = 0;
+	for (i = 0; i < ls.len; i++) {
+		lit = ls.a[i];
+		min = lit->code_min;
+		max = lit->code_max;
+		if (neg.negate) {
+			if (min <= negmin) {
+				/* Overlap. */
+				negmin = MAX(max + 1, negmin);
+				continue;
+			}
+			negmax = min - 1;
+			lit->code_min = negmin;
+			lit->code_max = negmax;
+			negmin = max + 1;
 		}
-	      for (k = 0; k < num_neg_classes; k++)
-		l->neg_classes[k] = neg_classes[k];
-	      l->neg_classes[k] = (tre_ctype_t)0;
-	    }
-	  else
-	    l->neg_classes = NULL;
-	  if (node == NULL)
-	    node = items[j];
-	  else
-	    {
-	      u = tre_ast_new_union(ctx->mem, node, items[j]);
-	      if (u == NULL)
-		status = REG_ESPACE;
-	      node = u;
-	    }
-	}
-    }
-
-  if (status != REG_OK)
-    goto parse_bracket_done;
-
-  if (negate)
-    {
-      int k;
-      n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
-      if (n == NULL)
-	status = REG_ESPACE;
-      else
-	{
-	  tre_literal_t *l = n->obj;
-	  if (num_neg_classes > 0)
-	    {
-	      l->neg_classes = tre_mem_alloc(ctx->mem,
-					     (sizeof(*l->neg_classes)
-					      * (num_neg_classes + 1)));
-	      if (l->neg_classes == NULL)
-		{
-		  status = REG_ESPACE;
-		  goto parse_bracket_done;
+		lit->position = ctx->position;
+		lit->neg_classes = nc;
+		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
+		node = tre_ast_new_union(ctx->mem, node, n);
+		if (!node) {
+			err = REG_ESPACE;
+			break;
 		}
-	      for (k = 0; k < num_neg_classes; k++)
-		l->neg_classes[k] = neg_classes[k];
-	      l->neg_classes[k] = (tre_ctype_t)0;
-	    }
-	  else
-	    l->neg_classes = NULL;
-	  if (node == NULL)
-	    node = n;
-	  else
-	    {
-	      u = tre_ast_new_union(ctx->mem, node, n);
-	      if (u == NULL)
-		status = REG_ESPACE;
-	      node = u;
-	    }
 	}
-    }
-
-  if (status != REG_OK)
-    goto parse_bracket_done;
 
-#ifdef TRE_DEBUG
-  tre_ast_print(node);
-#endif /* TRE_DEBUG */
-
- parse_bracket_done:
-  xfree(items);
-  ctx->position++;
-  *result = node;
-  return status;
+parse_bracket_done:
+	xfree(ls.a);
+	ctx->position++;
+	ctx->n = node;
+	return err;
 }
 
-
-/* Parses a positive decimal integer.  Returns -1 if the string does not
-   contain a valid number. */
-static int
-tre_parse_int(const char **regex)
+static const char *parse_dup_count(const char *s, int *n)
 {
-  int num = -1;
-  const char *r = *regex;
-  while (*r-'0'<10U)
-    {
-      if (num < 0)
-	num = 0;
-      num = num * 10 + *r - '0';
-      r++;
-    }
-  *regex = r;
-  return num;
+	*n = -1;
+	if (!isdigit(*s))
+		return s;
+	*n = 0;
+	for (;;) {
+		*n = 10 * *n + (*s - '0');
+		s++;
+		if (!isdigit(*s) || *n > RE_DUP_MAX)
+			break;
+	}
+	return s;
 }
 
-
-static reg_errcode_t
-tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
+static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
 {
-  int min, max;
-  const char *r = ctx->re;
-  int minimal = 0;
-
-  /* Parse number (minimum repetition count). */
-  min = -1;
-  if (*r >= '0' && *r <= '9') {
-    min = tre_parse_int(&r);
-  }
-
-  /* Parse comma and second number (maximum repetition count). */
-  max = min;
-  if (*r == CHAR_COMMA)
-    {
-      r++;
-      max = tre_parse_int(&r);
-    }
-
-  /* Check that the repeat counts are sane. */
-  if ((max >= 0 && min > max) || max > RE_DUP_MAX)
-    return REG_BADBR;
-
-  /* Missing }. */
-  if (!*r)
-    return REG_EBRACE;
-
-  /* Empty contents of {}. */
-  if (r == ctx->re)
-    return REG_BADBR;
-
-  /* Parse the ending '}' or '\}'.*/
-  if (ctx->cflags & REG_EXTENDED)
-    {
-      if (*r != CHAR_RBRACE)
-	return REG_BADBR;
-      r++;
-    }
-  else
-    {
-      if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE)
-	return REG_BADBR;
-      r += 2;
-    }
-
-  /* Create the AST node(s). */
-  if (min == 0 && max == 0)
-    {
-      *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-      if (*result == NULL)
-	return REG_ESPACE;
-    }
-  else
-    {
-      if (min < 0 && max < 0)
-	/* Only approximate parameters set, no repetitions. */
-	min = max = 1;
-
-      *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
-      if (!*result)
-	return REG_ESPACE;
-    }
-
-  ctx->re = r;
-  return REG_OK;
+	int min, max;
+
+	s = parse_dup_count(s, &min);
+	if (*s == ',')
+		s = parse_dup_count(s+1, &max);
+	else
+		max = min;
+
+	if (
+		(max < min && max >= 0) ||
+		max > RE_DUP_MAX ||
+		min > RE_DUP_MAX ||
+		min < 0 ||
+		(!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
+		*s++ != '}'
+	)
+		return REG_BADBR;
+
+	if (min == 0 && max == 0)
+		ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+	else
+		ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+	if (!ctx->n)
+		return REG_ESPACE;
+	ctx->s = s;
+	return REG_OK;
 }
 
-typedef enum {
-  PARSE_RE = 0,
-  PARSE_ATOM,
-  PARSE_MARK_FOR_SUBMATCH,
-  PARSE_BRANCH,
-  PARSE_PIECE,
-  PARSE_CATENATION,
-  PARSE_POST_CATENATION,
-  PARSE_UNION,
-  PARSE_POST_UNION,
-  PARSE_POSTFIX,
-  PARSE_RESTORE_CFLAGS
-} tre_parse_re_stack_symbol_t;
-
-
-static reg_errcode_t
-tre_parse(tre_parse_ctx_t *ctx)
+static int hexval(unsigned c)
 {
-  tre_ast_node_t *result = NULL;
-  tre_parse_re_stack_symbol_t symbol;
-  reg_errcode_t status = REG_OK;
-  tre_stack_t *stack = ctx->stack;
-  int bottom = tre_stack_num_objects(stack);
-  int depth = 0;
-  wchar_t wc;
-  int clen;
-
-  if (!ctx->nofirstsub)
-    {
-      STACK_PUSH(stack, int, ctx->submatch_id);
-      STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
-      ctx->submatch_id++;
-    }
-  STACK_PUSH(stack, int, PARSE_RE);
-  ctx->re_start = ctx->re;
-
-
-  /* The following is basically just a recursive descent parser.  I use
-     an explicit stack instead of recursive functions mostly because of
-     two reasons: compatibility with systems which have an overflowable
-     call stack, and efficiency (both in lines of code and speed).  */
-  while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
-    {
-      if (status != REG_OK)
-	break;
-      symbol = tre_stack_pop_int(stack);
-      switch (symbol)
-	{
-	case PARSE_RE:
-	  /* Parse a full regexp.  A regexp is one or more branches,
-	     separated by the union operator `|'. */
-	  if (ctx->cflags & REG_EXTENDED)
-	    STACK_PUSHX(stack, int, PARSE_UNION);
-	  STACK_PUSHX(stack, int, PARSE_BRANCH);
-	  break;
-
-	case PARSE_BRANCH:
-	  /* Parse a branch.  A branch is one or more pieces, concatenated.
-	     A piece is an atom possibly followed by a postfix operator. */
-	  STACK_PUSHX(stack, int, PARSE_CATENATION);
-	  STACK_PUSHX(stack, int, PARSE_PIECE);
-	  break;
-
-	case PARSE_PIECE:
-	  /* Parse a piece.  A piece is an atom possibly followed by one
-	     or more postfix operators. */
-	    STACK_PUSHX(stack, int, PARSE_POSTFIX);
-	  STACK_PUSHX(stack, int, PARSE_ATOM);
-	  break;
-
-	case PARSE_CATENATION:
-	  /* If the expression has not ended, parse another piece. */
-	  {
-	    tre_char_t c;
-	    if (!*ctx->re)
-	      break;
-	    c = *ctx->re;
-		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
-		  break;
-		if ((ctx->cflags & REG_EXTENDED
-		     && c == CHAR_RPAREN && depth > 0)
-		    || (!(ctx->cflags & REG_EXTENDED)
-			&& (c == CHAR_BACKSLASH
-			    && *(ctx->re + 1) == CHAR_RPAREN)))
-		  {
-		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
-		      status = REG_EPAREN;
-		    depth--;
-		    if (!(ctx->cflags & REG_EXTENDED))
-		      ctx->re += 2;
-		    break;
-		  }
-
-	      {
-		/* Default case, left associative concatenation. */
-		STACK_PUSHX(stack, int, PARSE_CATENATION);
-		STACK_PUSHX(stack, voidptr, result);
-		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
-		STACK_PUSHX(stack, int, PARSE_PIECE);
-	      }
-	    break;
-	  }
-
-	case PARSE_POST_CATENATION:
-	  {
-	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
-	    tre_ast_node_t *tmp_node;
-	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
-	    if (!tmp_node)
-	      return REG_ESPACE;
-	    result = tmp_node;
-	    break;
-	  }
-
-	case PARSE_UNION:
-	  switch (*ctx->re)
-	    {
-	    case CHAR_PIPE:
-	      STACK_PUSHX(stack, int, PARSE_UNION);
-	      STACK_PUSHX(stack, voidptr, result);
-	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
-	      STACK_PUSHX(stack, int, PARSE_BRANCH);
-	      ctx->re++;
-	      break;
-
-	    case CHAR_RPAREN:
-	      ctx->re++;
-	      break;
-
-	    default:
-	      break;
-	    }
-	  break;
-
-	case PARSE_POST_UNION:
-	  {
-	    tre_ast_node_t *tmp_node;
-	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
-	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
-	    if (!tmp_node)
-	      return REG_ESPACE;
-	    result = tmp_node;
-	    break;
-	  }
-
-	case PARSE_POSTFIX:
-	  /* Parse postfix operators. */
-	  switch (*ctx->re)
-	    {
-	    case CHAR_PLUS:
-	    case CHAR_QUESTIONMARK:
-	      if (!(ctx->cflags & REG_EXTENDED))
-		break;
-		/*FALLTHROUGH*/
-	    case CHAR_STAR:
-	      {
-		tre_ast_node_t *tmp_node;
-		int minimal = 0;
-		int rep_min = 0;
-		int rep_max = -1;
-
-		if (*ctx->re == CHAR_PLUS)
-		  rep_min = 1;
-		if (*ctx->re == CHAR_QUESTIONMARK)
-		  rep_max = 1;
-
-		ctx->re++;
-		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
-					    minimal);
-		if (tmp_node == NULL)
-		  return REG_ESPACE;
-		result = tmp_node;
-		STACK_PUSHX(stack, int, PARSE_POSTFIX);
-	      }
-	      break;
-
-	    case CHAR_BACKSLASH:
-	      /* "\{" is special without REG_EXTENDED */
-	      if (!(ctx->cflags & REG_EXTENDED)
-		  && *(ctx->re + 1) == CHAR_LBRACE)
-		{
-		  ctx->re++;
-		  goto parse_brace;
-		}
-	      else
-		break;
-
-	    case CHAR_LBRACE:
-	      /* "{" is literal without REG_EXTENDED */
-	      if (!(ctx->cflags & REG_EXTENDED))
-		break;
-
-	    parse_brace:
-	      ctx->re++;
+	if (c-'0'<10) return c-'0';
+	c |= 32;
+	if (c-'a'<6) return c-'a'+10;
+	return -1;
+}
 
-	      status = tre_parse_bound(ctx, &result);
-	      if (status != REG_OK)
-		return status;
-	      STACK_PUSHX(stack, int, PARSE_POSTFIX);
-	      break;
-	    }
-	  break;
+static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
+{
+	if (node->submatch_id >= 0) {
+		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+		if (!n)
+			return REG_ESPACE;
+		n = tre_ast_new_catenation(ctx->mem, n, node);
+		if (!n)
+			return REG_ESPACE;
+		n->num_submatches = node->num_submatches;
+		node = n;
+	}
+	node->submatch_id = subid;
+	node->num_submatches++;
+	ctx->n = node;
+	return REG_OK;
+}
 
-	case PARSE_ATOM:
-	  /* Parse an atom.  An atom is a regular expression enclosed in `()',
-	     an empty set of `()', a bracket expression, `.', `^', `$',
-	     a `\' followed by a character, or a single character. */
+/*
+BRE grammar:
+Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
+Branch =  Atom  |  Branch Atom
+Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
+Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
 
-	  switch (*ctx->re)
-	    {
-	    case CHAR_LPAREN:  /* parenthesized subexpression */
+(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
 
-	      if (ctx->cflags & REG_EXTENDED)
-		{
-		lparen:
-		  depth++;
-		    {
-		      ctx->re++;
-		      /* First parse a whole RE, then mark the resulting tree
-			 for submatching. */
-		      STACK_PUSHX(stack, int, ctx->submatch_id);
-		      STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
-		      STACK_PUSHX(stack, int, PARSE_RE);
-		      ctx->submatch_id++;
-		    }
-		}
-	      else
-		goto parse_literal;
-	      break;
+ERE grammar:
+Regex  =  Branch  |  Regex '|' Branch
+Branch =  Atom  |  Branch Atom
+Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
+Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
 
-	    case CHAR_LBRACKET: /* bracket expression */
-	      ctx->re++;
-	      status = tre_parse_bracket(ctx, &result);
-	      if (status != REG_OK)
-		return status;
-	      break;
+(a*+?, ^*, $+, \X, {, (|a) are unspecified)
+*/
 
-	    case CHAR_BACKSLASH:
-	      /* If this is "\(" or "\)" chew off the backslash and
-		 try again. */
-	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
-		{
-		  ctx->re++;
-		  goto lparen;
-		}
-	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN)
-		{
-		  goto empty_atom;
+static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
+{
+	int len, ere = ctx->cflags & REG_EXTENDED;
+	const char *p;
+	tre_ast_node_t *node;
+	wchar_t wc;
+	switch (*s) {
+	case '[':
+		return parse_bracket(ctx, s+1);
+	case '\\':
+		p = tre_expand_macro(s+1);
+		if (p) {
+			/* assume \X expansion is a single atom */
+			reg_errcode_t err = parse_atom(ctx, p);
+			ctx->s = s+2;
+			return err;
 		}
-
-	      /* If a macro is used, parse the expanded macro recursively. */
-	      {
-		const char *buf = tre_expand_macro(ctx->re + 1);
-		if (buf)
-		  {
-		    tre_parse_ctx_t subctx;
-		    memcpy(&subctx, ctx, sizeof(subctx));
-		    subctx.re = buf;
-		    subctx.nofirstsub = 1;
-		    status = tre_parse(&subctx);
-		    if (status != REG_OK)
-		      return status;
-		    ctx->re += 2;
-		    ctx->position = subctx.position;
-		    result = subctx.result;
-		    break;
-		  }
-	      }
-
-	      if (!ctx->re[1])
-		/* Trailing backslash. */
-		return REG_EESCAPE;
-
-	      ctx->re++;
-	      switch (*ctx->re)
-		{
+		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
+		switch (*++s) {
+		case 0:
+			return REG_EESCAPE;
 		case 'b':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_WB, -1);
-		  ctx->re++;
-		  break;
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
+			break;
 		case 'B':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_WB_NEG, -1);
-		  ctx->re++;
-		  break;
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
+			break;
 		case '<':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_BOW, -1);
-		  ctx->re++;
-		  break;
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
+			break;
 		case '>':
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_EOW, -1);
-		  ctx->re++;
-		  break;
+			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
+			break;
 		case 'x':
-		  ctx->re++;
-		  if (ctx->re[0] != CHAR_LBRACE)
-		    {
-		      /* 8 bit hex char. */
-		      char tmp[3] = {0, 0, 0};
-		      long val;
-
-		      if (tre_isxdigit(ctx->re[0]))
-			{
-			  tmp[0] = (char)ctx->re[0];
-			  ctx->re++;
+			s++;
+			int i, v = 0, c;
+			len = 2;
+			if (*s == '{') {
+				len = 8;
+				s++;
 			}
-		      if (tre_isxdigit(ctx->re[0]))
-			{
-			  tmp[1] = (char)ctx->re[0];
-			  ctx->re++;
+			for (i=0; i<len && v<0x110000; i++) {
+				c = hexval(s[i]);
+				if (c < 0) break;
+				v = 16*v + c;
 			}
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val,
-						   (int)val, ctx->position);
-		      ctx->position++;
-		      break;
-		    }
-		  else if (*ctx->re)
-		    {
-		      /* Wide char. */
-		      char tmp[32];
-		      long val;
-		      int i = 0;
-		      ctx->re++;
-		      while (*ctx->re && i < sizeof tmp)
-			{
-			  if (ctx->re[0] == CHAR_RBRACE)
-			    break;
-			  if (tre_isxdigit(ctx->re[0]))
-			    {
-			      tmp[i] = (char)ctx->re[0];
-			      i++;
-			      ctx->re++;
-			      continue;
-			    }
-			  return REG_EBRACE;
+			s += i;
+			if (len == 8) {
+				if (*s != '}')
+					return REG_EBRACE;
+				s++;
 			}
-		      ctx->re++;
-		      tmp[i] = 0;
-		      val = strtol(tmp, NULL, 16);
-		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
-						   ctx->position);
-		      ctx->position++;
-		      break;
-		    }
-		  /*FALLTHROUGH*/
-
+			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
+			ctx->position++;
+			s--;
+			break;
 		default:
-		  if (tre_isdigit(*ctx->re))
-		    {
-		      /* Back reference. */
-		      int val = *ctx->re - '0';
-		      result = tre_ast_new_literal(ctx->mem, BACKREF, val,
-						   ctx->position);
-		      if (result == NULL)
-			return REG_ESPACE;
-		      ctx->position++;
-		      ctx->max_backref = MAX(val, ctx->max_backref);
-		      ctx->re++;
-		    }
-		  else
-		    {
-		      /* Escaped character. */
-		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
-						   ctx->position);
-		      ctx->position++;
-		      ctx->re++;
-		    }
-		  break;
-		}
-	      if (result == NULL)
-		return REG_ESPACE;
-	      break;
-
-	    case CHAR_PERIOD:	 /* the any-symbol */
-	      if (ctx->cflags & REG_NEWLINE)
-		{
-		  tre_ast_node_t *tmp1;
-		  tre_ast_node_t *tmp2;
-		  tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
-					     ctx->position);
-		  if (!tmp1)
-		    return REG_ESPACE;
-		  tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
-					     ctx->position + 1);
-		  if (!tmp2)
-		    return REG_ESPACE;
-		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		  if (!result)
-		    return REG_ESPACE;
-		  ctx->position += 2;
-		}
-	      else
-		{
-		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
-					       ctx->position);
-		  if (!result)
-		    return REG_ESPACE;
-		  ctx->position++;
+			if (isdigit(*s)) {
+				/* back reference */
+				int val = *s - '0';
+				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position);
+				ctx->max_backref = MAX(val, ctx->max_backref);
+			} else {
+				/* extension: accept unknown escaped char
+				   as a literal */
+				node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position);
+			}
+			ctx->position++;
 		}
-	      ctx->re++;
-	      break;
-
-	    case CHAR_CARET:	 /* beginning of line assertion */
-	      /* '^' has a special meaning everywhere in EREs, and at
-		 beginning of BRE. */
-	      if (ctx->cflags & REG_EXTENDED
-		  || ctx->re == ctx->re_start)
-		{
-		  if (!(ctx->cflags & REG_EXTENDED))
-		    STACK_PUSHX(stack, int, PARSE_CATENATION);
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_BOL, -1);
-		  if (result == NULL)
-		    return REG_ESPACE;
-		  ctx->re++;
+		s++;
+		break;
+	case '.':
+		if (ctx->cflags & REG_NEWLINE) {
+			tre_ast_node_t *tmp1, *tmp2;
+			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
+			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
+			if (tmp1 && tmp2)
+				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+			else
+				node = 0;
+		} else {
+			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
 		}
-	      else
-		goto parse_literal;
-	      break;
-
-	    case CHAR_DOLLAR:	 /* end of line assertion. */
-	      /* '$' is special everywhere in EREs, and in the end of the
-		 string in BREs. */
-	      if (ctx->cflags & REG_EXTENDED
-		  || !*(ctx->re + 1))
-		{
-		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
-					       ASSERT_AT_EOL, -1);
-		  if (result == NULL)
-		    return REG_ESPACE;
-		  ctx->re++;
+		s++;
+		break;
+	case '^':
+		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
+		if (!ere && s != ctx->re)
+			goto parse_literal;
+		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
+		s++;
+		break;
+	case '$':
+		/* '$' is special everywhere in EREs, and in the end of the string in BREs. */
+		if (!ere && s[1])
+			goto parse_literal;
+		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
+		s++;
+		break;
+	case '*':
+	case '|':
+	case '{':
+	case '+':
+	case '?':
+		if (!ere)
+			goto parse_literal;
+	case 0:
+		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+		break;
+	default:
+parse_literal:
+		len = mbtowc(&wc, s, -1);
+		if (len < 0)
+			return REG_BADPAT;
+		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
+			tre_ast_node_t *tmp1, *tmp2;
+			/* multiple opposite case characters are not supported */
+			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
+			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
+			if (tmp1 && tmp2)
+				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
+			else
+				node = 0;
+		} else {
+			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
 		}
-	      else
-		goto parse_literal;
-	      break;
-
-	    case CHAR_RPAREN:
-	      if (!depth)
-	        goto parse_literal;
-	    case CHAR_STAR:
-	    case CHAR_PIPE:
-	    case CHAR_LBRACE:
-	    case CHAR_PLUS:
-	    case CHAR_QUESTIONMARK:
-	      if (!(ctx->cflags & REG_EXTENDED))
-	        goto parse_literal;
-
-	    case 0:
-	    empty_atom:
-	      result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-	      if (!result)
+		ctx->position++;
+		s += len;
+		break;
+	}
+	if (!node)
 		return REG_ESPACE;
-	      break;
+	ctx->n = node;
+	ctx->s = s;
+	return REG_OK;
+}
 
-	    default:
-	    parse_literal:
+#define PUSHPTR(err, s, v) do { \
+	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
+		return err; \
+} while(0)
 
-	      clen = mbtowc(&wc, ctx->re, -1);
-	      if (clen<0) clen=1, wc=WEOF;
+#define PUSHINT(err, s, v) do { \
+	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
+		return err; \
+} while(0)
 
-	      /* Note that we can't use an tre_isalpha() test here, since there
-		 may be characters which are alphabetic but neither upper or
-		 lower case. */
-	      if (ctx->cflags & REG_ICASE
-		  && (tre_isupper(wc) || tre_islower(wc)))
-		{
-		  tre_ast_node_t *tmp1;
-		  tre_ast_node_t *tmp2;
-
-		  /* XXX - Can there be more than one opposite-case
-		     counterpoints for some character in some locale?  Or
-		     more than two characters which all should be regarded
-		     the same character if case is ignored?  If yes, there
-		     does not seem to be a portable way to detect it.  I guess
-		     that at least for multi-character collating elements there
-		     could be several opposite-case counterpoints, but they
-		     cannot be supported portably anyway. */
-		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
-					     tre_toupper(wc),
-					     ctx->position);
-		  if (!tmp1)
-		    return REG_ESPACE;
-		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
-					     tre_tolower(wc),
-					     ctx->position);
-		  if (!tmp2)
-		    return REG_ESPACE;
-		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
-		  if (!result)
-		    return REG_ESPACE;
+static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
+{
+	tre_ast_node_t *nbranch=0, *nunion=0;
+	int ere = ctx->cflags & REG_EXTENDED;
+	const char *s = ctx->re;
+	int subid = 0;
+	int depth = 0;
+	reg_errcode_t err;
+	tre_stack_t *stack = ctx->stack;
+
+	PUSHINT(err, stack, subid++);
+	for (;;) {
+		if ((!ere && *s == '\\' && s[1] == '(') ||
+		    (ere && *s == '(')) {
+			PUSHPTR(err, stack, nunion);
+			PUSHPTR(err, stack, nbranch);
+			PUSHINT(err, stack, subid++);
+			s++;
+			if (!ere)
+				s++;
+			depth++;
+			nbranch = nunion = 0;
+			continue;
 		}
-	      else
-		{
-		  result = tre_ast_new_literal(ctx->mem, wc, wc,
-					       ctx->position);
-		  if (!result)
-		    return REG_ESPACE;
+		if ((!ere && *s == '\\' && s[1] == ')') ||
+		    (ere && *s == ')' && depth)) {
+			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+			if (!ctx->n)
+				return REG_ESPACE;
+		} else {
+			err = parse_atom(ctx, s);
+			if (err != REG_OK)
+				return err;
+			s = ctx->s;
 		}
-	      ctx->position++;
-	      ctx->re += clen;
-	      break;
-	    }
-	  break;
-
-	case PARSE_MARK_FOR_SUBMATCH:
-	  {
-	    int submatch_id = tre_stack_pop_int(stack);
 
-	    if (result->submatch_id >= 0)
-	      {
-		tre_ast_node_t *n, *tmp_node;
-		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
-		if (n == NULL)
-		  return REG_ESPACE;
-		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
-		if (tmp_node == NULL)
-		  return REG_ESPACE;
-		tmp_node->num_submatches = result->num_submatches;
-		result = tmp_node;
-	      }
-	    result->submatch_id = submatch_id;
-	    result->num_submatches++;
-	    break;
-	  }
-
-	case PARSE_RESTORE_CFLAGS:
-	  ctx->cflags = tre_stack_pop_int(stack);
-	  break;
+	parse_iter:
+		/* extension: repetitions are accepted after an empty node
+		   eg. (+), ^*, a$?, a|{2} */
+		switch (*s) {
+		case '+':
+		case '?':
+			if (!ere)
+				break;
+			/* fallthrough */
+		case '*':;
+			int min=0, max=-1;
+			if (*s == '+')
+				min = 1;
+			if (*s == '?')
+				max = 1;
+			s++;
+			ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
+			if (!ctx->n)
+				return REG_ESPACE;
+			/* extension: multiple consecutive *+?{,} is unspecified,
+			   but (a+)+ has to be supported so accepting a++ makes
+			   sense, note however that the RE_DUP_MAX limit can be
+			   circumvented: (a{255}){255} uses a lot of memory.. */
+			goto parse_iter;
+		case '\\':
+			if (ere || s[1] != '{')
+				break;
+			s++;
+			goto parse_brace;
+		case '{':
+			if (!ere)
+				break;
+		parse_brace:
+			err = parse_dup(ctx, s+1);
+			if (err != REG_OK)
+				return err;
+			s = ctx->s;
+			goto parse_iter;
+		}
 
-	default:
-	  assert(0);
-	  break;
+		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
+		if ((ere && *s == '|') ||
+		    (ere && *s == ')' && depth) ||
+		    (!ere && *s == '\\' && s[1] == ')') ||
+		    !*s) {
+			/* extension: empty branch is unspecified (), (|a), (a|)
+			   here they are not rejected but match on empty string */
+			int c = *s;
+			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
+			nbranch = 0;
+			if (c != '|') {
+				if (c == '\\') {
+					if (!depth) return REG_EPAREN;
+					s+=2;
+				} else if (c == ')')
+					s++;
+				depth--;
+				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
+				if (err != REG_OK)
+					return err;
+				if (!c && depth<0) {
+					ctx->submatch_id = subid;
+					return REG_OK;
+				}
+				if (!c || depth<0)
+					return REG_EPAREN;
+				nbranch = tre_stack_pop_voidptr(stack);
+				nunion = tre_stack_pop_voidptr(stack);
+				goto parse_iter;
+			}
+			s++;
+		}
 	}
-    }
-
-  /* Check for missing closing parentheses. */
-  if (depth > 0)
-    return REG_EPAREN;
-
-  if (status == REG_OK)
-    ctx->result = result;
-
-  return status;
 }
 
 
-
 /***********************************************************************
  from tre-compile.c
 ***********************************************************************/
@@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
   if (errcode != REG_OK)
     ERROR_EXIT(errcode);
   preg->re_nsub = parse_ctx.submatch_id - 1;
-  tree = parse_ctx.result;
-
-  /* Back references and approximate matching cannot currently be used
-     in the same regexp. */
-  if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
-    ERROR_EXIT(REG_BADPAT);
+  tree = parse_ctx.n;
 
 #ifdef TRE_DEBUG
   tre_ast_print(tree);
@@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
   if (tnfa == NULL)
     ERROR_EXIT(REG_ESPACE);
   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
-  tnfa->have_approx = parse_ctx.have_approx;
+  tnfa->have_approx = 0;
   tnfa->num_submatches = parse_ctx.submatch_id;
 
   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the