summary refs log tree commit diff
diff options
context:
space:
mode:
authorFangrui Song <maskray@google.com>2021-11-02 10:07:59 -0700
committerFangrui Song <maskray@google.com>2021-11-02 10:07:59 -0700
commitfdcd177fd36c60ddc9cbc6013831413dbd83c3f9 (patch)
tree7e2cc87b0532e1e835aa890f01cf2cd9c057f658
parentdb432f033d7714c5885838dae7d7fb3bdaa5a850 (diff)
downloadglibc-fdcd177fd36c60ddc9cbc6013831413dbd83c3f9.tar.gz
glibc-fdcd177fd36c60ddc9cbc6013831413dbd83c3f9.tar.xz
glibc-fdcd177fd36c60ddc9cbc6013831413dbd83c3f9.zip
regex: Unnest nested functions in regcomp.c
This refactor moves four functions out of a nested scope and converts
them into static always_inline functions. collseqwc, table_size,
symb_table, extra are now initialized to zero because they are passed as
function arguments.

On x86-64, .text is 16 byte larger likely due to the 4 stores.
This is nothing compared to the amount of work that regcomp has to do
looking up the collation weights, or other functions.

If the non-buildable `sysdeps/generic/dl-machine.h` doesn't count,
this patch removes the last `auto inline` usage from glibc.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
-rw-r--r--posix/regcomp.c464
1 files changed, 241 insertions, 223 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 887e5b5068..e2b5c2807a 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2831,262 +2831,277 @@ build_collating_symbol (bitset_t sbcset, const unsigned char *name)
 }
 #endif /* not _LIBC */
 
-/* This function parse bracket expression like "[abc]", "[a-c]",
-   "[[.a-a.]]" etc.  */
-
-static bin_tree_t *
-parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
-		   reg_syntax_t syntax, reg_errcode_t *err)
-{
 #ifdef _LIBC
-  const unsigned char *collseqmb;
-  const char *collseqwc;
-  uint32_t nrules;
-  int32_t table_size;
-  const int32_t *symb_table;
-  const unsigned char *extra;
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Seek the collating symbol entry corresponding to NAME.
+   Return the index of the symbol in the SYMB_TABLE,
+   or -1 if not found.  */
 
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Seek the collating symbol entry corresponding to NAME.
-     Return the index of the symbol in the SYMB_TABLE,
-     or -1 if not found.  */
-
-  auto inline int32_t
-  __attribute__ ((always_inline))
-  seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
-    {
-      int32_t elem;
+static inline int32_t
+__attribute__ ((always_inline))
+seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
+			     const int32_t *symb_table, int32_t table_size,
+			     const unsigned char *extra)
+{
+  int32_t elem;
 
-      for (elem = 0; elem < table_size; elem++)
-	if (symb_table[2 * elem] != 0)
-	  {
-	    int32_t idx = symb_table[2 * elem + 1];
-	    /* Skip the name of collating element name.  */
-	    idx += 1 + extra[idx];
-	    if (/* Compare the length of the name.  */
-		name_len == extra[idx]
-		/* Compare the name.  */
-		&& memcmp (name, &extra[idx + 1], name_len) == 0)
-	      /* Yep, this is the entry.  */
-	      return elem;
-	  }
-      return -1;
-    }
+  for (elem = 0; elem < table_size; elem++)
+    if (symb_table[2 * elem] != 0)
+      {
+	int32_t idx = symb_table[2 * elem + 1];
+	/* Skip the name of collating element name.  */
+	idx += 1 + extra[idx];
+	if (/* Compare the length of the name.  */
+	    name_len == extra[idx]
+	    /* Compare the name.  */
+	    && memcmp (name, &extra[idx + 1], name_len) == 0)
+	  /* Yep, this is the entry.  */
+	  return elem;
+      }
+  return -1;
+}
 
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Look up the collation sequence value of BR_ELEM.
-     Return the value if succeeded, UINT_MAX otherwise.  */
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Look up the collation sequence value of BR_ELEM.
+   Return the value if succeeded, UINT_MAX otherwise.  */
 
-  auto inline unsigned int
-  __attribute__ ((always_inline))
-  lookup_collation_sequence_value (bracket_elem_t *br_elem)
+static inline unsigned int
+__attribute__ ((always_inline))
+lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
+				 const unsigned char *collseqmb,
+				 const char *collseqwc, int32_t table_size,
+				 const int32_t *symb_table,
+				 const unsigned char *extra)
+{
+  if (br_elem->type == SB_CHAR)
     {
-      if (br_elem->type == SB_CHAR)
-	{
-	  /*
-	  if (MB_CUR_MAX == 1)
-	  */
-	  if (nrules == 0)
-	    return collseqmb[br_elem->opr.ch];
-	  else
-	    {
-	      wint_t wc = __btowc (br_elem->opr.ch);
-	      return __collseq_table_lookup (collseqwc, wc);
-	    }
-	}
-      else if (br_elem->type == MB_CHAR)
+      /* if (MB_CUR_MAX == 1) */
+      if (nrules == 0)
+	return collseqmb[br_elem->opr.ch];
+      else
 	{
-	  if (nrules != 0)
-	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+	  wint_t wc = __btowc (br_elem->opr.ch);
+	  return __collseq_table_lookup (collseqwc, wc);
 	}
-      else if (br_elem->type == COLL_SYM)
+    }
+  else if (br_elem->type == MB_CHAR)
+    {
+      if (nrules != 0)
+	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+    }
+  else if (br_elem->type == COLL_SYM)
+    {
+      size_t sym_name_len = strlen ((char *) br_elem->opr.name);
+      if (nrules != 0)
 	{
-	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
-	  if (nrules != 0)
+	  int32_t elem, idx;
+	  elem = seek_collating_symbol_entry (br_elem->opr.name,
+					      sym_name_len,
+					      symb_table, table_size,
+					      extra);
+	  if (elem != -1)
 	    {
-	      int32_t elem, idx;
-	      elem = seek_collating_symbol_entry (br_elem->opr.name,
-						  sym_name_len);
-	      if (elem != -1)
-		{
-		  /* We found the entry.  */
-		  idx = symb_table[2 * elem + 1];
-		  /* Skip the name of collating element name.  */
-		  idx += 1 + extra[idx];
-		  /* Skip the byte sequence of the collating element.  */
-		  idx += 1 + extra[idx];
-		  /* Adjust for the alignment.  */
-		  idx = (idx + 3) & ~3;
-		  /* Skip the multibyte collation sequence value.  */
-		  idx += sizeof (unsigned int);
-		  /* Skip the wide char sequence of the collating element.  */
-		  idx += sizeof (unsigned int) *
-		    (1 + *(unsigned int *) (extra + idx));
-		  /* Return the collation sequence value.  */
-		  return *(unsigned int *) (extra + idx);
-		}
-	      else if (sym_name_len == 1)
-		{
-		  /* No valid character.  Match it as a single byte
-		     character.  */
-		  return collseqmb[br_elem->opr.name[0]];
-		}
+	      /* We found the entry.  */
+	      idx = symb_table[2 * elem + 1];
+	      /* Skip the name of collating element name.  */
+	      idx += 1 + extra[idx];
+	      /* Skip the byte sequence of the collating element.  */
+	      idx += 1 + extra[idx];
+	      /* Adjust for the alignment.  */
+	      idx = (idx + 3) & ~3;
+	      /* Skip the multibyte collation sequence value.  */
+	      idx += sizeof (unsigned int);
+	      /* Skip the wide char sequence of the collating element.  */
+	      idx += sizeof (unsigned int) *
+		(1 + *(unsigned int *) (extra + idx));
+	      /* Return the collation sequence value.  */
+	      return *(unsigned int *) (extra + idx);
 	    }
 	  else if (sym_name_len == 1)
-	    return collseqmb[br_elem->opr.name[0]];
+	    {
+	      /* No valid character.  Match it as a single byte
+		 character.  */
+	      return collseqmb[br_elem->opr.name[0]];
+	    }
 	}
-      return UINT_MAX;
+      else if (sym_name_len == 1)
+	return collseqmb[br_elem->opr.name[0]];
     }
+  return UINT_MAX;
+}
 
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Build the range expression which starts from START_ELEM, and ends
-     at END_ELEM.  The result are written to MBCSET and SBCSET.
-     RANGE_ALLOC is the allocated size of mbcset->range_starts, and
-     mbcset->range_ends, is a pointer argument since we may
-     update it.  */
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Build the range expression which starts from START_ELEM, and ends
+   at END_ELEM.  The result are written to MBCSET and SBCSET.
+   RANGE_ALLOC is the allocated size of mbcset->range_starts, and
+   mbcset->range_ends, is a pointer argument since we may
+   update it.  */
 
-  auto inline reg_errcode_t
-  __attribute__ ((always_inline))
-  build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
-		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
-    {
-      unsigned int ch;
-      uint32_t start_collseq;
-      uint32_t end_collseq;
-
-      /* Equivalence Classes and Character Classes can't be a range
-	 start/end.  */
-      if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
-			    || start_elem->type == CHAR_CLASS
-			    || end_elem->type == EQUIV_CLASS
-			    || end_elem->type == CHAR_CLASS))
-	return REG_ERANGE;
+static inline reg_errcode_t
+__attribute__ ((always_inline))
+build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
+		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
+		 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
+		 const unsigned char *collseqmb, const char *collseqwc,
+		 int32_t table_size, const int32_t *symb_table,
+		 const unsigned char *extra)
+{
+  unsigned int ch;
+  uint32_t start_collseq;
+  uint32_t end_collseq;
 
-      /* FIXME: Implement rational ranges here, too.  */
-      start_collseq = lookup_collation_sequence_value (start_elem);
-      end_collseq = lookup_collation_sequence_value (end_elem);
-      /* Check start/end collation sequence values.  */
-      if (__glibc_unlikely (start_collseq == UINT_MAX
-			    || end_collseq == UINT_MAX))
-	return REG_ECOLLATE;
-      if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
-			    && start_collseq > end_collseq))
-	return REG_ERANGE;
+  /* Equivalence Classes and Character Classes can't be a range
+     start/end.  */
+  if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
+                        || start_elem->type == CHAR_CLASS
+                        || end_elem->type == EQUIV_CLASS
+                        || end_elem->type == CHAR_CLASS))
+    return REG_ERANGE;
 
-      /* Got valid collation sequence values, add them as a new entry.
-	 However, if we have no collation elements, and the character set
-	 is single byte, the single byte character set that we
-	 build below suffices. */
-      if (nrules > 0 || dfa->mb_cur_max > 1)
+  /* FIXME: Implement rational ranges here, too.  */
+  start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
+						   table_size, symb_table, extra);
+  end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
+						 table_size, symb_table, extra);
+  /* Check start/end collation sequence values.  */
+  if (__glibc_unlikely (start_collseq == UINT_MAX
+                        || end_collseq == UINT_MAX))
+    return REG_ECOLLATE;
+  if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
+                        && start_collseq > end_collseq))
+    return REG_ERANGE;
+
+  /* Got valid collation sequence values, add them as a new entry.
+     However, if we have no collation elements, and the character set
+     is single byte, the single byte character set that we
+     build below suffices. */
+  if (nrules > 0 || dfa->mb_cur_max > 1)
+    {
+      /* Check the space of the arrays.  */
+      if (__glibc_unlikely (*range_alloc == mbcset->nranges))
 	{
-	  /* Check the space of the arrays.  */
-	  if (__glibc_unlikely (*range_alloc == mbcset->nranges))
-	    {
-	      /* There is not enough space, need realloc.  */
-	      uint32_t *new_array_start;
-	      uint32_t *new_array_end;
-	      Idx new_nranges;
-
-	      /* +1 in case of mbcset->nranges is 0.  */
-	      new_nranges = 2 * mbcset->nranges + 1;
-	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
-					    new_nranges);
-	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
-					  new_nranges);
+	  /* There is not enough space, need realloc.  */
+	  uint32_t *new_array_start;
+	  uint32_t *new_array_end;
+	  int new_nranges;
 
-	      if (__glibc_unlikely (new_array_start == NULL
-				    || new_array_end == NULL))
-		return REG_ESPACE;
+	  /* +1 in case of mbcset->nranges is 0.  */
+	  new_nranges = 2 * mbcset->nranges + 1;
+	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
+					new_nranges);
+	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
+				      new_nranges);
 
-	      mbcset->range_starts = new_array_start;
-	      mbcset->range_ends = new_array_end;
-	      *range_alloc = new_nranges;
-	    }
+          if (__glibc_unlikely (new_array_start == NULL
+                                || new_array_end == NULL))
+	    return REG_ESPACE;
 
-	  mbcset->range_starts[mbcset->nranges] = start_collseq;
-	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
+	  mbcset->range_starts = new_array_start;
+	  mbcset->range_ends = new_array_end;
+	  *range_alloc = new_nranges;
 	}
 
-      /* Build the table for single byte characters.  */
-      for (ch = 0; ch < SBC_MAX; ch++)
-	{
-	  uint32_t ch_collseq;
-	  /*
-	  if (MB_CUR_MAX == 1)
-	  */
-	  if (nrules == 0)
-	    ch_collseq = collseqmb[ch];
-	  else
-	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
-	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
-	    bitset_set (sbcset, ch);
-	}
-      return REG_NOERROR;
+      mbcset->range_starts[mbcset->nranges] = start_collseq;
+      mbcset->range_ends[mbcset->nranges++] = end_collseq;
     }
 
-  /* Local function for parse_bracket_exp used in _LIBC environment.
-     Build the collating element which is represented by NAME.
-     The result are written to MBCSET and SBCSET.
-     COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
-     pointer argument since we may update it.  */
+  /* Build the table for single byte characters.  */
+  for (ch = 0; ch < SBC_MAX; ch++)
+    {
+      uint32_t ch_collseq;
+      /* if (MB_CUR_MAX == 1) */
+      if (nrules == 0)
+	ch_collseq = collseqmb[ch];
+      else
+	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
+      if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
+	bitset_set (sbcset, ch);
+    }
+  return REG_NOERROR;
+}
+
+/* Local function for parse_bracket_exp used in _LIBC environment.
+   Build the collating element which is represented by NAME.
+   The result are written to MBCSET and SBCSET.
+   COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
+   pointer argument since we may update it.  */
 
-  auto inline reg_errcode_t
-  __attribute__ ((always_inline))
-  build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
-			  Idx *coll_sym_alloc, const unsigned char *name)
+static inline reg_errcode_t
+__attribute__ ((always_inline))
+build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
+			int *coll_sym_alloc, const unsigned char *name,
+			uint32_t nrules, int32_t table_size,
+			const int32_t *symb_table, const unsigned char *extra)
+{
+  int32_t elem, idx;
+  size_t name_len = strlen ((const char *) name);
+  if (nrules != 0)
     {
-      int32_t elem, idx;
-      size_t name_len = strlen ((const char *) name);
-      if (nrules != 0)
+      elem = seek_collating_symbol_entry (name, name_len, symb_table,
+					  table_size, extra);
+      if (elem != -1)
 	{
-	  elem = seek_collating_symbol_entry (name, name_len);
-	  if (elem != -1)
-	    {
-	      /* We found the entry.  */
-	      idx = symb_table[2 * elem + 1];
-	      /* Skip the name of collating element name.  */
-	      idx += 1 + extra[idx];
-	    }
-	  else if (name_len == 1)
-	    {
-	      /* No valid character, treat it as a normal
-		 character.  */
-	      bitset_set (sbcset, name[0]);
-	      return REG_NOERROR;
-	    }
-	  else
-	    return REG_ECOLLATE;
-
-	  /* Got valid collation sequence, add it as a new entry.  */
-	  /* Check the space of the arrays.  */
-	  if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
-	    {
-	      /* Not enough, realloc it.  */
-	      /* +1 in case of mbcset->ncoll_syms is 0.  */
-	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
-	      /* Use realloc since mbcset->coll_syms is NULL
-		 if *alloc == 0.  */
-	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
-						   new_coll_sym_alloc);
-	      if (__glibc_unlikely (new_coll_syms == NULL))
-		return REG_ESPACE;
-	      mbcset->coll_syms = new_coll_syms;
-	      *coll_sym_alloc = new_coll_sym_alloc;
-	    }
-	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
+	  /* We found the entry.  */
+	  idx = symb_table[2 * elem + 1];
+	  /* Skip the name of collating element name.  */
+	  idx += 1 + extra[idx];
+	}
+      else if (name_len == 1)
+	{
+	  /* No valid character, treat it as a normal
+	     character.  */
+	  bitset_set (sbcset, name[0]);
 	  return REG_NOERROR;
 	}
       else
+	return REG_ECOLLATE;
+
+      /* Got valid collation sequence, add it as a new entry.  */
+      /* Check the space of the arrays.  */
+      if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
 	{
-	  if (__glibc_unlikely (name_len != 1))
-	    return REG_ECOLLATE;
-	  else
-	    {
-	      bitset_set (sbcset, name[0]);
-	      return REG_NOERROR;
-	    }
+	  /* Not enough, realloc it.  */
+	  /* +1 in case of mbcset->ncoll_syms is 0.  */
+	  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
+	  /* Use realloc since mbcset->coll_syms is NULL
+	     if *alloc == 0.  */
+	  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
+					       new_coll_sym_alloc);
+          if (__glibc_unlikely (new_coll_syms == NULL))
+	    return REG_ESPACE;
+	  mbcset->coll_syms = new_coll_syms;
+	  *coll_sym_alloc = new_coll_sym_alloc;
 	}
+      mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
+      return REG_NOERROR;
     }
+  else
+    {
+      if (__glibc_unlikely (name_len != 1))
+	return REG_ECOLLATE;
+      else
+	{
+	  bitset_set (sbcset, name[0]);
+	  return REG_NOERROR;
+	}
+    }
+}
+#endif /* _LIBC */
+
+/* This function parse bracket expression like "[abc]", "[a-c]",
+   "[[.a-a.]]" etc.  */
+
+static bin_tree_t *
+parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
+		   reg_syntax_t syntax, reg_errcode_t *err)
+{
+#ifdef _LIBC
+  const unsigned char *collseqmb;
+  const char *collseqwc = NULL;
+  uint32_t nrules;
+  int32_t table_size = 0;
+  const int32_t *symb_table = NULL;
+  const unsigned char *extra = NULL;
 #endif
 
   re_token_t br_token;
@@ -3230,7 +3245,9 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 
 #ifdef _LIBC
 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
-				  &start_elem, &end_elem);
+				  &start_elem, &end_elem,
+				  dfa, syntax, nrules, collseqmb, collseqwc,
+				  table_size, symb_table, extra);
 #else
 # ifdef RE_ENABLE_I18N
 	  *err = build_range_exp (syntax, sbcset,
@@ -3283,7 +3300,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 #ifdef RE_ENABLE_I18N
 					     mbcset, &coll_sym_alloc,
 #endif /* RE_ENABLE_I18N */
-					     start_elem.opr.name);
+					     start_elem.opr.name,
+					     nrules, table_size, symb_table, extra);
 	      if (__glibc_unlikely (*err != REG_NOERROR))
 		goto parse_bracket_exp_free_return;
 	      break;