Update.

* posix/Makefile: Add rules to build and run tst-rxspencer. (distribute): Add rxspencer/tests and rxspencer/COPYRIGHT. * posix/tst-rxspencer.c: New file. * posix/rxspencer/tests: New file. * posix/rxspencer/COPYRIGHT: New file. Patch mostly by Jakub Jelinek.
author: Ulrich Drepper <drepper@redhat.com> 2003-11-13 20:52:55 +0000
committer: Ulrich Drepper <drepper@redhat.com> 2003-11-13 20:52:55 +0000
commit: 78c81ab7b4a25563697ce988ecff73c9937cef16 (patch)
tree: f001077b4dbd1250b2a747ceef6d6ca3f6b21830 /posix
parent: 78d8b07a44111d861be5f54847faccbc1219c3e7 (diff)
download: glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.tar.gz
glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.tar.xz
glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.zip
4 files changed, 1044 insertions, 2 deletions
diff --git a/posix/Makefile b/posix/Makefile
index faff565936..c305c5e6dc 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -34,7 +34,7 @@ distribute := confstr.h TESTS TESTS2C.sed testcases.h \
 	      PTESTS PTESTS2C.sed ptestcases.h \
 	      globtest.c globtest.sh wordexp-tst.sh annexc.c fnmatch_loop.c   \
 	      spawn_int.h tst-getconf.sh regcomp.c regexec.c regex_internal.c \
-	      regex_internal.h fork.h
+	      regex_internal.h fork.h rxspencer/tests rxspencer/COPYRIGHT
 
 routines :=								      \
 	uname								      \
@@ -78,7 +78,7 @@ tests		:= tstgetopt testfnm runtests runptests	     \
 		   bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
-		   tst-nice tst-nanosleep transbug
+		   tst-nice tst-nanosleep transbug tst-rxspencer
 ifeq (yes,$(build-shared))
 test-srcs	:= globtest
 tests           += wordexp-test tst-exec tst-spawn
@@ -147,6 +147,7 @@ tst-exec-ARGS = -- $(built-program-cmd)
 tst-spawn-ARGS = -- $(built-program-cmd)
 tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
 tst-chmod-ARGS = `pwd`
+tst-rxspencer-ARGS = rxspencer/tests
 
 tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
 tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
diff --git a/posix/rxspencer/COPYRIGHT b/posix/rxspencer/COPYRIGHT
new file mode 100644
index 0000000000..30c1f7a488
--- /dev/null
+++ b/posix/rxspencer/COPYRIGHT
@@ -0,0 +1,20 @@
+Copyright 1992, 1993, 1994, 1997 Henry Spencer.  All rights reserved.
+This software is not subject to any license of the American Telephone
+and Telegraph Company or of the Regents of the University of California.
+
+Permission is granted to anyone to use this software for any purpose on
+any computer system, and to alter it and redistribute it, subject
+to the following restrictions:
+
+1. The author is not responsible for the consequences of use of this
+   software, no matter how awful, even if they arise from flaws in it.
+
+2. The origin of this software must not be misrepresented, either by
+   explicit claim or by omission.  Since few users ever read sources,
+   credits must appear in the documentation.
+
+3. Altered versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.  Since few users
+   ever read sources, credits must appear in the documentation.
+
+4. This notice may not be removed or altered.
diff --git a/posix/rxspencer/tests b/posix/rxspencer/tests
new file mode 100644
index 0000000000..acd4623c74
--- /dev/null
+++ b/posix/rxspencer/tests
@@ -0,0 +1,506 @@
+# regular expression test set
+# Lines are at least three fields, separated by one or more tabs.  "" stands
+# for an empty field.  First field is an RE.  Second field is flags.  If
+# C flag given, regcomp() is expected to fail, and the third field is the
+# error name (minus the leading REG_).
+#
+# Otherwise it is expected to succeed, and the third field is the string to
+# try matching it against.  If there is no fourth field, the match is
+# expected to fail.  If there is a fourth field, it is the substring that
+# the RE is expected to match.  If there is a fifth field, it is a comma-
+# separated list of what the subexpressions should match, with - indicating
+# no match for that one.  In both the fourth and fifth fields, a (sub)field
+# starting with @ indicates that the (sub)expression is expected to match
+# a null string followed by the stuff after the @; this provides a way to
+# test where null strings match.  The character `N' in REs and strings
+# is newline, `S' is space, `T' is tab, `Z' is NUL.
+#
+# The full list of flags:
+#	-	placeholder, does nothing
+#	b	RE is a BRE, not an ERE
+#	&	try it as both an ERE and a BRE
+#	C	regcomp() error expected, third field is error name
+#	i	REG_ICASE
+#	m	("mundane") REG_NOSPEC
+#	s	REG_NOSUB (not really testable)
+#	n	REG_NEWLINE
+#	^	REG_NOTBOL
+#	$	REG_NOTEOL
+#	#	REG_STARTEND (see below)
+#	p	REG_PEND
+#
+# For REG_STARTEND, the start/end offsets are those of the substring
+# enclosed in ().
+
+# basics
+a		&	a	a
+abc		&	abc	abc
+abc|de		-	abc	abc
+a|b|c		-	abc	a
+
+# parentheses and perversions thereof
+a(b)c		-	abc	abc
+a\(b\)c		b	abc	abc
+a(		C	EPAREN
+a(		b	a(	a(
+a\(		-	a(	a(
+a\(		bC	EPAREN
+a\(b		bC	EPAREN
+a(b		C	EPAREN
+a(b		b	a(b	a(b
+# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
+a)		-	a)	a)
+)		-	)	)
+# end gagging (in a just world, those *should* give EPAREN)
+a)		b	a)	a)
+a\)		bC	EPAREN
+\)		bC	EPAREN
+a()b		-	ab	ab
+a\(\)b		b	ab	ab
+
+# anchoring and REG_NEWLINE
+^abc$		&	abc	abc
+a^b		-	a^b
+a^b		b	a^b	a^b
+a$b		-	a$b
+a$b		b	a$b	a$b
+^		&	abc	@abc
+$		&	abc	@
+^$		&	""	@
+$^		-	""	@
+\($\)\(^\)	b	""	@
+# stop retching, those are legitimate (although disgusting)
+^^		-	""	@
+$$		-	""	@
+b$		&	abNc
+b$		&n	abNc	b
+^b$		&	aNbNc
+^b$		&n	aNbNc	b
+^$		&n	aNNb	@Nb
+^$		n	abc
+^$		n	abcN	@
+$^		n	aNNb	@Nb
+\($\)\(^\)	bn	aNNb	@Nb
+^^		n^	aNNb	@Nb
+$$		n	aNNb	@NN
+^a		^	a
+a$		$	a
+^a		^n	aNb
+^b		^n	aNb	b
+a$		$n	bNa
+b$		$n	bNa	b
+a*(^b$)c*	-	b	b
+a*\(^b$\)c*	b	b	b
+
+# certain syntax errors and non-errors
+|		C	EMPTY
+|		b	|	|
+*		C	BADRPT
+*		b	*	*
++		C	BADRPT
+?		C	BADRPT
+""		&C	EMPTY
+()		-	abc	@abc
+\(\)		b	abc	@abc
+a||b		C	EMPTY
+|ab		C	EMPTY
+ab|		C	EMPTY
+(|a)b		C	EMPTY
+(a|)b		C	EMPTY
+(*a)		C	BADRPT
+(+a)		C	BADRPT
+(?a)		C	BADRPT
+({1}a)		C	BADRPT
+\(\{1\}a\)	bC	BADRPT
+(a|*b)		C	BADRPT
+(a|+b)		C	BADRPT
+(a|?b)		C	BADRPT
+(a|{1}b)	C	BADRPT
+^*		C	BADRPT
+^*		b	*	*
+^+		C	BADRPT
+^?		C	BADRPT
+^{1}		C	BADRPT
+^\{1\}		bC	BADRPT
+
+# metacharacters, backslashes
+a.c		&	abc	abc
+a[bc]d		&	abd	abd
+a\*c		&	a*c	a*c
+a\\b		&	a\b	a\b
+a\\\*b		&	a\*b	a\*b
+# The following test is wrong.  Using \b in a BRE or ERE is undefined.
+# a\bc		&	abc	abc
+a\		&C	EESCAPE
+a\\bc		&	a\bc	a\bc
+\{		bC	BADRPT
+a\[b		&	a[b	a[b
+a[b		&C	EBRACK
+# trailing $ is a peculiar special case for the BRE code
+a$		&	a	a
+a$		&	a$
+a\$		&	a
+a\$		&	a$	a$
+a\\$		&	a
+a\\$		&	a$
+a\\$		&	a\$
+a\\$		&	a\	a\
+
+# back references, ugh
+a\(b\)\2c	bC	ESUBREG
+a\(b\1\)c	bC	ESUBREG
+a\(b*\)c\1d	b	abbcbbd	abbcbbd	bb
+a\(b*\)c\1d	b	abbcbd
+a\(b*\)c\1d	b	abbcbbbd
+^\(.\)\1	b	abc
+a\([bc]\)\1d	b	abcdabbd	abbd	b
+a\(\([bc]\)\2\)*d	b	abbccd	abbccd
+a\(\([bc]\)\2\)*d	b	abbcbd
+# actually, this next one probably ought to fail, but the spec is unclear
+a\(\(b\)*\2\)*d		b	abbbd	abbbd
+# here is a case that no NFA implementation does right
+\(ab*\)[ab]*\1	b	ababaaa	ababaaa	a
+# check out normal matching in the presence of back refs
+\(a\)\1bcd	b	aabcd	aabcd
+\(a\)\1bc*d	b	aabcd	aabcd
+\(a\)\1bc*d	b	aabd	aabd
+\(a\)\1bc*d	b	aabcccd	aabcccd
+\(a\)\1bc*[ce]d	b	aabcccd	aabcccd
+^\(a\)\1b\(c\)*cd$	b	aabcccd	aabcccd
+
+# ordinary repetitions
+ab*c		&	abc	abc
+ab+c		-	abc	abc
+ab?c		-	abc	abc
+a\(*\)b		b	a*b	a*b
+a\(**\)b	b	ab	ab
+a\(***\)b	bC	BADRPT
+*a		b	*a	*a
+**a		b	a	a
+***a		bC	BADRPT
+
+# the dreaded bounded repetitions
+# The following two tests are not correct:
+#{		&	{	{
+#{abc		&	{abc	{abc
+# '{' is always a special char outside bracket expressions.  So test only BRE:
+{		b	{	{
+{abc		b	{abc	{abc
+{1		C	BADRPT
+{1}		C	BADRPT
+# Same reason as for the two tests above:
+#a{b		&	a{b	a{b
+a{b		b	a{b	a{b
+a{1}b		-	ab	ab
+a\{1\}b		b	ab	ab
+a{1,}b		-	ab	ab
+a\{1,\}b	b	ab	ab
+a{1,2}b		-	aab	aab
+a\{1,2\}b	b	aab	aab
+a{1		C	EBRACE
+a\{1		bC	EBRACE
+a{1a		C	EBRACE
+a\{1a		bC	EBRACE
+a{1a}		C	BADBR
+a\{1a\}		bC	BADBR
+# These four tests check for undefined behavior.  Our implementation does
+# something different.
+#a{,2}		-	a{,2}	a{,2}
+#a\{,2\}		bC	BADBR
+#a{,}		-	a{,}	a{,}
+#a\{,\}		bC	BADBR
+a{1,x}		C	BADBR
+a\{1,x\}	bC	BADBR
+a{1,x		C	EBRACE
+a\{1,x		bC	EBRACE
+# These two tests probably fail due to an arbitrary limit on the number of
+# repetitions in the other implementation.
+#a{300}		C	BADBR
+#a\{300\}	bC	BADBR
+a{1,0}		C	BADBR
+a\{1,0\}	bC	BADBR
+ab{0,0}c	-	abcac	ac
+ab\{0,0\}c	b	abcac	ac
+ab{0,1}c	-	abcac	abc
+ab\{0,1\}c	b	abcac	abc
+ab{0,3}c	-	abbcac	abbc
+ab\{0,3\}c	b	abbcac	abbc
+ab{1,1}c	-	acabc	abc
+ab\{1,1\}c	b	acabc	abc
+ab{1,3}c	-	acabc	abc
+ab\{1,3\}c	b	acabc	abc
+ab{2,2}c	-	abcabbc	abbc
+ab\{2,2\}c	b	abcabbc	abbc
+ab{2,4}c	-	abcabbc	abbc
+ab\{2,4\}c	b	abcabbc	abbc
+((a{1,10}){1,10}){1,10}	-	a	a	a,a
+
+# multiple repetitions
+# Wow, there is serious disconnect here.  The ERE grammar is like this:
+# ERE_expression : one_char_or_coll_elem_ERE
+#                | '^'
+#                | '$'
+#                | '(' extended_reg_exp ')'
+#                | ERE_expression ERE_dupl_symbol
+#                ;
+# where ERE_dupl_symbol is any of the repetition methods.  It is clear from
+# this that consecutive repetition is OK.  On top of this, the one test not
+# marked as failing must fail.  For BREs the situation is different, so we
+# use the four tests.
+#a**		&C	BADRPT
+a**		bC	BADRPT
+#a++		C	BADRPT
+#a??		C	BADRPT
+#a*+		C	BADRPT
+#a*?		C	BADRPT
+#a+*		C	BADRPT
+#a+?		C	BADRPT
+#a?*		C	BADRPT
+#a?+		C	BADRPT
+#a{1}{1}		C	BADRPT
+#a*{1}		C	BADRPT
+#a+{1}		C	BADRPT
+#a?{1}		C	BADRPT
+#a{1}*		C	BADRPT
+#a{1}+		C	BADRPT
+#a{1}?		C	BADRPT
+#a*{b}		-	a{b}	a{b}
+a\{1\}\{1\}	bC	BADRPT
+a*\{1\}		bC	BADRPT
+a\{1\}*		bC	BADRPT
+
+# brackets, and numerous perversions thereof
+a[b]c		&	abc	abc
+a[ab]c		&	abc	abc
+a[^ab]c		&	adc	adc
+a[]b]c		&	a]c	a]c
+a[[b]c		&	a[c	a[c
+a[-b]c		&	a-c	a-c
+a[^]b]c		&	adc	adc
+a[^-b]c		&	adc	adc
+a[b-]c		&	a-c	a-c
+a[b		&C	EBRACK
+a[]		&C	EBRACK
+a[1-3]c		&	a2c	a2c
+a[3-1]c		&C	ERANGE
+a[1-3-5]c	&C	ERANGE
+a[[.-.]--]c	&	a-c	a-c
+# I don't think the error value should be ERANGE since a[1-] would be
+# valid, too.  Expect EBRACK.
+#a[1-		&C	ERANGE
+a[1-		&C	EBRACK
+a[[.		&C	EBRACK
+a[[.x		&C	EBRACK
+a[[.x.		&C	EBRACK
+a[[.x.]		&C	EBRACK
+a[[.x.]]	&	ax	ax
+a[[.x,.]]	&C	ECOLLATE
+# XXX Doesn't work yet.
+# a[[.one.]]b	&	a1b	a1b
+a[[.notdef.]]b	&C	ECOLLATE
+a[[.].]]b	&	a]b	a]b
+a[[:alpha:]]c	&	abc	abc
+a[[:notdef:]]c	&C	ECTYPE
+a[[:		&C	EBRACK
+a[[:alpha	&C	EBRACK
+a[[:alpha:]	&C	EBRACK
+a[[:alpha,:]	&C	ECTYPE
+a[[:]:]]b	&C	ECTYPE
+a[[:-:]]b	&C	ECTYPE
+a[[:alph:]]	&C	ECTYPE
+a[[:alphabet:]]	&C	ECTYPE
+[[:alnum:]]+	-	-%@a0X-	a0X
+[[:alpha:]]+	-	-%@aX0-	aX
+[[:blank:]]+	-	aSSTb	SST
+[[:cntrl:]]+	-	aNTb	NT
+[[:digit:]]+	-	a019b	019
+[[:graph:]]+	-	Sa%bS	a%b
+[[:lower:]]+	-	AabC	ab
+[[:print:]]+	-	NaSbN	aSb
+[[:punct:]]+	-	S%-&T	%-&
+[[:space:]]+	-	aSNTb	SNT
+[[:upper:]]+	-	aBCd	BC
+[[:xdigit:]]+	-	p0f3Cq	0f3C
+a[[=b=]]c	&	abc	abc
+a[[=		&C	EBRACK
+a[[=b		&C	EBRACK
+a[[=b=		&C	EBRACK
+a[[=b=]		&C	EBRACK
+a[[=b,=]]	&C	ECOLLATE
+# XXX Doesn't work yet.
+#a[[=one=]]b	&	a1b	a1b
+
+# complexities
+a(((b)))c	-	abc	abc
+a(b|(c))d	-	abd	abd
+a(b*|c)d	-	abbd	abbd
+# just gotta have one DFA-buster, of course
+a[ab]{20}	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab
+# and an inline expansion in case somebody gets tricky
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab]	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab
+# and in case somebody just slips in an NFA...
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night)	-	aaaaabaaaabaaaabaaaabweeknights	aaaaabaaaabaaaabaaaabweeknights
+# fish for anomalies as the number of states passes 32
+12345678901234567890123456789	-	a12345678901234567890123456789b	12345678901234567890123456789
+123456789012345678901234567890	-	a123456789012345678901234567890b	123456789012345678901234567890
+1234567890123456789012345678901	-	a1234567890123456789012345678901b	1234567890123456789012345678901
+12345678901234567890123456789012	-	a12345678901234567890123456789012b	12345678901234567890123456789012
+123456789012345678901234567890123	-	a123456789012345678901234567890123b	123456789012345678901234567890123
+# and one really big one, beyond any plausible word width
+1234567890123456789012345678901234567890123456789012345678901234567890	-	a1234567890123456789012345678901234567890123456789012345678901234567890b	1234567890123456789012345678901234567890123456789012345678901234567890
+# fish for problems as brackets go past 8
+[ab][cd][ef][gh][ij][kl][mn]	-	xacegikmoq	acegikm
+[ab][cd][ef][gh][ij][kl][mn][op]	-	xacegikmoq	acegikmo
+[ab][cd][ef][gh][ij][kl][mn][op][qr]	-	xacegikmoqy	acegikmoq
+[ab][cd][ef][gh][ij][kl][mn][op][q]	-	xacegikmoqy	acegikmoq
+
+# subtleties of matching
+abc		&	xabcy	abc
+a\(b\)?c\1d	b	acd
+aBc		i	Abc	Abc
+a[Bc]*d		i	abBCcd	abBCcd
+0[[:upper:]]1	&i	0a1	0a1
+0[[:lower:]]1	&i	0A1	0A1
+a[^b]c		&i	abc
+a[^b]c		&i	aBc
+a[^b]c		&i	adc	adc
+[a]b[c]		-	abc	abc
+[a]b[a]		-	aba	aba
+[abc]b[abc]	-	abc	abc
+[abc]b[abd]	-	abd	abd
+a(b?c)+d	-	accd	accd
+(wee|week)(knights|night)	-	weeknights	weeknights
+(we|wee|week|frob)(knights|night|day)	-	weeknights	weeknights
+a[bc]d		-	xyzaaabcaababdacd	abd
+a[ab]c		-	aaabc	abc
+abc		s	abc	abc
+a*		&	b	@b
+
+# Let's have some fun -- try to match a C comment.
+# first the obvious, which looks okay at first glance...
+/\*.*\*/	-	/*x*/	/*x*/
+# but...
+/\*.*\*/	-	/*x*/y/*z*/	/*x*/y/*z*/
+# okay, we must not match */ inside; try to do that...
+/\*([^*]|\*[^/])*\*/	-	/*x*/	/*x*/
+/\*([^*]|\*[^/])*\*/	-	/*x*/y/*z*/	/*x*/
+# but...
+/\*([^*]|\*[^/])*\*/	-	/*x**/y/*z*/	/*x**/y/*z*/
+# and a still fancier version, which does it right (I think)...
+/\*([^*]|\*+[^*/])*\*+/	-	/*x*/	/*x*/
+/\*([^*]|\*+[^*/])*\*+/	-	/*x*/y/*z*/	/*x*/
+/\*([^*]|\*+[^*/])*\*+/	-	/*x**/y/*z*/	/*x**/
+/\*([^*]|\*+[^*/])*\*+/	-	/*x****/y/*z*/	/*x****/
+/\*([^*]|\*+[^*/])*\*+/	-	/*x**x*/y/*z*/	/*x**x*/
+/\*([^*]|\*+[^*/])*\*+/	-	/*x***x/y/*z*/	/*x***x/y/*z*/
+
+# subexpressions
+.*		-	abc	abc	-
+a(b)(c)d	-	abcd	abcd	b,c
+a(((b)))c	-	abc	abc	b,b,b
+a(b|(c))d	-	abd	abd	b,-
+a(b*|c|e)d	-	abbd	abbd	bb
+a(b*|c|e)d	-	acd	acd	c
+a(b*|c|e)d	-	ad	ad	@d
+a(b?)c		-	abc	abc	b
+a(b?)c		-	ac	ac	@c
+a(b+)c		-	abc	abc	b
+a(b+)c		-	abbbc	abbbc	bbb
+a(b*)c		-	ac	ac	@c
+(a|ab)(bc([de]+)f|cde)	-	abcdef	abcdef	a,bcdef,de
+# the regression tester only asks for 9 subexpressions
+a(b)(c)(d)(e)(f)(g)(h)(i)(j)k	-	abcdefghijk	abcdefghijk	b,c,d,e,f,g,h,i,j
+a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l	-	abcdefghijkl	abcdefghijkl	b,c,d,e,f,g,h,i,j,k
+a([bc]?)c	-	abc	abc	b
+a([bc]?)c	-	ac	ac	@c
+a([bc]+)c	-	abc	abc	b
+a([bc]+)c	-	abcc	abcc	bc
+a([bc]+)bc	-	abcbc	abcbc	bc
+a(bb+|b)b	-	abb	abb	b
+a(bbb+|bb+|b)b	-	abb	abb	b
+a(bbb+|bb+|b)b	-	abbb	abbb	bb
+a(bbb+|bb+|b)bb	-	abbb	abbb	b
+(.*).*		-	abcdef	abcdef	abcdef
+(a*)*		-	bc	@b	@b
+
+# do we get the right subexpression when it is used more than once?
+a(b|c)*d	-	ad	ad	-
+a(b|c)*d	-	abcd	abcd	c
+a(b|c)+d	-	abd	abd	b
+a(b|c)+d	-	abcd	abcd	c
+a(b|c?)+d	-	ad	ad	@d
+a(b|c?)+d	-	abcd	abcd	@d
+a(b|c){0,0}d	-	ad	ad	-
+a(b|c){0,1}d	-	ad	ad	-
+a(b|c){0,1}d	-	abd	abd	b
+a(b|c){0,2}d	-	ad	ad	-
+a(b|c){0,2}d	-	abcd	abcd	c
+a(b|c){0,}d	-	ad	ad	-
+a(b|c){0,}d	-	abcd	abcd	c
+a(b|c){1,1}d	-	abd	abd	b
+a(b|c){1,1}d	-	acd	acd	c
+a(b|c){1,2}d	-	abd	abd	b
+a(b|c){1,2}d	-	abcd	abcd	c
+a(b|c){1,}d	-	abd	abd	b
+a(b|c){1,}d	-	abcd	abcd	c
+a(b|c){2,2}d	-	acbd	acbd	b
+a(b|c){2,2}d	-	abcd	abcd	c
+a(b|c){2,4}d	-	abcd	abcd	c
+a(b|c){2,4}d	-	abcbd	abcbd	b
+a(b|c){2,4}d	-	abcbcd	abcbcd	c
+a(b|c){2,}d	-	abcd	abcd	c
+a(b|c){2,}d	-	abcbd	abcbd	b
+a(b+|((c)*))+d	-	abd	abd	@d,@d,-
+# XXX Needs to be checked.
+#a(b+|((c)*))+d	-	abcd	abcd	@d,@d,-
+
+# check out the STARTEND option
+[abc]		&#	a(b)c	b
+[abc]		&#	a(d)c
+[abc]		&#	a(bc)d	b
+[abc]		&#	a(dc)d	c
+.		&#	a()c
+b.*c		&#	b(bc)c	bc
+b.*		&#	b(bc)c	bc
+.*c		&#	b(bc)c	bc
+
+# plain strings, with the NOSPEC flag
+abc		m	abc	abc
+abc		m	xabcy	abc
+abc		m	xyz
+a*b		m	aba*b	a*b
+a*b		m	ab
+""		mC	EMPTY
+
+# cases involving NULs
+aZb		&	a	a
+aZb		&p	a
+aZb		&p#	(aZb)	aZb
+aZ*b		&p#	(ab)	ab
+a.b		&#	(aZb)	aZb
+a.*		&#	(aZb)c	aZb
+
+# word boundaries (ick)
+[[:<:]]a	&	a	a
+[[:<:]]a	&	ba
+[[:<:]]a	&	-a	a
+a[[:>:]]	&	a	a
+a[[:>:]]	&	ab
+a[[:>:]]	&	a-	a
+[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc	abc
+[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc-q	abc
+[[:<:]]a.c[[:>:]]	&	axc-dayc-dazce-abc	axc
+[[:<:]]b.c[[:>:]]	&	a_bxc-byc_d-bzc-q	bzc
+[[:<:]].x..[[:>:]]	&	y_xa_-_xb_y-_xc_-axdc	_xc_
+[[:<:]]a_b[[:>:]]	&	x_a_b
+
+# past problems, and suspected problems
+(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A])	-	A1	A1
+abcdefghijklmnop	i	abcdefghijklmnop	abcdefghijklmnop
+abcdefghijklmnopqrstuv	i	abcdefghijklmnopqrstuv	abcdefghijklmnopqrstuv
+(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN])	-	CC11	CC11
+CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a	-	CC11	CC11
+Char \([a-z0-9_]*\)\[.*	b	Char xyz[k	Char xyz[k	xyz
+a?b	-	ab	ab
+-\{0,1\}[0-9]*$	b	-5	-5
+a*a*a*a*a*a*a*	&	aaaaaa	aaaaaa
diff --git a/posix/tst-rxspencer.c b/posix/tst-rxspencer.c
new file mode 100644
index 0000000000..eed3e1820b
--- /dev/null
+++ b/posix/tst-rxspencer.c
@@ -0,0 +1,515 @@
+/* Regular expression tests.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sys/types.h>
+#include <mcheck.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <getopt.h>
+
+/* Expand, in place, the placeholder letters used by the test file:
+   `N' becomes newline, `T' tab, `S' space and `Z' NUL.  Scanning
+   resumes with the byte following each rewritten character.  */
+static void
+replace_special_chars (char *str)
+{
+  char *hit = strpbrk (str, "NTSZ");
+
+  while (hit != NULL)
+    {
+      if (*hit == 'N')
+	*hit = '\n';
+      else if (*hit == 'T')
+	*hit = '\t';
+      else if (*hit == 'S')
+	*hit = ' ';
+      else if (*hit == 'Z')
+	*hit = '\0';
+
+      hit = strpbrk (hit + 1, "NTSZ");
+    }
+}
+
+/* Rewrite, in place, the word-boundary notation [[:<:]] and [[:>:]]
+   found in STR into \< and \> respectively.  Any other "[[:" sequence
+   is left alone.  */
+static void
+glibc_re_syntax (char *str)
+{
+  /* One past the terminating NUL, so that the moves below carry the
+     terminator along with the tail of the string.  */
+  char *limit = str + strlen (str) + 1;
+  char *cur = str;
+
+  while ((cur = strstr (cur, "[[:")) != NULL)
+    {
+      int is_boundary = ((cur[3] == '<' || cur[3] == '>')
+			 && strncmp (cur + 4, ":]]", 3) == 0);
+
+      if (!is_boundary)
+	{
+	  cur += 3;
+	  continue;
+	}
+
+      /* Seven characters shrink to two: shift the tail left by five.  */
+      cur[0] = '\\';
+      cur[1] = cur[3];
+      memmove (cur + 2, cur + 7, limit - (cur + 7));
+      limit -= 5;
+      cur += 2;
+    }
+}
+
+/* Store at DST the two-byte sequence that stands in for the ASCII
+   letter C and return the position just past it.  If C is not one of
+   the letters handled below nothing is written and DST is returned
+   unchanged.  */
+static char *
+mb_replace (char *dst, const char c)
+{
+  static const struct { char from; char to[2]; } map[] =
+    {
+      /* Replace a with \'a and A with \'A.  */
+      { 'a', { '\xc3', '\xa1' } },
+      { 'A', { '\xc3', '\x81' } },
+      /* Replace b with \v{c} and B with \v{C}.  */
+      { 'b', { '\xc4', '\x8d' } },
+      { 'B', { '\xc4', '\x8c' } },
+      /* Replace c with \v{d} and C with \v{D}.  */
+      { 'c', { '\xc4', '\x8f' } },
+      { 'C', { '\xc4', '\x8e' } },
+      /* Replace d with \'e and D with \'E.  */
+      { 'd', { '\xc3', '\xa9' } },
+      { 'D', { '\xc3', '\x89' } }
+    };
+
+  for (size_t k = 0; k < sizeof (map) / sizeof (map[0]); ++k)
+    if (map[k].from == c)
+      {
+	*dst++ = map[k].to[0];
+	*dst++ = map[k].to[1];
+	break;
+      }
+
+  return dst;
+}
+
+/* Return a malloc'd copy of STR in which every character that occurs
+   in LETTERS has been passed through mb_replace; all other characters
+   are copied unchanged.  Returns NULL if STR is NULL or if the
+   allocation fails.  */
+static char *
+mb_frob_string (const char *str, const char *letters)
+{
+  if (str == NULL)
+    return NULL;
+
+  /* Room for two output bytes per input character plus the NUL.  */
+  char *buf = malloc (2 * strlen (str) + 1);
+  if (buf == NULL)
+    return NULL;
+
+  char *out = buf;
+  const char *in = str;
+  while (*in != '\0')
+    {
+      if (strchr (letters, *in) != NULL)
+	out = mb_replace (out, *in);
+      else
+	*out++ = *in;
+      ++in;
+    }
+  *out = '\0';
+  return buf;
+}
+
+/* Like mb_frob_string, but leave everything between [: and :],
+   [. and .] or [= and =] untouched.  Returns a malloc'd string, or
+   NULL if STR is NULL or the allocation fails.  */
+
+static char *
+mb_frob_pattern (const char *str, const char *letters)
+{
+  if (str == NULL)
+    return NULL;
+
+  char *buf = malloc (2 * strlen (str) + 1);
+  if (buf == NULL)
+    return NULL;
+
+  char *out = buf;
+  int inside = 0;	/* Nonzero while within one of the bracket forms.  */
+
+  for (const char *in = str; *in != '\0'; ++in)
+    {
+      if (!inside && strchr (letters, *in) != NULL)
+	{
+	  out = mb_replace (out, *in);
+	  continue;
+	}
+
+      if (!inside)
+	{
+	  /* `[' followed by `:', `.' or `=' opens a protected region.  */
+	  if (*in == '[' && strchr (":.=", in[1]) != NULL)
+	    inside = 1;
+	}
+      else if (*in == ']' && strchr (":.=", in[-1]) != NULL)
+	/* `]' preceded by `:', `.' or `=' closes it again.  */
+	inside = 0;
+
+      *out++ = *in;
+    }
+  *out = '\0';
+  return buf;
+}
+
+/* Compare submatch IDX of RM, obtained by matching against STRING,
+   with the expectation MATCH:
+     "-"      the submatch must be unset (both offsets -1);
+     "@text"  the submatch must be empty and STRING must continue with
+	      TEXT at that position;
+     other    the submatch must be exactly MATCH.
+   Returns 0 if the expectation holds.  Otherwise prints a message
+   prefixed with FAIL and returns 1.  */
+static int
+check_match (regmatch_t *rm, int idx, const char *string,
+	     const char *match, const char *fail)
+{
+  const regmatch_t *m = &rm[idx];
+
+  if (strcmp (match, "-") == 0)
+    {
+      if (m->rm_so != -1 || m->rm_eo != -1)
+	{
+	  printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
+	  return 1;
+	}
+      return 0;
+    }
+
+  if (m->rm_so == -1 || m->rm_eo == -1)
+    {
+      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
+      return 1;
+    }
+
+  if (match[0] == '@')
+    {
+      const char *rest = match + 1;
+
+      if (m->rm_so != m->rm_eo)
+	{
+	  printf ("%s rm[%d] not empty\n", fail, idx);
+	  return 1;
+	}
+
+      if (strncmp (string + m->rm_so, rest, strlen (rest)) != 0)
+	{
+	  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
+	  return 1;
+	}
+      return 0;
+    }
+
+  regoff_t span = m->rm_eo - m->rm_so;
+  if (span == strlen (match)
+      && strncmp (string + m->rm_so, match, span) == 0)
+    return 0;
+
+  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
+  return 1;
+}
+
+/* Run one test case.
+
+   PATTERN is compiled with CFLAGS.  If EFLAGS is -1 compilation is
+   expected to fail and STRING holds the expected error name without
+   the REG_ prefix.  Otherwise STRING is matched with EFLAGS; EXPECT
+   is the expected overall match (NULL if no match is expected) and
+   MATCHES is an optional comma-separated list of expectations for
+   subexpressions 1 to 9, in the notation understood by check_match.
+   MATCHES is split in place while being checked and restored again.
+
+   Returns 0 on success.  On failure a message prefixed with FAIL is
+   printed and 1 is returned.  */
+static int
+test (const char *pattern, int cflags, const char *string, int eflags,
+      char *expect, char *matches, const char *fail)
+{
+  regex_t re;
+  regmatch_t rm[10];
+  int err = regcomp (&re, pattern, cflags);
+
+  if (err != 0 && eflags != -1)
+    {
+      /* Compilation failed although it was expected to succeed.  */
+      char msg[500];
+      regerror (err, &re, msg, sizeof (msg));
+      printf ("%s regcomp failed: %s\n", fail, msg);
+      return 1;
+    }
+
+  if (err != 0)
+    {
+      /* An error was expected; see whether it is the right one.  */
+#define ERRNAME(x) { REG_##x, #x }
+      static struct { reg_errcode_t code; const char *name; } codes[]
+	= { ERRNAME(NOERROR), ERRNAME(NOMATCH), ERRNAME(BADPAT),
+	    ERRNAME(ECOLLATE), ERRNAME(ECTYPE), ERRNAME(EESCAPE),
+	    ERRNAME(ESUBREG), ERRNAME(EBRACK), ERRNAME(EPAREN),
+	    ERRNAME(EBRACE), ERRNAME(BADBR), ERRNAME(ERANGE),
+	    ERRNAME(ESPACE), ERRNAME(BADRPT) };
+#undef ERRNAME
+
+      for (size_t k = 0; k < sizeof (codes) / sizeof (codes[0]); ++k)
+	{
+	  if (err != codes[k].code)
+	    continue;
+	  if (strcmp (string, codes[k].name) == 0)
+	    return 0;
+	  printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
+		  fail, codes[k].name, string);
+	  return 1;
+	}
+
+      printf ("%s regcomp return value REG_%d\n", fail, err);
+      return 1;
+    }
+
+  if (eflags == -1)
+    {
+      regfree (&re);
+
+      /* The test case file assumes something only guaranteed by the
+	 rxspencer regex implementation.  Namely that for empty
+	 expressions regcomp() return REG_EMPTY.  This is not the case
+	 for us and so we ignore this error.  */
+      if (strcmp (string, "EMPTY") == 0)
+	return 0;
+
+      printf ("%s regcomp unexpectedly succeeded\n", fail);
+      return 1;
+    }
+
+  int nomatch = regexec (&re, string, 10, rm, eflags);
+  regfree (&re);
+
+  if (nomatch)
+    {
+      if (expect == NULL)
+	return 0;
+      printf ("%s regexec failed\n", fail);
+      return 1;
+    }
+
+  if (expect == NULL)
+    {
+      printf ("%s regexec unexpectedly succeeded\n", fail);
+      return 1;
+    }
+
+  /* With REG_NOSUB there are no offsets to verify.  */
+  if (cflags & REG_NOSUB)
+    return 0;
+
+  int status = check_match (rm, 0, string, expect, fail);
+  if (matches == NULL)
+    return status;
+
+  /* Walk the comma-separated list; once it is exhausted the remaining
+     subexpressions are expected to be unset.  */
+  char *field = matches;
+  for (int sub = 1; sub < 10 && status == 0; ++sub)
+    {
+      if (field == NULL)
+	{
+	  status = check_match (rm, sub, string, "-", fail);
+	  continue;
+	}
+
+      char *comma = strchr (field, ',');
+      if (comma != NULL)
+	*comma = '\0';
+
+      status = check_match (rm, sub, string, field, fail);
+
+      if (comma != NULL)
+	{
+	  *comma = ',';
+	  field = comma + 1;
+	}
+      else
+	field = NULL;
+    }
+
+  return status;
+}
+
+/* Run one test case after replacing the characters listed in LETTERS
+   with two-byte sequences.
+
+   PATTERN is converted with mb_frob_pattern; EXPECT and MATCHES with
+   mb_frob_string.  STRING is converted too unless EFLAGS is -1, in
+   which case it is passed to test unchanged.  The converted arguments
+   are handed to test together with CFLAGS, EFLAGS and FAIL, and the
+   temporary copies are freed afterwards.
+
+   Returns the result of test, or 1 if one of the copies could not be
+   allocated.  */
+static int
+mb_test (const char *pattern, int cflags, const char *string, int eflags,
+	 char *expect, const char *matches, const char *letters,
+	 const char *fail)
+{
+  char *pattern_mb = mb_frob_pattern (pattern, letters);
+  const char *string_mb
+    = eflags == -1 ? string : mb_frob_string (string, letters);
+  char *expect_mb = mb_frob_string (expect, letters);
+  char *matches_mb = mb_frob_string (matches, letters);
+  int ret = 0;
+
+  /* EXPECT and MATCHES may legitimately be NULL; only a NULL copy of a
+     non-NULL original indicates an allocation failure.  */
+  if (!pattern_mb || !string_mb
+      || (expect && !expect_mb) || (matches && !matches_mb))
+    {
+      /* Terminate the line so the message does not run into the
+	 output that follows.  */
+      printf ("%s %m\n", fail);
+      ret = 1;
+    }
+  else
+    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
+		matches_mb, fail);
+
+  free (matches_mb);
+  free (expect_mb);
+  /* STRING_MB aliases STRING when no conversion was done.  */
+  if (string_mb != string)
+    free ((char *) string_mb);
+  free (pattern_mb);
+  return ret;
+}
+
+/* Run mb_test once for every non-empty combination of the letter
+   pairs a/A, b/B, c/C and d/D.  The arguments are passed through to
+   mb_test unchanged.  Returns nonzero if any run failed.  */
+static int
+mb_tests (const char *pattern, int cflags, const char *string, int eflags,
+	  char *expect, const char *matches)
+{
+  static const char pairs[4][2]
+    = { { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' } };
+  char letters[9], fail[20];
+  int failed = 0;
+
+  /* The tests aren't supposed to work with xdigit, since a-dA-D are
+     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
+  if (strstr (pattern, "[:xdigit:]"))
+    return 0;
+
+  for (int mask = 1; mask < 16; ++mask)
+    {
+      char *end = letters;
+
+      /* Bit N of MASK selects pair N.  */
+      for (int bit = 0; bit < 4; ++bit)
+	if (mask & (1 << bit))
+	  {
+	    *end++ = pairs[bit][0];
+	    *end++ = pairs[bit][1];
+	  }
+      *end = '\0';
+
+      sprintf (fail, "UTF-8 %s FAIL", letters);
+      failed |= mb_test (pattern, cflags, string, eflags, expect, matches,
+			 letters, fail);
+    }
+  return failed;
+}
+
+/* Run the regex test cases listed in the file named on the command
+   line.
+
+   Usage: tst-rxspencer [-u | --utf8] TESTFILE
+
+   Every line of TESTFILE that is neither empty nor a `#' comment is
+   echoed and split on tabs into pattern, flags, string and the
+   optional expected match and subexpression list.  Lines using the
+   unsupported flags `m', `p' or `#' are skipped.  Each case is run
+   through test in the "C" locale; with the `&' flag it is run both as
+   ERE and as BRE.  If that passes and -u/--utf8 was given, the case is
+   run again in the cs_CZ.UTF-8 locale, first unchanged and then via
+   mb_tests.
+
+   Returns 0 if all executed cases passed, 1 otherwise or on a usage
+   or file error.  */
+int
+main (int argc, char **argv)
+{
+  int ret = 0;
+  int opt;
+  char *line = NULL;
+  size_t line_len = 0;
+  ssize_t len;
+  FILE *f;
+  static int test_utf8 = 0;
+  static const struct option options[] =
+    {
+      {"utf8",	no_argument,	&test_utf8,	1},
+      {NULL,	0,		NULL,		0 }
+    };
+
+  /* --utf8 sets test_utf8 through the flag pointer above.  The short
+     form -u is merely returned as 'u', so it is handled here.  */
+  while ((opt = getopt_long (argc, argv, "u", options, NULL)) >= 0)
+    if (opt == 'u')
+      test_utf8 = 1;
+
+  if (optind + 1 != argc)
+    {
+      fprintf (stderr, "Missing test filename\n");
+      return 1;
+    }
+
+  f = fopen (argv[optind], "r");
+  if (f == NULL)
+    {
+      /* Report the name that was actually opened; argv[1] may be an
+	 option.  */
+      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
+      return 1;
+    }
+
+  while ((len = getline (&line, &line_len, f)) > 0)
+    {
+      char *pattern, *flagstr, *string, *expect, *matches, *p;
+      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
+
+      if (line[len - 1] == '\n')
+        line[len - 1] = '\0';
+
+      /* Skip comments and empty lines.  */
+      if (*line == '#' || *line == '\0')
+	continue;
+
+      puts (line);
+      fflush (stdout);
+
+      pattern = strtok (line, "\t");
+      if (pattern == NULL)
+        continue;
+
+      /* "" stands for an empty field.  */
+      if (strcmp (pattern, "\"\"") == 0)
+	pattern += 2;
+
+      flagstr = strtok (NULL, "\t");
+      if (flagstr == NULL)
+        continue;
+
+      string = strtok (NULL, "\t");
+      if (string == NULL)
+        continue;
+
+      if (strcmp (string, "\"\"") == 0)
+	string += 2;
+
+      for (p = flagstr; *p; ++p)
+	switch (*p)
+	  {
+	  case '-':
+	    break;
+	  case 'b':
+	    cflags &= ~REG_EXTENDED;
+	    break;
+	  case '&':
+	    try_bre_ere = 1;
+	    break;
+	  case 'C':
+	    /* Marks "regcomp is expected to fail" for test.  */
+	    eflags = -1;
+	    break;
+	  case 'i':
+	    cflags |= REG_ICASE;
+	    break;
+	  case 's':
+	    cflags |= REG_NOSUB;
+	    break;
+	  case 'n':
+	    cflags |= REG_NEWLINE;
+	    break;
+	  case '^':
+	    eflags |= REG_NOTBOL;
+	    break;
+	  case '$':
+	    eflags |= REG_NOTEOL;
+	    break;
+	  case 'm':
+	  case 'p':
+	  case '#':
+	    /* Not supported.  */
+	    flagstr = NULL;
+	    break;
+	  }
+
+      if (flagstr == NULL)
+	continue;
+
+      replace_special_chars (pattern);
+      glibc_re_syntax (pattern);
+      /* For `C' cases STRING is an error name and must stay as is.  */
+      if (eflags != -1)
+        replace_special_chars (string);
+
+      expect = strtok (NULL, "\t");
+      matches = NULL;
+      if (expect != NULL)
+        {
+	  replace_special_chars (expect);
+	  matches = strtok (NULL, "\t");
+	  if (matches != NULL)
+	    replace_special_chars (matches);
+        }
+
+      setlocale (LC_ALL, "C");
+      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
+	  || (try_bre_ere
+	      && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
+		       expect, matches, "FAIL")))
+	ret = 1;
+      else if (test_utf8)
+	{
+	  setlocale (LC_ALL, "cs_CZ.UTF-8");
+	  if (test (pattern, cflags, string, eflags, expect, matches,
+		    "UTF-8 FAIL")
+	      || (try_bre_ere
+		  && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
+			   expect, matches, "UTF-8 FAIL")))
+	    ret = 1;
+	  else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
+		   || (try_bre_ere
+		       && mb_tests (pattern, cflags & ~REG_EXTENDED, string,
+				    eflags, expect, matches)))
+	    ret = 1;
+	}
+    }
+
+  /* getline allocated (and possibly grew) the line buffer.  */
+  free (line);
+  fclose (f);
+  return ret;
+}
author	Ulrich Drepper <drepper@redhat.com>	2003-11-13 20:52:55 +0000
committer	Ulrich Drepper <drepper@redhat.com>	2003-11-13 20:52:55 +0000
commit	78c81ab7b4a25563697ce988ecff73c9937cef16 (patch)
tree	f001077b4dbd1250b2a747ceef6d6ca3f6b21830 /posix
parent	78d8b07a44111d861be5f54847faccbc1219c3e7 (diff)
download	glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.tar.gz glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.tar.xz glibc-78c81ab7b4a25563697ce988ecff73c9937cef16.zip