diff options
Diffstat (limited to 'posix/tst-fnmatch.input')
-rw-r--r-- | posix/tst-fnmatch.input | 125 |
1 files changed, 83 insertions, 42 deletions
diff --git a/posix/tst-fnmatch.input b/posix/tst-fnmatch.input index 589fb2a940..dc2ca8d01a 100644 --- a/posix/tst-fnmatch.input +++ b/posix/tst-fnmatch.input @@ -23,6 +23,63 @@ # wording describing the situations to be tested. It does not specify # any specific tests. I.e., the tests below are in no case sufficient. # They are hopefully necessary, though. +# +# See: +# +# http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xbd_chap09.html +# +# > RE Bracket Expression +# > +# > Range expressions are, historically, an integral part of REs. +# > However, the requirements of "natural language behavior" and +# > portability do conflict. In the POSIX locale, ranges must be treated +# > according to the collating sequence and include such characters that +# > fall within the range based on that collating sequence, regardless +# > of character values. In other locales, ranges have unspecified behavior. +# > ... +# > The current standard leaves unspecified the behavior of a range +# > expression outside the POSIX locale. This makes it clearer that +# > conforming applications should avoid range expressions outside the +# > POSIX locale, and it allows implementations and compatible user-mode +# > matchers to interpret range expressions using native order, CEO, +# > collation sequence, or other, more advanced techniques. The concerns +# > which led to this change were raised in IEEE PASC interpretation +# > 1003.2 #43 and others, and related to ambiguities in the +# > specification of how multi-character collating elements should be +# > handled in range expressions. These ambiguities had led to multiple +# > interpretations of the specification, in conflicting ways, which led +# > to varying implementations. As noted above, efforts were made to +# > resolve the differences, but no solution has been found that would +# > be specific enough to allow for portable software while not +# > invalidating existing implementations. 
+# +# Therefore, using [a-z] does not make much sense except in the C/POSIX locale. +# The new iso14651_t1_common lists upper case and lower case Latin characters +# in a different order than the old one which causes surprising results +# for example in the de_DE locale: [a-z] now includes A because A comes +# after a in iso14651_t1_common but does not include Z because that comes +# after z in iso14651_t1_common. +# +# This led to several bugs and problems with user scripts that do not +# expect [a-z] to match uppercase characters. +# +# See the following bugs: +# https://sourceware.org/bugzilla/show_bug.cgi?id=23393 +# https://sourceware.org/bugzilla/show_bug.cgi?id=23420 +# +# No consensus exists on how best to handle the changes so the +# iso14651_t1_common collation element order (CEO) has been changed to +# deinterlace the a-z and A-Z regions. +# +# With the deinterlacing, commit ac3a3b4b0d561d776b60317d6a926050c8541655 +# could be reverted to re-test the correct non-interleaved expectations. +# +# Please note that despite the region being deinterlaced, the ordering +# of collation remains the same. In glibc we implement CEO and because of +# that we can reorder the elements to reorder ranges without impacting +# collation which depends on weights. The collation element ordering +# could have been changed to include just a-z, A-Z, and 0-9 in three +# distinct blocks, but this needs more discussion by the community. # B.6 004(C) C "!#%+,-./01234567889" "!#%+,-./01234567889" 0 @@ -418,47 +475,21 @@ C "-" "[Z-\\]]" NOMATCH # Following are tests outside the scope of IEEE 2003.2 since they are using # locales other than the C locale. The main focus of the tests is on the # handling of ranges and the recognition of character (vs bytes). -# -# See: -# -# http://pubs.opengroup.org/onlinepubs/7908799/xbd/re.html -# -# > A range expression represents the set of collating elements that fall -# > between two elements in the current collation sequence, -# > inclusively. 
It is expressed as the starting point and the ending -# > point separated by a hyphen (-). -# > -# > Range expressions must not be used in portable applications because -# > their behaviour is dependent on the collating sequence. Ranges will be -# > treated according to the current collating sequence, and include such -# > characters that fall within the range based on that collating -# > sequence, regardless of character values. This, however, means that -# > the interpretation will differ depending on collating sequence. If, -# > for instance, one collating sequence defines ä as a variant of a, -# > while another defines it as a letter following z, then the expression -# > [ä-z] is valid in the first language and invalid in the second. -# -# Therefore, using [a-z] does not make much sense except in the C/POSIX locale. -# The new iso14651_t1_common lists upper case and lower case Latin characters -# in a different order than the old one which causes surprising results -# for example in the de_DE locale: [a-z] now includes A because A comes -# after a in iso14651_t1_common but does not include Z because that comes -# after z in iso14651_t1_common. de_DE.ISO-8859-1 "a" "[a-z]" 0 de_DE.ISO-8859-1 "z" "[a-z]" 0 de_DE.ISO-8859-1 "ä" "[a-z]" 0 de_DE.ISO-8859-1 "ö" "[a-z]" 0 de_DE.ISO-8859-1 "ü" "[a-z]" 0 -de_DE.ISO-8859-1 "A" "[a-z]" 0 # surprising but correct! +de_DE.ISO-8859-1 "A" "[a-z]" NOMATCH de_DE.ISO-8859-1 "Z" "[a-z]" NOMATCH -de_DE.ISO-8859-1 "Ä" "[a-z]" 0 # surprising but correct! -de_DE.ISO-8859-1 "Ö" "[a-z]" 0 # surprising but correct! -de_DE.ISO-8859-1 "Ü" "[a-z]" 0 # surprising but correct! +de_DE.ISO-8859-1 "Ä" "[a-z]" NOMATCH +de_DE.ISO-8859-1 "Ö" "[a-z]" NOMATCH +de_DE.ISO-8859-1 "Ü" "[a-z]" NOMATCH de_DE.ISO-8859-1 "a" "[A-Z]" NOMATCH -de_DE.ISO-8859-1 "z" "[A-Z]" 0 # surprising but correct! -de_DE.ISO-8859-1 "ä" "[A-Z]" 0 # surprising but correct! -de_DE.ISO-8859-1 "ö" "[A-Z]" 0 # surprising but correct! 
-de_DE.ISO-8859-1 "ü" "[A-Z]" 0 # surprising but correct! +de_DE.ISO-8859-1 "z" "[A-Z]" NOMATCH +de_DE.ISO-8859-1 "ä" "[A-Z]" NOMATCH +de_DE.ISO-8859-1 "ö" "[A-Z]" NOMATCH +de_DE.ISO-8859-1 "ü" "[A-Z]" NOMATCH de_DE.ISO-8859-1 "A" "[A-Z]" 0 de_DE.ISO-8859-1 "Z" "[A-Z]" 0 de_DE.ISO-8859-1 "Ä" "[A-Z]" 0 @@ -536,21 +567,31 @@ de_DE.ISO-8859-1 "ba" "[[.a.]]a" NOMATCH # And with a multibyte character set. +en_US.UTF-8 "a" "[a-z]" 0 +en_US.UTF-8 "z" "[a-z]" 0 +en_US.UTF-8 "A" "[a-z]" NOMATCH +en_US.UTF-8 "Z" "[a-z]" NOMATCH +en_US.UTF-8 "a" "[A-Z]" NOMATCH +en_US.UTF-8 "z" "[A-Z]" NOMATCH +en_US.UTF-8 "A" "[A-Z]" 0 +en_US.UTF-8 "Z" "[A-Z]" 0 +en_US.UTF-8 "0" "[0-9]" 0 +en_US.UTF-8 "9" "[0-9]" 0 de_DE.UTF-8 "a" "[a-z]" 0 de_DE.UTF-8 "z" "[a-z]" 0 de_DE.UTF-8 "ä" "[a-z]" 0 de_DE.UTF-8 "ö" "[a-z]" 0 de_DE.UTF-8 "ü" "[a-z]" 0 -de_DE.UTF-8 "A" "[a-z]" 0 # surprising but correct! +de_DE.UTF-8 "A" "[a-z]" NOMATCH de_DE.UTF-8 "Z" "[a-z]" NOMATCH -de_DE.UTF-8 "Ä" "[a-z]" 0 # surprising but correct! -de_DE.UTF-8 "Ö" "[a-z]" 0 # surprising but correct! -de_DE.UTF-8 "Ü" "[a-z]" 0 # surprising but correct! +de_DE.UTF-8 "Ä" "[a-z]" NOMATCH +de_DE.UTF-8 "Ö" "[a-z]" NOMATCH +de_DE.UTF-8 "Ü" "[a-z]" NOMATCH de_DE.UTF-8 "a" "[A-Z]" NOMATCH -de_DE.UTF-8 "z" "[A-Z]" 0 # surprising but correct! -de_DE.UTF-8 "ä" "[A-Z]" 0 # surprising but correct! -de_DE.UTF-8 "ö" "[A-Z]" 0 # surprising but correct! -de_DE.UTF-8 "ü" "[A-Z]" 0 # surprising but correct! +de_DE.UTF-8 "z" "[A-Z]" NOMATCH +de_DE.UTF-8 "ä" "[A-Z]" NOMATCH +de_DE.UTF-8 "ö" "[A-Z]" NOMATCH +de_DE.UTF-8 "ü" "[A-Z]" NOMATCH de_DE.UTF-8 "A" "[A-Z]" 0 de_DE.UTF-8 "Z" "[A-Z]" 0 de_DE.UTF-8 "Ä" "[A-Z]" 0 |