From 09bc7ee2b56239020d1ef6bb1f5d7b4ef9f19db1 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Tue, 25 Jul 2006 18:10:37 +0000 Subject: 22557: turn on multibyte option by default --- ChangeLog | 5 +++++ Doc/Zsh/options.yo | 30 +++++++++++++++++++++--------- Misc/globtests | 1 - Src/options.c | 8 +++++++- Src/pattern.c | 4 ++-- Test/D02glob.ztst | 9 +++++++-- Test/D07multibyte.ztst | 38 +++++++++++++++++++++++++++++++++++++- 7 files changed, 79 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 62f8cec19..cfc6373e3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2006-07-25 Peter Stephenson + * 22557: Doc/Zsh/options.yo, Misc/globtests, Src/options.c, + Src/pattern.c, Test/D02glob.ztst, Test/D07multibyte.ztst: + Turn on multibyte option by default for MULTIBYTE_SUPPORT and fix + tests and patterns. + * unposted: Src/pattern.c, Src/utils.c: minor typos in 22556 found when MULTIBYTE_SUPPORT is not defined. diff --git a/Doc/Zsh/options.yo b/Doc/Zsh/options.yo index 589ed79cb..02d8fa046 100644 --- a/Doc/Zsh/options.yo +++ b/Doc/Zsh/options.yo @@ -411,19 +411,31 @@ item(tt(MARK_DIRS) (tt(-8), ksh: tt(-X)))( Append a trailing `tt(/)' to all directory names resulting from filename generation (globbing). ) -pindex(MULTIBYTE) +pindex(MULTIBYTE ) cindex(characters, multibyte, in expansion and globbing) cindex(multibyte characters, in expansion and globbing) item(tt(MULTIBYTE))( -Respect multibyte characters when found during pattern matching. -When this option is set, characters strings are examined using the +Respect multibyte characters when found in strings. +When this option is set, strings are examined using the system library to determine how many bytes form a character, depending -on the current locale. If the option is unset -(or the shell was not compiled with the configuration option -tt(MULTIBYTE_SUPPORT)) a single byte is always treated as a single -character. The option will eventually be extended to cover expansion. 
-Note, however, that it does not affect the shell's editor, which always -uses the locale to determine multibyte characters. +on the current locale. This affects the way characters are counted in +pattern matching, parameter values and various delimiters. + +The option is on by default if the shell was compiled with +tt(MULTIBYTE_SUPPORT); otherwise it is off by default and has no effect if +turned on. + +If the option is off a single byte is always treated as a single +character. This setting is designed purely for examining strings +known to contain raw bytes or other values that may not be characters +in the current locale. It is not necessary to unset the option merely +because the character set for the current locale does not contain multibyte +characters. + +The option does not affect the shell's editor, which always uses the +locale to determine multibyte characters. This is because +the character set displayed by the terminal emulator is independent of +shell settings. ) pindex(NOMATCH) cindex(globbing, no matches) diff --git a/Misc/globtests b/Misc/globtests index 232fe3daa..a5f7c4a00 100755 --- a/Misc/globtests +++ b/Misc/globtests @@ -182,6 +182,5 @@ f atest/path *((#s)|/)test((#e)|/)* f path/testy *((#s)|/)test((#e)|/)* f path/testy/ohyes *((#s)|/)test((#e)|/)* f path/atest/ohyes *((#s)|/)test((#e)|/)* -t bjrn *[]* EOT print "$failed tests failed." 
diff --git a/Src/options.c b/Src/options.c index 307bd5430..05e878687 100644 --- a/Src/options.c +++ b/Src/options.c @@ -166,7 +166,13 @@ static struct optname optns[] = { {{NULL, "markdirs", 0}, MARKDIRS}, {{NULL, "menucomplete", 0}, MENUCOMPLETE}, {{NULL, "monitor", OPT_SPECIAL}, MONITOR}, -{{NULL, "multibyte", 0/*TBD*/}, MULTIBYTE}, +{{NULL, "multibyte", +#ifdef MULTIBYTE_SUPPORT + OPT_ALL +#else + 0 +#endif + }, MULTIBYTE}, {{NULL, "multios", OPT_EMULATE|OPT_ZSH}, MULTIOS}, {{NULL, "nomatch", OPT_EMULATE|OPT_NONBOURNE},NOMATCH}, {{NULL, "notify", OPT_ZSH}, NOTIFY}, diff --git a/Src/pattern.c b/Src/pattern.c index 24077768d..9ae00ca94 100644 --- a/Src/pattern.c +++ b/Src/pattern.c @@ -343,7 +343,7 @@ metacharinc(char **x) /* Error. Treat as single byte. */ /* Reset the shift state for next time. */ memset(&shiftstate, 0, sizeof(shiftstate)); - return (wchar_t) *(*x)++; + return (wchar_t) STOUC(*(*x)++); } #else @@ -595,7 +595,7 @@ patcompile(char *exp, int inflags, char **endexp) while (oplen--) { if (imeta(*opnd)) { *dst++ = Meta; - *dst++ = *opnd ^ 32; + *dst++ = *opnd++ ^ 32; } else { *dst++ = *opnd++; } diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst index 409a73e30..7c76414f0 100644 --- a/Test/D02glob.ztst +++ b/Test/D02glob.ztst @@ -6,7 +6,9 @@ mkdir glob.tmp/dir3/subdir : >glob.tmp/{,{dir1,dir2}/}{a,b,c} - globtest () { $ZTST_testdir/../Src/zsh -f $ZTST_srcdir/../Misc/$1 } + globtest () { + $ZTST_testdir/../Src/zsh -f $ZTST_srcdir/../Misc/$1 + } regress_absolute_path_and_core_dump() { local absolute_dir=$(cd glob.tmp && pwd -P) @@ -175,7 +177,6 @@ >1: [[ path/testy = *((#s)|/)test((#e)|/)* ]] >1: [[ path/testy/ohyes = *((#s)|/)test((#e)|/)* ]] >1: [[ path/atest/ohyes = *((#s)|/)test((#e)|/)* ]] ->0: [[ bjrn = *[]* ]] >0 tests failed. globtest globtests.ksh @@ -263,6 +264,10 @@ >0: [[ Modules = (#i)*m* ]] >0 tests failed. 
+ (unsetopt multibyte + [[ bjrn = *[]* ]]) +0:single byte match with top bit set + ( regress_absolute_path_and_core_dump ) 0:exclusions regression test > diff --git a/Test/D07multibyte.ztst b/Test/D07multibyte.ztst index 683e8350e..263a7a44e 100644 --- a/Test/D07multibyte.ztst +++ b/Test/D07multibyte.ztst @@ -176,7 +176,7 @@ ?(eval):1: command not found: hähä=3 foo="Ølaf«Ødd«øpénëd«ån«àpple" - print -l ${(s.«.)foo} + print -l ${(s.«.)foo} ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." print -l ${=ioh} print ${(w)#ioh} @@ -228,3 +228,39 @@ 0:read multibyte characters <«»ignored >«» + + # See if the system grokks first-century Greek... + ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος." + for (( i = 1; i <= ${#ioh}; i++ )); do + # FC3 doesn't recognise ῇ (U+1FC7: Greek small letter eta with + # perispomeni and ypogegrammeni, of course) as a lower case character. + if [[ $ioh[i] != [[:lower:]] && $i != 7 ]]; then + for tp in upper space punct invalid; do + if [[ $tp = invalid || $ioh[i] = [[:${tp}:]] ]]; then + print "$i: $tp" + break + fi + done + fi + done +0:isw* functions on non-ASCII wide characters +>1: upper +>3: space +>8: space +>11: space +>13: space +>19: punct +>20: space +>24: space +>26: space +>32: space +>35: space +>40: space +>44: space +>49: punct +>50: space +>54: space +>59: space +>62: space +>64: space +>70: punct -- cgit 1.4.1