From b5a83cc7549e48a82dd57859f40a8f282f1534d0 Mon Sep 17 00:00:00 2001 From: Peter Stephenson Date: Thu, 15 Dec 2005 10:38:55 +0000 Subject: users/9788: add (oN) glob qualifier for no sorting 22076: more documentation for multibyte handling --- ChangeLog | 8 +++ Doc/Zsh/expn.yo | 6 +- Etc/FAQ.yo | 190 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- INSTALL | 13 +++- Src/glob.c | 47 +++++++++----- 5 files changed, 240 insertions(+), 24 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1e8cd1e33..2e011e029 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2005-12-15 Peter Stephenson + + * 22076: INSTALL, Etc/FAQ.yo: more information on multibyte + handling. + + * users/9788: Doc/Zsh/expn.yo, Src/glob.c: add (oN) qualifier + for no sorting. + 2005-12-14 Bart Schaefer * 21814: Src/loop.c, Src/signals.c: if an error occurs in an diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo index b8aa1cfd8..35451901f 100644 --- a/Doc/Zsh/expn.yo +++ b/Doc/Zsh/expn.yo @@ -1958,11 +1958,13 @@ they are sorted by the time of the last access, modification, or inode change respectively; if tt(d), files in subdirectories appear before those in the current directory at each level of the search DASH()- this is best combined with other criteria, for example `tt(odon)' to sort on names for -files within the same directory. Note that tt(a), tt(m), and tt(c) compare +files within the same directory; if tt(N), no sorting is performed. +Note that tt(a), tt(m), and tt(c) compare the age against the current time, hence the first name in the list is the youngest file. Also note that the modifiers tt(^) and tt(-) are used, so `tt(*(^-oL))' gives a list of all files sorted by file size in descending -order, following any symbolic links. +order, following any symbolic links. Unless tt(oN) is used, multiple order +specifiers may occur to resolve ties. ) item(tt(O)var(c))( like `tt(o)', but sorts in descending order; i.e. 
`tt(*(^oc))' is the diff --git a/Etc/FAQ.yo b/Etc/FAQ.yo index 3546e9d13..4d098e840 100644 --- a/Etc/FAQ.yo +++ b/Etc/FAQ.yo @@ -43,11 +43,11 @@ whenlatex(report(ARG1)(ARG2)(ARG3))\ whenman(report(ARG1)(ARG2)(ARG3))\ whenms(report(ARG1)(ARG2)(ARG3))\ whensgml(report(ARG1)(ARG2)(ARG3))) -myreport(Z-Shell Frequently-Asked Questions)(Peter Stephenson)(2005/07/18) +myreport(Z-Shell Frequently-Asked Questions)(Peter Stephenson)(2005/12/14) COMMENT(-- the following are for Usenet and must appear first)\ description(\ mydit(Archive-Name:) unix-faq/shell/zsh -mydit(Last-Modified:) 2005/07/18 +mydit(Last-Modified:) 2005/12/14 mydit(Submitted-By:) email(pws@pwstephenson.fsnet.co.uk (Peter Stephenson)) mydit(Posting-Frequency:) Monthly mydit(Copyright:) (C) P.W. Stephenson, 1995--2005 (see end of document) @@ -126,11 +126,18 @@ Chapter 4: The mysteries of completion 4.5. How do I get started with programmable completion? 4.6. Suppose I want to complete all files during a special completion? -Chapter 5: The future of zsh -5.1. What bugs are currently known and unfixed? (Plus recent important changes) -5.2. Where do I report bugs, get more info / who's working on zsh? -5.3. What's on the wish-list? -5.4. Did zsh have problems in the year 2000? +Chapter 5: Multibyte input + +5.1. What is multibyte input? +5.2. How does zsh handle multibyte input? +5.3. How do I ensure multibyte input works on my system? +5.4. How can I input characters that aren't on my keyboard? + +Chapter 6: The future of zsh +6.1. What bugs are currently known and unfixed? (Plus recent important changes) +6.2. Where do I report bugs, get more info / who's working on zsh? +6.3. What's on the wish-list? +6.4. Did zsh have problems in the year 2000? Acknowledgments @@ -1945,6 +1952,175 @@ sect(Suppose I want to complete all files during a special completion?) such as expansion or approximate completion. +chapter(Multibyte input) + +sect(What is multibyte input?) 
+ + For a long time computers had a simple idea of a character: each octet + (8-bit byte) of text contained one character. This meant an application + could only use 256 characters at once. The first 128 characters (0 to + 127) on Unix and similar systems usually corresponded to the ASCII + character set, as they still do. So all other possibilities had to be + crammed into the remaining 128. This was done by picking the appropriate + character set for the use you were making. For example, ISO 8859 + specified a set of extensions to ASCII for various alphabets. + + This was fine for simple extensions and certain short enough relatives of + the Latin alphabet (with no more than a few dozen alphabetic characters), + but useless for complex alphabets. Also, having a different character + set for each language is inconvenient: you have to start a new terminal + to run the shell with each character set. So the character set had to be + extended. To cut a long story short, the world has mostly standardised + on a character set called Unicode, related to the international standard + ISO 10646. The intention is that this will contain every single + character used in all the languages of the world. + + This has far too many characters to fit into a single octet. What's + more, UNIX utilities such as zsh are so used to dealing with ASCII that + removing it would cause no end of trouble. So what happens is this: the + 128 ASCII characters are kept exactly the same (and they're the same as + the first 128 characters of Unicode), but the remaining 128 characters + are used to build up any other Unicode character by combining multiple + octets together. The shell doesn't need to interpret these directly; it + just needs to ask the system library how many octets form the next + character, and if there's a valid character there at all. 
(It can also + ask the system what width the character takes up on the screen, so that + characters no longer need to be exactly one position wide.) + + The way this is done is called UTF-8. Multibyte encodings of other + character sets exist (you might encounter them for Asian character sets); + zsh will be able to use any such encoding as long as it contains ASCII as + a single-octet subset and the system can provide information about other + characters. However, in the case of Unicode, UTF-8 is the only one you + are likely to encounter. + + (In case you're confused: Unicode is the character set, while UTF-8 is + an encoding of it. You might hear about other encodings, such as UCS-2 + and UCS-4 which are basically the character's index in the character set + as a two-octet or four-octet integer. You might see files encoded this + way, for example on Windows, but the shell can't deal directly with text + in those formats.) + + +sect(How does zsh handle multibyte input?) + + Until version 4.3, zsh didn't handle multibyte input properly at all. + Each octet in a multibyte character would look to the shell like a + separate character. If your terminal handled the character set, + characters might appear correct on screen, but trying to edit them would + cause all sorts of odd effects. (It was possible to edit in zsh using + single-byte extensions of ASCII such as the ISO 8859 family, however.) + + From version 4.3, multibyte input is handled in the line editor if zsh + has been compiled with the appropriate definitions. This will happen + automatically if the compiler defines __STDC_ISO_10646__, which is true + for many recent GNU-based systems. On other systems you must run + configure with the argument --enable-multibyte. (The reason for + this is that the presence of __STDC_ISO_10646__ ensures all the required + library support is present, short-circuiting a large number of + configuration tests.) 
Explicit use of --enable-multibyte should work on + many other recent UNIX systems; if it works on yours, and that's not + mentioned in the shell documentation, please report this to + zsh-workers@sunsite.dk, and if it doesn't but you can work out why not + we'd also be interested in hearing. + + You can test if multibyte handling is compiled into your version of the + shell by running: + verb( + (bindkey -m) + ) + which should output a warning: + verb( + bindkey: warning: `bindkey -m' disables multibyte support + ) + If it doesn't, you don't have multibyte support in your shell. The + parentheses are there to run the command in a subshell, which protects + your interactive shell from the effects being warned about. + + Multibyte strings are not yet handled anywhere else in the shell. This + means, for example, patterns treat multibyte characters as a set of single + octets and the ${#var} syntax counts octets, not characters. There will + probably be new syntax to ensure that zsh can work both in its traditional + way as well as when interpreting multibyte characters. + + +sect(How do I ensure multibyte input works on my system?) + + Once you have a version of zsh with multibyte support, you need to + ensure the environment is correct. We'll assume you're using UTF-8. + Many modern systems may come set up correctly already. Try one of + the editing widgets described in the next section to see. + + There are basically three components. + + itemize( + it() The locale. This describes a whole series of features specific + to countries or regions of which the character set is one. Usually + it is controlled by the environment variable tt(LANG) (there are + others but this is the one to start with). You need to find a + locale whose name contains mytt(UTF-8). This will be a variant on + your usual locale, which typically indicates the language and + country; for example, mine is mytt(en_GB.UTF-8). 
Luckily, zsh can + complete locale names, so if you have the new completion system + loaded you can type mytt(export LANG=) and attempt to complete a + suitable locale. It's the locale that tells the shell to expect the + right form of multibyte input. (However, there's no guarantee that + the shell is actually going to get this input: for example, if you + edit file names that have been created using a different character + set it won't work properly.) + it() The terminal emulator. Those that are supplied with a recent + desktop environment, such as gnome-terminal, are likely to have + extensive support for localization and may work correctly as soon + as they know the locale. + it() The font. If you selected this from a menu in your terminal + emulator, there's a good chance it already selected the right + character set to go with it. If you hand-picked an old fashioned + X font with a lot of dashes, you need to make sure it ends with + the right character encoding, mytt(iso10646-1) (and not, for + example, mytt(iso8859-1)). Not all characters will be available + in any font, and some fonts may have a more restricted range of + Unicode characters than others. + ) + + +sect(How can I input characters that aren't on my keyboard?) + + Two functions are provided with zsh that help you input characters. + As with all editing widgets implemented by functions, you need to + mark the function for autoload, create the widget, and, if you are + going to use it frequently, bind it to a key sequence. The + following binds tt(insert-composed-char) to F5 on my keyboard: + verb( + autoload -Uz insert-composed-char + zle -N insert-composed-char + bindkey '\e[15~' insert-composed-char + ) + + The two widgets are described in the tt(zshcontrib(1)) manual + page, but here is a brief summary: + + tt(insert-composed-char) is followed by two characters that + are a mnemonic for a multibyte character. 
For example mytt(a:) + is a with an umlaut; mytt(cH) is the symbol for hearts on a playing + card. Various accented characters, European and related alphabets, + and punctuation and mathematical symbols are available. The + mnemonics are mostly those given by RFC 1345, see + url(http://www.faqs.org/rfcs/rfc1345.html)\ +(http://www.faqs.org/rfcs/rfc1345.html). + + tt(insert-unicode-char) is used to input a Unicode character by + its hexadecimal number. This is the number given in the Unicode + character charts, see for example \ +url(http://www.unicode.org/charts/)(http://www.unicode.org/charts/). + You need to execute the function, then type the hexadecimal number + (you can omit any leading zeroes), then execute the function again. + + Both functions can be used without multibyte mode, provided the locale is + correct and the character selected exists in the current character set; + however, using UTF-8 massively extends the number of valid characters + that can be produced. + + chapter(The future of zsh) sect(What bugs are currently known and unfixed? (Plus recent \ diff --git a/INSTALL b/INSTALL index 855164f89..a7635e12e 100644 --- a/INSTALL +++ b/INSTALL @@ -272,7 +272,16 @@ The support can be explicitly enabled or disable with --enable-multibyte or --disable-multibyte. Reports of systems where multibyte support was not enabled by default but --enable-multibyte resulted in a usable shell would be appreciated. The developers are not aware of any need to use ---disable-multibyte and this should be reported as a bug. +--disable-multibyte and this should be reported as a bug. Currently +multibyte mode is believed to work automatically on: + + - All(?) current GNU/Linux distributions + - All(?) 
current BSD variants + - OS X 10.4.3 + +and to work when configured with --enable-multibyte on: + + - Solaris 8 and later The main shell is not yet aware of multibyte characters, so for example the length of a scalar parameter will return the number of bytes, not @@ -281,6 +290,8 @@ characters. This means that pattern tests such as ? and [[:alpha:]] do not work correctly with characters in multibyte character sets beyond the ASCII subset. +See chapter 5 in the FAQ for some notes on multibyte input. + Memory Routines --------------- diff --git a/Src/glob.c b/Src/glob.c index a4d02c3c8..efdce9fb3 100644 --- a/Src/glob.c +++ b/Src/glob.c @@ -56,11 +56,14 @@ struct gmatch { #define GS_NAME 1 #define GS_DEPTH 2 -#define GS_SIZE 4 -#define GS_ATIME 8 -#define GS_MTIME 16 -#define GS_CTIME 32 -#define GS_LINKS 64 + +#define GS_SHIFT_BASE 4 + +#define GS_SIZE (GS_SHIFT_BASE) +#define GS_ATIME (GS_SHIFT_BASE << 1) +#define GS_MTIME (GS_SHIFT_BASE << 2) +#define GS_CTIME (GS_SHIFT_BASE << 3) +#define GS_LINKS (GS_SHIFT_BASE << 4) #define GS_SHIFT 5 #define GS__SIZE (GS_SIZE << GS_SHIFT) @@ -69,7 +72,8 @@ struct gmatch { #define GS__CTIME (GS_CTIME << GS_SHIFT) #define GS__LINKS (GS_LINKS << GS_SHIFT) -#define GS_DESC 4096 +#define GS_DESC (GS_SHIFT_BASE << (2*GS_SHIFT)) +#define GS_NONE (GS_SHIFT_BASE << (2*GS_SHIFT+1)) #define GS_NORMAL (GS_SIZE | GS_ATIME | GS_MTIME | GS_CTIME | GS_LINKS) #define GS_LINKED (GS_NORMAL << GS_SHIFT) @@ -1414,6 +1418,7 @@ zglob(LinkList list, LinkNode np, int nountok) case 'm': t = GS_MTIME; break; case 'c': t = GS_CTIME; break; case 'd': t = GS_DEPTH; break; + case 'N': t = GS_NONE; break; default: zerr("unknown sort specifier", NULL, 0); restore_globstate(saved); @@ -1622,10 +1627,13 @@ zglob(LinkList list, LinkNode np, int nountok) matchct = 1; } } - /* Sort arguments in to lexical (and possibly numeric) order. * - * This is reversed to facilitate insertion into the list. 
*/ - qsort((void *) & matchbuf[0], matchct, sizeof(struct gmatch), - (int (*) _((const void *, const void *)))gmatchcmp); + + if (!(gf_sortlist[0] & GS_NONE)) { + /* Sort arguments in to lexical (and possibly numeric) order. * + * This is reversed to facilitate insertion into the list. */ + qsort((void *) & matchbuf[0], matchct, sizeof(struct gmatch), + (int (*) _((const void *, const void *)))gmatchcmp); + } if (first < 0) { first += matchct; @@ -1637,10 +1645,21 @@ zglob(LinkList list, LinkNode np, int nountok) else if (end > matchct) end = matchct; if ((end -= first) > 0) { - matchptr = matchbuf + matchct - first - end; - while (end-- > 0) { /* insert matches in the arg list */ - insertlinknode(list, node, matchptr->name); - matchptr++; + if (gf_sortlist[0] & GS_NONE) { + /* Match list was never reversed, so insert back to front. */ + matchptr = matchbuf + matchct - first - 1; + while (end-- > 0) { + /* insert matches in the arg list */ + insertlinknode(list, node, matchptr->name); + matchptr--; + } + } else { + matchptr = matchbuf + matchct - first - end; + while (end-- > 0) { + /* insert matches in the arg list */ + insertlinknode(list, node, matchptr->name); + matchptr++; + } } } free(matchbuf); -- cgit 1.4.1