diff options
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | NEWS | 16 | ||||
-rw-r--r-- | catgets/Makefile | 5 | ||||
-rw-r--r-- | catgets/gencat.c | 282 |
4 files changed, 274 insertions, 38 deletions
diff --git a/ChangeLog b/ChangeLog index 8641abe1fd..2161993d45 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2000-11-27 Ulrich Drepper <drepper@redhat.com> + + * catgets/Makefile (test1.cat): Set LC_ALL, LOCPATH, and GCONV_PATH + for gencat run. + (libc.cat): Likewise. + * catgets/gencat.c: Implement handling of message catalogs encoded + with stateful character sets. + Based on a patch by Shinya Hanataka <hanataka@abyss.rim.or.jp>. + 2000-11-26 Ulrich Drepper <drepper@redhat.com> * sysdeps/unix/opendir.c (__opendir): Add cast to avoid warning. diff --git a/NEWS b/NEWS index e6391a7f5f..4b3a977238 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -GNU C Library NEWS -- history of user-visible changes. 2000-08-13 +GNU C Library NEWS -- history of user-visible changes. 2000-11-27 Copyright (C) 1992-1999, 2000 Free Software Foundation, Inc. See the end for copying conditions. @@ -7,6 +7,20 @@ Please send GNU C library bug reports using the `glibcbug' script to <bugs@gnu.org>. Questions and suggestions should be send to <bug-glibc@gnu.org>. +Version 2.2.1 + +* The gencat program now parses the input file according to the charset + selected by the LC_CTYPE category. This is important for stateful + character sets. To make generating catalogs easier there is a way + to overwrite the charset selected by the locale: before the first + message or $ quote line the catalog can contain a line like + + $ codeset=ISO-8859-2 + + to select the charset (ISO-8859-2 in this case). + + Implemented by Shinya Hanataka and Ulrich Drepper. + Version 2.2 * Greg McGary added runtime support for bounds checking using gcc's diff --git a/catgets/Makefile b/catgets/Makefile index bc6575c245..caf8eec651 100644 --- a/catgets/Makefile +++ b/catgets/Makefile @@ -53,10 +53,13 @@ tests: $(objpfx)de/libc.cat $(objpfx)test1.cat # This test just checks whether the program produces any error or not. # The result is not tested. $(objpfx)test1.cat: test1.msg $(objpfx)gencat + LC_ALL=hr_HR.ISO-8859-2 LOCPATH=$(common-objpfx)localedata \ + GCONV_PATH=$(common-objpfx)iconvdata \ $(built-program-cmd) -H $(objpfx)test1.h $@ $< $(objpfx)de/libc.cat: $(objpfx)de.msg $(objpfx)gencat $(make-target-directory) - $(built-program-cmd) $@ $< + LC_ALL=de_DE.ISO-8859-1 LOCPATH=$(common-objpfx)localedata \ + GCONV_PATH=$(common-objpfx)iconvdata $(built-program-cmd) $@ $< $(objpfx)tst-catgets.out: $(objpfx)de/libc.cat # Generate a non-simple input file. diff --git a/catgets/gencat.c b/catgets/gencat.c index de6bdf65d8..0200ca44ef 100644 --- a/catgets/gencat.c +++ b/catgets/gencat.c @@ -22,11 +22,14 @@ #endif #include <argp.h> +#include <assert.h> #include <ctype.h> #include <endian.h> #include <errno.h> #include <error.h> #include <fcntl.h> +#include <iconv.h> +#include <langinfo.h> #include <locale.h> #include <libintl.h> #include <limits.h> @@ -37,6 +40,7 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <wchar.h> #include "version.h" @@ -79,7 +83,7 @@ struct catalog struct set_list *all_sets; struct set_list *current_set; size_t total_messages; - char quote_char; + wint_t quote_char; int last_set; struct obstack mem_pool; @@ -137,6 +141,8 @@ static struct argp argp = /* Wrapper functions with error checking for standard functions. */ extern void *xmalloc (size_t n); extern void *xcalloc (size_t n, size_t s); +extern void *xrealloc (void *o, size_t n); +extern char *xstrdup (const char *); /* Prototypes for local functions. */ static void error_print (void); @@ -145,9 +151,11 @@ static struct catalog *read_input_file (struct catalog *current, static void write_out (struct catalog *result, const char *output_name, const char *header_name); static struct set_list *find_set (struct catalog *current, int number); -static void normalize_line (const char *fname, size_t line, char *string, - char quote_char); +static void normalize_line (const char *fname, size_t line, iconv_t cd, + wchar_t *string, wchar_t quote_char); static void read_old (struct catalog *catalog, const char *file_name); +static int open_conversion (const char *codesetp, iconv_t *cd_towcp, + iconv_t *cd_tombp); int @@ -260,6 +268,11 @@ read_input_file (struct catalog *current, const char *fname) char *buf; size_t len; size_t line_number; + wchar_t *wbuf; + size_t wbufsize; + iconv_t cd_towc = (iconv_t) -1; + iconv_t cd_tomb = (iconv_t) -1; + char *codeset = NULL; if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0) { @@ -289,6 +302,10 @@ read_input_file (struct catalog *current, const char *fname) buf = NULL; len = 0; line_number = 0; + + wbufsize = 1024; + wbuf = (wchar_t *) xmalloc (wbufsize); + while (!feof (fp)) { int continued; @@ -328,7 +345,29 @@ read_input_file (struct catalog *current, const char *fname) if (this_line[0] == '$') { if (isblank (this_line[1])) - /* This is a comment line. Do nothing. */; + { + int cnt = 1; + while (isblank (this_line[cnt])) + ++cnt; + if (strncmp (&this_line[cnt], "codeset=", 8) != 0) + /* This is a comment line. Do nothing. */; + else if (codeset != NULL) + /* Ignore multiple codeset. */; + else + { + int start = cnt + 8; + cnt = start; + while (this_line[cnt] != '\0' && !isspace (this_line[cnt])) + ++cnt; + if (cnt != start) + { + int len = cnt - start; + codeset = xmalloc (len + 1); + *((char *) mempcpy (codeset, &this_line[start], len)) + = '\0'; + } + } + } else if (strncmp (&this_line[1], "set", 3) == 0) { int cnt = sizeof ("set"); @@ -470,12 +509,44 @@ this is the first definition")); } else if (strncmp (&this_line[1], "quote", 5) == 0) { - int cnt = sizeof ("quote"); + char buf[2]; + char *bufptr; + size_t buflen; + char *wbufptr; + size_t wbuflen; + int cnt; + + cnt = sizeof ("quote"); while (isspace (this_line[cnt])) ++cnt; + + /* We need the conversion. */ + if (cd_towc == (iconv_t) -1 + && open_conversion (codeset, &cd_towc, &cd_tomb) != 0) + /* Something is wrong. */ + goto out; + /* Yes, the quote char can be '\0'; this means no quote - char. */ - current->quote_char = this_line[cnt]; + char. The function using the information works on + wide characters so we have to convert it here. */ + buf[0] = this_line[cnt]; + buf[1] = '\0'; + bufptr = buf; + buflen = 2; + + wbufptr = (char *) wbuf; + wbuflen = wbufsize; + + /* Flush the state. */ + iconv (cd_towc, NULL, NULL, NULL, NULL); + + iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen); + if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2]) + error_at_line (0, 0, fname, start_line, + gettext ("invalid quote character")); + else + /* Use the converted wide character. */ + current->quote_char = wbuf[0]; } else { @@ -568,15 +639,92 @@ duplicated message identifier")); if (message_number != 0) { + char *inbuf; + size_t inlen; + char *outbuf; + size_t outlen; struct message_list *newp; + size_t this_line_len = strlen (this_line) + 1; + + /* We need the conversion. */ + if (cd_towc == (iconv_t) -1 + && open_conversion (codeset, &cd_towc, &cd_tomb) != 0) + /* Something is wrong. */ + goto out; + + /* Convert to a wide character string. We have to + interpret escape sequences which will be impossible + without doing the conversion if the codeset of the + message is stateful. */ + while (1) + { + inbuf = this_line; + inlen = this_line_len; + outbuf = (char *) wbuf; + outlen = wbufsize; + + /* Flush the state. */ + iconv (cd_towc, NULL, NULL, NULL, NULL); + + iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen); + if (inlen == 0) + { + /* The string is converted. */ + assert (outlen < wbufsize); + assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1] + == L'\0'); + break; + } + + if (outlen != 0) + { + /* Something is wrong with this string, we ignore it. */ + error_at_line (0, 0, fname, start_line, gettext ("\ +invalid character: message ignored")); + goto ignore; + } + + /* The output buffer is too small. */ + wbufsize *= 2; + wbuf = (wchar_t *) xrealloc (wbuf, wbufsize); + } used = 1; /* Yes, we use the line. */ /* Strip quote characters, change escape sequences into correct characters etc. */ - normalize_line (fname, start_line, this_line, + normalize_line (fname, start_line, cd_towc, wbuf, current->quote_char); + /* Now the string is free of escape sequences. Convert it + back into a multibyte character string. First free the + memory allocated for the original string. */ + obstack_free (¤t->mem_pool, this_line); + + /* Now fill in the new string. It should never happen that + the replaced string is longer than the original. */ + inbuf = (char *) wbuf; + inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t); + + outlen = obstack_room (¤t->mem_pool); + start_line = (char *) obstack_alloc (¤t->mem_pool, outlen); + outbuf = start_line; + + /* Flush the state. */ + iconv (cd_tomb, NULL, NULL, NULL, NULL); + + iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen); + if (inlen != 0) + { + error_at_line (0, 0, fname, start_line, + gettext ("invalid line")); + goto ignore; + } + assert (outbuf[-1] == '\0'); + + /* Free the memory in the obstack we don't use. */ + obstack_free (¤t->mem_pool, outbuf); + newp = (struct message_list *) xmalloc (sizeof (*newp)); newp->number = message_number; newp->message = this_line; @@ -625,11 +773,20 @@ duplicated message identifier")); gettext ("malformed line ignored")); } + ignore: /* We can save the memory for the line if it was not used. */ if (!used) obstack_free (¤t->mem_pool, this_line); } + /* Close the conversion modules. */ + iconv_close (cd_towc); + iconv_close (cd_tomb); + free (codeset); + + out: + free (wbuf); + if (fp != stdin) fclose (fp); return current; @@ -895,13 +1052,14 @@ find_set (struct catalog *current, int number) /* Normalize given string *in*place* by processing escape sequences and quote characters. */ static void -normalize_line (const char *fname, size_t line, char *string, char quote_char) +normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string, + wchar_t quote_char) { int is_quoted; - char *rp = string; - char *wp = string; + wchar_t *rp = string; + wchar_t *wp = string; - if (quote_char != '\0' && *rp == quote_char) + if (quote_char != L'\0' && *rp == quote_char) { is_quoted = 1; ++rp; @@ -909,58 +1067,83 @@ normalize_line (const char *fname, size_t line, char *string, char quote_char) else is_quoted = 0; - while (*rp != '\0') + while (*rp != L'\0') if (*rp == quote_char) /* We simply end the string when we find the first time an not-escaped quote character. */ break; - else if (*rp == '\\') + else if (*rp == L'\\') { ++rp; - if (quote_char != '\0' && *rp == quote_char) + if (quote_char != L'\0' && *rp == quote_char) /* This is an extension to XPG. */ *wp++ = *rp++; else /* Recognize escape sequences. */ switch (*rp) { - case 'n': - *wp++ = '\n'; + case L'n': + *wp++ = L'\n'; ++rp; break; - case 't': - *wp++ = '\t'; + case L't': + *wp++ = L'\t'; ++rp; break; - case 'v': - *wp++ = '\v'; + case L'v': + *wp++ = L'\v'; ++rp; break; - case 'b': - *wp++ = '\b'; + case L'b': + *wp++ = L'\b'; ++rp; break; - case 'r': - *wp++ = '\r'; + case L'r': + *wp++ = L'\r'; ++rp; break; - case 'f': - *wp++ = '\f'; + case L'f': + *wp++ = L'\f'; ++rp; break; - case '\\': - *wp++ = '\\'; + case L'\\': + *wp++ = L'\\'; ++rp; break; - case '0' ... '7': + case L'0' ... L'7': { - int number = *rp++ - '0'; - while (number <= (255 / 8) && *rp >= '0' && *rp <= '7') + int number; + char cbuf[2]; + char *cbufptr; + size_t cbufin; + wchar_t wcbuf[2]; + char *wcbufptr; + size_t wcbufin; + + number = *rp++ - L'0'; + while (number <= (255 / 8) && *rp >= L'0' && *rp <= L'7') { number *= 8; - number += *rp++ - '0'; + number += *rp++ - L'0'; } - *wp++ = (char) number; + + cbuf[0] = (char) number; + cbuf[1] = '\0'; + cbufptr = cbuf; + cbufin = 2; + + wcbufptr = (char *) wcbuf; + wcbufin = sizeof (wcbuf); + + /* Flush the state. */ + iconv (cd, NULL, NULL, NULL, NULL); + + iconv (cd, &cbufptr, &cbufin, &wcbufptr, &wcbufin); + if (cbufptr != &cbuf[2] || (wchar_t *) wcbufptr != &wcbuf[2]) + error_at_line (0, 0, fname, line, + gettext ("invalid escape sequence")); + else + *wp++ = wcbuf[0]; } break; default: @@ -974,10 +1157,10 @@ normalize_line (const char *fname, size_t line, char *string, char quote_char) /* If we saw a quote character at the beginning we expect another one at the end. */ if (is_quoted && *rp != quote_char) - error (0, 0, fname, line, gettext ("unterminated message")); + error_at_line (0, 0, fname, line, gettext ("unterminated message")); /* Terminate string. */ - *wp = '\0'; + *wp = L'\0'; return; } @@ -1069,3 +1252,30 @@ read_old (struct catalog *catalog, const char *file_name) } } } + + +static int +open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp) +{ + /* If the input file does not specify the codeset use the locale's. */ + if (codeset == NULL) + { + setlocale (LC_ALL, ""); + codeset = nl_langinfo (CODESET); + setlocale (LC_ALL, "C"); + } + + /* Get the conversion modules. */ + *cd_towcp = iconv_open ("WCHAR_T", codeset); + *cd_tombp = iconv_open (codeset, "WCHAR_T"); + if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1) + { + error (0, 0, gettext ("conversion modules not available")); + if (*cd_towcp != (iconv_t) -1) + iconv_close (*cd_towcp); + + return 1; + } + + return 0; +} |