diff options
Diffstat (limited to 'locale/programs/linereader.c')
-rw-r--r-- | locale/programs/linereader.c | 452 |
1 files changed, 321 insertions, 131 deletions
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c index 31278d63c2..99ed0f2480 100644 --- a/locale/programs/linereader.c +++ b/locale/programs/linereader.c @@ -1,6 +1,6 @@ /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996. + Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -28,22 +28,20 @@ #include <stdlib.h> #include <string.h> +#include "charmap.h" #include "error.h" #include "linereader.h" -#include "charset.h" +#include "localedef.h" #include "stringtrans.h" -void *xmalloc (size_t __n); -void *xrealloc (void *__p, size_t __n); -char *xstrdup (const char *__str); - - +/* Prototypes for local functions. */ static struct token *get_toplvl_escape (struct linereader *lr); static struct token *get_symname (struct linereader *lr); static struct token *get_ident (struct linereader *lr); static struct token *get_string (struct linereader *lr, - const struct charset_t *charset); + const struct charmap_t *charmap, + const struct repertoire_t *repertoire); struct linereader * @@ -126,9 +124,14 @@ lr_next (struct linereader *lr) if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') { +#if 0 + /* XXX Is this correct? */ /* An escaped newline character is substituted with a single <SP>. */ --n; lr->buf[n - 1] = ' '; +#else + n -= 2; +#endif } lr->buf[n] = '\0'; @@ -149,7 +152,8 @@ extern char *program_name; struct token * -lr_token (struct linereader *lr, const struct charset_t *charset) +lr_token (struct linereader *lr, const struct charmap_t *charmap, + const struct repertoire_t *repertoire) { int ch; @@ -193,12 +197,29 @@ lr_token (struct linereader *lr, const struct charset_t *charset) return get_toplvl_escape (lr); /* Match ellipsis. */ - if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0) + if (ch == '.') { - lr_getc (lr); - lr_getc (lr); - lr->token.tok = tok_ellipsis; - return &lr->token; + if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) + { + lr_getc (lr); + lr_getc (lr); + lr_getc (lr); + lr->token.tok = tok_ellipsis4; + return &lr->token; + } + if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) + { + lr_getc (lr); + lr_getc (lr); + lr->token.tok = tok_ellipsis3; + return &lr->token; + } + if (lr->buf[lr->idx] == '.') + { + lr_getc (lr); + lr->token.tok = tok_ellipsis2; + return &lr->token; + } } switch (ch) @@ -238,7 +259,7 @@ lr_token (struct linereader *lr, const struct charset_t *charset) return &lr->token; case '"': - return get_string (lr, charset); + return get_string (lr, charmap, repertoire); case '-': ch = lr_getc (lr); @@ -261,7 +282,7 @@ get_toplvl_escape (struct linereader *lr) /* This is supposed to be a numeric value. We return the numerical value and the number of bytes. */ size_t start_idx = lr->idx - 1; - unsigned int value = 0; + char *bytes = lr->token.val.charcode.bytes; int nbytes = 0; int ch; @@ -287,11 +308,11 @@ get_toplvl_escape (struct linereader *lr) || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) { esc_error: - lr->token.val.str.start = &lr->buf[start_idx]; + lr->token.val.str.startmb = &lr->buf[start_idx]; while (ch != EOF && !isspace (ch)) ch = lr_getc (lr); - lr->token.val.str.len = lr->idx - start_idx; + lr->token.val.str.lenmb = lr->idx - start_idx; lr->token.tok = tok_error; return &lr->token; @@ -300,7 +321,7 @@ get_toplvl_escape (struct linereader *lr) if (isdigit (ch)) byte = ch - '0'; else - byte = _tolower (ch) - 'a' + 10; + byte = tolower (ch) - 'a' + 10; ch = lr_getc (lr); if ((base == 16 && !isxdigit (ch)) @@ -311,7 +332,7 @@ get_toplvl_escape (struct linereader *lr) if (isdigit (ch)) byte += ch - '0'; else - byte += _tolower (ch) - 'a' + 10; + byte += tolower (ch) - 'a' + 10; ch = lr_getc (lr); if (base != 16 && isdigit (ch)) @@ -322,10 +343,7 @@ get_toplvl_escape (struct linereader *lr) ch = lr_getc (lr); } - value *= 256; - value += byte; - - ++nbytes; + bytes[nbytes++] = byte; } while (ch == lr->escape_char && nbytes < 4); @@ -335,23 +353,52 @@ get_toplvl_escape (struct linereader *lr) lr_ungetn (lr, 1); lr->token.tok = tok_charcode; - lr->token.val.charcode.val = value; lr->token.val.charcode.nbytes = nbytes; return &lr->token; } -#define ADDC(ch) \ - do \ - { \ - if (bufact == bufmax) \ - { \ - bufmax *= 2; \ - buf = xrealloc (buf, bufmax); \ - } \ - buf[bufact++] = (ch); \ - } \ +#define ADDC(ch) \ + do \ + { \ + if (bufact == bufmax) \ + { \ + bufmax *= 2; \ + buf = xrealloc (buf, bufmax); \ + } \ + buf[bufact++] = (ch); \ + } \ + while (0) + + +#define ADDS(s, l) \ + do \ + { \ + size_t _l = (l); \ + if (bufact + _l > bufmax) \ + { \ + if (bufact < _l) \ + bufact = _l; \ + bufmax *= 2; \ + buf = xrealloc (buf, bufmax); \ + } \ + memcpy (&buf[bufact], s, _l); \ + bufact += _l; \ + } \ + while (0) + + +#define ADDWC(ch) \ + do \ + { \ + if (buf2act == buf2max) \ + { \ + buf2max *= 2; \ + buf2 = xrealloc (buf2, buf2max * 4); \ + } \ + buf2[buf2act++] = (ch); \ + } \ while (0) @@ -399,9 +446,8 @@ get_symname (struct linereader *lr) if (cp == &buf[bufact - 1]) { /* Yes, it is. */ - lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4; - lr->token.val.charcode.val = strtoul (buf, NULL, 16); - lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4; + lr->token.tok = tok_ucs4; + lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); return &lr->token; } @@ -422,8 +468,8 @@ get_symname (struct linereader *lr) buf[bufact] = '\0'; buf = xrealloc (buf, bufact + 1); - lr->token.val.str.start = buf; - lr->token.val.str.len = bufact - 1; + lr->token.val.str.startmb = buf; + lr->token.val.str.lenmb = bufact - 1; } return &lr->token; @@ -446,8 +492,18 @@ get_ident (struct linereader *lr) while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' && ch != '<' && ch != ',') - /* XXX Handle escape sequences? */ - ADDC (ch); + { + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + { + lr_error (lr, _("invalid escape sequence")); + break; + } + } + ADDC (ch); + } lr_ungetn (lr, 1); @@ -465,8 +521,8 @@ get_ident (struct linereader *lr) buf[bufact] = '\0'; buf = xrealloc (buf, bufact + 1); - lr->token.val.str.start = buf; - lr->token.val.str.len = bufact; + lr->token.val.str.startmb = buf; + lr->token.val.str.lenmb = bufact; } return &lr->token; @@ -474,113 +530,247 @@ get_ident (struct linereader *lr) static struct token * -get_string (struct linereader *lr, const struct charset_t *charset) +get_string (struct linereader *lr, const struct charmap_t *charmap, + const struct repertoire_t *repertoire) { - int illegal_string = 0; - char *buf, *cp; + int return_widestr = lr->return_widestr; + char *buf; + char *buf2 = NULL; size_t bufact; size_t bufmax = 56; - int ch; + /* We must return two different strings. */ buf = xmalloc (bufmax); bufact = 0; - while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) - if (ch != '<' || charset == NULL) - { - if (ch == lr->escape_char) - { - ch = lr_getc (lr); - if (ch == '\n' || ch == EOF) - break; - } + /* We know it'll be a string. */ + lr->token.tok = tok_string; + + /* If we need not translate the strings (i.e., expand <...> parts) + we can run a simple loop. */ + if (!lr->translate_strings) + { + int ch; + + buf2 = NULL; + while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) ADDC (ch); - } - else - { - /* We have to get the value of the symbol. */ - unsigned int value; - size_t startidx = bufact; - - if (!lr->translate_strings) - ADDC ('<'); - - while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) - { - if (ch == lr->escape_char) - { - ch = lr_getc (lr); - if (ch == '\n' || ch == EOF) - break; - } - ADDC (ch); - } - - if (ch == '\n' || ch == EOF) - lr_error (lr, _("unterminated string")); - else - if (!lr->translate_strings) - ADDC ('>'); - - if (lr->translate_strings) - { - value = charset_find_value (&charset->char_table, &buf[startidx], - bufact - startidx); - if ((wchar_t) value == ILLEGAL_CHAR_VALUE) + + /* Catch errors with trailing escape character. */ + if (bufact > 0 && buf[bufact - 1] == lr->escape_char + && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) + { + lr_error (lr, _("illegal escape sequence at end of string")); + --bufact; + } + else if (ch == '\n' || ch == EOF) + lr_error (lr, _("unterminated string")); + + ADDC ('\0'); + } + else + { + int illegal_string = 0; + size_t buf2act = 0; + size_t buf2max = 56 * sizeof (uint32_t); + int ch; + int warned = 0; + + /* We have to provide the wide character result as well. */ + if (return_widestr) + buf2 = xmalloc (buf2max); + + /* Read until the end of the string (or end of the line or file). */ + while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) + { + size_t startidx; + uint32_t wch; + struct charseq *seq; + + if (ch != '<') + { + /* The standards leave it up to the implementation to decide + what to do with character which stand for themself. We + could jump through hoops to find out the value relative to + the charmap and the repertoire map, but instead we leave + it up to the locale definition author to write a better + definition. We assume here that every character which + stands for itself is encoded using ISO 8859-1. Using the + escape character is allowed. */ + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + break; + } + + if (verbose && !warned) + { + lr_error (lr, _("\ +non-symbolic character value should not be used")); + warned = 1; + } + + ADDC (ch); + if (return_widestr) + ADDWC ((uint32_t) ch); + + continue; + } + + /* Now we have to search for the end of the symbolic name, i.e., + the closing '>'. */ + startidx = bufact; + while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) + { + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + break; + } + ADDC (ch); + } + if (ch == '\n' || ch == EOF) + /* Not a correct string. */ + break; + if (bufact == startidx) + { + /* <> is no correct name. Ignore it and also signal an + error. */ illegal_string = 1; - bufact = startidx; + continue; + } - if (bufmax - bufact < 8) - { - bufmax *= 2; - buf = (char *) xrealloc (buf, bufmax); - } + /* It might be a Uxxxx symbol. */ + if (buf[startidx] == 'U' + && (bufact - startidx == 5 || bufact - startidx == 9)) + { + char *cp = buf + startidx + 1; + while (cp < &buf[bufact] && isxdigit (*cp)) + ++cp; + + if (cp == &buf[bufact]) + { + const char *symbol = NULL; + + /* Yes, it is. */ + ADDC ('\0'); + wch = strtoul (buf + startidx + 1, NULL, 16); + + /* Now forget about the name we just added. */ + bufact = startidx; + + if (return_widestr) + ADDWC (wch); + + /* Now determine from the repertoire the name of the + character and find it in the charmap. */ + if (repertoire != NULL) + symbol = repertoire_find_symbol (repertoire, wch); + + if (symbol == NULL) + { + /* We cannot generate a string since we cannot map + from the Unicode number to the character symbol. */ + lr_error (lr, + _("character <U%0*X> not in repertoire map"), + wch > 0xffff ? 8 : 4, wch); + + illegal_string = 1; + } + else + { + seq = charmap_find_value (charmap, symbol, + strlen (symbol)); + + if (seq == NULL) + { + /* Not a known name. */ + lr_error (lr, + _("symbol `%s' not in charmap"), symbol); + illegal_string = 1; + } + else + ADDS (seq->bytes, seq->nbytes); + } + + continue; + } + } + + if (return_widestr) + { + /* We now have the symbolic name in buf[startidx] to + buf[bufact-1]. Now find out the value for this + character in the repertoire map as well as in the + charmap (in this order). */ + wch = repertoire_find_value (repertoire, &buf[startidx], + bufact - startidx); + if (wch == ILLEGAL_CHAR_VALUE) + { + /* This name is not in the repertoire map. */ + lr_error (lr, _("symbol `%.*s' not in repertoire map"), + bufact - startidx, &buf[startidx]); + illegal_string = 1; + } + else + ADDWC (wch); + } + + /* Now the same for the multibyte representation. */ + seq = charmap_find_value (charmap, &buf[startidx], + bufact - startidx); - cp = &buf[bufact]; - if (encode_char (value, &cp)) + if (seq == NULL) + { + /* This name is not in the charmap. */ + lr_error (lr, _("symbol `%.*s' not in charmap"), + bufact - startidx, &buf[startidx]); illegal_string = 1; - bufact = cp - buf; - } - } + /* Now forget about the name we just added. */ + bufact = startidx; + } + else + { + /* Now forget about the name we just added. */ + bufact = startidx; - /* Catch errors with trailing escape character. */ - if (bufact > 0 && buf[bufact - 1] == lr->escape_char - && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) - { - lr_error (lr, _("illegal escape sequence at end of string")); - --bufact; - } - else if (ch == '\n' || ch == EOF) - lr_error (lr, _("unterminated string")); + ADDS (seq->bytes, seq->nbytes); + } + } - /* Terminate string if necessary. */ - if (lr->translate_strings) - { - cp = &buf[bufact]; - if (encode_char (0, &cp)) - illegal_string = 1; + if (ch == '\n' || ch == EOF) + { + lr_error (lr, _("unterminated string")); + illegal_string = 1; + } - bufact = cp - buf; - } - else - ADDC ('\0'); + if (illegal_string) + { + free (buf); + if (buf2 != NULL) + free (buf2); + lr->token.val.str.startmb = NULL; + lr->token.val.str.lenmb = 0; - lr->token.tok = tok_string; + return &lr->token; + } - if (illegal_string) - { - free (buf); - lr->token.val.str.start = NULL; - lr->token.val.str.len = 0; - } - else - { - buf = xrealloc (buf, bufact + 1); + ADDC ('\0'); - lr->token.val.str.start = buf; - lr->token.val.str.len = bufact; + if (return_widestr) + { + ADDWC (0); + lr->token.val.str.startwc = xrealloc (buf2, + buf2act * sizeof (uint32_t)); + lr->token.val.str.lenwc = buf2act; + } } + lr->token.val.str.startmb = xrealloc (buf, bufact); + lr->token.val.str.lenmb = bufact; + return &lr->token; } |