diff options
Diffstat (limited to 'REORG.TODO/locale/programs/linereader.c')
-rw-r--r-- | REORG.TODO/locale/programs/linereader.c | 886 |
1 files changed, 886 insertions, 0 deletions
diff --git a/REORG.TODO/locale/programs/linereader.c b/REORG.TODO/locale/programs/linereader.c new file mode 100644 index 0000000000..52b340963a --- /dev/null +++ b/REORG.TODO/locale/programs/linereader.c @@ -0,0 +1,886 @@ +/* Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <libintl.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> + +#include "localedef.h" +#include "charmap.h" +#include "error.h" +#include "linereader.h" +#include "locfile.h" + +/* Prototypes for local functions. */ +static struct token *get_toplvl_escape (struct linereader *lr); +static struct token *get_symname (struct linereader *lr); +static struct token *get_ident (struct linereader *lr); +static struct token *get_string (struct linereader *lr, + const struct charmap_t *charmap, + struct localedef_t *locale, + const struct repertoire_t *repertoire, + int verbose); + + +struct linereader * +lr_open (const char *fname, kw_hash_fct_t hf) +{ + FILE *fp; + + if (fname == NULL || strcmp (fname, "-") == 0 + || strcmp (fname, "/dev/stdin") == 0) + return lr_create (stdin, "<stdin>", hf); + else + { + fp = fopen (fname, "rm"); + if (fp == NULL) + return NULL; + return lr_create (fp, fname, hf); + } +} + +struct linereader * +lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) +{ + struct linereader *result; + int n; + + result = (struct linereader *) xmalloc (sizeof (*result)); + + result->fp = fp; + result->fname = xstrdup (fname); + result->buf = NULL; + result->bufsize = 0; + result->lineno = 1; + result->idx = 0; + result->comment_char = '#'; + result->escape_char = '\\'; + result->translate_strings = 1; + result->return_widestr = 0; + + n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); + if (n < 0) + { + int save = errno; + fclose (result->fp); + free ((char *) result->fname); + free (result); + errno = save; + return NULL; + } + + if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') + n -= 2; + + result->buf[n] = '\0'; + result->bufact = n; + result->hash_fct = hf; + + return result; +} + + +int +lr_eof (struct linereader *lr) +{ + return lr->bufact = 0; +} + + +void +lr_ignore_rest (struct linereader *lr, int verbose) +{ + if (verbose) + { + while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' + && lr->buf[lr->idx] != lr->comment_char) + if (lr->buf[lr->idx] == '\0') + { + if (lr_next (lr) < 0) + return; + } + else + ++lr->idx; + + if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) + && lr->buf[lr->idx] != lr->comment_char) + lr_error (lr, _("trailing garbage at end of line")); + } + + /* Ignore continued line. */ + while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') + if (lr_next (lr) < 0) + break; + + lr->idx = lr->bufact; +} + + +void +lr_close (struct linereader *lr) +{ + fclose (lr->fp); + free (lr->buf); + free (lr); +} + + +int +lr_next (struct linereader *lr) +{ + int n; + + n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); + if (n < 0) + return -1; + + ++lr->lineno; + + if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') + { +#if 0 + /* XXX Is this correct? */ + /* An escaped newline character is substituted with a single <SP>. */ + --n; + lr->buf[n - 1] = ' '; +#else + n -= 2; +#endif + } + + lr->buf[n] = '\0'; + lr->bufact = n; + lr->idx = 0; + + return 0; +} + + +/* Defined in error.c. */ +/* This variable is incremented each time `error' is called. */ +extern unsigned int error_message_count; + +/* The calling program should define program_name and set it to the + name of the executing program. */ +extern char *program_name; + + +struct token * +lr_token (struct linereader *lr, const struct charmap_t *charmap, + struct localedef_t *locale, const struct repertoire_t *repertoire, + int verbose) +{ + int ch; + + while (1) + { + do + { + ch = lr_getc (lr); + + if (ch == EOF) + { + lr->token.tok = tok_eof; + return &lr->token; + }; + + if (ch == '\n') + { + lr->token.tok = tok_eol; + return &lr->token; + } + } + while (isspace (ch)); + + if (ch != lr->comment_char) + break; + + /* Is there an newline at the end of the buffer? */ + if (lr->buf[lr->bufact - 1] != '\n') + { + /* No. Some people want this to mean that only the line in + the file not the logical, concatenated line is ignored. + Let's try this. */ + lr->idx = lr->bufact; + continue; + } + + /* Ignore rest of line. */ + lr_ignore_rest (lr, 0); + lr->token.tok = tok_eol; + return &lr->token; + } + + /* Match escape sequences. */ + if (ch == lr->escape_char) + return get_toplvl_escape (lr); + + /* Match ellipsis. */ + if (ch == '.') + { + if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0) + { + int cnt; + for (cnt = 0; cnt < 10; ++cnt) + lr_getc (lr); + lr->token.tok = tok_ellipsis4_2; + return &lr->token; + } + if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) + { + lr_getc (lr); + lr_getc (lr); + lr_getc (lr); + lr->token.tok = tok_ellipsis4; + return &lr->token; + } + if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) + { + lr_getc (lr); + lr_getc (lr); + lr->token.tok = tok_ellipsis3; + return &lr->token; + } + if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0) + { + int cnt; + for (cnt = 0; cnt < 6; ++cnt) + lr_getc (lr); + lr->token.tok = tok_ellipsis2_2; + return &lr->token; + } + if (lr->buf[lr->idx] == '.') + { + lr_getc (lr); + lr->token.tok = tok_ellipsis2; + return &lr->token; + } + } + + switch (ch) + { + case '<': + return get_symname (lr); + + case '0' ... '9': + lr->token.tok = tok_number; + lr->token.val.num = ch - '0'; + + while (isdigit (ch = lr_getc (lr))) + { + lr->token.val.num *= 10; + lr->token.val.num += ch - '0'; + } + if (isalpha (ch)) + lr_error (lr, _("garbage at end of number")); + lr_ungetn (lr, 1); + + return &lr->token; + + case ';': + lr->token.tok = tok_semicolon; + return &lr->token; + + case ',': + lr->token.tok = tok_comma; + return &lr->token; + + case '(': + lr->token.tok = tok_open_brace; + return &lr->token; + + case ')': + lr->token.tok = tok_close_brace; + return &lr->token; + + case '"': + return get_string (lr, charmap, locale, repertoire, verbose); + + case '-': + ch = lr_getc (lr); + if (ch == '1') + { + lr->token.tok = tok_minus1; + return &lr->token; + } + lr_ungetn (lr, 2); + break; + } + + return get_ident (lr); +} + + +static struct token * +get_toplvl_escape (struct linereader *lr) +{ + /* This is supposed to be a numeric value. We return the + numerical value and the number of bytes. */ + size_t start_idx = lr->idx - 1; + unsigned char *bytes = lr->token.val.charcode.bytes; + size_t nbytes = 0; + int ch; + + do + { + unsigned int byte = 0; + unsigned int base = 8; + + ch = lr_getc (lr); + + if (ch == 'd') + { + base = 10; + ch = lr_getc (lr); + } + else if (ch == 'x') + { + base = 16; + ch = lr_getc (lr); + } + + if ((base == 16 && !isxdigit (ch)) + || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) + { + esc_error: + lr->token.val.str.startmb = &lr->buf[start_idx]; + + while (ch != EOF && !isspace (ch)) + ch = lr_getc (lr); + lr->token.val.str.lenmb = lr->idx - start_idx; + + lr->token.tok = tok_error; + return &lr->token; + } + + if (isdigit (ch)) + byte = ch - '0'; + else + byte = tolower (ch) - 'a' + 10; + + ch = lr_getc (lr); + if ((base == 16 && !isxdigit (ch)) + || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) + goto esc_error; + + byte *= base; + if (isdigit (ch)) + byte += ch - '0'; + else + byte += tolower (ch) - 'a' + 10; + + ch = lr_getc (lr); + if (base != 16 && isdigit (ch)) + { + byte *= base; + byte += ch - '0'; + + ch = lr_getc (lr); + } + + bytes[nbytes++] = byte; + } + while (ch == lr->escape_char + && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); + + if (!isspace (ch)) + lr_error (lr, _("garbage at end of character code specification")); + + lr_ungetn (lr, 1); + + lr->token.tok = tok_charcode; + lr->token.val.charcode.nbytes = nbytes; + + return &lr->token; +} + + +#define ADDC(ch) \ + do \ + { \ + if (bufact == bufmax) \ + { \ + bufmax *= 2; \ + buf = xrealloc (buf, bufmax); \ + } \ + buf[bufact++] = (ch); \ + } \ + while (0) + + +#define ADDS(s, l) \ + do \ + { \ + size_t _l = (l); \ + if (bufact + _l > bufmax) \ + { \ + if (bufact < _l) \ + bufact = _l; \ + bufmax *= 2; \ + buf = xrealloc (buf, bufmax); \ + } \ + memcpy (&buf[bufact], s, _l); \ + bufact += _l; \ + } \ + while (0) + + +#define ADDWC(ch) \ + do \ + { \ + if (buf2act == buf2max) \ + { \ + buf2max *= 2; \ + buf2 = xrealloc (buf2, buf2max * 4); \ + } \ + buf2[buf2act++] = (ch); \ + } \ + while (0) + + +static struct token * +get_symname (struct linereader *lr) +{ + /* Symbol in brackets. We must distinguish three kinds: + 1. reserved words + 2. ISO 10646 position values + 3. all other. */ + char *buf; + size_t bufact = 0; + size_t bufmax = 56; + const struct keyword_t *kw; + int ch; + + buf = (char *) xmalloc (bufmax); + + do + { + ch = lr_getc (lr); + if (ch == lr->escape_char) + { + int c2 = lr_getc (lr); + ADDC (c2); + + if (c2 == '\n') + ch = '\n'; + } + else + ADDC (ch); + } + while (ch != '>' && ch != '\n'); + + if (ch == '\n') + lr_error (lr, _("unterminated symbolic name")); + + /* Test for ISO 10646 position value. */ + if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) + { + char *cp = buf + 1; + while (cp < &buf[bufact - 1] && isxdigit (*cp)) + ++cp; + + if (cp == &buf[bufact - 1]) + { + /* Yes, it is. */ + lr->token.tok = tok_ucs4; + lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); + + return &lr->token; + } + } + + /* It is a symbolic name. Test for reserved words. */ + kw = lr->hash_fct (buf, bufact - 1); + + if (kw != NULL && kw->symname_or_ident == 1) + { + lr->token.tok = kw->token; + free (buf); + } + else + { + lr->token.tok = tok_bsymbol; + + buf = xrealloc (buf, bufact + 1); + buf[bufact] = '\0'; + + lr->token.val.str.startmb = buf; + lr->token.val.str.lenmb = bufact - 1; + } + + return &lr->token; +} + + +static struct token * +get_ident (struct linereader *lr) +{ + char *buf; + size_t bufact; + size_t bufmax = 56; + const struct keyword_t *kw; + int ch; + + buf = xmalloc (bufmax); + bufact = 0; + + ADDC (lr->buf[lr->idx - 1]); + + while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' + && ch != '<' && ch != ',' && ch != EOF) + { + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + { + lr_error (lr, _("invalid escape sequence")); + break; + } + } + ADDC (ch); + } + + lr_ungetc (lr, ch); + + kw = lr->hash_fct (buf, bufact); + + if (kw != NULL && kw->symname_or_ident == 0) + { + lr->token.tok = kw->token; + free (buf); + } + else + { + lr->token.tok = tok_ident; + + buf = xrealloc (buf, bufact + 1); + buf[bufact] = '\0'; + + lr->token.val.str.startmb = buf; + lr->token.val.str.lenmb = bufact; + } + + return &lr->token; +} + + +static struct token * +get_string (struct linereader *lr, const struct charmap_t *charmap, + struct localedef_t *locale, const struct repertoire_t *repertoire, + int verbose) +{ + int return_widestr = lr->return_widestr; + char *buf; + wchar_t *buf2 = NULL; + size_t bufact; + size_t bufmax = 56; + + /* We must return two different strings. */ + buf = xmalloc (bufmax); + bufact = 0; + + /* We know it'll be a string. */ + lr->token.tok = tok_string; + + /* If we need not translate the strings (i.e., expand <...> parts) + we can run a simple loop. */ + if (!lr->translate_strings) + { + int ch; + + buf2 = NULL; + while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) + ADDC (ch); + + /* Catch errors with trailing escape character. */ + if (bufact > 0 && buf[bufact - 1] == lr->escape_char + && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) + { + lr_error (lr, _("illegal escape sequence at end of string")); + --bufact; + } + else if (ch == '\n' || ch == EOF) + lr_error (lr, _("unterminated string")); + + ADDC ('\0'); + } + else + { + int illegal_string = 0; + size_t buf2act = 0; + size_t buf2max = 56 * sizeof (uint32_t); + int ch; + int warned = 0; + + /* We have to provide the wide character result as well. */ + if (return_widestr) + buf2 = xmalloc (buf2max); + + /* Read until the end of the string (or end of the line or file). */ + while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) + { + size_t startidx; + uint32_t wch; + struct charseq *seq; + + if (ch != '<') + { + /* The standards leave it up to the implementation to decide + what to do with character which stand for themself. We + could jump through hoops to find out the value relative to + the charmap and the repertoire map, but instead we leave + it up to the locale definition author to write a better + definition. We assume here that every character which + stands for itself is encoded using ISO 8859-1. Using the + escape character is allowed. */ + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + break; + } + + if (verbose && !warned) + { + lr_error (lr, _("\ +non-symbolic character value should not be used")); + warned = 1; + } + + ADDC (ch); + if (return_widestr) + ADDWC ((uint32_t) ch); + + continue; + } + + /* Now we have to search for the end of the symbolic name, i.e., + the closing '>'. */ + startidx = bufact; + while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) + { + if (ch == lr->escape_char) + { + ch = lr_getc (lr); + if (ch == '\n' || ch == EOF) + break; + } + ADDC (ch); + } + if (ch == '\n' || ch == EOF) + /* Not a correct string. */ + break; + if (bufact == startidx) + { + /* <> is no correct name. Ignore it and also signal an + error. */ + illegal_string = 1; + continue; + } + + /* It might be a Uxxxx symbol. */ + if (buf[startidx] == 'U' + && (bufact - startidx == 5 || bufact - startidx == 9)) + { + char *cp = buf + startidx + 1; + while (cp < &buf[bufact] && isxdigit (*cp)) + ++cp; + + if (cp == &buf[bufact]) + { + char utmp[10]; + + /* Yes, it is. */ + ADDC ('\0'); + wch = strtoul (buf + startidx + 1, NULL, 16); + + /* Now forget about the name we just added. */ + bufact = startidx; + + if (return_widestr) + ADDWC (wch); + + /* See whether the charmap contains the Uxxxxxxxx names. */ + snprintf (utmp, sizeof (utmp), "U%08X", wch); + seq = charmap_find_value (charmap, utmp, 9); + + if (seq == NULL) + { + /* No, this isn't the case. Now determine from + the repertoire the name of the character and + find it in the charmap. */ + if (repertoire != NULL) + { + const char *symbol; + + symbol = repertoire_find_symbol (repertoire, wch); + + if (symbol != NULL) + seq = charmap_find_value (charmap, symbol, + strlen (symbol)); + } + + if (seq == NULL) + { +#ifndef NO_TRANSLITERATION + /* Transliterate if possible. */ + if (locale != NULL) + { + uint32_t *translit; + + if ((locale->avail & CTYPE_LOCALE) == 0) + { + /* Load the CTYPE data now. */ + int old_needed = locale->needed; + + locale->needed = 0; + locale = load_locale (LC_CTYPE, + locale->name, + locale->repertoire_name, + charmap, locale); + locale->needed = old_needed; + } + + if ((locale->avail & CTYPE_LOCALE) != 0 + && ((translit = find_translit (locale, + charmap, wch)) + != NULL)) + /* The CTYPE data contains a matching + transliteration. */ + { + int i; + + for (i = 0; translit[i] != 0; ++i) + { + char utmp[10]; + + snprintf (utmp, sizeof (utmp), "U%08X", + translit[i]); + seq = charmap_find_value (charmap, utmp, + 9); + assert (seq != NULL); + ADDS (seq->bytes, seq->nbytes); + } + + continue; + } + } +#endif /* NO_TRANSLITERATION */ + + /* Not a known name. */ + illegal_string = 1; + } + } + + if (seq != NULL) + ADDS (seq->bytes, seq->nbytes); + + continue; + } + } + + /* We now have the symbolic name in buf[startidx] to + buf[bufact-1]. Now find out the value for this character + in the charmap as well as in the repertoire map (in this + order). */ + seq = charmap_find_value (charmap, &buf[startidx], + bufact - startidx); + + if (seq == NULL) + { + /* This name is not in the charmap. */ + lr_error (lr, _("symbol `%.*s' not in charmap"), + (int) (bufact - startidx), &buf[startidx]); + illegal_string = 1; + } + + if (return_widestr) + { + /* Now the same for the multibyte representation. */ + if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) + wch = seq->ucs4; + else + { + wch = repertoire_find_value (repertoire, &buf[startidx], + bufact - startidx); + if (seq != NULL) + seq->ucs4 = wch; + } + + if (wch == ILLEGAL_CHAR_VALUE) + { + /* This name is not in the repertoire map. */ + lr_error (lr, _("symbol `%.*s' not in repertoire map"), + (int) (bufact - startidx), &buf[startidx]); + illegal_string = 1; + } + else + ADDWC (wch); + } + + /* Now forget about the name we just added. */ + bufact = startidx; + + /* And copy the bytes. */ + if (seq != NULL) + ADDS (seq->bytes, seq->nbytes); + } + + if (ch == '\n' || ch == EOF) + { + lr_error (lr, _("unterminated string")); + illegal_string = 1; + } + + if (illegal_string) + { + free (buf); + free (buf2); + lr->token.val.str.startmb = NULL; + lr->token.val.str.lenmb = 0; + lr->token.val.str.startwc = NULL; + lr->token.val.str.lenwc = 0; + + return &lr->token; + } + + ADDC ('\0'); + + if (return_widestr) + { + ADDWC (0); + lr->token.val.str.startwc = xrealloc (buf2, + buf2act * sizeof (uint32_t)); + lr->token.val.str.lenwc = buf2act; + } + } + + lr->token.val.str.startmb = xrealloc (buf, bufact); + lr->token.val.str.lenmb = bufact; + + return &lr->token; +} |