summary refs log tree commit diff
path: root/locale/programs/linereader.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>1999-08-31 07:04:41 +0000
committerUlrich Drepper <drepper@redhat.com>1999-08-31 07:04:41 +0000
commit4b10dd6c1959577f57850ca427a94fe22b9f3299 (patch)
treeb385d9b27e5a40d5baf7cd7e27c7cc5ef7129b5b /locale/programs/linereader.c
parent1d1740d6b12894ed6a430e2e98bf73c5243b2925 (diff)
downloadglibc-4b10dd6c1959577f57850ca427a94fe22b9f3299.tar.gz
glibc-4b10dd6c1959577f57850ca427a94fe22b9f3299.tar.xz
glibc-4b10dd6c1959577f57850ca427a94fe22b9f3299.zip
Update.
	* locale/Makefile (distribute): Add iso-639.def and iso-3166.def.
	Change charset.h to charmap.h.
	(categories): Add new categories.  Leave out collate for now.
	Update build rules.
	* locale/categories.def: Add definitions for new categories.
	* locale/langinfo.h: Likewise.
	* locale/locale.h: Likewise.
	* locale/C-address.c: New file.
	* locale/C-identification.c: New file.
	* locale/C-measurement.c: New file.
	* locale/C-name.c: New file.
	* locale/C-paper.c: New file.
	* locale/C-telephone.c: New file.
	* locale/lc-address.c: Likewise.
	* locale/lc-identification.c: Likewise.
	* locale/lc-measurement.c: Likewise.
	* locale/lc-name.c: Likewise.
	* locale/lc-paper.c: Likewise.
	* locale/lc-telephone.c: Likewise.
	* locale/C-ctype.c: Update for locale rewrite.
	* locale/C-messages.c: Likewise.
	* locale/C-monetary.c: Likewise.
	* locale/C-time.c: Likewise.
	* locale/lc-collate.c: Likewise.
	* locale/lc-ctype.c: Likewise.
	* locale/lc-monetary.c: Likewise.
	* locale/lc-time.c: Likewise.
	* locale/localeinfo.h: Likewise.
	* locale/newlocale.c: Likewise.
	* locale/setlocale.c: Likewise.
	* locale/weight.h: Likewise.
	* locale/findlocale.c: Unconditionally use mmap.
	Handle new categories.
	* locale/loadlocale.c: Likewise.
	* locale/iso-3166.def: New file.
	* locale/iso-639.def: New file.
	* locale/programs/charmap-kw.gperf: Add new keywords.
	* locale/programs/locfile-kw.gperf: Likewise.
	* locale/programs/locfile-token.h: Define new tokens.
	* locale/programs/charmap.c: Rewrite to handle multibyte charsets.
	* locale/programs/charmap.h: New file.
	* locale/programs/charset.h: Removed.
	* locale/programs/config.h: Add __LC_LAST.
	* locale/programs/lc-address.c: New file.
	* locale/programs/lc-identification.c: New file.
	* locale/programs/lc-measurement.c: New file.
	* locale/programs/lc-name.c: New file.
	* locale/programs/lc-paper.c: New file.
	* locale/programs/lc-telephone.c: New file.
	* locale/programs/lc-collate.c: Update for locale rewrite.
	* locale/programs/lc-ctype.c: Likewise.
	* locale/programs/lc-messages.c: Likewise.
	* locale/programs/lc-monetary.c: Likewise.
	* locale/programs/lc-numeric.c: Likewise.
	* locale/programs/lc-time.c: Likewise.
	* locale/programs/locale.c: Likewise.
	* locale/programs/localedef.c: Likewise.
	* locale/programs/locfile.c: Likewise.
	* locale/programs/repertoire.c: Likewise.
	* locale/programs/repertoire.h: Likewise.
	* locale/programs/locfile.c: Update prototypes.
	Update handle_copy definition.
	* locale/programs/linereader.c: Add handling of wide char strings and
	new definition file syntax.
	* locale/programs/linereader.h (struct token): Add elements for wide
	character strings.
	* locale/programs/locale-spec.c: Disable handling of collation
	elements for now.
	* locale/programs/simple-hash.h: Cleanup.
	* locale/programs/stringtrans.h: Handle quite of end of line.
	* string/strcoll.c: Fall back on strcmp for now.
	* string/strxfrm.c: Fall back on strncpy/strlen for now.
	* time/strftime.c: Use new wide character data for wcsftime.
	* time/strptime.c: Remove _nl_C_LC_TIME declaration.
	* wctype/cname-lookup.h: Update for new LC_CTYPE data.
Diffstat (limited to 'locale/programs/linereader.c')
-rw-r--r--locale/programs/linereader.c452
1 files changed, 321 insertions, 131 deletions
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index 31278d63c2..99ed0f2480 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
+   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public License as
@@ -28,22 +28,20 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "charmap.h"
 #include "error.h"
 #include "linereader.h"
-#include "charset.h"
+#include "localedef.h"
 #include "stringtrans.h"
 
 
-void *xmalloc (size_t __n);
-void *xrealloc (void *__p, size_t __n);
-char *xstrdup (const char *__str);
-
-
+/* Prototypes for local functions.  */
 static struct token *get_toplvl_escape (struct linereader *lr);
 static struct token *get_symname (struct linereader *lr);
 static struct token *get_ident (struct linereader *lr);
 static struct token *get_string (struct linereader *lr,
-				 const struct charset_t *charset);
+				 const struct charmap_t *charmap,
+				 const struct repertoire_t *repertoire);
 
 
 struct linereader *
@@ -126,9 +124,14 @@ lr_next (struct linereader *lr)
 
   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
     {
+#if 0
+      /* XXX Is this correct?  */
       /* An escaped newline character is substituted with a single <SP>.  */
       --n;
       lr->buf[n - 1] = ' ';
+#else
+      n -= 2;
+#endif
     }
 
   lr->buf[n] = '\0';
@@ -149,7 +152,8 @@ extern char *program_name;
 
 
 struct token *
-lr_token (struct linereader *lr, const struct charset_t *charset)
+lr_token (struct linereader *lr, const struct charmap_t *charmap,
+	  const struct repertoire_t *repertoire)
 {
   int ch;
 
@@ -193,12 +197,29 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
     return get_toplvl_escape (lr);
 
   /* Match ellipsis.  */
-  if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+  if (ch == '.')
     {
-      lr_getc (lr);
-      lr_getc (lr);
-      lr->token.tok = tok_ellipsis;
-      return &lr->token;
+      if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
+	{
+	  lr_getc (lr);
+	  lr_getc (lr);
+	  lr_getc (lr);
+	  lr->token.tok = tok_ellipsis4;
+	  return &lr->token;
+	}
+      if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+	{
+	  lr_getc (lr);
+	  lr_getc (lr);
+	  lr->token.tok = tok_ellipsis3;
+	  return &lr->token;
+	}
+      if (lr->buf[lr->idx] == '.')
+	{
+	  lr_getc (lr);
+	  lr->token.tok = tok_ellipsis2;
+	  return &lr->token;
+	}
     }
 
   switch (ch)
@@ -238,7 +259,7 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
       return &lr->token;
 
     case '"':
-      return get_string (lr, charset);
+      return get_string (lr, charmap, repertoire);
 
     case '-':
       ch = lr_getc (lr);
@@ -261,7 +282,7 @@ get_toplvl_escape (struct linereader *lr)
   /* This is supposed to be a numeric value.  We return the
      numerical value and the number of bytes.  */
   size_t start_idx = lr->idx - 1;
-  unsigned int value = 0;
+  char *bytes = lr->token.val.charcode.bytes;
   int nbytes = 0;
   int ch;
 
@@ -287,11 +308,11 @@ get_toplvl_escape (struct linereader *lr)
 	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 	{
 	esc_error:
-	  lr->token.val.str.start = &lr->buf[start_idx];
+	  lr->token.val.str.startmb = &lr->buf[start_idx];
 
 	  while (ch != EOF && !isspace (ch))
 	    ch = lr_getc (lr);
-	  lr->token.val.str.len = lr->idx - start_idx;
+	  lr->token.val.str.lenmb = lr->idx - start_idx;
 
 	  lr->token.tok = tok_error;
 	  return &lr->token;
@@ -300,7 +321,7 @@ get_toplvl_escape (struct linereader *lr)
       if (isdigit (ch))
 	byte = ch - '0';
       else
-	byte = _tolower (ch) - 'a' + 10;
+	byte = tolower (ch) - 'a' + 10;
 
       ch = lr_getc (lr);
       if ((base == 16 && !isxdigit (ch))
@@ -311,7 +332,7 @@ get_toplvl_escape (struct linereader *lr)
       if (isdigit (ch))
 	byte += ch - '0';
       else
-	byte += _tolower (ch) - 'a' + 10;
+	byte += tolower (ch) - 'a' + 10;
 
       ch = lr_getc (lr);
       if (base != 16 && isdigit (ch))
@@ -322,10 +343,7 @@ get_toplvl_escape (struct linereader *lr)
 	  ch = lr_getc (lr);
 	}
 
-      value *= 256;
-      value += byte;
-
-      ++nbytes;
+      bytes[nbytes++] = byte;
     }
   while (ch == lr->escape_char && nbytes < 4);
 
@@ -335,23 +353,52 @@ get_toplvl_escape (struct linereader *lr)
   lr_ungetn (lr, 1);
 
   lr->token.tok = tok_charcode;
-  lr->token.val.charcode.val = value;
   lr->token.val.charcode.nbytes = nbytes;
 
   return &lr->token;
 }
 
 
-#define ADDC(ch)							    \
-  do									    \
-    {									    \
-      if (bufact == bufmax)						    \
-	{								    \
-	  bufmax *= 2;							    \
-	  buf = xrealloc (buf, bufmax);					    \
-	}								    \
-      buf[bufact++] = (ch);						    \
-    }									    \
+#define ADDC(ch) \
+  do									      \
+    {									      \
+      if (bufact == bufmax)						      \
+	{								      \
+	  bufmax *= 2;							      \
+	  buf = xrealloc (buf, bufmax);					      \
+	}								      \
+      buf[bufact++] = (ch);						      \
+    }									      \
+  while (0)
+
+
+#define ADDS(s, l) \
+  do									      \
+    {									      \
+      size_t _l = (l);							      \
+      if (bufact + _l > bufmax)						      \
+	{								      \
+	  if (bufact < _l)						      \
+	    bufact = _l;						      \
+	  bufmax *= 2;							      \
+	  buf = xrealloc (buf, bufmax);					      \
+	}								      \
+      memcpy (&buf[bufact], s, _l);					      \
+      bufact += _l;							      \
+    }									      \
+  while (0)
+
+
+#define ADDWC(ch) \
+  do									      \
+    {									      \
+      if (buf2act == buf2max)						      \
+	{								      \
+	  buf2max *= 2;							      \
+	  buf2 = xrealloc (buf2, buf2max * 4);				      \
+	}								      \
+      buf2[buf2act++] = (ch);						      \
+    }									      \
   while (0)
 
 
@@ -399,9 +446,8 @@ get_symname (struct linereader *lr)
       if (cp == &buf[bufact - 1])
 	{
 	  /* Yes, it is.  */
-	  lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
-	  lr->token.val.charcode.val = strtoul (buf, NULL, 16);
-	  lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
+	  lr->token.tok = tok_ucs4;
+	  lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
 
 	  return &lr->token;
 	}
@@ -422,8 +468,8 @@ get_symname (struct linereader *lr)
       buf[bufact] = '\0';
       buf = xrealloc (buf, bufact + 1);
 
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact - 1;
+      lr->token.val.str.startmb = buf;
+      lr->token.val.str.lenmb = bufact - 1;
     }
 
   return &lr->token;
@@ -446,8 +492,18 @@ get_ident (struct linereader *lr)
 
   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 	 && ch != '<' && ch != ',')
-    /* XXX Handle escape sequences?  */
-    ADDC (ch);
+    {
+      if (ch == lr->escape_char)
+	{
+	  ch = lr_getc (lr);
+	  if (ch == '\n' || ch == EOF)
+	    {
+	      lr_error (lr, _("invalid escape sequence"));
+	      break;
+	    }
+	}
+      ADDC (ch);
+    }
 
   lr_ungetn (lr, 1);
 
@@ -465,8 +521,8 @@ get_ident (struct linereader *lr)
       buf[bufact] = '\0';
       buf = xrealloc (buf, bufact + 1);
 
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact;
+      lr->token.val.str.startmb = buf;
+      lr->token.val.str.lenmb = bufact;
     }
 
   return &lr->token;
@@ -474,113 +530,247 @@ get_ident (struct linereader *lr)
 
 
 static struct token *
-get_string (struct linereader *lr, const struct charset_t *charset)
+get_string (struct linereader *lr, const struct charmap_t *charmap,
+	    const struct repertoire_t *repertoire)
 {
-  int illegal_string = 0;
-  char *buf, *cp;
+  int return_widestr = lr->return_widestr;
+  char *buf;
+  char *buf2 = NULL;
   size_t bufact;
   size_t bufmax = 56;
-  int ch;
 
+  /* We must return two different strings.  */
   buf = xmalloc (bufmax);
   bufact = 0;
 
-  while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
-    if (ch != '<' || charset == NULL)
-      {
-	if (ch == lr->escape_char)
-	  {
-	    ch = lr_getc (lr);
-	    if (ch == '\n' || ch == EOF)
-	      break;
-	  }
+  /* We know it'll be a string.  */
+  lr->token.tok = tok_string;
+
+  /* If we need not translate the strings (i.e., expand <...> parts)
+     we can run a simple loop.  */
+  if (!lr->translate_strings)
+    {
+      int ch;
+
+      buf2 = NULL;
+      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 	ADDC (ch);
-      }
-    else
-      {
-	/* We have to get the value of the symbol.  */
-	unsigned int value;
-	size_t startidx = bufact;
-
-	if (!lr->translate_strings)
-	  ADDC ('<');
-
-	while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
-	  {
-	    if (ch == lr->escape_char)
-	      {
-		ch = lr_getc (lr);
-		if (ch == '\n' || ch == EOF)
-		  break;
-	      }
-	    ADDC (ch);
-	  }
-
-	if (ch == '\n' || ch == EOF)
-	  lr_error (lr, _("unterminated string"));
-	else
-	  if (!lr->translate_strings)
-	    ADDC ('>');
-
-	if (lr->translate_strings)
-	  {
-	    value = charset_find_value (&charset->char_table, &buf[startidx],
-					bufact - startidx);
-	    if ((wchar_t) value == ILLEGAL_CHAR_VALUE)
+
+      /* Catch errors with trailing escape character.  */
+      if (bufact > 0 && buf[bufact - 1] == lr->escape_char
+	  && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
+	{
+	  lr_error (lr, _("illegal escape sequence at end of string"));
+	  --bufact;
+	}
+      else if (ch == '\n' || ch == EOF)
+	lr_error (lr, _("unterminated string"));
+
+      ADDC ('\0');
+    }
+  else
+    {
+      int illegal_string = 0;
+      size_t buf2act = 0;
+      size_t buf2max = 56 * sizeof (uint32_t);
+      int ch;
+      int warned = 0;
+
+      /* We have to provide the wide character result as well.  */
+      if (return_widestr)
+	buf2 = xmalloc (buf2max);
+
+      /* Read until the end of the string (or end of the line or file).  */
+      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
+	{
+	  size_t startidx;
+	  uint32_t wch;
+	  struct charseq *seq;
+
+	  if (ch != '<')
+	    {
+	      /* The standards leave it up to the implementation to decide
+		 what to do with character which stand for themself.  We
+		 could jump through hoops to find out the value relative to
+		 the charmap and the repertoire map, but instead we leave
+		 it up to the locale definition author to write a better
+		 definition.  We assume here that every character which
+		 stands for itself is encoded using ISO 8859-1.  Using the
+		 escape character is allowed.  */
+	      if (ch == lr->escape_char)
+		{
+		  ch = lr_getc (lr);
+		  if (ch == '\n' || ch == EOF)
+		    break;
+		}
+
+	      if (verbose && !warned)
+		{
+		  lr_error (lr, _("\
+non-symbolic character value should not be used"));
+		  warned = 1;
+		}
+
+	      ADDC (ch);
+	      if (return_widestr)
+		ADDWC ((uint32_t) ch);
+
+	      continue;
+	    }
+
+	  /* Now we have to search for the end of the symbolic name, i.e.,
+	     the closing '>'.  */
+	  startidx = bufact;
+	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
+	    {
+	      if (ch == lr->escape_char)
+		{
+		  ch = lr_getc (lr);
+		  if (ch == '\n' || ch == EOF)
+		    break;
+		}
+	      ADDC (ch);
+	    }
+	  if (ch == '\n' || ch == EOF)
+	    /* Not a correct string.  */
+	    break;
+	  if (bufact == startidx)
+	    {
+	      /* <> is no correct name.  Ignore it and also signal an
+		 error.  */
 	      illegal_string = 1;
-	    bufact = startidx;
+	      continue;
+	    }
 
-	    if (bufmax - bufact < 8)
-	      {
-		bufmax *= 2;
-		buf = (char *) xrealloc (buf, bufmax);
-	      }
+	  /* It might be a Uxxxx symbol.  */
+	  if (buf[startidx] == 'U'
+	      && (bufact - startidx == 5 || bufact - startidx == 9))
+	    {
+	      char *cp = buf + startidx + 1;
+	      while (cp < &buf[bufact] && isxdigit (*cp))
+		++cp;
+
+	      if (cp == &buf[bufact])
+		{
+		  const char *symbol = NULL;
+
+		  /* Yes, it is.  */
+		  ADDC ('\0');
+		  wch = strtoul (buf + startidx + 1, NULL, 16);
+
+		  /* Now forget about the name we just added.  */
+		  bufact = startidx;
+
+		  if (return_widestr)
+		    ADDWC (wch);
+
+		  /* Now determine from the repertoire the name of the
+		     character and find it in the charmap.  */
+		  if (repertoire != NULL)
+		    symbol = repertoire_find_symbol (repertoire, wch);
+
+		  if (symbol == NULL)
+		    {
+		      /* We cannot generate a string since we cannot map
+			 from the Unicode number to the character symbol.  */
+		      lr_error (lr,
+			        _("character <U%0*X> not in repertoire map"),
+				wch > 0xffff ? 8 : 4, wch);
+
+		      illegal_string = 1;
+		    }
+		  else
+		    {
+		      seq = charmap_find_value (charmap, symbol,
+						strlen (symbol));
+
+		      if (seq == NULL)
+			{
+			  /* Not a known name.  */
+			  lr_error (lr,
+				    _("symbol `%s' not in charmap"), symbol);
+			  illegal_string = 1;
+			}
+		      else
+		        ADDS (seq->bytes, seq->nbytes);
+		    }
+
+		  continue;
+		}
+	    }
+
+	  if (return_widestr)
+	    {
+	      /* We now have the symbolic name in buf[startidx] to
+		 buf[bufact-1].  Now find out the value for this
+		 character in the repertoire map as well as in the
+		 charmap (in this order).  */
+	      wch = repertoire_find_value (repertoire, &buf[startidx],
+					   bufact - startidx);
+	      if (wch == ILLEGAL_CHAR_VALUE)
+		{
+		  /* This name is not in the repertoire map.  */
+		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
+			    bufact - startidx, &buf[startidx]);
+		  illegal_string = 1;
+		}
+	      else
+		ADDWC (wch);
+	    }
+
+	  /* Now the same for the multibyte representation.  */
+	  seq = charmap_find_value (charmap, &buf[startidx],
+				    bufact - startidx);
 
-	    cp = &buf[bufact];
-	    if (encode_char (value, &cp))
+	  if (seq == NULL)
+	    {
+	      /* This name is not in the charmap.  */
+	      lr_error (lr, _("symbol `%.*s' not in charmap"),
+			bufact - startidx, &buf[startidx]);
 	      illegal_string = 1;
 
-	    bufact = cp - buf;
-	  }
-      }
+	      /* Now forget about the name we just added.  */
+	      bufact = startidx;
+	    }
+	  else
+	    {
+	      /* Now forget about the name we just added.  */
+	      bufact = startidx;
 
-  /* Catch errors with trailing escape character.  */
-  if (bufact > 0 && buf[bufact - 1] == lr->escape_char
-      && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
-    {
-      lr_error (lr, _("illegal escape sequence at end of string"));
-      --bufact;
-    }
-  else if (ch == '\n' || ch == EOF)
-    lr_error (lr, _("unterminated string"));
+	      ADDS (seq->bytes, seq->nbytes);
+	    }
+	}
 
-  /* Terminate string if necessary.  */
-  if (lr->translate_strings)
-    {
-      cp = &buf[bufact];
-      if (encode_char (0, &cp))
-	illegal_string = 1;
+      if (ch == '\n' || ch == EOF)
+	{
+	  lr_error (lr, _("unterminated string"));
+	  illegal_string = 1;
+	}
 
-      bufact = cp - buf;
-    }
-  else
-    ADDC ('\0');
+      if (illegal_string)
+	{
+	  free (buf);
+	  if (buf2 != NULL)
+	    free (buf2);
+	  lr->token.val.str.startmb = NULL;
+	  lr->token.val.str.lenmb = 0;
 
-  lr->token.tok = tok_string;
+	  return &lr->token;
+	}
 
-  if (illegal_string)
-    {
-      free (buf);
-      lr->token.val.str.start = NULL;
-      lr->token.val.str.len = 0;
-    }
-  else
-    {
-      buf = xrealloc (buf, bufact + 1);
+      ADDC ('\0');
 
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact;
+      if (return_widestr)
+	{
+	  ADDWC (0);
+	  lr->token.val.str.startwc = xrealloc (buf2,
+						buf2act * sizeof (uint32_t));
+	  lr->token.val.str.lenwc = buf2act;
+	}
     }
 
+  lr->token.val.str.startmb = xrealloc (buf, bufact);
+  lr->token.val.str.lenmb = bufact;
+
   return &lr->token;
 }