about summary refs log tree commit diff
path: root/REORG.TODO/locale/programs/charmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/locale/programs/charmap.c')
-rw-r--r--REORG.TODO/locale/programs/charmap.c1104
1 files changed, 1104 insertions, 0 deletions
diff --git a/REORG.TODO/locale/programs/charmap.c b/REORG.TODO/locale/programs/charmap.c
new file mode 100644
index 0000000000..129aefffc1
--- /dev/null
+++ b/REORG.TODO/locale/programs/charmap.c
@@ -0,0 +1,1104 @@
+/* Copyright (C) 1996-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <ctype.h>
+#include <errno.h>
+#include <libintl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <error.h>
+#include <stdint.h>
+
+#include "localedef.h"
+#include "linereader.h"
+#include "charmap.h"
+#include "charmap-dir.h"
+
+#include <assert.h>
+
+
+/* Define the lookup function.  */
+#include "charmap-kw.h"
+
+
+/* Prototypes for local functions.  */
+static struct charmap_t *parse_charmap (struct linereader *cmfile,
+					int verbose, int be_quiet);
+static void new_width (struct linereader *cmfile, struct charmap_t *result,
+		       const char *from, const char *to,
+		       unsigned long int width);
+static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
+			      size_t nbytes, unsigned char *bytes,
+			      const char *from, const char *to,
+			      int decimal_ellipsis, int step);
+
+
+bool enc_not_ascii_compatible;
+
+
+#ifdef NEED_NULL_POINTER
+static const char *null_pointer;
+#endif
+
+static struct linereader *
+cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
+{
+  FILE *fp;
+
+  fp = charmap_open (directory, name);
+  if (fp == NULL)
+    return NULL;
+  else
+    {
+      size_t dlen = strlen (directory);
+      int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
+      size_t nlen = strlen (name);
+      char *pathname;
+      char *p;
+
+      pathname = alloca (dlen + add_slash + nlen + 1);
+      p = stpcpy (pathname, directory);
+      if (add_slash)
+	*p++ = '/';
+      stpcpy (p, name);
+
+      return lr_create (fp, pathname, hf);
+    }
+}
+
+struct charmap_t *
+charmap_read (const char *filename, int verbose, int error_not_found,
+	      int be_quiet, int use_default)
+{
+  struct charmap_t *result = NULL;
+
+  if (filename != NULL)
+    {
+      struct linereader *cmfile;
+
+      /* First try the name as found in the parameter.  */
+      cmfile = lr_open (filename, charmap_hash);
+      if (cmfile == NULL)
+	{
+	  /* No successful.  So start looking through the directories
+	     in the I18NPATH if this is a simple name.  */
+	  if (strchr (filename, '/') == NULL)
+	    {
+	      char *i18npath = getenv ("I18NPATH");
+	      if (i18npath != NULL && *i18npath != '\0')
+		{
+		  const size_t pathlen = strlen (i18npath);
+		  char i18npathbuf[pathlen + 1];
+		  char path[pathlen + sizeof ("/charmaps")];
+		  char *next;
+		  i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
+
+		  while (cmfile == NULL
+			 && (next = strsep (&i18npath, ":")) != NULL)
+		    {
+		      stpcpy (stpcpy (path, next), "/charmaps");
+		      cmfile = cmlr_open (path, filename, charmap_hash);
+
+		      if (cmfile == NULL)
+			/* Try without the "/charmaps" part.  */
+			cmfile = cmlr_open (next, filename, charmap_hash);
+		    }
+		}
+
+	      if (cmfile == NULL)
+		/* Try the default directory.  */
+		cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
+	    }
+	}
+
+      if (cmfile != NULL)
+	result = parse_charmap (cmfile, verbose, be_quiet);
+
+      if (result == NULL && error_not_found)
+	WITH_CUR_LOCALE (error (0, errno, _("\
+character map file `%s' not found"), filename));
+    }
+
+  if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
+    {
+      /* OK, one more try.  We also accept the names given to the
+	 character sets in the files.  Sometimes they differ from the
+	 file name.  */
+      CHARMAP_DIR *dir;
+
+      dir = charmap_opendir (CHARMAP_PATH);
+      if (dir != NULL)
+	{
+	  const char *dirent;
+
+	  while ((dirent = charmap_readdir (dir)) != NULL)
+	    {
+	      char **aliases;
+	      char **p;
+	      int found;
+
+	      aliases = charmap_aliases (CHARMAP_PATH, dirent);
+	      found = 0;
+	      for (p = aliases; *p; p++)
+		if (strcasecmp (*p, filename) == 0)
+		  {
+		    found = 1;
+		    break;
+		  }
+	      charmap_free_aliases (aliases);
+
+	      if (found)
+		{
+		  struct linereader *cmfile;
+
+		  cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
+		  if (cmfile != NULL)
+		    result = parse_charmap (cmfile, verbose, be_quiet);
+
+		  break;
+		}
+	    }
+
+	  charmap_closedir (dir);
+	}
+    }
+
+  if (result == NULL && DEFAULT_CHARMAP != NULL)
+    {
+      struct linereader *cmfile;
+
+      cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
+      if (cmfile != NULL)
+	result = parse_charmap (cmfile, verbose, be_quiet);
+
+      if (result == NULL)
+	WITH_CUR_LOCALE (error (4, errno, _("\
+default character map file `%s' not found"), DEFAULT_CHARMAP));
+    }
+
+  if (result != NULL && result->code_set_name == NULL)
+    /* The input file does not specify a code set name.  This
+       shouldn't happen but we should cope with it.  */
+    result->code_set_name = basename (filename);
+
+  /* Test of ASCII compatibility of locale encoding.
+
+     Verify that the encoding to be used in a locale is ASCII compatible,
+     at least for the graphic characters, excluding the control characters,
+     '$' and '@'.  This constraint comes from an ISO C 99 restriction.
+
+     ISO C 99 section 7.17.(2) (about wchar_t):
+       the null character shall have the code value zero and each member of
+       the basic character set shall have a code value equal to its value
+       when used as the lone character in an integer character constant.
+     ISO C 99 section 5.2.1.(3):
+       Both the basic source and basic execution character sets shall have
+       the following members: the 26 uppercase letters of the Latin alphabet
+            A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
+       the 26 lowercase letters of the Latin alphabet
+            a b c d e f g h i j k l m n o p q r s t u v w x y z
+       the 10 decimal digits
+            0 1 2 3 4 5 6 7 8 9
+       the following 29 graphic characters
+            ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
+       the space character, and control characters representing horizontal
+       tab, vertical tab, and form feed.
+
+     Therefore, for all members of the "basic character set", the 'char' code
+     must have the same value as the 'wchar_t' code, which in glibc is the
+     same as the Unicode code, which for all of the enumerated characters
+     is identical to the ASCII code. */
+  if (result != NULL && use_default)
+    {
+      static const char basic_charset[] =
+	{
+	  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+	  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+	  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
+	  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+	  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+	  '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
+	  '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
+	  '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
+	};
+      int failed = 0;
+      const char *p = basic_charset;
+
+      do
+	{
+	  struct charseq *seq = charmap_find_symbol (result, p, 1);
+
+	  if (seq == NULL || seq->ucs4 != (uint32_t) *p)
+	    failed = 1;
+	}
+      while (*p++ != '\0');
+
+      if (failed)
+	{
+	  WITH_CUR_LOCALE (fprintf (stderr, _("\
+character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
+				    result->code_set_name));
+	  enc_not_ascii_compatible = true;
+	}
+    }
+
+  return result;
+}
+
+
+static struct charmap_t *
+parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
+{
+  struct charmap_t *result;
+  int state;
+  enum token_t expected_tok = tok_error;
+  const char *expected_str = NULL;
+  char *from_name = NULL;
+  char *to_name = NULL;
+  enum token_t ellipsis = 0;
+  int step = 1;
+
+  /* We don't want symbolic names in string to be translated.  */
+  cmfile->translate_strings = 0;
+
+  /* Allocate room for result.  */
+  result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
+  memset (result, '\0', sizeof (struct charmap_t));
+  /* The default DEFAULT_WIDTH is 1.  */
+  result->width_default = 1;
+
+#define obstack_chunk_alloc malloc
+#define obstack_chunk_free free
+  obstack_init (&result->mem_pool);
+
+  if (init_hash (&result->char_table, 256)
+      || init_hash (&result->byte_table, 256))
+    {
+      free (result);
+      return NULL;
+    }
+
+  /* We use a state machine to describe the charmap description file
+     format.  */
+  state = 1;
+  while (1)
+    {
+      /* What's on?  */
+      struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
+      enum token_t nowtok = now->tok;
+      struct token *arg;
+
+      if (nowtok == tok_eof)
+	break;
+
+      switch (state)
+	{
+	case 1:
+	  /* The beginning.  We expect the special declarations, EOL or
+	     `CHARMAP'.  */
+	  if (nowtok == tok_eol)
+	    /* Ignore empty lines.  */
+	    continue;
+
+	  if (nowtok == tok_charmap)
+	    {
+	      from_name = NULL;
+	      to_name = NULL;
+
+	      /* We have to set up the real work.  Fill in some
+		 default values.  */
+	      if (result->mb_cur_max == 0)
+		result->mb_cur_max = 1;
+	      if (result->mb_cur_min == 0)
+		result->mb_cur_min = result->mb_cur_max;
+	      if (result->mb_cur_min > result->mb_cur_max)
+		{
+		  if (!be_quiet)
+		    WITH_CUR_LOCALE (error (0, 0, _("\
+%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
+					    cmfile->fname));
+
+		  result->mb_cur_min = result->mb_cur_max;
+		}
+
+	      lr_ignore_rest (cmfile, 1);
+
+	      state = 2;
+	      continue;
+	    }
+
+	  if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
+	      && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
+	      && nowtok != tok_comment_char && nowtok != tok_g0esc
+	      && nowtok != tok_g1esc && nowtok != tok_g2esc
+	      && nowtok != tok_g3esc && nowtok != tok_repertoiremap
+	      && nowtok != tok_include)
+	    {
+	      lr_error (cmfile, _("syntax error in prolog: %s"),
+			_("invalid definition"));
+
+	      lr_ignore_rest (cmfile, 0);
+	      continue;
+	    }
+
+	  /* We know that we need an argument.  */
+	  arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
+
+	  switch (nowtok)
+	    {
+	    case tok_code_set_name:
+	    case tok_repertoiremap:
+	      if (arg->tok != tok_ident && arg->tok != tok_string)
+		{
+		badarg:
+		  lr_error (cmfile, _("syntax error in prolog: %s"),
+			    _("bad argument"));
+
+		  lr_ignore_rest (cmfile, 0);
+		  continue;
+		}
+
+	      if (nowtok == tok_code_set_name)
+		result->code_set_name = obstack_copy0 (&result->mem_pool,
+						       arg->val.str.startmb,
+						       arg->val.str.lenmb);
+	      else
+		result->repertoiremap = obstack_copy0 (&result->mem_pool,
+						       arg->val.str.startmb,
+						       arg->val.str.lenmb);
+
+	      lr_ignore_rest (cmfile, 1);
+	      continue;
+
+	    case tok_mb_cur_max:
+	    case tok_mb_cur_min:
+	      if (arg->tok != tok_number)
+		goto badarg;
+
+	      if (verbose
+		  && ((nowtok == tok_mb_cur_max
+		       && result->mb_cur_max != 0)
+		      || (nowtok == tok_mb_cur_max
+			  && result->mb_cur_max != 0)))
+		lr_error (cmfile, _("duplicate definition of <%s>"),
+			  nowtok == tok_mb_cur_min
+			  ? "mb_cur_min" : "mb_cur_max");
+
+	      if (arg->val.num < 1)
+		{
+		  lr_error (cmfile,
+			    _("value for <%s> must be 1 or greater"),
+			    nowtok == tok_mb_cur_min
+			    ? "mb_cur_min" : "mb_cur_max");
+
+		  lr_ignore_rest (cmfile, 0);
+		  continue;
+		}
+	      if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
+		   && (int) arg->val.num < result->mb_cur_min)
+		  || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
+		      && (int) arg->val.num > result->mb_cur_max))
+		{
+		  lr_error (cmfile, _("\
+value of <%s> must be greater or equal than the value of <%s>"),
+			    "mb_cur_max", "mb_cur_min");
+
+		  lr_ignore_rest (cmfile, 0);
+		  continue;
+		}
+
+	      if (nowtok == tok_mb_cur_max)
+		result->mb_cur_max = arg->val.num;
+	      else
+		result->mb_cur_min = arg->val.num;
+
+	      lr_ignore_rest (cmfile, 1);
+	      continue;
+
+	    case tok_escape_char:
+	    case tok_comment_char:
+	      if (arg->tok != tok_ident)
+		goto badarg;
+
+	      if (arg->val.str.lenmb != 1)
+		{
+		  lr_error (cmfile, _("\
+argument to <%s> must be a single character"),
+			    nowtok == tok_escape_char ? "escape_char"
+						      : "comment_char");
+
+		  lr_ignore_rest (cmfile, 0);
+		  continue;
+		}
+
+	      if (nowtok == tok_escape_char)
+		cmfile->escape_char = *arg->val.str.startmb;
+	      else
+		cmfile->comment_char = *arg->val.str.startmb;
+
+	      lr_ignore_rest (cmfile, 1);
+	      continue;
+
+	    case tok_g0esc:
+	    case tok_g1esc:
+	    case tok_g2esc:
+	    case tok_g3esc:
+	    case tok_escseq:
+	      lr_ignore_rest (cmfile, 0); /* XXX */
+	      continue;
+
+	    case tok_include:
+	      lr_error (cmfile, _("\
+character sets with locking states are not supported"));
+	      exit (4);
+
+	    default:
+	      /* Cannot happen.  */
+	      assert (! "Should not happen");
+	    }
+	  break;
+
+	case 2:
+	  /* We have seen `CHARMAP' and now are in the body.  Each line
+	     must have the format "%s %s %s\n" or "%s...%s %s %s\n".  */
+	  if (nowtok == tok_eol)
+	    /* Ignore empty lines.  */
+	    continue;
+
+	  if (nowtok == tok_end)
+	    {
+	      expected_tok = tok_charmap;
+	      expected_str = "CHARMAP";
+	      state = 90;
+	      continue;
+	    }
+
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"CHARMAP", _("no symbolic name given"));
+
+	      lr_ignore_rest (cmfile, 0);
+	      continue;
+	    }
+
+	  /* If the previous line was not completely correct free the
+	     used memory.  */
+	  if (from_name != NULL)
+	    obstack_free (&result->mem_pool, from_name);
+
+	  if (nowtok == tok_bsymbol)
+	    from_name = (char *) obstack_copy0 (&result->mem_pool,
+						now->val.str.startmb,
+						now->val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      from_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+	  to_name = NULL;
+
+	  state = 3;
+	  continue;
+
+	case 3:
+	  /* We have two possibilities: We can see an ellipsis or an
+	     encoding value.  */
+	  if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
+	      || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
+	      || nowtok == tok_ellipsis2_2)
+	    {
+	      ellipsis = nowtok;
+	      if (nowtok == tok_ellipsis4_2)
+		{
+		  step = 2;
+		  nowtok = tok_ellipsis4;
+		}
+	      else if (nowtok == tok_ellipsis2_2)
+		{
+		  step = 2;
+		  nowtok = tok_ellipsis2;
+		}
+	      state = 4;
+	      continue;
+	    }
+	  /* FALLTHROUGH */
+
+	case 5:
+	  if (nowtok != tok_charcode)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"CHARMAP", _("invalid encoding given"));
+
+	      lr_ignore_rest (cmfile, 0);
+
+	      state = 2;
+	      continue;
+	    }
+
+	  if (now->val.charcode.nbytes < result->mb_cur_min)
+	    lr_error (cmfile, _("too few bytes in character encoding"));
+	  else if (now->val.charcode.nbytes > result->mb_cur_max)
+	    lr_error (cmfile, _("too many bytes in character encoding"));
+	  else
+	    charmap_new_char (cmfile, result, now->val.charcode.nbytes,
+			      now->val.charcode.bytes, from_name, to_name,
+			      ellipsis != tok_ellipsis2, step);
+
+	  /* Ignore trailing comment silently.  */
+	  lr_ignore_rest (cmfile, 0);
+
+	  from_name = NULL;
+	  to_name = NULL;
+	  ellipsis = tok_none;
+	  step = 1;
+
+	  state = 2;
+	  continue;
+
+	case 4:
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"CHARMAP",
+			_("no symbolic name given for end of range"));
+
+	      lr_ignore_rest (cmfile, 0);
+	      continue;
+	    }
+
+	  /* Copy the to-name in a safe place.  */
+	  if (nowtok == tok_bsymbol)
+	    to_name = (char *) obstack_copy0 (&result->mem_pool,
+					      cmfile->token.val.str.startmb,
+					      cmfile->token.val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      to_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+
+	  state = 5;
+	  continue;
+
+	case 90:
+	  if (nowtok != expected_tok)
+	    lr_error (cmfile, _("\
+%1$s: definition does not end with `END %1$s'"), expected_str);
+
+	  lr_ignore_rest (cmfile, nowtok == expected_tok);
+	  state = 91;
+	  continue;
+
+	case 91:
+	  /* Waiting for WIDTH... */
+	  if (nowtok == tok_eol)
+	    /* Ignore empty lines.  */
+	    continue;
+
+	  if (nowtok == tok_width_default)
+	    {
+	      state = 92;
+	      continue;
+	    }
+
+	  if (nowtok == tok_width)
+	    {
+	      lr_ignore_rest (cmfile, 1);
+	      state = 93;
+	      continue;
+	    }
+
+	  if (nowtok == tok_width_variable)
+	    {
+	      lr_ignore_rest (cmfile, 1);
+	      state = 98;
+	      continue;
+	    }
+
+	  lr_error (cmfile, _("\
+only WIDTH definitions are allowed to follow the CHARMAP definition"));
+
+	  lr_ignore_rest (cmfile, 0);
+	  continue;
+
+	case 92:
+	  if (nowtok != tok_number)
+	    lr_error (cmfile, _("value for %s must be an integer"),
+		      "WIDTH_DEFAULT");
+	  else
+	    result->width_default = now->val.num;
+
+	  lr_ignore_rest (cmfile, nowtok == tok_number);
+
+	  state = 91;
+	  continue;
+
+	case 93:
+	  /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
+	     "%s...%s %d\n".  */
+	  if (nowtok == tok_eol)
+	    /* ignore empty lines.  */
+	    continue;
+
+	  if (nowtok == tok_end)
+	    {
+	      expected_tok = tok_width;
+	      expected_str = "WIDTH";
+	      state = 90;
+	      continue;
+	    }
+
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"WIDTH", _("no symbolic name given"));
+
+	      lr_ignore_rest (cmfile, 0);
+	      continue;
+	    }
+
+	  if (from_name != NULL)
+	    obstack_free (&result->mem_pool, from_name);
+
+	  if (nowtok == tok_bsymbol)
+	    from_name = (char *) obstack_copy0 (&result->mem_pool,
+						now->val.str.startmb,
+						now->val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      from_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+
+	  to_name = NULL;
+
+	  state = 94;
+	  continue;
+
+	case 94:
+	  if (nowtok == tok_ellipsis3)
+	    {
+	      state = 95;
+	      continue;
+	    }
+
+	case 96:
+	  if (nowtok != tok_number)
+	    lr_error (cmfile, _("value for %s must be an integer"),
+		      "WIDTH");
+	  else
+	    {
+	      /* Store width for chars.  */
+	      new_width (cmfile, result, from_name, to_name, now->val.num);
+
+	      from_name = NULL;
+	      to_name = NULL;
+	    }
+
+	  lr_ignore_rest (cmfile, nowtok == tok_number);
+
+	  state = 93;
+	  continue;
+
+	case 95:
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"WIDTH", _("no symbolic name given for end of range"));
+
+	      lr_ignore_rest (cmfile, 0);
+
+	      state = 93;
+	      continue;
+	    }
+
+	  if (nowtok == tok_bsymbol)
+	    to_name = (char *) obstack_copy0 (&result->mem_pool,
+					      now->val.str.startmb,
+					      now->val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      to_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+
+	  state = 96;
+	  continue;
+
+	case 98:
+	  /* We now expect `END WIDTH_VARIABLE' or lines of the format
+	     "%s\n" or "%s...%s\n".  */
+	  if (nowtok == tok_eol)
+	    /* ignore empty lines.  */
+	    continue;
+
+	  if (nowtok == tok_end)
+	    {
+	      expected_tok = tok_width_variable;
+	      expected_str = "WIDTH_VARIABLE";
+	      state = 90;
+	      continue;
+	    }
+
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"WIDTH_VARIABLE", _("no symbolic name given"));
+
+	      lr_ignore_rest (cmfile, 0);
+
+	      continue;
+	    }
+
+	  if (from_name != NULL)
+	    obstack_free (&result->mem_pool, from_name);
+
+	  if (nowtok == tok_bsymbol)
+	    from_name = (char *) obstack_copy0 (&result->mem_pool,
+						now->val.str.startmb,
+						now->val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      from_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+	  to_name = NULL;
+
+	  state = 99;
+	  continue;
+
+	case 99:
+	  if (nowtok == tok_ellipsis3)
+	    state = 100;
+
+	  /* Store info.  */
+	  from_name = NULL;
+
+	  /* Warn */
+	  state = 98;
+	  continue;
+
+	case 100:
+	  if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
+	    {
+	      lr_error (cmfile, _("syntax error in %s definition: %s"),
+			"WIDTH_VARIABLE",
+			_("no symbolic name given for end of range"));
+	      lr_ignore_rest (cmfile, 0);
+	      continue;
+	    }
+
+	  if (nowtok == tok_bsymbol)
+	    to_name = (char *) obstack_copy0 (&result->mem_pool,
+					      now->val.str.startmb,
+					      now->val.str.lenmb);
+	  else
+	    {
+	      obstack_printf (&result->mem_pool, "U%08X",
+			      cmfile->token.val.ucs4);
+	      obstack_1grow (&result->mem_pool, '\0');
+	      to_name = (char *) obstack_finish (&result->mem_pool);
+	    }
+
+	  /* XXX Enter value into table.  */
+
+	  lr_ignore_rest (cmfile, 1);
+
+	  state = 98;
+	  continue;
+
+	default:
+	  WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
+				  __FILE__));
+	  /* NOTREACHED */
+	}
+      break;
+    }
+
+  if (state != 91 && !be_quiet)
+    WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
+			    cmfile->fname));
+
+  lr_close (cmfile);
+
+  return result;
+}
+
+
+static void
+new_width (struct linereader *cmfile, struct charmap_t *result,
+	   const char *from, const char *to, unsigned long int width)
+{
+  struct charseq *from_val;
+  struct charseq *to_val;
+
+  from_val = charmap_find_value (result, from, strlen (from));
+  if (from_val == NULL)
+    {
+      lr_error (cmfile, _("unknown character `%s'"), from);
+      return;
+    }
+
+  if (to == NULL)
+    to_val = from_val;
+  else
+    {
+      to_val = charmap_find_value (result, to, strlen (to));
+      if (to_val == NULL)
+	{
+	  lr_error (cmfile, _("unknown character `%s'"), to);
+	  return;
+	}
+
+      /* Make sure the number of bytes for the end points of the range
+	 is correct.  */
+      if (from_val->nbytes != to_val->nbytes)
+	{
+	  lr_error (cmfile, _("\
+number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
+		    from_val->nbytes, to_val->nbytes);
+	  return;
+	}
+    }
+
+  if (result->nwidth_rules >= result->nwidth_rules_max)
+    {
+      size_t new_size = result->nwidth_rules + 32;
+      struct width_rule *new_rules =
+	(struct width_rule *) obstack_alloc (&result->mem_pool,
+					     (new_size
+					      * sizeof (struct width_rule)));
+
+      memcpy (new_rules, result->width_rules,
+	      result->nwidth_rules_max * sizeof (struct width_rule));
+
+      result->width_rules = new_rules;
+      result->nwidth_rules_max = new_size;
+    }
+
+  result->width_rules[result->nwidth_rules].from = from_val;
+  result->width_rules[result->nwidth_rules].to = to_val;
+  result->width_rules[result->nwidth_rules].width = (unsigned int) width;
+  ++result->nwidth_rules;
+}
+
+
+struct charseq *
+charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
+{
+  void *result;
+
+  return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
+	  < 0 ? NULL : (struct charseq *) result);
+}
+
+
+static void
+charmap_new_char (struct linereader *lr, struct charmap_t *cm,
+		  size_t nbytes, unsigned char *bytes,
+		  const char *from, const char *to,
+		  int decimal_ellipsis, int step)
+{
+  hash_table *ht = &cm->char_table;
+  hash_table *bt = &cm->byte_table;
+  struct obstack *ob = &cm->mem_pool;
+  char *from_end;
+  char *to_end;
+  const char *cp;
+  int prefix_len, len1, len2;
+  unsigned int from_nr, to_nr, cnt;
+  struct charseq *newp;
+
+  len1 = strlen (from);
+
+  if (to == NULL)
+    {
+      newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
+      newp->nbytes = nbytes;
+      memcpy (newp->bytes, bytes, nbytes);
+      newp->name = from;
+
+      newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
+      if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
+	{
+	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
+	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
+	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
+	     this character and we don't have to consult the repertoire
+	     map.
+
+	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
+	     and xxxxxxxx also give the code point in UCS4 but this must
+	     be in the private, i.e., unassigned, area.  This should be
+	     used for characters which do not (yet) have an equivalent
+	     in ISO 10646 and Unicode.  */
+	  char *endp;
+
+	  errno = 0;
+	  newp->ucs4 = strtoul (from + 1, &endp, 16);
+	  if (endp - from != len1
+	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
+	      || newp->ucs4 >= 0x80000000)
+	    /* This wasn't successful.  Signal this name cannot be a
+	       correct UCS value.  */
+	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
+	}
+
+      insert_entry (ht, from, len1, newp);
+      insert_entry (bt, newp->bytes, nbytes, newp);
+      /* Please note that it isn't a bug if a symbol is defined more
+	 than once.  All later definitions are simply discarded.  */
+      return;
+    }
+
+  /* We have a range: the names must have names with equal prefixes
+     and an equal number of digits, where the second number is greater
+     or equal than the first.  */
+  len2 = strlen (to);
+
+  if (len1 != len2)
+    {
+    illegal_range:
+      lr_error (lr, _("invalid names for character range"));
+      return;
+    }
+
+  cp = &from[len1 - 1];
+  if (decimal_ellipsis)
+    while (isdigit (*cp) && cp >= from)
+      --cp;
+  else
+    while (isxdigit (*cp) && cp >= from)
+      {
+	if (!isdigit (*cp) && !isupper (*cp))
+	  lr_error (lr, _("\
+hexadecimal range format should use only capital characters"));
+	--cp;
+      }
+
+  prefix_len = (cp - from) + 1;
+
+  if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
+    goto illegal_range;
+
+  errno = 0;
+  from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
+  if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
+      || ((to_nr = strtoul (&to[prefix_len], &to_end,
+			    decimal_ellipsis ? 10 : 16)) == UINT_MAX
+	  && errno == ERANGE)
+      || *to_end != '\0')
+    {
+      lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
+      return;
+    }
+
+  if (from_nr > to_nr)
+    {
+      lr_error (lr, _("upper limit in range is smaller than lower limit"));
+      return;
+    }
+
+  for (cnt = from_nr; cnt <= to_nr; cnt += step)
+    {
+      char *name_end;
+      obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
+		      prefix_len, from, len1 - prefix_len, cnt);
+      obstack_1grow (ob, '\0');
+      name_end = obstack_finish (ob);
+
+      newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
+      newp->nbytes = nbytes;
+      memcpy (newp->bytes, bytes, nbytes);
+      newp->name = name_end;
+
+      newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
+      if ((name_end[0] == 'U' || name_end[0] == 'P')
+	  && (len1 == 5 || len1 == 9))
+	{
+	  /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
+	     xxxx and xxxxxxxx are hexadecimal numbers.  In this case
+	     we use the value of xxxx or xxxxxxxx as the UCS4 value of
+	     this character and we don't have to consult the repertoire
+	     map.
+
+	     If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
+	     and xxxxxxxx also give the code point in UCS4 but this must
+	     be in the private, i.e., unassigned, area.  This should be
+	     used for characters which do not (yet) have an equivalent
+	     in ISO 10646 and Unicode.  */
+	  char *endp;
+
+	  errno = 0;
+	  newp->ucs4 = strtoul (name_end + 1, &endp, 16);
+	  if (endp - name_end != len1
+	      || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
+	      || newp->ucs4 >= 0x80000000)
+	    /* This wasn't successful.  Signal this name cannot be a
+	       correct UCS value.  */
+	    newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
+	}
+
+      insert_entry (ht, name_end, len1, newp);
+      insert_entry (bt, newp->bytes, nbytes, newp);
+      /* Please note we don't examine the return value since it is no error
+	 if we have two definitions for a symbol.  */
+
+      /* Increment the value in the byte sequence.  */
+      if (++bytes[nbytes - 1] == '\0')
+	{
+	  int b = nbytes - 2;
+
+	  do
+	    if (b < 0)
+	      {
+		lr_error (lr,
+			  _("resulting bytes for range not representable."));
+		return;
+	      }
+	  while (++bytes[b--] == 0);
+	}
+    }
+}
+
+
+struct charseq *
+charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
+		     size_t nbytes)
+{
+  void *result;
+
+  return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
+	  < 0 ? NULL : (struct charseq *) result);
+}