about summary refs log tree commit diff
path: root/REORG.TODO/iconv/iconv_prog.c
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/iconv/iconv_prog.c')
-rw-r--r--REORG.TODO/iconv/iconv_prog.c803
1 files changed, 803 insertions, 0 deletions
diff --git a/REORG.TODO/iconv/iconv_prog.c b/REORG.TODO/iconv/iconv_prog.c
new file mode 100644
index 0000000000..1397d2e9bd
--- /dev/null
+++ b/REORG.TODO/iconv/iconv_prog.c
@@ -0,0 +1,803 @@
+/* Convert text in given files from the specified from-set to the to-set.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+#include <argp.h>
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <search.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libintl.h>
+#ifdef _POSIX_MAPPED_FILES
+# include <sys/mman.h>
+#endif
+#include <charmap.h>
+#include <gconv_int.h>
+#include "iconv_prog.h"
+#include "iconvconfig.h"
+
+/* Get libc version number.  */
+#include "../version.h"
+
+#define PACKAGE _libc_intl_domainname
+
+
+/* Name and version of program.  */
+static void print_version (FILE *stream, struct argp_state *state);
+void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
+
+#define OPT_VERBOSE	1000
+#define OPT_LIST	'l'
+
+/* Definitions of arguments for argp functions.  */
+static const struct argp_option options[] =
+{
+  { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
+  { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") },
+  { "to-code", 't', N_("NAME"), 0, N_("encoding for output") },
+  { NULL, 0, NULL, 0, N_("Information:") },
+  { "list", 'l', NULL, 0, N_("list all known coded character sets") },
+  { NULL, 0, NULL, 0, N_("Output control:") },
+  { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
+  { "output", 'o', N_("FILE"), 0, N_("output file") },
+  { "silent", 's', NULL, 0, N_("suppress warnings") },
+  { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
+  { NULL, 0, NULL, 0, NULL }
+};
+
+/* Short description of program.  */
+static const char doc[] = N_("\
+Convert encoding of given files from one encoding to another.");
+
+/* Strings for arguments in help texts.  */
+static const char args_doc[] = N_("[FILE...]");
+
+/* Prototype for option handler.  */
+static error_t parse_opt (int key, char *arg, struct argp_state *state);
+
+/* Function to print some extra text in the help message.  */
+static char *more_help (int key, const char *text, void *input);
+
+/* Data structure to communicate with argp functions.  */
+static struct argp argp =
+{
+  options, parse_opt, args_doc, doc, NULL, more_help
+};
+
+/* Code sets to convert from and to respectively.  An empty string as the
+   default causes the 'iconv_open' function to look up the charset of the
+   currently selected locale and use it.  */
+static const char *from_code = "";
+static const char *to_code = "";
+
+/* File to write output to.  If NULL write to stdout.  */
+static const char *output_file;
+
+/* Nonzero if verbose ouput is wanted.  */
+int verbose;
+
+/* Nonzero if list of all coded character sets is wanted.  */
+static int list;
+
+/* If nonzero omit invalid character from output.  */
+int omit_invalid;
+
+/* Prototypes for the functions doing the actual work.  */
+static int process_block (iconv_t cd, char *addr, size_t len, FILE **output,
+			  const char *output_file);
+static int process_fd (iconv_t cd, int fd, FILE **output,
+		       const char *output_file);
+static int process_file (iconv_t cd, FILE *input, FILE **output,
+			 const char *output_file);
+static void print_known_names (void) internal_function;
+
+
+int
+main (int argc, char *argv[])
+{
+  int status = EXIT_SUCCESS;
+  int remaining;
+  iconv_t cd;
+  const char *orig_to_code;
+  struct charmap_t *from_charmap = NULL;
+  struct charmap_t *to_charmap = NULL;
+
+  /* Set locale via LC_ALL.  */
+  setlocale (LC_ALL, "");
+
+  /* Set the text message domain.  */
+  textdomain (_libc_intl_domainname);
+
+  /* Parse and process arguments.  */
+  argp_parse (&argp, argc, argv, 0, &remaining, NULL);
+
+  /* List all coded character sets if wanted.  */
+  if (list)
+    {
+      print_known_names ();
+      exit (EXIT_SUCCESS);
+    }
+
+  /* If we have to ignore errors make sure we use the appropriate name for
+     the to-character-set.  */
+  orig_to_code = to_code;
+  if (omit_invalid)
+    {
+      const char *errhand = strchrnul (to_code, '/');
+      int nslash = 2;
+      char *newp;
+      char *cp;
+
+      if (*errhand == '/')
+	{
+	  --nslash;
+	  errhand = strchrnul (errhand + 1, '/');
+
+	  if (*errhand == '/')
+	    {
+	      --nslash;
+	      errhand = strchr (errhand, '\0');
+	    }
+	}
+
+      newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
+      cp = mempcpy (newp, to_code, errhand - to_code);
+      while (nslash-- > 0)
+	*cp++ = '/';
+      if (cp[-1] != '/')
+	*cp++ = ',';
+      memcpy (cp, "IGNORE", sizeof ("IGNORE"));
+
+      to_code = newp;
+    }
+
+  /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
+     can be file names of charmaps.  In this case iconv will have to read
+     those charmaps and use them to do the conversion.  But there are
+     holes in the specification.  There is nothing said that if -f is a
+     charmap filename that -t must be, too.  And vice versa.  There is
+     also no word about the symbolic names used.  What if they don't
+     match?  */
+  if (strchr (from_code, '/') != NULL)
+    /* The from-name might be a charmap file name.  Try reading the
+       file.  */
+    from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
+
+  if (strchr (orig_to_code, '/') != NULL)
+    /* The to-name might be a charmap file name.  Try reading the
+       file.  */
+    to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
+
+
+  /* At this point we have to handle two cases.  The first one is
+     where a charmap is used for the from- or to-charset, or both.  We
+     handle this special since it is very different from the sane way of
+     doing things.  The other case allows converting using the iconv()
+     function.  */
+  if (from_charmap != NULL || to_charmap != NULL)
+    /* Construct the conversion table and do the conversion.  */
+    status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
+				 argc, remaining, argv, output_file);
+  else
+    {
+      /* Let's see whether we have these coded character sets.  */
+      cd = iconv_open (to_code, from_code);
+      if (cd == (iconv_t) -1)
+	{
+	  if (errno == EINVAL)
+	    {
+	      /* Try to be nice with the user and tell her which of the
+		 two encoding names is wrong.  This is possible because
+		 all supported encodings can be converted from/to Unicode,
+		 in other words, because the graph of encodings is
+		 connected.  */
+	      bool from_wrong =
+		(iconv_open ("UTF-8", from_code) == (iconv_t) -1
+		 && errno == EINVAL);
+	      bool to_wrong =
+		(iconv_open (to_code, "UTF-8") == (iconv_t) -1
+		 && errno == EINVAL);
+	      const char *from_pretty =
+		(from_code[0] ? from_code : nl_langinfo (CODESET));
+	      const char *to_pretty =
+		(orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
+
+	      if (from_wrong)
+		{
+		  if (to_wrong)
+		    error (0, 0,
+			   _("\
+conversions from `%s' and to `%s' are not supported"),
+			   from_pretty, to_pretty);
+		  else
+		    error (0, 0,
+			   _("conversion from `%s' is not supported"),
+			   from_pretty);
+		}
+	      else
+		{
+		  if (to_wrong)
+		    error (0, 0,
+			   _("conversion to `%s' is not supported"),
+			   to_pretty);
+		  else
+		    error (0, 0,
+			   _("conversion from `%s' to `%s' is not supported"),
+			   from_pretty, to_pretty);
+		}
+
+	      argp_help (&argp, stderr, ARGP_HELP_SEE,
+			 program_invocation_short_name);
+	      exit (1);
+	    }
+	  else
+	    error (EXIT_FAILURE, errno,
+		   _("failed to start conversion processing"));
+	}
+
+      /* The output file.  Will be opened when we are ready to produce
+	 output.  */
+      FILE *output = NULL;
+
+      /* Now process the remaining files.  Write them to stdout or the file
+	 specified with the `-o' parameter.  If we have no file given as
+	 the parameter process all from stdin.  */
+      if (remaining == argc)
+	{
+	  if (process_file (cd, stdin, &output, output_file) != 0)
+	    status = EXIT_FAILURE;
+	}
+      else
+	do
+	  {
+#ifdef _POSIX_MAPPED_FILES
+	    struct stat64 st;
+	    char *addr;
+#endif
+	    int fd, ret;
+
+	    if (verbose)
+	      fprintf (stderr, "%s:\n", argv[remaining]);
+	    if (strcmp (argv[remaining], "-") == 0)
+	      fd = 0;
+	    else
+	      {
+		fd = open (argv[remaining], O_RDONLY);
+
+		if (fd == -1)
+		  {
+		    error (0, errno, _("cannot open input file `%s'"),
+			   argv[remaining]);
+		    status = EXIT_FAILURE;
+		    continue;
+		  }
+	      }
+
+#ifdef _POSIX_MAPPED_FILES
+	    /* We have possibilities for reading the input file.  First try
+	       to mmap() it since this will provide the fastest solution.  */
+	    if (fstat64 (fd, &st) == 0
+		&& ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
+				  fd, 0)) != MAP_FAILED))
+	      {
+		/* Yes, we can use mmap().  The descriptor is not needed
+		   anymore.  */
+		if (close (fd) != 0)
+		  error (EXIT_FAILURE, errno,
+			 _("error while closing input `%s'"),
+			 argv[remaining]);
+
+		ret = process_block (cd, addr, st.st_size, &output,
+				     output_file);
+
+		/* We don't need the input data anymore.  */
+		munmap ((void *) addr, st.st_size);
+
+		if (ret != 0)
+		  {
+		    status = EXIT_FAILURE;
+
+		    if (ret < 0)
+		      /* We cannot go on with producing output since it might
+			 lead to problem because the last output might leave
+			 the output stream in an undefined state.  */
+		      break;
+		  }
+	      }
+	    else
+#endif	/* _POSIX_MAPPED_FILES */
+	      {
+		/* Read the file in pieces.  */
+		ret = process_fd (cd, fd, &output, output_file);
+
+		/* Now close the file.  */
+		close (fd);
+
+		if (ret != 0)
+		  {
+		    /* Something went wrong.  */
+		    status = EXIT_FAILURE;
+
+		    if (ret < 0)
+		      /* We cannot go on with producing output since it might
+			 lead to problem because the last output might leave
+			 the output stream in an undefined state.  */
+		      break;
+		  }
+	      }
+	  }
+	while (++remaining < argc);
+
+      /* Close the output file now.  */
+      if (output != NULL && fclose (output))
+	error (EXIT_FAILURE, errno, _("error while closing output file"));
+    }
+
+  return status;
+}
+
+
+/* Handle program arguments.  */
+static error_t
+parse_opt (int key, char *arg, struct argp_state *state)
+{
+  switch (key)
+    {
+    case 'f':
+      from_code = arg;
+      break;
+    case 't':
+      to_code = arg;
+      break;
+    case 'o':
+      output_file = arg;
+      break;
+    case 's':
+      /* Nothing, for now at least.  We are not giving out any information
+	 about missing character or so.  */
+      break;
+    case 'c':
+      /* Omit invalid characters from output.  */
+      omit_invalid = 1;
+      break;
+    case OPT_VERBOSE:
+      verbose = 1;
+      break;
+    case OPT_LIST:
+      list = 1;
+      break;
+    default:
+      return ARGP_ERR_UNKNOWN;
+    }
+  return 0;
+}
+
+
+static char *
+more_help (int key, const char *text, void *input)
+{
+  char *tp = NULL;
+  switch (key)
+    {
+    case ARGP_KEY_HELP_EXTRA:
+      /* We print some extra information.  */
+      if (asprintf (&tp, gettext ("\
+For bug reporting instructions, please see:\n\
+%s.\n"), REPORT_BUGS_TO) < 0)
+	return NULL;
+      return tp;
+    default:
+      break;
+    }
+  return (char *) text;
+}
+
+
+/* Print the version information.  */
+static void
+print_version (FILE *stream, struct argp_state *state)
+{
+  fprintf (stream, "iconv %s%s\n", PKGVERSION, VERSION);
+  fprintf (stream, gettext ("\
+Copyright (C) %s Free Software Foundation, Inc.\n\
+This is free software; see the source for copying conditions.  There is NO\n\
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
+"), "2017");
+  fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
+}
+
+
+static int
+write_output (const char *outbuf, const char *outptr, FILE **output,
+	      const char *output_file)
+{
+  /* We have something to write out.  */
+  int errno_save = errno;
+
+  if (*output == NULL)
+    {
+      /* Determine output file.  */
+      if (output_file != NULL && strcmp (output_file, "-") != 0)
+	{
+	  *output = fopen (output_file, "w");
+	  if (*output == NULL)
+	    error (EXIT_FAILURE, errno, _("cannot open output file"));
+	}
+      else
+	*output = stdout;
+    }
+
+  if (fwrite (outbuf, 1, outptr - outbuf, *output) < (size_t) (outptr - outbuf)
+      || ferror (*output))
+    {
+      /* Error occurred while printing the result.  */
+      error (0, 0, _("\
+conversion stopped due to problem in writing the output"));
+      return -1;
+    }
+
+  errno = errno_save;
+
+  return 0;
+}
+
+
+static int
+process_block (iconv_t cd, char *addr, size_t len, FILE **output,
+	       const char *output_file)
+{
+#define OUTBUF_SIZE	32768
+  const char *start = addr;
+  char outbuf[OUTBUF_SIZE];
+  char *outptr;
+  size_t outlen;
+  size_t n;
+  int ret = 0;
+
+  while (len > 0)
+    {
+      outptr = outbuf;
+      outlen = OUTBUF_SIZE;
+      n = iconv (cd, &addr, &len, &outptr, &outlen);
+
+      if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
+	{
+	  ret = 1;
+	  if (len == 0)
+	    n = 0;
+	  else
+	    errno = E2BIG;
+	}
+
+      if (outptr != outbuf)
+	{
+	  ret = write_output (outbuf, outptr, output, output_file);
+	  if (ret != 0)
+	    break;
+	}
+
+      if (n != (size_t) -1)
+	{
+	  /* All the input test is processed.  For state-dependent
+	     character sets we have to flush the state now.  */
+	  outptr = outbuf;
+	  outlen = OUTBUF_SIZE;
+	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
+
+	  if (outptr != outbuf)
+	    {
+	      ret = write_output (outbuf, outptr, output, output_file);
+	      if (ret != 0)
+		break;
+	    }
+
+	  if (n != (size_t) -1)
+	    break;
+
+	  if (omit_invalid && errno == EILSEQ)
+	    {
+	      ret = 1;
+	      break;
+	    }
+	}
+
+      if (errno != E2BIG)
+	{
+	  /* iconv() ran into a problem.  */
+	  switch (errno)
+	    {
+	    case EILSEQ:
+	      if (! omit_invalid)
+		error (0, 0, _("illegal input sequence at position %ld"),
+		       (long int) (addr - start));
+	      break;
+	    case EINVAL:
+	      error (0, 0, _("\
+incomplete character or shift sequence at end of buffer"));
+	      break;
+	    case EBADF:
+	      error (0, 0, _("internal error (illegal descriptor)"));
+	      break;
+	    default:
+	      error (0, 0, _("unknown iconv() error %d"), errno);
+	      break;
+	    }
+
+	  return -1;
+	}
+    }
+
+  return ret;
+}
+
+
+static int
+process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
+{
+  /* we have a problem with reading from a desriptor since we must not
+     provide the iconv() function an incomplete character or shift
+     sequence at the end of the buffer.  Since we have to deal with
+     arbitrary encodings we must read the whole text in a buffer and
+     process it in one step.  */
+  static char *inbuf = NULL;
+  static size_t maxlen = 0;
+  char *inptr = NULL;
+  size_t actlen = 0;
+
+  while (actlen < maxlen)
+    {
+      ssize_t n = read (fd, inptr, maxlen - actlen);
+
+      if (n == 0)
+	/* No more text to read.  */
+	break;
+
+      if (n == -1)
+	{
+	  /* Error while reading.  */
+	  error (0, errno, _("error while reading the input"));
+	  return -1;
+	}
+
+      inptr += n;
+      actlen += n;
+    }
+
+  if (actlen == maxlen)
+    while (1)
+      {
+	ssize_t n;
+	char *new_inbuf;
+
+	/* Increase the buffer.  */
+	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
+	if (new_inbuf == NULL)
+	  {
+	    error (0, errno, _("unable to allocate buffer for input"));
+	    return -1;
+	  }
+	inbuf = new_inbuf;
+	maxlen += 32768;
+	inptr = inbuf + actlen;
+
+	do
+	  {
+	    n = read (fd, inptr, maxlen - actlen);
+
+	    if (n == 0)
+	      /* No more text to read.  */
+	      break;
+
+	    if (n == -1)
+	      {
+		/* Error while reading.  */
+		error (0, errno, _("error while reading the input"));
+		return -1;
+	      }
+
+	    inptr += n;
+	    actlen += n;
+	  }
+	while (actlen < maxlen);
+
+	if (n == 0)
+	  /* Break again so we leave both loops.  */
+	  break;
+      }
+
+  /* Now we have all the input in the buffer.  Process it in one run.  */
+  return process_block (cd, inbuf, actlen, output, output_file);
+}
+
+
+static int
+process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file)
+{
+  /* This should be safe since we use this function only for `stdin' and
+     we haven't read anything so far.  */
+  return process_fd (cd, fileno (input), output, output_file);
+}
+
+
+/* Print all known character sets/encodings.  */
+static void *printlist;
+static size_t column;
+static int not_first;
+
+static void
+insert_print_list (const void *nodep, VISIT value, int level)
+{
+  if (value == leaf || value == postorder)
+    {
+      const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
+      tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
+    }
+}
+
+static void
+do_print_human  (const void *nodep, VISIT value, int level)
+{
+  if (value == leaf || value == postorder)
+    {
+      const char *s = *(const char **) nodep;
+      size_t len = strlen (s);
+      size_t cnt;
+
+      while (len > 0 && s[len - 1] == '/')
+	--len;
+
+      for (cnt = 0; cnt < len; ++cnt)
+	if (isalnum (s[cnt]))
+	  break;
+      if (cnt == len)
+	return;
+
+      if (not_first)
+	{
+	  putchar (',');
+	  ++column;
+
+	  if (column > 2 && column + len > 77)
+	    {
+	      fputs ("\n  ", stdout);
+	      column = 2;
+	    }
+	  else
+	    {
+	      putchar (' ');
+	      ++column;
+	    }
+	}
+      else
+	not_first = 1;
+
+      fwrite (s, len, 1, stdout);
+      column += len;
+    }
+}
+
+static void
+do_print  (const void *nodep, VISIT value, int level)
+{
+  if (value == leaf || value == postorder)
+    {
+      const char *s = *(const char **) nodep;
+
+      puts (s);
+    }
+}
+
+static void
+internal_function
+add_known_names (struct gconv_module *node)
+{
+  if (node->left != NULL)
+    add_known_names (node->left);
+  if (node->right != NULL)
+    add_known_names (node->right);
+  do
+    {
+      if (strcmp (node->from_string, "INTERNAL") != 0)
+	tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp);
+      if (strcmp (node->to_string, "INTERNAL") != 0)
+	tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
+
+      node = node->same;
+    }
+  while (node != NULL);
+}
+
+
+static void
+insert_cache (void)
+{
+  const struct gconvcache_header *header;
+  const char *strtab;
+  const struct hash_entry *hashtab;
+  size_t cnt;
+
+  header = (const struct gconvcache_header *) __gconv_get_cache ();
+  strtab = (char *) header + header->string_offset;
+  hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
+
+  for (cnt = 0; cnt < header->hash_size; ++cnt)
+    if (hashtab[cnt].string_offset != 0)
+      {
+	const char *str = strtab + hashtab[cnt].string_offset;
+
+	if (strcmp (str, "INTERNAL") != 0)
+	  tsearch (str, &printlist, (__compar_fn_t) strverscmp);
+      }
+}
+
+
+static void
+internal_function
+print_known_names (void)
+{
+  iconv_t h;
+  void *cache;
+
+  /* We must initialize the internal databases first.  */
+  h = iconv_open ("L1", "L1");
+  iconv_close (h);
+
+  /* See whether we have a cache.  */
+  cache = __gconv_get_cache ();
+  if (cache != NULL)
+    /* Yep, use only this information.  */
+    insert_cache ();
+  else
+    {
+      struct gconv_module *modules;
+
+      /* No, then use the information read from the gconv-modules file.
+	 First add the aliases.  */
+      twalk (__gconv_get_alias_db (), insert_print_list);
+
+      /* Add the from- and to-names from the known modules.  */
+      modules = __gconv_get_modules_db ();
+      if (modules != NULL)
+	add_known_names (modules);
+    }
+
+  bool human_readable = isatty (fileno (stdout));
+
+  if (human_readable)
+    fputs (_("\
+The following list contains all the coded character sets known.  This does\n\
+not necessarily mean that all combinations of these names can be used for\n\
+the FROM and TO command line parameters.  One coded character set can be\n\
+listed with several different names (aliases).\n\n  "), stdout);
+
+  /* Now print the collected names.  */
+  column = 2;
+  twalk (printlist, human_readable ? do_print_human : do_print);
+
+  if (human_readable && column != 0)
+    puts ("");
+}