diff options
Diffstat (limited to 'REORG.TODO/iconv/iconv_prog.c')
-rw-r--r-- | REORG.TODO/iconv/iconv_prog.c | 803 |
1 files changed, 803 insertions, 0 deletions
diff --git a/REORG.TODO/iconv/iconv_prog.c b/REORG.TODO/iconv/iconv_prog.c new file mode 100644 index 0000000000..1397d2e9bd --- /dev/null +++ b/REORG.TODO/iconv/iconv_prog.c @@ -0,0 +1,803 @@ +/* Convert text in given files from the specified from-set to the to-set. + Copyright (C) 1998-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. */ + +#include <argp.h> +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <iconv.h> +#include <langinfo.h> +#include <locale.h> +#include <search.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <libintl.h> +#ifdef _POSIX_MAPPED_FILES +# include <sys/mman.h> +#endif +#include <charmap.h> +#include <gconv_int.h> +#include "iconv_prog.h" +#include "iconvconfig.h" + +/* Get libc version number. */ +#include "../version.h" + +#define PACKAGE _libc_intl_domainname + + +/* Name and version of program. */ +static void print_version (FILE *stream, struct argp_state *state); +void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version; + +#define OPT_VERBOSE 1000 +#define OPT_LIST 'l' + +/* Definitions of arguments for argp functions. */ +static const struct argp_option options[] = +{ + { NULL, 0, NULL, 0, N_("Input/Output format specification:") }, + { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") }, + { "to-code", 't', N_("NAME"), 0, N_("encoding for output") }, + { NULL, 0, NULL, 0, N_("Information:") }, + { "list", 'l', NULL, 0, N_("list all known coded character sets") }, + { NULL, 0, NULL, 0, N_("Output control:") }, + { NULL, 'c', NULL, 0, N_("omit invalid characters from output") }, + { "output", 'o', N_("FILE"), 0, N_("output file") }, + { "silent", 's', NULL, 0, N_("suppress warnings") }, + { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") }, + { NULL, 0, NULL, 0, NULL } +}; + +/* Short description of program. */ +static const char doc[] = N_("\ +Convert encoding of given files from one encoding to another."); + +/* Strings for arguments in help texts. */ +static const char args_doc[] = N_("[FILE...]"); + +/* Prototype for option handler. */ +static error_t parse_opt (int key, char *arg, struct argp_state *state); + +/* Function to print some extra text in the help message. */ +static char *more_help (int key, const char *text, void *input); + +/* Data structure to communicate with argp functions. */ +static struct argp argp = +{ + options, parse_opt, args_doc, doc, NULL, more_help +}; + +/* Code sets to convert from and to respectively. An empty string as the + default causes the 'iconv_open' function to look up the charset of the + currently selected locale and use it. */ +static const char *from_code = ""; +static const char *to_code = ""; + +/* File to write output to. If NULL write to stdout. */ +static const char *output_file; + +/* Nonzero if verbose ouput is wanted. */ +int verbose; + +/* Nonzero if list of all coded character sets is wanted. */ +static int list; + +/* If nonzero omit invalid character from output. */ +int omit_invalid; + +/* Prototypes for the functions doing the actual work. */ +static int process_block (iconv_t cd, char *addr, size_t len, FILE **output, + const char *output_file); +static int process_fd (iconv_t cd, int fd, FILE **output, + const char *output_file); +static int process_file (iconv_t cd, FILE *input, FILE **output, + const char *output_file); +static void print_known_names (void) internal_function; + + +int +main (int argc, char *argv[]) +{ + int status = EXIT_SUCCESS; + int remaining; + iconv_t cd; + const char *orig_to_code; + struct charmap_t *from_charmap = NULL; + struct charmap_t *to_charmap = NULL; + + /* Set locale via LC_ALL. */ + setlocale (LC_ALL, ""); + + /* Set the text message domain. */ + textdomain (_libc_intl_domainname); + + /* Parse and process arguments. */ + argp_parse (&argp, argc, argv, 0, &remaining, NULL); + + /* List all coded character sets if wanted. */ + if (list) + { + print_known_names (); + exit (EXIT_SUCCESS); + } + + /* If we have to ignore errors make sure we use the appropriate name for + the to-character-set. */ + orig_to_code = to_code; + if (omit_invalid) + { + const char *errhand = strchrnul (to_code, '/'); + int nslash = 2; + char *newp; + char *cp; + + if (*errhand == '/') + { + --nslash; + errhand = strchrnul (errhand + 1, '/'); + + if (*errhand == '/') + { + --nslash; + errhand = strchr (errhand, '\0'); + } + } + + newp = (char *) alloca (errhand - to_code + nslash + 7 + 1); + cp = mempcpy (newp, to_code, errhand - to_code); + while (nslash-- > 0) + *cp++ = '/'; + if (cp[-1] != '/') + *cp++ = ','; + memcpy (cp, "IGNORE", sizeof ("IGNORE")); + + to_code = newp; + } + + /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f + can be file names of charmaps. In this case iconv will have to read + those charmaps and use them to do the conversion. But there are + holes in the specification. There is nothing said that if -f is a + charmap filename that -t must be, too. And vice versa. There is + also no word about the symbolic names used. What if they don't + match? */ + if (strchr (from_code, '/') != NULL) + /* The from-name might be a charmap file name. Try reading the + file. */ + from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0); + + if (strchr (orig_to_code, '/') != NULL) + /* The to-name might be a charmap file name. Try reading the + file. */ + to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0); + + + /* At this point we have to handle two cases. The first one is + where a charmap is used for the from- or to-charset, or both. We + handle this special since it is very different from the sane way of + doing things. The other case allows converting using the iconv() + function. */ + if (from_charmap != NULL || to_charmap != NULL) + /* Construct the conversion table and do the conversion. */ + status = charmap_conversion (from_code, from_charmap, to_code, to_charmap, + argc, remaining, argv, output_file); + else + { + /* Let's see whether we have these coded character sets. */ + cd = iconv_open (to_code, from_code); + if (cd == (iconv_t) -1) + { + if (errno == EINVAL) + { + /* Try to be nice with the user and tell her which of the + two encoding names is wrong. This is possible because + all supported encodings can be converted from/to Unicode, + in other words, because the graph of encodings is + connected. */ + bool from_wrong = + (iconv_open ("UTF-8", from_code) == (iconv_t) -1 + && errno == EINVAL); + bool to_wrong = + (iconv_open (to_code, "UTF-8") == (iconv_t) -1 + && errno == EINVAL); + const char *from_pretty = + (from_code[0] ? from_code : nl_langinfo (CODESET)); + const char *to_pretty = + (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET)); + + if (from_wrong) + { + if (to_wrong) + error (0, 0, + _("\ +conversions from `%s' and to `%s' are not supported"), + from_pretty, to_pretty); + else + error (0, 0, + _("conversion from `%s' is not supported"), + from_pretty); + } + else + { + if (to_wrong) + error (0, 0, + _("conversion to `%s' is not supported"), + to_pretty); + else + error (0, 0, + _("conversion from `%s' to `%s' is not supported"), + from_pretty, to_pretty); + } + + argp_help (&argp, stderr, ARGP_HELP_SEE, + program_invocation_short_name); + exit (1); + } + else + error (EXIT_FAILURE, errno, + _("failed to start conversion processing")); + } + + /* The output file. Will be opened when we are ready to produce + output. */ + FILE *output = NULL; + + /* Now process the remaining files. Write them to stdout or the file + specified with the `-o' parameter. If we have no file given as + the parameter process all from stdin. */ + if (remaining == argc) + { + if (process_file (cd, stdin, &output, output_file) != 0) + status = EXIT_FAILURE; + } + else + do + { +#ifdef _POSIX_MAPPED_FILES + struct stat64 st; + char *addr; +#endif + int fd, ret; + + if (verbose) + fprintf (stderr, "%s:\n", argv[remaining]); + if (strcmp (argv[remaining], "-") == 0) + fd = 0; + else + { + fd = open (argv[remaining], O_RDONLY); + + if (fd == -1) + { + error (0, errno, _("cannot open input file `%s'"), + argv[remaining]); + status = EXIT_FAILURE; + continue; + } + } + +#ifdef _POSIX_MAPPED_FILES + /* We have possibilities for reading the input file. First try + to mmap() it since this will provide the fastest solution. */ + if (fstat64 (fd, &st) == 0 + && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, + fd, 0)) != MAP_FAILED)) + { + /* Yes, we can use mmap(). The descriptor is not needed + anymore. */ + if (close (fd) != 0) + error (EXIT_FAILURE, errno, + _("error while closing input `%s'"), + argv[remaining]); + + ret = process_block (cd, addr, st.st_size, &output, + output_file); + + /* We don't need the input data anymore. */ + munmap ((void *) addr, st.st_size); + + if (ret != 0) + { + status = EXIT_FAILURE; + + if (ret < 0) + /* We cannot go on with producing output since it might + lead to problem because the last output might leave + the output stream in an undefined state. */ + break; + } + } + else +#endif /* _POSIX_MAPPED_FILES */ + { + /* Read the file in pieces. */ + ret = process_fd (cd, fd, &output, output_file); + + /* Now close the file. */ + close (fd); + + if (ret != 0) + { + /* Something went wrong. */ + status = EXIT_FAILURE; + + if (ret < 0) + /* We cannot go on with producing output since it might + lead to problem because the last output might leave + the output stream in an undefined state. */ + break; + } + } + } + while (++remaining < argc); + + /* Close the output file now. */ + if (output != NULL && fclose (output)) + error (EXIT_FAILURE, errno, _("error while closing output file")); + } + + return status; +} + + +/* Handle program arguments. */ +static error_t +parse_opt (int key, char *arg, struct argp_state *state) +{ + switch (key) + { + case 'f': + from_code = arg; + break; + case 't': + to_code = arg; + break; + case 'o': + output_file = arg; + break; + case 's': + /* Nothing, for now at least. We are not giving out any information + about missing character or so. */ + break; + case 'c': + /* Omit invalid characters from output. */ + omit_invalid = 1; + break; + case OPT_VERBOSE: + verbose = 1; + break; + case OPT_LIST: + list = 1; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + + +static char * +more_help (int key, const char *text, void *input) +{ + char *tp = NULL; + switch (key) + { + case ARGP_KEY_HELP_EXTRA: + /* We print some extra information. */ + if (asprintf (&tp, gettext ("\ +For bug reporting instructions, please see:\n\ +%s.\n"), REPORT_BUGS_TO) < 0) + return NULL; + return tp; + default: + break; + } + return (char *) text; +} + + +/* Print the version information. */ +static void +print_version (FILE *stream, struct argp_state *state) +{ + fprintf (stream, "iconv %s%s\n", PKGVERSION, VERSION); + fprintf (stream, gettext ("\ +Copyright (C) %s Free Software Foundation, Inc.\n\ +This is free software; see the source for copying conditions. There is NO\n\ +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\ +"), "2017"); + fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper"); +} + + +static int +write_output (const char *outbuf, const char *outptr, FILE **output, + const char *output_file) +{ + /* We have something to write out. */ + int errno_save = errno; + + if (*output == NULL) + { + /* Determine output file. */ + if (output_file != NULL && strcmp (output_file, "-") != 0) + { + *output = fopen (output_file, "w"); + if (*output == NULL) + error (EXIT_FAILURE, errno, _("cannot open output file")); + } + else + *output = stdout; + } + + if (fwrite (outbuf, 1, outptr - outbuf, *output) < (size_t) (outptr - outbuf) + || ferror (*output)) + { + /* Error occurred while printing the result. */ + error (0, 0, _("\ +conversion stopped due to problem in writing the output")); + return -1; + } + + errno = errno_save; + + return 0; +} + + +static int +process_block (iconv_t cd, char *addr, size_t len, FILE **output, + const char *output_file) +{ +#define OUTBUF_SIZE 32768 + const char *start = addr; + char outbuf[OUTBUF_SIZE]; + char *outptr; + size_t outlen; + size_t n; + int ret = 0; + + while (len > 0) + { + outptr = outbuf; + outlen = OUTBUF_SIZE; + n = iconv (cd, &addr, &len, &outptr, &outlen); + + if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) + { + ret = 1; + if (len == 0) + n = 0; + else + errno = E2BIG; + } + + if (outptr != outbuf) + { + ret = write_output (outbuf, outptr, output, output_file); + if (ret != 0) + break; + } + + if (n != (size_t) -1) + { + /* All the input test is processed. For state-dependent + character sets we have to flush the state now. */ + outptr = outbuf; + outlen = OUTBUF_SIZE; + n = iconv (cd, NULL, NULL, &outptr, &outlen); + + if (outptr != outbuf) + { + ret = write_output (outbuf, outptr, output, output_file); + if (ret != 0) + break; + } + + if (n != (size_t) -1) + break; + + if (omit_invalid && errno == EILSEQ) + { + ret = 1; + break; + } + } + + if (errno != E2BIG) + { + /* iconv() ran into a problem. */ + switch (errno) + { + case EILSEQ: + if (! omit_invalid) + error (0, 0, _("illegal input sequence at position %ld"), + (long int) (addr - start)); + break; + case EINVAL: + error (0, 0, _("\ +incomplete character or shift sequence at end of buffer")); + break; + case EBADF: + error (0, 0, _("internal error (illegal descriptor)")); + break; + default: + error (0, 0, _("unknown iconv() error %d"), errno); + break; + } + + return -1; + } + } + + return ret; +} + + +static int +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file) +{ + /* we have a problem with reading from a desriptor since we must not + provide the iconv() function an incomplete character or shift + sequence at the end of the buffer. Since we have to deal with + arbitrary encodings we must read the whole text in a buffer and + process it in one step. */ + static char *inbuf = NULL; + static size_t maxlen = 0; + char *inptr = NULL; + size_t actlen = 0; + + while (actlen < maxlen) + { + ssize_t n = read (fd, inptr, maxlen - actlen); + + if (n == 0) + /* No more text to read. */ + break; + + if (n == -1) + { + /* Error while reading. */ + error (0, errno, _("error while reading the input")); + return -1; + } + + inptr += n; + actlen += n; + } + + if (actlen == maxlen) + while (1) + { + ssize_t n; + char *new_inbuf; + + /* Increase the buffer. */ + new_inbuf = (char *) realloc (inbuf, maxlen + 32768); + if (new_inbuf == NULL) + { + error (0, errno, _("unable to allocate buffer for input")); + return -1; + } + inbuf = new_inbuf; + maxlen += 32768; + inptr = inbuf + actlen; + + do + { + n = read (fd, inptr, maxlen - actlen); + + if (n == 0) + /* No more text to read. */ + break; + + if (n == -1) + { + /* Error while reading. */ + error (0, errno, _("error while reading the input")); + return -1; + } + + inptr += n; + actlen += n; + } + while (actlen < maxlen); + + if (n == 0) + /* Break again so we leave both loops. */ + break; + } + + /* Now we have all the input in the buffer. Process it in one run. */ + return process_block (cd, inbuf, actlen, output, output_file); +} + + +static int +process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file) +{ + /* This should be safe since we use this function only for `stdin' and + we haven't read anything so far. */ + return process_fd (cd, fileno (input), output, output_file); +} + + +/* Print all known character sets/encodings. */ +static void *printlist; +static size_t column; +static int not_first; + +static void +insert_print_list (const void *nodep, VISIT value, int level) +{ + if (value == leaf || value == postorder) + { + const struct gconv_alias *s = *(const struct gconv_alias **) nodep; + tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp); + } +} + +static void +do_print_human (const void *nodep, VISIT value, int level) +{ + if (value == leaf || value == postorder) + { + const char *s = *(const char **) nodep; + size_t len = strlen (s); + size_t cnt; + + while (len > 0 && s[len - 1] == '/') + --len; + + for (cnt = 0; cnt < len; ++cnt) + if (isalnum (s[cnt])) + break; + if (cnt == len) + return; + + if (not_first) + { + putchar (','); + ++column; + + if (column > 2 && column + len > 77) + { + fputs ("\n ", stdout); + column = 2; + } + else + { + putchar (' '); + ++column; + } + } + else + not_first = 1; + + fwrite (s, len, 1, stdout); + column += len; + } +} + +static void +do_print (const void *nodep, VISIT value, int level) +{ + if (value == leaf || value == postorder) + { + const char *s = *(const char **) nodep; + + puts (s); + } +} + +static void +internal_function +add_known_names (struct gconv_module *node) +{ + if (node->left != NULL) + add_known_names (node->left); + if (node->right != NULL) + add_known_names (node->right); + do + { + if (strcmp (node->from_string, "INTERNAL") != 0) + tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp); + if (strcmp (node->to_string, "INTERNAL") != 0) + tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp); + + node = node->same; + } + while (node != NULL); +} + + +static void +insert_cache (void) +{ + const struct gconvcache_header *header; + const char *strtab; + const struct hash_entry *hashtab; + size_t cnt; + + header = (const struct gconvcache_header *) __gconv_get_cache (); + strtab = (char *) header + header->string_offset; + hashtab = (struct hash_entry *) ((char *) header + header->hash_offset); + + for (cnt = 0; cnt < header->hash_size; ++cnt) + if (hashtab[cnt].string_offset != 0) + { + const char *str = strtab + hashtab[cnt].string_offset; + + if (strcmp (str, "INTERNAL") != 0) + tsearch (str, &printlist, (__compar_fn_t) strverscmp); + } +} + + +static void +internal_function +print_known_names (void) +{ + iconv_t h; + void *cache; + + /* We must initialize the internal databases first. */ + h = iconv_open ("L1", "L1"); + iconv_close (h); + + /* See whether we have a cache. */ + cache = __gconv_get_cache (); + if (cache != NULL) + /* Yep, use only this information. */ + insert_cache (); + else + { + struct gconv_module *modules; + + /* No, then use the information read from the gconv-modules file. + First add the aliases. */ + twalk (__gconv_get_alias_db (), insert_print_list); + + /* Add the from- and to-names from the known modules. */ + modules = __gconv_get_modules_db (); + if (modules != NULL) + add_known_names (modules); + } + + bool human_readable = isatty (fileno (stdout)); + + if (human_readable) + fputs (_("\ +The following list contains all the coded character sets known. This does\n\ +not necessarily mean that all combinations of these names can be used for\n\ +the FROM and TO command line parameters. One coded character set can be\n\ +listed with several different names (aliases).\n\n "), stdout); + + /* Now print the collected names. */ + column = 2; + twalk (printlist, human_readable ? do_print_human : do_print); + + if (human_readable && column != 0) + puts (""); +} |