diff options
author | Florian Weimer <fweimer@redhat.com> | 2024-09-20 13:10:54 +0200 |
---|---|---|
committer | Florian Weimer <fweimer@redhat.com> | 2024-09-20 13:51:09 +0200 |
commit | fa1b0d5e9f6e0353e16339430770a7a8824c0468 (patch) | |
tree | 13ba422f04c0cce6332708ecc929307ed67b7056 /iconv | |
parent | 75819cdd29a193cc2db980878bec305905b22bbc (diff) | |
download | glibc-fa1b0d5e9f6e0353e16339430770a7a8824c0468.tar.gz glibc-fa1b0d5e9f6e0353e16339430770a7a8824c0468.tar.xz glibc-fa1b0d5e9f6e0353e16339430770a7a8824c0468.zip |
iconv: Input buffering for the iconv program (bug 6050)
Do not read the entire input file into memory.

Reviewed-by: DJ Delorie <dj@redhat.com>
Diffstat (limited to 'iconv')
-rw-r--r-- | iconv/iconv_prog.c | 184 | ||||
-rw-r--r-- | iconv/tst-iconv_prog-buffer.sh | 31 |
2 files changed, 109 insertions, 106 deletions
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c index dd4bc3a59a..a2f1d34e45 100644 --- a/iconv/iconv_prog.c +++ b/iconv/iconv_prog.c @@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024; /* Prototypes for the functions doing the actual work. */ static void prepare_output_file (char **argv); -static void close_output_file (int status); -static int process_block (iconv_t cd, char *addr, size_t len); +static void close_output_file (__gconv_t cd, int status); +static int process_block (iconv_t cd, char **addr, size_t *len, + off64_t file_offset, bool *incomplete); static int process_fd (iconv_t cd, int fd); static int process_file (iconv_t cd, FILE *input); static void print_known_names (void); @@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"), status = EXIT_FAILURE; /* Close the output file now. */ - close_output_file (status); + close_output_file (cd, status); } return status; @@ -599,7 +600,7 @@ flush_output (void) } static void -close_output_file (int status) +close_output_file (__gconv_t cd, int status) { /* Do not perform a flush if a temporary file or the in-memory buffer is in use and there was an error. It would clobber the @@ -608,10 +609,28 @@ close_output_file (int status) (output_using_temporary_file || output_fd < 0)) return; - /* The current_input_file_index variable is now larger than - last_overlapping_file_index, so the flush_output call switches + /* All the input text is processed. For state-dependent character + sets we have to flush the state now. + + The current_input_file_index variable is now larger than + last_overlapping_file_index, so the flush_output calls switch away from the temporary file. */ + size_t n = iconv (cd, NULL, NULL, + &output_buffer_current, &output_buffer_remaining); + if (n == (size_t) -1 && errno == E2BIG) + { + /* Try again if the state flush exceeded the buffer space. 
*/ + flush_output (); + n = iconv (cd, NULL, NULL, + &output_buffer_current, &output_buffer_remaining); + } + int saved_errno = errno; flush_output (); + if (n == (size_t) -1 && !omit_invalid) + { + errno = saved_errno; + output_error (); + } if (output_fd == STDOUT_FILENO) { @@ -625,51 +644,35 @@ close_output_file (int status) output_error (); } +/* CD is the iconv handle. Input processing starts at *ADDR, and + consumes upto *LEN bytes. *ADDR and *LEN are updated. FILE_OFFSET + is the file offset of the data initially at ADDR. *INCOMPLETE is + set to true if conversion stops due to an incomplete input + sequence. */ static int -process_block (iconv_t cd, char *addr, size_t len) +process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset, + bool *incomplete) { - const char *start = addr; + const char *start = *addr; size_t n; int ret = 0; - while (len > 0) + while (*len > 0) { - n = iconv (cd, &addr, &len, + n = iconv (cd, addr, len, &output_buffer_current, &output_buffer_remaining); if (n == (size_t) -1 && omit_invalid && errno == EILSEQ) { ret = 1; - if (len == 0) + if (*len == 0) n = 0; else errno = E2BIG; } if (n != (size_t) -1) - { - /* All the input test is processed. For state-dependent - character sets we have to flush the state now. */ - n = iconv (cd, NULL, NULL, - &output_buffer_current, &output_buffer_remaining); - if (n == (size_t) -1 && errno == E2BIG) - { - /* Try again if the state flush exceeded the buffer space. */ - flush_output (); - n = iconv (cd, NULL, NULL, - &output_buffer_current, &output_buffer_remaining); - } - bool errno_is_EILSEQ = errno == EILSEQ; - - if (n != (size_t) -1) - break; - - if (omit_invalid && errno_is_EILSEQ) - { - ret = 1; - break; - } - } + break; if (errno == E2BIG) flush_output (); @@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len) { case EILSEQ: if (! 
omit_invalid) - error (0, 0, _("illegal input sequence at position %ld"), - (long int) (addr - start)); + error (0, 0, _("illegal input sequence at position %lld"), + (long long int) (file_offset + (*addr - start))); break; case EINVAL: - error (0, 0, _("\ -incomplete character or shift sequence at end of buffer")); - break; + *incomplete = true; + return ret; case EBADF: error (0, 0, _("internal error (illegal descriptor)")); break; @@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer")); static int process_fd (iconv_t cd, int fd) { - /* we have a problem with reading from a descriptor since we must not - provide the iconv() function an incomplete character or shift - sequence at the end of the buffer. Since we have to deal with - arbitrary encodings we must read the whole text in a buffer and - process it in one step. */ - static char *inbuf = NULL; - static size_t maxlen = 0; - char *inptr = inbuf; - size_t actlen = 0; - - while (actlen < maxlen) + char inbuf[BUFSIZ]; + char *inbuf_end = inbuf + sizeof (inbuf); + size_t inbuf_used = 0; + off64_t file_offset = 0; + int status = 0; + bool incomplete = false; + + while (true) { - ssize_t n = read (fd, inptr, maxlen - actlen); - - if (n == 0) - /* No more text to read. */ - break; - - if (n == -1) + char *p = inbuf + inbuf_used; + ssize_t read_ret = read (fd, p, inbuf_end - p); + if (read_ret == 0) + { + /* On EOF, check if the previous iconv invocation saw an + incomplete sequence. */ + if (incomplete) + { + error (0, 0, _("\ +incomplete character or shift sequence at end of buffer")); + return 1; + } + return 0; + } + if (read_ret < 0) { - /* Error while reading. 
*/ error (0, errno, _("error while reading the input")); return -1; } - - inptr += n; - actlen += n; + inbuf_used += read_ret; + incomplete = false; + p = inbuf; + int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete); + if (ret != 0) + { + status = ret; + if (ret < 0) + break; + } + /* The next loop iteration consumes the leftover bytes. */ + memmove (inbuf, p, inbuf_used); + file_offset += read_ret - inbuf_used; } - - if (actlen == maxlen) - while (1) - { - ssize_t n; - char *new_inbuf; - - /* Increase the buffer. */ - new_inbuf = (char *) realloc (inbuf, maxlen + 32768); - if (new_inbuf == NULL) - { - error (0, errno, _("unable to allocate buffer for input")); - return -1; - } - inbuf = new_inbuf; - maxlen += 32768; - inptr = inbuf + actlen; - - do - { - n = read (fd, inptr, maxlen - actlen); - - if (n == 0) - /* No more text to read. */ - break; - - if (n == -1) - { - /* Error while reading. */ - error (0, errno, _("error while reading the input")); - return -1; - } - - inptr += n; - actlen += n; - } - while (actlen < maxlen); - - if (n == 0) - /* Break again so we leave both loops. */ - break; - } - - /* Now we have all the input in the buffer. Process it in one run. */ - return process_block (cd, inbuf, actlen); + return status; } diff --git a/iconv/tst-iconv_prog-buffer.sh b/iconv/tst-iconv_prog-buffer.sh index a9c3729d94..23098ac56a 100644 --- a/iconv/tst-iconv_prog-buffer.sh +++ b/iconv/tst-iconv_prog-buffer.sh @@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template" : > "$tmp/empty" printf '\xff' > "$tmp/0xff" +# Length should be a prime number, to help with buffer alignment testing. +printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence" + # Double all files to produce larger buffers. 
for p in "$tmp"/* ; do i=0 @@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def" run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc" expect_files xy abc zt +# NB: Extra iconv args are ignored after this point. Actual +# multi-byte conversion does not work with tiny buffers. +iconv_args="-f UTF-8 -t ASCII" + +printf 'x\n\xc3' > "$tmp/incomplete" +expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete" +check_out <<EOF +x +EOF + +# Test buffering behavior if the buffer ends with an incomplete +# multi-byte sequence. +prefix="" +prefix_length=0 +while test $prefix_length -lt 12; do + echo "info: testing prefix length $prefix_length" 2>&$logfd + printf "%s" "$prefix" > "$tmp/prefix" + cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp" + iconv_args="-f UTF-8 -t UCS-4" + run_iconv -o "$tmp/out1" "$tmp/tmp" + iconv_args="-f UCS-4 -t UTF-8" + run_iconv -o "$tmp/out" "$tmp/out1" + expect_files prefix utf8-sequence + + prefix="$prefix@" + prefix_length=$(($prefix_length + 1)) +done + if $failure ; then exit 1 fi |