summary refs log tree commit diff
path: root/iconvdata
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@redhat.com> 2000-03-21 20:18:34 +0000
committer Ulrich Drepper <drepper@redhat.com> 2000-03-21 20:18:34 +0000
commit 8d617a716df0ed5fd9ea1c8e65dd8e59b168573e (patch)
tree 9bae23f82c38958c166ca3d65bb308f52ab2d14c /iconvdata
parent bc4831b956f96efd9f4185b739b8ce8f3fa4dae6 (diff)
download glibc-8d617a716df0ed5fd9ea1c8e65dd8e59b168573e.tar.gz
glibc-8d617a716df0ed5fd9ea1c8e65dd8e59b168573e.tar.xz
glibc-8d617a716df0ed5fd9ea1c8e65dd8e59b168573e.zip
Update.
	* iconv/gconv_builtin.c: Include <endian.h>.
	* iconv/gconv_builtin.h: Add UCS-BE aliases.
	Add UCS-4LE transformation.  Define UNICODEBIG and UNICODELITTLE
	according to current platform.
	* iconv/gconv_int.h: Declare __gconv_transform_ucs2reverse_internal,
	__gconv_transform_internal_ucs2reverse, and
	__gconv_transform_internal_ucs4le.
	* iconv/gconv_simple.c: Implement __gconv_transform_internal_ucs4le,
	__gconv_transform_ucs2reverse_internal and
	__gconv_transform_internal_ucs2reverse.
	* iconvdata/Makefile (modules): Add UNICODE.
	(distribute): Add unicode.c.
	* iconvdata/gconv-modules: Add definitions for UNICODE module.
	* iconvdata/unicode.c: New file.

	* iconvdata/utf-16.c: Rewrite code to emit BOM.  Correct code to
	determine byte order of input and convert accordingly.
Diffstat (limited to 'iconvdata')
-rw-r--r-- iconvdata/Makefile 4
-rw-r--r-- iconvdata/gconv-modules 5
-rw-r--r-- iconvdata/unicode.c 190
-rw-r--r-- iconvdata/utf-16.c 51
4 files changed, 229 insertions, 21 deletions
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index dee418e6a9..05ae2ac62e 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -45,7 +45,7 @@ modules	:= ISO8859-1 ISO8859-2 ISO8859-3 ISO8859-4 ISO8859-5		 \
 	   INIS-CYRILLIC ISO_6937-2 ISO_2033 ISO_5427 ISO_5427-EXT	 \
 	   ISO_5428 ISO_10367-BOX MAC-IS MAC-UK NATS-DANO NATS-SEFI	 \
 	   SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK	 \
-	   ISO-2022-CN libISOIR165 UTF-16
+	   ISO-2022-CN libISOIR165 UTF-16 UNICODE
 
 modules.so := $(addsuffix .so, $(modules))
 
@@ -118,7 +118,7 @@ distribute := gconv-modules extra-module.mk gap.awk gaptab.awk		    \
 	      macintosh.c mac-is.c mac-uk.c nats-dano.c nats-sefi.c sjis.c  \
 	      t.61.c uhc.c sami-ws2.c iso-ir-197.c tis-620.c koi8-u.c	    \
 	      isiri-3342.c gbgbk.c iso-2022-cn.c cns11643l2.h iso8859-16.c  \
-	      utf-16.c
+	      utf-16.c unicode.c
 
 # We build the transformation modules only when we build shared libs.
 ifeq (yes,$(build-shared))
diff --git a/iconvdata/gconv-modules b/iconvdata/gconv-modules
index 6a3c521c3c..3ca028501b 100644
--- a/iconvdata/gconv-modules
+++ b/iconvdata/gconv-modules
@@ -1184,3 +1184,8 @@ module	INTERNAL		UTF-16LE//		UTF-16		1
 #	from			to			module		cost
 module	UTF-16BE//		INTERNAL		UTF-16		1
 module	INTERNAL		UTF-16BE//		UTF-16		1
+
+#	from			to			module		cost
+alias	CSUNICODE//		UNICODE//
+module	UNICODE//		INTERNAL		UNICODE		1
+module	INTERNAL		UNICODE//		UNICODE		1
diff --git a/iconvdata/unicode.c b/iconvdata/unicode.c
new file mode 100644
index 0000000000..b29976b0c8
--- /dev/null
+++ b/iconvdata/unicode.c
@@ -0,0 +1,190 @@
+/* Conversion module for Unicode
+   Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not,
+   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+#include <byteswap.h>
+#include <gconv.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* This is the Byte Order Mark character (BOM), i.e. the value PREPARE_LOOP
+   sees when the mark is read as one 16-bit unit in the byte order of the
+   host.  It is also the value written in front of generated output.  */
+#define BOM	0xfeff
+/* And in the other endian format: the value the same 16-bit read yields
+   when the two bytes of the mark are stored the other way round.
+   PREPARE_LOOP enables byte swapping when the input starts with it.  */
+#define BOM_OE	0xfffe
+
+
+/* Definitions used in the body of the `gconv' function.
+
+   One unit of the external encoding occupies MIN_NEEDED_FROM (2) bytes,
+   one character of the internal encoding MIN_NEEDED_TO (4) bytes.
+
+   PREPARE_LOOP runs before the conversion loops.  It fetches the
+   direction from the step data and, while `__invocation_counter' is
+   still zero, deals with the byte order mark:
+
+   - When converting from UNICODE, __GCONV_EMPTY_INPUT is returned if
+     fewer than two input bytes are available.  Otherwise the first
+     16-bit unit is examined: BOM is skipped; BOM_OE is skipped as well
+     and additionally sets the `swap' flag in the step data.  Any other
+     value is left in place.
+
+   - When converting to UNICODE and `__internal_use' is not set,
+     __GCONV_FULL_OUTPUT is returned if there is no room for two bytes.
+     Otherwise BOM is written with a plain 16-bit store, i.e. in host
+     byte order, and the output pointer is advanced.
+
+   Finally the `swap' flag is read back from the step data.  It is
+   handed to both loop functions, together with `data', through
+   EXTRA_LOOP_ARGS.  */
+#define FROM_LOOP		from_unicode_loop
+#define TO_LOOP			to_unicode_loop
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		2
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		(dir == from_unicode)
+#define PREPARE_LOOP \
+  enum direction dir = ((struct unicode_data *) step->__data)->dir;	      \
+  int swap;								      \
+  if (FROM_DIRECTION)							      \
+    {									      \
+      if (data->__invocation_counter == 0)				      \
+	{								      \
+	  /* We have to find out which byte order the file is encoded in.  */ \
+	  if (inptr + 2 > inbufend)					      \
+	    return __GCONV_EMPTY_INPUT;					      \
+									      \
+	  if (*(uint16_t *) inptr == BOM)				      \
+	    /* Simply ignore the BOM character.  */			      \
+	    inptr += 2;							      \
+	  else if (*(uint16_t *) inptr == BOM_OE)			      \
+	    {								      \
+	      ((struct unicode_data *) step->__data)->swap = 1;		      \
+	      inptr += 2;						      \
+	    }								      \
+	}								      \
+    }									      \
+  else if (!data->__internal_use && data->__invocation_counter == 0)	      \
+    {									      \
+      /* Emit the Byte Order Mark.  */					      \
+      if (outbuf + 2 > outend)						      \
+	return __GCONV_FULL_OUTPUT;					      \
+									      \
+      *(uint16_t *) outbuf = BOM;					      \
+      outbuf += 2;							      \
+    }									      \
+  swap = ((struct unicode_data *) step->__data)->swap;
+#define EXTRA_LOOP_ARGS		, data, swap
+
+
+/* Direction of the transformation, followed by the per-step record in
+   which gconv_init stores it.  The record is allocated by gconv_init,
+   hung into the `__data' member of the step and freed by gconv_end.  */
+enum direction
+{
+  illegal_dir,		/* Placeholder; gconv_init never stores it.  */
+  to_unicode,		/* Internal encoding -> UNICODE.  */
+  from_unicode		/* UNICODE -> internal encoding.  */
+};
+
+struct unicode_data
+{
+  enum direction dir;	/* Selected once in gconv_init.  */
+  int swap;		/* Zero initially; set to 1 by PREPARE_LOOP when the
+			   input starts with BOM_OE.  */
+};
+
+
+/* Set up the conversion step STEP for one use of this module.
+
+   The direction is derived from the name of the source charset: if it
+   names UNICODE the step converts from UNICODE to the internal encoding,
+   otherwise from the internal encoding to UNICODE.
+
+   On success a `struct unicode_data' is allocated and stored in
+   STEP->__data (gconv_end frees it), the minimum and maximum number of
+   bytes needed on either side are filled in, the step is marked as not
+   stateful and __GCONV_OK is returned.  If the allocation fails
+   __GCONV_NOMEM is returned and STEP is not modified.  */
+int
+gconv_init (struct __gconv_step *step)
+{
+  struct unicode_data *new_data;
+  enum direction dir;
+
+  /* Determine which direction.  gconv-modules registers the charset as
+     `UNICODE//'.  NOTE(review): whether `__from_name' still carries the
+     trailing slashes cannot be seen from this file, so both spellings
+     are accepted; with only the bare name every step would end up as
+     to_unicode if the slashes are present.  */
+  if (__strcasecmp (step->__from_name, "UNICODE") == 0
+      || __strcasecmp (step->__from_name, "UNICODE//") == 0)
+    dir = from_unicode;
+  else
+    dir = to_unicode;
+
+  new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data));
+  if (new_data == NULL)
+    return __GCONV_NOMEM;
+
+  new_data->dir = dir;
+  /* Byte swapping is only switched on later, by PREPARE_LOOP, once the
+     start of the input has been inspected.  */
+  new_data->swap = 0;
+  step->__data = new_data;
+
+  if (dir == from_unicode)
+    {
+      step->__min_needed_from = MIN_NEEDED_FROM;
+      step->__max_needed_from = MIN_NEEDED_FROM;
+      step->__min_needed_to = MIN_NEEDED_TO;
+      step->__max_needed_to = MIN_NEEDED_TO;
+    }
+  else
+    {
+      /* Same figures with the roles of input and output exchanged.  */
+      step->__min_needed_from = MIN_NEEDED_TO;
+      step->__max_needed_from = MIN_NEEDED_TO;
+      step->__min_needed_to = MIN_NEEDED_FROM;
+      step->__max_needed_to = MIN_NEEDED_FROM;
+    }
+
+  step->__stateful = 0;
+
+  return __GCONV_OK;
+}
+
+
+/* Release the per-step record that gconv_init attached to the step DATA.  */
+void
+gconv_end (struct __gconv_step *data)
+{
+  free (data->__data);
+  /* Do not leave a dangling pointer behind in the step.  */
+  data->__data = NULL;
+}
+
+
+/* Convert from the internal (UCS4-like) format to UCS2.
+
+   Each step reads one 32-bit value and writes it as one 16-bit unit in
+   host byte order.  Values of 0x10000 and above do not fit into a single
+   unit, and values in the surrogate range 0xd800..0xdfff are not
+   characters; writing the latter out unchanged would produce units that
+   a reader takes for halves of a surrogate pair.  Both kinds stop the
+   loop with __GCONV_ILLEGAL_INPUT without consuming the value.  */
+#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
+#define LOOPFCT			TO_LOOP
+#define BODY \
+  {									      \
+    uint32_t c = *((uint32_t *) inptr);					      \
+									      \
+    if (c >= 0x10000 || (c >= 0xd800 && c < 0xe000))			      \
+      {									      \
+	result = __GCONV_ILLEGAL_INPUT;					      \
+	break;								      \
+      }									      \
+									      \
+    *((uint16_t *) outptr) = c;						      \
+									      \
+    outptr += 2;							      \
+    inptr += 4;								      \
+  }
+#define EXTRA_LOOP_DECLS \
+	, struct __gconv_step_data *step_data, int swap
+#include <iconv/loop.c>
+
+
+/* Convert from UCS2 to the internal (UCS4-like) format.
+
+   Each step reads one 16-bit unit, exchanges its two bytes if SWAP is
+   nonzero (PREPARE_LOOP sets it after seeing BOM_OE) and stores the
+   result widened to 32 bits.  Units in the surrogate range
+   0xd800..0xdfff do not denote characters in UCS2; they stop the loop
+   with __GCONV_ILLEGAL_INPUT instead of being passed on as characters
+   of the internal encoding.  */
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint16_t u1 = *(uint16_t *) inptr;					      \
+									      \
+    if (swap)								      \
+      u1 = bswap_16 (u1);						      \
+									      \
+    if (u1 >= 0xd800 && u1 < 0xe000)					      \
+      {									      \
+	result = __GCONV_ILLEGAL_INPUT;					      \
+	break;								      \
+      }									      \
+									      \
+    *((uint32_t *) outptr) = u1;					      \
+									      \
+    inptr += 2;								      \
+    outptr += 4;							      \
+  }
+#define EXTRA_LOOP_DECLS \
+	, struct __gconv_step_data *step_data, int swap
+#include <iconv/loop.c>
+
+
+/* Now define the toplevel functions.  */
+#include <iconv/skeleton.c>
diff --git a/iconvdata/utf-16.c b/iconvdata/utf-16.c
index 55540c9849..c7e677e376 100644
--- a/iconvdata/utf-16.c
+++ b/iconvdata/utf-16.c
@@ -27,6 +27,8 @@
 
 /* This is the Byte Order Mark character (BOM).  */
 #define BOM	0xfeff
+/* And in the other byte order.  */
+#define BOM_OE	0xfffe
 
 
 /* Definitions used in the body of the `gconv' function.  */
@@ -41,8 +43,27 @@
 #define PREPARE_LOOP \
   enum direction dir = ((struct utf16_data *) step->__data)->dir;	      \
   enum variant var = ((struct utf16_data *) step->__data)->var;		      \
-  if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use		      \
-      && data->__invocation_counter == 0)				      \
+  int swap = ((struct utf16_data *) step->__data)->swap;		      \
+  if (FROM_DIRECTION || var == UTF_16)					      \
+    {									      \
+      if (data->__invocation_counter == 0)				      \
+	{								      \
+	  /* We have to find out which byte order the file is encoded in.  */ \
+	  if (inptr + 2 > inbufend)					      \
+	    return __GCONV_EMPTY_INPUT;					      \
+									      \
+	  if (*(uint16_t *) inptr == BOM)				      \
+	    /* Simply ignore the BOM character.  */			      \
+	    inptr += 2;							      \
+	  else if (*(uint16_t *) inptr == BOM_OE)			      \
+	    {								      \
+	      ((struct utf16_data *) step->__data)->swap = 1;		      \
+	      inptr += 2;						      \
+	    }								      \
+	}								      \
+    }									      \
+  else if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use	      \
+	   && data->__invocation_counter == 0)				      \
     {									      \
       /* Emit the Byte Order Mark.  */					      \
       if (outbuf + 2 > outend)						      \
@@ -51,7 +72,7 @@
       *(uint16_t *) outbuf = BOM;					      \
       outbuf += 2;							      \
     }
-#define EXTRA_LOOP_ARGS		, var, data
+#define EXTRA_LOOP_ARGS		, var, data, swap
 
 
 /* Direction of the transformation.  */
@@ -74,6 +95,7 @@ struct utf16_data
 {
   enum direction dir;
   enum variant var;
+  int swap;
 };
 
 
@@ -127,6 +149,9 @@ gconv_init (struct __gconv_step *step)
 	{
 	  new_data->dir = dir;
 	  new_data->var = var;
+	  new_data->swap = ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)
+			    || (var == UTF_16BE
+				&& BYTE_ORDER == LITTLE_ENDIAN));
 	  step->__data = new_data;
 
 	  if (dir == from_utf16)
@@ -170,8 +195,7 @@ gconv_end (struct __gconv_step *data)
   {									      \
     uint32_t c = *((uint32_t *) inptr);					      \
 									      \
-    if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE)		      \
-        || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))		      \
+    if (swap)								      \
       {									      \
 	if (c >= 0x10000)						      \
 	  {								      \
@@ -225,7 +249,7 @@ gconv_end (struct __gconv_step *data)
     inptr += 4;								      \
   }
 #define EXTRA_LOOP_DECLS \
-	, enum variant var, struct __gconv_step_data *step_data
+	, enum variant var, struct __gconv_step_data *step_data, int swap
 #include <iconv/loop.c>
 
 
@@ -238,8 +262,7 @@ gconv_end (struct __gconv_step *data)
   {									      \
     uint16_t u1 = *(uint16_t *) inptr;					      \
 									      \
-    if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE) 		      \
-        || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))		      \
+    if (swap)								      \
       {									      \
 	u1 = bswap_16 (u1);						      \
 									      \
@@ -277,16 +300,6 @@ gconv_end (struct __gconv_step *data)
       }									      \
     else								      \
       {									      \
-	if (u1 == BOM && var == UTF_16 && !step_data->__internal_use	      \
-	    && step_data->__invocation_counter == 0 && inptr == *inptrp)      \
-	  {								      \
-	    /* This is the first word in the file and it is the BOM and	      \
-	       we are converting a file without specified byte order.	      \
-	       Simply sack the BOM.  */					      \
-	    inptr += 2;							      \
-	    continue;							      \
-	  }								      \
-									      \
 	if (u1 < 0xd800 || u1 > 0xdfff)					      \
 	  {								      \
 	    /* No surrogate.  */					      \
@@ -322,7 +335,7 @@ gconv_end (struct __gconv_step *data)
     outptr += 4;							      \
   }
 #define EXTRA_LOOP_DECLS \
-	, enum variant var, struct __gconv_step_data *step_data
+	, enum variant var, struct __gconv_step_data *step_data, int swap
 #include <iconv/loop.c>