Update transliteration support to Unicode 7.0.0.

The transliteration files are now autogenerated from upstream Unicode data.
author: Carlos O'Donell <carlos@systemhalted.org> 2015-12-09 22:27:41 -0500
committer: Carlos O'Donell <carlos@systemhalted.org> 2015-12-09 22:52:13 -0500
commit: dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch)
tree: a2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen
parent: 40b59cace2fd5e5aa04367073a54efc995059376 (diff)
download: glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.gz
glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.xz
glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.zip
11 files changed, 2112 insertions, 665 deletions
diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile
index 166ee310d8..920bf0eec8 100644
--- a/localedata/unicode-gen/Makefile
+++ b/localedata/unicode-gen/Makefile
@@ -41,7 +41,7 @@ PYTHON3 = python3
 WGET = wget
 
 DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
-GENERATED = i18n UTF-8
+GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
 REPORTS = i18n-report UTF-8-report
 
 all: $(GENERATED)
@@ -51,6 +51,12 @@ check: check-i18n check-UTF-8
 install:
 	cp -p i18n ../locales/i18n
 	cp -p UTF-8 ../charmaps/UTF-8
+	cp -p translit_combining ../locales/translit_combining
+	cp -p translit_compat ../locales/translit_compat
+	cp -p translit_circle ../locales/translit_circle
+	cp -p translit_cjk_compat ../locales/translit_cjk_compat
+	cp -p translit_font ../locales/translit_font
+	cp -p translit_fraction ../locales/translit_fraction
 
 clean: mostlyclean
 	-rm -rf __pycache__
@@ -82,13 +88,43 @@ UTF-8: utf8_gen.py
 
 UTF-8-report: UTF-8 ../charmaps/UTF-8
 UTF-8-report: utf8_compatibility.py
-	$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
-	  -n UTF-8 -a -m > $@
+	$(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \
+	-e EastAsianWidth.txt -o ../charmaps/UTF-8 \
+	-n UTF-8 -a -m -c > $@
 
 check-UTF-8: UTF-8-report
 	@if grep '^Total.*: [^0]' UTF-8-report; \
 	then echo manual verification required; false; else true; fi
 
+translit_combining: UnicodeData.txt
+translit_combining: gen_translit_combining.py
+	$(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
+
+translit_compat: UnicodeData.txt
+translit_compat: gen_translit_compat.py
+	$(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
+
+translit_circle: UnicodeData.txt
+translit_circle: gen_translit_circle.py
+	$(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
+
+translit_cjk_compat: UnicodeData.txt
+translit_cjk_compat: gen_translit_cjk_compat.py
+	$(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
+
+translit_font: UnicodeData.txt
+translit_font: gen_translit_font.py
+	$(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
+
+translit_fraction: UnicodeData.txt
+translit_fraction: gen_translit_fraction.py
+	$(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \
+	-o $@ --unicode_version $(UNICODE_VERSION)
 
 .PHONY: downloads clean-downloads
 downloads: $(DOWNLOADS)
diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py
new file mode 100644
index 0000000000..6142859d58
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_circle.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_circle file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_circle file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ ./gen_translit_circle.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_circle file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    before the “translit_start” line.
+    '''
+    if ARGS.input_file and head:
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of encircled characters.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_circle.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file'''
+    if ARGS.input_file and tail:
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+            code_point]['decomposition']
+        if decomposition.startswith('<circle>'):
+            decomposition = decomposition[9:]
+            decomposed_code_points = [int(x, 16)
+                                      for x in decomposition.split(' ')]
+            translit_file.write('% {:s}\n'.format(name))
+            translit_file.write('{:s} "<U0028>'.format(
+                unicode_utils.ucs_symbol(code_point)))
+            for decomposed_code_point in decomposed_code_points:
+                translit_file.write('{:s}'.format(
+                    unicode_utils.ucs_symbol(decomposed_code_point)))
+            translit_file.write('<U0029>"\n')
+    translit_file.write('\n')
+
+
+if __name__ == "__main__":  # command-line entry point
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_circle file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_circle
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_circle.new',
+        help='''The new translit_circle file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_circle file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()  # global: output_head/output_tail read it
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''  # empty → the built-in default head/tail are written
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py
new file mode 100644
index 0000000000..627ff6bdd9
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_cjk_compat.py
@@ -0,0 +1,220 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_cjk_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_cjk_compat file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ ./gen_translit_cjk_compat.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import sys
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_cjk_compat file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    before the “translit_start” line.
+    '''
+    if ARGS.input_file and head:
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of CJK compatibility ')
+        translit_file.write('characters.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_cjk_compat.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file'''
+    if ARGS.input_file and tail:
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def special_decompose(code_point_list):
+    '''
+    Decompositions which are not in UnicodeData.txt at all but which
+    were used in the original translit_cjk_compat file in glibc and
+    which seem to make sense.  I want to keep the update of
+    translit_cjk_compat close to the spirit of the original file,
+    therefore I added these special decomposition rules here.
+    '''
+    special_decompose_dict = {
+        (0x2215,): [0x002F], # ∕ → /
+        (0x00B2,): [0x005E, 0x0032], # ² → ^2
+        (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
+        (0x2113,): [0x006C], # ℓ → l
+        (0x00B3,): [0x005E, 0x0033], # ³ → ^3
+        (0x00B5,): [0x0075], # µ → u
+        (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
+        (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
+            0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2],
+        (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2],
+    }
+    if tuple(code_point_list) in special_decompose_dict:
+        return special_decompose_dict[tuple(code_point_list)]
+    else:
+        return code_point_list
+
+def output_transliteration(translit_file):
+    '''Write the <square> and CJK COMPATIBILITY IDEOGRAPH transliterations'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+            code_point]['decomposition']
+        if decomposition.startswith('<square>'):
+            decomposition = decomposition[9:]  # drop the '<square> ' tag
+            decomposed_code_points = [[int(x, 16)
+                                       for x in decomposition.split(' ')]]
+            if decomposed_code_points[0]:
+                while True:  # append alternatives until nothing changes
+                    special_decomposed_code_points = special_decompose(
+                        decomposed_code_points[-1])
+                    if (special_decomposed_code_points
+                            != decomposed_code_points[-1]):
+                        decomposed_code_points.append(
+                            special_decomposed_code_points)
+                        continue
+                    special_decomposed_code_points = []
+                    for decomposed_code_point in decomposed_code_points[-1]:
+                        special_decomposed_code_points += special_decompose(
+                            [decomposed_code_point])
+                    if (special_decomposed_code_points
+                            == decomposed_code_points[-1]):
+                        break
+                    decomposed_code_points.append(
+                        special_decomposed_code_points)
+                translit_file.write('% {:s}\n'.format(name))
+                translit_file.write('{:s} '.format(
+                    unicode_utils.ucs_symbol(code_point)))
+                for index in range(0, len(decomposed_code_points)):
+                    if index > 0:
+                        translit_file.write(';')  # separates alternatives
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')  # quote multi-symbol results
+                    for decomposed_code_point in decomposed_code_points[index]:
+                        translit_file.write('{:s}'.format(
+                            unicode_utils.ucs_symbol(decomposed_code_point)))
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')
+                translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+            code_point]['decomposition']
+        if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
+            decomposed_code_points = [int(x, 16)
+                                      for x in decomposition.split(' ')]
+            if len(decomposed_code_points) != 1:
+                sys.stderr.write(
+                    'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
+                        code_point, name, decomposition))
+                sys.exit(1)  # not the site-injected exit(); absent with -S
+            translit_file.write('% {:s}\n'.format(name))
+            translit_file.write('{:s} '.format(
+                unicode_utils.ucs_symbol(code_point)))
+            for decomposed_code_point in decomposed_code_points:
+                translit_file.write('{:s}'.format(
+                    unicode_utils.ucs_symbol(decomposed_code_point)))
+            translit_file.write('\n')
+    translit_file.write('\n')
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_cjk_compat file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_cjk_compat
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_cjk_compat.new',
+        help='''The new translit_cjk_compat file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_cjk_compat file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py
new file mode 100644
index 0000000000..2551ce1652
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_combining.py
@@ -0,0 +1,442 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_combining file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_combining file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ ./gen_translit_combining.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_combining file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    before the “translit_start” line.
+    '''
+    if ARGS.input_file and head:
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations that remove all ')
+        translit_file.write('combining characters (accents,\n')
+        translit_file.write('% pronounciation marks, etc.).\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_combining.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file'''
+    if ARGS.input_file and tail:
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def is_combining_remove(code_point):
+    '''Check whether this is a combining character which should be listed
+    in the section of the translit_combining file where combining
+    characters are replaced by empty strings.
+
+    We ignore combining characters from many scripts here because
+    the original translit_combining file didn’t do this for the
+    combining characters from these scripts either and I am not
+    sure yet whether this would be useful to do for all combining
+    characters or not. For the moment I think it is better to keep
+    close to the spirit of the original file.
+    '''
+    if not unicode_utils.is_combining(code_point):
+        return False
+    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+    for substring in ('DEVANAGARI',
+                      'BENGALI',
+                      'CYRILLIC',
+                      'SYRIAC',
+                      'THAANA',
+                      'NKO',
+                      'GURMUKHI',
+                      'TAMIL',
+                      'GUJARATI',
+                      'ORIYA',
+                      'TELUGU',
+                      'KANNADA',
+                      'MALAYALAM',
+                      'SINHALA',
+                      'THAI',
+                      'LAO',
+                      'TIBETAN',
+                      'MYANMAR',
+                      'ETHIOPIC',
+                      'TAGALOG',
+                      'HANUNOO',
+                      'BUHID',
+                      'TAGBANWA',
+                      'KHMER',
+                      'MONGOLIAN',
+                      'LIMBU',
+                      'NEW TAI LUE',
+                      'BUGINESE',
+                      'BALINESE',
+                      'SUNDANESE',
+                      'LEPCHA',
+                      'IDEOGRAPHIC',
+                      'HANGUL',
+                      'SYLOTI',
+                      'SAURASHTRA',
+                      'KAYAH',
+                      'REJANG',
+                      'CHAM',
+                      'VARIATION SELECTOR',
+                      'KHAROSHTHI',
+                      'MUSICAL SYMBOL',
+                      'SAMARITAN',
+                      'MANDAIC',
+                      'TAI THAM',
+                      'BATAK',
+                      'VEDIC',
+                      'COPTIC',
+                      'TIFINAGH',
+                      'BAMUM',
+                      'JAVANESE',
+                      'TAI VIET',
+                      'MEETEI',
+                      'MANICHAEAN',
+                      'BRAHMI',
+                      'KAITHI',
+                      'CHAKMA',
+                      'MAHAJANI',
+                      'SHARADA',
+                      'KHOJKI',
+                      'KHUDAWADI',
+                      'GRANTHA',
+                      'TIRHUTA',
+                      'SIDDHAM',
+                      'MODI VOWEL',
+                      'MODI SIGN',
+                      'TAKRI',
+                      'BASSA VAH',
+                      'PAHAWH HMONG',
+                      'MIAO',
+                      'DUPLOYAN',
+                      'MENDE KIKAKUI'
+    ):
+        if substring in name:
+            return False
+    return True
+
+def canonical_decompose(code_point):
+    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
+
+    In some instances a canonical mapping or a compatibility mapping
+    may consist of a single character. For a canonical mapping, this
+    indicates that the character is a canonical equivalent of another
+    single character. For a compatibility mapping, this indicates that
+    the character is a compatibility equivalent of another single
+    character.
+
+    A canonical mapping may also consist of a pair of characters, but
+    is never longer than two characters. When a canonical mapping
+    consists of a pair of characters, the first character may itself
+    be a character with a decomposition mapping, but the second
+    character never has a decomposition mapping.
+
+    We ignore the canonical decomposition for code points
+    matching certain substrings because the original translit_combining
+    file didn’t include these types of characters either. I am unsure
+    about the usefulness of including them and want to keep close
+    to the spirit of the original file for the moment.
+    '''
+    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+    for substring in ('MUSICAL SYMBOL',
+                      'CJK COMPATIBILITY IDEOGRAPH',
+                      'BALINESE',
+                      'KAITHI LETTER',
+                      'CHAKMA VOWEL',
+                      'GRANTHA VOWEL',
+                      'TIRHUTA VOWEL',
+                      'SIDDHAM VOWEL'):
+        if substring in name:
+            return []
+    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+        code_point]['decomposition']
+    if decomposition and not decomposition.startswith('<'):
+        decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')]
+        if decomposed_code_points:
+            cd0 = canonical_decompose(decomposed_code_points[0])
+            if cd0:
+                decomposed_code_points = cd0 + decomposed_code_points[1:]
+        return decomposed_code_points
+    else:
+        return []
+
+def special_decompose(code_point_list):
+    '''
+    Decompositions which are not canonical or which are not in
+    UnicodeData.txt at all but some of these were used in the original
+    translit_combining file in glibc and they seemed to make sense.
+    I want to keep the update of translit_combining close to the
+    spirit of the original file, therefore I added these special
+    decomposition rules here.
+    '''
+    special_decompose_dict = {
+        # Ø U+00D8 is already handled in translit_neutral. But
+        # translit_combining is usually included after translit_neutral
+        # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+        # has a canonical decomposition to Ø U+00D8 and we want to
+        # further decompose this to U+004F.
+        (0x00D8,): [0x004F], # Ø → O
+        # ø U+00F8 is already handled in translit_neutral. But
+        # translit_combining is usually included after translit_neutral
+        # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
+        # has a canonical decomposition to ø U+00F8 and we want to
+        # further decompose this to U+006F.
+        (0x00F8,): [0x006F], # ø → o
+        # æ U+00E6 is already in translit_compat because ligatures
+        # are handled in translit_compat. But ǣ U+01E3 has a
+        # canonical decomposition to U+00E6, U+0304 and we want to
+        # further decompose this to “ae”.
+        (0x00E6,): [0x0061, 0x0065], # æ → ae
+        # Æ U+00C6  is already in translit_compat because ligatures
+        # are handled in translit_compat. But Ǣ U+01E2 has a
+        # canonical decomposition to U+00C6, U+0304 and we want to
+        # further decompose this to “AE”
+        (0x00C6,): [0x0041, 0x0045], # Æ → AE
+        # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
+        # translit_compat because ligatures are handled in translit_compat.
+        # But U+FB1F has a canonical decomposition to U+05F2 and
+        # we want to further decompose this to U+05D9, U+05D9.
+        (0x05F2,): [0x05D9, 0x05D9], # ײ → יי
+        # 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt
+        # But U+2000 EN QUAD has a canonical decomposition U+2002
+        # and we want to further decompose this to U+0020.
+        (0x2002,): [0x0020], # EN SPACE → SPACE
+        # 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt
+        # But U+2001 EM QUAD has a canonical decomposition to U+2003
+        # and we want to further decompose this to U+0020.
+        (0x2003,): [0x0020], # EM SPACE → SPACE
+        # U+2260 ≠ has the canonical decomposition U+003D U+0338
+        # (= followed by ̸). After stripping the combining characters,
+        # the result is only = which reverses the meaning.
+        # Therefore, we add special rules here for such mathematical
+        # negations:
+        (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
+        (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
+        (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
+        (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
+        (0x2204,): [0x0021, 0x2203], # ∄ → !∃
+        (0x2209,): [0x0021, 0x2208], # ∉ → !∈
+        (0x220C,): [0x0021, 0x220B], # ∌ → !∋
+        (0x2224,): [0x0021, 0x2223], # ∤ → !∣
+        (0x2226,): [0x0021, 0x2225], # ∦ → !∥
+        (0x2241,): [0x0021, 0x007E], # ≁ → !~
+        (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
+        (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
+        (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
+        (0x2260,): [0x0021, 0x003D], # ≠ → !=
+        (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
+        (0x226D,): [0x0021, 0x224D], # ≭ → !≍
+        (0x226E,): [0x0021, 0x003C], # ≮ → !<
+        (0x226F,): [0x0021, 0x003E], # ≯ → !>
+        (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
+        (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
+        (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
+        (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
+        (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
+        (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
+        (0x2280,): [0x0021, 0x227A], # ⊀ → !≺
+        (0x2281,): [0x0021, 0x227B], # ⊁ → !≻
+        (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
+        (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
+        (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
+        (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
+        (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
+        (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
+        (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
+        (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
+        (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
+        (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
+        (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
+        (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
+        (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
+        (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
+        (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
+        (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
+        (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
+        # Special rule for 〈 U+3008 is added
+        # because 〈 U+2329 has the canonical decomposition U+3008
+        # and we want to further decompose this to < U+003C.
+        (0x3008,): [0x003C], # 〈 → <
+        # Special rule for 〉 U+3009 is added
+        # because 〉 U+232A has the canonical decomposition U+3009
+        # and we want to further decompose this to > U+003E.
+        (0x3009,): [0x003E], # 〉→ >
+    }
+    if tuple(code_point_list) in special_decompose_dict:
+        return special_decompose_dict[tuple(code_point_list)]
+    else:
+        return code_point_list
+
+def output_combining_remove(translit_file):
+    '''Write the section of the translit_combining file where combining
+    characters are replaced by empty strings.
+    '''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        if is_combining_remove(code_point):
+            translit_file.write('% {:s}\n'.format(name))
+            translit_file.write('{:s} ""\n'.format(
+                unicode_utils.ucs_symbol(code_point)))
+    translit_file.write('\n')
+
+def output_decompositions(translit_file):
+    '''Write the section of the translit_combining file where
+    characters are decomposed and combining characters stripped from
+    the decompositions.
+    '''
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        if special_decompose([code_point]) != [code_point]:
+            decomposed_code_points = [special_decompose([code_point])]
+        else:
+            decomposed_code_points = [canonical_decompose(code_point)]
+        if decomposed_code_points[0]:
+            while True:
+                special_decomposed_code_points = special_decompose(
+                    decomposed_code_points[-1])
+                if (special_decomposed_code_points
+                        != decomposed_code_points[-1]):
+                    decomposed_code_points.append(
+                        special_decomposed_code_points)
+                    continue
+                special_decomposed_code_points = []
+                for decomposed_code_point in decomposed_code_points[-1]:
+                    special_decomposed_code_points += special_decompose(
+                        [decomposed_code_point])
+                if (special_decomposed_code_points
+                        == decomposed_code_points[-1]):
+                    break
+                decomposed_code_points.append(
+                    special_decomposed_code_points)
+            for index in range(0, len(decomposed_code_points)):
+                decomposed_code_points[index] = [
+                    x for x in decomposed_code_points[index]
+                    if not is_combining_remove(x)]
+        if decomposed_code_points[0]:
+            translit_file.write('% {:s}\n'.format(
+                unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
+            translit_file.write('{:s} '.format(
+                unicode_utils.ucs_symbol(code_point)))
+            for index in range(0, len(decomposed_code_points)):
+                if index > 0:
+                    translit_file.write(';')
+                if len(decomposed_code_points[index]) > 1:
+                    translit_file.write('"')
+                for decomposed_code_point in decomposed_code_points[index]:
+                    translit_file.write('{:s}'.format(
+                        unicode_utils.ucs_symbol(decomposed_code_point)))
+                if len(decomposed_code_points[index]) > 1:
+                    translit_file.write('"')
+            translit_file.write('\n')
+    translit_file.write('\n')
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    output_combining_remove(translit_file)
+    output_decompositions(translit_file)
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_combining file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_combining
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_combining.new',
+        help='''The new translit_combining file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_combining file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
new file mode 100644
index 0000000000..0e824a877e
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_compat.py
@@ -0,0 +1,326 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_compat file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ python3 ./gen_translit_compat.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_compat file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    up to and including the “translit_start” line.
+    '''
+    if head:  # caller supplied the original header: copy it verbatim
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of compatibility characters ')
+        translit_file.write('and ligatures.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_compat.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file, from “translit_end” onwards'''
+    if tail:  # caller supplied the original tail: copy it verbatim
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def compatibility_decompose(code_point):
+    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
+
+    “The compatibility decomposition is formed by recursively applying
+    the canonical and compatibility mappings, then applying the
+    Canonical Ordering Algorithm.”
+
+    We don’t do the canonical decomposition here because this is
+    done in gen_translit_combining.py to generate translit_combining.
+
+    And we ignore some of the possible compatibility formatting tags
+    here. Some of them are used in other translit_* files, not
+    translit_compat:
+
+    <font>:   translit_font
+    <circle>: translit_circle
+    <wide>:   translit_wide
+    <narrow>: translit_narrow
+    <square>: translit_cjk_compat
+    <fraction>: translit_fraction
+
+    And we ignore
+
+    <noBreak>, <initial>, <medial>, <final>, <isolated>
+
+    because they seem to be not useful for transliteration.
+    '''
+    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+        code_point]['decomposition']
+    compatibility_tags = (
+        '<compat>', '<super>', '<sub>', '<vertical>')
+    for compatibility_tag in compatibility_tags:
+        if decomposition.startswith(compatibility_tag):
+            decomposition = decomposition[len(compatibility_tag)+1:]
+            decomposed_code_points = [int(x, 16)
+                                      for x in decomposition.split(' ')]
+            if (len(decomposed_code_points) > 1
+                    and decomposed_code_points[0] == 0x0020
+                    and decomposed_code_points[1] >= 0x0300
+                    and decomposed_code_points[1] <= 0x03FF):
+                # Decomposes into a space followed by a combining character.
+                # This is not useful for transliteration.
+                return []
+            else:
+                return_value = []
+                for index in range(0, len(decomposed_code_points)):
+                    cd_code_points = compatibility_decompose(
+                        decomposed_code_points[index])
+                    if cd_code_points:
+                        return_value += cd_code_points
+                    else:
+                        return_value += [decomposed_code_points[index]]
+                return return_value
+    return []
+
+def special_decompose(code_point_list):
+    '''
+    Decompositions which are not in UnicodeData.txt at all but which
+    were used in the original translit_compat file in glibc and
+    which seem to make sense.  I want to keep the update of
+    translit_compat close to the spirit of the original file,
+    therefore I added these special decomposition rules here.
+    '''
+    special_decompose_dict = {
+        (0x03BC,): [0x0075], # μ → u
+        (0x02BC,): [0x0027], # ʼ → '
+    }
+    if tuple(code_point_list) in special_decompose_dict:
+        return special_decompose_dict[tuple(code_point_list)]
+    else:
+        return code_point_list
+
+def special_ligature_decompose(code_point):
+    '''
+    Decompositions for ligatures which are not in UnicodeData.txt at
+    all but which were used in the original translit_compat file in
+    glibc and which seem to make sense.  I want to keep the update of
+    translit_compat close to the spirit of the original file,
+    therefore I added these special ligature decomposition rules here.
+
+    '''
+    special_ligature_decompose_dict = {
+        0x00E6: [0x0061, 0x0065], # æ → ae
+        0x00C6: [0x0041, 0x0045], # Æ → AE
+        # These following 5 special ligature decompositions were
+        # in the original glibc/localedata/locales/translit_compat file
+        0x0152: [0x004F, 0x0045], # Œ → OE
+        0x0153: [0x006F, 0x0065], # œ → oe
+        0x05F0: [0x05D5, 0x05D5], # װ → וו
+        0x05F1: [0x05D5, 0x05D9], # ױ → וי
+        0x05F2: [0x05D9, 0x05D9], # ײ → יי
+        # The following special ligature decompositions were
+        # not in the original glibc/localedata/locales/translit_compat file
+        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
+        # → U+041D CYRILLIC CAPITAL LETTER EN,
+        #   U+0413 CYRILLIC CAPITAL LETTER GHE
+        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
+        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
+        # → U+043D CYRILLIC SMALL LETTER EN,
+        #   U+0433 CYRILLIC SMALL LETTER GHE
+        0x04A5: [0x043D, 0x0433], # ҥ → нг
+        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
+        # → U+0422 CYRILLIC CAPITAL LETTER TE,
+        #   U+0426 CYRILLIC CAPITAL LETTER TSE
+        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
+        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
+        # → U+0442 CYRILLIC SMALL LETTER TE,
+        #   U+0446 CYRILLIC SMALL LETTER TSE
+        0x04B5: [0x0442, 0x0446], # ҵ → тц
+        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
+        # → U+0410 CYRILLIC CAPITAL LETTER A,
+        #   U+0415 CYRILLIC CAPITAL LETTER IE
+        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
+        # U+04D5 CYRILLIC SMALL LIGATURE A IE
+        # → U+0430 CYRILLIC SMALL LETTER A,
+        #   U+0435 CYRILLIC SMALL LETTER IE
+        0x04D5: [0x0430, 0x0435], # ӕ → ае
+        # I am not sure what to do with the following ligatures
+        # maybe it makes no sense to decompose them:
+        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
+        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
+        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
+        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+        # U+fe20 COMBINING LIGATURE LEFT HALF
+        # U+fe21 COMBINING LIGATURE RIGHT HALF
+        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
+        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
+        # U+11176 MAHAJANI LIGATURE SHRI
+        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
+        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
+        # U+1f672 LIGATURE OPEN ET ORNAMENT
+        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
+    }
+    if code_point in special_ligature_decompose_dict:
+        return special_ligature_decompose_dict[code_point]
+    else:
+        return [code_point]
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposed_code_points = [compatibility_decompose(code_point)]
+        if not decomposed_code_points[0]:
+            if special_decompose([code_point]) != [code_point]:
+                decomposed_code_points[0] = special_decompose([code_point])
+        else:
+            special_decomposed_code_points = []
+            while True:
+                special_decomposed_code_points = special_decompose(
+                    decomposed_code_points[-1])
+                if (special_decomposed_code_points
+                        != decomposed_code_points[-1]):
+                    decomposed_code_points.append(
+                        special_decomposed_code_points)
+                    continue
+                special_decomposed_code_points = []
+                for decomposed_code_point in decomposed_code_points[-1]:
+                    special_decomposed_code_points += special_decompose(
+                        [decomposed_code_point])
+                if (special_decomposed_code_points
+                        == decomposed_code_points[-1]):
+                    break
+                decomposed_code_points.append(
+                    special_decomposed_code_points)
+        if decomposed_code_points[0]:
+            translit_file.write('% {:s}\n'.format(name))
+            translit_file.write('{:s} '.format(
+                unicode_utils.ucs_symbol(code_point)))
+            for index in range(0, len(decomposed_code_points)):
+                if index > 0:
+                    translit_file.write(';')
+                translit_file.write('"')
+                for decomposed_code_point in decomposed_code_points[index]:
+                    translit_file.write('{:s}'.format(
+                        unicode_utils.ucs_symbol(decomposed_code_point)))
+                translit_file.write('"')
+            translit_file.write('\n')
+        elif 'LIGATURE' in name and 'ARABIC' not in name:
+            decomposed_code_points = special_ligature_decompose(code_point)
+            if decomposed_code_points[0] != code_point:
+                translit_file.write('% {:s}\n'.format(name))
+                translit_file.write('{:s} '.format(
+                    unicode_utils.ucs_symbol(code_point)))
+                translit_file.write('"')
+                for decomposed_code_point in decomposed_code_points:
+                    translit_file.write('{:s}'.format(
+                        unicode_utils.ucs_symbol(decomposed_code_point)))
+                translit_file.write('"')
+                translit_file.write('\n')
+            else:
+                print('Warning: unhandled ligature: {:x} {:s}'.format(
+                    code_point, name))
+    translit_file.write('\n')
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_compat file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_compat
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_compat.new',
+        help='''The new translit_compat file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_compat file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py
new file mode 100644
index 0000000000..072362223f
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_font.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_font file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_font file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ python3 ./gen_translit_font.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_font file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    up to and including the “translit_start” line.
+    '''
+    if head:  # caller supplied the original header: copy it verbatim
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of font equivalents.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_font.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file, from “translit_end” onwards'''
+    if tail:  # caller supplied the original tail: copy it verbatim
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+            code_point]['decomposition']
+        if decomposition.startswith('<font>'):
+            decomposition = decomposition[7:]
+            decomposed_code_points = [[int(x, 16)
+                                       for x in decomposition.split(' ')]]
+            if decomposed_code_points[0]:
+                translit_file.write('{:s} '.format(
+                    unicode_utils.ucs_symbol(code_point)))
+                for index in range(0, len(decomposed_code_points)):
+                    if index > 0:
+                        translit_file.write(';')
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')
+                    for decomposed_code_point in decomposed_code_points[index]:
+                        translit_file.write('{:s}'.format(
+                            unicode_utils.ucs_symbol(decomposed_code_point)))
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')
+                translit_file.write(' % {:s}\n'.format(name))
+    translit_file.write('\n')
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_font file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_font
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_font.new',
+        help='''The new translit_font file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_font file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py
new file mode 100644
index 0000000000..5bf63ea344
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_fraction.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_fraction file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_fraction file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ python3 ./gen_translit_fraction.py -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_fraction file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    up to and including the “translit_start” line.
+    '''
+    if head:  # caller supplied the original header: copy it verbatim
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of fractions.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_fraction.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('% The replacements have been surrounded ')
+        translit_file.write('with spaces, because fractions are\n')
+        translit_file.write('% often preceded by a decimal number and ')
+        translit_file.write('followed by a unit or a math symbol.\n')
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file, from “translit_end” onwards'''
+    if tail:  # caller supplied the original tail: copy it verbatim
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def special_decompose(code_point_list):
+    '''
+    Decompositions which are not in UnicodeData.txt at all but which
+    were used in the original translit_fraction file in glibc and
+    which seem to make sense.  I want to keep the update of
+    translit_fraction close to the spirit of the original file,
+    therefore I added these special decomposition rules here.
+    '''
+    special_decompose_dict = {
+        (0x2044,): [0x002F], # ⁄ → /
+    }
+    if tuple(code_point_list) in special_decompose_dict:
+        return special_decompose_dict[tuple(code_point_list)]
+    else:
+        return code_point_list
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+            code_point]['decomposition']
+        if decomposition.startswith('<fraction>'):
+            decomposition = decomposition[11:]
+            decomposed_code_points = [[int(x, 16)
+                                       for x in decomposition.split(' ')]]
+            if decomposed_code_points[0]:
+                decomposed_code_points[0] = [0x0020] \
+                                            + decomposed_code_points[0] \
+                                            + [0x0020]
+                while True:
+                    special_decomposed_code_points = special_decompose(
+                        decomposed_code_points[-1])
+                    if (special_decomposed_code_points
+                            != decomposed_code_points[-1]):
+                        decomposed_code_points.append(
+                            special_decomposed_code_points)
+                        continue
+                    special_decomposed_code_points = []
+                    for decomposed_code_point in decomposed_code_points[-1]:
+                        special_decomposed_code_points += special_decompose(
+                            [decomposed_code_point])
+                    if (special_decomposed_code_points
+                            == decomposed_code_points[-1]):
+                        break
+                    decomposed_code_points.append(
+                        special_decomposed_code_points)
+                translit_file.write('% {:s}\n'.format(name))
+                translit_file.write('{:s} '.format(
+                    unicode_utils.ucs_symbol(code_point)))
+                for index in range(0, len(decomposed_code_points)):
+                    if index > 0:
+                        translit_file.write(';')
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')
+                    for decomposed_code_point in decomposed_code_points[index]:
+                        translit_file.write('{:s}'.format(
+                            unicode_utils.ucs_symbol(decomposed_code_point)))
+                    if len(decomposed_code_points[index]) > 1:
+                        translit_file.write('"')
+                translit_file.write('\n')
+    translit_file.write('\n')
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_fraction file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_fraction
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_fraction.new',
+        help='''The new translit_fraction file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_fraction file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:  # reuse head/tail of the existing locale file
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py
index 0c74f2a849..0f064f5ba5 100755
--- a/localedata/unicode-gen/gen_unicode_ctype.py
+++ b/localedata/unicode-gen/gen_unicode_ctype.py
@@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option:
 '''
 
 import argparse
-import sys
 import time
 import re
-
-# Dictionary holding the entire contents of the UnicodeData.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: {'category': 'Cc',
-#      'title': None,
-#      'digit': '',
-#      'name': '<control>',
-#      'bidi': 'BN',
-#      'combining': '0',
-#      'comment': '',
-#      'oldname': 'NULL',
-#      'decomposition': '',
-#      'upper': None,
-#      'mirrored': 'N',
-#      'lower': None,
-#      'decdigit': '',
-#      'numeric': ''},
-#      …
-# }
-UNICODE_ATTRIBUTES = {}
-
-# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {917504: ['Default_Ignorable_Code_Point'],
-#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
-#  …
-# }
-DERIVED_CORE_PROPERTIES = {}
-
-def fill_attribute(code_point, fields):
-    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
-
-    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
-    in the UnicodeData.txt file.
-
-    '''
-    UNICODE_ATTRIBUTES[code_point] =  {
-        'name': fields[1],          # Character name
-        'category': fields[2],      # General category
-        'combining': fields[3],     # Canonical combining classes
-        'bidi': fields[4],          # Bidirectional category
-        'decomposition': fields[5], # Character decomposition mapping
-        'decdigit': fields[6],      # Decimal digit value
-        'digit': fields[7],         # Digit value
-        'numeric': fields[8],       # Numeric value
-        'mirrored': fields[9],      # mirrored
-        'oldname': fields[10],      # Old Unicode 1.0 name
-        'comment': fields[11],      # comment
-        # Uppercase mapping
-        'upper': int(fields[12], 16) if fields[12] else None,
-        # Lowercase mapping
-        'lower': int(fields[13], 16) if fields[13] else None,
-        # Titlecase mapping
-        'title': int(fields[14], 16) if fields[14] else None,
-    }
-
-def fill_attributes(filename):
-    '''Stores the entire contents of the UnicodeData.txt file
-    in the UNICODE_ATTRIBUTES dictionary.
-
-    A typical line for a single code point in UnicodeData.txt looks
-    like this:
-
-    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
-
-    Code point ranges are indicated by pairs of lines like this:
-
-    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
-    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
-    '''
-    with open(filename, mode='r') as unicode_data_file:
-        fields_start = []
-        for line in unicode_data_file:
-            fields = line.strip().split(';')
-            if len(fields) != 15:
-                sys.stderr.write(
-                    'short line in file "%(f)s": %(l)s\n' %{
-                    'f': filename, 'l': line})
-                exit(1)
-            if fields[2] == 'Cs':
-                # Surrogates are UTF-16 artefacts,
-                # not real characters. Ignore them.
-                fields_start = []
-                continue
-            if fields[1].endswith(', First>'):
-                fields_start = fields
-                fields_start[1] = fields_start[1].split(',')[0][1:]
-                continue
-            if fields[1].endswith(', Last>'):
-                fields[1] = fields[1].split(',')[0][1:]
-                if fields[1:] != fields_start[1:]:
-                    sys.stderr.write(
-                        'broken code point range in file "%(f)s": %(l)s\n' %{
-                            'f': filename, 'l': line})
-                    exit(1)
-                for code_point in range(
-                        int(fields_start[0], 16),
-                        int(fields[0], 16)+1):
-                    fill_attribute(code_point, fields)
-                fields_start = []
-                continue
-            fill_attribute(int(fields[0], 16), fields)
-            fields_start = []
-
-def fill_derived_core_properties(filename):
-    '''Stores the entire contents of the DerivedCoreProperties.txt file
-    in the DERIVED_CORE_PROPERTIES dictionary.
-
-    Lines in DerivedCoreProperties.txt are either a code point range like
-    this:
-
-    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
-
-    or a single code point like this:
-
-    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
-
-    '''
-    with open(filename, mode='r') as derived_core_properties_file:
-        for line in derived_core_properties_file:
-            match = re.match(
-                r'^(?P<codepoint1>[0-9A-F]{4,6})'
-                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
-                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
-                line)
-            if not match:
-                continue
-            start = match.group('codepoint1')
-            end = match.group('codepoint2')
-            if not end:
-                end = start
-            for code_point in range(int(start, 16), int(end, 16)+1):
-                prop = match.group('property')
-                if code_point in DERIVED_CORE_PROPERTIES:
-                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
-                else:
-                    DERIVED_CORE_PROPERTIES[code_point] = [prop]
-
-def to_upper(code_point):
-    '''Returns the code point of the uppercase version
-    of the given code point'''
-    if (UNICODE_ATTRIBUTES[code_point]['name']
-        and UNICODE_ATTRIBUTES[code_point]['upper']):
-        return UNICODE_ATTRIBUTES[code_point]['upper']
-    else:
-        return code_point
-
-def to_lower(code_point):
-    '''Returns the code point of the lowercase version
-    of the given code point'''
-    if (UNICODE_ATTRIBUTES[code_point]['name']
-        and UNICODE_ATTRIBUTES[code_point]['lower']):
-        return UNICODE_ATTRIBUTES[code_point]['lower']
-    else:
-        return code_point
-
-def to_title(code_point):
-    '''Returns the code point of the titlecase version
-    of the given code point'''
-    if (UNICODE_ATTRIBUTES[code_point]['name']
-        and UNICODE_ATTRIBUTES[code_point]['title']):
-        return UNICODE_ATTRIBUTES[code_point]['title']
-    else:
-        return code_point
-
-def is_upper(code_point):
-    '''Checks whether the character with this code point is uppercase'''
-    return (to_lower(code_point) != code_point
-            or (code_point in DERIVED_CORE_PROPERTIES
-                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
-
-def is_lower(code_point):
-    '''Checks whether the character with this code point is lowercase'''
-    # Some characters are defined as “Lowercase” in
-    # DerivedCoreProperties.txt but do not have a mapping to upper
-    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
-    # one of these.
-    return (to_upper(code_point) != code_point
-            # <U00DF> is lowercase, but without simple to_upper mapping.
-            or code_point == 0x00DF
-            or (code_point in DERIVED_CORE_PROPERTIES
-                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
-
-def is_alpha(code_point):
-    '''Checks whether the character with this code point is alphabetic'''
-    return ((code_point in DERIVED_CORE_PROPERTIES
-             and
-             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
-            or
-            # Consider all the non-ASCII digits as alphabetic.
-            # ISO C 99 forbids us to have them in category “digit”,
-            # but we want iswalnum to return true on them.
-            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
-             and not (code_point >= 0x0030 and code_point <= 0x0039)))
-
-def is_digit(code_point):
-    '''Checks whether the character with this code point is a digit'''
-    if False:
-        return (UNICODE_ATTRIBUTES[code_point]['name']
-                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
-        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
-        # a zero.  Must add <0> in front of them by hand.
-    else:
-        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
-        # takes it away:
-        # 7.25.2.1.5:
-        #    The iswdigit function tests for any wide character that
-        #    corresponds to a decimal-digit character (as defined in 5.2.1).
-        # 5.2.1:
-        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
-        return (code_point >= 0x0030 and code_point <= 0x0039)
-
-def is_outdigit(code_point):
-    '''Checks whether the character with this code point is outdigit'''
-    return (code_point >= 0x0030 and code_point <= 0x0039)
-
-def is_blank(code_point):
-    '''Checks whether the character with this code point is blank'''
-    return (code_point == 0x0009 # '\t'
-            # Category Zs without mention of '<noBreak>'
-            or (UNICODE_ATTRIBUTES[code_point]['name']
-                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
-                and '<noBreak>' not in
-                UNICODE_ATTRIBUTES[code_point]['decomposition']))
-
-def is_space(code_point):
-    '''Checks whether the character with this code point is a space'''
-    # Don’t make U+00A0 a space. Non-breaking space means that all programs
-    # should treat it like a punctuation character, not like a space.
-    return (code_point == 0x0020 # ' '
-            or code_point == 0x000C # '\f'
-            or code_point == 0x000A # '\n'
-            or code_point == 0x000D # '\r'
-            or code_point == 0x0009 # '\t'
-            or code_point == 0x000B # '\v'
-            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
-            or (UNICODE_ATTRIBUTES[code_point]['name']
-                and
-                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
-                 or
-                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
-                  and
-                  '<noBreak>' not in
-                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))
-
-def is_cntrl(code_point):
-    '''Checks whether the character with this code point is
-    a control character'''
-    return (UNICODE_ATTRIBUTES[code_point]['name']
-            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
-                 or
-                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
-
-def is_xdigit(code_point):
-    '''Checks whether the character with this code point is
-    a hexadecimal digit'''
-    if False:
-        return (is_digit(code_point)
-                or (code_point >= 0x0041 and code_point <= 0x0046)
-                or (code_point >= 0x0061 and code_point <= 0x0066))
-    else:
-        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
-        # takes it away:
-        # 7.25.2.1.12:
-        #    The iswxdigit function tests for any wide character that
-        #    corresponds to a hexadecimal-digit character (as defined
-        #    in 6.4.4.1).
-        # 6.4.4.1:
-        #    hexadecimal-digit: one of
-        #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
-        return ((code_point >= 0x0030 and code_point  <= 0x0039)
-                or (code_point >= 0x0041 and code_point <= 0x0046)
-                or (code_point >= 0x0061 and code_point <= 0x0066))
-
-def is_graph(code_point):
-    '''Checks whether the character with this code point is
-    a graphical character'''
-    return (UNICODE_ATTRIBUTES[code_point]['name']
-            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
-            and not is_space(code_point))
-
-def is_print(code_point):
-    '''Checks whether the character with this code point is printable'''
-    return (UNICODE_ATTRIBUTES[code_point]['name']
-            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
-            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
-
-def is_punct(code_point):
-    '''Checks whether the character with this code point is punctuation'''
-    if False:
-        return (UNICODE_ATTRIBUTES[code_point]['name']
-                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
-    else:
-        # The traditional POSIX definition of punctuation is every graphic,
-        # non-alphanumeric character.
-        return (is_graph(code_point)
-                and not is_alpha(code_point)
-                and not is_digit(code_point))
-
-def is_combining(code_point):
-    '''Checks whether the character with this code point is
-    a combining character'''
-    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
-    # file. In 3.0.1 it was identical to the union of the general categories
-    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
-    # PropList.txt file, so we take the latter definition.
-    return (UNICODE_ATTRIBUTES[code_point]['name']
-            and
-            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
-
-def is_combining_level3(code_point):
-    '''Checks whether the character with this code point is
-    a combining level3 character'''
-    return (is_combining(code_point)
-            and
-            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
-
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
-
-def ucs_symbol_range(code_point_low, code_point_high):
-    '''Returns a string UCS symbol string for a code point range.
-
-    Example:
-
-    <U0041>..<U005A>
-    '''
-    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
+import unicode_utils
 
 def code_point_ranges(is_class_function):
     '''Returns a list of ranges of code points for which is_class_function
@@ -379,7 +43,7 @@ def code_point_ranges(is_class_function):
     [[65, 90], [192, 214], [216, 222], [256], … ]
     '''
     cp_ranges  = []
-    for code_point in sorted(UNICODE_ATTRIBUTES):
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
         if is_class_function(code_point):
             if (cp_ranges
                 and cp_ranges[-1][-1] == code_point - 1):
@@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function):
             if line.strip():
                 line  += ';'
             if len(code_point_range) == 1:
-                range_string = ucs_symbol(code_point_range[0])
+                range_string = unicode_utils.ucs_symbol(code_point_range[0])
             else:
-                range_string = ucs_symbol_range(
+                range_string = unicode_utils.ucs_symbol_range(
                     code_point_range[0], code_point_range[-1])
             if len(line+range_string) > max_column:
                 i18n_file.write(line+'/\n')
@@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function):
     line = prefix
     map_string = ''
     i18n_file.write('%s /\n' %map_name)
-    for code_point in sorted(UNICODE_ATTRIBUTES):
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
         mapped = map_function(code_point)
         if code_point != mapped:
             if line.strip():
                 line += ';'
             map_string = '(' \
-                         + ucs_symbol(code_point) \
+                         + unicode_utils.ucs_symbol(code_point) \
                          + ',' \
-                         + ucs_symbol(mapped) \
+                         + unicode_utils.ucs_symbol(mapped) \
                          + ')'
             if len(line+map_string) > max_column:
                 i18n_file.write(line+'/\n')
@@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function):
         i18n_file.write(line+'\n')
     i18n_file.write('\n')
 
-def verifications():
-    '''Tests whether the is_* functions observe the known restrictions'''
-    for code_point in sorted(UNICODE_ATTRIBUTES):
-        # toupper restriction: "Only characters specified for the keywords
-        # lower and upper shall be specified.
-        if (to_upper(code_point) != code_point
-            and not (is_lower(code_point) or is_upper(code_point))):
-            sys.stderr.write(
-                ('%(sym)s is not upper|lower '
-                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
-                    'sym': ucs_symbol(code_point),
-                    'c': code_point,
-                    'uc': to_upper(code_point)})
-        # tolower restriction: "Only characters specified for the keywords
-        # lower and upper shall be specified.
-        if (to_lower(code_point) != code_point
-            and not (is_lower(code_point) or is_upper(code_point))):
-            sys.stderr.write(
-                ('%(sym)s is not upper|lower '
-                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
-                    'sym': ucs_symbol(code_point),
-                    'c': code_point,
-                    'uc': to_lower(code_point)})
-        # alpha restriction: "Characters classified as either upper or lower
-        # shall automatically belong to this class.
-        if ((is_lower(code_point) or is_upper(code_point))
-             and not is_alpha(code_point)):
-            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
-                'sym': ucs_symbol(code_point)})
-        # alpha restriction: “No character specified for the keywords cntrl,
-        # digit, punct or space shall be specified.”
-        if (is_alpha(code_point) and is_cntrl(code_point)):
-            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_alpha(code_point) and is_digit(code_point)):
-            sys.stderr.write('%(sym)s is alpha and digit\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_alpha(code_point) and is_punct(code_point)):
-            sys.stderr.write('%(sym)s is alpha and punct\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_alpha(code_point) and is_space(code_point)):
-            sys.stderr.write('%(sym)s is alpha and space\n' %{
-                'sym': ucs_symbol(code_point)})
-        # space restriction: “No character specified for the keywords upper,
-        # lower, alpha, digit, graph or xdigit shall be specified.”
-        # upper, lower, alpha already checked above.
-        if (is_space(code_point) and is_digit(code_point)):
-            sys.stderr.write('%(sym)s is space and digit\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_space(code_point) and is_graph(code_point)):
-            sys.stderr.write('%(sym)s is space and graph\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_space(code_point) and is_xdigit(code_point)):
-            sys.stderr.write('%(sym)s is space and xdigit\n' %{
-                'sym': ucs_symbol(code_point)})
-        # cntrl restriction: “No character specified for the keywords upper,
-        # lower, alpha, digit, punct, graph, print or xdigit shall be
-        # specified.”  upper, lower, alpha already checked above.
-        if (is_cntrl(code_point) and is_digit(code_point)):
-            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_cntrl(code_point) and is_punct(code_point)):
-            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_cntrl(code_point) and is_graph(code_point)):
-            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_cntrl(code_point) and is_print(code_point)):
-            sys.stderr.write('%(sym)s is cntrl and print\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_cntrl(code_point) and is_xdigit(code_point)):
-            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
-                'sym': ucs_symbol(code_point)})
-        # punct restriction: “No character specified for the keywords upper,
-        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
-        # be specified.”  upper, lower, alpha, cntrl already checked above.
-        if (is_punct(code_point) and is_digit(code_point)):
-            sys.stderr.write('%(sym)s is punct and digit\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_punct(code_point) and is_xdigit(code_point)):
-            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (is_punct(code_point) and code_point == 0x0020):
-            sys.stderr.write('%(sym)s is punct\n' %{
-                'sym': ucs_symbol(code_point)})
-        # graph restriction: “No character specified for the keyword cntrl
-        # shall be specified.”  Already checked above.
-
-        # print restriction: “No character specified for the keyword cntrl
-        # shall be specified.”  Already checked above.
-
-        # graph - print relation: differ only in the <space> character.
-        # How is this possible if there are more than one space character?!
-        # I think susv2/xbd/locale.html should speak of “space characters”,
-        # not “space character”.
-        if (is_print(code_point)
-            and not (is_graph(code_point) or is_space(code_point))):
-            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
-                'sym': ucs_symbol(code_point)})
-        if (not is_print(code_point)
-            and (is_graph(code_point) or code_point == 0x0020)):
-            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
-                'sym': ucs_symbol(code_point)})
-
 def read_input_file(filename):
     '''Reads the original glibc i18n file to get the original head
     and tail.
@@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version):
                     + 'program.\n\n')
     i18n_file.write('% The "upper" class reflects the uppercase '
                     + 'characters of class "alpha"\n')
-    output_charclass(i18n_file, 'upper', is_upper)
+    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
     i18n_file.write('% The "lower" class reflects the lowercase '
                     + 'characters of class "alpha"\n')
-    output_charclass(i18n_file, 'lower', is_lower)
+    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
     i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                     + 'reflecting\n')
     i18n_file.write('% the recommendations in TR 10176 annex A\n')
-    output_charclass(i18n_file, 'alpha', is_alpha)
+    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
     i18n_file.write('% The "digit" class must only contain the '
                     + 'BASIC LATIN digits, says ISO C 99\n')
     i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
-    output_charclass(i18n_file, 'digit', is_digit)
+    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
     i18n_file.write('% The "outdigit" information is by default '
                     + '"0" to "9".  We don\'t have to\n')
     i18n_file.write('% provide it here since localedef will fill '
@@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version):
     i18n_file.write('% outdigit /\n')
     i18n_file.write('%    <U0030>..<U0039>\n\n')
     # output_charclass(i18n_file, 'outdigit', is_outdigit)
-    output_charclass(i18n_file, 'space', is_space)
-    output_charclass(i18n_file, 'cntrl', is_cntrl)
-    output_charclass(i18n_file, 'punct', is_punct)
-    output_charclass(i18n_file, 'graph', is_graph)
-    output_charclass(i18n_file, 'print', is_print)
+    output_charclass(i18n_file, 'space', unicode_utils.is_space)
+    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
+    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
+    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
+    output_charclass(i18n_file, 'print', unicode_utils.is_print)
     i18n_file.write('% The "xdigit" class must only contain the '
                     + 'BASIC LATIN digits and A-F, a-f,\n')
     i18n_file.write('% says ISO C 99 '
                     + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
-    output_charclass(i18n_file, 'xdigit', is_xdigit)
-    output_charclass(i18n_file, 'blank', is_blank)
-    output_charmap(i18n_file, 'toupper', to_upper)
-    output_charmap(i18n_file, 'tolower', to_lower)
-    output_charmap(i18n_file, 'map "totitle";', to_title)
+    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
+    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
+    output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
+    output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
+    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
     i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                     + 'annex B.1\n')
     i18n_file.write('% That is, all combining characters (level 2+3).\n')
-    output_charclass(i18n_file, 'class "combining";', is_combining)
+    output_charclass(i18n_file, 'class "combining";',
+                     unicode_utils.is_combining)
     i18n_file.write('% The "combining_level3" class reflects '
                     + 'ISO/IEC 10646-1 annex B.2\n')
     i18n_file.write('% That is, combining characters of level 3.\n')
-    output_charclass(i18n_file,
-                     'class "combining_level3";', is_combining_level3)
+    output_charclass(i18n_file, 'class "combining_level3";',
+                     unicode_utils.is_combining_level3)
 
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
@@ -739,9 +300,11 @@ if __name__ == "__main__":
         help='The Unicode version of the input files used.')
     ARGS = PARSER.parse_args()
 
-    fill_attributes(ARGS.unicode_data_file)
-    fill_derived_core_properties(ARGS.derived_core_properties_file)
-    verifications()
+    unicode_utils.fill_attributes(
+        ARGS.unicode_data_file)
+    unicode_utils.fill_derived_core_properties(
+        ARGS.derived_core_properties_file)
+    unicode_utils.verifications()
     HEAD = TAIL = ''
     if ARGS.input_file:
         (HEAD, TAIL) = read_input_file(ARGS.input_file)
diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py
new file mode 100644
index 0000000000..ee91582823
--- /dev/null
+++ b/localedata/unicode-gen/unicode_utils.py
@@ -0,0 +1,502 @@
+# Utilities to generate Unicode data for glibc from upstream Unicode data.
+#
+# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+This module contains utilities used by the scripts to generate
+Unicode data for glibc from upstream Unicode data files.
+'''
+
+import sys
+import re
+
+# Dictionary mapping code points to their attributes from UnicodeData.txt
+#
+# Contents of this dictionary look like this:
+#
+# {0: {'category': 'Cc',
+#      'title': None,
+#      'digit': '',
+#      'name': '<control>',
+#      'bidi': 'BN',
+#      'combining': '0',
+#      'comment': '',
+#      'oldname': 'NULL',
+#      'decomposition': '',
+#      'upper': None,
+#      'mirrored': 'N',
+#      'lower': None,
+#      'decdigit': '',
+#      'numeric': ''},
+#      …
+# }
+UNICODE_ATTRIBUTES = {}
+
+# Dictionary mapping code points to their DerivedCoreProperties.txt properties
+#
+# Contents of this dictionary look like this:
+#
+# {917504: ['Default_Ignorable_Code_Point'],
+#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
+#  …
+# }
+DERIVED_CORE_PROPERTIES = {}
+
+# Dictionary mapping code points to their EastAsianWidth.txt width property
+#
+# Contents of this dictionary look like this:
+#
+# {0: 'N', … , 45430: 'W', …}
+EAST_ASIAN_WIDTHS = {}
+
+def fill_attribute(code_point, fields):
+    '''Records one UnicodeData.txt line in UNICODE_ATTRIBUTES.
+
+    The semicolon separated fields of that line are stored in a
+    dictionary under UNICODE_ATTRIBUTES[code_point]; fields[0], the
+    code point itself, is not stored again.
+
+    '''
+    # Fields 1 to 11 are kept verbatim as strings.  In file order they
+    # are: character name, general category, canonical combining class,
+    # bidirectional category, decomposition mapping, decimal digit value,
+    # digit value, numeric value, mirrored, old Unicode 1.0 name, comment.
+    string_keys = [
+        'name', 'category', 'combining', 'bidi', 'decomposition',
+        'decdigit', 'digit', 'numeric', 'mirrored', 'oldname', 'comment']
+    attributes = {}
+    for index, key in enumerate(string_keys, start=1):
+        attributes[key] = fields[index]
+    # Fields 12 to 14 are the uppercase, lowercase and titlecase
+    # mappings: a hexadecimal code point which is converted to an
+    # integer, or an empty string which is stored as None.
+    mapping_keys = ['upper', 'lower', 'title']
+    for index, key in enumerate(mapping_keys, start=12):
+        hex_value = fields[index]
+        attributes[key] = int(hex_value, 16) if hex_value else None
+    UNICODE_ATTRIBUTES[code_point] = attributes
+
+def fill_attributes(filename):
+    '''Stores the contents of the UnicodeData.txt file in the
+    UNICODE_ATTRIBUTES dictionary, skipping surrogates (category Cs).
+
+    Exits with status 1 on a line without exactly 15 fields or on a
+    broken First/Last pair.  A single code point line looks like this:
+
+    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+
+    A pair like this gives every code point of the range the same
+    attributes, with the name shortened to 'CJK Ideograph':
+    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+    '''
+    with open(filename, mode='r') as unicode_data_file:
+        fields_start = []
+        for line in unicode_data_file:
+            fields = line.strip().split(';')
+            if len(fields) != 15:
+                sys.stderr.write(
+                    'short line in file "%(f)s": %(l)s\n' %{
+                    'f': filename, 'l': line})
+                sys.exit(1)
+            if fields[2] == 'Cs':
+                # Surrogates are UTF-16 artefacts,
+                # not real characters. Ignore them.
+                fields_start = []
+                continue
+            if fields[1].endswith(', First>'):
+                fields_start = fields
+                fields_start[1] = fields_start[1].split(',')[0][1:]
+                continue
+            if fields[1].endswith(', Last>'):
+                fields[1] = fields[1].split(',')[0][1:]
+                if fields[1:] != fields_start[1:]:
+                    sys.stderr.write(
+                        'broken code point range in file "%(f)s": %(l)s\n' %{
+                            'f': filename, 'l': line})
+                    sys.exit(1)
+                for code_point in range(
+                        int(fields_start[0], 16),
+                        int(fields[0], 16)+1):
+                    fill_attribute(code_point, fields)
+                fields_start = []
+                continue
+            fill_attribute(int(fields[0], 16), fields)
+            fields_start = []
+
+def fill_derived_core_properties(filename):
+    '''Reads DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.
+
+    Each code point is mapped to the list of the names of its
+    properties, in the order in which they appear in the file.
+    A line of the file covers either a code point range like this:
+
+    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+
+    or a single code point like this:
+
+    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
+
+    Lines of any other form, such as comments, are skipped.
+    '''
+    # One or two hexadecimal code points, a semicolon, the property name.
+    line_pattern = re.compile(
+        r'^(?P<codepoint1>[0-9A-F]{4,6})'
+        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
+    with open(filename, mode='r') as derived_core_properties_file:
+        for line in derived_core_properties_file:
+            parsed = line_pattern.match(line)
+            if parsed is None:
+                continue
+            first = int(parsed.group('codepoint1'), 16)
+            # A single code point is a range which ends where it starts.
+            last_hex = parsed.group('codepoint2')
+            last = int(last_hex, 16) if last_hex else first
+            name = parsed.group('property')
+            for code_point in range(first, last + 1):
+                DERIVED_CORE_PROPERTIES.setdefault(
+                    code_point, []).append(name)
+
+def fill_east_asian_widths(filename):
+    '''Reads the EastAsianWidth.txt file into EAST_ASIAN_WIDTHS.
+
+    Each code point is mapped to its width property, e.g. 'W'.
+    A line of the file covers either a code point range like this:
+
+    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
+
+    or a single code point like this:
+
+    A015;W           # Lm         YI SYLLABLE WU
+    '''
+    line_pattern = re.compile(
+        r'^(?P<codepoint1>[0-9A-F]{4,6})'
+        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
+    with open(filename, mode='r') as east_asian_widths_file:
+        for line in east_asian_widths_file:
+            parsed = line_pattern.match(line)
+            if parsed is None:
+                continue
+            first = int(parsed.group('codepoint1'), 16)
+            # A single code point is a range which ends where it starts.
+            last_hex = parsed.group('codepoint2')
+            last = int(last_hex, 16) if last_hex else first
+            width = parsed.group('property')
+            for code_point in range(first, last + 1):
+                EAST_ASIAN_WIDTHS[code_point] = width
+
+def to_upper(code_point):
+    '''Returns the simple uppercase mapping of the given code point,
+    or the code point itself when it has none'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # An entry with an empty name is treated as having no mapping.
+    if not attributes['name'] or not attributes['upper']:
+        return code_point
+    return attributes['upper']
+
+def to_lower(code_point):
+    '''Returns the simple lowercase mapping of the given code point,
+    or the code point itself when it has none'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # An entry with an empty name is treated as having no mapping.
+    if not attributes['name'] or not attributes['lower']:
+        return code_point
+    return attributes['lower']
+
+def to_title(code_point):
+    '''Returns the simple titlecase mapping of the given code point,
+    or the code point itself when it has none'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # An entry with an empty name is treated as having no mapping.
+    if not attributes['name'] or not attributes['title']:
+        return code_point
+    return attributes['title']
+
+def is_upper(code_point):
+    '''Checks whether the code point has a lowercase mapping or has
+    the "Uppercase" derived core property'''
+    return (to_lower(code_point) != code_point
+            or 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ()))
+
+def is_lower(code_point):
+    '''Checks whether the character with this code point is lowercase'''
+    # A character with an uppercase mapping is lowercase.
+    if to_upper(code_point) != code_point:
+        return True
+    # <U00DF> is lowercase, but without simple to_upper mapping.
+    if code_point == 0x00DF:
+        return True
+    # Others, for example ꜰ “LATIN LETTER SMALL CAPITAL F”, are defined as
+    # “Lowercase” in DerivedCoreProperties.txt without such a mapping.
+    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
+def is_alpha(code_point):
+    '''Checks whether the character with this code point is alphabetic'''
+    # Everything with the “Alphabetic” derived core property is alphabetic.
+    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
+        return True
+    # Consider all the non-ASCII digits as alphabetic.
+    # ISO C 99 forbids us to have them in category “digit”,
+    # but we want iswalnum to return true on them.
+    decimal_digit = UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
+    ascii_digit = 0x0030 <= code_point <= 0x0039
+    return decimal_digit and not ascii_digit
+
+def is_digit(code_point):
+    '''Checks whether the character with this code point is a digit'''
+    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.5:
+    #    The iswdigit function tests for any wide character that
+    #    corresponds to a decimal-digit character (as defined in 5.2.1).
+    # 5.2.1:
+    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+    #
+    # That is why the named characters of category Nd are not used as
+    # the digits here.  If they ever are, note that U+0BE7..U+0BEF and
+    # U+1369..U+1371 are digit systems without a zero: <0> must then be
+    # added in front of them by hand.
+    is_ascii_digit = 0x0030 <= code_point <= 0x0039
+    return is_ascii_digit
+
+def is_outdigit(code_point):
+    '''Checks whether the code point is one of the ASCII digits 0 to 9'''
+    return 0x0030 <= code_point <= 0x0039
+
+def is_blank(code_point):
+    '''Checks whether the character with this code point is blank'''
+    if code_point == 0x0009: # '\t'
+        return True
+    # Category Zs without mention of '<noBreak>'
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name'] and attributes['category'] == 'Zs'
+            and '<noBreak>' not in attributes['decomposition'])
+
+def is_space(code_point):
+    '''Checks whether the character with this code point is a space'''
+    # The ASCII white space characters are spaces unconditionally.
+    if code_point in (0x0020, # ' '
+                      0x000C, # '\f'
+                      0x000A, # '\n'
+                      0x000D, # '\r'
+                      0x0009, # '\t'
+                      0x000B): # '\v'
+        return True
+    # Otherwise only the categories Zl, Zp, and Zs without mention of
+    # "<noBreak>" are spaces.  Don’t make U+00A0 a space. Non-breaking
+    # space means that all programs should treat it like a punctuation
+    # character, not like a space.
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name']
+            and (attributes['category'] in ['Zl', 'Zp']
+                 or (attributes['category'] == 'Zs'
+                     and '<noBreak>' not in attributes['decomposition'])))
+
+def is_cntrl(code_point):
+    '''Checks whether this code point is a control character'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # Named <control>, or a line or paragraph separator (Zl, Zp).
+    return (attributes['name']
+            and (attributes['name'] == '<control>'
+                 or attributes['category'] in ['Zl', 'Zp']))
+
+def is_xdigit(code_point):
+    '''Checks whether the character with this code point is
+    a hexadecimal digit'''
+    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.12:
+    #    The iswxdigit function tests for any wide character that
+    #    corresponds to a hexadecimal-digit character (as defined
+    #    in 6.4.4.1).
+    # 6.4.4.1:
+    #    hexadecimal-digit: one of
+    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+    #
+    # Hence the ASCII ranges are spelled out here rather than being
+    # built on top of is_digit().
+    hexadecimal_ranges = (
+        (0x0030, 0x0039), # 0 .. 9
+        (0x0041, 0x0046), # A .. F
+        (0x0061, 0x0066)) # a .. f
+    return any(low <= code_point <= high for low, high in hexadecimal_ranges)
+
+def is_graph(code_point):
+    '''Checks whether the character with this code point is graphical,
+    i.e. it is named, is not <control> and is not a space'''
+    name = UNICODE_ATTRIBUTES[code_point]['name']
+    return (name and name != '<control>'
+            and not is_space(code_point))
+
+def is_print(code_point):
+    '''Checks whether the character with this code point is printable'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name'] and attributes['name'] != '<control>'
+            and attributes['category'] not in ['Zl', 'Zp'])
+
+def is_punct(code_point):
+    '''Checks whether the character with this code point is punctuation'''
+    # The traditional POSIX definition of punctuation is every graphic,
+    # non-alphanumeric character.  That is why the general categories
+    # starting with “P” are not used to define punctuation here.
+    graphic = is_graph(code_point)
+    if not graphic:
+        # Hand back the false value of is_graph() unchanged.
+        return graphic
+    alphanumeric = is_alpha(code_point) or is_digit(code_point)
+    return not alphanumeric
+
+def is_combining(code_point):
+    '''Checks whether the character with this code point is
+    a combining character'''
+    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+    # file. In 3.0.1 it was identical to the union of the general categories
+    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
+    # PropList.txt file, so we take the latter definition.
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name']
+            and attributes['category'] in ['Mn', 'Mc', 'Me'])
+
+def is_combining_level3(code_point):
+    '''Checks whether the character with this code point is a combining
+    character whose canonical combining class is in the range 0 to 199'''
+    # The combining class is only converted for combining characters.
+    return (is_combining(code_point)
+            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
+
+def ucs_symbol(code_point):
+    '''Return the UCS symbol string for a Unicode character.'''
+    # Code points below 0x10000 are written with four hexadecimal
+    # digits, all the others with eight.
+    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
+    return template.format(code_point)
+
+def ucs_symbol_range(code_point_low, code_point_high):
+    '''Returns the UCS symbol string for a code point range,
+    for example:
+
+    <U0041>..<U005A>
+    '''
+    bounds = (code_point_low, code_point_high)
+    return '..'.join(ucs_symbol(bound) for bound in bounds)
+
+def verifications():
+    '''Tests whether the is_* functions observe the known restrictions
+    and reports every violation found on stderr'''
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        # toupper restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified."
+        if (to_upper(code_point) != code_point
+            and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
+                    'sym': ucs_symbol(code_point),
+                    'c': code_point,
+                    'uc': to_upper(code_point)})
+        # tolower restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified."
+        if (to_lower(code_point) != code_point
+            and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
+                    'sym': ucs_symbol(code_point),
+                    'c': code_point,
+                    'uc': to_lower(code_point)})
+        # alpha restriction: "Characters classified as either upper or lower
+        # shall automatically belong to this class."
+        if ((is_lower(code_point) or is_upper(code_point))
+             and not is_alpha(code_point)):
+            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
+                'sym': ucs_symbol(code_point)})
+        # alpha restriction: “No character specified for the keywords cntrl,
+        # digit, punct or space shall be specified.”
+        if (is_alpha(code_point) and is_cntrl(code_point)):
+            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is alpha and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is alpha and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_space(code_point)):
+            sys.stderr.write('%(sym)s is alpha and space\n' %{
+                'sym': ucs_symbol(code_point)})
+        # space restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, graph or xdigit shall be specified.”
+        # upper, lower, alpha already checked above.
+        if (is_space(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is space and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is space and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is space and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # cntrl restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, punct, graph, print or xdigit shall be
+        # specified.”  upper, lower, alpha already checked above.
+        if (is_cntrl(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_print(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and print\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # punct restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+        # be specified.”  upper, lower, alpha, cntrl already checked above.
+        if (is_punct(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is punct and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and code_point == 0x0020):
+            sys.stderr.write('%(sym)s is punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        # graph restriction: “No character specified for the keyword cntrl
+        # shall be specified.”  Already checked above.
+        # print restriction: “No character specified for the keyword cntrl
+        # shall be specified.”  Already checked above.
+
+        # graph - print relation: differ only in the <space> character.
+        # How is this possible if there are more than one space character?!
+        # I think susv2/xbd/locale.html should speak of “space characters”,
+        # not “space character”.
+        if (is_print(code_point)
+            and not (is_graph(code_point) or is_space(code_point))):
+            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (not is_print(code_point)
+            and (is_graph(code_point) or code_point == 0x0020)):
+            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
+                'sym': ucs_symbol(code_point)})
diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py
index b84a1eb3de..3b7a94ccc9 100755
--- a/localedata/unicode-gen/utf8_compatibility.py
+++ b/localedata/unicode-gen/utf8_compatibility.py
@@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option:
 import sys
 import re
 import argparse
-
-# Dictionary holding the entire contents of the UnicodeData.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: {'category': 'Cc',
-#      'title': None,
-#      'digit': '',
-#      'name': '<control>',
-#      'bidi': 'BN',
-#      'combining': '0',
-#      'comment': '',
-#      'oldname': 'NULL',
-#      'decomposition': '',
-#      'upper': None,
-#      'mirrored': 'N',
-#      'lower': None,
-#      'decdigit': '',
-#      'numeric': ''},
-#      …
-# }
-UNICODE_ATTRIBUTES = {}
-
-# Dictionary holding the entire contents of the EastAsianWidths.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: 'N', … , 45430: 'W', …}
-EAST_ASIAN_WIDTHS = {}
-
-def fill_attribute(code_point, fields):
-    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
-
-    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
-    in the UnicodeData.txt file.
-
-    '''
-    UNICODE_ATTRIBUTES[code_point] =  {
-        'name': fields[1],          # Character name
-        'category': fields[2],      # General category
-        'combining': fields[3],     # Canonical combining classes
-        'bidi': fields[4],          # Bidirectional category
-        'decomposition': fields[5], # Character decomposition mapping
-        'decdigit': fields[6],      # Decimal digit value
-        'digit': fields[7],         # Digit value
-        'numeric': fields[8],       # Numeric value
-        'mirrored': fields[9],      # mirrored
-        'oldname': fields[10],      # Old Unicode 1.0 name
-        'comment': fields[11],      # comment
-        # Uppercase mapping
-        'upper': int(fields[12], 16) if fields[12] else None,
-        # Lowercase mapping
-        'lower': int(fields[13], 16) if fields[13] else None,
-        # Titlecase mapping
-        'title': int(fields[14], 16) if fields[14] else None,
-    }
-
-def fill_attributes(filename):
-    '''Stores the entire contents of the UnicodeData.txt file
-    in the UNICODE_ATTRIBUTES dictionary.
-
-    A typical line for a single code point in UnicodeData.txt looks
-    like this:
-
-    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
-
-    Code point ranges are indicated by pairs of lines like this:
-
-    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
-    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
-    '''
-    with open(filename, mode='r') as unicode_data_file:
-        fields_start = []
-        for line in unicode_data_file:
-            fields = line.strip().split(';')
-            if len(fields) != 15:
-                sys.stderr.write(
-                    'short line in file "%(f)s": %(l)s\n' %{
-                    'f': filename, 'l': line})
-                exit(1)
-            if fields[2] == 'Cs':
-                # Surrogates are UTF-16 artefacts,
-                # not real characters. Ignore them.
-                fields_start = []
-                continue
-            if fields[1].endswith(', First>'):
-                fields_start = fields
-                fields_start[1] = fields_start[1].split(',')[0][1:]
-                continue
-            if fields[1].endswith(', Last>'):
-                fields[1] = fields[1].split(',')[0][1:]
-                if fields[1:] != fields_start[1:]:
-                    sys.stderr.write(
-                        'broken code point range in file "%(f)s": %(l)s\n' %{
-                            'f': filename, 'l': line})
-                    exit(1)
-                for code_point in range(
-                        int(fields_start[0], 16),
-                        int(fields[0], 16)+1):
-                    fill_attribute(code_point, fields)
-                fields_start = []
-                continue
-            fill_attribute(int(fields[0], 16), fields)
-            fields_start = []
-
-def fill_east_asian_widths(filename):
-    '''Stores the entire contents of the EastAsianWidths.txt file
-    in the EAST_ASIAN_WIDTHS dictionary.
-
-    Lines in EastAsianWidths.txt are either a code point range like
-    this:
-
-    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
-
-    or a single code point like this:
-
-    A015;W           # Lm         YI SYLLABLE WU
-    '''
-    with open(filename, mode='r') as east_asian_widths_file:
-        for line in east_asian_widths_file:
-            match = re.match(
-                r'^(?P<codepoint1>[0-9A-F]{4,6})'
-                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
-                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
-                line)
-            if not match:
-                continue
-            start = match.group('codepoint1')
-            end = match.group('codepoint2')
-            if not end:
-                end = start
-            for code_point in range(int(start, 16), int(end, 16)+1):
-                EAST_ASIAN_WIDTHS[code_point] = match.group('property')
-
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
+import unicode_utils
 
 def create_charmap_dictionary(file_name):
     '''Create a dictionary for all code points found in the CHARMAP
@@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_missing_characters:
         for key in sorted(set(ocharmap)-set(ncharmap)):
             print('removed: {:s}     {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ocharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_charmap = {}
     for key in set(ocharmap).intersection(set(ncharmap)):
@@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_changed_characters:
         for key in sorted(changed_charmap):
             print('changed: {:s}     {:s}->{:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 changed_charmap[key][0],
                 changed_charmap[key][1],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated CHARMAP: %d'
           %len(set(ncharmap)-set(ocharmap)))
     if ARGS.show_added_characters:
         for key in sorted(set(ncharmap)-set(ocharmap)):
             print('added: {:s}     {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ncharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 def create_width_dictionary(file_name):
     '''Create a dictionary for all code points found in the WIDTH
@@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these have width 1 now.)')
     if ARGS.show_missing_characters:
         for key in sorted(set(owidth)-set(nwidth)):
-            print('removed: {:s} '.format(ucs_symbol(key))
+            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(owidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_width = {}
     for key in set(owidth).intersection(set(nwidth)):
@@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name):
           %len(changed_width))
     if ARGS.show_changed_characters:
         for key in sorted(changed_width):
-            print('changed width: {:s} '.format(ucs_symbol(key))
+            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated WIDTH: %d'
           %len(set(nwidth)-set(owidth)))
@@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these had width 1 before.)')
     if ARGS.show_added_characters:
         for key in sorted(set(nwidth)-set(owidth)):
-            print('added: {:s} '.format(ucs_symbol(key))
+            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(nwidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
@@ -392,8 +253,8 @@ if __name__ == "__main__":
     ARGS = PARSER.parse_args()
 
     if ARGS.unicode_data_file:
-        fill_attributes(ARGS.unicode_data_file)
+        unicode_utils.fill_attributes(ARGS.unicode_data_file)
     if ARGS.east_asian_width_file:
-        fill_east_asian_widths(ARGS.east_asian_width_file)
+        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index f1b88f5b29..bc84c07617 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -29,6 +29,7 @@ It will output UTF-8 file
 
 import sys
 import re
+import unicode_utils
 
 # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 # sections 3.11 and 4.4.
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = (
     'P', 'H'
 )
 
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
-
 def process_range(start, end, outfile, name):
     '''Writes a range of code points into the CHARMAP section of the
     output file
@@ -78,7 +72,7 @@ def process_range(start, end, outfile, name):
                                    + JAMO_MEDIAL_SHORT_NAME[index2] \
                                    + JAMO_FINAL_SHORT_NAME[index3]
             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-                ucs_symbol(i), convert_to_hex(i),
+                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                 hangul_syllable_name))
         return
     # UnicodeData.txt file has contains code point ranges like this:
@@ -95,14 +89,14 @@ def process_range(start, end, outfile, name):
     for i in range(int(start, 16), int(end, 16), 64 ):
         if i > (int(end, 16)-64):
             outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-                    ucs_symbol(i),
-                    ucs_symbol(int(end,16)),
+                    unicode_utils.ucs_symbol(i),
+                    unicode_utils.ucs_symbol(int(end,16)),
                     convert_to_hex(i),
                     name))
             break
         outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-                ucs_symbol(i),
-                ucs_symbol(i+63),
+                unicode_utils.ucs_symbol(i),
+                unicode_utils.ucs_symbol(i+63),
                 convert_to_hex(i),
                 name))
 
@@ -168,7 +162,7 @@ def process_charmap(flines, outfile):
             # comments, so we keep these comment lines.
             outfile.write('%')
         outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-                ucs_symbol(int(fields[0], 16)),
+                unicode_utils.ucs_symbol(int(fields[0], 16)),
                 convert_to_hex(int(fields[0], 16)),
                 fields[1]))
 
@@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines):
     for line in ulines:
         fields = line.split(";")
         if fields[4] == "NSM" or fields[2] == "Cf":
-            width_dict[int(fields[0], 16)] = ucs_symbol(
+            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                 int(fields[0], 16)) + '\t0'
 
     for line in elines:
@@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines):
         # UnicodeData.txt:
         fields = line.split(";")
         if not '..' in fields[0]:
-            width_dict[int(fields[0], 16)] = ucs_symbol(
+            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                 int(fields[0], 16)) + '\t2'
         else:
             code_points = fields[0].split("..")
@@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines):
                 if  key in width_dict:
                     del width_dict[key]
             width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
-                ucs_symbol(int(code_points[0], 16)),
-                ucs_symbol(int(code_points[1], 16)))
+                unicode_utils.ucs_symbol(int(code_points[0], 16)),
+                unicode_utils.ucs_symbol(int(code_points[1], 16)))
 
     for key in sorted(width_dict):
         outfile.write(width_dict[key]+'\n')
author	Carlos O'Donell <carlos@systemhalted.org>	2015-12-09 22:27:41 -0500
committer	Carlos O'Donell <carlos@systemhalted.org>	2015-12-09 22:52:13 -0500
commit	dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch)
tree	a2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen
parent	40b59cace2fd5e5aa04367073a54efc995059376 (diff)
download	glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.gz glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.xz glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.zip