Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-x | localedata/unicode-gen/utf8_gen.py | 280
1 file changed, 0 insertions, 280 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
deleted file mode 100755
index ab03e750a6..0000000000
--- a/localedata/unicode-gen/utf8_gen.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-# Copyright (C) 2014-2017 Free Software Foundation, Inc.
-# This file is part of the GNU C Library.
-#
-# The GNU C Library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# The GNU C Library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with the GNU C Library; if not, see
-# <http://www.gnu.org/licenses/>.
-
-'''glibc/localedata/charmaps/UTF-8 file generator script
-
-This script generates a glibc/localedata/charmaps/UTF-8 file
-from Unicode data.
-
-Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
-
-It will output the UTF-8 file.
-'''
-
-import sys
-import re
-import unicode_utils
-
-# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
-# sections 3.11 and 4.4.
-
-JAMO_INITIAL_SHORT_NAME = (
-    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
-    'C', 'K', 'T', 'P', 'H'
-)
-
-JAMO_MEDIAL_SHORT_NAME = (
-    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
-    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
-)
-
-JAMO_FINAL_SHORT_NAME = (
-    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
-    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
-    'P', 'H'
-)
-
-def process_range(start, end, outfile, name):
-    '''Writes a range of code points into the CHARMAP section of the
-    output file.
-
-    '''
-    if 'Hangul Syllable' in name:
-        # from glibc/localedata/ChangeLog:
-        #
-        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
-        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
-        #  ranges, so they become printable and carry a width. Comment
-        #  out surrogate ranges. Add a WIDTH table.
-        #
-        # So we expand the Hangul Syllables here:
-        for i in range(int(start, 16), int(end, 16) + 1):
-            index2, index3 = divmod(i - 0xAC00, 28)
-            index1, index2 = divmod(index2, 21)
-            hangul_syllable_name = 'HANGUL SYLLABLE ' \
-                + JAMO_INITIAL_SHORT_NAME[index1] \
-                + JAMO_MEDIAL_SHORT_NAME[index2] \
-                + JAMO_FINAL_SHORT_NAME[index3]
-            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-                unicode_utils.ucs_symbol(i), convert_to_hex(i),
-                hangul_syllable_name))
-        return
-    # The UnicodeData.txt file contains code point ranges like this:
-    #
-    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
-    #
-    # The glibc UTF-8 file splits ranges like these into shorter
-    # ranges of 64 code points each:
-    #
-    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
-    # …
-    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
-    for i in range(int(start, 16), int(end, 16), 64):
-        if i > (int(end, 16) - 64):
-            # Last, possibly shorter, chunk of the range:
-            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-                unicode_utils.ucs_symbol(i),
-                unicode_utils.ucs_symbol(int(end, 16)),
-                convert_to_hex(i),
-                name))
-            break
-        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-            unicode_utils.ucs_symbol(i),
-            unicode_utils.ucs_symbol(i + 63),
-            convert_to_hex(i),
-            name))
-
-def process_charmap(flines, outfile):
-    '''This function takes an array which contains *all* lines of
-    UnicodeData.txt and writes lines to outfile as used in the
-
-    CHARMAP
-    …
-    END CHARMAP
-
-    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
-
-    Samples for input lines:
-
-    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
-    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
-    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
-    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
-    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
-    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
-
-    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
-
-    <U0010> /x10 DATA LINK ESCAPE
-    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
-    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
-    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
-    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
-
-    '''
-    fields_start = []
-    for line in flines:
-        fields = line.split(";")
-        # Some characters have “<control>” as their name. We try to
-        # use the “Unicode 1.0 Name” (field 10, counting from 0, in
-        # UnicodeData.txt) for them.
-        #
-        # The characters U+0080, U+0081, U+0084 and U+0099 have
-        # “<control>” as their name but do not even have a
-        # “Unicode 1.0 Name”. We could write code to take their
-        # alternate names from NameAliases.txt.
-        if fields[1] == "<control>" and fields[10]:
-            fields[1] = fields[10]
-        # Handling code point ranges like:
-        #
-        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
-        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
-            fields_start = fields
-            continue
-        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
-            process_range(fields_start[0], fields[0],
-                          outfile, fields[1][:-7]+'>')
-            fields_start = []
-            continue
-        fields_start = []
-        if 'Surrogate,' in fields[1]:
-            # Comment out the surrogates in the UTF-8 file.
-            # One could of course skip them completely, but
-            # the original UTF-8 file in glibc had them as
-            # comments, so we keep these comment lines.
-            outfile.write('%')
-        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-            unicode_utils.ucs_symbol(int(fields[0], 16)),
-            convert_to_hex(int(fields[0], 16)),
-            fields[1]))
-
-def convert_to_hex(code_point):
-    '''Converts a code point to a hexadecimal UTF-8 representation
-    like /x**/x**/x**.'''
-    # Get the UTF-8 encoding of the Unicode character.
-    # In Python 3, .encode('UTF-8') does not work for
-    # surrogates. Therefore, we use this conversion table:
-    surrogates = {
-        0xD800: '/xed/xa0/x80',
-        0xDB7F: '/xed/xad/xbf',
-        0xDB80: '/xed/xae/x80',
-        0xDBFF: '/xed/xaf/xbf',
-        0xDC00: '/xed/xb0/x80',
-        0xDFFF: '/xed/xbf/xbf',
-    }
-    if code_point in surrogates:
-        return surrogates[code_point]
-    return ''.join([
-        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
-    ])
-
-def write_header_charmap(outfile):
-    '''Writes the header on top of the CHARMAP section to the output file.'''
-    outfile.write("<code_set_name> UTF-8\n")
-    outfile.write("<comment_char> %\n")
-    outfile.write("<escape_char> /\n")
-    outfile.write("<mb_cur_min> 1\n")
-    outfile.write("<mb_cur_max> 6\n\n")
-    outfile.write("% CHARMAP generated using utf8_gen.py\n")
-    outfile.write("% alias ISO-10646/UTF-8\n")
-    outfile.write("CHARMAP\n")
-
-def write_header_width(outfile):
-    '''Writes the header on top of the WIDTH section to the output file.'''
-    outfile.write('% Character width according to Unicode 7.0.0.\n')
-    outfile.write('% - Default width is 1.\n')
-    outfile.write('% - Double-width characters have width 2; generated from\n')
-    outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
-    outfile.write('% - Non-spacing characters have width 0; '
-                  + 'generated from PropList.txt or\n')
-    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
-                  + 'UnicodeData.txt"\n')
-    outfile.write('% - Format control characters have width 0; '
-                  + 'generated from\n')
-    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
-    # Not needed, already covered by Cf:
-    # outfile.write("% - Zero width characters have width 0; generated from\n")
-    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
-    outfile.write("WIDTH\n")
-
-def process_width(outfile, ulines, elines):
-    '''ulines are lines from UnicodeData.txt, elines are lines from
-    EastAsianWidth.txt
-
-    '''
-    width_dict = {}
-    for line in ulines:
-        fields = line.split(";")
-        if fields[4] == "NSM" or fields[2] == "Cf":
-            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
-                int(fields[0], 16)) + '\t0'
-
-    for line in elines:
-        # If an entry in EastAsianWidth.txt is found, it overrides
-        # entries in UnicodeData.txt:
-        fields = line.split(";")
-        if '..' not in fields[0]:
-            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
-                int(fields[0], 16)) + '\t2'
-        else:
-            code_points = fields[0].split("..")
-            for key in range(int(code_points[0], 16),
-                             int(code_points[1], 16) + 1):
-                if key in width_dict:
-                    del width_dict[key]
-            width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
-                unicode_utils.ucs_symbol(int(code_points[0], 16)),
-                unicode_utils.ucs_symbol(int(code_points[1], 16)))
-
-    for key in sorted(width_dict):
-        outfile.write(width_dict[key] + '\n')
-
-if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
-    else:
-        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
-            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
-        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
-            EAST_ASIAN_WIDTH_LINES = []
-            for LINE in EAST_ASIAN_WIDTH_FILE:
-                # If characters from EastAsianWidth.txt which are from
-                # reserved ranges (i.e. not yet assigned code points)
-                # are added to the WIDTH section of the UTF-8 file, then
-                # “make check” produces “Unknown Character” errors for
-                # these code points because such unassigned code points
-                # are not in the CHARMAP section of the UTF-8 file.
-                #
-                # Therefore, we skip all reserved code points when reading
-                # the EastAsianWidth.txt file.
-                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
-                    continue
-                if re.match(r'^[^;]*;[WF]', LINE):
-                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
-        with open('UTF-8', mode='w') as OUTFILE:
-            # Process UnicodeData.txt and write the CHARMAP section:
-            write_header_charmap(OUTFILE)
-            process_charmap(UNICODE_DATA_LINES, OUTFILE)
-            OUTFILE.write("END CHARMAP\n\n")
-            # Process EastAsianWidth.txt and write the WIDTH section:
-            write_header_width(OUTFILE)
-            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
-            OUTFILE.write("END WIDTH\n")
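
For reference, the Hangul expansion in process_range() works because the precomposed syllables U+AC00..U+D7A3 form a regular 19 x 21 x 28 grid of (initial, medial, final) jamo. A minimal standalone sketch of the same arithmetic; the tables are the ones from the deleted script, abbreviated to the illustrative names JAMO_L/JAMO_V/JAMO_T:

# Standalone sketch of the Hangul syllable naming in process_range().
JAMO_L = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
          'J', 'JJ', 'C', 'K', 'T', 'P', 'H')
JAMO_V = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE',
          'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I')
JAMO_T = ('', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB',
          'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C',
          'K', 'T', 'P', 'H')

def hangul_syllable_name(code_point):
    '''Name of a precomposed Hangul syllable, e.g. 'HANGUL SYLLABLE GA'.'''
    # Decompose the offset from U+AC00 into (initial, medial, final) indices:
    index2, index3 = divmod(code_point - 0xAC00, 28)
    index1, index2 = divmod(index2, 21)
    return ('HANGUL SYLLABLE '
            + JAMO_L[index1] + JAMO_V[index2] + JAMO_T[index3])

print(hangul_syllable_name(0xAC00))  # HANGUL SYLLABLE GA
print(hangul_syllable_name(0xD7A3))  # HANGUL SYLLABLE HIH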
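
The 64-code-point chunking that process_range() applies to ranges such as CJK Ideograph Extension A can likewise be sketched in isolation; min() here replaces the script's explicit end-of-range check but yields the same chunks:

# Sketch of the 64-code-point chunking for the range 3400..4DB5.
start, end = 0x3400, 0x4DB5
for i in range(start, end, 64):
    last = min(i + 63, end)  # the final chunk may be shorter than 64
    print('<U{:04X}>..<U{:04X}>'.format(i, last))
    if last == end:
        break
# Prints <U3400>..<U343F>, <U3440>..<U347F>, ..., <U4D80>..<U4DB5>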
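
The hard-coded surrogate table in convert_to_hex() exists because CPython's UTF-8 codec refuses to encode lone surrogate code points. A short demonstration; utf8_hex() is a hypothetical stand-in for the non-surrogate branch of convert_to_hex():

# Why a lookup table is needed for surrogates in Python 3.
def utf8_hex(code_point):
    '''/x-escaped UTF-8 bytes of one code point, e.g. /xe3/x90/x80.'''
    return ''.join('/x{:02x}'.format(byte)
                   for byte in chr(code_point).encode('UTF-8'))

print(utf8_hex(0x3400))   # /xe3/x90/x80
try:
    utf8_hex(0xD800)      # a lone high surrogate
except UnicodeEncodeError:
    print('surrogates are rejected; hence the hard-coded table')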
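
Finally, the precedence in process_width(): width-0 entries derived from UnicodeData.txt (bidi class NSM, category Cf) are overridden by width-2 entries from EastAsianWidth.txt, and a W/F range collapses into a single '...' entry. A toy sketch, with ucs_symbol() standing in (simplified to four hex digits) for unicode_utils.ucs_symbol():

# Sketch of the WIDTH precedence implemented by process_width().
def ucs_symbol(code_point):
    return '<U{:04X}>'.format(code_point)

width_dict = {}
# Step 1: non-spacing marks and format controls get width 0:
width_dict[0x0300] = ucs_symbol(0x0300) + '\t0'
# Step 2: an EastAsianWidth.txt range like '3400..4DB5;W' removes any
# overlapping single entries and becomes one ranged width-2 entry:
start, end = 0x3400, 0x4DB5
for key in range(start, end + 1):
    width_dict.pop(key, None)
width_dict[start] = '{:s}...{:s}\t2'.format(ucs_symbol(start),
                                            ucs_symbol(end))
for key in sorted(width_dict):
    print(width_dict[key])
# <U0300>   0
# <U3400>...<U4DB5>   2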