Unicode 7.0.0 update; added generator scripts.

for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators.
author: Alexandre Oliva <aoliva@redhat.com> 2015-02-20 20:14:59 -0200
committer: Alexandre Oliva <aoliva@redhat.com> 2015-02-20 20:14:59 -0200
commit: 4a4839c94a4c93ffc0d5b95c69a08b02a57007f2 (patch)
tree: d60950243872d9beb0993b12173e6bbf998d779d /localedata/unicode-gen/gen_unicode_ctype.py
parent: e4a399dc3dbb3228eb39af230ad11bc42a018c93 (diff)
download: glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.tar.gz
glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.tar.xz
glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.zip
1 files changed, 751 insertions, 0 deletions
diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py
new file mode 100755
index 0000000000..559af7957a
--- /dev/null
+++ b/localedata/unicode-gen/gen_unicode_ctype.py
@@ -0,0 +1,751 @@
+#!/usr/bin/python3
+#
+# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
+# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
+DerivedCoreProperties.txt files.
+
+To see how this script is used, call it with the “-h” option:
+
+    $ ./gen_unicode_ctype.py -h
+    … prints usage message …
+'''
+
+import argparse
+import sys
+import time
+import re
+
+# Dictionary holding the entire contents of the UnicodeData.txt file
+#
+# Contents of this dictionary look like this:
+#
+# {0: {'category': 'Cc',
+#      'title': None,
+#      'digit': '',
+#      'name': '<control>',
+#      'bidi': 'BN',
+#      'combining': '0',
+#      'comment': '',
+#      'oldname': 'NULL',
+#      'decomposition': '',
+#      'upper': None,
+#      'mirrored': 'N',
+#      'lower': None,
+#      'decdigit': '',
+#      'numeric': ''},
+#      …
+# }
+UNICODE_ATTRIBUTES = {}
+
+# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
+#
+# Contents of this dictionary look like this:
+#
+# {917504: ['Default_Ignorable_Code_Point'],
+#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
+#  …
+# }
+DERIVED_CORE_PROPERTIES = {}
+
+def fill_attribute(code_point, fields):
+    '''Records one UnicodeData.txt entry in UNICODE_ATTRIBUTES.
+
+    “fields” is a UnicodeData.txt line split at “;”.  The code point
+    column fields[0] is not used, the entry is stored under the
+    “code_point” argument instead, which allows the caller to expand
+    code point ranges.  Empty case mapping columns are stored as None.
+
+    '''
+    def hex_or_none(text):
+        '''Parses a hexadecimal number, an empty string gives None.'''
+        return int(text, 16) if text else None
+
+    # Columns 1 to 11 are kept verbatim as strings.
+    string_keys = (
+        'name', 'category', 'combining', 'bidi', 'decomposition',
+        'decdigit', 'digit', 'numeric', 'mirrored', 'oldname', 'comment')
+    attributes = {}
+    for index, key in enumerate(string_keys, start=1):
+        attributes[key] = fields[index]
+    # Columns 12 to 14 hold the simple uppercase, lowercase and
+    # titlecase mappings as hexadecimal code points.
+    attributes['upper'] = hex_or_none(fields[12])
+    attributes['lower'] = hex_or_none(fields[13])
+    attributes['title'] = hex_or_none(fields[14])
+    UNICODE_ATTRIBUTES[code_point] = attributes
+
+def fill_attributes(filename):
+    '''Stores the entire contents of the UnicodeData.txt file
+    in the UNICODE_ATTRIBUTES dictionary.
+
+    A typical line for a single code point in UnicodeData.txt looks
+    like this:
+    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+
+    Code point ranges are indicated by pairs of lines like this:
+    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+
+    Surrogates (category “Cs”) are skipped.  A line without exactly
+    15 fields or a broken range ends the script with exit status 1.'''
+    with open(filename, mode='r') as unicode_data_file:
+        fields_start = []
+        for line in unicode_data_file:
+            fields = line.strip().split(';')
+            if len(fields) != 15:
+                sys.stderr.write(
+                    'short line in file "%(f)s": %(l)s\n' %{
+                    'f': filename, 'l': line})
+                sys.exit(1) # not the “exit” helper of the site module
+            if fields[2] == 'Cs':
+                # Surrogates are UTF-16 artefacts,
+                # not real characters. Ignore them.
+                fields_start = []
+                continue
+            if fields[1].endswith(', First>'):
+                fields_start = fields
+                fields_start[1] = fields_start[1].split(',')[0][1:]
+                continue
+            if fields[1].endswith(', Last>'):
+                fields[1] = fields[1].split(',')[0][1:]
+                if fields[1:] != fields_start[1:]:
+                    sys.stderr.write(
+                        'broken code point range in file "%(f)s": %(l)s\n' %{
+                            'f': filename, 'l': line})
+                    sys.exit(1) # not the “exit” helper of the site module
+                for code_point in range(
+                        int(fields_start[0], 16),
+                        int(fields[0], 16)+1):
+                    fill_attribute(code_point, fields)
+                fields_start = []
+                continue
+            fill_attribute(int(fields[0], 16), fields)
+            fields_start = []
+
+def fill_derived_core_properties(filename):
+    '''Reads DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.
+
+    Every code point is mapped to the list of the properties it has,
+    in the order in which they occur in the file.  A line contains
+    either a code point range like this:
+
+    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+
+    or a single code point like this:
+
+    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
+
+    Lines of any other form (comments, empty lines) are ignored.
+    '''
+    pattern = re.compile(
+        r'^(?P<codepoint1>[0-9A-F]{4,6})'
+        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
+    with open(filename, mode='r') as properties_file:
+        for text in properties_file:
+            found = pattern.match(text)
+            if found is None:
+                continue
+            first = int(found.group('codepoint1'), 16)
+            # A missing second code point means a range of length one.
+            last = first
+            if found.group('codepoint2'):
+                last = int(found.group('codepoint2'), 16)
+            name = found.group('property')
+            for code_point in range(first, last + 1):
+                DERIVED_CORE_PROPERTIES.setdefault(
+                    code_point, []).append(name)
+
+def to_upper(code_point):
+    '''Maps a code point to its simple uppercase mapping.
+
+    Code points without a name or without a mapping map to themselves.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['upper']:
+        return code_point
+    return attributes['upper']
+
+def to_lower(code_point):
+    '''Maps a code point to its simple lowercase mapping.
+
+    Code points without a name or without a mapping map to themselves.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['lower']:
+        return code_point
+    return attributes['lower']
+
+def to_title(code_point):
+    '''Maps a code point to its simple titlecase mapping.
+
+    Code points without a name or without a mapping map to themselves.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['title']:
+        return code_point
+    return attributes['title']
+
+def is_upper(code_point):
+    '''Tells whether a code point is uppercase: it either has a lowercase
+    mapping or the “Uppercase” property in DerivedCoreProperties.txt'''
+    return (to_lower(code_point) != code_point
+            or 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ()))
+
+def is_lower(code_point):
+    '''Tells whether the character with this code point is lowercase'''
+    if to_upper(code_point) != code_point:
+        return True
+    # <U00DF> is lowercase, but without simple to_upper mapping.
+    if code_point == 0x00DF:
+        return True
+    # Some characters are “Lowercase” in DerivedCoreProperties.txt
+    # without having a mapping to upper case. For example,
+    # ꜰ U+A730 “LATIN LETTER SMALL CAPITAL F” is one of these.
+    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
+def is_alpha(code_point):
+    '''Tells whether the character with this code point is alphabetic'''
+    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
+        return True
+    # Consider all the non-ASCII digits as alphabetic.
+    # ISO C 99 forbids us to have them in category “digit”,
+    # but we want iswalnum to return true on them.
+    if UNICODE_ATTRIBUTES[code_point]['category'] != 'Nd':
+        return False
+    is_ascii_digit = 0x0030 <= code_point <= 0x0039
+    return not is_ascii_digit
+
+def is_digit(code_point):
+    '''Checks whether the character with this code point is a digit
+
+    Only the ASCII digits <U0030>..<U0039> are accepted; the Unicode
+    general category “Nd” is deliberately not used for this class.
+    Returns True or False.
+    '''
+    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.5:
+    #    The iswdigit function tests for any wide character that
+    #    corresponds to a decimal-digit character (as defined in 5.2.1).
+    # 5.2.1:
+    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+    # Non-ASCII “Nd” digits are put into class “alpha” by is_alpha().
+    return 0x0030 <= code_point <= 0x0039
+
+def is_outdigit(code_point):
+    '''Tells whether the code point is one of the outdigits “0” to “9”'''
+    return 0x0030 <= code_point <= 0x0039
+
+def is_blank(code_point):
+    '''Tells whether the character with this code point is blank'''
+    if code_point == 0x0009: # '\t'
+        return True
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # Category Zs without mention of '<noBreak>'
+    return (attributes['name'] and attributes['category'] == 'Zs'
+            and '<noBreak>' not in attributes['decomposition'])
+
+def is_space(code_point):
+    '''Tells whether the character with this code point is a space
+
+    These are the ASCII white space characters and the characters
+    of the categories Zl, Zp, and Zs without “<noBreak>” decomposition.
+    '''
+    # The ASCII white space characters, in this order:
+    # ' ', '\f', '\n', '\r', '\t', '\v'
+    if code_point in (0x0020, 0x000C, 0x000A, 0x000D, 0x0009, 0x000B):
+        return True
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    category = attributes['category']
+    # Don’t make U+00A0 a space. Non-breaking space means that all programs
+    # should treat it like a punctuation character, not like a space.
+    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
+    return (attributes['name']
+            and (category in ('Zl', 'Zp')
+                 or (category == 'Zs'
+                     and '<noBreak>' not in attributes['decomposition'])))
+
+def is_cntrl(code_point):
+    '''Tells whether the character with this code point is
+    a control character, line and paragraph separators included'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name']
+            and (attributes['name'] == '<control>'
+                 or attributes['category'] in ('Zl', 'Zp')))
+
+def is_xdigit(code_point):
+    '''Checks whether the character with this code point is
+    a hexadecimal digit
+
+    Only the ASCII characters 0-9, A-F and a-f are accepted, other
+    Unicode digits or letters never are.
+    Returns True or False.
+    '''
+    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.12:
+    #    The iswxdigit function tests for any wide character that
+    #    corresponds to a hexadecimal-digit character (as defined
+    #    in 6.4.4.1).
+    # 6.4.4.1:
+    #    hexadecimal-digit: one of
+    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+    return (0x0030 <= code_point <= 0x0039     # 0-9
+            or 0x0041 <= code_point <= 0x0046  # A-F
+            or 0x0061 <= code_point <= 0x0066) # a-f
+
+def is_graph(code_point):
+    '''Tells whether the character with this code point is graphical:
+    a named character which is neither a control nor a space'''
+    name = UNICODE_ATTRIBUTES[code_point]['name']
+    return (name and name != '<control>'
+            and not is_space(code_point))
+
+def is_print(code_point):
+    '''Tells whether the character with this code point is printable'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name'] and attributes['name'] != '<control>'
+            and attributes['category'] not in ('Zl', 'Zp'))
+
+def is_punct(code_point):
+    '''Checks whether the character with this code point is punctuation
+
+    The Unicode general categories “P*” are deliberately not used.
+    The result is truthy for punctuation and falsy otherwise.
+    '''
+    # The traditional POSIX definition of punctuation is every graphic,
+    # non-alphanumeric character.
+    return (is_graph(code_point)
+            and not is_alpha(code_point)
+            and not is_digit(code_point))
+
+def is_combining(code_point):
+    '''Tells whether the character with this code point is
+    a combining character, i.e. has general category Mn, Mc or Me'''
+    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+    # file. In 3.0.1 it was identical to the union of the general categories
+    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
+    # PropList.txt file, so we take the latter definition.
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return (attributes['name']
+            and attributes['category'] in ('Mn', 'Mc', 'Me'))
+
+def is_combining_level3(code_point):
+    '''Tells whether the character with this code point is
+    a combining character with a combining class below 200'''
+    combining = is_combining(code_point)
+    return (combining and
+            0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
+
+def ucs_symbol(code_point):
+    '''Formats a code point as UCS symbol: four hexadecimal digits in
+    the BMP like <U0041>, eight digits above like <U0001F130>.'''
+    if code_point >= 0x10000:
+        return '<U{:08X}>'.format(code_point)
+    return '<U{:04X}>'.format(code_point)
+
+def ucs_symbol_range(code_point_low, code_point_high):
+    '''Formats a code point range as two UCS symbols joined by “..”.
+
+    Example:
+
+    <U0041>..<U005A>
+    '''
+    return '..'.join(map(ucs_symbol, (code_point_low, code_point_high)))
+
+def code_point_ranges(is_class_function):
+    '''Collects the code points accepted by is_class_function and
+    merges consecutive ones into ranges.
+
+    A range is a list [first, last], a code point without accepted
+    neighbours is a list with one element.  Example:
+
+    [[65, 90], [192, 214], [216, 222], [256], … ]
+    '''
+    ranges = []
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        if not is_class_function(code_point):
+            continue
+        if not ranges or ranges[-1][-1] != code_point - 1:
+            # Not adjacent to the previous range, start a new one.
+            ranges.append([code_point])
+        else:
+            # Extend the previous range, dropping its old upper end.
+            ranges[-1][1:] = [code_point]
+    return ranges
+
+def output_charclass(i18n_file, class_name, is_class_function):
+    '''Writes one LC_CTYPE character class section to i18n_file.
+
+    Nothing at all is written for a class without members.  Example:
+
+    upper /
+       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
+       …
+       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
+       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
+    '''
+    members = code_point_ranges(is_class_function)
+    if not members:
+        return
+    indent = '   '
+    width = 75
+    i18n_file.write('%s /\n' %class_name)
+    current = indent
+    for member in members:
+        if len(member) == 1:
+            symbol = ucs_symbol(member[0])
+        else:
+            symbol = ucs_symbol_range(member[0], member[-1])
+        if current.strip():
+            current += ';'
+        # Continue on a new line when the entry does not fit anymore.
+        if len(current) + len(symbol) > width:
+            i18n_file.write(current + '/\n')
+            current = indent
+        current += symbol
+    if current.strip():
+        i18n_file.write(current + '\n')
+    i18n_file.write('\n')
+
+def output_charmap(i18n_file, map_name, map_function):
+    '''Writes one LC_CTYPE character map section to i18n_file.
+
+    Only code points which map_function changes are listed.  Example:
+
+    toupper /
+      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
+      …
+      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
+      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
+    '''
+    indent = '   '
+    width = 75
+    # The section header is written even if no code point is mapped.
+    i18n_file.write('%s /\n' %map_name)
+    current = indent
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        target = map_function(code_point)
+        # Code points mapping to themselves are left out of the map.
+        if target == code_point:
+            continue
+        pair = ''.join(
+            ('(', ucs_symbol(code_point), ',', ucs_symbol(target), ')'))
+        if current.strip():
+            current += ';'
+        # Continue on a new line when the pair does not fit anymore.
+        if len(current) + len(pair) > width:
+            i18n_file.write(current + '/\n')
+            current = indent
+        current += pair
+    if current.strip():
+        i18n_file.write(current + '\n')
+    i18n_file.write('\n')
+
+def verifications():
+    '''Reports violations of the known class restrictions on stderr'''
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        # toupper restriction: “Only characters specified for the keywords
+        # lower and upper shall be specified.”
+        if (to_upper(code_point) != code_point
+            and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
+                    'sym': ucs_symbol(code_point),
+                    'c': code_point,
+                    'uc': to_upper(code_point)})
+        # tolower restriction: “Only characters specified for the keywords
+        # lower and upper shall be specified.”
+        if (to_lower(code_point) != code_point
+            and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
+                    'sym': ucs_symbol(code_point),
+                    'c': code_point,
+                    'uc': to_lower(code_point)})
+        # alpha restriction: “Characters classified as either upper or lower
+        # shall automatically belong to this class.”
+        if ((is_lower(code_point) or is_upper(code_point))
+             and not is_alpha(code_point)):
+            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
+                'sym': ucs_symbol(code_point)})
+        # alpha restriction: “No character specified for the keywords cntrl,
+        # digit, punct or space shall be specified.”
+        if (is_alpha(code_point) and is_cntrl(code_point)):
+            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is alpha and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is alpha and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_space(code_point)):
+            sys.stderr.write('%(sym)s is alpha and space\n' %{
+                'sym': ucs_symbol(code_point)})
+        # space restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, graph or xdigit shall be specified.”
+        # upper, lower, alpha already checked above.
+        if (is_space(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is space and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is space and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is space and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # cntrl restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, punct, graph, print or xdigit shall be
+        # specified.”  upper, lower, alpha already checked above.
+        if (is_cntrl(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_print(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and print\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # punct restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+        # be specified.”  upper, lower, alpha, cntrl already checked above.
+        if (is_punct(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is punct and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and code_point == 0x0020):
+            sys.stderr.write('%(sym)s is punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        # graph restriction: “No character specified for the keyword cntrl
+        # shall be specified.”  Already checked above.
+
+        # print restriction: “No character specified for the keyword cntrl
+        # shall be specified.”  Already checked above.
+
+        # graph - print relation: differ only in the <space> character.
+        # How is this possible if there are more than one space character?!
+        # I think susv2/xbd/locale.html should speak of “space characters”,
+        # not “space character”.
+        if (is_print(code_point)
+            and not (is_graph(code_point) or is_space(code_point))):
+            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (not is_print(code_point)
+            and (is_graph(code_point) or code_point == 0x0020)):
+            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
+                'sym': ucs_symbol(code_point)})
+
+def read_input_file(filename):
+    '''Splits the original glibc i18n file into a head and a tail.
+
+    The head is everything up to and including the “LC_CTYPE” line,
+    with the date stamp replaced by the current date.  The tail is
+    everything from the “translit_start” line on.  The character
+    classes in between are dropped, they are going to be replaced by
+    generated data, all the rest of the i18n file stays unchanged.
+    Returns the tuple (head, tail).
+    '''
+    date_pattern = re.compile(
+        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
+    head_lines = []
+    tail_lines = []
+    with open(filename, mode='r') as i18n_file:
+        for text in i18n_file:
+            stamp = date_pattern.match(text)
+            if stamp:
+                today = time.strftime('%Y-%m-%d')
+                text = stamp.group('key') + '"{:s}"\n'.format(today)
+            head_lines.append(text)
+            if text.startswith('LC_CTYPE'):
+                break
+        for text in i18n_file:
+            if text.startswith('translit_start'):
+                tail_lines.append(text)
+                break
+        tail_lines.extend(i18n_file)
+    return (''.join(head_lines), ''.join(tail_lines))
+
+def output_head(i18n_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file up
+    to the “LC_CTYPE” line: “head” if it is not empty, else a default
+    header for “unicode_version” dated today.'''
+    if head: # only non-empty if an input file was read, no ARGS needed
+        i18n_file.write(head)
+    else:
+        i18n_file.write('escape_char /\n')
+        i18n_file.write('comment_char %\n')
+        i18n_file.write('\n')
+        i18n_file.write('% Generated automatically by '
+                        + 'gen_unicode_ctype.py '
+                        + 'for Unicode {:s}.\n'.format(unicode_version))
+        i18n_file.write('\n')
+        i18n_file.write('LC_IDENTIFICATION\n')
+        i18n_file.write('title     "Unicode {:s} FDCC-set"\n'.format(
+            unicode_version))
+        i18n_file.write('source    "UnicodeData.txt, '
+                        + 'DerivedCoreProperties.txt"\n')
+        i18n_file.write('address   ""\n')
+        i18n_file.write('contact   ""\n')
+        i18n_file.write('email     "bug-glibc-locales@gnu.org"\n')
+        i18n_file.write('tel       ""\n')
+        i18n_file.write('fax       ""\n')
+        i18n_file.write('language  ""\n')
+        i18n_file.write('territory "Earth"\n')
+        i18n_file.write('revision  "{:s}"\n'.format(unicode_version))
+        i18n_file.write('date      "{:s}"\n'.format(
+            time.strftime('%Y-%m-%d')))
+        i18n_file.write('category  "unicode:2014";LC_CTYPE\n')
+        i18n_file.write('END LC_IDENTIFICATION\n')
+        i18n_file.write('\n')
+        i18n_file.write('LC_CTYPE\n')
+
+def output_tail(i18n_file, tail=''):
+    '''Write the tail of the output file, i.e. the part of the file
+    after the last “LC_CTYPE” character class: “tail” if it is not
+    empty, else just the line “END LC_CTYPE”.'''
+    if tail: # only non-empty if an input file was read, no ARGS needed
+        i18n_file.write(tail)
+    else:
+        i18n_file.write('END LC_CTYPE\n')
+
+def output_tables(i18n_file, unicode_version):
+    '''Writes the generated LC_CTYPE classes and maps with their comments'''
+    i18n_file.write('% The following is the 14652 i18n fdcc-set '
+                    'LC_CTYPE category.\n')
+    i18n_file.write(
+        '% It covers Unicode version {:s}.\n'.format(unicode_version))
+    i18n_file.write('% The character classes and mapping tables were '
+                    'automatically\n')
+    i18n_file.write('% generated using the gen_unicode_ctype.py '
+                    'program.\n\n')
+    i18n_file.write('% The "upper" class reflects the uppercase '
+                    'characters of class "alpha"\n')
+    output_charclass(i18n_file, 'upper', is_upper)
+    i18n_file.write('% The "lower" class reflects the lowercase '
+                    'characters of class "alpha"\n')
+    output_charclass(i18n_file, 'lower', is_lower)
+    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
+                    'reflecting\n')
+    i18n_file.write('% the recommendations in TR 10176 annex A\n')
+    output_charclass(i18n_file, 'alpha', is_alpha)
+    i18n_file.write('% The "digit" class must only contain the '
+                    'BASIC LATIN digits, says ISO C 99\n')
+    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
+    output_charclass(i18n_file, 'digit', is_digit)
+    i18n_file.write('% The "outdigit" information is by default '
+                    '"0" to "9".  We don\'t have to\n')
+    i18n_file.write('% provide it here since localedef will fill '
+                    'in the bits and it would\n')
+    i18n_file.write('% prevent locales copying this file define '
+                    'their own values.\n')
+    i18n_file.write('% outdigit /\n')
+    i18n_file.write('%    <U0030>..<U0039>\n\n')
+    # No real “outdigit” class is written, is_outdigit stays unused here.
+    for class_name, is_class_function in (
+            ('space', is_space), ('cntrl', is_cntrl),
+            ('punct', is_punct), ('graph', is_graph),
+            ('print', is_print)):
+        output_charclass(i18n_file, class_name, is_class_function)
+    i18n_file.write('% The "xdigit" class must only contain the '
+                    'BASIC LATIN digits and A-F, a-f,\n')
+    i18n_file.write('% says ISO C 99 '
+                    '(sections 7.25.2.1.12 and 6.4.4.1).\n')
+    output_charclass(i18n_file, 'xdigit', is_xdigit)
+    output_charclass(i18n_file, 'blank', is_blank)
+    output_charmap(i18n_file, 'toupper', to_upper)
+    output_charmap(i18n_file, 'tolower', to_lower)
+    output_charmap(i18n_file, 'map "totitle";', to_title)
+    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
+                    'annex B.1\n')
+    i18n_file.write('% That is, all combining characters (level 2+3).\n')
+    output_charclass(i18n_file, 'class "combining";', is_combining)
+    i18n_file.write('% The "combining_level3" class reflects '
+                    'ISO/IEC 10646-1 annex B.2\n')
+    i18n_file.write('% That is, combining characters of level 3.\n')
+    output_charclass(
+        i18n_file, 'class "combining_level3";', is_combining_level3)
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a Unicode conforming LC_CTYPE category from
+        UnicodeData.txt and DerivedCoreProperties.txt files.
+        ''')
+    # Every option takes exactly one value.  With nargs='?' an option
+    # given without a value silently became None: “-i” was ignored,
+    # all the others failed later with a TypeError.
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-d', '--derived_core_properties_file',
+        type=str,
+        default='DerivedCoreProperties.txt',
+        help=('The DerivedCoreProperties.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        type=str,
+        help='''The original glibc/localedata/locales/i18n file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        type=str,
+        default='i18n.new',
+        help='''The file which shall contain the generated LC_CTYPE category,
+        default: %(default)s.  If the original
+        glibc/localedata/locales/i18n has been given
+        as an option, all data from the original file
+        except the newly generated LC_CTYPE character
+        classes and the date stamp in
+        LC_IDENTIFICATION will be copied unchanged
+        into the output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    # Read the Unicode data, report violated restrictions on stderr,
+    # then write the head, the generated tables and the tail.
+    fill_attributes(ARGS.unicode_data_file)
+    fill_derived_core_properties(ARGS.derived_core_properties_file)
+    verifications()
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as I18N_FILE:
+        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
+        output_tables(I18N_FILE, ARGS.unicode_version)
+        output_tail(I18N_FILE, tail=TAIL)
author	Alexandre Oliva <aoliva@redhat.com>	2015-02-20 20:14:59 -0200
committer	Alexandre Oliva <aoliva@redhat.com>	2015-02-20 20:14:59 -0200
commit	4a4839c94a4c93ffc0d5b95c69a08b02a57007f2 (patch)
tree	d60950243872d9beb0993b12173e6bbf998d779d /localedata/unicode-gen/gen_unicode_ctype.py
parent	e4a399dc3dbb3228eb39af230ad11bc42a018c93 (diff)
download	glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.tar.gz glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.tar.xz glibc-4a4839c94a4c93ffc0d5b95c69a08b02a57007f2.zip