about summary refs log tree commit diff
path: root/REORG.TODO/localedata/unicode-gen/ctype_compatibility.py
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/localedata/unicode-gen/ctype_compatibility.py')
-rwxr-xr-xREORG.TODO/localedata/unicode-gen/ctype_compatibility.py546
1 files changed, 546 insertions, 0 deletions
diff --git a/REORG.TODO/localedata/unicode-gen/ctype_compatibility.py b/REORG.TODO/localedata/unicode-gen/ctype_compatibility.py
new file mode 100755
index 0000000000..561458fad6
--- /dev/null
+++ b/REORG.TODO/localedata/unicode-gen/ctype_compatibility.py
@@ -0,0 +1,546 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Copyright (C) 2014-2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+This script is useful for checking the differences between
+an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
+new one generated by gen_unicode_ctype.py
+
+To see how it is used, call it with the “-h” option:
+
+    $ ./ctype_compatibility.py -h
+    … prints usage message …
+'''
+
+import sys
+import re
+import unicodedata
+import argparse
+
+from ctype_compatibility_test_cases import TEST_CASES
+
+def get_lines_from_file(filename):
+    '''Yield the logical, comment-free lines of an i18n file
+
+    Everything from a “%” to the end of a physical line is dropped,
+    and physical lines ending in “/” are joined with what follows.
+    '''
+    with open(filename) as i18n_file:
+        pending = ''
+        for raw_line in i18n_file:
+            text = raw_line.strip('\n')
+            comment_start = text.find('%')
+            if comment_start >= 0:
+                # A comment ending in “/” still continues the line.
+                marker = '/' if text.endswith('/') else ''
+                text = text[:comment_start] + marker
+            text = text.strip()
+            if not text.endswith('/'):
+                yield pending + text
+                pending = ''
+            else:
+                pending += text[:-1]
+    if pending: # file ends with a continuation line
+        yield pending
+
+def extract_character_classes(filename):
+    '''Get all Unicode code points for each character class from a file
+
+    Store these code points in a dictionary using the character classes
+    as keys and the list of code points in this character class as values.
+
+    In case of the character classes “toupper”, “tolower”, and “totitle”,
+    the list entries are actually pairs (tuples) of code points.
+    '''
+    ctype_dict = {}
+    for line in get_lines_from_file(filename):
+        for char_class in [
+                'upper',
+                'lower',
+                'alpha',
+                'digit',
+                'outdigit',
+                'space',
+                'cntrl',
+                'punct',
+                'graph',
+                'print',
+                'xdigit',
+                'blank',
+                'combining',
+                'combining_level3',
+                'toupper',
+                'tolower',
+                'totitle']:
+            match = re.match(r'^('
+                             +r'(?:(?:class|map)\s+")'
+                             +re.escape(char_class)+
+                             r'(?:";)\s+'
+                             +'|'
+                             +re.escape(char_class)+r'\s+'
+                             +')', line)
+            if match:
+                # Whatever follows the class keyword is the list of code
+                # points; it is appended to the list kept for this class.
+                process_chars(
+                    ctype_dict.setdefault(char_class, []),
+                    line[match.end():])
+    return ctype_dict
+
+def process_chars(char_class_list, code_point_line):
+    '''
+    Parse the “;”-separated items of code_point_line and append them to
+    char_class_list; an item matching none of the known forms exits with 1.
+    '''
+    for code_points in code_point_line.split(';'):
+        code_points = code_points.strip()
+        match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
+        if match: # <Uxxxx>: a single code point
+            char_class_list.append(
+                int(match.group('codepoint'), 16))
+            continue
+        match = re.match(
+            r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
+            +r'\.\.'+
+            r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
+            code_points)
+        if match: # <Uxxxx>..<Uxxxx>: an inclusive range
+            for codepoint in range(
+                    int(match.group('codepoint1'), 16),
+                    int(match.group('codepoint2'), 16) + 1):
+                char_class_list.append(codepoint)
+            continue
+        match = re.match(
+            r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
+            +r'\.\.\(2\)\.\.'+
+            r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
+            code_points)
+        if match: # <Uxxxx>..(2)..<Uxxxx>: every second code point
+            for codepoint in range(
+                    int(match.group('codepoint1'), 16),
+                    int(match.group('codepoint2'), 16) + 1,
+                    2):
+                char_class_list.append(codepoint)
+            continue
+        match = re.match(
+            r'^\('
+            +r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
+            +','+
+            r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
+            +r'\)$',
+            code_points)
+        if match: # (<Uxxxx>,<Uxxxx>): a pair, stored as a tuple
+            char_class_list.append((
+                int(match.group('codepoint1'), 16),
+                int(match.group('codepoint2'), 16)))
+            continue
+        sys.stderr.write(
+            ('None of the regexps matched '
+             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
+            'cp': code_points,
+            'cpl': code_point_line
+        })
+        sys.exit(1)
+
+def compare_lists(old_ctype_dict, new_ctype_dict):
+    '''Compare character classes in the old and the new LC_CTYPE; a class
+    missing from the new one is treated as empty instead of failing'''
+    print('****************************************************')
+    print('Character classes which are only in the new '
+          + 'or only in the old file:')
+    for char_class in sorted(old_ctype_dict):
+        if char_class not in new_ctype_dict:
+            print('Character class %s is in old ctype but not in new ctype'
+                  %char_class)
+    for char_class in sorted(new_ctype_dict):
+        if char_class not in old_ctype_dict:
+            print('Character class %s is in new ctype but not in old ctype'
+                  %char_class)
+    for char_class in sorted(old_ctype_dict):
+        old_chars = old_ctype_dict[char_class]
+        # A class missing from the new file is compared as an empty class.
+        new_chars = new_ctype_dict.get(char_class, [])
+        print("****************************************************")
+        print("%s: %d chars in old ctype and %d chars in new ctype" %(
+            char_class, len(old_chars), len(new_chars)))
+        print("----------------------------------------------------")
+        report(char_class, old_chars, new_chars)
+
+def report_code_points(char_class, code_point_list, text=''):
+    '''Print one line (character, hex value, Unicode name) per entry of
+    code_point_list in sorted order; entries are ints or (from, to) pairs.
+    '''
+    for code_point in sorted(code_point_list):
+        if isinstance(code_point, int):
+            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
+                  %{'text': text,
+                    'char': chr(code_point),
+                    'char_class': char_class,
+                    'code_point': hex(code_point),
+                    'name': unicodedata.name(chr(code_point), 'name unknown')})
+        else:
+            print(('%(char_class)s: %(text)s: '
+                   + '%(char0)s → %(char1)s '
+                   + '%(code_point0)s → %(code_point1)s '
+                   + '%(name0)s → %(name1)s') %{
+                'text': text,
+                'char_class': char_class,
+                'char0': chr(code_point[0]),
+                'code_point0': hex(code_point[0]),
+                'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
+                'char1': chr(code_point[1]),
+                'code_point1': hex(code_point[1]),
+                'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
+            })
+
+def report(char_class, old_list, new_list):
+    '''Print, for one LC_CTYPE character class, how many code points the
+    new ctype lost and gained compared to the old one (details on request)
+    '''
+    only_in_old = list(set(old_list) - set(new_list))
+    only_in_new = list(set(new_list) - set(old_list))
+    print('%(char_class)s: Missing %(number)d characters of old ctype '
+          'in new ctype '
+          %{'char_class': char_class, 'number': len(only_in_old)})
+    if ARGS.show_missing_characters:
+        report_code_points(char_class, only_in_old, 'Missing')
+    print('%(char_class)s: Added %(number)d characters in new ctype '
+          'which were not in old ctype'
+          %{'char_class': char_class, 'number': len(only_in_new)})
+    if ARGS.show_added_characters:
+        report_code_points(char_class, only_in_new, 'Added')
+
+
+def cperror(error_message, errorcounter=0):
+    '''Print error_message and return errorcounter incremented by one'''
+    print(error_message)
+    return 1 + errorcounter
+
+def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
+            errorcounter=0):
+    '''Check expected class membership for a list of code points.
+
+    “code_point_list_with_ranges” holds integers or pairs of integers,
+    e.g. [0x0E31, (0x0E34, 0x0E3A)]; a pair stands for the whole range
+    between its two integers, both included.  Each entry of
+    “char_classes” gives a class name at index 0 and the expected
+    membership (True/False) at index 1.  Every mismatch is reported via
+    cperror with “reason”; the updated errorcounter is returned.
+    '''
+    for code_point_range in code_point_list_with_ranges:
+        for code_point in ([code_point_range]
+                           if isinstance(code_point_range, int)
+                           else range(code_point_range[0],
+                                      code_point_range[1]+1)):
+            for char_class_tuple in char_classes:
+                char_class = char_class_tuple[0]
+                in_char_class = char_class_tuple[1]
+                if (code_point in ctype_dict[char_class]) != in_char_class:
+                    errorcounter = cperror(
+                        ('error: %(code_point)s %(char)s '
+                         + '%(char_class)s %(in)s: %(reason)s') %{
+                             'code_point': hex(code_point),
+                             'char': chr(code_point),
+                             'char_class': char_class,
+                             'in': not in_char_class,
+                             'reason': reason},
+                        errorcounter)
+    return errorcounter
+
+def tests(ctype_dict, errorcounter = 0):
+    '''Check an LC_CTYPE character class dictionary for known errors'''
+    # copy the information from ctype_dict (which contains lists) in
+    # a new dictionary ctype_dict2 (which contains dictionaries).
+    # The checks below are easier with that type of data structure.
+
+    ctype_dict2 = {}
+    for key in ctype_dict:
+        ctype_dict2[key] = {}
+        if ctype_dict[key]:
+            if type(ctype_dict[key][0]) == type(int()):
+                for value in ctype_dict[key]:
+                    ctype_dict2[key][value] = 1
+            else: # key is 'toupper', 'tolower', or 'totitle'
+                for value in ctype_dict[key]:
+                    ctype_dict2[key][value[0]] = value[1]
+
+    for test_case in TEST_CASES:
+        errorcounter = cpcheck(ctype_dict2,
+                               test_case[0],
+                               test_case[1],
+                               test_case[2],
+                               errorcounter = errorcounter)
+
+    for code_point in range(0, 0x110000):
+        # toupper restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified."
+        if (code_point in ctype_dict2['toupper']
+            and code_point != ctype_dict2['toupper'][code_point]
+            and not (code_point in ctype_dict2['lower']
+                     or code_point in ctype_dict2['upper'])):
+            errorcounter = cperror(
+                ('error: %(char1)s is not upper|lower '
+                 + 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
+                     'char1': chr(code_point),
+                     'cp1': hex(code_point),
+                     'cp2': hex(ctype_dict2['toupper'][code_point]),
+                     'char2': chr(ctype_dict2['toupper'][code_point])
+                 },
+                errorcounter)
+        # tolower restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified."
+        if (code_point in ctype_dict2['tolower']
+            and code_point != ctype_dict2['tolower'][code_point]
+            and not (code_point in ctype_dict2['lower']
+                     or code_point in ctype_dict2['upper'])):
+            errorcounter = cperror(
+                ('error: %(char1)s is not upper|lower '
+                 + 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
+                     'char1': chr(code_point),
+                     'cp1': hex(code_point),
+                     'cp2': hex(ctype_dict2['tolower'][code_point]),
+                     'char2': chr(ctype_dict2['tolower'][code_point])
+                 },
+                errorcounter)
+        # alpha restriction: "Characters classified as either upper or lower
+        # shall automatically belong to this class."
+        if ((code_point in ctype_dict2['lower']
+             or code_point in ctype_dict2['upper'])
+            and code_point not in ctype_dict2['alpha']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is upper|lower but not alpha' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        # alpha restriction: "No character specified for the keywords cntrl,
+        # digit, punct or space shall be specified."
+        if (code_point in ctype_dict2['alpha']
+            and code_point in ctype_dict2['cntrl']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is alpha and cntrl' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['alpha']
+            and code_point in ctype_dict2['digit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is alpha and digit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['alpha']
+            and code_point in ctype_dict2['punct']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is alpha and punct' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['alpha']
+            and code_point in ctype_dict2['space']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is alpha and space' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        # space restriction: "No character specified for the keywords upper,
+        # lower, alpha, digit, graph or xdigit shall be specified."
+        # The alpha/space overlap is already checked above.
+        if (code_point in ctype_dict2['space']
+            and code_point in ctype_dict2['digit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is space and digit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['space']
+            and code_point in ctype_dict2['graph']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is space and graph' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['space']
+            and code_point in ctype_dict2['xdigit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is space and xdigit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        # cntrl restriction: "No character specified for the keywords upper,
+        # lower, alpha, digit, punct, graph, print or xdigit shall be
+        # specified."  The alpha/cntrl overlap is already checked above.
+        if (code_point in ctype_dict2['cntrl']
+            and code_point in ctype_dict2['digit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is cntrl and digit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['cntrl']
+            and code_point in ctype_dict2['punct']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is cntrl and punct' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['cntrl']
+            and code_point in ctype_dict2['graph']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is cntrl and graph' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['cntrl']
+            and code_point in ctype_dict2['print']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is cntrl and print' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['cntrl']
+            and code_point in ctype_dict2['xdigit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is cntrl and xdigit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        # punct restriction: "No character specified for the keywords upper,
+        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+        # be specified."  The alpha and cntrl overlaps are checked above.
+        if (code_point in ctype_dict2['punct']
+            and code_point in ctype_dict2['digit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is punct and digit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['punct']
+            and code_point in ctype_dict2['xdigit']):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is punct and xdigit' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point in ctype_dict2['punct']
+            and code_point == 0x0020):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is punct.' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        # graph restriction: "No character specified for the keyword cntrl
+        # shall be specified."  Already checked above.
+
+        # print restriction: "No character specified for the keyword cntrl
+        # shall be specified."  Already checked above.
+
+        # graph - print relation: differ only in the <space> character.
+        # How is this possible if there are more than one space character?!
+        # I think susv2/xbd/locale.html should speak of "space characters",
+        # not "space character".
+        if (code_point in ctype_dict2['print']
+            and not (code_point in ctype_dict2['graph']
+                     or code_point in ctype_dict2['space'])):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s is print but not graph|space' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+        if (code_point not in ctype_dict2['print']
+            and (code_point in ctype_dict2['graph']
+                 or code_point ==  0x0020)):
+            errorcounter = cperror(
+                'error: %(char)s %(cp)s graph|space but not print' %{
+                    'char': chr(code_point),
+                    'cp': hex(code_point)
+                },
+                errorcounter)
+    return errorcounter
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Compare the contents of LC_CTYPE in two files and check for errors.
+        ''')
+    PARSER.add_argument(
+        '-o', '--old_ctype_file',
+        nargs='?',
+        type=str,
+        default='i18n',
+        help='The old ctype file, default: %(default)s')
+    PARSER.add_argument(
+        '-n', '--new_ctype_file',
+        nargs='?',
+        type=str,
+        default='unicode-ctype',
+        help='The new ctype file, default: %(default)s')
+    PARSER.add_argument(
+        '-a', '--show_added_characters',
+        action='store_true',
+        help=('Show characters which were added to each '
+              + 'character class in detail.'))
+    PARSER.add_argument(
+        '-m', '--show_missing_characters',
+        action='store_true',
+        help=('Show characters which were removed from each '
+              + 'character class in detail.'))
+    ARGS = PARSER.parse_args()
+
+    OLD_CTYPE_DICT = extract_character_classes(
+        ARGS.old_ctype_file)
+    NEW_CTYPE_DICT = extract_character_classes(
+        ARGS.new_ctype_file)
+    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
+    print('============================================================')
+    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
+    print('------------------------------------------------------------')
+    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
+    print('------------------------------------------------------------')
+    print('Old file = %s' %ARGS.old_ctype_file)
+    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
+    print('------------------------------------------------------------')
+    print('============================================================')
+    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
+    print('------------------------------------------------------------')
+    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
+    print('------------------------------------------------------------')
+    print('New file = %s' %ARGS.new_ctype_file)
+    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
+    print('------------------------------------------------------------')
+    # Only errors found in the new file affect the exit status.
+    if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
+        sys.exit(1)
+    sys.exit(0)