about summary refs log tree commit diff
path: root/localedata/unicode-gen/utf8_compatibility.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/utf8_compatibility.py')
-rwxr-xr-xlocaledata/unicode-gen/utf8_compatibility.py217
1 files changed, 39 insertions, 178 deletions
diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py
index b84a1eb3de..3b7a94ccc9 100755
--- a/localedata/unicode-gen/utf8_compatibility.py
+++ b/localedata/unicode-gen/utf8_compatibility.py
@@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option:
 import sys
 import re
 import argparse
-
-# Dictionary holding the entire contents of the UnicodeData.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: {'category': 'Cc',
-#      'title': None,
-#      'digit': '',
-#      'name': '<control>',
-#      'bidi': 'BN',
-#      'combining': '0',
-#      'comment': '',
-#      'oldname': 'NULL',
-#      'decomposition': '',
-#      'upper': None,
-#      'mirrored': 'N',
-#      'lower': None,
-#      'decdigit': '',
-#      'numeric': ''},
-#      …
-# }
-UNICODE_ATTRIBUTES = {}
-
-# Dictionary holding the entire contents of the EastAsianWidths.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: 'N', … , 45430: 'W', …}
-EAST_ASIAN_WIDTHS = {}
-
-def fill_attribute(code_point, fields):
-    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
-
-    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
-    in the UnicodeData.txt file.
-
-    '''
-    UNICODE_ATTRIBUTES[code_point] =  {
-        'name': fields[1],          # Character name
-        'category': fields[2],      # General category
-        'combining': fields[3],     # Canonical combining classes
-        'bidi': fields[4],          # Bidirectional category
-        'decomposition': fields[5], # Character decomposition mapping
-        'decdigit': fields[6],      # Decimal digit value
-        'digit': fields[7],         # Digit value
-        'numeric': fields[8],       # Numeric value
-        'mirrored': fields[9],      # mirrored
-        'oldname': fields[10],      # Old Unicode 1.0 name
-        'comment': fields[11],      # comment
-        # Uppercase mapping
-        'upper': int(fields[12], 16) if fields[12] else None,
-        # Lowercase mapping
-        'lower': int(fields[13], 16) if fields[13] else None,
-        # Titlecase mapping
-        'title': int(fields[14], 16) if fields[14] else None,
-    }
-
-def fill_attributes(filename):
-    '''Stores the entire contents of the UnicodeData.txt file
-    in the UNICODE_ATTRIBUTES dictionary.
-
-    A typical line for a single code point in UnicodeData.txt looks
-    like this:
-
-    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
-
-    Code point ranges are indicated by pairs of lines like this:
-
-    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
-    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
-    '''
-    with open(filename, mode='r') as unicode_data_file:
-        fields_start = []
-        for line in unicode_data_file:
-            fields = line.strip().split(';')
-            if len(fields) != 15:
-                sys.stderr.write(
-                    'short line in file "%(f)s": %(l)s\n' %{
-                    'f': filename, 'l': line})
-                exit(1)
-            if fields[2] == 'Cs':
-                # Surrogates are UTF-16 artefacts,
-                # not real characters. Ignore them.
-                fields_start = []
-                continue
-            if fields[1].endswith(', First>'):
-                fields_start = fields
-                fields_start[1] = fields_start[1].split(',')[0][1:]
-                continue
-            if fields[1].endswith(', Last>'):
-                fields[1] = fields[1].split(',')[0][1:]
-                if fields[1:] != fields_start[1:]:
-                    sys.stderr.write(
-                        'broken code point range in file "%(f)s": %(l)s\n' %{
-                            'f': filename, 'l': line})
-                    exit(1)
-                for code_point in range(
-                        int(fields_start[0], 16),
-                        int(fields[0], 16)+1):
-                    fill_attribute(code_point, fields)
-                fields_start = []
-                continue
-            fill_attribute(int(fields[0], 16), fields)
-            fields_start = []
-
-def fill_east_asian_widths(filename):
-    '''Stores the entire contents of the EastAsianWidths.txt file
-    in the EAST_ASIAN_WIDTHS dictionary.
-
-    Lines in EastAsianWidths.txt are either a code point range like
-    this:
-
-    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
-
-    or a single code point like this:
-
-    A015;W           # Lm         YI SYLLABLE WU
-    '''
-    with open(filename, mode='r') as east_asian_widths_file:
-        for line in east_asian_widths_file:
-            match = re.match(
-                r'^(?P<codepoint1>[0-9A-F]{4,6})'
-                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
-                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
-                line)
-            if not match:
-                continue
-            start = match.group('codepoint1')
-            end = match.group('codepoint2')
-            if not end:
-                end = start
-            for code_point in range(int(start, 16), int(end, 16)+1):
-                EAST_ASIAN_WIDTHS[code_point] = match.group('property')
-
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
+import unicode_utils
 
 def create_charmap_dictionary(file_name):
     '''Create a dictionary for all code points found in the CHARMAP
@@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_missing_characters:
         for key in sorted(set(ocharmap)-set(ncharmap)):
             print('removed: {:s}     {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ocharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_charmap = {}
     for key in set(ocharmap).intersection(set(ncharmap)):
@@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_changed_characters:
         for key in sorted(changed_charmap):
             print('changed: {:s}     {:s}->{:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 changed_charmap[key][0],
                 changed_charmap[key][1],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated CHARMAP: %d'
           %len(set(ncharmap)-set(ocharmap)))
     if ARGS.show_added_characters:
         for key in sorted(set(ncharmap)-set(ocharmap)):
             print('added: {:s}     {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ncharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 def create_width_dictionary(file_name):
     '''Create a dictionary for all code points found in the WIDTH
@@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these have width 1 now.)')
     if ARGS.show_missing_characters:
         for key in sorted(set(owidth)-set(nwidth)):
-            print('removed: {:s} '.format(ucs_symbol(key))
+            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(owidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_width = {}
     for key in set(owidth).intersection(set(nwidth)):
@@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name):
           %len(changed_width))
     if ARGS.show_changed_characters:
         for key in sorted(changed_width):
-            print('changed width: {:s} '.format(ucs_symbol(key))
+            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated WIDTH: %d'
           %len(set(nwidth)-set(owidth)))
@@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these had width 1 before.)')
     if ARGS.show_added_characters:
         for key in sorted(set(nwidth)-set(owidth)):
-            print('added: {:s} '.format(ucs_symbol(key))
+            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(nwidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
@@ -392,8 +253,8 @@ if __name__ == "__main__":
     ARGS = PARSER.parse_args()
 
     if ARGS.unicode_data_file:
-        fill_attributes(ARGS.unicode_data_file)
+        unicode_utils.fill_attributes(ARGS.unicode_data_file)
     if ARGS.east_asian_width_file:
-        fill_east_asian_widths(ARGS.east_asian_width_file)
+        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)