about summary refs log tree commit diff
path: root/localedata/unicode-gen/gen_translit_compat.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/gen_translit_compat.py')
-rw-r--r--localedata/unicode-gen/gen_translit_compat.py326
1 files changed, 326 insertions, 0 deletions
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
new file mode 100644
index 0000000000..0e824a877e
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_compat.py
@@ -0,0 +1,326 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_compat file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+    $ ./gen_translit_compat -h
+    … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+    '''Reads the original glibc translit_compat file to get the
+    original head and tail.
+
+    We want to replace only the part of the file between
+    “translit_start” and “translit_end”
+    '''
+    head = tail = ''
+    with open(filename, mode='r') as translit_file:
+        for line in translit_file:
+            head = head + line
+            if line.startswith('translit_start'):
+                break
+        for line in translit_file:
+            if line.startswith('translit_end'):
+                tail = line
+                break
+        for line in translit_file:
+            tail = tail + line
+    return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+    '''Write the header of the output file, i.e. the part of the file
+    before the “translit_start” line.
+    '''
+    if ARGS.input_file and head:
+        translit_file.write(head)
+    else:
+        translit_file.write('escape_char /\n')
+        translit_file.write('comment_char %\n')
+        translit_file.write('\n')
+        translit_file.write('% Transliterations of compatibility characters ')
+        translit_file.write('and ligatures.\n')
+        translit_file.write('% Generated automatically from UnicodeData.txt '
+                            + 'by gen_translit_compat.py '
+                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+                            + 'for Unicode {:s}.\n'.format(unicode_version))
+        translit_file.write('\n')
+        translit_file.write('LC_CTYPE\n')
+        translit_file.write('\n')
+        translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+    '''Write the tail of the output file'''
+    if ARGS.input_file and tail:
+        translit_file.write(tail)
+    else:
+        translit_file.write('translit_end\n')
+        translit_file.write('\n')
+        translit_file.write('END LC_CTYPE\n')
+
+def compatibility_decompose(code_point):
+    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
+
+    “The compatibility decomposition is formed by recursively applying
+    the canonical and compatibility mappings, then applying the
+    Canonical Ordering Algorithm.”
+
+    We don’t do the canonical decomposition here because this is
+    done in gen_translit_combining.py to generate translit_combining.
+
+    And we ignore some of the possible compatibility formatting tags
+    here. Some of them are used in other translit_* files, not
+    translit_compat:
+
+    <font>:   translit_font
+    <circle>: translit_circle
+    <wide>:   translit_wide
+    <narrow>: translit_narrow
+    <square>: translit_cjk_compat
+    <fraction>: translit_fraction
+
+    And we ignore
+
+    <noBreak>, <initial>, <medial>, <final>, <isolated>
+
+    because they seem to be not useful for transliteration.
+    '''
+    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+        code_point]['decomposition']
+    compatibility_tags = (
+        '<compat>', '<super>', '<sub>', '<vertical>')
+    for compatibility_tag in compatibility_tags:
+        if decomposition.startswith(compatibility_tag):
+            decomposition = decomposition[len(compatibility_tag)+1:]
+            decomposed_code_points = [int(x, 16)
+                                      for x in decomposition.split(' ')]
+            if (len(decomposed_code_points) > 1
+                    and decomposed_code_points[0] == 0x0020
+                    and decomposed_code_points[1] >= 0x0300
+                    and decomposed_code_points[1] <= 0x03FF):
+                # Decomposes into a space followed by a combining character.
+                # This is not useful fo transliteration.
+                return []
+            else:
+                return_value = []
+                for index in range(0, len(decomposed_code_points)):
+                    cd_code_points = compatibility_decompose(
+                        decomposed_code_points[index])
+                    if cd_code_points:
+                        return_value += cd_code_points
+                    else:
+                        return_value += [decomposed_code_points[index]]
+                return return_value
+    return []
+
+def special_decompose(code_point_list):
+    '''
+    Decompositions which are not in UnicodeData.txt at all but which
+    were used in the original translit_compat file in glibc and
+    which seem to make sense.  I want to keep the update of
+    translit_compat close to the spirit of the original file,
+    therefore I added this special decomposition rules here.
+    '''
+    special_decompose_dict = {
+        (0x03BC,): [0x0075], # μ → u
+        (0x02BC,): [0x0027], # ʼ → '
+    }
+    if tuple(code_point_list) in special_decompose_dict:
+        return special_decompose_dict[tuple(code_point_list)]
+    else:
+        return code_point_list
+
+def special_ligature_decompose(code_point):
+    '''
+    Decompositions for ligatures which are not in UnicodeData.txt at
+    all but which were used in the original translit_compat file in
+    glibc and which seem to make sense.  I want to keep the update of
+    translit_compat close to the spirit of the original file,
+    therefore I added these special ligature decomposition rules here.
+
+    '''
+    special_ligature_decompose_dict = {
+        0x00E6: [0x0061, 0x0065], # æ → ae
+        0x00C6: [0x0041, 0x0045], # Æ → AE
+        # These following 5 special ligature decompositions were
+        # in the original glibc/localedata/locales/translit_compat file
+        0x0152: [0x004F, 0x0045], # Œ → OE
+        0x0153: [0x006F, 0x0065], # œ → oe
+        0x05F0: [0x05D5, 0x05D5], # װ → וו
+        0x05F1: [0x05D5, 0x05D9], # ױ → וי
+        0x05F2: [0x05D9, 0x05D9], # ײ → יי
+        # The following special ligature decompositions were
+        # not in the original glibc/localedata/locales/translit_compat file
+        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
+        # → U+041D CYRILLIC CAPITAL LETTER EN,
+        #   U+0413 CYRILLIC CAPITAL LETTER GHE
+        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
+        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
+        # → U+043D CYRILLIC SMALL LETTER EN,
+        #   U+0433 CYRILLIC SMALL LETTER GHE
+        0x04A5: [0x043D, 0x0433], # ҥ → нг
+        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
+        # → U+0422 CYRILLIC CAPITAL LETTER TE,
+        #   U+0426 CYRILLIC CAPITAL LETTER TSE
+        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
+        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
+        # → U+0442 CYRILLIC SMALL LETTER TE,
+        #   U+0446 CYRILLIC SMALL LETTER TSE
+        0x04B5: [0x0442, 0x0446], # ҵ → тц
+        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
+        # → U+0410 CYRILLIC CAPITAL LETTER A
+        #   U+0415;CYRILLIC CAPITAL LETTER IE
+        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
+        # U+04D5 CYRILLIC SMALL LIGATURE A IE
+        # → U+0430 CYRILLIC SMALL LETTER A,
+        #   U+0435 CYRILLIC SMALL LETTER IE
+        0x04D5: [0x0430, 0x0435], # ӕ → ае
+        # I am not sure what to do with the following ligatures
+        # maybe it makes no sense to decompose them:
+        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
+        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
+        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
+        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+        # U+fe20 COMBINING LIGATURE LEFT HALF
+        # U+fe21 COMBINING LIGATURE RIGHT HALF
+        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
+        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
+        # U+11176 MAHAJANI LIGATURE SHRI
+        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
+        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
+        # U+1f672 LIGATURE OPEN ET ORNAMENT
+        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
+    }
+    if code_point in special_ligature_decompose_dict:
+        return special_ligature_decompose_dict[code_point]
+    else:
+        return [code_point]
+
+def output_transliteration(translit_file):
+    '''Write the new transliteration to the output file'''
+    translit_file.write('\n')
+    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+        decomposed_code_points = [compatibility_decompose(code_point)]
+        if not decomposed_code_points[0]:
+            if special_decompose([code_point]) != [code_point]:
+                decomposed_code_points[0] = special_decompose([code_point])
+        else:
+            special_decomposed_code_points = []
+            while True:
+                special_decomposed_code_points = special_decompose(
+                    decomposed_code_points[-1])
+                if (special_decomposed_code_points
+                        != decomposed_code_points[-1]):
+                    decomposed_code_points.append(
+                        special_decomposed_code_points)
+                    continue
+                special_decomposed_code_points = []
+                for decomposed_code_point in decomposed_code_points[-1]:
+                    special_decomposed_code_points += special_decompose(
+                        [decomposed_code_point])
+                if (special_decomposed_code_points
+                        == decomposed_code_points[-1]):
+                    break
+                decomposed_code_points.append(
+                    special_decomposed_code_points)
+        if decomposed_code_points[0]:
+            translit_file.write('% {:s}\n'.format(name))
+            translit_file.write('{:s} '.format(
+                unicode_utils.ucs_symbol(code_point)))
+            for index in range(0, len(decomposed_code_points)):
+                if index > 0:
+                    translit_file.write(';')
+                translit_file.write('"')
+                for decomposed_code_point in decomposed_code_points[index]:
+                    translit_file.write('{:s}'.format(
+                        unicode_utils.ucs_symbol(decomposed_code_point)))
+                translit_file.write('"')
+            translit_file.write('\n')
+        elif 'LIGATURE' in name and 'ARABIC' not in name:
+            decomposed_code_points = special_ligature_decompose(code_point)
+            if decomposed_code_points[0] != code_point:
+                translit_file.write('% {:s}\n'.format(name))
+                translit_file.write('{:s} '.format(
+                    unicode_utils.ucs_symbol(code_point)))
+                translit_file.write('"')
+                for decomposed_code_point in decomposed_code_points:
+                    translit_file.write('{:s}'.format(
+                        unicode_utils.ucs_symbol(decomposed_code_point)))
+                translit_file.write('"')
+                translit_file.write('\n')
+            else:
+                print('Warning: unhandled ligature: {:x} {:s}'.format(
+                    code_point, name))
+    translit_file.write('\n')
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a translit_compat file from UnicodeData.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-i', '--input_file',
+        nargs='?',
+        type=str,
+        help=''' The original glibc/localedata/locales/translit_compat
+        file.''')
+    PARSER.add_argument(
+        '-o', '--output_file',
+        nargs='?',
+        type=str,
+        default='translit_compat.new',
+        help='''The new translit_compat file, default: %(default)s.  If the
+        original glibc/localedata/locales/translit_compat file has
+        been given as an option, the header up to the
+        “translit_start” line and the tail from the “translit_end”
+        line to the end of the file will be copied unchanged into the
+        output file.  ''')
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    unicode_utils.fill_attributes(ARGS.unicode_data_file)
+    HEAD = TAIL = ''
+    if ARGS.input_file:
+        (HEAD, TAIL) = read_input_file(ARGS.input_file)
+    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+        output_transliteration(TRANSLIT_FILE)
+        output_tail(TRANSLIT_FILE, tail=TAIL)