about summary refs log tree commit diff
path: root/localedata/unicode-gen/gen_translit_compat.py
diff options
context:
space:
mode:
Diffstat (limited to 'localedata/unicode-gen/gen_translit_compat.py')
-rw-r--r--localedata/unicode-gen/gen_translit_compat.py327
1 files changed, 0 insertions, 327 deletions
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
deleted file mode 100644
index 111e770afc..0000000000
--- a/localedata/unicode-gen/gen_translit_compat.py
+++ /dev/null
@@ -1,327 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-#
-# Generate a translit_compat file from a UnicodeData file.
-# Copyright (C) 2015-2017 Free Software Foundation, Inc.
-# This file is part of the GNU C Library.
-#
-# The GNU C Library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# The GNU C Library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with the GNU C Library; if not, see
-# <http://www.gnu.org/licenses/>.
-
-'''
-Generate a translit_compat file from UnicodeData.txt
-
-To see how this script is used, call it with the “-h” option:
-
-    $ ./gen_translit_compat -h
-    … prints usage message …
-'''
-
-import argparse
-import time
-import unicode_utils
-
-def read_input_file(filename):
-    '''Reads the original glibc translit_compat file to get the
-    original head and tail.
-
-    We want to replace only the part of the file between
-    “translit_start” and “translit_end”
-    '''
-    head = tail = ''
-    with open(filename, mode='r') as translit_file:
-        for line in translit_file:
-            head = head + line
-            if line.startswith('translit_start'):
-                break
-        for line in translit_file:
-            if line.startswith('translit_end'):
-                tail = line
-                break
-        for line in translit_file:
-            tail = tail + line
-    return (head, tail)
-
-def output_head(translit_file, unicode_version, head=''):
-    '''Write the header of the output file, i.e. the part of the file
-    before the “translit_start” line.
-    '''
-    if ARGS.input_file and head:
-        translit_file.write(head)
-    else:
-        translit_file.write('escape_char /\n')
-        translit_file.write('comment_char %\n')
-        translit_file.write(unicode_utils.COMMENT_HEADER)
-        translit_file.write('\n')
-        translit_file.write('% Transliterations of compatibility characters ')
-        translit_file.write('and ligatures.\n')
-        translit_file.write('% Generated automatically from UnicodeData.txt '
-                            + 'by gen_translit_compat.py '
-                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
-                            + 'for Unicode {:s}.\n'.format(unicode_version))
-        translit_file.write('\n')
-        translit_file.write('LC_CTYPE\n')
-        translit_file.write('\n')
-        translit_file.write('translit_start\n')
-
-def output_tail(translit_file, tail=''):
-    '''Write the tail of the output file'''
-    if ARGS.input_file and tail:
-        translit_file.write(tail)
-    else:
-        translit_file.write('translit_end\n')
-        translit_file.write('\n')
-        translit_file.write('END LC_CTYPE\n')
-
-def compatibility_decompose(code_point):
-    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
-
-    “The compatibility decomposition is formed by recursively applying
-    the canonical and compatibility mappings, then applying the
-    Canonical Ordering Algorithm.”
-
-    We don’t do the canonical decomposition here because this is
-    done in gen_translit_combining.py to generate translit_combining.
-
-    And we ignore some of the possible compatibility formatting tags
-    here. Some of them are used in other translit_* files, not
-    translit_compat:
-
-    <font>:   translit_font
-    <circle>: translit_circle
-    <wide>:   translit_wide
-    <narrow>: translit_narrow
-    <square>: translit_cjk_compat
-    <fraction>: translit_fraction
-
-    And we ignore
-
-    <noBreak>, <initial>, <medial>, <final>, <isolated>
-
-    because they seem to be not useful for transliteration.
-    '''
-    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
-        code_point]['decomposition']
-    compatibility_tags = (
-        '<compat>', '<super>', '<sub>', '<vertical>')
-    for compatibility_tag in compatibility_tags:
-        if decomposition.startswith(compatibility_tag):
-            decomposition = decomposition[len(compatibility_tag)+1:]
-            decomposed_code_points = [int(x, 16)
-                                      for x in decomposition.split(' ')]
-            if (len(decomposed_code_points) > 1
-                    and decomposed_code_points[0] == 0x0020
-                    and decomposed_code_points[1] >= 0x0300
-                    and decomposed_code_points[1] <= 0x03FF):
-                # Decomposes into a space followed by a combining character.
-                # This is not useful fo transliteration.
-                return []
-            else:
-                return_value = []
-                for index in range(0, len(decomposed_code_points)):
-                    cd_code_points = compatibility_decompose(
-                        decomposed_code_points[index])
-                    if cd_code_points:
-                        return_value += cd_code_points
-                    else:
-                        return_value += [decomposed_code_points[index]]
-                return return_value
-    return []
-
-def special_decompose(code_point_list):
-    '''
-    Decompositions which are not in UnicodeData.txt at all but which
-    were used in the original translit_compat file in glibc and
-    which seem to make sense.  I want to keep the update of
-    translit_compat close to the spirit of the original file,
-    therefore I added this special decomposition rules here.
-    '''
-    special_decompose_dict = {
-        (0x03BC,): [0x0075], # μ → u
-        (0x02BC,): [0x0027], # ʼ → '
-    }
-    if tuple(code_point_list) in special_decompose_dict:
-        return special_decompose_dict[tuple(code_point_list)]
-    else:
-        return code_point_list
-
-def special_ligature_decompose(code_point):
-    '''
-    Decompositions for ligatures which are not in UnicodeData.txt at
-    all but which were used in the original translit_compat file in
-    glibc and which seem to make sense.  I want to keep the update of
-    translit_compat close to the spirit of the original file,
-    therefore I added these special ligature decomposition rules here.
-
-    '''
-    special_ligature_decompose_dict = {
-        0x00E6: [0x0061, 0x0065], # æ → ae
-        0x00C6: [0x0041, 0x0045], # Æ → AE
-        # These following 5 special ligature decompositions were
-        # in the original glibc/localedata/locales/translit_compat file
-        0x0152: [0x004F, 0x0045], # Œ → OE
-        0x0153: [0x006F, 0x0065], # œ → oe
-        0x05F0: [0x05D5, 0x05D5], # װ → וו
-        0x05F1: [0x05D5, 0x05D9], # ױ → וי
-        0x05F2: [0x05D9, 0x05D9], # ײ → יי
-        # The following special ligature decompositions were
-        # not in the original glibc/localedata/locales/translit_compat file
-        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
-        # → U+041D CYRILLIC CAPITAL LETTER EN,
-        #   U+0413 CYRILLIC CAPITAL LETTER GHE
-        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
-        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
-        # → U+043D CYRILLIC SMALL LETTER EN,
-        #   U+0433 CYRILLIC SMALL LETTER GHE
-        0x04A5: [0x043D, 0x0433], # ҥ → нг
-        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
-        # → U+0422 CYRILLIC CAPITAL LETTER TE,
-        #   U+0426 CYRILLIC CAPITAL LETTER TSE
-        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
-        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
-        # → U+0442 CYRILLIC SMALL LETTER TE,
-        #   U+0446 CYRILLIC SMALL LETTER TSE
-        0x04B5: [0x0442, 0x0446], # ҵ → тц
-        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
-        # → U+0410 CYRILLIC CAPITAL LETTER A
-        #   U+0415;CYRILLIC CAPITAL LETTER IE
-        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
-        # U+04D5 CYRILLIC SMALL LIGATURE A IE
-        # → U+0430 CYRILLIC SMALL LETTER A,
-        #   U+0435 CYRILLIC SMALL LETTER IE
-        0x04D5: [0x0430, 0x0435], # ӕ → ае
-        # I am not sure what to do with the following ligatures
-        # maybe it makes no sense to decompose them:
-        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
-        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
-        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
-        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
-        # U+fe20 COMBINING LIGATURE LEFT HALF
-        # U+fe21 COMBINING LIGATURE RIGHT HALF
-        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
-        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
-        # U+11176 MAHAJANI LIGATURE SHRI
-        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
-        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
-        # U+1f672 LIGATURE OPEN ET ORNAMENT
-        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
-    }
-    if code_point in special_ligature_decompose_dict:
-        return special_ligature_decompose_dict[code_point]
-    else:
-        return [code_point]
-
-def output_transliteration(translit_file):
-    '''Write the new transliteration to the output file'''
-    translit_file.write('\n')
-    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
-        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
-        decomposed_code_points = [compatibility_decompose(code_point)]
-        if not decomposed_code_points[0]:
-            if special_decompose([code_point]) != [code_point]:
-                decomposed_code_points[0] = special_decompose([code_point])
-        else:
-            special_decomposed_code_points = []
-            while True:
-                special_decomposed_code_points = special_decompose(
-                    decomposed_code_points[-1])
-                if (special_decomposed_code_points
-                        != decomposed_code_points[-1]):
-                    decomposed_code_points.append(
-                        special_decomposed_code_points)
-                    continue
-                special_decomposed_code_points = []
-                for decomposed_code_point in decomposed_code_points[-1]:
-                    special_decomposed_code_points += special_decompose(
-                        [decomposed_code_point])
-                if (special_decomposed_code_points
-                        == decomposed_code_points[-1]):
-                    break
-                decomposed_code_points.append(
-                    special_decomposed_code_points)
-        if decomposed_code_points[0]:
-            translit_file.write('% {:s}\n'.format(name))
-            translit_file.write('{:s} '.format(
-                unicode_utils.ucs_symbol(code_point)))
-            for index in range(0, len(decomposed_code_points)):
-                if index > 0:
-                    translit_file.write(';')
-                translit_file.write('"')
-                for decomposed_code_point in decomposed_code_points[index]:
-                    translit_file.write('{:s}'.format(
-                        unicode_utils.ucs_symbol(decomposed_code_point)))
-                translit_file.write('"')
-            translit_file.write('\n')
-        elif 'LIGATURE' in name and 'ARABIC' not in name:
-            decomposed_code_points = special_ligature_decompose(code_point)
-            if decomposed_code_points[0] != code_point:
-                translit_file.write('% {:s}\n'.format(name))
-                translit_file.write('{:s} '.format(
-                    unicode_utils.ucs_symbol(code_point)))
-                translit_file.write('"')
-                for decomposed_code_point in decomposed_code_points:
-                    translit_file.write('{:s}'.format(
-                        unicode_utils.ucs_symbol(decomposed_code_point)))
-                translit_file.write('"')
-                translit_file.write('\n')
-            else:
-                print('Warning: unhandled ligature: {:x} {:s}'.format(
-                    code_point, name))
-    translit_file.write('\n')
-
-if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(
-        description='''
-        Generate a translit_compat file from UnicodeData.txt.
-        ''')
-    PARSER.add_argument(
-        '-u', '--unicode_data_file',
-        nargs='?',
-        type=str,
-        default='UnicodeData.txt',
-        help=('The UnicodeData.txt file to read, '
-              + 'default: %(default)s'))
-    PARSER.add_argument(
-        '-i', '--input_file',
-        nargs='?',
-        type=str,
-        help=''' The original glibc/localedata/locales/translit_compat
-        file.''')
-    PARSER.add_argument(
-        '-o', '--output_file',
-        nargs='?',
-        type=str,
-        default='translit_compat.new',
-        help='''The new translit_compat file, default: %(default)s.  If the
-        original glibc/localedata/locales/translit_compat file has
-        been given as an option, the header up to the
-        “translit_start” line and the tail from the “translit_end”
-        line to the end of the file will be copied unchanged into the
-        output file.  ''')
-    PARSER.add_argument(
-        '--unicode_version',
-        nargs='?',
-        required=True,
-        type=str,
-        help='The Unicode version of the input files used.')
-    ARGS = PARSER.parse_args()
-
-    unicode_utils.fill_attributes(ARGS.unicode_data_file)
-    HEAD = TAIL = ''
-    if ARGS.input_file:
-        (HEAD, TAIL) = read_input_file(ARGS.input_file)
-    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
-        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
-        output_transliteration(TRANSLIT_FILE)
-        output_tail(TRANSLIT_FILE, tail=TAIL)