diff options
Diffstat (limited to 'localedata/unicode-gen/gen_translit_compat.py')
-rw-r--r-- | localedata/unicode-gen/gen_translit_compat.py | 327 |
1 file changed, 0 insertions, 327 deletions
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_compat file from a UnicodeData file.
# Copyright (C) 2015-2017 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
Generate a translit_compat file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_compat -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_compat file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”

    Returns a (head, tail) pair of strings: head is everything up to
    and including the “translit_start” line, tail is everything from
    the “translit_end” line to the end of the file.
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        # First pass: accumulate lines into head, stopping after
        # the “translit_start” marker line has been included.
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        # Skip the generated middle part until “translit_end”,
        # which becomes the first line of the tail.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        # Everything after “translit_end” also belongs to the tail.
        for line in translit_file:
            tail = tail + line
    return (head, tail)

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.

    If an original input file was given (ARGS.input_file) and a head
    was extracted from it, that head is copied verbatim; otherwise a
    fresh header is generated.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of compatibility characters ')
        translit_file.write('and ligatures.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_compat.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file.

    If an original input file was given and a tail was extracted from
    it, that tail is copied verbatim; otherwise a minimal closing
    sequence is generated.
    '''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def compatibility_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>: translit_font
    <circle>: translit_circle
    <wide>: translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they seem to be not useful for transliteration.

    Returns the list of code points the given code point decomposes
    into, or the empty list if there is no usable compatibility
    decomposition.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    compatibility_tags = (
        '<compat>', '<super>', '<sub>', '<vertical>')
    for compatibility_tag in compatibility_tags:
        if decomposition.startswith(compatibility_tag):
            # Strip “<tag> ” (tag plus the following space) to leave
            # only the space-separated hex code points.
            decomposition = decomposition[len(compatibility_tag)+1:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if (len(decomposed_code_points) > 1
                    and decomposed_code_points[0] == 0x0020
                    and decomposed_code_points[1] >= 0x0300
                    and decomposed_code_points[1] <= 0x03FF):
                # Decomposes into a space followed by a combining character.
                # This is not useful for transliteration.
                return []
            else:
                # Recurse into each decomposed code point; if it has
                # no further decomposition, keep it as is.
                return_value = []
                for decomposed_code_point in decomposed_code_points:
                    cd_code_points = compatibility_decompose(
                        decomposed_code_point)
                    if cd_code_points:
                        return_value += cd_code_points
                    else:
                        return_value += [decomposed_code_point]
                return return_value
    return []

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_compat file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added this special decomposition rules here.

    Takes a list of code points (as a key) and returns the decomposed
    list if a special rule exists, otherwise the input list unchanged.
    '''
    special_decompose_dict = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    if tuple(code_point_list) in special_decompose_dict:
        return special_decompose_dict[tuple(code_point_list)]
    else:
        return code_point_list

def special_ligature_decompose(code_point):
    '''
    Decompositions for ligatures which are not in UnicodeData.txt at
    all but which were used in the original translit_compat file in
    glibc and which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special ligature decomposition rules here.

    Returns the list of decomposed code points for the given ligature
    code point, or a one-element list containing the code point itself
    if no special rule exists.
    '''
    special_ligature_decompose_dict = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # These following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04D4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A,
        #   U+0415 CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # I am not sure what to do with the following ligatures
        # maybe it makes no sense to decompose them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+fe20 COMBINING LIGATURE LEFT HALF
        # U+fe21 COMBINING LIGATURE RIGHT HALF
        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1f672 LIGATURE OPEN ET ORNAMENT
        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    if code_point in special_ligature_decompose_dict:
        return special_ligature_decompose_dict[code_point]
    else:
        return [code_point]

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file.

    For each code point with a usable compatibility decomposition,
    a chain of progressively more decomposed fallbacks is written;
    ligatures without a UnicodeData.txt decomposition fall back to
    special_ligature_decompose().
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        # decomposed_code_points collects successive decomposition
        # stages; each stage is a list of code points and becomes one
        # “…”;“…” alternative in the output line.
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            # Iterate special_decompose() to a fixed point: first on
            # the whole last stage, then element-wise, appending each
            # new stage until nothing changes any more.
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            # Alternatives are separated by “;”, each one quoted.
            for index, code_points in enumerate(decomposed_code_points):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)