%PDF- %PDF-
Direktori : /usr/share/ibus-table/engine/ |
Current File : //usr/share/ibus-table/engine/tabcreatedb.py |
# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-table - The Tables engine for IBus
#
# Copyright (c) 2008-2009 Yu Yuwei <acevery@gmail.com>
# Copyright (c) 2009-2014 Caius "kaio" CHANCE <me@kaio.net>
# Copyright (c) 2012-2015, 2021-2022 Mike FABIAN <mfabian@redhat.com>
# Copyright (c) 2019 Peng Wu <alexepico@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>
'''
Program to create sqlite databases from the table sources
'''

from typing import Tuple
from typing import List
from typing import Iterable
from typing import Dict
from typing import Any
from typing import TextIO
import os
import sys
import bz2
import re
import argparse

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import tabsqlitedb

_INVALID_KEYNAME_CHARS = " \t\r\n\"$&<>,+=#!()'|{}[]?~`;%\\"


def gconf_valid_keyname(keyname: str) -> bool:
    """
    Keynames must be ascii, and must not contain any invalid characters

    >>> gconf_valid_keyname('nyannyan')
    True

    >>> gconf_valid_keyname('nyan nyan')
    False

    >>> gconf_valid_keyname('nyannyan[')
    False

    >>> gconf_valid_keyname('nyan\tnyan')
    False
    """
    return not any(char in _INVALID_KEYNAME_CHARS or ord(char) > 127
                   for char in keyname)


class InvalidTableName(Exception):
    """
    Raised when an invalid table name is given
    """
    def __init__(self, name: str) -> None:
        super().__init__()
        self.table_name = name

    def __str__(self) -> str:
        return ('Value of NAME attribute (%s) ' % self.table_name
                + 'cannot contain any of %r ' % _INVALID_KEYNAME_CHARS
                + 'and must be all ascii')


def parse_args() -> Any:
    '''Parse the command line arguments

    Returns the namespace produced by argparse with the attributes
    name, source, extra, pinyin, suggestion, index, only_index and debug.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-n', '--name',
        action='store',
        dest='name',
        default='',
        help=('Specifies the file name for the binary database for the IME. '
              'The default is “%(default)s”. If the file name of the database '
              'is not specified, the file name of the source file before '
              'the first “.” will be appended with “.db” and that will be '
              'used as the file name of the database.'))
    parser.add_argument(
        '-s', '--source',
        action='store',
        dest='source',
        default='',
        help=('Specifies the file which contains the source of the IME. '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-e', '--extra',
        action='store',
        dest='extra',
        default='',
        help=('Specifies the file name for the extra words for the IME. '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-p', '--pinyin',
        action='store',
        dest='pinyin',
        default='/usr/share/ibus-table/data/pinyin_table.txt.bz2',
        help=('Specifies the source file for the pinyin. '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-g', '--suggestion',
        action='store',
        dest='suggestion',
        default='/usr/share/ibus-table/data/phrase.txt.bz2',
        help=('Specifies the source file for the suggestion candidate. '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-o', '--no-create-index',
        action='store_false',
        dest='index',
        default=True,
        help=('Do not create an index for a database '
              '(Only for distribution purposes, '
              'a normal user should not use this flag!). '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-i', '--create-index-only',
        action='store_true',
        dest='only_index',
        default=False,
        help=('Only create an index for an existing database. '
              'Specifying the file name of the binary database '
              'with the -n or --name option is required '
              'when this option is used. '
              'The default is “%(default)s”.'))
    parser.add_argument(
        '-d', '--debug',
        action='store_true',
        dest='debug',
        default=False,
        help=('Print extra debug messages. '
              'The default is “%(default)s”.'))
    return parser.parse_args()


_ARGS = parse_args()

if _ARGS.only_index:
    if not _ARGS.name:
        print('\nPlease specify the file name of the database '
              'you want to create an index on!')
        sys.exit(2)
    if not os.path.exists(_ARGS.name) or not os.path.isfile(_ARGS.name):
        print("\nThe database file '%s' does not exist." % _ARGS.name)
        sys.exit(2)

# Derive the database name from the source file name if none was given:
if not _ARGS.name and _ARGS.source:
    _ARGS.name = os.path.basename(_ARGS.source).split('.')[0] + '.db'

if not _ARGS.name:
    print('\nYou need to specify the file which '
          'contains the source of the IME!')
    sys.exit(2)


class Section:
    '''Helper class for parsing the sections of the tables marked
    with BEGIN_* and END_*.
    '''
    # Actually the “more exact” type for patt is re.Pattern[str] but
    # Python 3.8 fails parsing this, therefore use type “Any” for patt
    # to make it work with Python 3.8 as well:
    patt: Any
    start: str
    end: str
    in_section: bool

    def __init__(self, patt: Any, start: str, end: str):
        self.patt = patt
        self.start = start.strip()
        self.end = end.strip()
        self.in_section = False

    def match(self, line: str) -> bool:
        '''
        Returns True if the line is inside the section and
        matches the pattern of the section.

        The start and end marker lines themselves only toggle the
        “in section” state and never count as a match.
        '''
        if self.in_section:
            if self.end == line.strip():
                self.in_section = False
            elif self.patt.match(line):
                return True
        elif self.start == line.strip():
            self.in_section = True
        return False


def main() -> None:
    '''Main program

    Either only creates the indexes on an existing database
    (--create-index-only) or builds a new database from the table
    source and the optional pinyin, suggestion and extra word files.
    '''
    def debug_print(message: str) -> None:
        '''Print message only when --debug was given'''
        if _ARGS.debug:
            print(message)

    if not _ARGS.only_index:
        try:
            os.unlink(_ARGS.name)
        except OSError:
            # Best effort: usually there is just no old database to remove.
            pass

    debug_print('Processing Database')
    db = tabsqlitedb.TabSqliteDb(filename=_ARGS.name,
                                 user_db='',
                                 create_database=True)

    # Any file name containing “.bz2” is treated as bz2 compressed:
    patt_s = re.compile(r'.*\.bz2')

    def open_text(path: str) -> TextIO:
        '''Open a plain or bz2 compressed file for reading as UTF-8 text'''
        if patt_s.match(path):
            return bz2.open(path, mode='rt', encoding='UTF-8')
        return open(path, mode='r', encoding='UTF-8')

    def ime_property_is_true(key: str) -> bool:
        '''Return True if the IME attribute key has the value “true”
        (compared case insensitively).
        '''
        value = db.ime_properties.get(key)
        # NOTE(review): presumably get() returns None for attributes
        # missing in the table source; confirm against tabsqlitedb.
        # Calling .lower() on such a value would raise AttributeError.
        return isinstance(value, str) and value.lower() == 'true'

    def parse_source(
            f: Iterable[str]) -> Tuple[List[str], List[str], List[str]]:
        '''Split the lines of the table source into the attribute lines,
        the table lines (including the extra table) and the goucima lines.
        '''
        _attri: List[str] = []
        _table: List[str] = []
        _table_extra: List[str] = []
        _gouci: List[str] = []
        patt_com = re.compile(r'^###.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_conf = re.compile(r'[^\t]*=[^\t]*')
        patt_table = re.compile(r'([^\t]+)\t([^\t]+)\t([0-9]+)(\t.*)?$')
        patt_gouci = re.compile(r' *[^\s]+ *\t *[^\s]+ *$')

        sec_conf = Section(
            patt_conf, "BEGIN_DEFINITION", "END_DEFINITION")
        sec_table = Section(
            patt_table, "BEGIN_TABLE", "END_TABLE")
        sec_table_extra = Section(
            patt_table, "BEGIN_TABLE_EXTRA", "END_TABLE_EXTRA")
        sec_gouci = Section(
            patt_gouci, "BEGIN_GOUCI", "END_GOUCI")

        for line in f:
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                for _sec, _list in (
                        (sec_table, _table),
                        (sec_table_extra, _table_extra),
                        (sec_gouci, _gouci),
                        (sec_conf, _attri)):
                    if _sec.match(line):
                        _list.append(line)
                        break

        if not _gouci:
            # The user didn’t provide goucima (goucima = 構詞碼 =
            # “word formation keys”) in the table source, so we use
            # the longest encoding for a single character as the
            # goucima for that character.
            #
            # Example:
            #
            # wubi-jidian86.txt contains:
            #
            #     a         工      99454797
            #     aaa       工      551000000
            #     aaaa      工      551000000
            #     aaad      工期    5350000
            #     ... and more matches for compounds containing 工
            #
            # The longest key sequence to type 工 as a single
            # character is “aaaa”.  Therefore, the goucima of 工 is
            # “aaaa” (There is one other character with the same goucima
            # in wubi-jidian86.txt, 㠭 also has the goucima “aaaa”).
            gouci_dict: Dict[str, str] = {}
            for line in _table:
                res = patt_table.match(line)
                if res and len(res.group(2)) == 1:
                    if res.group(2) in gouci_dict:
                        if len(res.group(1)) > len(gouci_dict[res.group(2)]):
                            gouci_dict[res.group(2)] = res.group(1)
                    else:
                        gouci_dict[res.group(2)] = res.group(1)
            for key in gouci_dict:
                _gouci.append('%s\t%s' %(key, gouci_dict[key]))
            _gouci.sort()
        # The extra table is merged only after the goucima were derived,
        # so only the main table contributes to the goucima:
        _table += _table_extra
        return (_attri, _table, _gouci)

    def parse_pinyin(f: Iterable[str]) -> List[str]:
        '''Return one “character<TAB>pinyin<TAB>frequency” line for each
        pinyin found in the second column of each line of the pinyin source.
        '''
        _pinyins: List[str] = []
        patt_com = re.compile(r'^#.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_py = re.compile(r'(.*)\t(.*)\t(.*)')
        patt_yin = re.compile(r'[a-z]+[1-5]')

        for line in f:
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                res = patt_py.match(line)
                if res:
                    yins = patt_yin.findall(res.group(2))
                    for yin in yins:
                        _pinyins.append("%s\t%s\t%s" \
                                % (res.group(1), yin, res.group(3)))
        return _pinyins[:]

    def parse_suggestion(f: Iterable[str]) -> List[str]:
        '''Return the “phrase frequency” lines of the suggestion source,
        skipping comments and blank lines.
        '''
        _suggestions: List[str] = []
        patt_com = re.compile(r'^#.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_sg = re.compile(r'(.*)\s+(.*)')

        for line in f:
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                res = patt_sg.match(line)
                if res:
                    phrase = res.group(1)
                    freq = res.group(2)
                    _suggestions.append("%s %s" % (phrase, freq))
        return _suggestions[:]

    def parse_extra(f: Iterable[str]) -> List[str]:
        '''Return the lines of the extra words source which contain a tab,
        skipping comments and blank lines.
        '''
        _extra: List[str] = []
        patt_com = re.compile(r'^###.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_extra = re.compile(r'(.*)\t(.*)')

        for line in f:
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                if patt_extra.match(line):
                    _extra.append(line)
        return _extra

    def pinyin_parser(f: Iterable[str]) -> Iterable[Tuple[str, str, int]]:
        '''Yield (pinyin, character, frequency) tuples'''
        for pinyin_line in f:
            _zi, _pinyin, _freq = pinyin_line.strip().split()
            yield (_pinyin, _zi, int(_freq))

    def suggestion_parser(f: Iterable[str]) -> Iterable[Tuple[str, int]]:
        '''Yield (phrase, frequency) tuples'''
        for suggestion_line in f:
            _phrase, _freq = suggestion_line.strip().split()
            yield (_phrase, int(_freq))

    def phrase_parser(f: Iterable[str]) -> List[Tuple[str, str, int, int]]:
        '''Return a list of (tabkeys, phrase, frequency, 0) tuples.

        The phrase “NOSYMBOL” is replaced by the empty string.
        '''
        phrase_list: List[Tuple[str, str, int, int]] = []
        for line in f:
            xingma, phrase, freq = line.split('\t')[:3]
            if phrase == 'NOSYMBOL':
                phrase = ''
            phrase_list.append((xingma, phrase, int(freq), 0))
        return phrase_list

    def goucima_parser(f: Iterable[str]) -> Iterable[Tuple[str, str]]:
        '''Yield (character, goucima) tuples'''
        for line in f:
            zi, gcm = line.strip().split()
            yield (zi, gcm)

    def attribute_parser(f: Iterable[str]) -> Iterable[Tuple[str, str]]:
        '''Yield (attribute, value) tuples, the attribute in lower case.

        Lines of the form “attribute = value” and “attribute == value”
        are split as before.  A line which fits neither form because the
        value itself contains “=” is split at the first “=” instead of
        raising ValueError.
        '''
        for line in f:
            stripped = line.strip()
            parts = stripped.split('=')
            if len(parts) != 2:
                parts = stripped.split('==')
            if len(parts) != 2:
                head, _sep, tail = stripped.partition('=')
                parts = [head, tail]
            attr = parts[0].strip().lower()
            val = parts[1].strip()
            yield (attr, val)

    def extra_parser(f: Iterable[str]) -> List[Tuple[str, str, int, int]]:
        '''Return a list of (tabkeys, phrase, frequency, 0) tuples for the
        extra words for which the database can compute the tabkeys.
        '''
        extra_list: List[Tuple[str, str, int, int]] = []
        for line in f:
            phrase, freq = line.strip().split()
            _tabkey = db.parse_phrase(phrase)
            if _tabkey:
                extra_list.append((_tabkey, phrase, int(freq), 0))
            else:
                print('No tabkeys found for “%s”, not adding.\n' %phrase)
        return extra_list

    def get_char_prompts(f: Iterable[str]) -> Tuple[str, str]:
        '''
        Returns something like

        ("char_prompts", "{'a': '日', 'b': '日', 'c': '金', ...}")

        i.e. the attribute name "char_prompts" and as its value
        the string representation of a Python dictionary.
        '''
        char_prompts: Dict[str, str] = {}
        start = False
        for line in f:
            if re.match(r'^BEGIN_CHAR_PROMPTS_DEFINITION', line):
                start = True
                continue
            if not start:
                continue
            if re.match(r'^END_CHAR_PROMPTS_DEFINITION', line):
                break
            match = re.search(
                r'^(?P<char>[^\s]+)[\s]+(?P<prompt>[^\s]+)', line)
            if match:
                char_prompts[match.group('char')] = match.group('prompt')
        return ("char_prompts", repr(char_prompts))

    if _ARGS.only_index:
        debug_print('Only create Indexes')
        debug_print('Optimizing database ')
        db.optimize_database()

        debug_print('Create Indexes ')
        db.create_indexes('main')
        debug_print('Done! :D')
        return

    # now we parse the ime source file
    debug_print('\tLoad sources "%s"' % _ARGS.source)
    with open_text(_ARGS.source) as source_file:
        source_str = source_file.read()
    source_str = source_str.replace('\r\n', '\n')
    source = source_str.split('\n')
    # first get config line and table line and goucima line respectively
    debug_print('\tParsing table source file ')
    attri, table, gouci = parse_source(source)

    debug_print('\t get attribute of IME :)')
    attributes = list(attribute_parser(attri))
    attributes.append(get_char_prompts(source))
    debug_print('\t add attributes into DB ')
    db.update_ime(attributes)
    db.create_tables('main')

    # second, we use generators for database generating:
    debug_print('\t get phrases of IME :)')
    phrases = phrase_parser(table)

    # now we add things into db
    debug_print('\t add phrases into DB ')
    db.add_phrases(phrases)

    if ime_property_is_true('user_can_define_phrase'):
        debug_print('\t get goucima of IME :)')
        goucima = goucima_parser(gouci)
        debug_print('\t add goucima into DB ')
        db.add_goucima(goucima)

    if ime_property_is_true('pinyin_mode'):
        debug_print('\tLoad pinyin source \"%s\"' % _ARGS.pinyin)
        # parse_pinyin() reads the whole file into a list, so the file
        # can be closed before the entries are added to the database.
        with open_text(_ARGS.pinyin) as pinyin_s:
            debug_print('\tParsing pinyin source file ')
            pyline = parse_pinyin(pinyin_s)
        debug_print('\tPreapring pinyin entries')
        pinyin = pinyin_parser(pyline)
        debug_print('\t add pinyin into DB ')
        db.add_pinyin(pinyin)

    if ime_property_is_true('suggestion_mode'):
        debug_print('\tLoad suggestion source \"%s\"' % _ARGS.suggestion)
        with open_text(_ARGS.suggestion) as suggestion_s:
            debug_print('\tParsing suggestion source file ')
            sgline = parse_suggestion(suggestion_s)
        debug_print('\tPreapring suggestion entries')
        suggestions = suggestion_parser(sgline)
        debug_print('\t add suggestion candidates into DB ')
        db.add_suggestion(suggestions)

    debug_print('Optimizing database ')
    db.optimize_database()

    if ime_property_is_true('user_can_define_phrase') and _ARGS.extra:
        debug_print('\tPreparing for adding extra words')
        db.create_indexes('main')
        debug_print('\tLoad extra words source "%s"' % _ARGS.extra)
        # Read as UTF-8 like all other inputs instead of relying on
        # the encoding of the current locale:
        with open_text(_ARGS.extra) as extra_s:
            debug_print('\tParsing extra words source file ')
            extraline = parse_extra(extra_s)
        debug_print('\tPreparing extra words lines')
        extrawords = extra_parser(extraline)
        debug_print('\t we have %d extra phrases from source'
                    % len(extrawords))
        # first get the entry of original phrases from
        # phrases-[(xingma, phrase, int(freq), 0)]
        orig_phrases: Dict[str, Tuple[str, str, int, int]] = {}
        for phrase in phrases:
            orig_phrases["%s\t%s" % (phrase[0], phrase[1])] = phrase
        debug_print('\t the len of orig_phrases is: %d'
                    % len(orig_phrases))
        extra_phrases: Dict[str, Tuple[str, str, int, int]] = {}
        for extraword in extrawords:
            extra_phrases["%s\t%s" % (extraword[0], extraword[1])] = extraword
        debug_print('\t the len of extra_phrases is: %d'
                    % len(extra_phrases))
        # Drop the entries already contained in the table.  A new dict
        # is built because removing keys from a dict while iterating
        # over it raises RuntimeError.
        extra_phrases = {
            key: extraword
            for key, extraword in extra_phrases.items()
            if key not in orig_phrases
        }
        debug_print('\t %d extra phrases will be added'
                    % len(extra_phrases))
        new_phrases = list(extra_phrases.values())
        debug_print('\tAdding extra words into DB ')
        db.add_phrases(new_phrases)
        debug_print('Optimizing database ')
        db.optimize_database()

    if _ARGS.index:
        debug_print('Create Indexes ')
        db.create_indexes('main')
    else:
        debug_print('We do not create an index on the database, '
                    'you should only activate this function '
                    'for distribution purposes.')
        db.drop_indexes('main')
    debug_print('Done! :D')


if __name__ == "__main__":
    main()