Source code for ctnx.misc

# -*- coding: utf-8 -*-

from unicodedata import name as unicode_name, lookup as unicode_lookup, normalize as unicode_normalize
from functools import lru_cache

from .constants import TONES, TONE_NAMES, NO_TONE_CHAR_TRANS, CONFUSABLE_CHAR_TRANS

def normalize_confusables(text: str) -> str:
    """Map confusable characters in ``text`` onto their regular forms.

    Each character is passed through the ``CONFUSABLE_CHAR_TRANS``
    translation table, which is meant to replace similar-looking characters
    and homoglyphs with their equivalent Vietnamese characters and to turn
    small-cap letters into lowercase ones.

    Parameters
    ----------
    text : str
        The possibly obfuscated text

    Returns
    -------
    str
        the translated text
    """
    cleaned = text.translate(CONFUSABLE_CHAR_TRANS)
    return cleaned
def normalize(text: str) -> str:
    """Compose combining Unicode sequences into precomposed characters.

    Parameters
    ----------
    text : str
        The text to normalize

    Returns
    -------
    str
        ``text`` in Unicode normalization form NFC
    """
    composed = unicode_normalize('NFC', text)
    return composed
def remove_tones(text: str) -> str:
    """Strip tone marks from ``text`` while keeping other diacritics.

    Each character is passed through the ``NO_TONE_CHAR_TRANS`` translation
    table, which is meant to replace toned characters with their equivalent
    non-toned characters.

    Parameters
    ----------
    text : str
        The text from which tone marks will be removed

    Returns
    -------
    str
        the translated text
    """
    toneless = text.translate(NO_TONE_CHAR_TRANS)
    return toneless
def remove_diacritics(text: str) -> str:
    """Reduce ``text`` to plain ASCII by dropping every diacritic.

    The letters đ/Đ are mapped to d/D explicitly, the result is decomposed
    with NFKD, and whatever cannot be encoded as ASCII afterwards (combining
    marks, but also any other non-ASCII character) is discarded.

    Parameters
    ----------
    text : str
        The text from which diacritics will be removed

    Returns
    -------
    str
        the ASCII-only text
    """
    # Handle the stroked d by hand before the decomposition step.
    stroke_d_table = str.maketrans('đĐ', 'dD')
    without_stroke = text.translate(stroke_d_table)

    decomposed = unicode_normalize('NFKD', without_stroke)

    # 'ignore' silently drops every code point outside the ASCII range.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode()
@lru_cache(maxsize=160)
def sep_tone_from_char(char: str):
    """Extract the tone mark from a character.

    The tone is detected through the character's Unicode name: the first
    entry of ``TONE_NAMES`` found in the name is removed from it (together
    with a dangling ``WITH``/``AND``), and the remaining name is looked up
    again to obtain the character without the tone mark.

    The returned tone is denoted as the following:
        '': unmarked (ngang)
        '/': acute accent (sắc)
        '\\': grave accent (huyền)
        '?': hook above (hỏi)
        '~': tilde (ngã)
        '.': dot below (nặng)

    Parameters
    ----------
    char : str
        The character from which the tone will be extracted

    Returns
    -------
    tuple
        ``(tone, new_char)``: the tone first, then the same character
        without its tone mark. ``('', char)`` is returned when the character
        has no Unicode name or its name contains none of ``TONE_NAMES``.

    Raises
    ------
    KeyError
        If the name left after removing the tone does not identify a
        Unicode character.
    """
    try:
        name = unicode_name(char)
    except ValueError:
        # Code points without a name cannot be analysed; pass them through.
        return ('', char)

    for ti, tname in enumerate(TONE_NAMES):
        if tname in name:
            tone = TONES[ti]
            nname = name.replace(tname, '')
            break
    else:
        return ('', char)

    # Removing the tone leaves a dangling connector at the end of the name,
    # e.g. "... A WITH " or "... CIRCUMFLEX AND ".
    if nname.endswith('WITH '):
        nname = nname[:-5]
    elif nname.endswith('AND '):
        nname = nname[:-4]

    # A KeyError from the lookup is deliberately left to propagate.
    return (tone, unicode_lookup(nname.strip()))
def separate_tone(text: str, all=False):
    """Extract the tone mark from text.

    The text is normalized first, then scanned character by character. By
    default only the first toned character is stripped of its tone mark and
    scanning stops there.

    The returned tone is denoted as the following:
        '': unmarked (ngang)
        '/': acute accent (sắc)
        '\\': grave accent (huyền)
        '?': hook above (hỏi)
        '~': tilde (ngã)
        '.': dot below (nặng)

    Parameters
    ----------
    text : str
        The text from which the tone will be extracted
    all : bool, default : False
        If set to True, the tone marks of all characters are removed and the
        last tone found is returned instead of the first one

    Returns
    -------
    tuple
        a tuple of the text without the removed tone mark(s) and the tone;
        the tone is ``''`` when no toned character was found
    """
    text = normalize(text)
    chars = list(text)
    # Kept separate from the per-character result so that an untoned
    # character following a toned one cannot reset the tone to ''.
    found_tone = ''
    for i, lett in enumerate(text):
        char_tone, new_char = sep_tone_from_char(lett)
        if char_tone == '':
            continue
        chars[i] = new_char
        found_tone = char_tone
        if not all:
            break
    return (''.join(chars), found_tone)