112 lines
3.2 KiB
Python
112 lines
3.2 KiB
Python
"""
|
|
various string helpers
|
|
"""
|
|
|
|
import unicodedata
|
|
|
|
# lookup cache for the LATIN check: maps a character to True/False
# depending on whether its Unicode name contains "LATIN"
# filled and read by is_latin, which only_latin_chars calls per character
cache_latin_letters = {}
|
|
|
|
|
|
def shorten_string(string, width, placeholder=".."):
    """
    shortens a string to a display width and attaches a placeholder

    the width is counted in display columns: East Asian wide and full width
    characters count as two columns, every other character as one

    Args:
        string (str): string to shorten, other values are converted with str()
        width (int): maximum display width to shorten to
        placeholder (str, optional): string attached in place of the removed
            part. Defaults to '..'.

    Returns:
        str: the unchanged string if it fits into width, else the longest
        leading part that leaves room for the placeholder, followed by the
        placeholder. If width is smaller than the placeholder itself, only
        the placeholder is returned.
    """
    # convert once, the value is needed for measuring and for the output
    text = str(string)
    # the display width already fits, nothing to shorten
    if string_len_cjk(text) <= width:
        return text
    # room left for text once the placeholder is attached; the placeholder is
    # measured in display columns like the text, so that a wide (CJK)
    # placeholder cannot push the result over the requested width
    budget = width - string_len_cjk(placeholder)
    cur_len = 0
    kept = []
    for char in text:
        # display width if this character is added
        cur_len += 2 if unicodedata.east_asian_width(char) in "WF" else 1
        # the width only grows, so the first character that does not fit
        # ends the part we keep
        if cur_len > budget:
            break
        kept.append(char)
    prefix = "".join(kept)
    # return shortened string with placeholder attached
    return f"{prefix}{placeholder}"
|
|
|
|
|
|
def string_len_cjk(string):
    """
    len() on a string counts characters, but for formatting output we need
    the display width: characters that are East Asian wide (W) or
    full width (F) take two columns, all others take one

    Args:
        string (str): string to measure

    Returns:
        int: display width, with wide and full width characters counted twice
    """
    columns = 0
    for char in string:
        # W = wide, F = full width, both are two columns on screen
        if unicodedata.east_asian_width(char) in "WF":
            columns += 2
        else:
            columns += 1
    return columns
|
|
|
|
|
|
def is_latin(uchr):
    """
    checks via the Unicode character name if a character is LATIN based

    the result per character is stored in the module level
    cache_latin_letters dictionary and returned from there on repeated calls

    from
    https://stackoverflow.com/a/3308844/7811993

    Args:
        uchr (str): single character to check

    Returns:
        bool: True if the Unicode name of the character contains "LATIN",
        False if not or if the character has no Unicode name
    """
    try:
        # if we found it in the dictionary return
        return cache_latin_letters[uchr]
    except KeyError:
        # characters without a Unicode name (eg control characters like a
        # line break) raise ValueError in unicodedata.name if no default is
        # given, use an empty name so they are flagged as not LATIN
        found = "LATIN" in unicodedata.name(uchr, "")
        # store in dictionary for this character
        return cache_latin_letters.setdefault(uchr, found)
|
|
|
|
|
|
def only_latin_chars(unistr):
    """
    checks if all letters in a string are LATIN based.
    False for any CJK, Cyrillic, Hebrew, etc letter in the string

    from:
    https://stackoverflow.com/a/3308844/7811993

    Args:
        unistr (str): string to check

    Returns:
        bool: True if every alphabetic character is LATIN based (also if
        there are no alphabetic characters at all), else False
    """
    for letter in unistr:
        # digits, punctuation, spaces, etc are not checked
        if not letter.isalpha():
            continue
        # first non LATIN letter decides the result
        if not is_latin(letter):
            return False
    return True
|
|
|
|
|
|
def format_len(string, length):
    """
    format widths count characters, not display columns, so for strings with
    CJK characters the format length has to be reduced by the extra columns
    the double width characters take up

    Args:
        string (str): string that will be formatted
        length (int): wanted format length

    Returns:
        int: format length adjusted for the double width characters in string
    """
    # width on screen, double width characters counted twice
    display_width = string_len_cjk(string)
    # plain character count
    char_count = len(string)
    # each double width character takes one extra column
    extra_columns = display_width - char_count
    return length - extra_columns
|