Files
reverse-geolocate/bin/utils/string_helpers.py

112 lines
3.2 KiB
Python

"""
various string helpers
"""
import unicodedata
# memoization cache for is_latin: maps a single character to the result of
# its '"LATIN" in unicode name' check, so a character seen before is answered
# from this dict instead of asking unicodedata again
# filled and read by is_latin, which only_latin_chars calls for every letter
cache_latin_letters = {}
def shorten_string(string, width, placeholder=".."):
    """
    shortens a string to a display width and attaches a placeholder

    Width is counted in display columns: East Asian wide (W) and full-width (F)
    characters take two columns, every other character takes one. The
    placeholder is measured the same way, so a wide placeholder cannot push
    the result past width.

    Args:
        string (str): string to shorten, other values are converted with str()
        width (int): maximum display width of the returned string
        placeholder (str, optional): marker attached where text was removed.
            Defaults to '..'.

    Returns:
        str: the string itself if it fits into width, otherwise the longest
        leading part that still leaves room for the placeholder, followed by
        the placeholder. If width is smaller than the placeholder, only the
        placeholder is returned (the result is then wider than width).
    """
    text = str(string)

    def _columns(char):
        # same rule string_len_cjk uses: W and F characters occupy two columns
        return 2 if unicodedata.east_asian_width(char) in ("W", "F") else 1

    # columns available for text once the placeholder is attached
    budget = width - sum(_columns(char) for char in placeholder)
    used = 0
    # number of leading characters whose combined width fits into budget
    keep = 0
    for index, char in enumerate(text):
        used += _columns(char)
        if used > width:
            # the string is too wide: stop scanning and cut at the budget
            return f"{text[:keep]}{placeholder}"
        if used <= budget:
            keep = index + 1
    # the whole string fits, nothing to shorten
    return text
def string_len_cjk(string):
    """
    returns the display width of a string instead of its character count

    len() counts characters, but for column formatting a double width
    character takes the space of two, so East Asian wide (W) and
    full-width (F) characters are counted twice

    Args:
        string (str): string to measure

    Returns:
        int: character count with every double width character counted as two
    """
    double_width = {"W", "F"}
    total = 0
    for char in string:
        if unicodedata.east_asian_width(char) in double_width:
            total += 2
        else:
            total += 1
    return total
def is_latin(uchr):
    """
    checks via the unicode name if a character is a LATIN based character
    from
    https://stackoverflow.com/a/3308844/7811993

    The result is stored in cache_latin_letters, so each character is only
    looked up in the unicode database once.

    Args:
        uchr (str): a single character

    Returns:
        bool: True if the unicode name of the character contains "LATIN",
        False otherwise (also for characters that have no unicode name,
        for example control characters)
    """
    try:
        # answered before, return the cached flag
        return cache_latin_letters[uchr]
    except KeyError:
        pass
    # unicodedata.name raises ValueError for unnamed code points unless a
    # default is given; an empty name simply never contains "LATIN"
    flag = "LATIN" in unicodedata.name(uchr, "")
    cache_latin_letters[uchr] = flag
    return flag
def only_latin_chars(unistr):
    """
    checks if all letters of a string are LATIN based, so it is False for
    any string with CJK, Cyrillic, Hebrew, etc letters
    from:
    https://stackoverflow.com/a/3308844/7811993

    Only alphabetic characters are checked; digits, spaces and punctuation
    are skipped, so a string without any letters counts as LATIN.

    Args:
        unistr (str): string to check

    Returns:
        bool: True if every letter in the string is LATIN based, else False
    """
    for char in unistr:
        # stop at the first letter that is not LATIN based
        if char.isalpha() and not is_latin(char):
            return False
    return True
def format_len(string, length):
    """
    adjusts a format length for a string that holds double width characters

    string formatting pads by character count, so every double width (CJK)
    character in the string has to reduce the format length by one to keep
    the displayed column width the same

    Args:
        string (str): string that will be formatted
        length (int): wanted format length

    Returns:
        int: format length reduced by the number of extra columns the
        double width characters in the string take up
    """
    # display width counts double width characters twice, len() counts them
    # once, the difference is the number of extra columns
    display_width = string_len_cjk(string)
    char_count = len(string)
    extra_columns = display_width - char_count
    return length - extra_columns