Move functions into modules

2022-12-21 23:57:02 +09:00
parent 829407bb14
commit b52e31a4dc
5 changed files with 506 additions and 480 deletions
--- a/bin/utils/string_helpers.py
+++ b/bin/utils/string_helpers.py
@@ -0,0 +1,106 @@
+"""
+various string helpers
+"""
+
+import unicodedata
+
+# lookup cache for the result of the LATIN check per character
+# this is used by is_latin (and through it by only_latin_chars)
+cache_latin_letters = {}
+
+def shorten_string(string, width, placeholder='..'):
+    """
+    shortens a string to a display width and appends a placeholder
+
+    Wide and fullwidth East Asian characters count as two columns, every
+    other character as one. The placeholder is measured the same way, so the
+    shortened result fits into width whenever the placeholder itself fits.
+
+    Args:
+        string (str): string to shorten, converted with str() first
+        width (int): display width to shorten to
+        placeholder (str, optional): marker appended in place of the removed
+            part. Defaults to '..'.
+
+    Returns:
+        str: the unchanged string if it fits into width, else the longest
+            leading part that leaves room for the placeholder, followed by
+            the placeholder. If the placeholder alone is wider than width
+            only the placeholder is returned.
+    """
+    def char_width(char):
+        # wide (W) and fullwidth (F) characters take two columns
+        return 2 if unicodedata.east_asian_width(char) in "WF" else 1
+
+    text = str(string)
+    # columns left for text once the placeholder is attached
+    limit = width - sum(char_width(char) for char in placeholder)
+    kept = []
+    cur_len = 0
+    for char in text:
+        cur_len += char_width(char)
+        if cur_len > width:
+            # too wide: the running width only grows, so nothing after this
+            # point can fit and we can stop scanning
+            out_string = ''.join(kept)
+            return f"{out_string}{placeholder}"
+        if cur_len <= limit:
+            kept.append(char)
+    # the whole string fits into width
+    return text
+
+def string_len_cjk(string):
+    """
+    returns the display width of a string
+
+    len() counts characters, but for column formatting the width is needed:
+    wide (W) and fullwidth (F) East Asian characters count as two columns,
+    all other characters as one
+
+    Args:
+        string (str): string to measure
+
+    Returns:
+        int: length with double width characters counted twice
+    """
+    total = 0
+    for char in string:
+        # two columns for double width characters, one for everything else
+        if unicodedata.east_asian_width(char) in "WF":
+            total += 2
+        else:
+            total += 1
+    return total
+
+def is_latin(uchr):
+    """
+    checks via the unicode character name if a character is LATIN based
+
+    results are memoized in the module level cache_latin_letters dictionary
+
+    from
+    https://stackoverflow.com/a/3308844/7811993
+
+    Args:
+        uchr (str): single character to check
+
+    Returns:
+        bool: True if the unicode name of the character contains LATIN,
+            False otherwise (also for characters that have no name)
+    """
+    try:
+        # if we found it in the dictionary return the stored flag
+        return cache_latin_letters[uchr]
+    except KeyError:
+        # unnamed characters (control characters for example) make
+        # unicodedata.name raise ValueError unless a default is given
+        found = 'LATIN' in unicodedata.name(uchr, '')
+        cache_latin_letters[uchr] = found
+        return found
+
+def only_latin_chars(unistr):
+    """
+    checks if all letters of a string are LATIN based
+
+    characters that are not alphabetic (str.isalpha) are ignored, so a
+    string without any letters counts as LATIN only
+
+    from:
+    https://stackoverflow.com/a/3308844/7811993
+
+    Args:
+        unistr (str): string to check
+
+    Returns:
+        bool: False as soon as one letter is not LATIN based (CJK, Cyrillic,
+            Hebrew, etc), True otherwise
+    """
+    for char in unistr:
+        # only letters are checked, stop at the first non LATIN one
+        if char.isalpha() and not is_latin(char):
+            return False
+    return True
+
+def format_len(string, length):
+    """
+    adjusts a format length for strings with double width characters
+
+    format padding counts characters, so every double width (CJK) character
+    in the string has to reduce the requested length by one
+
+    Args:
+        string (str): string that will be formatted
+        length (int): wanted format length
+
+    Returns:
+        int: format length reduced by the number of extra columns the
+            string takes up
+    """
+    # width counting double width characters twice
+    display_width = string_len_cjk(string)
+    # plain character count
+    char_count = len(string)
+    # the difference is the number of extra columns taken up
+    extra_columns = display_width - char_count
+    return length - extra_columns