Fix fingerprint with mixed int and str keys
Add a fallback hash function that handles mixed key types in dictionaries and lists, ensuring consistent hashing across different data structures. When the fallback is used, the returned hash is prefixed with "HO_" to indicate it.
This commit is contained in:
@@ -4,11 +4,37 @@ Various dictionary, object and list hashers
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from typing import Any
|
from typing import Any, cast, Sequence
|
||||||
|
|
||||||
|
|
||||||
|
def hash_object(obj: Any) -> str:
|
||||||
|
"""
|
||||||
|
Create a hash for any dict or list with mixed key types
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
obj {Any} -- dict, list or tuple (may be nested) to hash; any other value is hashed via its repr
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str -- sha256 hex digest of the normalized object
|
||||||
|
"""
|
||||||
|
def normalize(o: Any) -> Any:
|
||||||
|
if isinstance(o, dict):
|
||||||
|
# Sort by repr of keys to handle mixed types (str, int, etc.)
|
||||||
|
o = cast(dict[Any, Any], o)
|
||||||
|
return tuple(sorted((repr(k), normalize(v)) for k, v in o.items()))
|
||||||
|
if isinstance(o, (list, tuple)):
|
||||||
|
o = cast(Sequence[Any], o)
|
||||||
|
return tuple(normalize(item) for item in o)
|
||||||
|
return repr(o)
|
||||||
|
|
||||||
|
normalized = normalize(obj)
|
||||||
|
return hashlib.sha256(str(normalized).encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def dict_hash_frozen(data: dict[Any, Any]) -> int:
|
def dict_hash_frozen(data: dict[Any, Any]) -> int:
|
||||||
"""
|
"""
|
||||||
|
NOT RECOMMENDED, use dict_hash_crc or hash_object instead
|
||||||
|
If used, DO NOT CHANGE
|
||||||
hash a dict via freeze
|
hash a dict via freeze
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -22,18 +48,25 @@ def dict_hash_frozen(data: dict[Any, Any]) -> int:
|
|||||||
|
|
||||||
def dict_hash_crc(data: dict[Any, Any] | list[Any]) -> str:
|
def dict_hash_crc(data: dict[Any, Any] | list[Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Create a sha256 hash over dict
|
Legacy Method, must be kept for fallback
|
||||||
|
Create a sha256 hash over dict or list
|
||||||
alternative for
|
alternative for
|
||||||
dict_hash_frozen
|
dict_hash_frozen
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
data (dict | list): _description_
|
data (dict[Any, Any] | list[Any]): _description_
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: _description_
|
str: sha256 hash, prefixed with HO_ if fallback used
|
||||||
"""
|
"""
|
||||||
return hashlib.sha256(
|
try:
|
||||||
json.dumps(data, sort_keys=True, ensure_ascii=True).encode('utf-8')
|
return hashlib.sha256(
|
||||||
).hexdigest()
|
# IT IS IMPORTANT THAT THE BELOW CALL STAYS THE SAME AND DOES NOT CHANGE OR WE WILL GET DIFFERENT HASHES
|
||||||
|
# separators=(',', ':') to get rid of spaces, but if this is used the hash will be different, DO NOT ADD
|
||||||
|
json.dumps(data, sort_keys=True, ensure_ascii=True, default=str).encode('utf-8')
|
||||||
|
).hexdigest()
|
||||||
|
except TypeError:
|
||||||
|
# Fallback to different hasher, will return DIFFERENT hash than above, so only usable in int/str key mixes
|
||||||
|
return "HO_" + hash_object(data)
|
||||||
|
|
||||||
# __END__
|
# __END__
|
||||||
|
|||||||
@@ -58,7 +58,12 @@ def make_unique_list_of_dicts(dict_list: list[Any]) -> list[Any]:
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# try json dumps, can fail with int and str index types
|
# try json dumps, can fail with int and str index types
|
||||||
return list({json.dumps(d, sort_keys=True, ensure_ascii=True): d for d in dict_list}.values())
|
return list(
|
||||||
|
{
|
||||||
|
json.dumps(d, sort_keys=True, ensure_ascii=True, separators=(',', ':')): d
|
||||||
|
for d in dict_list
|
||||||
|
}.values()
|
||||||
|
)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
# Fallback for non-serializable entries, slow but works
|
# Fallback for non-serializable entries, slow but works
|
||||||
unique: list[Any] = []
|
unique: list[Any] = []
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ test list helpers
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from corelibs.debug_handling.dump_data import dump_data
|
from corelibs.debug_handling.dump_data import dump_data
|
||||||
from corelibs.iterator_handling.list_helpers import is_list_in_list, convert_to_list, make_unique_list_of_dicts
|
from corelibs.iterator_handling.list_helpers import is_list_in_list, convert_to_list, make_unique_list_of_dicts
|
||||||
|
from corelibs.iterator_handling.fingerprint import dict_hash_crc
|
||||||
|
|
||||||
|
|
||||||
def __test_is_list_in_list_a():
|
def __test_is_list_in_list_a():
|
||||||
@@ -29,7 +30,8 @@ def __make_unique_list_of_dicts():
|
|||||||
{"a": 3, "b": 4, "nested": {"x": 30, "y": 40}}
|
{"a": 3, "b": 4, "nested": {"x": 30, "y": 40}}
|
||||||
]
|
]
|
||||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
dhf = dict_hash_crc(unique_dicts)
|
||||||
|
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||||
|
|
||||||
dict_list = [
|
dict_list = [
|
||||||
{"a": 1, 1: "one"},
|
{"a": 1, 1: "one"},
|
||||||
@@ -37,7 +39,8 @@ def __make_unique_list_of_dicts():
|
|||||||
{"a": 2, 1: "one"}
|
{"a": 2, 1: "one"}
|
||||||
]
|
]
|
||||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
dhf = dict_hash_crc(unique_dicts)
|
||||||
|
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||||
|
|
||||||
dict_list = [
|
dict_list = [
|
||||||
{"a": 1, "b": [1, 2, 3]},
|
{"a": 1, "b": [1, 2, 3]},
|
||||||
@@ -46,7 +49,8 @@ def __make_unique_list_of_dicts():
|
|||||||
1, 2, "String", 1, "Foobar"
|
1, 2, "String", 1, "Foobar"
|
||||||
]
|
]
|
||||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
dhf = dict_hash_crc(unique_dicts)
|
||||||
|
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||||
|
|
||||||
dict_list: list[Any] = [
|
dict_list: list[Any] = [
|
||||||
[],
|
[],
|
||||||
@@ -59,7 +63,17 @@ def __make_unique_list_of_dicts():
|
|||||||
{"a": {}},
|
{"a": {}},
|
||||||
]
|
]
|
||||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
dhf = dict_hash_crc(unique_dicts)
|
||||||
|
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||||
|
|
||||||
|
dict_list: list[Any] = [
|
||||||
|
(1, 2),
|
||||||
|
(1, 2),
|
||||||
|
(2, 3),
|
||||||
|
]
|
||||||
|
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||||
|
dhf = dict_hash_crc(unique_dicts)
|
||||||
|
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -4,7 +4,101 @@ tests for corelibs.iterator_handling.fingerprint
|
|||||||
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
import pytest
|
import pytest
|
||||||
from corelibs.iterator_handling.fingerprint import dict_hash_frozen, dict_hash_crc
|
from corelibs.iterator_handling.fingerprint import dict_hash_frozen, dict_hash_crc, hash_object
|
||||||
|
|
||||||
|
|
||||||
|
class TestHashObject:
|
||||||
|
"""Tests for hash_object function"""
|
||||||
|
|
||||||
|
def test_hash_object_simple_dict(self):
|
||||||
|
"""Test hashing a simple dictionary with hash_object"""
|
||||||
|
data = {"key1": "value1", "key2": "value2"}
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64 # SHA256 produces 64 hex characters
|
||||||
|
|
||||||
|
def test_hash_object_mixed_keys(self):
|
||||||
|
"""Test hash_object with mixed int and string keys"""
|
||||||
|
data = {"key1": "value1", 1: "value2", 2: "value3"}
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_hash_object_consistency(self):
|
||||||
|
"""Test that hash_object produces consistent results"""
|
||||||
|
data = {"str_key": "value", 123: "number_key"}
|
||||||
|
hash1 = hash_object(data)
|
||||||
|
hash2 = hash_object(data)
|
||||||
|
|
||||||
|
assert hash1 == hash2
|
||||||
|
|
||||||
|
def test_hash_object_order_independence(self):
|
||||||
|
"""Test that hash_object is order-independent"""
|
||||||
|
data1 = {"a": 1, 1: "one", "b": 2, 2: "two"}
|
||||||
|
data2 = {2: "two", "b": 2, 1: "one", "a": 1}
|
||||||
|
hash1 = hash_object(data1)
|
||||||
|
hash2 = hash_object(data2)
|
||||||
|
|
||||||
|
assert hash1 == hash2
|
||||||
|
|
||||||
|
def test_hash_object_list_of_dicts_mixed_keys(self):
|
||||||
|
"""Test hash_object with list of dicts containing mixed keys"""
|
||||||
|
data = [
|
||||||
|
{"name": "item1", 1: "value1"},
|
||||||
|
{"name": "item2", 2: "value2"}
|
||||||
|
]
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_hash_object_nested_mixed_keys(self):
|
||||||
|
"""Test hash_object with nested structures containing mixed keys"""
|
||||||
|
data = {
|
||||||
|
"outer": {
|
||||||
|
"inner": "value",
|
||||||
|
1: "mixed_key"
|
||||||
|
},
|
||||||
|
2: "another_mixed"
|
||||||
|
}
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_hash_object_different_data(self):
|
||||||
|
"""Test that different data produces different hashes"""
|
||||||
|
data1 = {"key": "value", 1: "one"}
|
||||||
|
data2 = {"key": "value", 2: "two"}
|
||||||
|
hash1 = hash_object(data1)
|
||||||
|
hash2 = hash_object(data2)
|
||||||
|
|
||||||
|
assert hash1 != hash2
|
||||||
|
|
||||||
|
def test_hash_object_complex_nested(self):
|
||||||
|
"""Test hash_object with complex nested structures"""
|
||||||
|
data = {
|
||||||
|
"level1": {
|
||||||
|
"level2": {
|
||||||
|
1: "value",
|
||||||
|
"key": [1, 2, {"nested": "deep", 3: "int_key"}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_hash_object_list_with_tuples(self):
|
||||||
|
"""Test hash_object with lists containing tuples"""
|
||||||
|
data = [("a", 1), ("b", 2), {1: "mixed", "key": "value"}]
|
||||||
|
result = hash_object(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
|
||||||
class TestDictHashFrozen:
|
class TestDictHashFrozen:
|
||||||
@@ -279,6 +373,116 @@ class TestDictHashCrc:
|
|||||||
assert isinstance(result, str)
|
assert isinstance(result, str)
|
||||||
assert len(result) == 64
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_mixed_keys(self):
|
||||||
|
"""Test dict_hash_crc fallback with mixed int and string keys"""
|
||||||
|
data = {"key1": "value1", 1: "value2", 2: "value3"}
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
# Fallback prefixes with "HO_"
|
||||||
|
assert result.startswith("HO_")
|
||||||
|
# Hash should be 64 chars + 3 char prefix = 67 total
|
||||||
|
assert len(result) == 67
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_consistency(self):
|
||||||
|
"""Test that fallback produces consistent hashes"""
|
||||||
|
data = {"str_key": "value", 123: "number_key", 456: "another"}
|
||||||
|
hash1 = dict_hash_crc(data)
|
||||||
|
hash2 = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert hash1 == hash2
|
||||||
|
assert hash1.startswith("HO_")
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_order_independence(self):
|
||||||
|
"""Test that fallback is order-independent for mixed-key dicts"""
|
||||||
|
data1 = {"a": 1, 1: "one", "b": 2, 2: "two"}
|
||||||
|
data2 = {2: "two", "b": 2, 1: "one", "a": 1}
|
||||||
|
hash1 = dict_hash_crc(data1)
|
||||||
|
hash2 = dict_hash_crc(data2)
|
||||||
|
|
||||||
|
assert hash1 == hash2
|
||||||
|
assert hash1.startswith("HO_")
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_list_of_dicts_mixed_keys(self):
|
||||||
|
"""Test fallback with list of dicts containing mixed keys"""
|
||||||
|
data = [
|
||||||
|
{"name": "item1", 1: "value1"},
|
||||||
|
{"name": "item2", 2: "value2"},
|
||||||
|
{3: "value3", "type": "mixed"}
|
||||||
|
]
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert result.startswith("HO_")
|
||||||
|
assert len(result) == 67
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_nested_mixed_keys(self):
|
||||||
|
"""Test fallback with nested dicts containing mixed keys"""
|
||||||
|
data = {
|
||||||
|
"outer": {
|
||||||
|
"inner": "value",
|
||||||
|
1: "mixed_key"
|
||||||
|
},
|
||||||
|
2: "another_mixed"
|
||||||
|
}
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert result.startswith("HO_")
|
||||||
|
assert len(result) == 67
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_different_data(self):
|
||||||
|
"""Test that different mixed-key data produces different hashes"""
|
||||||
|
data1 = {"key": "value", 1: "one"}
|
||||||
|
data2 = {"key": "value", 2: "two"}
|
||||||
|
hash1 = dict_hash_crc(data1)
|
||||||
|
hash2 = dict_hash_crc(data2)
|
||||||
|
|
||||||
|
assert hash1 != hash2
|
||||||
|
assert hash1.startswith("HO_")
|
||||||
|
assert hash2.startswith("HO_")
|
||||||
|
|
||||||
|
def test_dict_hash_crc_fallback_complex_structure(self):
|
||||||
|
"""Test fallback with complex nested structure with mixed keys"""
|
||||||
|
data = [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
1: "first",
|
||||||
|
"data": {
|
||||||
|
"nested": "value",
|
||||||
|
100: "nested_int_key"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
2: "second",
|
||||||
|
"items": [1, 2, 3]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert result.startswith("HO_")
|
||||||
|
assert len(result) == 67
|
||||||
|
|
||||||
|
def test_dict_hash_crc_no_fallback_string_keys_only(self):
|
||||||
|
"""Test that string-only keys don't trigger fallback"""
|
||||||
|
data = {"key1": "value1", "key2": "value2", "key3": "value3"}
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert not result.startswith("HO_")
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
def test_dict_hash_crc_no_fallback_int_keys_only(self):
|
||||||
|
"""Test that int-only keys don't trigger fallback"""
|
||||||
|
data = {1: "one", 2: "two", 3: "three"}
|
||||||
|
result = dict_hash_crc(data)
|
||||||
|
|
||||||
|
assert isinstance(result, str)
|
||||||
|
assert not result.startswith("HO_")
|
||||||
|
assert len(result) == 64
|
||||||
|
|
||||||
|
|
||||||
class TestComparisonBetweenHashFunctions:
|
class TestComparisonBetweenHashFunctions:
|
||||||
"""Tests comparing dict_hash_frozen and dict_hash_crc"""
|
"""Tests comparing dict_hash_frozen and dict_hash_crc"""
|
||||||
|
|||||||
Reference in New Issue
Block a user