Fix fingerprint with mixed int and str keys
Create a fallback hash function to handle mixed key types in dictionaries and lists, ensuring consistent hashing across different data structures. Fallback called is prefixed with "HO_" to indicate its usage.
This commit is contained in:
@@ -4,11 +4,37 @@ Various dictionary, object and list hashers
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
from typing import Any
|
||||
from typing import Any, cast, Sequence
|
||||
|
||||
|
||||
def hash_object(obj: Any) -> str:
|
||||
"""
|
||||
Create a hash for any dict or list with mixed key types
|
||||
|
||||
Arguments:
|
||||
obj {Any} -- _description_
|
||||
|
||||
Returns:
|
||||
str -- _description_
|
||||
"""
|
||||
def normalize(o: Any) -> Any:
|
||||
if isinstance(o, dict):
|
||||
# Sort by repr of keys to handle mixed types (str, int, etc.)
|
||||
o = cast(dict[Any, Any], o)
|
||||
return tuple(sorted((repr(k), normalize(v)) for k, v in o.items()))
|
||||
if isinstance(o, (list, tuple)):
|
||||
o = cast(Sequence[Any], o)
|
||||
return tuple(normalize(item) for item in o)
|
||||
return repr(o)
|
||||
|
||||
normalized = normalize(obj)
|
||||
return hashlib.sha256(str(normalized).encode()).hexdigest()
|
||||
|
||||
|
||||
def dict_hash_frozen(data: dict[Any, Any]) -> int:
|
||||
"""
|
||||
NOT RECOMMENDED, use dict_hash_crc or hash_object instead
|
||||
If used, DO NOT CHANGE
|
||||
hash a dict via freeze
|
||||
|
||||
Args:
|
||||
@@ -22,18 +48,25 @@ def dict_hash_frozen(data: dict[Any, Any]) -> int:
|
||||
|
||||
def dict_hash_crc(data: dict[Any, Any] | list[Any]) -> str:
|
||||
"""
|
||||
Create a sha256 hash over dict
|
||||
Legacy Method, must be kept for fallback
|
||||
Create a sha256 hash over dict or list
|
||||
alternative for
|
||||
dict_hash_frozen
|
||||
|
||||
Args:
|
||||
data (dict | list): _description_
|
||||
data (dict[Any, Any] | list[Any]): _description_
|
||||
|
||||
Returns:
|
||||
str: _description_
|
||||
str: sha256 hash, prefiex with HO_ if fallback used
|
||||
"""
|
||||
try:
|
||||
return hashlib.sha256(
|
||||
json.dumps(data, sort_keys=True, ensure_ascii=True).encode('utf-8')
|
||||
# IT IS IMPORTANT THAT THE BELOW CALL STAYS THE SAME AND DOES NOT CHANGE OR WE WILL GET DIFFERENT HASHES
|
||||
# separators=(',', ':') to get rid of spaces, but if this is used the hash will be different, DO NOT ADD
|
||||
json.dumps(data, sort_keys=True, ensure_ascii=True, default=str).encode('utf-8')
|
||||
).hexdigest()
|
||||
except TypeError:
|
||||
# Fallback tod different hasher, will return DIFFERENT hash than above, so only usable in int/str key mixes
|
||||
return "HO_" + hash_object(data)
|
||||
|
||||
# __END__
|
||||
|
||||
@@ -58,7 +58,12 @@ def make_unique_list_of_dicts(dict_list: list[Any]) -> list[Any]:
|
||||
"""
|
||||
try:
|
||||
# try json dumps, can fail with int and str index types
|
||||
return list({json.dumps(d, sort_keys=True, ensure_ascii=True): d for d in dict_list}.values())
|
||||
return list(
|
||||
{
|
||||
json.dumps(d, sort_keys=True, ensure_ascii=True, separators=(',', ':')): d
|
||||
for d in dict_list
|
||||
}.values()
|
||||
)
|
||||
except TypeError:
|
||||
# Fallback for non-serializable entries, slow but works
|
||||
unique: list[Any] = []
|
||||
|
||||
@@ -5,6 +5,7 @@ test list helpers
|
||||
from typing import Any
|
||||
from corelibs.debug_handling.dump_data import dump_data
|
||||
from corelibs.iterator_handling.list_helpers import is_list_in_list, convert_to_list, make_unique_list_of_dicts
|
||||
from corelibs.iterator_handling.fingerprint import dict_hash_crc
|
||||
|
||||
|
||||
def __test_is_list_in_list_a():
|
||||
@@ -29,7 +30,8 @@ def __make_unique_list_of_dicts():
|
||||
{"a": 3, "b": 4, "nested": {"x": 30, "y": 40}}
|
||||
]
|
||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
||||
dhf = dict_hash_crc(unique_dicts)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||
|
||||
dict_list = [
|
||||
{"a": 1, 1: "one"},
|
||||
@@ -37,7 +39,8 @@ def __make_unique_list_of_dicts():
|
||||
{"a": 2, 1: "one"}
|
||||
]
|
||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
||||
dhf = dict_hash_crc(unique_dicts)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||
|
||||
dict_list = [
|
||||
{"a": 1, "b": [1, 2, 3]},
|
||||
@@ -46,7 +49,8 @@ def __make_unique_list_of_dicts():
|
||||
1, 2, "String", 1, "Foobar"
|
||||
]
|
||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
||||
dhf = dict_hash_crc(unique_dicts)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||
|
||||
dict_list: list[Any] = [
|
||||
[],
|
||||
@@ -59,7 +63,17 @@ def __make_unique_list_of_dicts():
|
||||
{"a": {}},
|
||||
]
|
||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)}")
|
||||
dhf = dict_hash_crc(unique_dicts)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||
|
||||
dict_list: list[Any] = [
|
||||
(1, 2),
|
||||
(1, 2),
|
||||
(2, 3),
|
||||
]
|
||||
unique_dicts = make_unique_list_of_dicts(dict_list)
|
||||
dhf = dict_hash_crc(unique_dicts)
|
||||
print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -4,7 +4,101 @@ tests for corelibs.iterator_handling.fingerprint
|
||||
|
||||
from typing import Any
|
||||
import pytest
|
||||
from corelibs.iterator_handling.fingerprint import dict_hash_frozen, dict_hash_crc
|
||||
from corelibs.iterator_handling.fingerprint import dict_hash_frozen, dict_hash_crc, hash_object
|
||||
|
||||
|
||||
class TestHashObject:
|
||||
"""Tests for hash_object function"""
|
||||
|
||||
def test_hash_object_simple_dict(self):
|
||||
"""Test hashing a simple dictionary with hash_object"""
|
||||
data = {"key1": "value1", "key2": "value2"}
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64 # SHA256 produces 64 hex characters
|
||||
|
||||
def test_hash_object_mixed_keys(self):
|
||||
"""Test hash_object with mixed int and string keys"""
|
||||
data = {"key1": "value1", 1: "value2", 2: "value3"}
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
def test_hash_object_consistency(self):
|
||||
"""Test that hash_object produces consistent results"""
|
||||
data = {"str_key": "value", 123: "number_key"}
|
||||
hash1 = hash_object(data)
|
||||
hash2 = hash_object(data)
|
||||
|
||||
assert hash1 == hash2
|
||||
|
||||
def test_hash_object_order_independence(self):
|
||||
"""Test that hash_object is order-independent"""
|
||||
data1 = {"a": 1, 1: "one", "b": 2, 2: "two"}
|
||||
data2 = {2: "two", "b": 2, 1: "one", "a": 1}
|
||||
hash1 = hash_object(data1)
|
||||
hash2 = hash_object(data2)
|
||||
|
||||
assert hash1 == hash2
|
||||
|
||||
def test_hash_object_list_of_dicts_mixed_keys(self):
|
||||
"""Test hash_object with list of dicts containing mixed keys"""
|
||||
data = [
|
||||
{"name": "item1", 1: "value1"},
|
||||
{"name": "item2", 2: "value2"}
|
||||
]
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
def test_hash_object_nested_mixed_keys(self):
|
||||
"""Test hash_object with nested structures containing mixed keys"""
|
||||
data = {
|
||||
"outer": {
|
||||
"inner": "value",
|
||||
1: "mixed_key"
|
||||
},
|
||||
2: "another_mixed"
|
||||
}
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
def test_hash_object_different_data(self):
|
||||
"""Test that different data produces different hashes"""
|
||||
data1 = {"key": "value", 1: "one"}
|
||||
data2 = {"key": "value", 2: "two"}
|
||||
hash1 = hash_object(data1)
|
||||
hash2 = hash_object(data2)
|
||||
|
||||
assert hash1 != hash2
|
||||
|
||||
def test_hash_object_complex_nested(self):
|
||||
"""Test hash_object with complex nested structures"""
|
||||
data = {
|
||||
"level1": {
|
||||
"level2": {
|
||||
1: "value",
|
||||
"key": [1, 2, {"nested": "deep", 3: "int_key"}]
|
||||
}
|
||||
}
|
||||
}
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
def test_hash_object_list_with_tuples(self):
|
||||
"""Test hash_object with lists containing tuples"""
|
||||
data = [("a", 1), ("b", 2), {1: "mixed", "key": "value"}]
|
||||
result = hash_object(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
|
||||
class TestDictHashFrozen:
|
||||
@@ -279,6 +373,116 @@ class TestDictHashCrc:
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 64
|
||||
|
||||
def test_dict_hash_crc_fallback_mixed_keys(self):
|
||||
"""Test dict_hash_crc fallback with mixed int and string keys"""
|
||||
data = {"key1": "value1", 1: "value2", 2: "value3"}
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
# Fallback prefixes with "HO_"
|
||||
assert result.startswith("HO_")
|
||||
# Hash should be 64 chars + 3 char prefix = 67 total
|
||||
assert len(result) == 67
|
||||
|
||||
def test_dict_hash_crc_fallback_consistency(self):
|
||||
"""Test that fallback produces consistent hashes"""
|
||||
data = {"str_key": "value", 123: "number_key", 456: "another"}
|
||||
hash1 = dict_hash_crc(data)
|
||||
hash2 = dict_hash_crc(data)
|
||||
|
||||
assert hash1 == hash2
|
||||
assert hash1.startswith("HO_")
|
||||
|
||||
def test_dict_hash_crc_fallback_order_independence(self):
|
||||
"""Test that fallback is order-independent for mixed-key dicts"""
|
||||
data1 = {"a": 1, 1: "one", "b": 2, 2: "two"}
|
||||
data2 = {2: "two", "b": 2, 1: "one", "a": 1}
|
||||
hash1 = dict_hash_crc(data1)
|
||||
hash2 = dict_hash_crc(data2)
|
||||
|
||||
assert hash1 == hash2
|
||||
assert hash1.startswith("HO_")
|
||||
|
||||
def test_dict_hash_crc_fallback_list_of_dicts_mixed_keys(self):
|
||||
"""Test fallback with list of dicts containing mixed keys"""
|
||||
data = [
|
||||
{"name": "item1", 1: "value1"},
|
||||
{"name": "item2", 2: "value2"},
|
||||
{3: "value3", "type": "mixed"}
|
||||
]
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert result.startswith("HO_")
|
||||
assert len(result) == 67
|
||||
|
||||
def test_dict_hash_crc_fallback_nested_mixed_keys(self):
|
||||
"""Test fallback with nested dicts containing mixed keys"""
|
||||
data = {
|
||||
"outer": {
|
||||
"inner": "value",
|
||||
1: "mixed_key"
|
||||
},
|
||||
2: "another_mixed"
|
||||
}
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert result.startswith("HO_")
|
||||
assert len(result) == 67
|
||||
|
||||
def test_dict_hash_crc_fallback_different_data(self):
|
||||
"""Test that different mixed-key data produces different hashes"""
|
||||
data1 = {"key": "value", 1: "one"}
|
||||
data2 = {"key": "value", 2: "two"}
|
||||
hash1 = dict_hash_crc(data1)
|
||||
hash2 = dict_hash_crc(data2)
|
||||
|
||||
assert hash1 != hash2
|
||||
assert hash1.startswith("HO_")
|
||||
assert hash2.startswith("HO_")
|
||||
|
||||
def test_dict_hash_crc_fallback_complex_structure(self):
|
||||
"""Test fallback with complex nested structure with mixed keys"""
|
||||
data = [
|
||||
{
|
||||
"id": 1,
|
||||
1: "first",
|
||||
"data": {
|
||||
"nested": "value",
|
||||
100: "nested_int_key"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
2: "second",
|
||||
"items": [1, 2, 3]
|
||||
}
|
||||
]
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert result.startswith("HO_")
|
||||
assert len(result) == 67
|
||||
|
||||
def test_dict_hash_crc_no_fallback_string_keys_only(self):
|
||||
"""Test that string-only keys don't trigger fallback"""
|
||||
data = {"key1": "value1", "key2": "value2", "key3": "value3"}
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert not result.startswith("HO_")
|
||||
assert len(result) == 64
|
||||
|
||||
def test_dict_hash_crc_no_fallback_int_keys_only(self):
|
||||
"""Test that int-only keys don't trigger fallback"""
|
||||
data = {1: "one", 2: "two", 3: "three"}
|
||||
result = dict_hash_crc(data)
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert not result.startswith("HO_")
|
||||
assert len(result) == 64
|
||||
|
||||
|
||||
class TestComparisonBetweenHashFunctions:
|
||||
"""Tests comparing dict_hash_frozen and dict_hash_crc"""
|
||||
|
||||
Reference in New Issue
Block a user