diff --git a/src/corelibs/iterator_handling/fingerprint.py b/src/corelibs/iterator_handling/fingerprint.py index 63f2dbc..8b828d5 100644 --- a/src/corelibs/iterator_handling/fingerprint.py +++ b/src/corelibs/iterator_handling/fingerprint.py @@ -4,11 +4,37 @@ Various dictionary, object and list hashers import json import hashlib -from typing import Any +from typing import Any, cast, Sequence + + +def hash_object(obj: Any) -> str: + """ + Create a hash for any dict or list with mixed key types + + Arguments: + obj {Any} -- _description_ + + Returns: + str -- _description_ + """ + def normalize(o: Any) -> Any: + if isinstance(o, dict): + # Sort by repr of keys to handle mixed types (str, int, etc.) + o = cast(dict[Any, Any], o) + return tuple(sorted((repr(k), normalize(v)) for k, v in o.items())) + if isinstance(o, (list, tuple)): + o = cast(Sequence[Any], o) + return tuple(normalize(item) for item in o) + return repr(o) + + normalized = normalize(obj) + return hashlib.sha256(str(normalized).encode()).hexdigest() def dict_hash_frozen(data: dict[Any, Any]) -> int: """ + NOT RECOMMENDED, use dict_hash_crc or hash_object instead + If used, DO NOT CHANGE hash a dict via freeze Args: @@ -22,18 +48,25 @@ def dict_hash_frozen(data: dict[Any, Any]) -> int: def dict_hash_crc(data: dict[Any, Any] | list[Any]) -> str: """ - Create a sha256 hash over dict + Legacy Method, must be kept for fallback + Create a sha256 hash over dict or list alternative for dict_hash_frozen Args: - data (dict | list): _description_ + data (dict[Any, Any] | list[Any]): _description_ Returns: - str: _description_ + str: sha256 hash, prefiex with HO_ if fallback used """ - return hashlib.sha256( - json.dumps(data, sort_keys=True, ensure_ascii=True).encode('utf-8') - ).hexdigest() + try: + return hashlib.sha256( + # IT IS IMPORTANT THAT THE BELOW CALL STAYS THE SAME AND DOES NOT CHANGE OR WE WILL GET DIFFERENT HASHES + # separators=(',', ':') to get rid of spaces, but if this is used the hash will be 
different, DO NOT ADD + json.dumps(data, sort_keys=True, ensure_ascii=True, default=str).encode('utf-8') + ).hexdigest() + except TypeError: + # Fallback tod different hasher, will return DIFFERENT hash than above, so only usable in int/str key mixes + return "HO_" + hash_object(data) # __END__ diff --git a/src/corelibs/iterator_handling/list_helpers.py b/src/corelibs/iterator_handling/list_helpers.py index 64635a4..ec7d1ab 100644 --- a/src/corelibs/iterator_handling/list_helpers.py +++ b/src/corelibs/iterator_handling/list_helpers.py @@ -58,7 +58,12 @@ def make_unique_list_of_dicts(dict_list: list[Any]) -> list[Any]: """ try: # try json dumps, can fail with int and str index types - return list({json.dumps(d, sort_keys=True, ensure_ascii=True): d for d in dict_list}.values()) + return list( + { + json.dumps(d, sort_keys=True, ensure_ascii=True, separators=(',', ':')): d + for d in dict_list + }.values() + ) except TypeError: # Fallback for non-serializable entries, slow but works unique: list[Any] = [] diff --git a/test-run/iterator_handling/list_helpers.py b/test-run/iterator_handling/list_helpers.py index ca8f1d9..b2997bf 100644 --- a/test-run/iterator_handling/list_helpers.py +++ b/test-run/iterator_handling/list_helpers.py @@ -5,6 +5,7 @@ test list helpers from typing import Any from corelibs.debug_handling.dump_data import dump_data from corelibs.iterator_handling.list_helpers import is_list_in_list, convert_to_list, make_unique_list_of_dicts +from corelibs.iterator_handling.fingerprint import dict_hash_crc def __test_is_list_in_list_a(): @@ -29,7 +30,8 @@ def __make_unique_list_of_dicts(): {"a": 3, "b": 4, "nested": {"x": 30, "y": 40}} ] unique_dicts = make_unique_list_of_dicts(dict_list) - print(f"Unique dicts: {dump_data(unique_dicts)}") + dhf = dict_hash_crc(unique_dicts) + print(f"Unique dicts: {dump_data(unique_dicts)} [{dhf}]") dict_list = [ {"a": 1, 1: "one"}, @@ -37,7 +39,8 @@ def __make_unique_list_of_dicts(): {"a": 2, 1: "one"} ] unique_dicts = 
class TestHashObject:
    """Tests for hash_object function"""

    def test_hash_object_simple_dict(self):
        """A dict with only string keys gives a sha256 hex string"""
        digest = hash_object({"key1": "value1", "key2": "value2"})

        assert isinstance(digest, str)
        # a sha256 hex digest is always 64 characters long
        assert len(digest) == 64

    def test_hash_object_mixed_keys(self):
        """A dict mixing int and string keys can be hashed"""
        payload = {"key1": "value1", 1: "value2", 2: "value3"}
        digest = hash_object(payload)

        assert isinstance(digest, str)
        assert len(digest) == 64

    def test_hash_object_consistency(self):
        """Hashing the same object twice gives the same hash"""
        payload = {"str_key": "value", 123: "number_key"}
        first = hash_object(payload)
        second = hash_object(payload)

        assert first == second

    def test_hash_object_order_independence(self):
        """The insertion order of the keys does not change the hash"""
        forward = {"a": 1, 1: "one", "b": 2, 2: "two"}
        backward = {2: "two", "b": 2, 1: "one", "a": 1}
        forward_hash = hash_object(forward)
        backward_hash = hash_object(backward)

        assert forward_hash == backward_hash

    def test_hash_object_list_of_dicts_mixed_keys(self):
        """A list holding dicts with mixed keys can be hashed"""
        payload = [
            {"name": "item1", 1: "value1"},
            {"name": "item2", 2: "value2"},
        ]
        digest = hash_object(payload)

        assert isinstance(digest, str)
        assert len(digest) == 64

    def test_hash_object_nested_mixed_keys(self):
        """Mixed keys on more than one nesting level can be hashed"""
        inner = {"inner": "value", 1: "mixed_key"}
        payload = {"outer": inner, 2: "another_mixed"}
        digest = hash_object(payload)

        assert isinstance(digest, str)
        assert len(digest) == 64

    def test_hash_object_different_data(self):
        """Two different objects must not share a hash"""
        left = hash_object({"key": "value", 1: "one"})
        right = hash_object({"key": "value", 2: "two"})

        assert left != right

    def test_hash_object_complex_nested(self):
        """Deeply nested dicts and lists with mixed keys can be hashed"""
        deepest = {"nested": "deep", 3: "int_key"}
        level2 = {1: "value", "key": [1, 2, deepest]}
        payload = {"level1": {"level2": level2}}
        digest = hash_object(payload)

        assert isinstance(digest, str)
        assert len(digest) == 64

    def test_hash_object_list_with_tuples(self):
        """A list that holds tuples and a mixed key dict can be hashed"""
        payload = [("a", 1), ("b", 2), {1: "mixed", "key": "value"}]
        digest = hash_object(payload)

        assert isinstance(digest, str)
        assert len(digest) == 64


class TestDictHashCrc:
    """Fallback tests for dict_hash_crc function"""

    def test_dict_hash_crc_fallback_mixed_keys(self):
        """Mixed int and string keys are hashed through the fallback"""
        digest = dict_hash_crc({"key1": "value1", 1: "value2", 2: "value3"})

        assert isinstance(digest, str)
        # the fallback marks its result with the "HO_" prefix
        assert digest.startswith("HO_")
        # 3 characters prefix + 64 characters hash
        assert len(digest) == 67

    def test_dict_hash_crc_fallback_consistency(self):
        """The fallback gives the same hash for the same data"""
        payload = {"str_key": "value", 123: "number_key", 456: "another"}
        first = dict_hash_crc(payload)
        second = dict_hash_crc(payload)

        assert first == second
        assert first.startswith("HO_")

    def test_dict_hash_crc_fallback_order_independence(self):
        """The fallback hash does not depend on the key insertion order"""
        forward = {"a": 1, 1: "one", "b": 2, 2: "two"}
        backward = {2: "two", "b": 2, 1: "one", "a": 1}
        forward_hash = dict_hash_crc(forward)
        backward_hash = dict_hash_crc(backward)

        assert forward_hash == backward_hash
        assert forward_hash.startswith("HO_")

    def test_dict_hash_crc_fallback_list_of_dicts_mixed_keys(self):
        """A list of mixed key dicts is hashed through the fallback"""
        payload = [
            {"name": "item1", 1: "value1"},
            {"name": "item2", 2: "value2"},
            {3: "value3", "type": "mixed"},
        ]
        digest = dict_hash_crc(payload)

        assert isinstance(digest, str)
        assert digest.startswith("HO_")
        assert len(digest) == 67

    def test_dict_hash_crc_fallback_nested_mixed_keys(self):
        """Nested dicts with mixed keys are hashed through the fallback"""
        inner = {"inner": "value", 1: "mixed_key"}
        payload = {"outer": inner, 2: "another_mixed"}
        digest = dict_hash_crc(payload)

        assert isinstance(digest, str)
        assert digest.startswith("HO_")
        assert len(digest) == 67

    def test_dict_hash_crc_fallback_different_data(self):
        """Different mixed key data gives different fallback hashes"""
        left = dict_hash_crc({"key": "value", 1: "one"})
        right = dict_hash_crc({"key": "value", 2: "two"})

        assert left != right
        assert left.startswith("HO_")
        assert right.startswith("HO_")

    def test_dict_hash_crc_fallback_complex_structure(self):
        """A complex nested structure with mixed keys uses the fallback"""
        first_entry = {
            "id": 1,
            1: "first",
            "data": {"nested": "value", 100: "nested_int_key"},
        }
        second_entry = {"id": 2, 2: "second", "items": [1, 2, 3]}
        digest = dict_hash_crc([first_entry, second_entry])

        assert isinstance(digest, str)
        assert digest.startswith("HO_")
        assert len(digest) == 67

    def test_dict_hash_crc_no_fallback_string_keys_only(self):
        """String only keys are hashed without the fallback"""
        payload = {"key1": "value1", "key2": "value2", "key3": "value3"}
        digest = dict_hash_crc(payload)

        assert isinstance(digest, str)
        assert not digest.startswith("HO_")
        assert len(digest) == 64

    def test_dict_hash_crc_no_fallback_int_keys_only(self):
        """Int only keys are hashed without the fallback"""
        payload = {1: "one", 2: "two", 3: "three"}
        digest = dict_hash_crc(payload)

        assert isinstance(digest, str)
        assert not digest.startswith("HO_")
        assert len(digest) == 64