Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
setuptools>=68
wheel
numpy>=1.24 # arrays
# Optional dependencies
pandas>=1.5 # dataframes
pyarrow>=12.0.0 # arrow
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
packages=find_packages(where="src"),
package_dir={"": "src"},
include_package_data=True,
install_requires=[],
install_requires=['numpy>=1.24'],
extras_require={
"dataframes": ["pandas"],
"arrow": ["pyarrow"],
Expand Down
96 changes: 92 additions & 4 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import sys
import os
import struct
from unittest.mock import patch

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

import pytest

from src.keyedstablehash.canonical import canonicalize_to_bytes
from src.keyedstablehash.siphash import siphash24
from src.keyedstablehash.stable import stable_keyed_hash
import src.keyedstablehash.canonical as canonical_module

SIPHASH_VECTORS = {
0: "310e0edd47db6f72",
Expand Down Expand Up @@ -55,20 +59,49 @@ def test_canonicalization_handles_sets_and_lists():


def test_rejects_unsupported_type():
class Example:
class ExampleSlots:
__slots__ = ("value",)

def __init__(self, value: int):
self.value = value

with pytest.raises(TypeError):
stable_keyed_hash(Example(1), key=b"\x00" * 16)
# Slots classes do not have __dict__ and are not automatically supported
with pytest.raises(TypeError) as excinfo:
stable_keyed_hash(ExampleSlots(1), key=b"\x00" * 16)
assert "Unsupported type" in str(excinfo.value)


def test_canonical_custom_object_with_dict():
"""Test that objects with __dict__ are canonicalized by class name and vars."""

class ExampleObj:
def __init__(self, value):
self.value = value
self.ignore = None # Just to have multiple fields

obj = ExampleObj(42)
encoded = canonicalize_to_bytes(obj)

# Check tag for Object
assert encoded.startswith(b"O")
# Check that class name is encoded
assert b"ExampleObj" in encoded
# Check that the internal value 42 is present (encoded as int)
assert canonicalize_to_bytes(42) in encoded


def test_encode_length_and_int():
# Zero
assert canonicalize_to_bytes(0) == b"I" + struct.pack("<Q", 1) + b"\x00"
# Positive small
assert canonicalize_to_bytes(123) == canonicalize_to_bytes(123)
# Negative small
assert canonicalize_to_bytes(-123) == canonicalize_to_bytes(-123)
assert canonicalize_to_bytes(0) == canonicalize_to_bytes(0)
# Large integers
large_int = 2**64 + 7
encoded_large = canonicalize_to_bytes(large_int)
assert b"I" in encoded_large
assert len(encoded_large) > 10


def test_feed_canonical_dict_order():
Expand All @@ -83,12 +116,67 @@ def test_feed_canonical_list_vs_tuple():
lst = [1, 2, 3]
t = (1, 2, 3)
assert canonicalize_to_bytes(lst) != canonicalize_to_bytes(t)
assert canonicalize_to_bytes(lst).startswith(b"L")
assert canonicalize_to_bytes(t).startswith(b"T")


def test_feed_canonical_set_order():
s1 = {1, 2, 3}
s2 = {3, 2, 1}
assert canonicalize_to_bytes(s1) == canonicalize_to_bytes(s2)
assert canonicalize_to_bytes(s1).startswith(b"E")


def test_canonical_frozenset():
fs = frozenset([3, 2, 1])
s = {1, 2, 3}
# Frozenset and Set should produce the same content encoding if logic allows,
# or at least be supported. In canonical.py both use _handle_set.
assert canonicalize_to_bytes(fs) == canonicalize_to_bytes(s)


def test_canonical_primitives_full_coverage():
# None
assert canonicalize_to_bytes(None) == b"N"

# Bool
assert canonicalize_to_bytes(True) == b"B\x01"
assert canonicalize_to_bytes(False) == b"B\x00"

# Float
f_val = 1.234
encoded_f = canonicalize_to_bytes(f_val)
assert encoded_f.startswith(b"F")
assert struct.pack("<d", f_val) in encoded_f

# Bytes / Bytearray / Memoryview
raw = b"hello"
expected = b"Y" + struct.pack("<Q", 5) + raw
assert canonicalize_to_bytes(raw) == expected
assert canonicalize_to_bytes(bytearray(raw)) == expected
assert canonicalize_to_bytes(memoryview(raw)) == expected


def test_numpy_normalization_mock():
"""
Simulate presence of numpy to test _normalize_scalar without requiring numpy.
We patch _NUMPY_GENERIC to include a dummy class, then ensure .item() is called.
"""

class FakeNumpyInt:
def __init__(self, val):
self.val = val

def item(self):
return self.val

# Patch the constant in the module
with patch("src.keyedstablehash.canonical._NUMPY_GENERIC", (FakeNumpyInt,)):
fake_np = FakeNumpyInt(99)
# Should be converted to python int 99
encoded = canonicalize_to_bytes(fake_np)
expected = canonicalize_to_bytes(99)
assert encoded == expected


def test_stable_keyed_hash_algo_error():
Expand Down