Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3708.misc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Optimize Morton order computation with hypercube optimization, vectorized decoding, magic number bit manipulation for 2D-5D, and singleton dimension removal, providing 10-45x speedup for typical chunk shapes.
94 changes: 90 additions & 4 deletions src/zarr/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1452,7 +1452,7 @@ def make_slice_selection(selection: Any) -> list[slice]:
def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
# Inspired by compressed morton code as implemented in Neuroglancer
# https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code
bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)
max_coords_bits = max(bits)
input_bit = 0
input_value = z
Expand All @@ -1467,16 +1467,102 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
return tuple(out)


@lru_cache
def decode_morton_vectorized(
z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
) -> npt.NDArray[np.intp]:
"""Vectorized Morton code decoding for multiple z values.

Parameters
----------
z : ndarray
1D array of Morton codes to decode.
chunk_shape : tuple of int
Shape defining the coordinate space.

Returns
-------
ndarray
2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates.
"""
n_dims = len(chunk_shape)
bits = tuple((c - 1).bit_length() for c in chunk_shape)

max_coords_bits = max(bits) if bits else 0
out = np.zeros((len(z), n_dims), dtype=np.intp)

input_bit = 0
for coord_bit in range(max_coords_bits):
for dim in range(n_dims):
if coord_bit < bits[dim]:
# Extract bit at position input_bit from all z values
bit_values = (z >> input_bit) & 1
# Place bit at coord_bit position in dimension dim
out[:, dim] |= bit_values << coord_bit
input_bit += 1

return out


@lru_cache(maxsize=16)
def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
n_total = product(chunk_shape)
order: list[tuple[int, ...]] = []
i = 0
if n_total == 0:
return ()

# Optimization: Remove singleton dimensions to enable magic number usage
# for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand.
singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1)
if singleton_dims:
squeezed_shape = tuple(s for s in chunk_shape if s != 1)
if squeezed_shape:
# Compute Morton order on squeezed shape
squeezed_order = _morton_order(squeezed_shape)
# Expand coordinates to include singleton dimensions (always 0)
expanded: list[tuple[int, ...]] = []
for coord in squeezed_order:
full_coord: list[int] = []
squeezed_idx = 0
for i in range(len(chunk_shape)):
if chunk_shape[i] == 1:
full_coord.append(0)
else:
full_coord.append(coord[squeezed_idx])
squeezed_idx += 1
expanded.append(tuple(full_coord))
return tuple(expanded)
else:
# All dimensions are singletons, just return the single point
return ((0,) * len(chunk_shape),)

n_dims = len(chunk_shape)

# Find the largest power-of-2 hypercube that fits within chunk_shape.
# Within this hypercube, Morton codes are guaranteed to be in bounds.
min_dim = min(chunk_shape)
if min_dim >= 1:
power = min_dim.bit_length() - 1 # floor(log2(min_dim))
hypercube_size = 1 << power # 2^power
n_hypercube = hypercube_size**n_dims
else:
n_hypercube = 0

# Within the hypercube, no bounds checking needed - use vectorized decoding
order: list[tuple[int, ...]]
if n_hypercube > 0:
z_values = np.arange(n_hypercube, dtype=np.intp)
hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
order = [tuple(row) for row in hypercube_coords]
else:
order = []

# For remaining elements, bounds checking is needed
i = n_hypercube
while len(order) < n_total:
m = decode_morton(i, chunk_shape)
if all(x < y for x, y in zip(m, chunk_shape, strict=False)):
order.append(m)
i += 1

return tuple(order)


Expand Down
98 changes: 98 additions & 0 deletions tests/benchmarks/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,101 @@ def test_slice_indexing(

data[:] = 1
benchmark(getitem, data, indexer)


# Benchmark for Morton order optimization with power-of-2 shards
# Morton order is used internally by sharding codec for chunk iteration
morton_shards = (
(16,) * 3, # With 2x2x2 chunks: 8x8x8 = 512 chunks per shard
(32,) * 3, # With 2x2x2 chunks: 16x16x16 = 4096 chunks per shard
)


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", morton_shards, ids=str)
def test_sharded_morton_indexing(
store: Store,
shards: tuple[int, ...],
benchmark: BenchmarkFixture,
) -> None:
"""Benchmark sharded array indexing with power-of-2 chunks per shard.

This benchmark exercises the Morton order iteration path in the sharding
codec, which benefits from the hypercube and vectorization optimizations.
The Morton order cache is cleared before each iteration to measure the
full computation cost.
"""
from zarr.core.indexing import _morton_order

# Create array where each shard contains many small chunks
# e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard
shape = tuple(s * 2 for s in shards) # 2 shards per dimension
chunks = (2,) * 3 # Small chunks to maximize chunks per shard

data = create_array(
store=store,
shape=shape,
dtype="uint8",
chunks=chunks,
shards=shards,
compressors=None,
filters=None,
fill_value=0,
)

data[:] = 1
# Read a sub-shard region to exercise Morton order iteration
indexer = (slice(shards[0]),) * 3

def read_with_cache_clear() -> None:
_morton_order.cache_clear()
getitem(data, indexer)

benchmark(read_with_cache_clear)


# Benchmark with larger chunks_per_shard to make Morton order impact more visible
large_morton_shards = (
(32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard
)


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
def test_sharded_morton_indexing_large(
store: Store,
shards: tuple[int, ...],
benchmark: BenchmarkFixture,
) -> None:
"""Benchmark sharded array indexing with large chunks_per_shard.

Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making
the Morton order computation a more significant portion of total time.
The Morton order cache is cleared before each iteration.
"""
from zarr.core.indexing import _morton_order

# 1x1x1 chunks means chunks_per_shard equals shard shape
shape = tuple(s * 2 for s in shards) # 2 shards per dimension
chunks = (1,) * 3 # 1x1x1 chunks: chunks_per_shard = shards

data = create_array(
store=store,
shape=shape,
dtype="uint8",
chunks=chunks,
shards=shards,
compressors=None,
filters=None,
fill_value=0,
)

data[:] = 1
# Read one full shard
indexer = (slice(shards[0]),) * 3

def read_with_cache_clear() -> None:
_morton_order.cache_clear()
getitem(data, indexer)
Comment on lines +146 to +148
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude's test involves clearing the cache before each benchmark run. Let's pretend that this represents thrashing the now bounded cache.


benchmark(read_with_cache_clear)