From 47730f3c3a6cd9f2436b74d0f828eec353ed2d01 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 00:15:57 -0500 Subject: [PATCH 01/14] perf: Skip bounds check for initial elements in 2^n hypercube --- src/zarr/core/indexing.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index beffa99cfa..a6288ce548 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1470,13 +1470,31 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: @lru_cache def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_total = product(chunk_shape) + n_dims = len(chunk_shape) order: list[tuple[int, ...]] = [] - i = 0 + + # Find the largest power-of-2 hypercube that fits within chunk_shape. + # Within this hypercube, Morton codes are guaranteed to be in bounds. + min_dim = min(chunk_shape) + if min_dim >= 1: + power = min_dim.bit_length() - 1 # floor(log2(min_dim)) + hypercube_size = 1 << power # 2^power + n_hypercube = hypercube_size**n_dims + else: + n_hypercube = 0 + + # Within the hypercube, no bounds checking needed + for i in range(n_hypercube): + order.append(decode_morton(i, chunk_shape)) + + # For remaining elements, bounds checking is needed + i = n_hypercube while len(order) < n_total: m = decode_morton(i, chunk_shape) if all(x < y for x, y in zip(m, chunk_shape, strict=False)): order.append(m) i += 1 + return tuple(order) From 865df2ac15230e24efd97f64d78fbed28a60b805 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 00:32:51 -0500 Subject: [PATCH 02/14] lint:Use a list comprehension rather than a for loop --- src/zarr/core/indexing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index a6288ce548..97c75b1372 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1484,8 +1484,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_hypercube = 0 # Within the hypercube, no bounds checking needed - for i in range(n_hypercube): - order.append(decode_morton(i, chunk_shape)) + order = [decode_morton(i, chunk_shape) for i in range(n_hypercube)] # For remaining elements, bounds checking is needed i = n_hypercube From ce0099c6b08559df9d93df5d66e46b1d3069a1c3 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 00:47:37 -0500 Subject: [PATCH 03/14] pref:Add decode_morton_vectorized --- src/zarr/core/indexing.py | 44 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 97c75b1372..8da5b31f46 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1467,11 +1467,47 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: return tuple(out) +def decode_morton_vectorized( + z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...] +) -> npt.NDArray[np.intp]: + """Vectorized Morton code decoding for multiple z values. + + Parameters + ---------- + z : ndarray + 1D array of Morton codes to decode. + chunk_shape : tuple of int + Shape defining the coordinate space. + + Returns + ------- + ndarray + 2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates. + """ + n_dims = len(chunk_shape) + bits = tuple(math.ceil(math.log2(c)) if c > 1 else 1 for c in chunk_shape) + max_coords_bits = max(bits) + + # Output array: each row is a decoded coordinate + out = np.zeros((len(z), n_dims), dtype=np.intp) + + input_bit = 0 + for coord_bit in range(max_coords_bits): + for dim in range(n_dims): + if coord_bit < bits[dim]: + # Extract bit at position input_bit from all z values + bit_values = (z >> input_bit) & 1 + # Place bit at coord_bit position in dimension dim + out[:, dim] |= bit_values << coord_bit + input_bit += 1 + + return out + + @lru_cache def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_total = product(chunk_shape) n_dims = len(chunk_shape) - order: list[tuple[int, ...]] = [] # Find the largest power-of-2 hypercube that fits within chunk_shape. # Within this hypercube, Morton codes are guaranteed to be in bounds. @@ -1483,8 +1519,10 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: else: n_hypercube = 0 - # Within the hypercube, no bounds checking needed - order = [decode_morton(i, chunk_shape) for i in range(n_hypercube)] + # Within the hypercube, no bounds checking needed - use vectorized decoding + z_values = np.arange(n_hypercube, dtype=np.intp) + hypercube_coords = decode_morton_vectorized(z_values, chunk_shape) + order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords] # For remaining elements, bounds checking is needed i = n_hypercube From 1b1076f136645300d5132839403494ea2920ac13 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 01:06:26 -0500 Subject: [PATCH 04/14] perf:Replace math.log2() with bit_length() --- src/zarr/core/indexing.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 8da5b31f46..c9c7e0f35a 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1452,7 +1452,7 @@ def make_slice_selection(selection: Any) -> list[slice]: def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: # Inspired by compressed morton code as implemented in Neuroglancer # https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code - bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape) + bits = tuple((c - 1).bit_length() for c in chunk_shape) max_coords_bits = max(bits) input_bit = 0 input_value = z @@ -1485,8 +1485,8 @@ def decode_morton_vectorized( 2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates. """ n_dims = len(chunk_shape) - bits = tuple(math.ceil(math.log2(c)) if c > 1 else 1 for c in chunk_shape) - max_coords_bits = max(bits) + bits = tuple((c - 1).bit_length() for c in chunk_shape) + max_coords_bits = max(bits) if bits else 0 # Output array: each row is a decoded coordinate out = np.zeros((len(z), n_dims), dtype=np.intp) @@ -1507,6 +1507,9 @@ def decode_morton_vectorized( @lru_cache def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_total = product(chunk_shape) + if n_total == 0: + return () + n_dims = len(chunk_shape) # Find the largest power-of-2 hypercube that fits within chunk_shape. @@ -1520,9 +1523,12 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_hypercube = 0 # Within the hypercube, no bounds checking needed - use vectorized decoding - z_values = np.arange(n_hypercube, dtype=np.intp) - hypercube_coords = decode_morton_vectorized(z_values, chunk_shape) - order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords] + if n_hypercube > 0: + z_values = np.arange(n_hypercube, dtype=np.intp) + hypercube_coords = decode_morton_vectorized(z_values, chunk_shape) + order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords] + else: + order = [] # For remaining elements, bounds checking is needed i = n_hypercube From 47a68eb8d5b476410bb97703b2c332bd5a18c39f Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 01:38:09 -0500 Subject: [PATCH 05/14] perf:Use magic numbers for 2D and 3D --- src/zarr/core/indexing.py | 85 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index c9c7e0f35a..45fc300bf3 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1467,6 +1467,78 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: return tuple(out) +# Magic numbers for 2D Morton decode: extract every 2nd bit and compact them. +# Bits for dimension d are at positions d, d+2, d+4, ... +_MORTON_2D_MASKS: tuple[int, ...] = ( + 0x5555555555555555, # Extract mask: bits 0, 2, 4, ... + 0x3333333333333333, # Compact stage 1 + 0x0F0F0F0F0F0F0F0F, # Compact stage 2 + 0x00FF00FF00FF00FF, # Compact stage 3 + 0x0000FFFF0000FFFF, # Compact stage 4 +) + +# Magic numbers for 3D Morton decode: extract every 3rd bit and compact them. +# Bits for dimension d are at positions d, d+3, d+6, d+9, ... +# Uses uint64 to handle large mask values correctly with numpy. +_MORTON_3D_MASKS: tuple[np.uint64, ...] = ( + np.uint64(0x1249249249249249), # Extract mask: bits 0, 3, 6, 9, ... + np.uint64(0x30C30C30C30C30C3), # Compact stage 1 + np.uint64(0xF00F00F00F00F00F), # Compact stage 2 + np.uint64(0x00FF0000FF0000FF), # Compact stage 3 + np.uint64(0x00FF00000000FFFF), # Compact stage 4 + np.uint64(0x00000000001FFFFF), # Compact stage 5 +) + + +def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + """Decode 2D Morton codes using magic number bit manipulation. + + This extracts interleaved x,y coordinates from Morton codes using + parallel bit operations instead of bit-by-bit loops. + """ + out = np.zeros((len(z), 2), dtype=np.intp) + + # Extract x (bits 0, 2, 4, ...) and compact to (bits 0, 1, 2, ...) + x = z & _MORTON_2D_MASKS[0] + x = (x | (x >> 1)) & _MORTON_2D_MASKS[1] + x = (x | (x >> 2)) & _MORTON_2D_MASKS[2] + x = (x | (x >> 4)) & _MORTON_2D_MASKS[3] + x = (x | (x >> 8)) & _MORTON_2D_MASKS[4] + out[:, 0] = x + + # Extract y (bits 1, 3, 5, ...) and compact + y = (z >> 1) & _MORTON_2D_MASKS[0] + y = (y | (y >> 1)) & _MORTON_2D_MASKS[1] + y = (y | (y >> 2)) & _MORTON_2D_MASKS[2] + y = (y | (y >> 4)) & _MORTON_2D_MASKS[3] + y = (y | (y >> 8)) & _MORTON_2D_MASKS[4] + out[:, 1] = y + + return out + + +def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + """Decode 3D Morton codes using magic number bit manipulation. + + This extracts interleaved x,y,z coordinates from Morton codes using + parallel bit operations instead of bit-by-bit loops. + """ + # Convert to uint64 for bitwise operations with large masks + z_u64 = z.astype(np.uint64) + out = np.zeros((len(z), 3), dtype=np.intp) + + for dim in range(3): + x = (z_u64 >> dim) & _MORTON_3D_MASKS[0] + x = (x ^ (x >> 2)) & _MORTON_3D_MASKS[1] + x = (x ^ (x >> 4)) & _MORTON_3D_MASKS[2] + x = (x ^ (x >> 8)) & _MORTON_3D_MASKS[3] + x = (x ^ (x >> 16)) & _MORTON_3D_MASKS[4] + x = (x ^ (x >> 32)) & _MORTON_3D_MASKS[5] + out[:, dim] = x + + return out + + def decode_morton_vectorized( z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...] ) -> npt.NDArray[np.intp]: @@ -1486,9 +1558,18 @@ def decode_morton_vectorized( """ n_dims = len(chunk_shape) bits = tuple((c - 1).bit_length() for c in chunk_shape) - max_coords_bits = max(bits) if bits else 0 - # Output array: each row is a decoded coordinate + # Use magic number optimization for 2D/3D with uniform bit widths. + # Magic numbers have bit limits: 2D supports up to 32 bits, 3D up to 21 bits per dimension. + if len(set(bits)) == 1: # All dimensions have same bit width + max_bits = bits[0] if bits else 0 + if n_dims == 2 and max_bits <= 32: + return _decode_morton_2d(z) + if n_dims == 3 and max_bits <= 21: + return _decode_morton_3d(z) + + # Fall back to generic bit-by-bit decoding + max_coords_bits = max(bits) if bits else 0 out = np.zeros((len(z), n_dims), dtype=np.intp) input_bit = 0 From 6fb6d00f6854a27002001ce74bfd441c3a88bc59 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 01:48:12 -0500 Subject: [PATCH 06/14] perf:Add 4D Morton magic numbers --- src/zarr/core/indexing.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 45fc300bf3..01606d44e8 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1489,6 +1489,16 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: np.uint64(0x00000000001FFFFF), # Compact stage 5 ) +# Magic numbers for 4D Morton decode: extract every 4th bit and compact them. +# Bits for dimension d are at positions d, d+4, d+8, d+12, ... +_MORTON_4D_MASKS: tuple[np.uint64, ...] = ( + np.uint64(0x1111111111111111), # Extract mask: bits 0, 4, 8, 12, ... + np.uint64(0x0303030303030303), # Compact stage 1 + np.uint64(0x000F000F000F000F), # Compact stage 2 + np.uint64(0x000000FF000000FF), # Compact stage 3 + np.uint64(0x000000000000FFFF), # Compact stage 4 +) + def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: """Decode 2D Morton codes using magic number bit manipulation. @@ -1539,6 +1549,27 @@ def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return out +def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + """Decode 4D Morton codes using magic number bit manipulation. + + This extracts interleaved x,y,z,w coordinates from Morton codes using + parallel bit operations instead of bit-by-bit loops. + """ + # Convert to uint64 for bitwise operations with large masks + z_u64 = z.astype(np.uint64) + out = np.zeros((len(z), 4), dtype=np.intp) + + for dim in range(4): + x = (z_u64 >> dim) & _MORTON_4D_MASKS[0] + x = (x ^ (x >> 3)) & _MORTON_4D_MASKS[1] + x = (x ^ (x >> 6)) & _MORTON_4D_MASKS[2] + x = (x ^ (x >> 12)) & _MORTON_4D_MASKS[3] + x = (x ^ (x >> 24)) & _MORTON_4D_MASKS[4] + out[:, dim] = x + + return out + + def decode_morton_vectorized( z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...] ) -> npt.NDArray[np.intp]: @@ -1559,14 +1590,16 @@ def decode_morton_vectorized( n_dims = len(chunk_shape) bits = tuple((c - 1).bit_length() for c in chunk_shape) - # Use magic number optimization for 2D/3D with uniform bit widths. - # Magic numbers have bit limits: 2D supports up to 32 bits, 3D up to 21 bits per dimension. + # Use magic number optimization for 2D/3D/4D with uniform bit widths. + # Magic numbers have bit limits: 2D up to 32, 3D up to 21, 4D up to 16 bits per dimension. if len(set(bits)) == 1: # All dimensions have same bit width max_bits = bits[0] if bits else 0 if n_dims == 2 and max_bits <= 32: return _decode_morton_2d(z) if n_dims == 3 and max_bits <= 21: return _decode_morton_3d(z) + if n_dims == 4 and max_bits <= 16: + return _decode_morton_4d(z) # Fall back to generic bit-by-bit decoding max_coords_bits = max(bits) if bits else 0 From db31842b072c9d68038b2c37d1350fd0938e6310 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 01:58:31 -0500 Subject: [PATCH 07/14] perf:Add Morton magic numbers for 5D --- src/zarr/core/indexing.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 01606d44e8..3ef53d8737 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1499,6 +1499,17 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: np.uint64(0x000000000000FFFF), # Compact stage 4 ) +# Magic numbers for 5D Morton decode: extract every 5th bit and compact them. +# Bits for dimension d are at positions d, d+5, d+10, d+15, ... +# Max 12 bits per dimension (64 // 5 = 12), supporting values up to 4095. +_MORTON_5D_MASKS: tuple[np.uint64, ...] = ( + np.uint64(0x1084210842108421), # Extract mask: bits 0, 5, 10, 15, ... + np.uint64(0x0318C6318C6318C63), # Compact stage 1 + np.uint64(0xF03C0F03C0F03C0F), # Compact stage 2 + np.uint64(0x0000FF000FF000FF), # Compact stage 3 + np.uint64(0x0000000000000FFF), # Compact stage 4 (12 bits max) +) + def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: """Decode 2D Morton codes using magic number bit manipulation. @@ -1570,6 +1581,27 @@ def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: return out +def _decode_morton_5d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: + """Decode 5D Morton codes using magic number bit manipulation. + + This extracts interleaved coordinates from Morton codes using + parallel bit operations instead of bit-by-bit loops. + """ + # Convert to uint64 for bitwise operations with large masks + z_u64 = z.astype(np.uint64) + out = np.zeros((len(z), 5), dtype=np.intp) + + for dim in range(5): + x = (z_u64 >> dim) & _MORTON_5D_MASKS[0] + x = (x ^ (x >> 4)) & _MORTON_5D_MASKS[1] + x = (x ^ (x >> 8)) & _MORTON_5D_MASKS[2] + x = (x ^ (x >> 16)) & _MORTON_5D_MASKS[3] + x = (x ^ (x >> 32)) & _MORTON_5D_MASKS[4] + out[:, dim] = x + + return out + + def decode_morton_vectorized( z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...] ) -> npt.NDArray[np.intp]: @@ -1590,8 +1622,8 @@ def decode_morton_vectorized( n_dims = len(chunk_shape) bits = tuple((c - 1).bit_length() for c in chunk_shape) - # Use magic number optimization for 2D/3D/4D with uniform bit widths. - # Magic numbers have bit limits: 2D up to 32, 3D up to 21, 4D up to 16 bits per dimension. + # Use magic number optimization for 2D/3D/4D/5D with uniform bit widths. + # Bit limits are floor(64 / n_dims): 2D=32, 3D=21, 4D=16, 5D=12 bits per dimension. if len(set(bits)) == 1: # All dimensions have same bit width max_bits = bits[0] if bits else 0 if n_dims == 2 and max_bits <= 32: @@ -1600,6 +1632,8 @@ def decode_morton_vectorized( return _decode_morton_3d(z) if n_dims == 4 and max_bits <= 16: return _decode_morton_4d(z) + if n_dims == 5 and max_bits <= 12: + return _decode_morton_5d(z) # Fall back to generic bit-by-bit decoding max_coords_bits = max(bits) if bits else 0 From f9952f123de393c7125e787c8e46bd7ac6f52d8d Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 02:13:18 -0500 Subject: [PATCH 08/14] perf:Remove singleton dimensions to reduce ndims --- src/zarr/core/indexing.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 3ef53d8737..b39752938d 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1658,6 +1658,31 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: if n_total == 0: return () + # Optimization: Remove singleton dimensions to enable magic number usage + # for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand. + singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1) + if singleton_dims: + squeezed_shape = tuple(s for s in chunk_shape if s != 1) + if squeezed_shape: + # Compute Morton order on squeezed shape + squeezed_order = _morton_order(squeezed_shape) + # Expand coordinates to include singleton dimensions (always 0) + order: list[tuple[int, ...]] = [] + for coord in squeezed_order: + full_coord: list[int] = [] + squeezed_idx = 0 + for i in range(len(chunk_shape)): + if chunk_shape[i] == 1: + full_coord.append(0) + else: + full_coord.append(coord[squeezed_idx]) + squeezed_idx += 1 + order.append(tuple(full_coord)) + return tuple(order) + else: + # All dimensions are singletons, just return the single point + return ((0,) * len(chunk_shape),) + n_dims = len(chunk_shape) # Find the largest power-of-2 hypercube that fits within chunk_shape. From aedce5a8e8079aba8f93f29d52a632f40b6e27bc Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 02:25:21 -0500 Subject: [PATCH 09/14] Add changes --- changes/3708.misc.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3708.misc.md diff --git a/changes/3708.misc.md b/changes/3708.misc.md new file mode 100644 index 0000000000..1ea865bdbe --- /dev/null +++ b/changes/3708.misc.md @@ -0,0 +1 @@ +Optimize Morton order computation with hypercube optimization, vectorized decoding, magic number bit manipulation for 2D-5D, and singleton dimension removal, providing 10-45x speedup for typical chunk shapes. From ef182106d490a5f1cbabf1487626849359bb4de6 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 02:30:38 -0500 Subject: [PATCH 10/14] fix:Address type annotation and linting issues --- src/zarr/core/indexing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index b39752938d..a9287e55af 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1667,7 +1667,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: # Compute Morton order on squeezed shape squeezed_order = _morton_order(squeezed_shape) # Expand coordinates to include singleton dimensions (always 0) - order: list[tuple[int, ...]] = [] + expanded: list[tuple[int, ...]] = [] for coord in squeezed_order: full_coord: list[int] = [] squeezed_idx = 0 @@ -1677,8 +1677,8 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: else: full_coord.append(coord[squeezed_idx]) squeezed_idx += 1 - order.append(tuple(full_coord)) - return tuple(order) + expanded.append(tuple(full_coord)) + return tuple(expanded) else: # All dimensions are singletons, just return the single point return ((0,) * len(chunk_shape),) @@ -1696,10 +1696,11 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_hypercube = 0 # Within the hypercube, no bounds checking needed - use vectorized decoding + order: list[tuple[int, ...]] if n_hypercube > 0: z_values = np.arange(n_hypercube, dtype=np.intp) hypercube_coords = decode_morton_vectorized(z_values, chunk_shape) - order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords] + order = [tuple(row) for row in hypercube_coords] else: order = [] From 24dcbd5e662825c49cff6af2930275c46074c351 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 06:32:18 -0500 Subject: [PATCH 11/14] perf:Remove magic number functions --- src/zarr/core/indexing.py | 149 -------------------------------------- 1 file changed, 149 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index a9287e55af..d300a3982e 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1467,141 +1467,6 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]: return tuple(out) -# Magic numbers for 2D Morton decode: extract every 2nd bit and compact them. -# Bits for dimension d are at positions d, d+2, d+4, ... -_MORTON_2D_MASKS: tuple[int, ...] = ( - 0x5555555555555555, # Extract mask: bits 0, 2, 4, ... - 0x3333333333333333, # Compact stage 1 - 0x0F0F0F0F0F0F0F0F, # Compact stage 2 - 0x00FF00FF00FF00FF, # Compact stage 3 - 0x0000FFFF0000FFFF, # Compact stage 4 -) - -# Magic numbers for 3D Morton decode: extract every 3rd bit and compact them. -# Bits for dimension d are at positions d, d+3, d+6, d+9, ... -# Uses uint64 to handle large mask values correctly with numpy. -_MORTON_3D_MASKS: tuple[np.uint64, ...] = ( - np.uint64(0x1249249249249249), # Extract mask: bits 0, 3, 6, 9, ... - np.uint64(0x30C30C30C30C30C3), # Compact stage 1 - np.uint64(0xF00F00F00F00F00F), # Compact stage 2 - np.uint64(0x00FF0000FF0000FF), # Compact stage 3 - np.uint64(0x00FF00000000FFFF), # Compact stage 4 - np.uint64(0x00000000001FFFFF), # Compact stage 5 -) - -# Magic numbers for 4D Morton decode: extract every 4th bit and compact them. -# Bits for dimension d are at positions d, d+4, d+8, d+12, ... -_MORTON_4D_MASKS: tuple[np.uint64, ...] = ( - np.uint64(0x1111111111111111), # Extract mask: bits 0, 4, 8, 12, ... - np.uint64(0x0303030303030303), # Compact stage 1 - np.uint64(0x000F000F000F000F), # Compact stage 2 - np.uint64(0x000000FF000000FF), # Compact stage 3 - np.uint64(0x000000000000FFFF), # Compact stage 4 -) - -# Magic numbers for 5D Morton decode: extract every 5th bit and compact them. -# Bits for dimension d are at positions d, d+5, d+10, d+15, ... -# Max 12 bits per dimension (64 // 5 = 12), supporting values up to 4095. -_MORTON_5D_MASKS: tuple[np.uint64, ...] = ( - np.uint64(0x1084210842108421), # Extract mask: bits 0, 5, 10, 15, ... - np.uint64(0x0318C6318C6318C63), # Compact stage 1 - np.uint64(0xF03C0F03C0F03C0F), # Compact stage 2 - np.uint64(0x0000FF000FF000FF), # Compact stage 3 - np.uint64(0x0000000000000FFF), # Compact stage 4 (12 bits max) -) - - -def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: - """Decode 2D Morton codes using magic number bit manipulation. - - This extracts interleaved x,y coordinates from Morton codes using - parallel bit operations instead of bit-by-bit loops. - """ - out = np.zeros((len(z), 2), dtype=np.intp) - - # Extract x (bits 0, 2, 4, ...) and compact to (bits 0, 1, 2, ...) - x = z & _MORTON_2D_MASKS[0] - x = (x | (x >> 1)) & _MORTON_2D_MASKS[1] - x = (x | (x >> 2)) & _MORTON_2D_MASKS[2] - x = (x | (x >> 4)) & _MORTON_2D_MASKS[3] - x = (x | (x >> 8)) & _MORTON_2D_MASKS[4] - out[:, 0] = x - - # Extract y (bits 1, 3, 5, ...) and compact - y = (z >> 1) & _MORTON_2D_MASKS[0] - y = (y | (y >> 1)) & _MORTON_2D_MASKS[1] - y = (y | (y >> 2)) & _MORTON_2D_MASKS[2] - y = (y | (y >> 4)) & _MORTON_2D_MASKS[3] - y = (y | (y >> 8)) & _MORTON_2D_MASKS[4] - out[:, 1] = y - - return out - - -def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: - """Decode 3D Morton codes using magic number bit manipulation. - - This extracts interleaved x,y,z coordinates from Morton codes using - parallel bit operations instead of bit-by-bit loops. - """ - # Convert to uint64 for bitwise operations with large masks - z_u64 = z.astype(np.uint64) - out = np.zeros((len(z), 3), dtype=np.intp) - - for dim in range(3): - x = (z_u64 >> dim) & _MORTON_3D_MASKS[0] - x = (x ^ (x >> 2)) & _MORTON_3D_MASKS[1] - x = (x ^ (x >> 4)) & _MORTON_3D_MASKS[2] - x = (x ^ (x >> 8)) & _MORTON_3D_MASKS[3] - x = (x ^ (x >> 16)) & _MORTON_3D_MASKS[4] - x = (x ^ (x >> 32)) & _MORTON_3D_MASKS[5] - out[:, dim] = x - - return out - - -def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: - """Decode 4D Morton codes using magic number bit manipulation. - - This extracts interleaved x,y,z,w coordinates from Morton codes using - parallel bit operations instead of bit-by-bit loops. - """ - # Convert to uint64 for bitwise operations with large masks - z_u64 = z.astype(np.uint64) - out = np.zeros((len(z), 4), dtype=np.intp) - - for dim in range(4): - x = (z_u64 >> dim) & _MORTON_4D_MASKS[0] - x = (x ^ (x >> 3)) & _MORTON_4D_MASKS[1] - x = (x ^ (x >> 6)) & _MORTON_4D_MASKS[2] - x = (x ^ (x >> 12)) & _MORTON_4D_MASKS[3] - x = (x ^ (x >> 24)) & _MORTON_4D_MASKS[4] - out[:, dim] = x - - return out - - -def _decode_morton_5d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]: - """Decode 5D Morton codes using magic number bit manipulation. - - This extracts interleaved coordinates from Morton codes using - parallel bit operations instead of bit-by-bit loops. - """ - # Convert to uint64 for bitwise operations with large masks - z_u64 = z.astype(np.uint64) - out = np.zeros((len(z), 5), dtype=np.intp) - - for dim in range(5): - x = (z_u64 >> dim) & _MORTON_5D_MASKS[0] - x = (x ^ (x >> 4)) & _MORTON_5D_MASKS[1] - x = (x ^ (x >> 8)) & _MORTON_5D_MASKS[2] - x = (x ^ (x >> 16)) & _MORTON_5D_MASKS[3] - x = (x ^ (x >> 32)) & _MORTON_5D_MASKS[4] - out[:, dim] = x - - return out - - def decode_morton_vectorized( z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...] ) -> npt.NDArray[np.intp]: @@ -1622,20 +1487,6 @@ def decode_morton_vectorized( n_dims = len(chunk_shape) bits = tuple((c - 1).bit_length() for c in chunk_shape) - # Use magic number optimization for 2D/3D/4D/5D with uniform bit widths. - # Bit limits are floor(64 / n_dims): 2D=32, 3D=21, 4D=16, 5D=12 bits per dimension. - if len(set(bits)) == 1: # All dimensions have same bit width - max_bits = bits[0] if bits else 0 - if n_dims == 2 and max_bits <= 32: - return _decode_morton_2d(z) - if n_dims == 3 and max_bits <= 21: - return _decode_morton_3d(z) - if n_dims == 4 and max_bits <= 16: - return _decode_morton_4d(z) - if n_dims == 5 and max_bits <= 12: - return _decode_morton_5d(z) - - # Fall back to generic bit-by-bit decoding max_coords_bits = max(bits) if bits else 0 out = np.zeros((len(z), n_dims), dtype=np.intp) From 7b3db07621794b8f5a733e47e0a54cf0c318fe9f Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 06:46:48 -0500 Subject: [PATCH 12/14] test:Add power of 2 sharding indexing tests --- tests/benchmarks/test_indexing.py | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py index 9ca0d8e1af..f8310d15d2 100644 --- a/tests/benchmarks/test_indexing.py +++ b/tests/benchmarks/test_indexing.py @@ -50,3 +50,45 @@ def test_slice_indexing( data[:] = 1 benchmark(getitem, data, indexer) + + +# Benchmark for Morton order optimization with power-of-2 shards +# Morton order is used internally by sharding codec for chunk iteration +morton_shards = ( + (16,) * 3, # With 2x2x2 chunks: 8x8x8 = 512 chunks per shard + (32,) * 3, # With 2x2x2 chunks: 16x16x16 = 4096 chunks per shard +) + + +@pytest.mark.parametrize("store", ["memory"], indirect=["store"]) +@pytest.mark.parametrize("shards", morton_shards, ids=str) +def test_sharded_morton_indexing( + store: Store, + shards: tuple[int, ...], + benchmark: BenchmarkFixture, +) -> None: + """Benchmark sharded array indexing with power-of-2 chunks per shard. + + This benchmark exercises the Morton order iteration path in the sharding + codec, which benefits from the hypercube and vectorization optimizations. + """ + # Create array where each shard contains many small chunks + # e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard + shape = tuple(s * 2 for s in shards) # 2 shards per dimension + chunks = (2,) * 3 # Small chunks to maximize chunks per shard + + data = create_array( + store=store, + shape=shape, + dtype="uint8", + chunks=chunks, + shards=shards, + compressors=None, + filters=None, + fill_value=0, + ) + + data[:] = 1 + # Read a sub-shard region to exercise Morton order iteration + indexer = (slice(shards[0]),) * 3 + benchmark(getitem, data, indexer) From 443b5d413c7cdb0d98396d6ba3ae6ef01e2ca97c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 07:08:58 -0500 Subject: [PATCH 13/14] test: Add Morton order benchmarks with cache clearing Add benchmarks that clear the _morton_order LRU cache before each iteration to measure the full Morton computation cost: - test_sharded_morton_indexing: 512-4096 chunks per shard - test_sharded_morton_indexing_large: 32768 chunks per shard Co-Authored-By: Claude Opus 4.5 --- tests/benchmarks/test_indexing.py | 58 ++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py index f8310d15d2..fbdc2a6a1a 100644 --- a/tests/benchmarks/test_indexing.py +++ b/tests/benchmarks/test_indexing.py @@ -71,7 +71,11 @@ def test_sharded_morton_indexing( This benchmark exercises the Morton order iteration path in the sharding codec, which benefits from the hypercube and vectorization optimizations. + The Morton order cache is cleared before each iteration to measure the + full computation cost. """ + from zarr.core.indexing import _morton_order + # Create array where each shard contains many small chunks # e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard shape = tuple(s * 2 for s in shards) # 2 shards per dimension @@ -91,4 +95,56 @@ def test_sharded_morton_indexing( data[:] = 1 # Read a sub-shard region to exercise Morton order iteration indexer = (slice(shards[0]),) * 3 - benchmark(getitem, data, indexer) + + def read_with_cache_clear() -> None: + _morton_order.cache_clear() + getitem(data, indexer) + + benchmark(read_with_cache_clear) + + +# Benchmark with larger chunks_per_shard to make Morton order impact more visible +large_morton_shards = ( + (32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard +) + + +@pytest.mark.parametrize("store", ["memory"], indirect=["store"]) +@pytest.mark.parametrize("shards", large_morton_shards, ids=str) +def test_sharded_morton_indexing_large( + store: Store, + shards: tuple[int, ...], + benchmark: BenchmarkFixture, +) -> None: + """Benchmark sharded array indexing with large chunks_per_shard. + + Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making + the Morton order computation a more significant portion of total time. + The Morton order cache is cleared before each iteration. + """ + from zarr.core.indexing import _morton_order + + # 1x1x1 chunks means chunks_per_shard equals shard shape + shape = tuple(s * 2 for s in shards) # 2 shards per dimension + chunks = (1,) * 3 # 1x1x1 chunks: chunks_per_shard = shards + + data = create_array( + store=store, + shape=shape, + dtype="uint8", + chunks=chunks, + shards=shards, + compressors=None, + filters=None, + fill_value=0, + ) + + data[:] = 1 + # Read one full shard + indexer = (slice(shards[0]),) * 3 + + def read_with_cache_clear() -> None: + _morton_order.cache_clear() + getitem(data, indexer) + + benchmark(read_with_cache_clear) From 1cdcbdfe3a6c83cb9dd9353bc6db2a610952eba8 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 13 Feb 2026 07:31:34 -0500 Subject: [PATCH 14/14] fix:Bound LRU cache of _morton_order to 16 --- src/zarr/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index d300a3982e..df79728a85 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1503,7 +1503,7 @@ def decode_morton_vectorized( return out -@lru_cache +@lru_cache(maxsize=16) def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]: n_total = product(chunk_shape) if n_total == 0: