From 47730f3c3a6cd9f2436b74d0f828eec353ed2d01 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 00:15:57 -0500
Subject: [PATCH 01/14] perf: Skip bounds check for initial elements in 2^n
 hypercube

---
 src/zarr/core/indexing.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index beffa99cfa..a6288ce548 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1470,13 +1470,31 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
 @lru_cache
 def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     n_total = product(chunk_shape)
+    n_dims = len(chunk_shape)
     order: list[tuple[int, ...]] = []
-    i = 0
+
+    # Find the largest power-of-2 hypercube that fits within chunk_shape.
+    # Within this hypercube, Morton codes are guaranteed to be in bounds.
+    min_dim = min(chunk_shape)
+    if min_dim >= 1:
+        power = min_dim.bit_length() - 1  # floor(log2(min_dim))
+        hypercube_size = 1 << power  # 2^power
+        n_hypercube = hypercube_size**n_dims
+    else:
+        n_hypercube = 0
+
+    # Within the hypercube, no bounds checking needed
+    for i in range(n_hypercube):
+        order.append(decode_morton(i, chunk_shape))
+
+    # For remaining elements, bounds checking is needed
+    i = n_hypercube
     while len(order) < n_total:
         m = decode_morton(i, chunk_shape)
         if all(x < y for x, y in zip(m, chunk_shape, strict=False)):
             order.append(m)
         i += 1
+
     return tuple(order)
 
 

From 865df2ac15230e24efd97f64d78fbed28a60b805 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 00:32:51 -0500
Subject: [PATCH 02/14] lint:Use a list comprehension rather than a for loop

---
 src/zarr/core/indexing.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index a6288ce548..97c75b1372 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1484,8 +1484,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
         n_hypercube = 0
 
     # Within the hypercube, no bounds checking needed
-    for i in range(n_hypercube):
-        order.append(decode_morton(i, chunk_shape))
+    order = [decode_morton(i, chunk_shape) for i in range(n_hypercube)]
 
     # For remaining elements, bounds checking is needed
     i = n_hypercube

From ce0099c6b08559df9d93df5d66e46b1d3069a1c3 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 00:47:37 -0500
Subject: [PATCH 03/14] pref:Add decode_morton_vectorized

---
 src/zarr/core/indexing.py | 44 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 97c75b1372..8da5b31f46 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1467,11 +1467,47 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     return tuple(out)
 
 
+def decode_morton_vectorized(
+    z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
+) -> npt.NDArray[np.intp]:
+    """Vectorized Morton code decoding for multiple z values.
+
+    Parameters
+    ----------
+    z : ndarray
+        1D array of Morton codes to decode.
+    chunk_shape : tuple of int
+        Shape defining the coordinate space.
+
+    Returns
+    -------
+    ndarray
+        2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates.
+    """
+    n_dims = len(chunk_shape)
+    bits = tuple(math.ceil(math.log2(c)) if c > 1 else 1 for c in chunk_shape)
+    max_coords_bits = max(bits)
+
+    # Output array: each row is a decoded coordinate
+    out = np.zeros((len(z), n_dims), dtype=np.intp)
+
+    input_bit = 0
+    for coord_bit in range(max_coords_bits):
+        for dim in range(n_dims):
+            if coord_bit < bits[dim]:
+                # Extract bit at position input_bit from all z values
+                bit_values = (z >> input_bit) & 1
+                # Place bit at coord_bit position in dimension dim
+                out[:, dim] |= bit_values << coord_bit
+                input_bit += 1
+
+    return out
+
+
 @lru_cache
 def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     n_total = product(chunk_shape)
     n_dims = len(chunk_shape)
-    order: list[tuple[int, ...]] = []
 
     # Find the largest power-of-2 hypercube that fits within chunk_shape.
     # Within this hypercube, Morton codes are guaranteed to be in bounds.
@@ -1483,8 +1519,10 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     else:
         n_hypercube = 0
 
-    # Within the hypercube, no bounds checking needed
-    order = [decode_morton(i, chunk_shape) for i in range(n_hypercube)]
+    # Within the hypercube, no bounds checking needed - use vectorized decoding
+    z_values = np.arange(n_hypercube, dtype=np.intp)
+    hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
+    order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords]
 
     # For remaining elements, bounds checking is needed
     i = n_hypercube

From 1b1076f136645300d5132839403494ea2920ac13 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 01:06:26 -0500
Subject: [PATCH 04/14] perf:Replace math.log2() with bit_length()

---
 src/zarr/core/indexing.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 8da5b31f46..c9c7e0f35a 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1452,7 +1452,7 @@ def make_slice_selection(selection: Any) -> list[slice]:
 def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     # Inspired by compressed morton code as implemented in Neuroglancer
     # https://github.com/google/neuroglancer/blob/master/src/neuroglancer/datasource/precomputed/volume.md#compressed-morton-code
-    bits = tuple(math.ceil(math.log2(c)) for c in chunk_shape)
+    bits = tuple((c - 1).bit_length() for c in chunk_shape)
     max_coords_bits = max(bits)
     input_bit = 0
     input_value = z
@@ -1485,8 +1485,8 @@ def decode_morton_vectorized(
         2D array of shape (len(z), len(chunk_shape)) containing decoded coordinates.
     """
     n_dims = len(chunk_shape)
-    bits = tuple(math.ceil(math.log2(c)) if c > 1 else 1 for c in chunk_shape)
-    max_coords_bits = max(bits)
+    bits = tuple((c - 1).bit_length() for c in chunk_shape)
+    max_coords_bits = max(bits) if bits else 0
 
     # Output array: each row is a decoded coordinate
     out = np.zeros((len(z), n_dims), dtype=np.intp)
@@ -1507,6 +1507,9 @@ def decode_morton_vectorized(
 @lru_cache
 def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     n_total = product(chunk_shape)
+    if n_total == 0:
+        return ()
+
     n_dims = len(chunk_shape)
 
     # Find the largest power-of-2 hypercube that fits within chunk_shape.
@@ -1520,9 +1523,12 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
         n_hypercube = 0
 
     # Within the hypercube, no bounds checking needed - use vectorized decoding
-    z_values = np.arange(n_hypercube, dtype=np.intp)
-    hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
-    order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords]
+    if n_hypercube > 0:
+        z_values = np.arange(n_hypercube, dtype=np.intp)
+        hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
+        order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords]
+    else:
+        order = []
 
     # For remaining elements, bounds checking is needed
     i = n_hypercube

From 47a68eb8d5b476410bb97703b2c332bd5a18c39f Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 01:38:09 -0500
Subject: [PATCH 05/14] perf:Use magic numbers for 2D and 3D

---
 src/zarr/core/indexing.py | 85 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index c9c7e0f35a..45fc300bf3 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1467,6 +1467,78 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     return tuple(out)
 
 
+# Magic numbers for 2D Morton decode: extract every 2nd bit and compact them.
+# Bits for dimension d are at positions d, d+2, d+4, ...
+_MORTON_2D_MASKS: tuple[int, ...] = (
+    0x5555555555555555,  # Extract mask: bits 0, 2, 4, ...
+    0x3333333333333333,  # Compact stage 1
+    0x0F0F0F0F0F0F0F0F,  # Compact stage 2
+    0x00FF00FF00FF00FF,  # Compact stage 3
+    0x0000FFFF0000FFFF,  # Compact stage 4
+)
+
+# Magic numbers for 3D Morton decode: extract every 3rd bit and compact them.
+# Bits for dimension d are at positions d, d+3, d+6, d+9, ...
+# Uses uint64 to handle large mask values correctly with numpy.
+_MORTON_3D_MASKS: tuple[np.uint64, ...] = (
+    np.uint64(0x1249249249249249),  # Extract mask: bits 0, 3, 6, 9, ...
+    np.uint64(0x30C30C30C30C30C3),  # Compact stage 1
+    np.uint64(0xF00F00F00F00F00F),  # Compact stage 2
+    np.uint64(0x00FF0000FF0000FF),  # Compact stage 3
+    np.uint64(0x00FF00000000FFFF),  # Compact stage 4
+    np.uint64(0x00000000001FFFFF),  # Compact stage 5
+)
+
+
+def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
+    """Decode 2D Morton codes using magic number bit manipulation.
+
+    This extracts interleaved x,y coordinates from Morton codes using
+    parallel bit operations instead of bit-by-bit loops.
+    """
+    out = np.zeros((len(z), 2), dtype=np.intp)
+
+    # Extract x (bits 0, 2, 4, ...) and compact to (bits 0, 1, 2, ...)
+    x = z & _MORTON_2D_MASKS[0]
+    x = (x | (x >> 1)) & _MORTON_2D_MASKS[1]
+    x = (x | (x >> 2)) & _MORTON_2D_MASKS[2]
+    x = (x | (x >> 4)) & _MORTON_2D_MASKS[3]
+    x = (x | (x >> 8)) & _MORTON_2D_MASKS[4]
+    out[:, 0] = x
+
+    # Extract y (bits 1, 3, 5, ...) and compact
+    y = (z >> 1) & _MORTON_2D_MASKS[0]
+    y = (y | (y >> 1)) & _MORTON_2D_MASKS[1]
+    y = (y | (y >> 2)) & _MORTON_2D_MASKS[2]
+    y = (y | (y >> 4)) & _MORTON_2D_MASKS[3]
+    y = (y | (y >> 8)) & _MORTON_2D_MASKS[4]
+    out[:, 1] = y
+
+    return out
+
+
+def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
+    """Decode 3D Morton codes using magic number bit manipulation.
+
+    This extracts interleaved x,y,z coordinates from Morton codes using
+    parallel bit operations instead of bit-by-bit loops.
+    """
+    # Convert to uint64 for bitwise operations with large masks
+    z_u64 = z.astype(np.uint64)
+    out = np.zeros((len(z), 3), dtype=np.intp)
+
+    for dim in range(3):
+        x = (z_u64 >> dim) & _MORTON_3D_MASKS[0]
+        x = (x ^ (x >> 2)) & _MORTON_3D_MASKS[1]
+        x = (x ^ (x >> 4)) & _MORTON_3D_MASKS[2]
+        x = (x ^ (x >> 8)) & _MORTON_3D_MASKS[3]
+        x = (x ^ (x >> 16)) & _MORTON_3D_MASKS[4]
+        x = (x ^ (x >> 32)) & _MORTON_3D_MASKS[5]
+        out[:, dim] = x
+
+    return out
+
+
 def decode_morton_vectorized(
     z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
 ) -> npt.NDArray[np.intp]:
@@ -1486,9 +1558,18 @@ def decode_morton_vectorized(
     """
     n_dims = len(chunk_shape)
     bits = tuple((c - 1).bit_length() for c in chunk_shape)
-    max_coords_bits = max(bits) if bits else 0
 
-    # Output array: each row is a decoded coordinate
+    # Use magic number optimization for 2D/3D with uniform bit widths.
+    # Magic numbers have bit limits: 2D supports up to 32 bits, 3D up to 21 bits per dimension.
+    if len(set(bits)) == 1:  # All dimensions have same bit width
+        max_bits = bits[0] if bits else 0
+        if n_dims == 2 and max_bits <= 32:
+            return _decode_morton_2d(z)
+        if n_dims == 3 and max_bits <= 21:
+            return _decode_morton_3d(z)
+
+    # Fall back to generic bit-by-bit decoding
+    max_coords_bits = max(bits) if bits else 0
     out = np.zeros((len(z), n_dims), dtype=np.intp)
 
     input_bit = 0

From 6fb6d00f6854a27002001ce74bfd441c3a88bc59 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 01:48:12 -0500
Subject: [PATCH 06/14] perf:Add 4D Morton magic numbers

---
 src/zarr/core/indexing.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 45fc300bf3..01606d44e8 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1489,6 +1489,16 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     np.uint64(0x00000000001FFFFF),  # Compact stage 5
 )
 
+# Magic numbers for 4D Morton decode: extract every 4th bit and compact them.
+# Bits for dimension d are at positions d, d+4, d+8, d+12, ...
+_MORTON_4D_MASKS: tuple[np.uint64, ...] = (
+    np.uint64(0x1111111111111111),  # Extract mask: bits 0, 4, 8, 12, ...
+    np.uint64(0x0303030303030303),  # Compact stage 1
+    np.uint64(0x000F000F000F000F),  # Compact stage 2
+    np.uint64(0x000000FF000000FF),  # Compact stage 3
+    np.uint64(0x000000000000FFFF),  # Compact stage 4
+)
+
 
 def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
     """Decode 2D Morton codes using magic number bit manipulation.
@@ -1539,6 +1549,27 @@ def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
     return out
 
 
+def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
+    """Decode 4D Morton codes using magic number bit manipulation.
+
+    This extracts interleaved x,y,z,w coordinates from Morton codes using
+    parallel bit operations instead of bit-by-bit loops.
+    """
+    # Convert to uint64 for bitwise operations with large masks
+    z_u64 = z.astype(np.uint64)
+    out = np.zeros((len(z), 4), dtype=np.intp)
+
+    for dim in range(4):
+        x = (z_u64 >> dim) & _MORTON_4D_MASKS[0]
+        x = (x ^ (x >> 3)) & _MORTON_4D_MASKS[1]
+        x = (x ^ (x >> 6)) & _MORTON_4D_MASKS[2]
+        x = (x ^ (x >> 12)) & _MORTON_4D_MASKS[3]
+        x = (x ^ (x >> 24)) & _MORTON_4D_MASKS[4]
+        out[:, dim] = x
+
+    return out
+
+
 def decode_morton_vectorized(
     z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
 ) -> npt.NDArray[np.intp]:
@@ -1559,14 +1590,16 @@ def decode_morton_vectorized(
     n_dims = len(chunk_shape)
     bits = tuple((c - 1).bit_length() for c in chunk_shape)
 
-    # Use magic number optimization for 2D/3D with uniform bit widths.
-    # Magic numbers have bit limits: 2D supports up to 32 bits, 3D up to 21 bits per dimension.
+    # Use magic number optimization for 2D/3D/4D with uniform bit widths.
+    # Magic numbers have bit limits: 2D up to 32, 3D up to 21, 4D up to 16 bits per dimension.
     if len(set(bits)) == 1:  # All dimensions have same bit width
         max_bits = bits[0] if bits else 0
         if n_dims == 2 and max_bits <= 32:
             return _decode_morton_2d(z)
         if n_dims == 3 and max_bits <= 21:
             return _decode_morton_3d(z)
+        if n_dims == 4 and max_bits <= 16:
+            return _decode_morton_4d(z)
 
     # Fall back to generic bit-by-bit decoding
     max_coords_bits = max(bits) if bits else 0

From db31842b072c9d68038b2c37d1350fd0938e6310 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 01:58:31 -0500
Subject: [PATCH 07/14] perf:Add Morton magic numbers for 5D

---
 src/zarr/core/indexing.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 01606d44e8..3ef53d8737 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1499,6 +1499,17 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     np.uint64(0x000000000000FFFF),  # Compact stage 4
 )
 
+# Magic numbers for 5D Morton decode: extract every 5th bit and compact them.
+# Bits for dimension d are at positions d, d+5, d+10, d+15, ...
+# Max 12 bits per dimension (64 // 5 = 12), supporting values up to 4095.
+_MORTON_5D_MASKS: tuple[np.uint64, ...] = (
+    np.uint64(0x1084210842108421),  # Extract mask: bits 0, 5, 10, 15, ...
+    np.uint64(0x0318C6318C6318C63),  # Compact stage 1
+    np.uint64(0xF03C0F03C0F03C0F),  # Compact stage 2
+    np.uint64(0x0000FF000FF000FF),  # Compact stage 3
+    np.uint64(0x0000000000000FFF),  # Compact stage 4 (12 bits max)
+)
+
 
 def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
     """Decode 2D Morton codes using magic number bit manipulation.
@@ -1570,6 +1581,27 @@ def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
     return out
 
 
+def _decode_morton_5d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
+    """Decode 5D Morton codes using magic number bit manipulation.
+
+    This extracts interleaved coordinates from Morton codes using
+    parallel bit operations instead of bit-by-bit loops.
+    """
+    # Convert to uint64 for bitwise operations with large masks
+    z_u64 = z.astype(np.uint64)
+    out = np.zeros((len(z), 5), dtype=np.intp)
+
+    for dim in range(5):
+        x = (z_u64 >> dim) & _MORTON_5D_MASKS[0]
+        x = (x ^ (x >> 4)) & _MORTON_5D_MASKS[1]
+        x = (x ^ (x >> 8)) & _MORTON_5D_MASKS[2]
+        x = (x ^ (x >> 16)) & _MORTON_5D_MASKS[3]
+        x = (x ^ (x >> 32)) & _MORTON_5D_MASKS[4]
+        out[:, dim] = x
+
+    return out
+
+
 def decode_morton_vectorized(
     z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
 ) -> npt.NDArray[np.intp]:
@@ -1590,8 +1622,8 @@ def decode_morton_vectorized(
     n_dims = len(chunk_shape)
     bits = tuple((c - 1).bit_length() for c in chunk_shape)
 
-    # Use magic number optimization for 2D/3D/4D with uniform bit widths.
-    # Magic numbers have bit limits: 2D up to 32, 3D up to 21, 4D up to 16 bits per dimension.
+    # Use magic number optimization for 2D/3D/4D/5D with uniform bit widths.
+    # Bit limits are floor(64 / n_dims): 2D=32, 3D=21, 4D=16, 5D=12 bits per dimension.
     if len(set(bits)) == 1:  # All dimensions have same bit width
         max_bits = bits[0] if bits else 0
         if n_dims == 2 and max_bits <= 32:
@@ -1600,6 +1632,8 @@ def decode_morton_vectorized(
             return _decode_morton_3d(z)
         if n_dims == 4 and max_bits <= 16:
             return _decode_morton_4d(z)
+        if n_dims == 5 and max_bits <= 12:
+            return _decode_morton_5d(z)
 
     # Fall back to generic bit-by-bit decoding
     max_coords_bits = max(bits) if bits else 0

From f9952f123de393c7125e787c8e46bd7ac6f52d8d Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 02:13:18 -0500
Subject: [PATCH 08/14] perf:Remove singleton dimensions to reduce ndims

---
 src/zarr/core/indexing.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index 3ef53d8737..b39752938d 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1658,6 +1658,31 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     if n_total == 0:
         return ()
 
+    # Optimization: Remove singleton dimensions to enable magic number usage
+    # for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand.
+    singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1)
+    if singleton_dims:
+        squeezed_shape = tuple(s for s in chunk_shape if s != 1)
+        if squeezed_shape:
+            # Compute Morton order on squeezed shape
+            squeezed_order = _morton_order(squeezed_shape)
+            # Expand coordinates to include singleton dimensions (always 0)
+            order: list[tuple[int, ...]] = []
+            for coord in squeezed_order:
+                full_coord: list[int] = []
+                squeezed_idx = 0
+                for i in range(len(chunk_shape)):
+                    if chunk_shape[i] == 1:
+                        full_coord.append(0)
+                    else:
+                        full_coord.append(coord[squeezed_idx])
+                        squeezed_idx += 1
+                order.append(tuple(full_coord))
+            return tuple(order)
+        else:
+            # All dimensions are singletons, just return the single point
+            return ((0,) * len(chunk_shape),)
+
     n_dims = len(chunk_shape)
 
     # Find the largest power-of-2 hypercube that fits within chunk_shape.

From aedce5a8e8079aba8f93f29d52a632f40b6e27bc Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 02:25:21 -0500
Subject: [PATCH 09/14] Add changes

---
 changes/3708.misc.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changes/3708.misc.md

diff --git a/changes/3708.misc.md b/changes/3708.misc.md
new file mode 100644
index 0000000000..1ea865bdbe
--- /dev/null
+++ b/changes/3708.misc.md
@@ -0,0 +1 @@
+Optimize Morton order computation with hypercube optimization, vectorized decoding, magic number bit manipulation for 2D-5D, and singleton dimension removal, providing 10-45x speedup for typical chunk shapes.

From ef182106d490a5f1cbabf1487626849359bb4de6 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 02:30:38 -0500
Subject: [PATCH 10/14] fix:Address type annotation and linting issues

---
 src/zarr/core/indexing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index b39752938d..a9287e55af 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1667,7 +1667,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
             # Compute Morton order on squeezed shape
             squeezed_order = _morton_order(squeezed_shape)
             # Expand coordinates to include singleton dimensions (always 0)
-            order: list[tuple[int, ...]] = []
+            expanded: list[tuple[int, ...]] = []
             for coord in squeezed_order:
                 full_coord: list[int] = []
                 squeezed_idx = 0
@@ -1677,8 +1677,8 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
                     else:
                         full_coord.append(coord[squeezed_idx])
                         squeezed_idx += 1
-                order.append(tuple(full_coord))
-            return tuple(order)
+                expanded.append(tuple(full_coord))
+            return tuple(expanded)
         else:
             # All dimensions are singletons, just return the single point
             return ((0,) * len(chunk_shape),)
@@ -1696,10 +1696,11 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
         n_hypercube = 0
 
     # Within the hypercube, no bounds checking needed - use vectorized decoding
+    order: list[tuple[int, ...]]
     if n_hypercube > 0:
         z_values = np.arange(n_hypercube, dtype=np.intp)
         hypercube_coords = decode_morton_vectorized(z_values, chunk_shape)
-        order: list[tuple[int, ...]] = [tuple(row) for row in hypercube_coords]
+        order = [tuple(row) for row in hypercube_coords]
     else:
         order = []
 

From 24dcbd5e662825c49cff6af2930275c46074c351 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 06:32:18 -0500
Subject: [PATCH 11/14] perf:Remove magic number functions

---
 src/zarr/core/indexing.py | 149 --------------------------------------
 1 file changed, 149 deletions(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index a9287e55af..d300a3982e 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1467,141 +1467,6 @@ def decode_morton(z: int, chunk_shape: tuple[int, ...]) -> tuple[int, ...]:
     return tuple(out)
 
 
-# Magic numbers for 2D Morton decode: extract every 2nd bit and compact them.
-# Bits for dimension d are at positions d, d+2, d+4, ...
-_MORTON_2D_MASKS: tuple[int, ...] = (
-    0x5555555555555555,  # Extract mask: bits 0, 2, 4, ...
-    0x3333333333333333,  # Compact stage 1
-    0x0F0F0F0F0F0F0F0F,  # Compact stage 2
-    0x00FF00FF00FF00FF,  # Compact stage 3
-    0x0000FFFF0000FFFF,  # Compact stage 4
-)
-
-# Magic numbers for 3D Morton decode: extract every 3rd bit and compact them.
-# Bits for dimension d are at positions d, d+3, d+6, d+9, ...
-# Uses uint64 to handle large mask values correctly with numpy.
-_MORTON_3D_MASKS: tuple[np.uint64, ...] = (
-    np.uint64(0x1249249249249249),  # Extract mask: bits 0, 3, 6, 9, ...
-    np.uint64(0x30C30C30C30C30C3),  # Compact stage 1
-    np.uint64(0xF00F00F00F00F00F),  # Compact stage 2
-    np.uint64(0x00FF0000FF0000FF),  # Compact stage 3
-    np.uint64(0x00FF00000000FFFF),  # Compact stage 4
-    np.uint64(0x00000000001FFFFF),  # Compact stage 5
-)
-
-# Magic numbers for 4D Morton decode: extract every 4th bit and compact them.
-# Bits for dimension d are at positions d, d+4, d+8, d+12, ...
-_MORTON_4D_MASKS: tuple[np.uint64, ...] = (
-    np.uint64(0x1111111111111111),  # Extract mask: bits 0, 4, 8, 12, ...
-    np.uint64(0x0303030303030303),  # Compact stage 1
-    np.uint64(0x000F000F000F000F),  # Compact stage 2
-    np.uint64(0x000000FF000000FF),  # Compact stage 3
-    np.uint64(0x000000000000FFFF),  # Compact stage 4
-)
-
-# Magic numbers for 5D Morton decode: extract every 5th bit and compact them.
-# Bits for dimension d are at positions d, d+5, d+10, d+15, ...
-# Max 12 bits per dimension (64 // 5 = 12), supporting values up to 4095.
-_MORTON_5D_MASKS: tuple[np.uint64, ...] = (
-    np.uint64(0x1084210842108421),  # Extract mask: bits 0, 5, 10, 15, ...
-    np.uint64(0x0318C6318C6318C63),  # Compact stage 1
-    np.uint64(0xF03C0F03C0F03C0F),  # Compact stage 2
-    np.uint64(0x0000FF000FF000FF),  # Compact stage 3
-    np.uint64(0x0000000000000FFF),  # Compact stage 4 (12 bits max)
-)
-
-
-def _decode_morton_2d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
-    """Decode 2D Morton codes using magic number bit manipulation.
-
-    This extracts interleaved x,y coordinates from Morton codes using
-    parallel bit operations instead of bit-by-bit loops.
-    """
-    out = np.zeros((len(z), 2), dtype=np.intp)
-
-    # Extract x (bits 0, 2, 4, ...) and compact to (bits 0, 1, 2, ...)
-    x = z & _MORTON_2D_MASKS[0]
-    x = (x | (x >> 1)) & _MORTON_2D_MASKS[1]
-    x = (x | (x >> 2)) & _MORTON_2D_MASKS[2]
-    x = (x | (x >> 4)) & _MORTON_2D_MASKS[3]
-    x = (x | (x >> 8)) & _MORTON_2D_MASKS[4]
-    out[:, 0] = x
-
-    # Extract y (bits 1, 3, 5, ...) and compact
-    y = (z >> 1) & _MORTON_2D_MASKS[0]
-    y = (y | (y >> 1)) & _MORTON_2D_MASKS[1]
-    y = (y | (y >> 2)) & _MORTON_2D_MASKS[2]
-    y = (y | (y >> 4)) & _MORTON_2D_MASKS[3]
-    y = (y | (y >> 8)) & _MORTON_2D_MASKS[4]
-    out[:, 1] = y
-
-    return out
-
-
-def _decode_morton_3d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
-    """Decode 3D Morton codes using magic number bit manipulation.
-
-    This extracts interleaved x,y,z coordinates from Morton codes using
-    parallel bit operations instead of bit-by-bit loops.
-    """
-    # Convert to uint64 for bitwise operations with large masks
-    z_u64 = z.astype(np.uint64)
-    out = np.zeros((len(z), 3), dtype=np.intp)
-
-    for dim in range(3):
-        x = (z_u64 >> dim) & _MORTON_3D_MASKS[0]
-        x = (x ^ (x >> 2)) & _MORTON_3D_MASKS[1]
-        x = (x ^ (x >> 4)) & _MORTON_3D_MASKS[2]
-        x = (x ^ (x >> 8)) & _MORTON_3D_MASKS[3]
-        x = (x ^ (x >> 16)) & _MORTON_3D_MASKS[4]
-        x = (x ^ (x >> 32)) & _MORTON_3D_MASKS[5]
-        out[:, dim] = x
-
-    return out
-
-
-def _decode_morton_4d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
-    """Decode 4D Morton codes using magic number bit manipulation.
-
-    This extracts interleaved x,y,z,w coordinates from Morton codes using
-    parallel bit operations instead of bit-by-bit loops.
-    """
-    # Convert to uint64 for bitwise operations with large masks
-    z_u64 = z.astype(np.uint64)
-    out = np.zeros((len(z), 4), dtype=np.intp)
-
-    for dim in range(4):
-        x = (z_u64 >> dim) & _MORTON_4D_MASKS[0]
-        x = (x ^ (x >> 3)) & _MORTON_4D_MASKS[1]
-        x = (x ^ (x >> 6)) & _MORTON_4D_MASKS[2]
-        x = (x ^ (x >> 12)) & _MORTON_4D_MASKS[3]
-        x = (x ^ (x >> 24)) & _MORTON_4D_MASKS[4]
-        out[:, dim] = x
-
-    return out
-
-
-def _decode_morton_5d(z: npt.NDArray[np.intp]) -> npt.NDArray[np.intp]:
-    """Decode 5D Morton codes using magic number bit manipulation.
-
-    This extracts interleaved coordinates from Morton codes using
-    parallel bit operations instead of bit-by-bit loops.
-    """
-    # Convert to uint64 for bitwise operations with large masks
-    z_u64 = z.astype(np.uint64)
-    out = np.zeros((len(z), 5), dtype=np.intp)
-
-    for dim in range(5):
-        x = (z_u64 >> dim) & _MORTON_5D_MASKS[0]
-        x = (x ^ (x >> 4)) & _MORTON_5D_MASKS[1]
-        x = (x ^ (x >> 8)) & _MORTON_5D_MASKS[2]
-        x = (x ^ (x >> 16)) & _MORTON_5D_MASKS[3]
-        x = (x ^ (x >> 32)) & _MORTON_5D_MASKS[4]
-        out[:, dim] = x
-
-    return out
-
-
 def decode_morton_vectorized(
     z: npt.NDArray[np.intp], chunk_shape: tuple[int, ...]
 ) -> npt.NDArray[np.intp]:
@@ -1622,20 +1487,6 @@ def decode_morton_vectorized(
     n_dims = len(chunk_shape)
     bits = tuple((c - 1).bit_length() for c in chunk_shape)
 
-    # Use magic number optimization for 2D/3D/4D/5D with uniform bit widths.
-    # Bit limits are floor(64 / n_dims): 2D=32, 3D=21, 4D=16, 5D=12 bits per dimension.
-    if len(set(bits)) == 1:  # All dimensions have same bit width
-        max_bits = bits[0] if bits else 0
-        if n_dims == 2 and max_bits <= 32:
-            return _decode_morton_2d(z)
-        if n_dims == 3 and max_bits <= 21:
-            return _decode_morton_3d(z)
-        if n_dims == 4 and max_bits <= 16:
-            return _decode_morton_4d(z)
-        if n_dims == 5 and max_bits <= 12:
-            return _decode_morton_5d(z)
-
-    # Fall back to generic bit-by-bit decoding
     max_coords_bits = max(bits) if bits else 0
     out = np.zeros((len(z), n_dims), dtype=np.intp)
 

From 7b3db07621794b8f5a733e47e0a54cf0c318fe9f Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 06:46:48 -0500
Subject: [PATCH 12/14] test:Add power of 2 sharding indexing tests

---
 tests/benchmarks/test_indexing.py | 42 +++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index 9ca0d8e1af..f8310d15d2 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -50,3 +50,45 @@ def test_slice_indexing(
 
     data[:] = 1
     benchmark(getitem, data, indexer)
+
+
+# Benchmark for Morton order optimization with power-of-2 shards
+# Morton order is used internally by sharding codec for chunk iteration
+morton_shards = (
+    (16,) * 3,  # With 2x2x2 chunks: 8x8x8 = 512 chunks per shard
+    (32,) * 3,  # With 2x2x2 chunks: 16x16x16 = 4096 chunks per shard
+)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", morton_shards, ids=str)
+def test_sharded_morton_indexing(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark sharded array indexing with power-of-2 chunks per shard.
+
+    This benchmark exercises the Morton order iteration path in the sharding
+    codec, which benefits from the hypercube and vectorization optimizations.
+    """
+    # Create array where each shard contains many small chunks
+    # e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (2,) * 3  # Small chunks to maximize chunks per shard
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    data[:] = 1
+    # Read a sub-shard region to exercise Morton order iteration
+    indexer = (slice(shards[0]),) * 3
+    benchmark(getitem, data, indexer)

From 443b5d413c7cdb0d98396d6ba3ae6ef01e2ca97c Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 07:08:58 -0500
Subject: [PATCH 13/14] test: Add Morton order benchmarks with cache clearing

Add benchmarks that clear the _morton_order LRU cache before each
iteration to measure the full Morton computation cost:

- test_sharded_morton_indexing: 512-4096 chunks per shard
- test_sharded_morton_indexing_large: 32768 chunks per shard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 tests/benchmarks/test_indexing.py | 58 ++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index f8310d15d2..fbdc2a6a1a 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -71,7 +71,11 @@ def test_sharded_morton_indexing(
 
     This benchmark exercises the Morton order iteration path in the sharding
     codec, which benefits from the hypercube and vectorization optimizations.
+    The Morton order cache is cleared before each iteration to measure the
+    full computation cost.
     """
+    from zarr.core.indexing import _morton_order
+
     # Create array where each shard contains many small chunks
     # e.g., shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard
     shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
@@ -91,4 +95,56 @@ def test_sharded_morton_indexing(
     data[:] = 1
     # Read a sub-shard region to exercise Morton order iteration
     indexer = (slice(shards[0]),) * 3
-    benchmark(getitem, data, indexer)
+
+    def read_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        getitem(data, indexer)
+
+    benchmark(read_with_cache_clear)
+
+
+# Benchmark with larger chunks_per_shard to make Morton order impact more visible
+large_morton_shards = (
+    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard
+)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
+def test_sharded_morton_indexing_large(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark sharded array indexing with large chunks_per_shard.
+
+    Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making
+    the Morton order computation a more significant portion of total time.
+    The Morton order cache is cleared before each iteration.
+    """
+    from zarr.core.indexing import _morton_order
+
+    # 1x1x1 chunks means chunks_per_shard equals shard shape
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (1,) * 3  # 1x1x1 chunks: chunks_per_shard = shards
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    data[:] = 1
+    # Read one full shard
+    indexer = (slice(shards[0]),) * 3
+
+    def read_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        getitem(data, indexer)
+
+    benchmark(read_with_cache_clear)

From 1cdcbdfe3a6c83cb9dd9353bc6db2a610952eba8 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <markkitt@gmail.com>
Date: Fri, 13 Feb 2026 07:31:34 -0500
Subject: [PATCH 14/14] fix:Bound LRU cache of _morton_order to 16

---
 src/zarr/core/indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py
index d300a3982e..df79728a85 100644
--- a/src/zarr/core/indexing.py
+++ b/src/zarr/core/indexing.py
@@ -1503,7 +1503,7 @@ def decode_morton_vectorized(
     return out
 
 
-@lru_cache
+@lru_cache(maxsize=16)
 def _morton_order(chunk_shape: tuple[int, ...]) -> tuple[tuple[int, ...], ...]:
     n_total = product(chunk_shape)
     if n_total == 0: