From 302987e8e1bc47a505a559af639558ab04baef76 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 15:08:19 +0100 Subject: [PATCH 1/7] First implementation of a VLArray store --- CMakeLists.txt | 3 +- src/blosc2/__init__.py | 3 + src/blosc2/blosc2_ext.pyx | 10 ++ src/blosc2/core.py | 10 +- src/blosc2/schunk.py | 27 +++++ src/blosc2/vlarray.py | 224 ++++++++++++++++++++++++++++++++++++ tests/test_schunk_update.py | 35 ++++++ tests/test_vlarray.py | 112 ++++++++++++++++++ 8 files changed, 420 insertions(+), 4 deletions(-) create mode 100644 src/blosc2/vlarray.py create mode 100644 tests/test_vlarray.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d921567c..ae63b85d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,8 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG 1386ef42f58b61c876edf714a2af84bd7b59dc5d # v2.23.1 + GIT_TAG ba55a6be9293faf9740f03b5953b82f1c955879e # variable-length chunks support in schunks + # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) include_directories("${blosc2_SOURCE_DIR}/include") diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 244495545..e0e8f9ace 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -530,6 +530,7 @@ def _raise(exc): from .embed_store import EmbedStore, estore_from_cframe from .dict_store import DictStore from .tree_store import TreeStore +from .vlarray import VLArray, vlarray_from_cframe from .c2array import c2context, C2Array, URLPath @@ -739,6 +740,7 @@ def _raise(exc): "TreeStore", "Tuner", "URLPath", + "VLArray", # Version "__version__", # Utils @@ -934,6 +936,7 @@ def _raise(exc): "validate_expr", "var", "vecdot", + "vlarray_from_cframe", "where", "zeros", "zeros_like", diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index c56104ab6..604be0bd3 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -1565,6 +1565,16 @@ cdef class SChunk: raise RuntimeError("Could not delete the desired chunk") return rc + def append_chunk(self, chunk): + cdef const uint8_t[:] typed_view_chunk + mem_view_chunk = memoryview(chunk) + typed_view_chunk = mem_view_chunk.cast('B') + _check_comp_length('chunk', len(typed_view_chunk)) + rc = blosc2_schunk_append_chunk(self.schunk, &typed_view_chunk[0], True) + if rc < 0: + raise RuntimeError("Could not append the desired chunk") + return rc + def insert_chunk(self, nchunk, chunk): cdef const uint8_t[:] typed_view_chunk mem_view_chunk = memoryview(chunk) diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 4ec139c40..5526a7f20 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1918,8 +1918,9 @@ def ndarray_from_cframe(cframe: bytes | str, copy: bool = False) -> blosc2.NDArr def from_cframe( cframe: bytes | str, copy: bool = True -) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk: - """Create a :ref:`EmbedStore `, :ref:`NDArray ` or :ref:`SChunk ` instance +) -> blosc2.EmbedStore | blosc2.NDArray | blosc2.SChunk | blosc2.VLArray: + """Create a :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk ` + or :ref:`VLArray ` instance from a contiguous frame buffer. Parameters @@ -1936,7 +1937,8 @@ def from_cframe( Returns ------- - out: :ref:`EmbedStore `, :ref:`NDArray ` or :ref:`SChunk ` + out: :ref:`EmbedStore `, :ref:`NDArray `, :ref:`SChunk ` + or :ref:`VLArray ` A new instance of the appropriate type containing the data passed. See Also @@ -1950,6 +1952,8 @@ def from_cframe( # Check the metalayer to determine the type if "b2embed" in schunk.meta: return blosc2.estore_from_cframe(cframe, copy=copy) + if "vlarray" in schunk.meta: + return blosc2.vlarray_from_cframe(cframe, copy=copy) if "b2nd" in schunk.meta: return ndarray_from_cframe(cframe, copy=copy) return schunk_from_cframe(cframe, copy=copy) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index cd2dbe9d4..3e0014451 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -801,6 +801,27 @@ def insert_data(self, nchunk: int, data: object, copy: bool) -> int: blosc2_ext.check_access_mode(self.urlpath, self.mode) return super().insert_data(nchunk, data, copy) + def append_chunk(self, chunk: bytes) -> int: + """Append a compressed chunk to the end of the SChunk. + + Parameters + ---------- + chunk: bytes object + The compressed chunk to append. + + Returns + ------- + out: int + The number of chunks in the SChunk. + + Raises + ------ + RuntimeError + If the chunk could not be appended. + """ + blosc2_ext.check_access_mode(self.urlpath, self.mode) + return super().append_chunk(chunk) + def update_chunk(self, nchunk: int, chunk: bytes) -> int: """Update an existing chunk in the SChunk. @@ -1603,6 +1624,11 @@ def _process_opened_object(res): elif not proxy_src["caterva2_env"]: raise RuntimeError("Could not find the source when opening a Proxy") + if "vlarray" in meta: + from blosc2.vlarray import VLArray + + return VLArray(_from_schunk=getattr(res, "schunk", res)) + if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: return blosc2._open_lazyarray(res) else: @@ -1614,6 +1640,7 @@ def open( ) -> ( blosc2.SChunk | blosc2.NDArray + | blosc2.VLArray | blosc2.C2Array | blosc2.LazyArray | blosc2.Proxy diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py new file mode 100644 index 000000000..4d1721374 --- /dev/null +++ b/src/blosc2/vlarray.py @@ -0,0 +1,224 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import copy +import pathlib +from typing import TYPE_CHECKING, Any + +from msgpack import packb, unpackb + +import blosc2 +from blosc2 import blosc2_ext + +if TYPE_CHECKING: + from collections.abc import Iterator + + from blosc2.schunk import SChunk + +_VLARRAY_META = {"version": 1, "serializer": "msgpack"} + + +def _check_serialized_size(buffer: bytes) -> None: + if len(buffer) > blosc2.MAX_BUFFERSIZE: + raise ValueError(f"Serialized objects cannot be larger than {blosc2.MAX_BUFFERSIZE} bytes") + + +class VLArray: + """A variable-length array backed by an :class:`blosc2.SChunk`.""" + + @staticmethod + def _set_typesize_one(cparams: blosc2.CParams | dict | None) -> blosc2.CParams | dict: + if cparams is None: + cparams = blosc2.CParams() + elif isinstance(cparams, blosc2.CParams): + cparams = copy.deepcopy(cparams) + else: + cparams = dict(cparams) + + if isinstance(cparams, blosc2.CParams): + cparams.typesize = 1 + else: + cparams["typesize"] = 1 + return cparams + + @staticmethod + def _coerce_storage(storage: blosc2.Storage | dict | None, kwargs: dict[str, Any]) -> blosc2.Storage: + if storage is not None: + storage_keys = set(blosc2.Storage.__annotations__) + storage_kwargs = storage_keys.intersection(kwargs) + if storage_kwargs: + unexpected = ", ".join(sorted(storage_kwargs)) + raise AttributeError( + f"Cannot pass both `storage` and other kwargs already included in Storage: {unexpected}" + ) + if isinstance(storage, blosc2.Storage): + return copy.deepcopy(storage) + return blosc2.Storage(**storage) + + storage_kwargs = { + name: kwargs.pop(name) for name in list(blosc2.Storage.__annotations__) if name in kwargs + } + return blosc2.Storage(**storage_kwargs) + + @staticmethod + def _validate_storage(storage: blosc2.Storage) -> None: + if storage.mmap_mode not in (None, "r"): + raise ValueError("For VLArray containers, mmap_mode must be None or 'r'") + if storage.mmap_mode == "r" and storage.mode != "r": + raise ValueError("For VLArray containers, mmap_mode='r' requires mode='r'") + + def _attach_schunk(self, schunk: SChunk) -> None: + self.schunk = schunk + self.urlpath = schunk.urlpath + self.mode = schunk.mode + self.mmap_mode = getattr(schunk, "mmap_mode", None) + self._validate_tag() + + def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: + urlpath = storage.urlpath + if urlpath is None or storage.mode not in ("r", "a") or not pathlib.Path(urlpath).exists(): + return False + + schunk = blosc2.blosc2_ext.open(urlpath, mode=storage.mode, offset=0, mmap_mode=storage.mmap_mode) + self._attach_schunk(schunk) + return True + + def __init__( + self, + chunksize: int | None = None, + _from_schunk: SChunk | None = None, + **kwargs: Any, + ) -> None: + if _from_schunk is not None: + if chunksize is not None: + raise ValueError("Cannot pass `chunksize` together with `_from_schunk`") + if kwargs: + unexpected = ", ".join(sorted(kwargs)) + raise ValueError(f"Cannot pass {unexpected} together with `_from_schunk`") + self._attach_schunk(_from_schunk) + return + + cparams = kwargs.pop("cparams", None) + dparams = kwargs.pop("dparams", None) + storage = kwargs.pop("storage", None) + storage = self._coerce_storage(storage, kwargs) + + if kwargs: + unexpected = ", ".join(sorted(kwargs)) + raise ValueError(f"Unsupported VLArray keyword argument(s): {unexpected}") + + self._validate_storage(storage) + cparams = self._set_typesize_one(cparams) + + if dparams is None: + dparams = blosc2.DParams() + + if self._maybe_open_existing(storage): + return + + fixed_meta = dict(storage.meta or {}) + fixed_meta["vlarray"] = dict(_VLARRAY_META) + storage.meta = fixed_meta + if chunksize is None: + chunksize = -1 + schunk = blosc2.SChunk( + chunksize=chunksize, data=None, cparams=cparams, dparams=dparams, storage=storage + ) + self._attach_schunk(schunk) + + def _validate_tag(self) -> None: + if "vlarray" not in self.schunk.meta: + raise ValueError("The supplied SChunk is not tagged as a VLArray") + + def _check_writable(self) -> None: + if self.mode == "r": + raise ValueError("Cannot modify a VLArray opened in read-only mode") + + def _normalize_index(self, index: int) -> int: + if not isinstance(index, int): + raise TypeError("VLArray indices must be integers") + if index < 0: + index += len(self) + if index < 0 or index >= len(self): + raise IndexError("VLArray index out of range") + return index + + def _serialize(self, value: Any) -> bytes: + payload = packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) + _check_serialized_size(payload) + return payload + + def _compress(self, payload: bytes) -> bytes: + return blosc2.compress2(payload, cparams=self.schunk.cparams) + + def append(self, value: Any) -> int: + self._check_writable() + chunk = self._compress(self._serialize(value)) + return self.schunk.append_chunk(chunk) + + def __getitem__(self, index: int) -> Any: + if isinstance(index, slice): + raise NotImplementedError("Slicing is not supported for VLArray") + index = self._normalize_index(index) + payload = self.schunk.decompress_chunk(index) + return unpackb(payload, list_hook=blosc2_ext.decode_tuple) + + def __setitem__(self, index: int, value: Any) -> None: + if isinstance(index, slice): + raise NotImplementedError("Slicing is not supported for VLArray") + self._check_writable() + index = self._normalize_index(index) + chunk = self._compress(self._serialize(value)) + self.schunk.update_chunk(index, chunk) + + def __len__(self) -> int: + return self.schunk.nchunks + + def __iter__(self) -> Iterator[Any]: + for i in range(len(self)): + yield self[i] + + @property + def meta(self): + return self.schunk.meta + + @property + def vlmeta(self): + return self.schunk.vlmeta + + @property + def cparams(self): + return self.schunk.cparams + + @property + def dparams(self): + return self.schunk.dparams + + @property + def chunksize(self) -> int: + return self.schunk.chunksize + + def to_cframe(self) -> bytes: + return self.schunk.to_cframe() + + def __enter__(self) -> VLArray: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + return False + + def __repr__(self) -> str: + return f"VLArray(len={len(self)}, urlpath={self.urlpath!r})" + + +def vlarray_from_cframe(cframe: bytes, copy: bool = False) -> VLArray: + """Deserialize a CFrame buffer into a :class:`VLArray`.""" + + schunk = blosc2.schunk_from_cframe(cframe, copy=copy) + return VLArray(_from_schunk=schunk) diff --git a/tests/test_schunk_update.py b/tests/test_schunk_update.py index 2f3a8b7be..4115c6b0b 100644 --- a/tests/test_schunk_update.py +++ b/tests/test_schunk_update.py @@ -108,3 +108,38 @@ def test_update(contiguous, urlpath, nchunks, nupdates, copy, create_chunk, gil) for i in range(nchunks): schunk.decompress_chunk(i) blosc2.remove_urlpath(urlpath) + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_variable_append_chunk.b2frame"), + (False, "test_variable_append_chunk_s.b2frame"), + ], +) +def test_append_chunk_variable_sizes(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + schunk = blosc2.SChunk(chunksize=-1, contiguous=contiguous, urlpath=urlpath, cparams={"typesize": 1}) + payloads = [b"a" * 13, b"b" * 29, b"c" * 41] + + for i, payload in enumerate(payloads, start=1): + chunk = blosc2.compress2(payload, typesize=1) + assert schunk.append_chunk(chunk) == i + assert schunk.decompress_chunk(i - 1) == payload + + assert schunk.chunksize == 0 + + replacement = b"z" * 17 + schunk.update_chunk(1, blosc2.compress2(replacement, typesize=1)) + expected = [payloads[0], replacement, payloads[2]] + assert [schunk.decompress_chunk(i) for i in range(schunk.nchunks)] == expected + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert reopened.chunksize == 0 + assert [reopened.decompress_chunk(i) for i in range(reopened.nchunks)] == expected + + blosc2.remove_urlpath(urlpath) diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py new file mode 100644 index 000000000..447d33d72 --- /dev/null +++ b/tests/test_vlarray.py @@ -0,0 +1,112 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import pytest + +import blosc2 + +VALUES = [ + b"bytes\x00payload", + "plain text", + 42, + 3.5, + True, + None, + [1, "two", b"three"], + (1, 2, "three"), + {"nested": [1, 2], "tuple": (3, 4)}, +] + + +def _storage(contiguous, urlpath, mode="w"): + return blosc2.Storage(contiguous=contiguous, urlpath=urlpath, mode=mode) + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_vlarray.b2frame"), + (False, "test_vlarray_s.b2frame"), + ], +) +def test_vlarray_roundtrip(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + vlarray = blosc2.VLArray(storage=_storage(contiguous, urlpath)) + assert vlarray.meta["vlarray"]["serializer"] == "msgpack" + + for i, value in enumerate(VALUES, start=1): + assert vlarray.append(value) == i + + assert len(vlarray) == len(VALUES) + assert list(vlarray) == VALUES + assert vlarray[-1] == VALUES[-1] + + expected = list(VALUES) + expected[1] = {"updated": ("tuple", 7)} + expected[-1] = "tiny" + vlarray[1] = expected[1] + vlarray[-1] = expected[-1] + assert list(vlarray) == expected + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert isinstance(reopened, blosc2.VLArray) + assert list(reopened) == expected + with pytest.raises(ValueError): + reopened.append("nope") + with pytest.raises(ValueError): + reopened[0] = "nope" + + reopened_rw = blosc2.open(urlpath, mode="a") + reopened_rw[0] = "changed" + expected[0] = "changed" + assert list(reopened_rw) == expected + + if contiguous: + reopened_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") + assert isinstance(reopened_mmap, blosc2.VLArray) + assert list(reopened_mmap) == expected + + blosc2.remove_urlpath(urlpath) + + +def test_vlarray_from_cframe(): + vlarray = blosc2.VLArray() + for value in VALUES[:4]: + vlarray.append(value) + + restored = blosc2.from_cframe(vlarray.to_cframe()) + assert isinstance(restored, blosc2.VLArray) + assert list(restored) == VALUES[:4] + + restored2 = blosc2.vlarray_from_cframe(vlarray.to_cframe()) + assert isinstance(restored2, blosc2.VLArray) + assert list(restored2) == VALUES[:4] + + +def test_vlarray_constructor_kwargs(): + urlpath = "test_vlarray_kwargs.b2frame" + blosc2.remove_urlpath(urlpath) + + vlarray = blosc2.VLArray(urlpath=urlpath, mode="w", contiguous=True) + for value in VALUES[:3]: + vlarray.append(value) + + reopened = blosc2.VLArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") + assert list(reopened) == VALUES[:3] + + blosc2.remove_urlpath(urlpath) + + +def test_vlarray_size_guard(monkeypatch): + vlarray = blosc2.VLArray() + monkeypatch.setattr(blosc2, "MAX_BUFFERSIZE", 4) + with pytest.raises(ValueError, match="Serialized objects cannot be larger"): + vlarray.append("payload") From aedf2b3243b8874ed5b28f28aed5faa5fc66ae70 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 17:38:22 +0100 Subject: [PATCH 2/7] Add several list-oriented methods (insert, delete, pop...); docs are here too --- doc/reference/classes.rst | 2 + doc/reference/misc.rst | 2 + src/blosc2/vlarray.py | 72 ++++++++++++++++++++++++++++++++ tests/test_vlarray.py | 87 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 157 insertions(+), 6 deletions(-) diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst index cca8c7e07..84af533ce 100644 --- a/doc/reference/classes.rst +++ b/doc/reference/classes.rst @@ -16,6 +16,7 @@ Main Classes DictStore TreeStore EmbedStore + VLArray Proxy ProxySource ProxyNDSource @@ -33,6 +34,7 @@ Main Classes dict_store tree_store embed_store + vlarray proxy proxysource proxyndsource diff --git a/doc/reference/misc.rst b/doc/reference/misc.rst index 50cb0c1b1..b6e0ddee6 100644 --- a/doc/reference/misc.rst +++ b/doc/reference/misc.rst @@ -134,6 +134,8 @@ This page documents the miscellaneous members of the ``blosc2`` module that do n TreeStore, DictStore, EmbedStore, + VLArray, + vlarray_from_cframe, abs, acos, acosh, diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py index 4d1721374..2677b32a8 100644 --- a/src/blosc2/vlarray.py +++ b/src/blosc2/vlarray.py @@ -89,6 +89,16 @@ def _maybe_open_existing(self, storage: blosc2.Storage) -> bool: self._attach_schunk(schunk) return True + def _make_storage(self) -> blosc2.Storage: + meta = {name: self.meta[name] for name in self.meta} + return blosc2.Storage( + contiguous=self.schunk.contiguous, + urlpath=self.urlpath, + mode=self.mode, + mmap_mode=self.mmap_mode, + meta=meta, + ) + def __init__( self, chunksize: int | None = None, @@ -149,6 +159,17 @@ def _normalize_index(self, index: int) -> int: raise IndexError("VLArray index out of range") return index + def _normalize_insert_index(self, index: int) -> int: + if not isinstance(index, int): + raise TypeError("VLArray indices must be integers") + if index < 0: + index += len(self) + if index < 0: + return 0 + if index > len(self): + return len(self) + return index + def _serialize(self, value: Any) -> bytes: payload = packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) _check_serialized_size(payload) @@ -158,10 +179,58 @@ def _compress(self, payload: bytes) -> bytes: return blosc2.compress2(payload, cparams=self.schunk.cparams) def append(self, value: Any) -> int: + """Append one value and return the new number of entries.""" self._check_writable() chunk = self._compress(self._serialize(value)) return self.schunk.append_chunk(chunk) + def insert(self, index: int, value: Any) -> int: + """Insert one value at ``index`` and return the new number of entries.""" + self._check_writable() + index = self._normalize_insert_index(index) + chunk = self._compress(self._serialize(value)) + return self.schunk.insert_chunk(index, chunk) + + def delete(self, index: int) -> int: + """Delete the value at ``index`` and return the new number of entries.""" + self._check_writable() + if isinstance(index, slice): + raise NotImplementedError("Slicing is not supported for VLArray") + index = self._normalize_index(index) + return self.schunk.delete_chunk(index) + + def pop(self, index: int = -1) -> Any: + """Remove and return the value at ``index``.""" + self._check_writable() + if isinstance(index, slice): + raise NotImplementedError("Slicing is not supported for VLArray") + index = self._normalize_index(index) + value = self[index] + self.schunk.delete_chunk(index) + return value + + def extend(self, values: object) -> None: + """Append all values from an iterable.""" + self._check_writable() + for value in values: + chunk = self._compress(self._serialize(value)) + self.schunk.append_chunk(chunk) + + def clear(self) -> None: + """Remove all entries from the container.""" + self._check_writable() + storage = self._make_storage() + if storage.urlpath is not None: + blosc2.remove_urlpath(storage.urlpath) + schunk = blosc2.SChunk( + chunksize=-1, + data=None, + cparams=copy.deepcopy(self.cparams), + dparams=copy.deepcopy(self.dparams), + storage=storage, + ) + self._attach_schunk(schunk) + def __getitem__(self, index: int) -> Any: if isinstance(index, slice): raise NotImplementedError("Slicing is not supported for VLArray") @@ -177,6 +246,9 @@ def __setitem__(self, index: int, value: Any) -> None: chunk = self._compress(self._serialize(value)) self.schunk.update_chunk(index, chunk) + def __delitem__(self, index: int) -> None: + self.delete(index) + def __len__(self) -> int: return self.schunk.nchunks diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index 447d33d72..bb643473a 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -53,6 +53,16 @@ def test_vlarray_roundtrip(contiguous, urlpath): expected[-1] = "tiny" vlarray[1] = expected[1] vlarray[-1] = expected[-1] + assert vlarray.insert(0, "head") == len(expected) + 1 + expected.insert(0, "head") + assert vlarray.insert(-1, {"between": 5}) == len(expected) + 1 + expected.insert(-1, {"between": 5}) + assert vlarray.insert(999, "tail") == len(expected) + 1 + expected.insert(999, "tail") + assert vlarray.delete(2) == len(expected) - 1 + del expected[2] + del vlarray[-2] + del expected[-2] assert list(vlarray) == expected if urlpath is not None: @@ -63,6 +73,18 @@ def test_vlarray_roundtrip(contiguous, urlpath): reopened.append("nope") with pytest.raises(ValueError): reopened[0] = "nope" + with pytest.raises(ValueError): + reopened.insert(0, "nope") + with pytest.raises(ValueError): + reopened.delete(0) + with pytest.raises(ValueError): + del reopened[0] + with pytest.raises(ValueError): + reopened.extend(["nope"]) + with pytest.raises(ValueError): + reopened.pop() + with pytest.raises(ValueError): + reopened.clear() reopened_rw = blosc2.open(urlpath, mode="a") reopened_rw[0] = "changed" @@ -79,16 +101,20 @@ def test_vlarray_roundtrip(contiguous, urlpath): def test_vlarray_from_cframe(): vlarray = blosc2.VLArray() - for value in VALUES[:4]: - vlarray.append(value) + vlarray.extend(VALUES) + vlarray.insert(1, {"inserted": True}) + del vlarray[3] + expected = list(VALUES) + expected.insert(1, {"inserted": True}) + del expected[3] restored = blosc2.from_cframe(vlarray.to_cframe()) assert isinstance(restored, blosc2.VLArray) - assert list(restored) == VALUES[:4] + assert list(restored) == expected restored2 = blosc2.vlarray_from_cframe(vlarray.to_cframe()) assert isinstance(restored2, blosc2.VLArray) - assert list(restored2) == VALUES[:4] + assert list(restored2) == expected def test_vlarray_constructor_kwargs(): @@ -96,11 +122,11 @@ def test_vlarray_constructor_kwargs(): blosc2.remove_urlpath(urlpath) vlarray = blosc2.VLArray(urlpath=urlpath, mode="w", contiguous=True) - for value in VALUES[:3]: + for value in VALUES: vlarray.append(value) reopened = blosc2.VLArray(urlpath=urlpath, mode="r", contiguous=True, mmap_mode="r") - assert list(reopened) == VALUES[:3] + assert list(reopened) == VALUES blosc2.remove_urlpath(urlpath) @@ -110,3 +136,52 @@ def test_vlarray_size_guard(monkeypatch): monkeypatch.setattr(blosc2, "MAX_BUFFERSIZE", 4) with pytest.raises(ValueError, match="Serialized objects cannot be larger"): vlarray.append("payload") + + +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_vlarray_list_ops.b2frame"), + (False, "test_vlarray_list_ops_s.b2frame"), + ], +) +def test_vlarray_list_like_ops(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + vlarray = blosc2.VLArray(storage=_storage(contiguous, urlpath)) + vlarray.extend([1, 2, 3]) + assert list(vlarray) == [1, 2, 3] + assert vlarray.pop() == 3 + assert vlarray.pop(0) == 1 + assert list(vlarray) == [2] + + vlarray.clear() + assert len(vlarray) == 0 + assert list(vlarray) == [] + + vlarray.extend(["a", "b"]) + assert list(vlarray) == ["a", "b"] + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert list(reopened) == ["a", "b"] + + blosc2.remove_urlpath(urlpath) + + +def test_vlarray_insert_delete_errors(): + vlarray = blosc2.VLArray() + vlarray.append("value") + + with pytest.raises(TypeError): + vlarray.insert("0", "bad") + with pytest.raises(IndexError): + vlarray.delete(3) + with pytest.raises(NotImplementedError): + vlarray.delete(slice(0, 1)) + with pytest.raises(IndexError): + blosc2.VLArray().pop() + with pytest.raises(NotImplementedError): + vlarray.pop(slice(0, 1)) From 004dfb59f2e3868918b63ca51efd045cdd13525b Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 18:00:11 +0100 Subject: [PATCH 3/7] Add example and tutorial --- doc/getting_started/tutorials.rst | 1 + .../tutorials/11.vlarray.ipynb | 325 ++++++++++++++++++ examples/vlarray.py | 69 ++++ src/blosc2/vlarray.py | 52 ++- tests/test_vlarray.py | 86 ++++- 5 files changed, 528 insertions(+), 5 deletions(-) create mode 100644 doc/getting_started/tutorials/11.vlarray.ipynb create mode 100644 examples/vlarray.py diff --git a/doc/getting_started/tutorials.rst b/doc/getting_started/tutorials.rst index 35f347d40..563ba8ea6 100644 --- a/doc/getting_started/tutorials.rst +++ b/doc/getting_started/tutorials.rst @@ -16,3 +16,4 @@ Tutorials tutorials/08.schunk-slicing_and_beyond tutorials/09.ucodecs-ufilters tutorials/10.prefilters + tutorials/11.vlarray diff --git a/doc/getting_started/tutorials/11.vlarray.ipynb b/doc/getting_started/tutorials/11.vlarray.ipynb new file mode 100644 index 000000000..e78733fb3 --- /dev/null +++ b/doc/getting_started/tutorials/11.vlarray.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working with VLArray\n", + "\n", + "A `VLArray` is a list-like container for variable-length Python values backed by a single `SChunk`. Each entry is stored in its own compressed chunk, and values are serialized with msgpack before reaching storage.\n", + "\n", + "This makes `VLArray` a good fit for heterogeneous, variable-length payloads such as small dictionaries, strings, tuples, byte blobs, or nested list/dict structures." + ], + "id": "ceb4789a488cc07f" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.563663Z", + "start_time": "2026-03-14T16:57:57.294290Z" + } + }, + "source": [ + "import blosc2\n", + "\n", + "\n", + "def show(label, value):\n", + " print(f\"{label}: {value}\")\n", + "\n", + "\n", + "urlpath = \"vlarray_tutorial.b2frame\"\n", + "copy_path = \"vlarray_tutorial_copy.b2frame\"\n", + "blosc2.remove_urlpath(urlpath)\n", + "blosc2.remove_urlpath(copy_path)" + ], + "id": "f264f2e4bcb57029", + "outputs": [], + "execution_count": 1 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating and populating a VLArray\n", + "\n", + "Entries can be appended one by one or in batches with `extend()`. The container accepts the msgpack-safe Python types supported by the implementation: `bytes`, `str`, `int`, `float`, `bool`, `None`, `list`, `tuple`, and `dict`." + ], + "id": "24ceae332dfa437" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.609603Z", + "start_time": "2026-03-14T16:57:57.569987Z" + } + }, + "source": [ + "vla = blosc2.VLArray(urlpath=urlpath, mode=\"w\")\n", + "vla.append({\"name\": \"alpha\", \"count\": 1})\n", + "vla.extend([b\"bytes\", (\"a\", 2), [\"x\", \"y\"], 42, None])\n", + "vla.insert(1, \"between\")\n", + "\n", + "show(\"Initial entries\", list(vla))\n", + "show(\"Length\", len(vla))" + ], + "id": "10e4e9ce600cda9d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial entries: [{'name': 'alpha', 'count': 1}, 'between', b'bytes', ('a', 2), ['x', 'y'], 42, None]\n", + "Length: 7\n" + ] + } + ], + "execution_count": 2 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing and slicing\n", + "\n", + "Indexing behaves like a Python list. Negative indexes are supported, and slice reads return a plain Python list." + ], + "id": "2f2dbe81b7653d8f" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.677796Z", + "start_time": "2026-03-14T16:57:57.623048Z" + } + }, + "source": [ + "show(\"Last entry\", vla[-1])\n", + "show(\"Slice [1:6:2]\", vla[1:6:2])\n", + "show(\"Reverse slice\", vla[::-2])" + ], + "id": "82ea38dca631efb9", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last entry: None\n", + "Slice [1:6:2]: ['between', ('a', 2), 42]\n", + "Reverse slice: [None, ['x', 'y'], b'bytes', {'name': 'alpha', 'count': 1}]\n" + ] + } + ], + "execution_count": 3 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Updating, inserting, and deleting\n", + "\n", + "Single entries can be overwritten by index. Slice assignment follows Python list rules: slices with `step == 1` may resize the container, while extended slices require matching lengths." + ], + "id": "a871bb9b21d6f36c" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.727569Z", + "start_time": "2026-03-14T16:57:57.678936Z" + } + }, + "source": [ + "vla[2:5] = [\"replaced\", {\"nested\": True}]\n", + "show(\"After slice replacement\", list(vla))\n", + "\n", + "vla[::2] = [\"even-0\", \"even-1\", \"even-2\"]\n", + "show(\"After extended-slice update\", list(vla))\n", + "\n", + "del vla[1::3]\n", + "show(\"After slice deletion\", list(vla))\n", + "\n", + "removed = vla.pop()\n", + "show(\"Popped entry\", removed)\n", + "show(\"After pop\", list(vla))" + ], + "id": "e22e4f90499ae02", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After slice replacement: [{'name': 'alpha', 'count': 1}, 'between', 'replaced', {'nested': True}, 42, None]\n", + "After extended-slice update: ['even-0', 'between', 'even-1', {'nested': True}, 'even-2', None]\n", + "After slice deletion: ['even-0', 'even-1', {'nested': True}, None]\n", + "Popped entry: None\n", + "After pop: ['even-0', 'even-1', {'nested': True}]\n" + ] + } + ], + "execution_count": 4 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Copying with new storage or compression parameters\n", + "\n", + "The `copy()` method can duplicate the container into a different storage layout or with different compression settings." + ], + "id": "f41af458cb5faa9f" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.747309Z", + "start_time": "2026-03-14T16:57:57.730015Z" + } + }, + "source": [ + "vla_copy = vla.copy(\n", + " urlpath=copy_path,\n", + " contiguous=False,\n", + " cparams={\"codec\": blosc2.Codec.LZ4, \"clevel\": 5},\n", + ")\n", + "\n", + "show(\"Copied entries\", list(vla_copy))\n", + "show(\"Copy storage is contiguous\", vla_copy.schunk.contiguous)\n", + "show(\"Copy codec\", vla_copy.cparams.codec)" + ], + "id": "6e752260e010272e", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Copied entries: ['even-0', 'even-1', {'nested': True}]\n", + "Copy storage is contiguous: False\n", + "Copy codec: Codec.LZ4\n" + ] + } + ], + "execution_count": 5 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Round-tripping through cframes and reopening from disk\n", + "\n", + "Tagged persistent stores automatically reopen as `VLArray`, and a serialized cframe buffer does too." + ], + "id": "bb576497d4b6f537" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.759998Z", + "start_time": "2026-03-14T16:57:57.748296Z" + } + }, + "source": [ + "cframe = vla.to_cframe()\n", + "restored = blosc2.from_cframe(cframe)\n", + "show(\"from_cframe type\", type(restored).__name__)\n", + "show(\"from_cframe entries\", list(restored))\n", + "\n", + "reopened = blosc2.open(urlpath, mode=\"r\", mmap_mode=\"r\")\n", + "show(\"Reopened type\", type(reopened).__name__)\n", + "show(\"Reopened entries\", list(reopened))" + ], + "id": "42d59dccf6ea9c44", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from_cframe type: VLArray\n", + "from_cframe entries: ['even-0', 'even-1', {'nested': True}]\n", + "Reopened type: VLArray\n", + "Reopened entries: ['even-0', 'even-1', {'nested': True}]\n" + ] + } + ], + "execution_count": 6 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clearing and reusing a container\n", + "\n", + "Calling `clear()` resets the backing storage so the container remains ready for new variable-length entries." + ], + "id": "53778312cc1a03bc" + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.778160Z", + "start_time": "2026-03-14T16:57:57.761236Z" + } + }, + "source": [ + "scratch = vla.copy()\n", + "scratch.clear()\n", + "scratch.extend([\"fresh\", 123, {\"done\": True}])\n", + "show(\"After clear + extend on in-memory copy\", list(scratch))\n", + "\n", + "blosc2.remove_urlpath(urlpath)\n", + "blosc2.remove_urlpath(copy_path)" + ], + "id": "55b9ea793a41f38a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After clear + extend on in-memory copy: ['fresh', 123, {'done': True}]\n" + ] + } + ], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-14T16:57:57.789994Z", + "start_time": "2026-03-14T16:57:57.779434Z" + } + }, + "cell_type": "code", + "source": "", + "id": "34e77790ab2a0f94", + "outputs": [], + "execution_count": 7 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vlarray.py b/examples/vlarray.py new file mode 100644 index 000000000..0e988c83b --- /dev/null +++ b/examples/vlarray.py @@ -0,0 +1,69 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import blosc2 + + +def show(label, value): + print(f"{label}: {value}") + + +urlpath = "example_vlarray.b2frame" +copy_path = "example_vlarray_copy.b2frame" +blosc2.remove_urlpath(urlpath) +blosc2.remove_urlpath(copy_path) + +# Create a persistent VLArray and store heterogeneous Python values. +vla = blosc2.VLArray(urlpath=urlpath, mode="w", contiguous=True) +vla.append({"name": "alpha", "count": 1}) +vla.extend([b"bytes", ("a", 2), ["x", "y"], 42, None]) +vla.insert(1, "between") + +show("Initial entries", list(vla)) +show("Negative index", vla[-1]) +show("Slice [1:6:2]", vla[1:6:2]) + +# Slice assignment with step == 1 can resize the container. +vla[2:5] = ["replaced", {"nested": True}] +show("After slice replacement", list(vla)) + +# Extended slices require matching lengths. +vla[::2] = ["even-0", "even-1", "even-2"] +show("After extended-slice update", list(vla)) + +# Delete by index, by slice, or with pop(). +del vla[1::3] +show("After slice deletion", list(vla)) +removed = vla.pop() +show("Popped entry", removed) +show("After pop", list(vla)) + +# Copy into a different backing store and with different compression parameters. +vla_copy = vla.copy(urlpath=copy_path, contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5}) +show("Copied entries", list(vla_copy)) +show("Copy storage is contiguous", vla_copy.schunk.contiguous) +show("Copy codec", vla_copy.cparams.codec) + +# Round-trip through a cframe buffer. +cframe = vla.to_cframe() +restored = blosc2.from_cframe(cframe) +show("from_cframe type", type(restored).__name__) +show("from_cframe entries", list(restored)) + +# Reopen from disk; tagged stores come back as VLArray. +reopened = blosc2.open(urlpath, mode="r", mmap_mode="r") +show("Reopened type", type(reopened).__name__) +show("Reopened entries", list(reopened)) + +# Clear and reuse an in-memory copy. +scratch = vla.copy() +scratch.clear() +scratch.extend(["fresh", 123, {"done": True}]) +show("After clear + extend on in-memory copy", list(scratch)) + +blosc2.remove_urlpath(urlpath) +blosc2.remove_urlpath(copy_path) diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py index 2677b32a8..556b31ed7 100644 --- a/src/blosc2/vlarray.py +++ b/src/blosc2/vlarray.py @@ -170,6 +170,12 @@ def _normalize_insert_index(self, index: int) -> int: return len(self) return index + def _slice_indices(self, index: slice) -> list[int]: + return list(range(*index.indices(len(self)))) + + def _copy_meta(self) -> dict[str, Any]: + return {name: self.meta[name] for name in self.meta} + def _serialize(self, value: Any) -> bytes: payload = packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) _check_serialized_size(payload) @@ -195,7 +201,9 @@ def delete(self, index: int) -> int: """Delete the value at ``index`` and return the new number of entries.""" self._check_writable() if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for VLArray") + for idx in reversed(self._slice_indices(index)): + self.schunk.delete_chunk(idx) + return len(self) index = self._normalize_index(index) return self.schunk.delete_chunk(index) @@ -233,14 +241,33 @@ def clear(self) -> None: def __getitem__(self, index: int) -> Any: if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for VLArray") + return [self[i] for i in self._slice_indices(index)] index = self._normalize_index(index) payload = self.schunk.decompress_chunk(index) return unpackb(payload, list_hook=blosc2_ext.decode_tuple) def __setitem__(self, index: int, value: Any) -> None: if isinstance(index, slice): - raise NotImplementedError("Slicing is not supported for VLArray") + self._check_writable() + indices = self._slice_indices(index) + values = list(value) + step = 1 if index.step is None else index.step + if step == 1: + start = self._normalize_insert_index(0 if index.start is None else index.start) + for idx in reversed(indices): + self.schunk.delete_chunk(idx) + for offset, item in enumerate(values): + chunk = self._compress(self._serialize(item)) + self.schunk.insert_chunk(start + offset, chunk) + return + if len(values) != len(indices): + raise ValueError( + f"attempt to assign sequence of size {len(values)} to extended slice of size {len(indices)}" + ) + for idx, item in zip(indices, values, strict=True): + chunk = self._compress(self._serialize(item)) + self.schunk.update_chunk(idx, chunk) + return self._check_writable() index = self._normalize_index(index) chunk = self._compress(self._serialize(value)) @@ -279,6 +306,25 @@ def chunksize(self) -> int: def to_cframe(self) -> bytes: return self.schunk.to_cframe() + def copy(self, **kwargs: Any) -> VLArray: + """Create a copy of the container with optional constructor overrides.""" + if "meta" in kwargs: + raise ValueError("meta should not be passed to copy") + + kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams)) + kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams)) + kwargs["chunksize"] = kwargs.get("chunksize", -1) + + if "storage" not in kwargs: + kwargs["meta"] = self._copy_meta() + kwargs["contiguous"] = kwargs.get("contiguous", self.schunk.contiguous) + if "urlpath" in kwargs and "mode" not in kwargs: + kwargs["mode"] = "w" + + out = VLArray(**kwargs) + out.extend(self) + return out + def __enter__(self) -> VLArray: return self diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index bb643473a..6f67c500d 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -171,6 +171,90 @@ def test_vlarray_list_like_ops(contiguous, urlpath): blosc2.remove_urlpath(urlpath) +@pytest.mark.parametrize( + ("contiguous", "urlpath"), + [ + (False, None), + (True, None), + (True, "test_vlarray_slices.b2frame"), + (False, "test_vlarray_slices_s.b2frame"), + ], +) +def test_vlarray_slices(contiguous, urlpath): + blosc2.remove_urlpath(urlpath) + + expected = list(range(8)) + vlarray = blosc2.VLArray(storage=_storage(contiguous, urlpath)) + vlarray.extend(expected) + + assert vlarray[1:6:2] == expected[1:6:2] + assert vlarray[::-2] == expected[::-2] + + vlarray[2:5] = ["a", "b"] + expected[2:5] = ["a", "b"] + assert list(vlarray) == expected + + vlarray[1:6:2] = [100, 101, 102] + expected[1:6:2] = [100, 101, 102] + assert list(vlarray) == expected + + del vlarray[::3] + del expected[::3] + assert list(vlarray) == expected + + if urlpath is not None: + reopened = blosc2.open(urlpath, mode="r") + assert reopened[::2] == expected[::2] + with pytest.raises(ValueError): + reopened[1:3] = [9] + with pytest.raises(ValueError): + del reopened[::2] + + blosc2.remove_urlpath(urlpath) + + +def test_vlarray_slice_errors(): + vlarray = blosc2.VLArray() + vlarray.extend([0, 1, 2, 3]) + + with pytest.raises(ValueError, match="extended slice"): + vlarray[::2] = [9] + with pytest.raises(TypeError): + vlarray[1:2] = 3 + with pytest.raises(ValueError): + _ = vlarray[::0] + + +def test_vlarray_copy(): + urlpath = "test_vlarray_copy.b2frame" + copy_path = "test_vlarray_copy_out.b2frame" + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + original = blosc2.VLArray(urlpath=urlpath, mode="w", contiguous=True) + original.extend(VALUES) + original.insert(1, {"copy": True}) + + copied = original.copy( + urlpath=copy_path, contiguous=False, cparams={"codec": blosc2.Codec.LZ4, "clevel": 5} + ) + assert list(copied) == list(original) + assert copied.urlpath == copy_path + assert copied.schunk.contiguous is False + assert copied.cparams.codec == blosc2.Codec.LZ4 + assert copied.cparams.clevel == 5 + + inmem = original.copy() + assert list(inmem) == list(original) + assert inmem.urlpath is None + + with pytest.raises(ValueError, match="meta should not be passed to copy"): + original.copy(meta={}) + + blosc2.remove_urlpath(urlpath) + blosc2.remove_urlpath(copy_path) + + def test_vlarray_insert_delete_errors(): vlarray = blosc2.VLArray() vlarray.append("value") @@ -179,8 +263,6 @@ def test_vlarray_insert_delete_errors(): vlarray.insert("0", "bad") with pytest.raises(IndexError): vlarray.delete(3) - with pytest.raises(NotImplementedError): - vlarray.delete(slice(0, 1)) with pytest.raises(IndexError): blosc2.VLArray().pop() with pytest.raises(NotImplementedError): From 510ad85b8f85f95a591b16c9cc6669e9f639fc3b Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 18:08:53 +0100 Subject: [PATCH 4/7] Update to current C-Blosc2 main --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae63b85d0..b748c7942 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG ba55a6be9293faf9740f03b5953b82f1c955879e # variable-length chunks support in schunks + GIT_TAG 25197eb96d05318c939b3252a6b373ccd6ae49fe # variable-length chunks support in schunks # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From 5926a46e7a1c021b4ac0cc2862f65e828777c07a Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 19:11:11 +0100 Subject: [PATCH 5/7] Add support for empty lists in VLArray store --- src/blosc2/_msgpack_utils.py | 26 ++++++++++++++++++++++++++ src/blosc2/schunk.py | 18 +++++------------- src/blosc2/vlarray.py | 10 ++++------ tests/test_vlarray.py | 7 +++++++ tests/test_vlmeta.py | 9 +++++++++ 5 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 src/blosc2/_msgpack_utils.py diff --git a/src/blosc2/_msgpack_utils.py b/src/blosc2/_msgpack_utils.py new file mode 100644 index 000000000..fd179b840 --- /dev/null +++ b/src/blosc2/_msgpack_utils.py @@ -0,0 +1,26 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +from msgpack import packb, unpackb + +from blosc2 import blosc2_ext + + +def msgpack_packb(value): + return packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) + + +def decode_tuple_list_hook(obj): + if obj and obj[0] == "__tuple__": + return tuple(obj[1:]) + return obj + + +def msgpack_unpackb(payload): + return unpackb(payload, list_hook=decode_tuple_list_hook) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 3e0014451..5421ae41f 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -16,10 +16,10 @@ from typing import Any, NamedTuple import numpy as np -from msgpack import packb, unpackb import blosc2 from blosc2 import SpecialValue, blosc2_ext +from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb from blosc2.info import InfoReporter @@ -46,12 +46,7 @@ def __setitem__(self, name, content): return raise NotImplementedError("Slicing is not supported, unless [:]") cparams = {"typesize": 1} - content = packb( - content, - default=blosc2_ext.encode_tuple, - strict_types=True, - use_bin_type=True, - ) + content = msgpack_packb(content) super().set_vlmeta(name, content, **cparams) def __getitem__(self, name): @@ -60,7 +55,7 @@ def __getitem__(self, name): # Return all the vlmetalayers return self.getall() raise NotImplementedError("Slicing is not supported, unless [:]") - return unpackb(super().get_vlmeta(name), list_hook=blosc2_ext.decode_tuple) + return msgpack_unpackb(super().get_vlmeta(name)) def __delitem__(self, name): blosc2_ext.check_access_mode(self.urlpath, self.mode) @@ -120,7 +115,7 @@ def __setitem__(self, key: str, value: bytes) -> None: ..warning: Note that the *length* of the metalayer cannot change, otherwise an exception will be raised. """ - value = packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) + value = msgpack_packb(value) blosc2_ext.meta__setitem__(self.schunk, key, value) def __getitem__(self, item: str | slice) -> bytes | dict[str, bytes]: @@ -144,10 +139,7 @@ def __getitem__(self, item: str | slice) -> bytes | dict[str, bytes]: return self.getall() raise NotImplementedError("Slicing is not supported, unless [:]") if self.__contains__(item): - return unpackb( - blosc2_ext.meta__getitem__(self.schunk, item), - list_hook=blosc2_ext.decode_tuple, - ) + return msgpack_unpackb(blosc2_ext.meta__getitem__(self.schunk, item)) else: raise KeyError(f"{item} not found") diff --git a/src/blosc2/vlarray.py b/src/blosc2/vlarray.py index 556b31ed7..d7d885e2c 100644 --- a/src/blosc2/vlarray.py +++ b/src/blosc2/vlarray.py @@ -11,10 +11,8 @@ import pathlib from typing import TYPE_CHECKING, Any -from msgpack import packb, unpackb - import blosc2 -from blosc2 import blosc2_ext +from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb if TYPE_CHECKING: from collections.abc import Iterator @@ -177,7 +175,7 @@ def _copy_meta(self) -> dict[str, Any]: return {name: self.meta[name] for name in self.meta} def _serialize(self, value: Any) -> bytes: - payload = packb(value, default=blosc2_ext.encode_tuple, strict_types=True, use_bin_type=True) + payload = msgpack_packb(value) _check_serialized_size(payload) return payload @@ -244,7 +242,7 @@ def __getitem__(self, index: int) -> Any: return [self[i] for i in self._slice_indices(index)] index = self._normalize_index(index) payload = self.schunk.decompress_chunk(index) - return unpackb(payload, list_hook=blosc2_ext.decode_tuple) + return msgpack_unpackb(payload) def __setitem__(self, index: int, value: Any) -> None: if isinstance(index, slice): @@ -335,7 +333,7 @@ def __repr__(self) -> str: return f"VLArray(len={len(self)}, urlpath={self.urlpath!r})" -def vlarray_from_cframe(cframe: bytes, copy: bool = False) -> VLArray: +def vlarray_from_cframe(cframe: bytes, copy: bool = True) -> VLArray: """Deserialize a CFrame buffer into a :class:`VLArray`.""" schunk = blosc2.schunk_from_cframe(cframe, copy=copy) diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index 6f67c500d..84692d242 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -255,6 +255,13 @@ def test_vlarray_copy(): blosc2.remove_urlpath(copy_path) +def test_vlarray_empty_list_roundtrip(): + values = [[], {"a": []}, [[], ["nested"]], None, ("tuple", []), {"rows": [[], []]}] + vlarray = blosc2.VLArray() + vlarray.extend(values) + assert list(vlarray) == values + + def test_vlarray_insert_delete_errors(): vlarray = blosc2.VLArray() vlarray.append("value") diff --git a/tests/test_vlmeta.py b/tests/test_vlmeta.py index 8269f43c8..ec5f07849 100644 --- a/tests/test_vlmeta.py +++ b/tests/test_vlmeta.py @@ -118,3 +118,12 @@ def clear(schunk): schunk.vlmeta.clear() assert schunk.vlmeta.__len__() == 0 + + +def test_vlmeta_empty_list_roundtrip(): + schunk = blosc2.SChunk() + schunk.vlmeta["empty"] = [] + schunk.vlmeta["nested"] = {"rows": [[], ["x"]]} + + assert schunk.vlmeta["empty"] == [] + assert schunk.vlmeta["nested"] == {"rows": [[], ["x"]]} From 2ba2532addd2a1850ca62975b4954099b40792b7 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 14 Mar 2026 19:14:16 +0100 Subject: [PATCH 6/7] Better support for VLArrays in dict and tree stores --- src/blosc2/dict_store.py | 47 ++++++++++++++++++++++++++---------- src/blosc2/embed_store.py | 21 +++++++++------- src/blosc2/tree_store.py | 16 +++++++------ tests/test_dict_store.py | 43 +++++++++++++++++++++++++++++++++ tests/test_tree_store.py | 50 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 149 insertions(+), 28 deletions(-) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index 9ac3cf469..1cb6dd3c2 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -5,19 +5,23 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +from __future__ import annotations + import os import shutil import tempfile import zipfile -from collections.abc import Iterator, Set -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import blosc2 from blosc2.c2array import C2Array from blosc2.embed_store import EmbedStore -from blosc2.schunk import SChunk +from blosc2.schunk import SChunk, _process_opened_object + +if TYPE_CHECKING: + from collections.abc import Iterator, Set class DictStore: @@ -244,7 +248,25 @@ def estore(self) -> EmbedStore: """Access the underlying EmbedStore.""" return self._estore - def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: + @staticmethod + def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray) -> int: + if isinstance(value, blosc2.VLArray): + return value.schunk.nbytes + return value.nbytes + + @staticmethod + def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray) -> bool: + return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray)) and bool( + getattr(value, "urlpath", None) + ) + + @staticmethod + def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray) -> str: + if isinstance(value, blosc2.NDArray): + return ".b2nd" + return ".b2f" + + def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: """Add a node to the DictStore.""" if isinstance(value, np.ndarray): value = blosc2.asarray(value, cparams=self.cparams, dparams=self.dparams) @@ -252,12 +274,10 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: if isinstance(value, C2Array): self._estore[key] = value return - exceeds_threshold = self.threshold is not None and value.nbytes >= self.threshold - # Consider both NDArray and SChunk external files (have urlpath) - external_file = isinstance(value, (blosc2.NDArray, SChunk)) and getattr(value, "urlpath", None) + exceeds_threshold = self.threshold is not None and self._value_nbytes(value) >= self.threshold + external_file = self._is_external_value(value) if exceeds_threshold or (external_file and self.threshold is None): - # Choose extension based on type - ext = ".b2f" if isinstance(value, SChunk) else ".b2nd" + ext = self._external_ext(value) # Convert key to a proper file path within the tree directory rel_key = key.lstrip("/") dest_path = os.path.join(self.working_dir, rel_key + ext) @@ -272,7 +292,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: if hasattr(value, "save"): value.save(urlpath=dest_path) else: - # An SChunk does not have a save() method + # SChunk and VLArray can both be persisted via their cframe. with open(dest_path, "wb") as f: f.write(value.to_cframe()) else: @@ -290,20 +310,21 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: value = blosc2.from_cframe(value.to_cframe()) self._estore[key] = value - def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array: """Retrieve a node from the DictStore.""" # Check map_tree first if key in self.map_tree: filepath = self.map_tree[key] if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - return blosc2.blosc2_ext.open( + opened = blosc2.blosc2_ext.open( self.b2z_path, mode="r", offset=offset, mmap_mode=self.mmap_mode, dparams=self.dparams, ) + return _process_opened_object(opened) else: urlpath = os.path.join(self.working_dir, filepath) if os.path.exists(urlpath): @@ -319,7 +340,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array: # Fall back to EmbedStore return self._estore[key] - def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | C2Array | Any: + def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array | Any: """Retrieve a node, or default if not found.""" try: return self[key] diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index 84b1b200e..b03d892e4 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -5,15 +5,20 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +from __future__ import annotations + import copy -from collections.abc import Iterator, KeysView -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import blosc2 from blosc2.c2array import C2Array -from blosc2.schunk import SChunk + +if TYPE_CHECKING: + from collections.abc import Iterator, KeysView + + from blosc2.schunk import SChunk PROFILE = False # Set to True to enable PROFILE prints in EmbedStore @@ -168,7 +173,7 @@ def _ensure_capacity(self, needed_bytes: int) -> None: new_size = max(required_size, int(self._store.shape[0] * 1.5)) self._store.resize((new_size,)) - def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: + def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: """Add a node to the embed store.""" if self.mode == "r": raise ValueError("Cannot set items in read-only mode.") @@ -191,7 +196,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: self._embed_map[key] = {"offset": offset, "length": data_len} self._save_metadata() - def __getitem__(self, key: str) -> blosc2.NDArray | SChunk: + def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray: """Retrieve a node from the embed store.""" if key not in self._embed_map: raise KeyError(f"Key '{key}' not found in the embed store.") @@ -207,7 +212,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk: # Use from_cframe so we can deserialize either an NDArray or an SChunk return blosc2.from_cframe(serialized_data, copy=True) - def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | Any: + def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | Any: """Retrieve a node, or default if not found.""" return self[key] if key in self._embed_map else default @@ -234,12 +239,12 @@ def keys(self) -> KeysView[str]: """Return all keys.""" return self._embed_map.keys() - def values(self) -> Iterator[blosc2.NDArray | SChunk]: + def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray]: """Iterate over all values.""" for key in self._embed_map: yield self[key] - def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk]]: + def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray]]: """Iterate over (key, value) pairs.""" for key in self._embed_map: yield key, self[key] diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 6aad81652..a96c11a44 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -5,6 +5,8 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +from __future__ import annotations + import contextlib import os from collections.abc import Iterator, MutableMapping @@ -14,11 +16,11 @@ import blosc2 from blosc2.dict_store import DictStore -from blosc2.schunk import SChunk if TYPE_CHECKING: from blosc2.c2array import C2Array from blosc2.ndarray import NDArray + from blosc2.schunk import SChunk class vlmetaProxy(MutableMapping): @@ -29,7 +31,7 @@ class vlmetaProxy(MutableMapping): - Delegates iteration and length to the underlying vlmeta object. """ - def __init__(self, tstore: "TreeStore", inner_vlmeta): + def __init__(self, tstore: TreeStore, inner_vlmeta): self._tstore = tstore self._inner = inner_vlmeta @@ -224,7 +226,7 @@ def _validate_key(self, key: str) -> str: return key - def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: + def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None: """Add a node with hierarchical key validation. Parameters @@ -266,7 +268,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None: full_key = self._translate_key_to_full(key) super().__setitem__(full_key, value) - def __getitem__(self, key: str) -> "NDArray | C2Array | SChunk | TreeStore": + def __getitem__(self, key: str) -> NDArray | C2Array | SChunk | blosc2.VLArray | TreeStore: """Retrieve a node or subtree view. If the key points to a subtree (intermediate path with children), @@ -280,7 +282,7 @@ def __getitem__(self, key: str) -> "NDArray | C2Array | SChunk | TreeStore": Returns ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or TreeStore + out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or TreeStore The stored array/chunk if key is a leaf node, or a TreeStore subtree view if key is an intermediate path with children. @@ -416,7 +418,7 @@ def __iter__(self) -> Iterator[str]: """Iterate over keys, excluding vlmeta keys.""" return iter(self.keys()) - def items(self) -> Iterator[tuple[str, "NDArray | C2Array | SChunk | TreeStore"]]: + def items(self) -> Iterator[tuple[str, NDArray | C2Array | SChunk | TreeStore]]: """Return key-value pairs in the current subtree view.""" for key in self.keys(): yield key, self[key] @@ -575,7 +577,7 @@ def walk(self, path: str = "/", topdown: bool = True) -> Iterator[tuple[str, lis # Yield current level after children (post-order) yield path, children_dirs, leaf_nodes - def get_subtree(self, path: str) -> "TreeStore": + def get_subtree(self, path: str) -> TreeStore: """Create a subtree view with the specified path as root. Parameters diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index f18b006de..74122424c 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -223,6 +223,49 @@ def test_external_schunk_file_and_reopen(): os.remove(path) +def test_store_and_retrieve_vlarray_in_dict(tmp_path): + path = tmp_path / "test_dstore_vlarray_embed.b2z" + values = [{"name": "alpha", "count": 1}, None, ("tuple", 2), [1, "two", b"three"]] + + vlarray = blosc2.VLArray() + vlarray.extend(values) + + with DictStore(str(path), mode="w") as dstore: + dstore["/vlarray"] = vlarray + value = dstore["/vlarray"] + assert isinstance(value, blosc2.VLArray) + assert list(value) == values + + with DictStore(str(path), mode="r") as dstore_read: + value = dstore_read["/vlarray"] + assert isinstance(value, blosc2.VLArray) + assert list(value) == values + + +def test_external_vlarray_file_and_reopen(tmp_path): + ext_path = tmp_path / "ext_vlarray.b2frame" + path = tmp_path / "test_dstore_vlarray_external.b2z" + values = ["alpha", {"nested": True}, None, (1, 2, 3)] + + vlarray = blosc2.VLArray(urlpath=str(ext_path), mode="w", contiguous=True) + vlarray.extend(values) + vlarray.vlmeta["description"] = "External VLArray" + + with DictStore(str(path), mode="w", threshold=None) as dstore: + dstore["/dir1/vlarray_ext"] = vlarray + assert "/dir1/vlarray_ext" in dstore.map_tree + assert dstore.map_tree["/dir1/vlarray_ext"].endswith(".b2f") + + with zipfile.ZipFile(path, "r") as zf: + assert "dir1/vlarray_ext.b2f" in zf.namelist() + + with DictStore(str(path), mode="r") as dstore_read: + value = dstore_read["/dir1/vlarray_ext"] + assert isinstance(value, blosc2.VLArray) + assert list(value) == values + assert value.vlmeta["description"] == "External VLArray" + + def _digest_value(value): """Return a bytes digest of a stored value.""" if isinstance(value, blosc2.SChunk): diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 49e26d71a..5da45f64d 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -604,6 +604,56 @@ def test_schunk_support(): os.remove("test_schunk.b2z") +def test_vlarray_support(): + """Test that TreeStore supports embedded VLArray objects.""" + values = [{"name": "alpha", "count": 1}, None, ("tuple", 2), [1, "two", b"three"]] + with TreeStore("test_vlarray.b2z", mode="w") as tstore: + vlarray = blosc2.VLArray() + vlarray.extend(values) + tstore["/data/vlarray1"] = vlarray + + retrieved = tstore["/data/vlarray1"] + assert isinstance(retrieved, blosc2.VLArray) + assert list(retrieved) == values + + data_subtree = tstore["/data"] + assert isinstance(data_subtree, TreeStore) + assert set(data_subtree.keys()) == {"/vlarray1"} + + with TreeStore("test_vlarray.b2z", mode="r") as tstore: + retrieved = tstore["/data/vlarray1"] + assert isinstance(retrieved, blosc2.VLArray) + assert list(retrieved) == values + + os.remove("test_vlarray.b2z") + + +def test_external_vlarray_support(): + """Test that TreeStore supports external VLArray objects.""" + ext_path = "ext_vlarray.b2frame" + values = ["alpha", {"nested": True}, None, (1, 2, 3)] + if os.path.exists(ext_path): + os.remove(ext_path) + + vlarray = blosc2.VLArray(urlpath=ext_path, mode="w", contiguous=True) + vlarray.extend(values) + vlarray.vlmeta["description"] = "External VLArray for TreeStore" + + with TreeStore("test_vlarray_external.b2z", mode="w", threshold=None) as tstore: + tstore["/data/vlarray_ext"] = vlarray + assert "/data/vlarray_ext" in tstore + + with TreeStore("test_vlarray_external.b2z", mode="r") as tstore: + retrieved = tstore["/data/vlarray_ext"] + assert isinstance(retrieved, blosc2.VLArray) + assert list(retrieved) == values + assert retrieved.vlmeta["description"] == "External VLArray for TreeStore" + + if os.path.exists(ext_path): + os.remove(ext_path) + os.remove("test_vlarray_external.b2z") + + def test_walk_topdown_argument_ordering(): """Ensure walk supports topdown argument mimicking os.walk order semantics.""" with TreeStore("test_walk_topdown.b2z", mode="w") as tstore: From 3d031966dd922ae62e96d1b9cf017aaf8f0daaaa Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 15 Mar 2026 08:48:20 +0100 Subject: [PATCH 7/7] Add test for empty tuples too --- tests/test_vlarray.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_vlarray.py b/tests/test_vlarray.py index 84692d242..3a25fa78b 100644 --- a/tests/test_vlarray.py +++ b/tests/test_vlarray.py @@ -262,6 +262,13 @@ def test_vlarray_empty_list_roundtrip(): assert list(vlarray) == values +def test_vlarray_empty_tuple_roundtrip(): + values = [(), {"a": ()}, [(), ("nested",)], None, ("tuple", ()), {"rows": [[], ()]}] + vlarray = blosc2.VLArray() + vlarray.extend(values) + assert list(vlarray) == values + + def test_vlarray_insert_delete_errors(): vlarray = blosc2.VLArray() vlarray.append("value")