From f6ad6ead5ec1bef6286753f66b5df9aa0afb4bb5 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:26:52 -0500 Subject: [PATCH 1/9] feat: add header/index cache for local disk --- mapbuffer/mapbuffer.py | 48 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/mapbuffer/mapbuffer.py b/mapbuffer/mapbuffer.py index e7e7f5f..7516998 100644 --- a/mapbuffer/mapbuffer.py +++ b/mapbuffer/mapbuffer.py @@ -9,6 +9,7 @@ from . import compression import crc32c +import fasteners import numpy as np import mapbufferaccel @@ -32,6 +33,7 @@ def __init__( frombytesfn:Optional[Callable[[bytes], Any]] = None, check_crc:bool = True, compute_crc:bool = True, + index_cache:Optional[str] = None, ): """ data: dict (int->byte serializable object) or bytes @@ -52,6 +54,7 @@ def __init__( self.buffer = None self.check_crc = check_crc self.compute_crc = compute_crc + self.index_cache = index_cache self._header = None self._index = None @@ -102,9 +105,31 @@ def header(self): if self._header is not None: return self._header + if self.index_cache is not None: + if os.path.exists(self.index_cache): + lock = fasteners.InterProcessReaderWriterLock(self.index_cache) + with lock.read_lock(): + with open(self.index_cache, "rb") as f: + self._header = f.read(HEADER_LENGTH) + + if len(self._header) == HEADER_LENGTH: + return self._header + # seems dumb, buf if self.buffer is an object that # requires network access, this is a valuable cache self._header = self.buffer[:HEADER_LENGTH] + + if self.index_cache is not None: + lock = fasteners.InterProcessReaderWriterLock(self.index_cache) + with lock.write_lock(): + try: + if os.path.getsize(self.index_cache) < HEADER_LENGTH: + with open(self.index_cache, "wb") as f: + f.write(self._header) + except FileNotFoundError: + with open(self.index_cache, "wb") as f: + f.write(self._header) + return self._header def index(self): @@ -115,6 +140,19 @@ def index(self): N = len(self) index_length = 2 * N + if self.index_cache is not None: + lock = fasteners.InterProcessReaderWriterLock(self.index_cache) + try: + if os.path.getsize(self.index_cache) > HEADER_LENGTH: + with lock.read_lock(): + with open(self.index_cache, "rb") as f: + f.seek(HEADER_LENGTH) + index = f.read(index_length * 8) + self._index = np.frombuffer(index, dtype=np.uint64).reshape((N,2)) + return self._index + except FileNotFoundError: + pass + if isinstance(self.buffer, (bytes,bytearray,np.ndarray,mmap.mmap)): self._index = np.frombuffer( self.buffer, @@ -127,6 +165,16 @@ def index(self): index = self.buffer[HEADER_LENGTH:index_length+HEADER_LENGTH] self._index = np.frombuffer(index, dtype=np.uint64).reshape((N,2)) + if self.index_cache is not None: + lock = fasteners.InterProcessReaderWriterLock(self.index_cache) + try: + if os.path.getsize(self.index_cache) == HEADER_LENGTH: + with lock.write_lock(): + with open(self.index_cache, "ab") as f: + f.write(self._index.tobytes('C')) + except FileNotFoundError: + pass + return self._index def keys(self): From 34da5ed410500420ecd62f51b95be9da394500e6 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:27:36 -0500 Subject: [PATCH 2/9] fix: add slot for index_cache --- mapbuffer/mapbuffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapbuffer/mapbuffer.py b/mapbuffer/mapbuffer.py index 7516998..783d338 100644 --- a/mapbuffer/mapbuffer.py +++ b/mapbuffer/mapbuffer.py @@ -22,7 +22,7 @@ class MapBuffer: """Represents a usable int->bytes dictionary as a byte string.""" __slots__ = ( "data", "tobytesfn", "frombytesfn", - "dtype", "buffer", "check_crc", "compute_crc", + "dtype", "buffer", "check_crc", "compute_crc", "index_cache", "_header", "_index", "_compress" ) def __init__( From e243fda829d6e80263b2b9d66f760fbfce496432 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:28:04 -0500 Subject: [PATCH 3/9] install: add fasteners --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f3af510..8c9461d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ brotli crc32c deflate>=0.2.0 +fasteners numpy tqdm zstandard \ No newline at end of file From 980255ac11dbedd5ee4cd0c13ba93851d9c12cc9 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:31:39 -0500 Subject: [PATCH 4/9] test: add test to check for mapbuffer index (mbi) file --- automated_test.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/automated_test.py b/automated_test.py index 2fac50d..accb889 100644 --- a/automated_test.py +++ b/automated_test.py @@ -248,6 +248,26 @@ def test_set_object_intmap(): except KeyError: pass +def test_index_cache(): + data = { + 1: b"hello", + 2: b"world", + } + mbuf = MapBuffer(data, index_cache="./hello.mbi") + + idx = mbuf.buffer.index(b"hello") + buf = list(mbuf.buffer) + buf[idx] = ord(b'H') + mbuf.buffer = bytes(buf) + + try: + mbuf[1] + assert False + except ValidationError: + pass + assert os.path.exists("./hello.mbi") + mbuf = MapBuffer(data, index_cache="./hello.mbi") + mbuf.index() From f96313fb0c6e25baf5bab3da8345ca65ae73b067 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:31:58 -0500 Subject: [PATCH 5/9] fix: missing os import --- mapbuffer/mapbuffer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mapbuffer/mapbuffer.py b/mapbuffer/mapbuffer.py index 783d338..6b7c277 100644 --- a/mapbuffer/mapbuffer.py +++ b/mapbuffer/mapbuffer.py @@ -1,5 +1,6 @@ from typing import Optional, Any, Union, Literal from collections.abc import Callable +import os import mmap import io From f8f41f04fd5c206a85f8ded604566ad6a2dbb8a2 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 17:45:46 -0500 Subject: [PATCH 6/9] perf: condense the number of lock files --- mapbuffer/mapbuffer.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mapbuffer/mapbuffer.py b/mapbuffer/mapbuffer.py index 6b7c277..a563784 100644 --- a/mapbuffer/mapbuffer.py +++ b/mapbuffer/mapbuffer.py @@ -24,7 +24,7 @@ class MapBuffer: __slots__ = ( "data", "tobytesfn", "frombytesfn", "dtype", "buffer", "check_crc", "compute_crc", "index_cache", - "_header", "_index", "_compress" + "_header", "_index", "_compress", "_lock" ) def __init__( self, @@ -60,6 +60,9 @@ def __init__( self._header = None self._index = None self._compress = None + self._lock = None + if self.index_cache is not None: + self._lock = fasteners.InterProcessReaderWriterLock() if isinstance(data, dict): self.buffer = self.dict2buf(data, compress) @@ -108,8 +111,7 @@ def header(self): if self.index_cache is not None: if os.path.exists(self.index_cache): - lock = fasteners.InterProcessReaderWriterLock(self.index_cache) - with lock.read_lock(): + with self._lock.read_lock(): with open(self.index_cache, "rb") as f: self._header = f.read(HEADER_LENGTH) @@ -121,8 +123,7 @@ def header(self): self._header = self.buffer[:HEADER_LENGTH] if self.index_cache is not None: - lock = fasteners.InterProcessReaderWriterLock(self.index_cache) - with lock.write_lock(): + with self._lock.write_lock(): try: if os.path.getsize(self.index_cache) < HEADER_LENGTH: with open(self.index_cache, "wb") as f: @@ -142,10 +143,9 @@ def index(self): index_length = 2 * N if self.index_cache is not None: - lock = fasteners.InterProcessReaderWriterLock(self.index_cache) try: if os.path.getsize(self.index_cache) > HEADER_LENGTH: - with lock.read_lock(): + with self._lock.read_lock(): with open(self.index_cache, "rb") as f: f.seek(HEADER_LENGTH) index = f.read(index_length * 8) @@ -167,10 +167,9 @@ def index(self): self._index = np.frombuffer(index, dtype=np.uint64).reshape((N,2)) if self.index_cache is not None: - lock = fasteners.InterProcessReaderWriterLock(self.index_cache) try: if os.path.getsize(self.index_cache) == HEADER_LENGTH: - with lock.write_lock(): + with self._lock.write_lock(): with open(self.index_cache, "ab") as f: f.write(self._index.tobytes('C')) except FileNotFoundError: From a019f31877a2749181dfd894c227d3930234350a Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 20:11:49 -0500 Subject: [PATCH 7/9] fix: missing argument --- mapbuffer/mapbuffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapbuffer/mapbuffer.py b/mapbuffer/mapbuffer.py index a563784..46fcad7 100644 --- a/mapbuffer/mapbuffer.py +++ b/mapbuffer/mapbuffer.py @@ -62,7 +62,7 @@ def __init__( self._compress = None self._lock = None if self.index_cache is not None: - self._lock = fasteners.InterProcessReaderWriterLock() + self._lock = fasteners.InterProcessReaderWriterLock(self.index_cache) if isinstance(data, dict): self.buffer = self.dict2buf(data, compress) From 81a50e5240552f4a027b241fe8eb2754bb88f885 Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 21:27:34 -0500 Subject: [PATCH 8/9] test: add some more tests for cache thanks to Claude --- automated_test.py | 109 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/automated_test.py b/automated_test.py index accb889..d9d7b6c 100644 --- a/automated_test.py +++ b/automated_test.py @@ -3,11 +3,14 @@ import mmap import os import random +from unittest.mock import patch import numpy as np from mapbuffer import ValidationError, IntMap, MapBuffer, HEADER_LENGTH +CACHE_PATH = "./test_index_cache.mbi" + @pytest.mark.parametrize("compress", (None, "gzip", "br", "zstd", "lzma")) def test_empty(compress): mbuf = MapBuffer({}, compress=compress) @@ -248,26 +251,96 @@ def test_set_object_intmap(): except KeyError: pass -def test_index_cache(): - data = { - 1: b"hello", - 2: b"world", - } - mbuf = MapBuffer(data, index_cache="./hello.mbi") +@pytest.fixture(autouse=True) +def cleanup_cache(): + """Ensure cache file is removed before and after each test.""" + if os.path.exists(CACHE_PATH): + os.remove(CACHE_PATH) + yield + if os.path.exists(CACHE_PATH): + os.remove(CACHE_PATH) - idx = mbuf.buffer.index(b"hello") - buf = list(mbuf.buffer) - buf[idx] = ord(b'H') - mbuf.buffer = bytes(buf) - try: - mbuf[1] - assert False - except ValidationError: - pass +def make_mapbuffer(data=None, **kwargs): + data = data or {1: b"hello", 2: b"world"} + return MapBuffer(data, index_cache=CACHE_PATH, **kwargs) + + +def test_index_cache_file_is_created(): + """Cache file should be written after first access.""" + mbuf = make_mapbuffer() + mbuf.index() + assert os.path.exists(CACHE_PATH) + + +def test_index_cache_header_and_index_written(): + """Cache file should contain header + full index bytes.""" + mbuf = make_mapbuffer() + index = mbuf.index() + + with open(CACHE_PATH, "rb") as f: + cached = f.read() + + assert len(cached) == HEADER_LENGTH + index.nbytes + + +def test_index_cache_is_loaded_from_disk(): + """Second MapBuffer with same cache should read index from disk, not buffer.""" + mbuf = make_mapbuffer() + original_index = mbuf.index().copy() + + # Reload — this time the cache exists, so index should come from disk + mbuf2 = make_mapbuffer() + mbuf2._index = None # ensure not inherited + + with patch.object(np, "frombuffer", wraps=np.frombuffer) as mock_frombuffer: + loaded_index = mbuf2.index() + # np.frombuffer should NOT be called on the main buffer for the index + for call in mock_frombuffer.call_args_list: + args, kwargs = call + # Ensure we're not reading index from the primary buffer + assert kwargs.get("offset") != HEADER_LENGTH, \ + "Index was re-read from buffer instead of cache" + + np.testing.assert_array_equal(loaded_index, original_index) + + +def test_index_cache_values_correct(): + """Values retrieved using cache should match those from a non-cached buffer.""" + mbuf_cached = make_mapbuffer() + mbuf_plain = MapBuffer({1: b"hello", 2: b"world"}) + + for key in [1, 2]: + assert mbuf_cached[key] == mbuf_plain[key] + + +def test_crc_error_raised_despite_cache(): + """CRC validation should still catch corruption even when cache exists.""" + data = {1: b"hello", 2: b"world"} + mbuf = make_mapbuffer(data) + mbuf.index() # populate cache + + # Corrupt the data region in the buffer + buf = bytearray(mbuf.buffer) + idx = bytes(buf).index(b"hello") + buf[idx] = ord(b"H") + mbuf.buffer = bytes(buf) + mbuf._index = None # force re-read so cache is used but data is still corrupt + + with pytest.raises(ValidationError): + mbuf[1] + + +def test_index_cache_not_rewritten_if_already_complete(): + """Cache file should not be overwritten on second load.""" + mbuf = make_mapbuffer() + mbuf.index() - assert os.path.exists("./hello.mbi") + mtime_after_first = os.path.getmtime(CACHE_PATH) - mbuf = MapBuffer(data, index_cache="./hello.mbi") - mbuf.index() + mbuf2 = make_mapbuffer() + mbuf2.index() + mtime_after_second = os.path.getmtime(CACHE_PATH) + assert mtime_after_first == mtime_after_second, \ + "Cache file was unexpectedly rewritten on second access" \ No newline at end of file From 04017074b8b64a667a61795cc6d72045829abe5a Mon Sep 17 00:00:00 2001 From: William Silversmith Date: Thu, 5 Mar 2026 21:31:15 -0500 Subject: [PATCH 9/9] docs: show how to use index caching --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 3b8dec5..52e54a6 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,11 @@ with open("data.im", "wb") as f: # You can skip computing or checking CRCs, e.g. if your # embedded object already contains CRCs mb = MapBuffer(..., check_crc=False, compute_crc=False) + +# If your access pattern is such that the index and the +# download are similar in size (e.g. watershed meshes) +# you can cache the index. +mb = MapBuffer(..., index_cache="/tmp/helloworld.mbi") ``` ## Installation