From 6de7c308f3434ccce06bf3ae6b732f24be3825ce Mon Sep 17 00:00:00 2001 From: jorge Date: Thu, 26 Mar 2026 11:05:01 +0100 Subject: [PATCH 01/11] feat: add CTable, a columnar in-memory table built on top of blosc2 Introduce CTable, a new columnar table class for efficient in-memory data storage using Blosc2 as the underlying compression engine. Each column is represented as a Column object wrapping a blosc2.NDArray with typed, compressed storage. Building on top of blosc2's existing infrastructure, CTable supports append, iteration and column-based queries. This is an early-stage (beta) implementation; the table is always fully loaded in memory. New files: - src/blosc2/ctable.py: CTable and Column class definitions - tests/ctable/: unit tests covering construction, slicing, deletion, compaction and row logic - bench/ctable/: benchmarks comparing CTable against pandas --- bench/ctable/compact.py | 81 +++ bench/ctable/ctable_v_panda.py | 127 +++++ bench/ctable/delete.py | 82 +++ bench/ctable/expected_size.py | 75 +++ bench/ctable/extend.py | 117 +++++ bench/ctable/extend_vs_apend.py | 84 +++ bench/ctable/index.py | 69 +++ bench/ctable/iteration_column.py | 85 ++++ bench/ctable/print.py | 112 ++++ bench/ctable/row_acces.py | 68 +++ bench/ctable/slice.py | 77 +++ bench/ctable/slice_steps.py | 67 +++ bench/ctable/slice_to_array.py | 77 +++ bench/ctable/where_chain.py | 79 +++ bench/ctable/where_selective.py | 68 +++ src/blosc2/__init__.py | 1 + src/blosc2/ctable.py | 793 +++++++++++++++++++++++++++++ tests/ctable/test_column.py | 294 +++++++++++ tests/ctable/test_compact.py | 157 ++++++ tests/ctable/test_construct.py | 225 ++++++++ tests/ctable/test_delete_rows.py | 210 ++++++++ tests/ctable/test_extend_delete.py | 226 ++++++++ tests/ctable/test_row_logic.py | 221 ++++++++ 23 files changed, 3395 insertions(+) create mode 100644 bench/ctable/compact.py create mode 100644 bench/ctable/ctable_v_panda.py create mode 100644 bench/ctable/delete.py create mode 100644 
bench/ctable/expected_size.py create mode 100644 bench/ctable/extend.py create mode 100644 bench/ctable/extend_vs_apend.py create mode 100644 bench/ctable/index.py create mode 100644 bench/ctable/iteration_column.py create mode 100644 bench/ctable/print.py create mode 100644 bench/ctable/row_acces.py create mode 100644 bench/ctable/slice.py create mode 100644 bench/ctable/slice_steps.py create mode 100644 bench/ctable/slice_to_array.py create mode 100644 bench/ctable/where_chain.py create mode 100644 bench/ctable/where_selective.py create mode 100644 src/blosc2/ctable.py create mode 100644 tests/ctable/test_column.py create mode 100644 tests/ctable/test_compact.py create mode 100644 tests/ctable/test_construct.py create mode 100644 tests/ctable/test_delete_rows.py create mode 100644 tests/ctable/test_extend_delete.py create mode 100644 tests/ctable/test_row_logic.py diff --git a/bench/ctable/compact.py b/bench/ctable/compact.py new file mode 100644 index 00000000..f41bb008 --- /dev/null +++ b/bench/ctable/compact.py @@ -0,0 +1,81 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring compact() time and memory gain after deletions +# of varying fractions of the table. 
+ +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 + +print(f"compact() benchmark | N = {N:,}\n") + +# Build base data once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9] + +print("=" * 75) +print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}") +print("-" * 75) + +for frac in delete_fractions: + ct = blosc2.CTable(RowModel, expected_size=N) + ct.extend(DATA) + + n_delete = int(N * frac) + ct.delete(list(range(n_delete))) + + cbytes_before = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes + + t0 = time() + ct.compact() + t_compact = time() - t0 + + cbytes_after = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes + + print( + f"{frac*100:>9.0f}%" + f" {N - n_delete:>10,}" + f" {t_compact:>12.4f}" + f" {cbytes_before / 1024**2:>13.2f} MB" + f" {cbytes_after / 1024**2:>12.2f} MB" + ) + +print("-" * 75) diff --git a/bench/ctable/ctable_v_panda.py b/bench/ctable/ctable_v_panda.py new file mode 100644 index 00000000..4f7d6c8a --- /dev/null +++ b/bench/ctable/ctable_v_panda.py @@ -0,0 +1,127 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark comparing CTable vs pandas DataFrame for: +# 1. Creation from a NumPy structured array +# 2. Column access (full column) +# 3. Filtering (where/query) +# 4. Row iteration + +from time import time +from typing import Annotated + +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 +rng = np.random.default_rng(42) + +print(f"CTable vs pandas benchmark | N = {N:,}\n") + +# Build base data once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.empty(N, dtype=np_dtype) +DATA["id"] = np.arange(N, dtype=np.int64) +DATA["c_val"] = rng.standard_normal(N) + 1j * rng.standard_normal(N) +DATA["score"] = rng.uniform(0, 100, N) +DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_) + +print("=" * 65) +print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}") +print("-" * 65) + +# 1. Creation +t0 = time() +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(DATA) +t_ct_create = time() - t0 + +t0 = time() +df = pd.DataFrame(DATA) +t_pd_create = time() - t0 + +print(f"{'Creation':<30} {t_ct_create:>12.4f} {t_pd_create:>12.4f} {t_pd_create/t_ct_create:>9.2f}x") + +# 2. 
Column access (full column) +t0 = time() +arr = ct["score"] +t_ct_col = time() - t0 + +t0 = time() +arr = df["score"] +t_pd_col = time() - t0 + +print(f"{'Column access (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.2f}x") + +# 2.5 Column access (full column) +t0 = time() +arr = ct["score"].to_numpy() +t_ct_col = time() - t0 + +t0 = time() +arr = df["score"].to_numpy() +t_pd_col = time() - t0 + +print(f"{'Column access to numpy (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.3f}x") + +# 3. Filtering +t0 = time() +result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000)) +t_ct_filter = time() - t0 + +t0 = time() +result_pd = df.query("250000 < id < 750000") +t_pd_filter = time() - t0 + +print(f"{'Filter (id 250k-750k)':<30} {t_ct_filter:>12.4f} {t_pd_filter:>12.4f} {t_pd_filter/t_ct_filter:>9.2f}x") + +# 4. Row iteration +t0 = time() +for val in ct["score"]: + pass +t_ct_iter = time() - t0 + +t0 = time() +for val in df["score"]: + pass +t_pd_iter = time() - t0 + +print(f"{'Row iteration':<30} {t_ct_iter:>12.4f} {t_pd_iter:>12.4f} {t_pd_iter/t_ct_iter:>9.2f}x") + +print("-" * 65) + +# Memory +ct_cbytes = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes +ct_nbytes = sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes +pd_nbytes = df.memory_usage(deep=True).sum() + +print(f"\nMemory — CTable compressed: {ct_cbytes / 1024**2:.2f} MB") +print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB") +print(f"Memory — pandas: {pd_nbytes / 1024**2:.2f} MB") +print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x") diff --git a/bench/ctable/delete.py b/bench/ctable/delete.py new file mode 100644 index 00000000..fb147c7c --- /dev/null +++ b/bench/ctable/delete.py @@ -0,0 +1,82 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring delete() performance with different index types: +# int, slice, and list — with varying sizes. + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 + +print(f"delete() benchmark | N = {N:,}\n") + +# Build base data once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +delete_cases = [ + ("int", 0), + ("slice small", slice(0, 100)), + ("slice large", slice(0, 100_000)), + ("slice full", slice(0, N)), + ("list small", list(range(100))), + ("list large", list(range(100_000))), + ("list full", list(range(N))), +] + +print("=" * 60) +print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}") +print("-" * 60) + +for label, key in delete_cases: + ct = blosc2.CTable(RowModel, expected_size=N) + ct.extend(DATA) + + if isinstance(key, int): + n_deleted = 1 + elif isinstance(key, slice): + n_deleted = len(range(*key.indices(N))) + else: + n_deleted = len(key) + + t0 = time() + ct.delete(key) + t_delete = time() - t0 + print(f"{label:<20} {n_deleted:>14,} {t_delete:>12.6f}") + +print("-" * 60) diff --git a/bench/ctable/expected_size.py b/bench/ctable/expected_size.py new file mode 100644 index 00000000..c4444a62 --- /dev/null +++ 
b/bench/ctable/expected_size.py @@ -0,0 +1,75 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring the overhead of resize() when expected_size +# is too small (M rows) vs correctly sized (N rows) during extend(). + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + + +M = 779 +N = 62_500 +MAX_N = 1_000_000 +print(f"expected_size benchmark | wrong expected_size = {M}") + +# Pre-generate full dataset once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(MAX_N) + ], + dtype=np_dtype, +) + +while N <= MAX_N: + print("-" * 80) + print(f"N = {N:,} rows") + + # 1. extend() with correct expected_size = N + ct_correct = blosc2.CTable(RowModel, expected_size=N) + t0 = time() + ct_correct.extend(DATA[:N]) + t_correct = time() - t0 + print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}") + + # 2. 
extend() with wrong expected_size = M (forces resize) + ct_wrong = blosc2.CTable(RowModel, expected_size=M) + t0 = time() + ct_wrong.extend(DATA[:N]) + t_wrong = time() - t0 + print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}") + + # Summary + print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x") + + N *= 2 diff --git a/bench/ctable/extend.py b/bench/ctable/extend.py new file mode 100644 index 00000000..f294b012 --- /dev/null +++ b/bench/ctable/extend.py @@ -0,0 +1,117 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring CTable creation time from three different sources: +# 1. Python list of lists (1M rows) +# 2. NumPy structured array (1M rows) — list of named tuples +# 3. An existing CTable (previously created from Python lists, 1M rows) + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + + + + +# --------------------------------------------------------------------------- +# Row model +# --------------------------------------------------------------------------- +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 +print(f"CTable creation benchmark with {N:,} rows\n") + +# --------------------------------------------------------------------------- +# Base data generation (not part of the benchmark timing) +# 
--------------------------------------------------------------------------- +print("Generating base data...") + +t0 = time() +data_list = [ + [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] + for i in range(N) +] +t_gen_list = time() - t0 +print(f" Python list generated in: {t_gen_list:.4f} s") + +t0 = time() +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +data_np = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) +t_gen_np = time() - t0 +print(f" NumPy structured array generated: {t_gen_np:.4f} s\n") + +# --------------------------------------------------------------------------- +# 1. Creation from a Python list of lists +# --------------------------------------------------------------------------- +print("CTable from Python list of lists") +t0 = time() +ct_from_list = blosc2.CTable(RowModel, expected_size=N) +ct_from_list.extend(data_list) +t_from_list = time() - t0 +print(f" extend() time (Python list): {t_from_list:.4f} s") +print(f" Rows: {len(ct_from_list):,}") + +# --------------------------------------------------------------------------- +# 2. Creation from a NumPy structured array (list of named tuples) +# --------------------------------------------------------------------------- +print("CTable from NumPy structured array") +t0 = time() +ct_from_np = blosc2.CTable(RowModel, expected_size=N) +ct_from_np.extend(data_np) +t_from_np = time() - t0 +print(f" extend() time (NumPy struct): {t_from_np:.4f} s") +print(f" Rows: {len(ct_from_np):,}") + + +# --------------------------------------------------------------------------- +# 3. 
Creation from an existing CTable (ct_from_list, already built above) +# --------------------------------------------------------------------------- +print("CTable from an existing CTable") +t0 = time() +ct_from_ctable = blosc2.CTable(RowModel, expected_size=N) +ct_from_ctable.extend(ct_from_list) +t_from_ctable = time() - t0 +print(f" extend() time (CTable): {t_from_ctable:.4f} s") +print(f" Rows: {len(ct_from_ctable):,}") + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +print("\n") +print("=" * 60) +print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}") +print("-" * 60) +print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}") +print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {t_from_list / t_from_np:>17.2f}x") +print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {t_from_list / t_from_ctable:>17.2f}x") + diff --git a/bench/ctable/extend_vs_apend.py b/bench/ctable/extend_vs_apend.py new file mode 100644 index 00000000..2036755c --- /dev/null +++ b/bench/ctable/extend_vs_apend.py @@ -0,0 +1,84 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for comparing append() (row by row) vs extend() (bulk), +# to find the crossover point where extend() becomes worth it. 
+ +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +# Parameter — change N to test different crossover points +N = 2 +print(f"append() vs extend() benchmark") +for i in range(6): + print("\n") + print("%" * 100) + + + # Base data generation + data_list = [ + [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N) + ] + + # 1. N individual append() calls + print(f"{N} individual append() calls") + ct_append = blosc2.CTable(RowModel, expected_size=N) + t0 = time() + for row in data_list: + ct_append.append(row) + t_append = time() - t0 + print(f" Time: {t_append:.6f} s") + print(f" Rows: {len(ct_append):,}") + + # 2. N individual extend() calls (one row at a time) + print(f"{N} individual extend() calls (one row at a time)") + ct_extend_one = blosc2.CTable(RowModel, expected_size=N) + t0 = time() + for row in data_list: + ct_extend_one.extend([row]) + t_extend_one = time() - t0 + print(f" Time: {t_extend_one:.6f} s") + print(f" Rows: {len(ct_extend_one):,}") + + # 3. 
Single extend() call with all N rows at once + print(f"Single extend() call with all {N} rows at once") + ct_extend_bulk = blosc2.CTable(RowModel, expected_size=N) + t0 = time() + ct_extend_bulk.extend(data_list) + t_extend_bulk = time() - t0 + print(f" Time: {t_extend_bulk:.6f} s") + print(f" Rows: {len(ct_extend_bulk):,}") + + # Summary + print("=" * 70) + print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}") + print("-" * 70) + print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}") + print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x") + print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x") + print("-" * 70) + + N=N*2 diff --git a/bench/ctable/index.py b/bench/ctable/index.py new file mode 100644 index 00000000..634a68e1 --- /dev/null +++ b/bench/ctable/index.py @@ -0,0 +1,69 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[int] access (single row by logical index), +# which exercises _find_physical_index() traversal over chunk metadata. 
+ +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 +indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1] + +print(f"Column[int] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) +print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}") +print("-" * 60) + +col = ct["score"] +for idx in indices: + t0 = time() + val = col[idx] + t_access = time() - t0 + position = f"{idx / N * 100:.0f}% into array" + print(f"{idx:<15,} {position:>12} {t_access:.6f}") + +print("-" * 60) diff --git a/bench/ctable/iteration_column.py b/bench/ctable/iteration_column.py new file mode 100644 index 00000000..5f0efaed --- /dev/null +++ b/bench/ctable/iteration_column.py @@ -0,0 +1,85 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for comparing full column iteration strategies: +# 1. for val in ct["score"] — Python iterator via __iter__ +# 2. 
np.array(list(ct["score"])) — materialize via list then convert +# 3. ct["score"][0:N].to_array() — slice view + to_array() + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 + +print(f"Column iteration benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) + +col = ct["score"] + +# 1. Python iterator +t0 = time() +for val in col: + pass +t_iter = time() - t0 +print(f"for val in col: {t_iter:.4f} s") + +# 2. list() + np.array() +t0 = time() +arr = np.array(list(col)) +t_list = time() - t0 +print(f"np.array(list(col)): {t_list:.4f} s") + +# 3. 
slice view + to_array() +t0 = time() +arr = col[0:N].to_numpy() +for val in arr: + pass +t_toarray = time() - t0 +print(f"col[0:N].to_array(): {t_toarray:.4f} s") + +print("=" * 60) +print(f"Speedup to_array vs iter: {t_iter / t_toarray:.2f}x") +print(f"Speedup to_array vs list: {t_list / t_toarray:.2f}x") diff --git a/bench/ctable/print.py b/bench/ctable/print.py new file mode 100644 index 00000000..af352a2a --- /dev/null +++ b/bench/ctable/print.py @@ -0,0 +1,112 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark: iterative ingestion comparison — Pandas vs CTable +# Data source: randomly generated numpy structured array + +import time +from typing import Annotated + +import numpy as np +import pandas as pd +import blosc2 +from pydantic import BaseModel, Field + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] + name: Annotated[str, NumpyDtype(np.dtype(" np.ndarray: + arr = np.empty(n, dtype=np_dtype) + arr["id"] = np.arange(n, dtype=np.int64) + arr["name"] = np.array([rng.choice(NAMES) for _ in range(n)], dtype=" DataFrame) ---") +data = make_data(N) + +t0 = time.perf_counter() +df = pd.DataFrame(data) +t_pandas = time.perf_counter() - t0 + +mem_pandas = df.memory_usage(deep=True).sum() / (1024 ** 2) +print(f"Total time: {t_pandas:.4f} s") +print(f"Memory (RAM): {mem_pandas:.2f} MB") + +print("\n--- PANDAS: First 10 rows ---") +t0_print = time.perf_counter() +print(df.head(10).to_string()) +t_print_pandas = time.perf_counter() - t0_print +print(f"\nPrint time: {t_print_pandas:.6f} s") + +# ───────────────────────────────────────────────────────────── +# 2. 
BLOSC2 CTable +# ───────────────────────────────────────────────────────────── +print("\n" + "=" * 60) +print("--- 2. BLOSC2 CTable (structured array -> extend) ---") +data = make_data(N) + +t0 = time.perf_counter() +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(data) +t_blosc = time.perf_counter() - t0 + +fields = list(RowModel.model_fields.keys()) +mem_blosc_c = (sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes) / (1024 ** 2) +mem_blosc_uc = (sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes) / (1024 ** 2) + +print(f"Total time: {t_blosc:.4f} s") +print(f"Memory (uncompressed): {mem_blosc_uc:.2f} MB") +print(f"Memory (compressed): {mem_blosc_c:.2f} MB") + +print("\n--- BLOSC2: First 10 rows ---") +t0_print = time.perf_counter() +print(ct.head(10)) +t_print_blosc = time.perf_counter() - t0_print +print(f"\nPrint time: {t_print_blosc:.6f} s") + +# ───────────────────────────────────────────────────────────── +# SUMMARY +# ───────────────────────────────────────────────────────────── +print("\n" + "=" * 60) +print("--- SUMMARY ---") +speedup = t_pandas / t_blosc +direction = "faster" if t_blosc < t_pandas else "slower" + +print(f"{'METRIC':<30} {'Pandas':>12} {'Blosc2':>12}") +print("-" * 55) +print(f"{'Ingestion time (s)':<30} {t_pandas:>12.4f} {t_blosc:>12.4f}") +print(f"{'Memory (MB)':<30} {mem_pandas:>12.2f} {mem_blosc_c:>12.2f}") +print(f"{'Print time (s)':<30} {t_print_pandas:>12.6f} {t_print_blosc:>12.6f}") +print("-" * 55) +print(f"\nSpeedup: {speedup:.2f}x {direction}") +print(f"Compression ratio: {mem_blosc_uc / mem_blosc_c:.2f}x") +print(f"Blosc2 vs Pandas size: {mem_blosc_c / mem_pandas * 100:.1f}%") diff --git a/bench/ctable/row_acces.py b/bench/ctable/row_acces.py new file mode 100644 index 00000000..c44439e0 --- /dev/null +++ b/bench/ctable/row_acces.py @@ -0,0 +1,68 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All 
rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring row[int] access (full row via _RowIndexer), +# testing access at different positions across the array. + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 +indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1] + +print(f"row[int] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) +print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}") +print("-" * 60) + +for idx in indices: + t0 = time() + row = ct.row[idx] + t_access = time() - t0 + position = f"{idx / N * 100:.0f}% into array" + print(f"{idx:<15,} {position:>12} {t_access:.6f}") + +print("-" * 60) diff --git a/bench/ctable/slice.py b/bench/ctable/slice.py new file mode 100644 index 00000000..4976a8d8 --- /dev/null +++ b/bench/ctable/slice.py @@ -0,0 +1,77 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[slice] access with slices of different +# sizes and positions: small, large, and middle of the array. + +from time import time +from typing import Annotated + +import numpy as np +from pydantic import BaseModel, Field + +import blosc2 + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +# Row model +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +N = 1_000_000 +slices = [ + ("small — start", slice(0, 100)), + ("small — middle", slice(N // 2, N // 2 + 100)), + ("small — end", slice(N - 100, N)), + ("large — start", slice(0, 100_000)), + ("large — middle", slice(N // 2 - 50_000, N // 2 + 50_000)), + ("large — end", slice(N - 100_000, N)), + ("full — all", slice(0, N)), +] + +print(f"Column[slice] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(RowModel, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 65) +print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}") +print("-" * 65) + +col = ct["score"] +for label, s in slices: + t0 = time() + val = col[s] + t_access = time() - t0 + n_rows = s.stop - s.start + print(f"{label:<25} {n_rows:>8,} {t_access:>12.6f}") + +print("-" * 65) diff --git a/bench/ctable/slice_steps.py b/bench/ctable/slice_steps.py new file mode 100644 index 00000000..311b5f9c --- /dev/null +++ 
# Benchmark driver: time Column[::step].to_numpy() for a range of step sizes
# on the "score" column of a table with N rows.
N = 1_000_000
steps = [1, 2, 4, 8, 16, 100, 1000]

# The banner names to_numpy() because that is the method actually timed below
# (the previous banner said to_array(), which this script never calls).
print(f"Column[::step].to_numpy() benchmark | N = {N:,}\n")

# Build CTable once (outside the timed region)
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.array(
    [
        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
        for i in range(N)
    ],
    dtype=np_dtype,
)

ct = blosc2.CTable(RowModel, expected_size=N)
ct.extend(DATA)

print(f"CTable built with {len(ct):,} rows\n")
print("=" * 60)
print(f"{'STEP':<10} {'ROWS RETURNED':>15} {'TIME (s)':>12}")
print("-" * 60)

col = ct["score"]
for step in steps:
    t0 = time()
    # Slicing builds a masked Column view; to_numpy() materializes it, so the
    # timing covers both steps.
    arr = col[::step].to_numpy()
    t_total = time() - t0
    print(f"::{step:<8} {len(arr):>15,} {t_total:>12.6f}")

print("-" * 60)
# Benchmark driver: time Column[slice].to_numpy() for windows of different
# sizes and positions on the "score" column of a table with N rows.
N = 1_000_000
slices = [
    ("small — start", slice(0, 100)),
    ("small — middle", slice(N // 2, N // 2 + 100)),
    ("small — end", slice(N - 100, N)),
    ("large — start", slice(0, 100_000)),
    ("large — middle", slice(N // 2 - 50_000, N // 2 + 50_000)),
    ("large — end", slice(N - 100_000, N)),
    ("full — all", slice(0, N)),
]

# The banner names to_numpy() because that is the method actually timed below
# (the previous banner said to_array(), which this script never calls).
print(f"Column[slice].to_numpy() benchmark | N = {N:,}\n")

# Build CTable once (outside the timed region)
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.array(
    [
        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
        for i in range(N)
    ],
    dtype=np_dtype,
)

ct = blosc2.CTable(RowModel, expected_size=N)
ct.extend(DATA)

print(f"CTable built with {len(ct):,} rows\n")
print("=" * 65)
print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}")
print("-" * 65)

col = ct["score"]
for label, s in slices:
    t0 = time()
    # Slice (mask construction) plus materialization are timed together.
    arr = col[s].to_numpy()
    t_total = time() - t0
    n_rows = s.stop - s.start
    print(f"{label:<25} {n_rows:>8,} {t_total:>12.6f}")

print("-" * 65)
# Benchmark driver: compare five chained where() calls against one where()
# call that combines the same five conditions with "&".
N = 1_000_000

print(f"where() chained vs combined benchmark | N = {N:,}")


def _make_row(i):
    """Return the synthetic (id, c_val, score, active) record for row *i*."""
    return (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)


# Build the table once; construction is not part of either measurement.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.array([_make_row(i) for i in range(N)], dtype=np_dtype)

ct = blosc2.CTable(RowModel, expected_size=N)
ct.extend(DATA)

print(f"CTable built with {len(ct):,} rows\n")
print("=" * 70)

# 1. Five chained where() calls: each one filters the view produced by the
#    previous call. Conditions are always built from the base table's columns.
t0 = time()
r5 = ct
for condition in (
    ct["id"] > 250_000,
    ct["id"] < 750_000,
    ct["score"] > 25.0,
    ct["score"] < 75.0,
    ct["active"] == False,  # element-wise comparison, so "not ..." cannot be used
):
    r5 = r5.where(condition)
t_chained = time() - t0
print(f"Chained where() (5 calls): {t_chained:.6f} s rows: {len(r5):,}")

# 2. One where() call with all five conditions combined into a single expression.
t0 = time()
in_id_range = (ct["id"] > 250_000) & (ct["id"] < 750_000)
in_score_range = (ct["score"] > 25.0) & (ct["score"] < 75.0)
result = ct.where(in_id_range & (ct["active"] == False) & in_score_range)
t_combined = time() - t0
print(f"Combined where() (1 call): {t_combined:.6f} s rows: {len(result):,}")

print("=" * 70)
print(f"Speedup combined vs chained: {t_chained / t_combined:.2f}x")
expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 70) +print(f"{'THRESHOLD':<15} {'ROWS RETURNED':>15} {'SELECTIVITY':>13} {'TIME (s)':>12}") +print("-" * 70) + +for threshold in thresholds: + t0 = time() + result = ct.where(ct["id"] < threshold) + t_where = time() - t0 + selectivity = threshold / N * 100 + print(f"id < {threshold:<10,} {len(result):>15,} {selectivity:>12.1f}% {t_where:>12.6f}") + +print("-" * 70) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index e32b2f48..8afc0653 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -596,6 +596,7 @@ def _raise(exc): """ # Delayed imports for avoiding overwriting of python builtins +from .ctable import CTable, Column from .ndarray import ( abs, acos, diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py new file mode 100644 index 00000000..3acb1731 --- /dev/null +++ b/src/blosc2/ctable.py @@ -0,0 +1,793 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""Imports for CTable""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any, Generic, TypeVar + +import numpy as np + +from blosc2 import compute_chunks_blocks + +try: + from line_profiler import profile +except ImportError: + + def profile(func): + + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + wrapper.__name__ = func.__name__ + return wrapper + + +from pydantic import BaseModel + +import blosc2 + +RowT = TypeVar("RowT", bound=BaseModel) + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +class MaxLen: + def __init__(self, length: int): + self.length = int(length) + + +############################# +#### Row model examples ### +############################# +""" +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + +class RowModel2(BaseModel): + id: Annotated[int, NumpyDtype(np.int16)] = Field(ge=0) + name: Annotated[str, MaxLen(10)] = Field(default="unknown") + # name: Annotated[bytes, MaxLen(10)] = Field(default=b"unknown") + score: Annotated[float, NumpyDtype(np.float32)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + +class RowModel3(BaseModel): + id: Annotated[int, NumpyDtype(np.int16)] = Field(ge=0) + #name: Annotated[str, MaxLen(10)] = Field(default="unknown") + name: Annotated[bytes, MaxLen(10)] = Field(default=b"unknown")""" + + +class _RowIndexer: + def __init__(self, table): + self._table = table + + def __getitem__(self, item): + return self._table._run_row_logic(item) + + +def 
_resolve_field_dtype(field) -> tuple[np.dtype, int]: + """Return (numpy dtype, display_width) for a pydantic model field. + + Extracts dtype from NumpyDtype metadata when present, otherwise falls + back to a sensible default for each Python primitive type. + """ + annotation = field.annotation + origin = getattr(annotation, "__origin__", annotation) + + # str / bytes: look for MaxLen metadata, build fixed-width dtype + if origin in (str, bytes) or annotation in (str, bytes): + is_bytes = origin is bytes or annotation is bytes + max_len = 32 + if hasattr(annotation, "__metadata__"): + for meta in annotation.__metadata__: + if isinstance(meta, MaxLen): + max_len = meta.length + break + kind = "S" if is_bytes else "U" + dt = np.dtype(f"{kind}{max_len}") + display_width = max(10, min(max_len, 50)) + return dt, display_width + + # Check for explicit NumpyDtype metadata (overrides primitive defaults) + if hasattr(annotation, "__metadata__"): + for meta in annotation.__metadata__: + if isinstance(meta, NumpyDtype): + dt = np.dtype(meta.dtype) + display_width = _default_display_width(origin) + return dt, display_width + + # Primitive defaults + _PRIMITIVE_MAP = { + int: (np.int64, 12), + float: (np.float64, 15), + bool: (np.bool_, 6), + complex: (np.complex128, 25), + } + if origin in _PRIMITIVE_MAP: + dt_raw, display_width = _PRIMITIVE_MAP[origin] + return np.dtype(dt_raw), display_width + + return np.dtype(np.object_), 20 + + +def _default_display_width(origin) -> int: + """Return a sensible display column width for a given Python type.""" + return {int: 12, float: 15, bool: 6, complex: 25}.get(origin, 20) + + +def _find_physical_index(arr: blosc2.NDArray, logical_key: int) -> int: + """Translate a logical (valid-row) index into a physical array index. + + Iterates chunk metadata of the boolean *arr* (valid_rows) to locate the + *logical_key*-th True value without fully decompressing the array. 
+ + Returns + ------- + int + Physical position in the underlying storage array. + + Raises + ------ + IndexError + If the logical index is out of range or the array is inconsistent. + """ + count = 0 + chunk_size = arr.chunks[0] + + for info in arr.iterchunks_info(): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + if count + actual_size <= logical_key: + count += actual_size + continue + return chunk_start + (logical_key - count) + + chunk_data = arr[chunk_start : chunk_start + actual_size] + n_true = int(np.count_nonzero(chunk_data)) + if count + n_true <= logical_key: + count += n_true + continue + + return chunk_start + int(np.flatnonzero(chunk_data)[logical_key - count]) + + raise IndexError("Unexpected error finding physical index.") + + +class Column: + def __init__(self, table: CTable, col_name: str, mask=None): + self._table = table + self._col_name = col_name + self._mask = mask + + @property + def _raw_col(self): + return self._table._cols[self._col_name] + + @property + def _valid_rows(self): + if self._mask is None: + return self._table._valid_rows + return (self._table._valid_rows & self._mask).compute() + + def __getitem__(self, key: int | slice | list | np.ndarray): + if isinstance(key, int): + n_rows = len(self) + if key < 0: + key += n_rows + if not (0 <= key < n_rows): + raise IndexError(f"index {key} is out of bounds for column with size {n_rows}") + pos_true = _find_physical_index(self._valid_rows, key) + return self._raw_col[int(pos_true)] + + + + elif isinstance(key, slice): + real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() + start, stop, step = key.indices(len(real_pos)) + mask = blosc2.zeros(len(self._table._valid_rows), dtype=np.bool_) + if 
step == 1: + phys_start = real_pos[start] + phys_stop = real_pos[stop - 1] + mask[phys_start: phys_stop + 1] = True + else: + lindices = np.arange(start, stop, step) + phys_indices = real_pos[lindices] + mask[phys_indices[:]] = True + return Column(self._table, self._col_name, mask=mask) + + + elif isinstance(key, (list, tuple, np.ndarray)): + real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() + phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) + return self._raw_col[phys_indices] + + raise TypeError(f"Invalid index type: {type(key)}") + + def __setitem__(self, key: int | slice | list | np.ndarray, value): + if isinstance(key, int): + n_rows = len(self) + if key < 0: + key += n_rows + if not (0 <= key < n_rows): + raise IndexError(f"index {key} is out of bounds for column with size {n_rows}") + pos_true = _find_physical_index(self._valid_rows, key) + self._raw_col[int(pos_true)] = value + + elif isinstance(key, (slice, list, tuple, np.ndarray)): + real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() + if isinstance(key, slice): + lindices = range(*key.indices(len(real_pos))) + phys_indices = np.array([real_pos[i] for i in lindices], dtype=np.int64) + else: + phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) + + if isinstance(value, (list, tuple)): + value = np.array(value, dtype=self._raw_col.dtype) + self._raw_col[phys_indices] = value + + else: + raise TypeError(f"Invalid index type: {type(key)}") + + def __iter__(self): + arr = self._valid_rows + chunk_size = arr.chunks[0] + + for info in arr.iterchunks_info(): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + yield from self._raw_col[chunk_start: 
chunk_start + actual_size] + continue + + mask_chunk = arr[chunk_start: chunk_start + actual_size] + data_chunk = self._raw_col[chunk_start: chunk_start + actual_size] + yield from data_chunk[mask_chunk] + + def __len__(self): + return blosc2.count_nonzero(self._valid_rows) + + def __lt__(self, other): + # < (Less than) + return self._raw_col < other + + def __le__(self, other): + # <= (Less than or equal to) + return self._raw_col <= other + + def __eq__(self, other): + # == (Equal to) + return self._raw_col == other + + def __ne__(self, other): + # != (Not equal to) + return self._raw_col != other + + def __gt__(self, other): + # > (Greater than) + return self._raw_col > other + + def __ge__(self, other): + # >= (Greater than or equal to) + return self._raw_col >= other + + @property + def dtype(self): + return self._raw_col.dtype + + def to_numpy(self): + real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() + return self._raw_col[real_pos[:]] + + +class CTable(Generic[RowT]): + def __init__(self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False) -> None: + self._row_type = row_type + self._cols: dict[str, blosc2.NDArray] = {} + self._n_rows: int = 0 + self._col_widths: dict[str, int] = {} + self.col_names = [] + self.row = _RowIndexer(self) + self.auto_compact = compact + self.base = None + + c, b = compute_chunks_blocks((expected_size,)) + self._valid_rows = blosc2.zeros(shape=(expected_size,), dtype=np.bool_, chunks=c, blocks=b) + + for name, field in row_type.model_fields.items(): + self.col_names.append(name) + dt, display_width = _resolve_field_dtype(field) + final_width = max(len(name), display_width) + self._col_widths[name] = final_width + self._cols[name] = blosc2.zeros(shape=(expected_size,), dtype=dt, chunks=c, blocks=b) + + if new_data is not None: + self._load_initial_data(new_data) + + def _load_initial_data(self, new_data) -> None: + """Dispatch new_data to append() or 
extend() as appropriate.""" + is_append = False + + if isinstance(new_data, (np.void, np.record)): + is_append = True + elif isinstance(new_data, np.ndarray): + if new_data.dtype.names is not None and new_data.ndim == 0: + is_append = True + elif isinstance(new_data, list) and len(new_data) > 0: + first_elem = new_data[0] + if isinstance(first_elem, (str, bytes, int, float, bool, complex)): + is_append = True + + if is_append: + self.append(new_data) + else: + self.extend(new_data) + + def __str__(self): + retval = [] + cont = 0 + + # We print the header + for name in self._cols: + retval.append(f"{name:^{self._col_widths[name]}} |") + cont += self._col_widths[name] + 2 + retval.append("\n") + for _i in range(cont): + retval.append("-") + retval.append("\n") + + # We print the rows + + """Change this. Use where""" + real_poss = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute() + + for j in real_poss: + for name in self._cols: + retval.append(f"{self._cols[name][j]:^{self._col_widths[name]}}") + retval.append(" |") + retval.append("\n") + for _ in range(cont): + retval.append("-") + retval.append("\n") + return "".join(retval) + + def __len__(self): + return self._n_rows + + def view(self, new_valid_rows): + if not ( + isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr)) + and (getattr(new_valid_rows, "dtype", None) == np.bool_) + ): + raise TypeError( + f"Expected boolean blosc2.NDArray or LazyExpr, got {type(new_valid_rows).__name__}" + ) + + new_valid_rows = ( + new_valid_rows.compute() if isinstance(new_valid_rows, blosc2.LazyExpr) else new_valid_rows + ) + + if len(self._valid_rows) != len(new_valid_rows): + raise ValueError() + + retval = CTable(self._row_type, compact=self.auto_compact, expected_size=len(self._valid_rows)) + retval._cols = self._cols + retval._n_rows = blosc2.count_nonzero(new_valid_rows) + retval._col_widths = self._col_widths + retval.col_names = self.col_names + retval.base = self + 
retval._valid_rows = new_valid_rows + + return retval + + def head(self, N: int = 5) -> CTable: + """ + # Alternative code, slower with big data + if n <= 0: + return CTable(self._row_type, compact=self.auto_compact) + + real_poss = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute() + n_take = min(n, self._n_rows) + + retval = CTable(self._row_type, compact=self.auto_compact) + retval._n_rows = n_take + retval._valid_rows[:n_take] = True + + for k in self._cols.keys(): + retval._cols[k][:n_take] = self._cols[k][real_poss[:n_take]] + + return retval""" + if N <= 0: + # If N is 0 or negative, return an empty table + return self.view(blosc2.zeros(shape=len(self._valid_rows), dtype=np.bool_)) + + arr = self._valid_rows + count = 0 + chunk_size = arr.chunks[0] + pos_N_true = -1 + if N <= 0: + return self.view(blosc2.zeros(shape=len(arr), dtype=np.bool_)) + for info in arr.iterchunks_info(): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + # All False without decompressing -> skip + if info.special == blosc2.SpecialValue.ZERO: + continue + + # Repeated value -> check if True or False + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue # all False, skip + # All True: target is at offset (N - count - 1) within the chunk + if count + actual_size < N: + count += actual_size + continue + pos_N_true = chunk_start + (N - count - 1) + break + + # General case: decompress only this chunk + chunk_data = arr[chunk_start : chunk_start + actual_size] + + n_true = int(np.count_nonzero(chunk_data)) + if count + n_true < N: + count += n_true + continue + + # The N-th True is in this chunk + pos_N_true = chunk_start + int(np.flatnonzero(chunk_data)[N - count - 1]) + break + + if pos_N_true == -1: + return self.view(self._valid_rows) + + if pos_N_true < len(self._valid_rows) // 2: + mask_arr = 
blosc2.zeros(shape=len(arr), dtype=np.bool_) + mask_arr[: pos_N_true + 1] = True + else: + mask_arr = blosc2.ones(shape=len(arr), dtype=np.bool_) + mask_arr[pos_N_true + 1 :] = False + + mask_arr = (mask_arr & self._valid_rows).compute() + return self.view(mask_arr) + + def tail(self, N: int = 5) -> CTable: + if N <= 0: + # If N is 0 or negative, return an empty table + return self.view(blosc2.zeros(shape=len(self._valid_rows), dtype=np.bool_)) + + arr = self._valid_rows + count = 0 + chunk_size = arr.chunks[0] + pos_N_true = -1 + + # Convert to list to iterate chunks in reverse order (metadata only, ~0 memory) + for info in reversed(list(arr.iterchunks_info())): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + # All False without decompressing -> skip + if info.special == blosc2.SpecialValue.ZERO: + continue + + # Repeated value -> check if True or False + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue # all False, skip + + # All True: target is at offset 'actual_size - (N - count)' from chunk start + if count + actual_size < N: + count += actual_size + continue + pos_N_true = chunk_start + actual_size - (N - count) + break + + # General case: decompress only this chunk + chunk_data = arr[chunk_start : chunk_start + actual_size] + + n_true = int(np.count_nonzero(chunk_data)) + if count + n_true < N: + count += n_true + continue + + # The N-th True from the end is in this chunk + # We use negative indexing [-(N - count)] to get elements from the back + pos_N_true = chunk_start + int(np.flatnonzero(chunk_data)[-(N - count)]) + break + + if pos_N_true == -1: + return self.view(self._valid_rows) + + # Mask creation logic reversed: keep everything from pos_N_true to the end + if pos_N_true > len(arr) // 2: + # We keep a small tail (less than half the array): start with zeros + mask_arr = blosc2.zeros(shape=len(arr), 
dtype=np.bool_) + mask_arr[pos_N_true:] = True + else: + # We keep a large tail (more than half the array): start with ones + mask_arr = blosc2.ones(shape=len(arr), dtype=np.bool_) + if pos_N_true > 0: + mask_arr[:pos_N_true] = False + + # Compute intersection with existing valid rows and creating view + mask_arr = (mask_arr & self._valid_rows).compute() + return self.view(mask_arr) + + def __getitem__(self, s: str): + if s in self._cols: + return Column(self, s) + return None + + def __getattr__(self, s: str): + if s in self._cols: + return Column(self, s) + return super().__getattribute__(s) + + def compact(self): + real_poss = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute() + start = 0 + block_size = self._valid_rows.blocks[0] + end = min(block_size, self._n_rows) + while start < end: + for _k, v in self._cols.items(): + v[start:end] = v[real_poss[start:end]] + start += block_size + end = min(end + block_size, self._n_rows) + + self._valid_rows[: self._n_rows] = True + self._valid_rows[self._n_rows :] = False + + @property + def nrows(self) -> int: + return self._n_rows + + @property + def ncols(self) -> int: + return len(self._cols) + + def info(self) -> None: + """ + Prints a concise summary of the CTable, including the column names, + their data types, and memory layout. 
+ """ + n_cols = len(self._cols) + n_rows = len(self) + + # Calculate global memory usage + cbytes = sum(col.cbytes for col in self._cols.values()) + self._valid_rows.cbytes + nbytes = sum(col.nbytes for col in self._cols.values()) + self._valid_rows.nbytes + + def format_bytes(bytes_size: float) -> str: + if bytes_size < 1024: + return f"{bytes_size} B" + elif bytes_size < 1024**2: + return f"{bytes_size / 1024:.2f} KB" + elif bytes_size < 1024**3: + return f"{bytes_size / (1024**2):.2f} MB" + else: + return f"{bytes_size / (1024**3):.2f} GB" + + ratio = (nbytes / cbytes) if cbytes > 0 else 0.0 + + lines = [] + lines.append("") + lines.append(f"nºColumns: {n_cols}") + lines.append(f"nºRows: {n_rows}") + lines.append("") + + # New Header: replaced "Non-Null Count" with internal Array length & Itemsize + header = f" {'#':>3} {'Column':<15} {'Itemsize':<12} {'Dtype':<15}" + lines.append(header) + lines.append(f" {'---':>3} {'------':<15} {'--------':<12} {'-----':<15}") + + for i, name in enumerate(self.col_names): + col_array = self._cols[name] + dtype_str = str(col_array.dtype) + itemsize = f"{col_array.dtype.itemsize} B" + + line = f" {i:>3} {name:<15} {itemsize:<12} {dtype_str:<15}" + lines.append(line) + + lines.append("") + lines.append(f"memory usage: {format_bytes(cbytes)}") + lines.append(f"uncompressed size: {format_bytes(nbytes)}") + lines.append(f"compression ratio: {ratio:.2f}x") + lines.append("") + + print("\n".join(lines)) + + def append(self, data: list | np.void | np.ndarray) -> None: + if self.base is not None: + raise TypeError("Cannot extend view.") + + is_list = isinstance(data, (list, tuple)) + + arr = self._valid_rows + chunk_size = arr.chunks[0] + last_true_pos = -1 + + for info in reversed(list(arr.iterchunks_info())): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + 
val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + last_true_pos = chunk_start + actual_size - 1 + break + + chunk_data = arr[chunk_start : chunk_start + actual_size] + nonzero = np.flatnonzero(chunk_data) + if len(nonzero) == 0: + continue + last_true_pos = chunk_start + int(nonzero[-1]) + break + + pos = last_true_pos + 1 + + if pos >= len(self._valid_rows): + c = len(self._valid_rows) + for v in self._cols.values(): + v.resize((c * 2,)) + self._valid_rows.resize((c * 2,)) + + if is_list: + for i, col_array in enumerate(self._cols.values()): + col_array[pos] = data[i] + else: + for name, col_array in self._cols.items(): + col_array[pos] = data[name] + + self._valid_rows[pos] = True + self._n_rows += 1 + + def delete(self, ind: int | slice | str | Iterable) -> blosc2.NDArray: + valid_rows_np = self._valid_rows[:] + true_pos = np.where(valid_rows_np)[0] + + if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)): + ind = list(ind) + elif not isinstance(ind, int) and not isinstance(ind, slice): + raise TypeError(f"Invalid type '{type(ind)}'") + + false_pos = true_pos[ind] + + new_mask_np = valid_rows_np.copy() + new_mask_np[false_pos] = False + + new_mask = blosc2.asarray(new_mask_np) + self._valid_rows = new_mask + self._n_rows = blosc2.count_nonzero(self._valid_rows) + + def extend(self, data: list | CTable | Any) -> None: + if self.base is not None: + raise TypeError("Cannot extend view.") + if len(data) <= 0: + return + ultimas_validas = blosc2.where(self._valid_rows, np.array(range(len(self._valid_rows)))).compute() + start_pos = ultimas_validas[-1] + 1 if len(ultimas_validas) > 0 else 0 + + current_col_names = self.col_names + columns_to_insert = [] + new_nrows = 0 + + if hasattr(data, "_cols") and hasattr(data, "_n_rows"): + for name in current_col_names: + col = data._cols[name][: data._n_rows] + columns_to_insert.append(col) + new_nrows = data._n_rows + else: + if isinstance(data, np.ndarray) and 
data.dtype.names is not None: + for name in current_col_names: + columns_to_insert.append(data[name]) + new_nrows = len(data) + else: + columns_to_insert = list(zip(*data, strict=False)) + new_nrows = len(data) + + processed_cols = [] + for i, raw_col in enumerate(columns_to_insert): + target_dtype = self._cols[current_col_names[i]].dtype + b2_arr = blosc2.asarray(raw_col, dtype=target_dtype) + processed_cols.append(b2_arr) + + end_pos = start_pos + new_nrows + + if self.auto_compact and end_pos >= len(self._valid_rows): + self.compact() + ultimas_validas = blosc2.where( + self._valid_rows, np.array(range(len(self._valid_rows))) + ).compute() + start_pos = ultimas_validas[-1] + 1 if len(ultimas_validas) > 0 else 0 + end_pos = start_pos + new_nrows + + while end_pos > len(self._valid_rows): + c = len(self._valid_rows) + for name in current_col_names: + self._cols[name].resize((c * 2,)) + self._valid_rows.resize((c * 2,)) + + # Do this per chunks + for j, name in enumerate(current_col_names): + self._cols[name][start_pos:end_pos] = processed_cols[j][:] + + self._valid_rows[start_pos:end_pos] = True + self._n_rows = blosc2.count_nonzero(self._valid_rows) + + @profile + def where(self, expr_result) -> CTable: + if not ( + isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr)) + and (getattr(expr_result, "dtype", None) == np.bool_) + ): + raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(expr_result).__name__}") + + filter = expr_result.compute() if isinstance(expr_result, blosc2.LazyExpr) else expr_result + + target_len = len(self._valid_rows) + + if len(filter) > target_len: + filter = filter[:target_len] + elif len(filter) < target_len: + padding = blosc2.zeros(target_len, dtype=np.bool_) + padding[: len(filter)] = filter[:] + filter = padding + + filter = (filter & self._valid_rows).compute() + + return self.view(filter) + + def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable: + valid_rows_np = self._valid_rows[:] + 
# Persistence is not implemented yet. Both entry points raise explicitly so
# that a caller never believes a table was written to, or read from, disk.

def save(self, urlpath: str, group: str = "table") -> None:
    """Persist the table to ``urlpath`` (not implemented yet).

    Parameters
    ----------
    urlpath : str
        Destination path.
    group : str
        Name under which the table would be stored.

    Raises
    ------
    NotImplementedError
        Always; the previous empty stub silently did nothing.
    """
    raise NotImplementedError("CTable.save() is not implemented yet")

@classmethod
def load(cls, urlpath: str, group: str = "table", row_type: type[RowT] | None = None) -> CTable:
    """Load a table from ``urlpath`` (not implemented yet).

    Parameters
    ----------
    urlpath : str
        Source path.
    group : str
        Name under which the table would be stored.
    row_type : type, optional
        Row model describing the columns.

    Raises
    ------
    NotImplementedError
        Always; the previous empty stub silently returned ``None``.
    """
    raise NotImplementedError("CTable.load() is not implemented yet")
def test_column_getitem_with_holes():
    """Scalar, list and slice indexing address logical rows once rows are deleted."""
    # First table: logical rows 1, 3, 5, 7, 9 removed from the 20-row fixture.
    table = CTable(RowModel, new_data=DATA20)
    table.delete([1, 3, 5, 7, 9])
    ids = table.id

    # logical index -> expected id
    scalar_cases = {0: 0, 1: 2, 2: 4, 3: 6, 4: 8, -1: 19, -2: 18}
    for index, expected in scalar_cases.items():
        assert ids[index] == expected

    list_cases = [
        ([0, 2, 4], [0, 4, 8]),
        ([5, 3, 1], [10, 6, 2]),
    ]
    for picks, expected in list_cases:
        assert list(ids[picks]) == expected

    # Second table: every odd position removed, only even ids survive.
    sparse = CTable(RowModel, new_data=DATA20)
    sparse.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
    even_ids = sparse.id

    slice_cases = [
        (slice(0, 5), [0, 2, 4, 6, 8]),
        (slice(5, 10), [10, 12, 14, 16, 18]),
        (slice(None, None, 2), [0, 4, 8, 12, 16]),
    ]
    for window, expected in slice_cases:
        assert list(even_ids[window].to_numpy()) == expected
def test_column_iter():
    """Iterating a column yields only live rows, in order, for several hole patterns."""
    # No deletions: every id comes back.
    full = CTable(RowModel, new_data=DATA20)
    assert list(full.id) == list(range(20))

    # All odd positions deleted: the even ids remain.
    evens_only = CTable(RowModel, new_data=DATA20)
    evens_only.delete(list(range(1, 20, 2)))
    assert list(evens_only.id) == list(range(0, 20, 2))

    # Every fifth row deleted: check the score column rather than the ids.
    dropped = [0, 5, 10, 15]
    scored = CTable(RowModel, new_data=DATA20)
    scored.delete(dropped)
    surviving_scores = [float(i * 10) for i in range(20) if i not in dropped]
    assert list(scored.score) == surviving_scores
def test_to_array_no_holes():
    """to_numpy() on slice views of a table without deletions returns the sliced ids."""
    table = CTable(RowModel, new_data=DATA20)
    ids = table.id

    # In DATA20 the id equals the row position, so each window maps to arange.
    windows = ((0, 5), (5, 10), (15, 20), (0, 20))
    for start, stop in windows:
        expected = np.arange(start, stop, dtype=np.int64)
        np.testing.assert_array_equal(ids[start:stop].to_numpy(), expected)
def test_column_view_mask_is_independent():
    """Two slice views on the same column have independent masks."""
    tabla = CTable(RowModel, new_data=DATA20)
    col = tabla.id

    # Create both views before reading either, so a mask shared between them
    # would show up as wrong contents in at least one read below.
    view_a = col[0:5]
    view_b = col[10:15]

    expected_a = np.arange(0, 5, dtype=np.int64)
    expected_b = np.arange(10, 15, dtype=np.int64)

    np.testing.assert_array_equal(view_a.to_numpy(), expected_a)
    np.testing.assert_array_equal(view_b.to_numpy(), expected_b)
    # Reading view_b must not have disturbed view_a.
    np.testing.assert_array_equal(view_a.to_numpy(), expected_a)
def test_compact_full_table():
    """compact() on a table with no holes and no spare capacity changes nothing."""
    n_rows = 50
    table = CTable(RowModel, new_data=generate_test_data(n_rows), expected_size=n_rows)

    assert len(table) == n_rows
    capacity_before = len(table._valid_rows)

    # Must neither raise nor alter the logical contents.
    table.compact()

    assert len(table) == n_rows
    # The table was already full, so the physical capacity stays the same.
    assert len(table._valid_rows) == capacity_before

    # First and last ids are still where they were.
    assert table.id[0] == 0
    assert table.id[-1] == 49
def test_compact_with_holes():
    """compact() packs the surviving rows of a fragmented table to the front."""
    table = CTable(RowModel, new_data=generate_test_data(30), expected_size=50)

    # Keep only every fifth row.
    survivors = [0, 5, 10, 15, 20, 25]
    table.delete([i for i in range(30) if i % 5 != 0])
    assert len(table) == 6

    table.compact()
    assert len(table) == 6

    for slot, wanted_id in enumerate(survivors):
        # Logical access through the Column wrapper ...
        assert table.id[slot] == wanted_id
        # ... and physical access, proving the data really moved.
        assert table._cols["id"][slot] == wanted_id

    # Physical mask: six leading True flags, nothing set after them.
    mask = table._valid_rows[: len(table._valid_rows)]
    head, tail = mask[:6], mask[6:]
    assert np.all(head)
    if len(mask) > 6:
        assert not np.any(tail)
class MaxLen:
    """Annotation marker holding a maximum length, stored as an ``int``."""

    def __init__(self, length: int):
        # Coerce once so that ``length`` is always a plain int.
        as_int = int(length)
        self.length = as_int
def assert_table_equals_data(table: CTable, expected_data: list):
    """Assert that ``table`` holds exactly the rows in ``expected_data``.

    Rows are compared position by position and values column by column, in
    ``table.col_names`` order. Floats and complex numbers are compared with
    ``np.testing.assert_allclose``; every other value must be equal.
    """
    n_expected = len(expected_data)
    assert len(table) == n_expected, f"Expected length {n_expected}, got {len(table)}"

    names = table.col_names
    for i, expected_row in enumerate(expected_data):
        row_view = table.row[i]
        for col_idx, expected_val in enumerate(expected_row):
            col_name = names[col_idx]
            # A row exposes each column as a sequence; take its first element.
            extracted_val = getattr(row_view, col_name)[0]

            if not isinstance(expected_val, (float, complex)):
                assert extracted_val == expected_val, (
                    f"Row {i}, col {col_name}: expected {expected_val}, got {extracted_val}"
                )
                continue

            np.testing.assert_allclose(
                extracted_val, expected_val, err_msg=f"Discrepancy at row {i}, col {col_name}"
            )
def test_compact_flag():
    """The ``compact`` argument sets ``auto_compact`` and does not affect stored data."""
    for flag in (False, True):
        table = CTable(RowModel, new_data=SMALL_DATA, compact=flag)
        assert table.auto_compact is flag
        assert_table_equals_data(table, SMALL_DATA)
def test_valid_rows():
    """The validity mask has 5 set flags whether rows arrive via the constructor or extend()."""

    def live_rows(table):
        # Number of True flags in the physical validity mask.
        return blosc2.count_nonzero(table._valid_rows)

    built = CTable(RowModel, new_data=SMALL_DATA)
    assert live_rows(built) == 5

    grown = CTable(RowModel)
    grown.extend(SMALL_DATA)
    assert live_rows(grown) == 5
def generate_test_data(n_rows: int) -> list:
    """Build ``n_rows`` tuples ``(id, c_val, score, active)`` with ids 1..n_rows."""

    def make_row(i: int) -> tuple:
        # score cycles through multiples of 7 modulo 100; active is True for odd ids.
        return (i, complex(i, -i), float((i * 7) % 100), bool(i % 2))

    return list(map(make_row, range(1, n_rows + 1)))
def test_delete_list_of_positions():
    """Deleting a list of positions removes exactly that many rows, for several patterns."""
    data = generate_test_data(50)

    # (positions to delete, rows expected to remain)
    cases = [
        ([0, 10, 20, 30, 40], 45),  # scattered
        ([5, 6, 7, 8, 9], 45),  # consecutive block
        (list(range(0, 50, 2)), 25),  # all even positions
        (list(range(1, 50, 2)), 25),  # all odd positions
        (list(range(10, 20)), 40),  # slice-equivalent
        (list(range(0, 20, 2)), 40),  # slice with step
        (list(range(0, 10)), 40),  # first 10 rows
        (list(range(40, 50)), 40),  # last 10 rows
    ]

    for positions, remaining in cases:
        # Fresh table per case so the deletions never interact.
        table = CTable(RowModel, new_data=data, expected_size=50)
        table.delete(positions)
        assert len(table) == remaining
def test_delete_invalid_types():
    """delete() rejects a string and a float with TypeError, and a list holding a string with IndexError."""
    table = CTable(RowModel, new_data=generate_test_data(50), expected_size=50)

    rejected = (
        ("invalid", TypeError),
        (10.5, TypeError),
        ([0, "invalid", 10], IndexError),
    )
    for bad_index, error in rejected:
        with pytest.raises(error):
            table.delete(bad_index)
def generate_test_data(n_rows: int, start_id: int = 1) -> list:
    """Build ``n_rows`` tuples ``(id, c_val, score, active)`` with ids from ``start_id``.

    Only the id is shifted by ``start_id``; the other three fields depend on
    the zero-based row offset.
    """
    rows = []
    for offset in range(n_rows):
        c_val = complex(offset, -offset)
        score = float((offset * 7) % 100)
        active = bool(offset % 2)
        rows.append((start_id + offset, c_val, score, active))
    return rows
def test_gap_fill_mask_and_positions():
    """New rows land after the last valid slot, never in holes, and the mask follows."""
    # --- extend() after deletions: check the mask and the physical positions.
    holed = CTable(RowModel, new_data=generate_test_data(7, 1), expected_size=10)
    holed.delete([0, 2, 4, 6])
    alternating = [False, True, False, True, False, True]
    assert_mask_matches(holed, [*alternating, False])
    assert len(holed) == 3

    holed.extend(generate_test_data(3, 8))
    assert_mask_matches(holed, [*alternating, True, True, True])
    assert len(holed) == 6
    assert_data_at_positions(holed, [6, 7, 8], [8, 9, 10])

    # --- append() also continues after the last valid slot.
    appended = CTable(RowModel, new_data=generate_test_data(5, 1), expected_size=10)
    appended.delete([1, 3])
    expected_mask = [True, False, True, False, True]
    assert_mask_matches(appended, expected_mask)
    for new_row in ((6, 1j, 50.0, True), (7, 2j, 60.0, False)):
        appended.append(new_row)
        # Each append adds exactly one valid slot at the end.
        expected_mask = [*expected_mask, True]
        assert_mask_matches(appended, expected_mask)

    # --- extend() with spare capacity writes right after the last valid row.
    roomy = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15)
    roomy.delete([2, 4, 6])
    roomy.extend(generate_test_data(3, 20))
    assert_data_at_positions(roomy, [10, 11, 12], [20, 21, 22])
massive resize after deletions + extend + t2 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=10, compact=True) + t2.delete(list(range(9))) + assert len(t2) == 1 + initial_cap2 = len(t2._valid_rows) + t2.extend(generate_test_data(3, 11)) + assert len(t2._valid_rows) <= initial_cap2 * 2 + + # extend exceeding capacity always resizes regardless of compact + t3 = CTable(RowModel, new_data=generate_test_data(5, 1), expected_size=10, compact=False) + t3.delete([0, 2, 4]) + initial_cap3 = len(t3._valid_rows) + t3.extend(generate_test_data(20, 100)) + assert len(t3._valid_rows) > initial_cap3 + + +def test_mixed_append_extend_with_gaps(): + """Multiple extends, appends, and deletes interleaved; lengths stay correct.""" + # Multiple extends with intermediate deletions + t = CTable(RowModel, expected_size=20) + t.extend(generate_test_data(5, 1)) + t.extend(generate_test_data(3, 10)) + assert len(t) == 8 + t.delete([2, 4, 6]) + assert len(t) == 5 + t.extend(generate_test_data(2, 20)) + assert len(t) == 7 + t.delete([0, 1]) + assert len(t) == 5 + t.extend(generate_test_data(4, 30)) + assert len(t) == 9 + + # append + extend mixed, delete all then re-extend + t2 = CTable(RowModel, expected_size=20) + for i in range(5): + t2.append((i + 1, complex(i), float(i * 10), True)) + assert len(t2) == 5 + t2.extend(generate_test_data(5, 10)) + assert len(t2) == 10 + t2.delete([1, 3, 5, 7, 9]) + assert len(t2) == 5 + t2.append((100, 0j, 50.0, False)) + assert len(t2) == 6 + t2.extend(generate_test_data(3, 200)) + assert len(t2) == 9 + + # Fill all gaps then extend; delete all then extend from scratch + t3 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15) + t3.delete(list(range(0, 10, 2))) + assert len(t3) == 5 + t3.extend(generate_test_data(5, 20)) + assert len(t3) == 10 + + t4 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15) + t4.delete(list(range(10))) + assert len(t4) == 0 + t4.extend(generate_test_data(5, 100)) + 
assert len(t4) == 5 + + +def test_compact_behavior(): + """Manual compact consolidates mask; auto-compact keeps data correct after extend.""" + # Manual compact: valid rows packed to front, extend fills after them + t = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15, compact=False) + t.delete([1, 3, 5, 7, 9]) + assert len(t) == 5 + t.compact() + assert_mask_matches(t, [True] * 5 + [False] * 10) + t.extend(generate_test_data(3, 20)) + assert len(t) == 8 + + # Auto-compact: table stays consistent after heavy deletions + extend + t2 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15, compact=True) + t2.delete(list(range(0, 8))) + assert len(t2) == 2 + t2.extend(generate_test_data(10, 100)) + assert len(t2) == 12 + + +def test_complex_scenarios(): + """Sparse gaps, alternating cycles, data integrity, and full workflow.""" + # Sparse table: many scattered deletions then bulk extend + t = CTable(RowModel, new_data=generate_test_data(20, 1), expected_size=30) + t.delete([0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18]) + assert len(t) == 5 + t.extend(generate_test_data(10, 100)) + assert len(t) == 15 + + # Alternating extend/delete cycles + t2 = CTable(RowModel, expected_size=50) + for cycle in range(5): + t2.extend(generate_test_data(10, cycle * 100)) + current_len = len(t2) + if current_len >= 5: + t2.delete(list(range(0, min(5, current_len)))) + + # Data integrity: correct row values survive delete + extend + t3 = CTable( + RowModel, new_data=[(1, 1j, 10.0, True), (2, 2j, 20.0, False), (3, 3j, 30.0, True)], expected_size=10 + ) + t3.delete(1) + assert t3.row[0].id[0] == 1 + assert t3.row[1].id[0] == 3 + t3.extend([(10, 10j, 100.0, True), (11, 11j, 100.0, False)]) + assert t3.row[0].id[0] == 1 + assert t3.row[1].id[0] == 3 + assert t3.row[2].id[0] == 10 + assert t3.row[3].id[0] == 11 + + # Full workflow + t4 = CTable(RowModel, expected_size=20, compact=False) + t4.extend(generate_test_data(10, 1)) + assert len(t4) == 10 + 
t4.delete([0, 2, 4, 6, 8]) + assert len(t4) == 5 + t4.append((100, 0j, 50.0, True)) + t4.append((101, 1j, 60.0, False)) + assert len(t4) == 7 + t4.extend(generate_test_data(5, 200)) + assert len(t4) == 12 + t4.delete([3, 7, 10]) + assert len(t4) == 9 + t4.extend(generate_test_data(3, 300)) + assert len(t4) == 12 + assert t4.nrows == 12 + assert t4.ncols == 4 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_row_logic.py b/tests/ctable/test_row_logic.py new file mode 100644 index 00000000..483b7924 --- /dev/null +++ b/tests/ctable/test_row_logic.py @@ -0,0 +1,221 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from typing import Annotated + +import numpy as np +import pytest +from pydantic import BaseModel, Field + +from blosc2 import CTable +from blosc2.ctable import Column + + +class NumpyDtype: + def __init__(self, dtype): + self.dtype = dtype + + +class RowModel(BaseModel): + id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0) + score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100) + active: Annotated[bool, NumpyDtype(np.bool_)] = True + + +def generate_test_data(n_rows: int, start_id: int = 0) -> list: + return [(start_id + i, float(i * 10), i % 2 == 0) for i in range(n_rows)] + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_row_int_indexing(): + """int indexing: no holes, with holes, negative indices, and out-of-range.""" + data = generate_test_data(20) + + # No holes: spot checks + t = CTable(RowModel, new_data=data) + r = t.row[0] + assert isinstance(r, CTable) + assert len(r) == 1 + assert r.id[0] == 0 + assert r.score[0] == 0.0 + assert r.active[0] + 
assert t.row[10].id[0] == 10 + assert t.row[10].score[0] == 100.0 + + # Negative indices + assert t.row[-1].id[0] == 19 + assert t.row[-5].id[0] == 15 + + # With holes: delete odd positions -> valid: 0,2,4,6,8,10... + t.delete([1, 3, 5, 7, 9]) + assert t.row[0].id[0] == 0 + assert t.row[1].id[0] == 2 + assert t.row[5].id[0] == 10 + + # Out of range + t2 = CTable(RowModel, new_data=generate_test_data(10)) + for idx in [10, 100, -11]: + with pytest.raises(IndexError): + _ = t2.row[idx] + + +def test_row_slice_indexing(): + """Slice indexing: no holes, with holes, step, negative, beyond bounds, empty/full.""" + data = generate_test_data(20) + + # No holes + t = CTable(RowModel, new_data=data) + assert isinstance(t.row[0:5], CTable) + assert list(t.row[0:5].id) == [0, 1, 2, 3, 4] + assert list(t.row[10:15].id) == [10, 11, 12, 13, 14] + assert list(t.row[::2].id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] + + # With step + assert list(t.row[0:10:2].id) == [0, 2, 4, 6, 8] + assert list(t.row[1:10:3].id) == [1, 4, 7] + + # Negative indices + assert list(t.row[-5:].id) == [15, 16, 17, 18, 19] + assert list(t.row[-10:-5].id) == [10, 11, 12, 13, 14] + + # With holes: delete odd positions + t.delete([1, 3, 5, 7, 9]) + assert list(t.row[0:5].id) == [0, 2, 4, 6, 8] + assert list(t.row[5:10].id) == [10, 11, 12, 13, 14] + + # Beyond bounds + t2 = CTable(RowModel, new_data=generate_test_data(10)) + assert len(t2.row[11:20]) == 0 + assert list(t2.row[5:100].id) == [5, 6, 7, 8, 9] + assert len(t2.row[100:]) == 0 + + # Empty and full slices + assert len(t2.row[5:5]) == 0 + assert len(t2.row[0:0]) == 0 + result = t2.row[:] + assert len(result) == 10 + assert list(result.id) == list(range(10)) + + +def test_row_list_indexing(): + """List indexing: no holes, with holes, out-of-range, edge cases.""" + data = generate_test_data(20) + + # No holes + t = CTable(RowModel, new_data=data) + r = t.row[[0, 5, 10, 15]] + assert isinstance(r, CTable) + assert len(r) == 4 + assert set(r.id) == {0, 5, 
10, 15} + assert set(t.row[[19, 0, 10]].id) == {0, 10, 19} + + # With holes: delete [1,3,5,7,9] -> logical 0->id0, 1->id2, 2->id4... + t.delete([1, 3, 5, 7, 9]) + assert set(t.row[[0, 2, 4]].id) == {0, 4, 8} + assert set(t.row[[5, 3, 1]].id) == {2, 6, 10} + + # Negative indices in list + t2 = CTable(RowModel, new_data=generate_test_data(10)) + assert set(t2.row[[0, -1, 5]].id) == {0, 5, 9} + + # Single element + assert t2.row[[5]].id[0] == 5 + + # Duplicate indices -> deduplicated + r_dup = t2.row[[5, 5, 5]] + assert len(r_dup) == 1 + assert r_dup.id[0] == 5 + + # Empty list + assert len(t2.row[[]]) == 0 + + # Out of range + for bad in [[0, 5, 100], [0, 1, -11]]: + with pytest.raises(IndexError): + _ = t2.row[bad] + + +def test_row_view_properties(): + """View metadata, base chain, mask integrity, column liveness, and chained views.""" + data = generate_test_data(100) + tabla0 = CTable(RowModel, new_data=data) + + # Base is None on root table + assert tabla0.base is None + + # View properties are shared with parent + v = tabla0.row[0:10] + assert v.base is tabla0 + assert v._row_type == tabla0._row_type + assert v._cols is tabla0._cols + assert v._col_widths == tabla0._col_widths + assert v.col_names == tabla0.col_names + + # Read ops on view + view = tabla0.row[5:15] + assert view.id[0] == 5 + assert view.score[0] == 50.0 + assert not view.active[0] + assert list(view.id) == list(range(5, 15)) + + # Mask integrity + assert np.count_nonzero(view._valid_rows[:]) == 10 + + # Column is live (points back to its view) + col = view.id + assert isinstance(col, Column) + assert col._table is view + + # Chained views: base always points to immediate parent + tabla1 = tabla0.row[:50] + assert tabla1.base is tabla0 + assert len(tabla1) == 50 + + tabla2 = tabla1.row[:10] + assert tabla2.base is tabla1 + assert len(tabla2) == 10 + assert list(tabla2.id) == list(range(10)) + + tabla3 = tabla2.row[5:] + assert tabla3.base is tabla2 + assert len(tabla3) == 5 + assert 
list(tabla3.id) == [5, 6, 7, 8, 9] + + # Chained view with holes on parent + tabla0.delete([5, 10, 15, 20, 25]) + tv1 = tabla0.row[:30] + assert tv1.base is tabla0 + assert len(tv1) == 30 + tv2 = tv1.row[10:20] + assert tv2.base is tv1 + assert len(tv2) == 10 + + +def test_row_edge_cases(): + """Empty table, fully-deleted table: int raises IndexError, slice returns empty.""" + # Empty table + empty = CTable(RowModel) + with pytest.raises(IndexError): + _ = empty.row[0] + assert len(empty.row[:]) == 0 + assert len(empty.row[0:10]) == 0 + + # All rows deleted + data = generate_test_data(10) + t = CTable(RowModel, new_data=data) + t.delete(list(range(10))) + with pytest.raises(IndexError): + _ = t.row[0] + assert len(t.row[:]) == 0 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) From c05c2ec87271e466c94bdbdbf974f9fd61bf8f70 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 11:21:43 +0100 Subject: [PATCH 02/11] Add a plan for declaring a simple schema for CTable objects --- plans/ctable-schema.md | 1269 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1269 insertions(+) create mode 100644 plans/ctable-schema.md diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md new file mode 100644 index 00000000..bed1c6a7 --- /dev/null +++ b/plans/ctable-schema.md @@ -0,0 +1,1269 @@ +# CTable Schema Redesign + +## Motivation + +The current `CTable` prototype in PR #598 uses `pydantic.BaseModel` plus +`Annotated[...]` metadata to define table schemas. That works, but it is not the +best long-term API for a columnar container in `python-blosc2`. + +The main issues with the current shape are: + +* It mixes row validation concerns with physical storage concerns. +* It relies on custom metadata objects (`NumpyDtype`, `MaxLen`) embedded in + Pydantic annotations. +* It is verbose for simple schemas. 
+* It does not provide an obvious place for NDArray-specific per-column options + such as `cparams`, `dparams`, `chunks`, `blocks`, or future indexing hints. + +What we want instead is: + +* A schema API that is easy to read and write. +* A place to attach Blosc2-specific per-column configuration. +* A way to express logical constraints such as `ge=0`, `le=100`, `max_length=10`. +* Internal validation without forcing the public API to be Pydantic-shaped. +* A clean distinction between: + * logical field type and constraints + * physical storage type + * per-column storage options + +The proposed solution is a **dataclass-first schema API** with **declarative field +spec objects** and **optional internal Pydantic-backed validation**. + +The intended usage style is: + +* canonical form for constrained or storage-tuned columns: + `id: int = b2.field(b2.int64(ge=0))` +* shorthand for simple inferred columns: + `id: int` +* not preferred as a primary style: + `id = b2.field(b2.int64(ge=0))` + +The reason is that the canonical form preserves normal Python type annotations, +which are valuable for readability, static tooling, and schema inspection. + +--- + +## Proposed public API + +### Schema declaration + +The intended schema declaration style is: + +```python +from dataclasses import dataclass + +import blosc2 as b2 + + +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0)) + score: float = b2.field( + b2.float64(ge=0, le=100), + cparams={"codec": b2.Codec.LZ4, "clevel": 5}, + ) + active: bool = b2.field(b2.bool(), default=True) +``` + +This is the target user-facing API for `CTable`. + +This should be documented as the **canonical** schema declaration style. 
+ +For simple unconstrained cases, `CTable` may support an inferred shorthand: + +```python +@dataclass +class Row: + id: int + score: float + active: bool = True +``` + +which is interpreted approximately as: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64()) + score: float = b2.field(b2.float64()) + active: bool = b2.field(b2.bool(), default=True) +``` + +This shorthand should be limited to simple built-in Python types where the +mapping is obvious. + +### Naming convention + +Use **lowercase names** for schema descriptor objects: + +* `b2.int64` +* `b2.float64` +* `b2.bool` +* later: `b2.string(max_length=...)`, `b2.bytes(max_length=...)`, `b2.complex128` + +Reason: + +* `b2.int64(...)` is not just a dtype; it is a schema descriptor with constraints. +* The lowercase form keeps the API closer in spirit to NumPy and PyTorch. +* If plain NumPy dtypes are needed, callers can use `np.int64`, `np.float64`, + `np.bool_`, etc. +* `b2.bool(...)` is preferred over `b2.bool_(...)` for readability, even though + NumPy uses `bool_`. This is closer to PyTorch style and fits better for a + schema-builder API. + +### Field helper + +`b2.field(...)` should be the standard way to attach schema metadata to a +dataclass field. + +Expected shape: + +```python +b2.field( + b2.float64(ge=0, le=100), + default=..., + default_factory=..., + cparams=..., + dparams=..., + chunks=..., + blocks=..., + title=..., + description=..., + nullable=..., +) +``` + +At minimum for the first version: + +* `spec` +* `default` +* `default_factory` +* `cparams` +* `dparams` +* `chunks` +* `blocks` + +The implementation should store these in `dataclasses.field(metadata=...)`. + +The unannotated form: + +```python +id = b2.field(b2.int64(ge=0)) +``` + +should not be the primary API. 
It may be supported later only if there is a +strong reason, but the preferred style should retain: + +* a Python type annotation in the annotation slot +* `b2.field(...)` in the field/default slot + +That keeps the schema aligned with normal dataclass usage. + +--- + +## Core design + +### 1. Dataclass is the schema carrier + +The dataclass defines: + +* field names +* Python-level row shape +* user-visible defaults + +Example: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0)) + score: float = b2.field(b2.float64(ge=0, le=100)) + active: bool = b2.field(b2.bool(), default=True) +``` + +This keeps the declaration small and idiomatic. + +The Python annotation should remain part of the design, not be replaced by +`b2.field(...)` alone. The annotation provides value independently of the +Blosc2 schema descriptor. + +### 2. Schema spec objects are the source of truth + +Each lowercase builder object is a lightweight immutable schema descriptor. + +Examples: + +```python +b2.int64(ge=0) +b2.float64(ge=0, le=100) +b2.bool() +b2.string(max_length=32) +b2.bytes(max_length=64) +``` + +Each spec object should carry only schema-level metadata, for example: + +* logical kind +* storage dtype +* numeric constraints (`ge`, `gt`, `le`, `lt`, `multiple_of`) +* string constraints (`max_length`, `min_length`, `pattern`) +* nullability +* maybe logical annotations later (`categorical`, `timezone`, `unit`) + +They should **not** directly carry per-column NDArray instance settings such as +`cparams` or `chunks`; those belong in `b2.field(...)`. + +### 3. Column field metadata carries NDArray-specific configuration + +`b2.field(...)` metadata should be the place for: + +* column storage options +* per-column compression settings +* chunk/block tuning +* persistence options in future versions + +This keeps the separation clean: + +* `b2.float64(ge=0, le=100)` answers: "what values are valid?" 
+* `b2.field(..., cparams=..., chunks=...)` answers: "how is this column stored?" + +### 4. Schema compilation step inside CTable + +`CTable` should not consume raw dataclass fields repeatedly. On construction, it +should compile the row class into an internal schema representation. + +For example: + +```python +compiled = CompiledSchema( + row_cls=Row, + columns=[ + CompiledColumn( + name="id", + py_type=int, + spec=b2.int64(ge=0), + dtype=np.int64, + default=MISSING, + cparams=..., + dparams=..., + chunks=..., + blocks=..., + validator_info=..., + ), + ..., + ], + validator_model=..., +) +``` + +This compiled form should drive: + +* NDArray creation +* row validation +* bulk validation +* introspection and future serialization + +--- + +## Validation strategy + +### Use Pydantic internally, but do not make it the public schema API + +Pydantic is a good fit for validation because it is: + +* mature +* well-tested +* expressive +* fast enough for row-level operations + +However, it should be an **implementation detail**, not the public schema surface. + +The public schema should remain: + +* dataclass-based +* Blosc2-specific +* independent of any one validation library + +### Why not use Pydantic as the schema source directly? + +Because storage and validation are overlapping but not identical concerns. + +Examples: + +* `dtype=np.int16` is both logical and physical. +* `cparams`, `chunks`, `blocks`, `dparams` are not Pydantic concepts. +* a future column index, bloom filter, or codec hint is not a validation concept. + +Therefore, the internal architecture should be: + +* user declares a dataclass + `b2.field(...)` +* `CTable` compiles it into: + * storage schema + * validation schema + +### Row-level validation + +For `append(row)` and other row-wise inserts: + +* compile a cached internal Pydantic model once per schema +* validate incoming rows against that model +* convert the validated row into column values + +This is the simplest and safest path. 
+ +Expected behavior: + +* `table.append(Row(...))` +* `table.append({"id": 1, "score": 2.0, "active": True})` +* `table.append((1, 2.0, True))` + +All may be accepted, but internally normalized through one validator path. + +### Bulk validation + +For `extend(...)`, row-by-row Pydantic validation may be too expensive for large +batches. Bulk inserts need a separate strategy. + +Recommended modes: + +* `validate=True` + Full validation. May use row-wise Pydantic validation for smaller inputs and + vectorized checks where available. +* `validate=False` + Trust caller, perform dtype coercion only. +* optional later: `validate="sample"` or `validate="vectorized"` + +For numeric and simple string constraints, vectorized checks are preferable when +possible: + +* `ge`, `gt`, `le`, `lt` +* `max_length`, `min_length` +* null checks +* dtype coercion checks + +This means the architecture should support both: + +* Pydantic row validation +* vectorized array validation + +The compiled schema should expose enough information for both. + +### Performance stance + +Pydantic should be treated as: + +* a strong default for correctness +* fast enough for row-wise validation +* not necessarily the fastest choice for large batch validation + +This is important because the performance bottleneck for `extend()` is more about +per-row Python overhead than about Pydantic specifically. 
+ +--- + +## Detailed API proposal + +### Schema spec classes + +Add schema descriptor classes under `blosc2`, for example: + +* `int8`, `int16`, `int32`, `int64` +* `uint8`, `uint16`, `uint32`, `uint64` +* `float32`, `float64` +* `bool` +* `complex64`, `complex128` +* `string` +* `bytes` + +Minimal constructor examples: + +```python +b2.int64(ge=0) +b2.float64(ge=0, le=100) +b2.string(max_length=32) +b2.bytes(max_length=64) +b2.bool() +``` + +Internal common fields: + +* `dtype` +* `nullable` +* `constraints` +* `python_type` + +### Field helper + +`b2.field(spec, **kwargs)` should return a `dataclasses.field(...)` object with +Blosc2 metadata attached. + +Example metadata layout: + +```python +{ + "blosc2": { + "spec": ..., + "cparams": ..., + "dparams": ..., + "chunks": ..., + "blocks": ..., + } +} +``` + +This metadata key should be stable and reserved. + +### CTable constructor + +The desired constructor remains: + +```python +table = b2.CTable(Row) +``` + +Optional overrides: + +```python +table = b2.CTable( + Row, + expected_size=1_000_000, + compact=False, + validate=True, +) +``` + +`CTable` should detect that `Row` is a dataclass schema and compile it. + +### Possible compatibility layer + +If needed temporarily, `CTable` may continue accepting the old Pydantic model +style during a transition period: + +```python +table = b2.CTable(LegacyPydanticRow) +``` + +But that should be documented as legacy or transitional once the dataclass API +lands. + +--- + +## Internal compilation pipeline + +### Step 1. 
Inspect dataclass fields + +For each dataclass field: + +* field name +* Python annotation +* default or default factory +* Blosc2 metadata from `b2.field(...)` + +Reject invalid shapes early: + +* missing `b2.field(...)` +* missing schema spec +* incompatible Python annotation vs schema spec +* unsupported defaults + +If inferred shorthand is supported, refine the first two rules to: + +* either a supported plain annotation, or an explicit `b2.field(...)` +* if `b2.field(...)` is present, it must contain a schema spec + +### Step 2. Build compiled column descriptors + +For each field, produce a `CompiledColumn` object containing: + +* `name` +* `py_type` +* `spec` +* `dtype` +* `default` +* `default_factory` +* `nullable` +* `cparams` +* `dparams` +* `chunks` +* `blocks` +* validation constraints + +### Step 3. Derive physical NDArray creation arguments + +From the compiled column descriptor, derive: + +* `dtype` +* shape +* chunks +* blocks +* `cparams` +* `dparams` + +This should happen once during table initialization. + +### Step 4. Derive validation model + +Translate each schema spec into a Pydantic field definition. + +Examples: + +* `int64(ge=0)` -> integer field with `ge=0` +* `float64(ge=0, le=100)` -> float field with `ge=0`, `le=100` +* `string(max_length=32)` -> string field with `max_length=32` + +Cache the compiled Pydantic model class per row schema. + +### Step 5. Expose introspection hooks + +Expose enough metadata for: + +* debugging +* `table.info()` +* future schema serialization +* future schema-driven docs and reprs + +Possible user-facing hooks later: + +* `table.schema` +* `table.schema.columns` +* `table.schema.as_dict()` + +--- + +## Handling defaults + +Defaults should follow dataclass semantics as closely as possible. 
+ +Examples: + +```python +active: bool = b2.field(b2.bool(), default=True) +tags: list[str] = b2.field(..., default_factory=list) +``` + +For the first implementation, keep this conservative: + +* support scalar defaults +* support `default_factory` only if there is a clear use case +* reject mutable defaults directly + +On insert: + +* omitted values should be filled from defaults +* explicit `None` should be accepted only if the field is nullable + +--- + +## Insert semantics + +### append() + +`append()` should accept a small set of normalized shapes: + +* dataclass row instance +* dict-like row +* tuple/list in schema order + +Recommended internal path: + +1. normalize the input to a field mapping +2. validate with cached validator model +3. coerce to final column values +4. append into underlying NDArrays + +### extend() + +`extend()` should accept: + +* iterable of row objects +* dict-of-arrays +* structured NumPy array +* maybe another `CTable` + +Recommended internal path: + +1. normalize to column batches where possible +2. validate according to `validate=` mode +3. coerce dtypes +4. write in bulk + +For `dict-of-arrays` and structured arrays, vectorized validation should be the +preferred long-term path. + +--- + +## Per-column NDArray options + +One of the main reasons for `b2.field(...)` is that different columns may want +different storage settings. 
+ +Examples: + +* a boolean column may want different compression parameters from a float column +* a high-cardinality string column may need different chunk sizes +* a metric column may use a specific codec or filter tuning + +So the schema system must allow: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0), cparams={"codec": b2.Codec.ZSTD, "clevel": 1}) + score: float = b2.field( + b2.float64(ge=0, le=100), cparams={"codec": b2.Codec.LZ4HC, "clevel": 9} + ) + active: bool = b2.field(b2.bool(), cparams={"codec": b2.Codec.LZ4}) +``` + +The implementation should define precedence rules clearly: + +* column-level options override table defaults +* table-level options fill in unspecified values + +This implies `CTable(...)` may also take default storage options: + +```python +table = b2.CTable(Row, cparams=..., dparams=...) +``` + +Column-level overrides should merge against those defaults, not replace them +blindly. + +--- + +## Compatibility and migration + +### Goal + +Move toward the dataclass-based schema API without locking the project into the +current Pydantic-shaped declaration model. + +### Migration path + +Phase 1: + +* introduce schema spec classes and `b2.field(...)` +* support dataclass schemas in `CTable` +* keep existing prototype behavior separate + +Phase 2: + +* add row validation via cached internal Pydantic model +* add bulk validation modes +* document the dataclass schema API as preferred + +Phase 3: + +* optionally add a compatibility adapter for existing Pydantic models +* deprecate ad hoc `Annotated[...]` metadata conventions if they remain exposed + +### Non-goal + +Do not make the first implementation solve every possible schema feature. The +first goal is to get the schema shape and internal architecture right. + +--- + +## Serialization implications + +Even if `save()` / `load()` are not implemented yet, this schema design should +anticipate persistence. 
+ +Eventually a persisted `CTable` will need to store: + +* column names +* logical schema descriptors +* per-column defaults +* per-column NDArray storage options +* maybe validation constraints + +That argues strongly for having a stable compiled schema representation early. + +The compiled schema should be serializable to: + +* JSON-compatible metadata +* or a small msgpack payload + +The public dataclass itself does not need to be serialized directly. Only the +compiled schema matters for persistence. + +--- + +## Open questions + +### 1. Should Python annotations be required to match the schema spec? + +Example: + +```python +id: int = b2.field(b2.int64(ge=0)) +``` + +Recommended answer: yes, broadly, with sensible compatibility rules. + +Allowed: + +* `int` with `int64` +* `float` with `float64` +* `bool` with `bool` + +Potentially allowed later: + +* `str` with `string` +* `bytes` with `bytes` + +Reject obviously inconsistent declarations early. + +In other words: + +* `id: int = b2.field(b2.int64(ge=0))` is good +* `id: int` is acceptable shorthand for inferred `b2.int64()` +* `id = b2.field(b2.int64(ge=0))` is not the preferred style because it drops + the Python annotation (a plain `@dataclass` also raises `TypeError` for a `field()` that has no annotation) + +### 2. Where should nullability live? + +Recommended answer: on the schema spec. + +Example: + +```python +name: str | None = b2.field(b2.string(max_length=32, nullable=True)) +``` + +The Python annotation and schema spec should agree. + +### 3. Should `b2.field()` require a spec? + +Recommended answer: yes for the first version. + +Allowing `b2.field(default=True)` without a spec means we must infer too much +from the Python annotation and lose clarity. + +This still allows fully inferred fields that do not use `b2.field(...)` at all: + +```python +active: bool = True +``` + +but once `b2.field(...)` is used, it should carry an explicit schema spec. + +### 4. How much should Pydantic-specific behavior leak? + +Recommended answer: as little as possible.
+ +Users should not need to know whether validation is backed by Pydantic, +vectorized NumPy checks, or another mechanism. + +--- + +## Concrete implementation sequence + +This section turns the design into a proposed execution order with concrete +files, class names, and function signatures. + +### Step 1: add schema descriptor primitives + +Create a new module: + +* `src/blosc2/schema.py` + +Primary contents: + +```python +from __future__ import annotations + +from dataclasses import MISSING, Field as DataclassField, field as dc_field +from typing import Any + +import numpy as np +``` + +Proposed public classes and functions: + +```python +class SchemaSpec: + dtype: np.dtype + python_type: type[Any] + nullable: bool + + def to_pydantic_kwargs(self) -> dict[str, Any]: ... + def to_metadata_dict(self) -> dict[str, Any]: ... + + +class int64(SchemaSpec): + def __init__( + self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False + ): ... + + +class float64(SchemaSpec): + def __init__( + self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False + ): ... + + +class bool(SchemaSpec): + def __init__(self, *, nullable: bool = False): ... + + +class string(SchemaSpec): + def __init__( + self, *, min_length=None, max_length=None, pattern=None, nullable: bool = False + ): ... + + +class bytes(SchemaSpec): + def __init__(self, *, min_length=None, max_length=None, nullable: bool = False): ... + + +def field( + spec: SchemaSpec, + *, + default=MISSING, + default_factory=MISSING, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] | None = None, + title: str | None = None, + description: str | None = None, +) -> DataclassField: ... +``` + +Internal helper constants: + +```python +BLOSC2_FIELD_METADATA_KEY = "blosc2" +``` + +Notes: + +* Start with only the spec classes needed for the first `CTable` iteration: + `int64`, `float64`, `bool`. 
+* Add `string` and `bytes` only if needed in the same slice of work. +* Avoid over-generalizing the first implementation. + +### Step 2: add schema compiler and compiled representations + +Create a new module: + +* `src/blosc2/schema_compiler.py` + +Primary internal dataclasses: + +```python +from dataclasses import dataclass +from typing import Any + + +@dataclass(slots=True) +class ColumnConfig: + cparams: dict[str, Any] | None + dparams: dict[str, Any] | None + chunks: tuple[int, ...] | None + blocks: tuple[int, ...] | None + title: str | None + description: str | None + + +@dataclass(slots=True) +class CompiledColumn: + name: str + py_type: Any + spec: Any + dtype: np.dtype + default: Any + default_factory: Any + config: ColumnConfig + + +@dataclass(slots=True) +class CompiledSchema: + row_cls: type[Any] + columns: list[CompiledColumn] + columns_by_name: dict[str, CompiledColumn] + validator_model: type[Any] | None = None +``` + +Primary internal functions: + +```python +def compile_schema(row_cls: type[Any]) -> CompiledSchema: ... +def infer_spec_from_annotation(annotation: Any, default: Any = MISSING) -> Any: ... +def validate_annotation_matches_spec(annotation: Any, spec: Any) -> None: ... +def get_blosc2_field_metadata(dc_field) -> dict[str, Any] | None: ... +``` + +Behavior: + +* accept a dataclass type only +* for explicit `b2.field(...)`, read the spec from metadata +* for inferred fields like `id: int`, derive `b2.int64()` +* reject unsupported annotations early +* normalize all defaults/config into `CompiledSchema` + +### Step 3: export the schema API from `blosc2` + +Update: + +* `src/blosc2/__init__.py` + +Exports to add: + +```python +from .schema import bool, bytes, field, float64, int64, string +``` + +And in `__all__`: + +```python +"bool", +"bytes", +"field", +"float64", +"int64", +"string", +``` + +Notes: + +* Be careful with `bool` and `bytes` in `__init__.py` because they shadow + builtins within the module namespace. 
That is acceptable if done deliberately, + but it should be reviewed explicitly. +* If shadowing proves too awkward internally, keep the implementation names + private and re-export the public names only. + +### Step 4: refactor `CTable` to consume compiled schemas + +Update: + +* `src/blosc2/ctable.py` + +Primary constructor signature: + +```python +class CTable(Generic[RowT]): + def __init__( + self, + row_type: type[RowT], + new_data=None, + *, + expected_size: int = 1_048_576, + compact: bool = False, + validate: bool = True, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + ) -> None: ... +``` + +New internal state: + +```python +self._schema: CompiledSchema +self._validate: bool +self._table_cparams: dict[str, Any] | None +self._table_dparams: dict[str, Any] | None +``` + +New internal helper methods: + +```python +def _init_columns(self, expected_size: int) -> None: ... +def _resolve_column_storage(self, col: CompiledColumn) -> dict[str, Any]: ... +def _normalize_row_input(self, data: Any) -> dict[str, Any]: ... +def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: ... +``` + +Behavior changes: + +* replace direct inspection of `row_type.model_fields` +* build columns from `self._schema.columns` +* derive column dtypes from compiled schema +* merge table-level and field-level storage settings + +### Step 5: implement row validation adapter + +Create a new internal module: + +* `src/blosc2/schema_validation.py` + +Primary functions: + +```python +from typing import Any + + +def build_validator_model(schema: CompiledSchema) -> type[Any]: ... +def validate_row(schema: CompiledSchema, row: dict[str, Any]) -> dict[str, Any]: ... +def validate_rows_rowwise( + schema: CompiledSchema, rows: list[dict[str, Any]] +) -> list[dict[str, Any]]: ... 
+``` + +Behavior: + +* build and cache a Pydantic model per compiled schema +* map `SchemaSpec` constraints into Pydantic field definitions +* return normalized Python values ready for storage coercion + +Implementation note: + +* Cache the generated validator model on `CompiledSchema.validator_model`. +* Keep all Pydantic-specific logic isolated in this module. + +### Step 6: wire validation into `append()` + +Update: + +* `src/blosc2/ctable.py` + +Target signatures: + +```python +def append(self, data: Any) -> None: ... +def _append_validated_row(self, row: dict[str, Any]) -> None: ... +``` + +Concrete behavior: + +1. normalize incoming row shape +2. if `self._validate` is true, validate via `schema_validation.validate_row` +3. coerce to storage values +4. append into column NDArrays + +Inputs to support in the first cut: + +* dataclass row instance +* dict +* tuple/list in schema order + +Inputs that can wait until later if needed: + +* structured NumPy scalar +* Pydantic model instance + +### Step 7: add `extend(..., validate=...)` + +Update: + +* `src/blosc2/ctable.py` + +Proposed signature: + +```python +def extend(self, data: Any, *, validate: bool | None = None) -> None: ... +``` + +Supporting internal helpers: + +```python +def _normalize_rows_input( + self, data: Any +) -> tuple[list[dict[str, Any]] | None, dict[str, Any] | None]: ... +def _extend_rowwise(self, rows: list[dict[str, Any]], *, validate: bool) -> None: ... +def _extend_columnwise(self, columns: dict[str, Any], *, validate: bool) -> None: ... 
+``` + +First implementation target: + +* support iterable of rows via `_extend_rowwise` +* preserve correctness first, optimize later + +Second implementation target: + +* add `_extend_columnwise` for structured arrays and dict-of-arrays +* add vectorized validation for simple constraints + +### Step 8: add vectorized validation helpers + +Create a new internal module: + +* `src/blosc2/schema_vectorized.py` + +Primary functions: + +```python +from typing import Any + + +def validate_column_values(col: CompiledColumn, values: Any) -> None: ... +def validate_column_batch(schema: CompiledSchema, columns: dict[str, Any]) -> None: ... +``` + +Initial checks to support: + +* numeric `ge`, `gt`, `le`, `lt` +* string and bytes `min_length`, `max_length` +* nullability +* dtype compatibility after coercion + +This module should remain optional in the first PR if the rowwise path is enough +to land the architecture cleanly. + +### Step 9: add schema introspection to `CTable` + +Update: + +* `src/blosc2/ctable.py` + +Proposed property: + +```python +@property +def schema(self) -> CompiledSchema: ... +``` + +Optional helper methods: + +```python +def schema_dict(self) -> dict[str, Any]: ... +def column_schema(self, name: str) -> CompiledColumn: ... 
+``` + +Goal: + +* make the new schema layer visible and debuggable +* provide a stable base for future save/load work + +### Step 10: add tests in focused modules + +Add: + +* `tests/ctable/test_schema_specs.py` +* `tests/ctable/test_schema_compiler.py` +* `tests/ctable/test_schema_validation.py` +* `tests/ctable/test_ctable_dataclass_schema.py` + +Test scope by file: + +`tests/ctable/test_schema_specs.py` + +* spec construction +* dtype mapping +* metadata export + +`tests/ctable/test_schema_compiler.py` + +* explicit `b2.field(...)` +* inferred shorthand from plain annotations +* annotation/spec mismatch rejection +* defaults handling + +`tests/ctable/test_schema_validation.py` + +* Pydantic validator generation +* constraint enforcement +* nullable vs non-nullable behavior + +`tests/ctable/test_ctable_dataclass_schema.py` + +* `CTable(Row)` construction +* append with dataclass/dict/tuple +* extend with iterable of rows +* per-column `cparams` override plumbing + +### Step 11: keep the legacy prototype isolated during transition + +Short-term implementation choice: + +* if the current `ctable.py` prototype is still in active flux, prefer landing + the schema/compiler modules first and then refactoring `CTable` over them +* do not expand the old Pydantic-specific schema path further + +Possible follow-up helper: + +```python +def compile_legacy_pydantic_schema(row_cls: type[Any]) -> CompiledSchema: ... +``` + +But only add that if compatibility becomes necessary. + +### Step 12: persistence groundwork + +No need to implement `save()` / `load()` immediately, but define serialization +hooks on the schema side now. + +Add to `CompiledSchema` or a related helper: + +```python +def schema_to_dict(schema: CompiledSchema) -> dict[str, Any]: ... +def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: ... +``` + +This should remain internal until the persisted format is stable. 
+ +### Step 13: delivery order across PRs + +Recommended PR slicing: + +PR 1: + +* `src/blosc2/schema.py` +* `src/blosc2/schema_compiler.py` +* exports in `src/blosc2/__init__.py` +* tests for schema specs and compiler + +PR 2: + +* `CTable` constructor refactor to use compiled schema +* `append()` row normalization +* row-wise validation module +* `tests/ctable/test_ctable_dataclass_schema.py` + +PR 3: + +* `extend(..., validate=...)` +* vectorized validation helpers +* schema introspection property +* more tests for batch validation and overrides + +PR 4: + +* persistence groundwork +* optional compatibility adapter for legacy Pydantic model declarations + +### Step 14: concrete first-PR checklist + +The smallest coherent first implementation should be: + +1. add `src/blosc2/schema.py` +2. add `src/blosc2/schema_compiler.py` +3. export `field`, `int64`, `float64`, `bool` +4. add tests for: + * explicit field specs + * inferred shorthand + * mismatch rejection +5. stop there + +That first PR gives the project: + +* the public schema vocabulary +* the internal compiled representation +* confidence in the canonical API shape + +before touching too much `CTable` mutation logic. + +--- + +## Recommendation + +The recommended direction is: + +1. Make **dataclasses** the public schema declaration mechanism for `CTable`. +2. Introduce **lowercase schema spec objects** such as `b2.int64(...)`. +3. Use **`b2.field(...)`** to carry both the schema spec and per-column NDArray + configuration. +4. Compile the schema once into an internal representation. +5. Use **Pydantic internally for row validation**, but keep it hidden behind the + Blosc2 schema API. +6. Add a separate **bulk validation path** for large inserts so `extend()` does + not depend entirely on per-row Pydantic validation. 
+ +This design gives the project: + +* a cleaner user API +* a better place for columnar storage configuration +* a clear boundary between schema, validation, and storage +* flexibility to evolve validation internals later +* a strong base for future persistence and schema introspection From 725c28bfe9d0b613c1de1d324d1793f1e5e3e46e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 11:25:16 +0100 Subject: [PATCH 03/11] Add pydantic as a new dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6244b0d9..36f42bfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "ndindex", "msgpack", "numexpr>=2.14.1; platform_machine != 'wasm32'", + "pydantic", "requests", ] version = "4.1.1.dev0" From 0efd45049d7314452475468420e4ae47ede5e54c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 11:35:43 +0100 Subject: [PATCH 04/11] Fix small formatting issues --- bench/ctable/extend.py | 1 - src/blosc2/__init__.py | 2 +- src/blosc2/ctable.py | 15 +++++++-------- tests/ctable/test_column.py | 7 ++++--- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/bench/ctable/extend.py b/bench/ctable/extend.py index f294b012..b691c03e 100644 --- a/bench/ctable/extend.py +++ b/bench/ctable/extend.py @@ -114,4 +114,3 @@ class RowModel(BaseModel): print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}") print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {t_from_list / t_from_np:>17.2f}x") print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {t_from_list / t_from_ctable:>17.2f}x") - diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 8afc0653..11ae5677 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -596,7 +596,7 @@ def _raise(exc): """ # Delayed imports for avoiding overwriting of python builtins -from .ctable import CTable, Column +from .ctable import Column, CTable from .ndarray import (
abs, acos, diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 3acb1731..9af63267 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -200,8 +200,6 @@ def __getitem__(self, key: int | slice | list | np.ndarray): pos_true = _find_physical_index(self._valid_rows, key) return self._raw_col[int(pos_true)] - - elif isinstance(key, slice): real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() start, stop, step = key.indices(len(real_pos)) @@ -209,14 +207,13 @@ def __getitem__(self, key: int | slice | list | np.ndarray): if step == 1: phys_start = real_pos[start] phys_stop = real_pos[stop - 1] - mask[phys_start: phys_stop + 1] = True + mask[phys_start : phys_stop + 1] = True else: lindices = np.arange(start, stop, step) phys_indices = real_pos[lindices] mask[phys_indices[:]] = True return Column(self._table, self._col_name, mask=mask) - elif isinstance(key, (list, tuple, np.ndarray)): real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute() phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) @@ -264,11 +261,11 @@ def __iter__(self): val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] if not val: continue - yield from self._raw_col[chunk_start: chunk_start + actual_size] + yield from self._raw_col[chunk_start : chunk_start + actual_size] continue - mask_chunk = arr[chunk_start: chunk_start + actual_size] - data_chunk = self._raw_col[chunk_start: chunk_start + actual_size] + mask_chunk = arr[chunk_start : chunk_start + actual_size] + data_chunk = self._raw_col[chunk_start : chunk_start + actual_size] yield from data_chunk[mask_chunk] def __len__(self): @@ -308,7 +305,9 @@ def to_numpy(self): class CTable(Generic[RowT]): - def __init__(self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False) -> None: + def __init__( + self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False + ) -> None: 
self._row_type = row_type self._cols: dict[str, blosc2.NDArray] = {} self._n_rows: int = 0 diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py index 4f2e450b..60f15a5d 100644 --- a/tests/ctable/test_column.py +++ b/tests/ctable/test_column.py @@ -10,8 +10,8 @@ import numpy as np import pytest from pydantic import BaseModel, Field -import blosc2 +import blosc2 from blosc2 import CTable @@ -171,12 +171,14 @@ def test_column_iter(): tabla3 = CTable(RowModel, new_data=DATA20) tabla3.delete([0, 5, 10, 15]) + # fmt: off expected_score = [ 10.0, 20.0, 30.0, 40.0, 60.0, 70.0, 80.0, 90.0, 110.0, 120.0, 130.0, 140.0, 160.0, 170.0, 180.0, 190.0, ] + # fmt: on assert list(tabla3.score) == expected_score @@ -264,7 +266,7 @@ def test_to_array_full_column(): col = tabla.id expected = np.array([i for i in range(20) if i not in {0, 10, 19}], dtype=np.int64) - np.testing.assert_array_equal(col[0:len(col)].to_numpy(), expected) + np.testing.assert_array_equal(col[0 : len(col)].to_numpy(), expected) def test_to_array_mask_does_not_include_deleted(): @@ -289,6 +291,5 @@ def test_column_view_mask_is_independent(): np.testing.assert_array_equal(view_a.to_numpy(), np.arange(0, 5, dtype=np.int64)) - if __name__ == "__main__": pytest.main(["-v", __file__]) From f504ad0670e3afca2213f4f465185d260a88ae8c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 12:01:38 +0100 Subject: [PATCH 05/11] Simplify the plan for ctable schema --- plans/ctable-schema.md | 52 ++++++------------------------------------ 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md index bed1c6a7..cacac6d8 100644 --- a/plans/ctable-schema.md +++ b/plans/ctable-schema.md @@ -122,14 +122,10 @@ Expected shape: b2.field( b2.float64(ge=0, le=100), default=..., - default_factory=..., cparams=..., dparams=..., chunks=..., blocks=..., - title=..., - description=..., - nullable=..., ) ``` @@ -137,7 +133,6 @@ At minimum for the first 
version: * `spec` * `default` -* `default_factory` * `cparams` * `dparams` * `chunks` @@ -388,7 +383,6 @@ b2.bool() Internal common fields: * `dtype` -* `nullable` * `constraints` * `python_type` @@ -480,8 +474,6 @@ For each field, produce a `CompiledColumn` object containing: * `spec` * `dtype` * `default` -* `default_factory` -* `nullable` * `cparams` * `dparams` * `chunks` @@ -538,19 +530,16 @@ Examples: ```python active: bool = b2.field(b2.bool(), default=True) -tags: list[str] = b2.field(..., default_factory=list) ``` For the first implementation, keep this conservative: * support scalar defaults -* support `default_factory` only if there is a clear use case * reject mutable defaults directly On insert: * omitted values should be filled from defaults -* explicit `None` should be accepted only if the field is nullable --- @@ -721,19 +710,7 @@ In other words: * `id = b2.field(b2.int64(ge=0))` is not the preferred style because it drops the Python annotation -### 2. Where should nullability live? - -Recommended answer: on the schema spec. - -Example: - -```python -name: str | None = b2.field(b2.string(max_length=32, nullable=True)) -``` - -The Python annotation and schema spec should agree. - -### 3. Should `b2.field()` require a spec? +### 2. Should `b2.field()` require a spec? Recommended answer: yes for the first version. @@ -748,7 +725,7 @@ active: bool = True but once `b2.field(...)` is used, it should carry an explicit schema spec. -### 4. How much should Pydantic-specific behavior leak? +### 3. How much should Pydantic-specific behavior leak? Recommended answer: as little as possible. @@ -785,49 +762,39 @@ Proposed public classes and functions: class SchemaSpec: dtype: np.dtype python_type: type[Any] - nullable: bool def to_pydantic_kwargs(self) -> dict[str, Any]: ... def to_metadata_dict(self) -> dict[str, Any]: ... class int64(SchemaSpec): - def __init__( - self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False - ): ... 
+ def __init__(self, *, ge=None, gt=None, le=None, lt=None): ... class float64(SchemaSpec): - def __init__( - self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False - ): ... + def __init__(self, *, ge=None, gt=None, le=None, lt=None): ... class bool(SchemaSpec): - def __init__(self, *, nullable: bool = False): ... + def __init__(self): ... class string(SchemaSpec): - def __init__( - self, *, min_length=None, max_length=None, pattern=None, nullable: bool = False - ): ... + def __init__(self, *, min_length=None, max_length=None, pattern=None): ... class bytes(SchemaSpec): - def __init__(self, *, min_length=None, max_length=None, nullable: bool = False): ... + def __init__(self, *, min_length=None, max_length=None): ... def field( spec: SchemaSpec, *, default=MISSING, - default_factory=MISSING, cparams: dict[str, Any] | None = None, dparams: dict[str, Any] | None = None, chunks: tuple[int, ...] | None = None, blocks: tuple[int, ...] | None = None, - title: str | None = None, - description: str | None = None, ) -> DataclassField: ... ``` @@ -863,8 +830,6 @@ class ColumnConfig: dparams: dict[str, Any] | None chunks: tuple[int, ...] | None blocks: tuple[int, ...] 
| None - title: str | None - description: str | None @dataclass(slots=True) @@ -874,7 +839,6 @@ class CompiledColumn: spec: Any dtype: np.dtype default: Any - default_factory: Any config: ColumnConfig @@ -1095,7 +1059,6 @@ Initial checks to support: * numeric `ge`, `gt`, `le`, `lt` * string and bytes `min_length`, `max_length` -* nullability * dtype compatibility after coercion This module should remain optional in the first PR if the rowwise path is enough @@ -1154,7 +1117,6 @@ Test scope by file: * Pydantic validator generation * constraint enforcement -* nullable vs non-nullable behavior `tests/ctable/test_ctable_dataclass_schema.py` From 46bf2e310e12a5fea0b554f644b8e3263d156226 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 12:05:10 +0100 Subject: [PATCH 06/11] Disable wheel generation for each commit in this branch --- .github/workflows/cibuildwheels.yml | 4 ++-- .github/workflows/wasm.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml index ed69f764..f7e2bef0 100644 --- a/.github/workflows/cibuildwheels.yml +++ b/.github/workflows/cibuildwheels.yml @@ -26,6 +26,7 @@ env: jobs: build_wheels: + if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }} name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} runs-on: ${{ matrix.runs-on || matrix.os }} permissions: @@ -128,10 +129,9 @@ jobs: upload_pypi: + if: ${{ (github.ref_name != 'ctable3' && github.head_ref != 'ctable3') && startsWith(github.event.ref, 'refs/tags') }} needs: [ build_wheels] runs-on: ubuntu-latest - # Only upload wheels when tagging (typically a release) - if: startsWith(github.event.ref, 'refs/tags') steps: - uses: actions/download-artifact@v8 with: diff --git a/.github/workflows/wasm.yml b/.github/workflows/wasm.yml index f54afad5..3d293274 100644 --- a/.github/workflows/wasm.yml +++ b/.github/workflows/wasm.yml @@ -14,6 +14,7 @@ env: jobs: 
build_wheels_wasm: + if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }} name: Build and test wheels for WASM on ${{ matrix.os }} for ${{ matrix.p_ver }} runs-on: ubuntu-latest permissions: From 43bf562727d727acd6d120a1ff7139043d99d83f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 13:29:57 +0100 Subject: [PATCH 07/11] Add a new plan on CTable persistence --- plans/ctable-persistency.md | 536 ++++++++++++++++++++++++++++++++++++ plans/ctable-schema.md | 29 +- 2 files changed, 564 insertions(+), 1 deletion(-) create mode 100644 plans/ctable-persistency.md diff --git a/plans/ctable-persistency.md b/plans/ctable-persistency.md new file mode 100644 index 00000000..a2ff6db2 --- /dev/null +++ b/plans/ctable-persistency.md @@ -0,0 +1,536 @@ +# CTable Persistency Plan + +## Goal + +Add persistent `CTable` support on top of `TreeStore` while keeping the public +API simple: + +* in-memory tables when `urlpath is None` +* persistent tables when `urlpath` is provided + +The first persistency iteration should support: + +* creating a persistent table +* opening an existing persistent table +* reading rows, columns, and views from persisted tables +* appending rows + +The first persistency iteration should **not** promise: + +* full schema evolution +* dropping columns +* renaming columns +* transactional multi-entry updates + +For now, the supported schema evolution story is: + +* append rows only + +--- + +## Storage layout + +Each persisted `CTable` lives under a table root inside a `TreeStore`. 
+ +Confirmed layout: + +* `table_root/_meta` +* `table_root/_valid_rows` +* `table_root/_cols/` + +Example: + +* `people/_meta` +* `people/_valid_rows` +* `people/_cols/id` +* `people/_cols/score` +* `people/_cols/active` + +Rationale: + +* `_meta` holds mutable metadata in `vlmeta` +* `_valid_rows` is real table data and should be stored as a normal persisted array +* `_cols/` stores one persisted NDArray per column + +The underscore-prefixed names form the internal namespace for a table root and +must be treated as reserved. + +--- + +## `_meta` entry + +`_meta` should be a small serialized `SChunk` used primarily to hold mutable +`vlmeta`. + +This is preferable to immutable metalayers because: + +* we may want to evolve metadata over time +* multiple `CTable` objects may live in the same `TreeStore` +* schema and table metadata should be updateable without rewriting the entire table + +For the first version: + +* `tree_store["/_meta"].vlmeta["kind"] = "ctable"` +* `tree_store["/_meta"].vlmeta["version"] = 1` +* `tree_store["/_meta"].vlmeta["schema"] = {...}` + +This gives `open()` a minimal, reliable contract for introspection. + +--- + +## Schema persistence format + +The schema should be stored as JSON-compatible data in: + +* `tree_store["/_meta"].vlmeta["schema"]` + +The schema document should be versioned and explicit. + +Recommended shape: + +```python +{ + "version": 1, + "columns": [ + { + "name": "id", + "py_type": "int", + "spec": {"kind": "int64", "ge": 0}, + "default": None, + }, + { + "name": "score", + "py_type": "float", + "spec": {"kind": "float64", "ge": 0, "le": 100}, + "default": None, + }, + { + "name": "active", + "py_type": "bool", + "spec": {"kind": "bool"}, + "default": True, + }, + ], +} +``` + +Notes: + +* `columns` must be an ordered list, not a dict. +* The order of the list is the source of truth for column order. +* Do not rely on dict ordering or TreeStore iteration order. 
+* The schema JSON should capture logical schema information only. + +For the first version, do **not** duplicate: + +* per-column `cparams` +* per-column `dparams` +* array chunk/block layout +* `expected_size` +* compaction settings + +Those can be introspected directly from the stored arrays when needed. + +--- + +## `_valid_rows` persistence + +`_valid_rows` should be stored as a normal persisted boolean NDArray under: + +* `table_root/_valid_rows` + +This is the correct representation because `_valid_rows` is: + +* table data, not metadata +* potentially large +* used in normal row visibility semantics +* already aligned with current delete/view/compaction logic + +Do not encode `_valid_rows` into schema JSON or small metadata blobs. + +--- + +## Column persistence + +Each column should be stored as its own persisted NDArray under: + +* `table_root/_cols/` + +This means: + +* each column can be opened independently +* column-level array settings remain attached to the actual stored array +* persistence layout matches the internal columnar design cleanly + +The schema JSON provides the logical order and type constraints; the arrays under +`_cols` provide the physical stored data. 
+ +--- + +## Constructor semantics + +The recommended constructor shape is: + +```python +table = b2.CTable( + Row, + urlpath=None, + mode="a", + expected_size=1_048_576, + compact=False, + validate=True, +) +``` + +Semantics: + +* `urlpath is None` + create an in-memory `CTable` +* `urlpath is not None` + use persistent storage rooted at that path + +Recommended `mode` meanings: + +* `mode="w"` + create a new persistent table, overwriting any existing table root if the API + already supports that pattern elsewhere +* `mode="a"` + open existing or create new +* `mode="r"` + open existing read-only table + +The important public signal is: + +* `urlpath` chooses persistence +* `mode` chooses creation/open behavior + +Users should not need to pass a `TreeStore` object explicitly for the common path. + +--- + +## `open()` support + +An explicit `open()` API should be supported. + +Recommended shape: + +```python +table = b2.open(urlpath) +``` + +or, if needed for clarity: + +```python +table = b2.CTable.open(urlpath, mode="r") +``` + +For `open()` to detect a persisted `CTable`, it should inspect: + +* `urlpath/_meta` +* `urlpath/_meta.vlmeta["kind"]` + +If: + +* `_meta` exists +* `vlmeta["kind"] == "ctable"` + +then the object should be recognized as a persisted `CTable`. + +This keeps `urlpath` simple: it points to the table root, and `_meta` provides +the type marker and schema. + +--- + +## Multiple tables in one TreeStore + +The design must support multiple `CTable` objects in the same `TreeStore`. + +That is one reason `_meta` is a good choice: + +* each table root has its own `_meta` +* each table root can be introspected independently +* schema metadata is naturally scoped to one table subtree + +Example shared TreeStore: + +* `users/_meta` +* `users/_valid_rows` +* `users/_cols/id` +* `orders/_meta` +* `orders/_valid_rows` +* `orders/_cols/order_id` + +No additional global registry is required in the first version.
+ +--- + +## Column name validation + +Column name validation should be explicit and should be shared between: + +* in-memory `CTable` +* persistent `CTable` + +Reason: + +* a schema should not be valid in memory and then fail only when persisted + +Recommended first-rule constraints for column names: + +* must be a non-empty string +* must not contain `/` +* must not start with `_` +* must not collide with reserved internal names + +Reserved internal names for the table root layout: + +* `_meta` +* `_valid_rows` +* `_cols` + +This validation should happen during schema compilation, not only during +persistent-table creation. + +--- + +## Column order + +Column order should be preserved explicitly in the schema JSON. + +The source of truth is: + +* the order of `schema["columns"]` + +Do not rely on: + +* dict ordering as a persistence contract +* lexical ordering of `_cols/` +* TreeStore iteration order + +On load: + +* reconstruct `table.col_names` from the schema list order +* rebuild any name-to-column map separately + +--- + +## Read-only mode + +When `mode="r"`: + +Allowed: + +* opening the table +* reading rows +* reading columns +* creating non-mutating views +* `head()`, `tail()`, filtering, and other read-only operations + +Disallowed: + +* `append()` +* `delete()` +* `compact()` +* any operation that mutates stored arrays or metadata + +These should fail immediately with a clear error. + +If some existing view path currently requires mutation internally, that should be +cleaned up rather than weakening the read-only contract. + +--- + +## Failure model + +The first persistency version does not need full transactional semantics. + +Be explicit in the implementation and docs: + +* updates touching multiple entries are not guaranteed to be atomic +* partial writes are possible if a failure occurs mid-update + +That is acceptable for the first version as long as it is not hidden. 
+ +The initial goal is a correct and understandable persistent layout, not a full +transaction layer. + +--- + +## Internal API sketch + +This is a proposed internal storage split, not a final public API requirement. + +Possible internal helpers: + +```python +class TableStorage: + def open_column(self, name: str): ... + def create_column( + self, + name: str, + *, + dtype, + shape, + chunks=None, + blocks=None, + cparams=None, + dparams=None + ): ... + def open_valid_rows(self): ... + def create_valid_rows( + self, *, shape, chunks=None, blocks=None, cparams=None, dparams=None + ): ... + def load_schema(self) -> dict: ... + def save_schema(self, schema: dict) -> None: ... + def exists(self) -> bool: ... + def is_read_only(self) -> bool: ... + + +class InMemoryTableStorage(TableStorage): ... + + +class TreeStoreTableStorage(TableStorage): ... +``` + +Then `CTable` can route based on `urlpath`: + +* `urlpath is None` -> `InMemoryTableStorage` +* `urlpath is not None` -> `TreeStoreTableStorage` + +This keeps persistence a backend concern instead of scattering TreeStore logic +throughout all of `CTable`. + +--- + +## Concrete implementation sequence + +### Step 1: extend constructor/open signatures + +Update `src/blosc2/ctable.py` to accept: + +```python +class CTable: + def __init__( + self, + row_type, + new_data=None, + *, + urlpath: str | None = None, + mode: str = "a", + expected_size: int = 1_048_576, + compact: bool = False, + validate: bool = True, + ) -> None: ... +``` + +And add: + +```python +@classmethod +def open(cls, urlpath: str, *, mode: str = "r") -> "CTable": ... 
+``` + +### Step 2: add storage backend abstraction + +Create a new module: + +* `src/blosc2/ctable_storage.py` + +Add: + +* `TableStorage` +* `InMemoryTableStorage` +* `TreeStoreTableStorage` + +### Step 3: implement TreeStore layout helpers + +In `TreeStoreTableStorage`, add helpers for: + +* `_meta` path +* `_valid_rows` path +* `_cols/` paths +* reading/writing `vlmeta["kind"]` +* reading/writing `vlmeta["version"]` +* reading/writing `vlmeta["schema"]` + +### Step 4: persist schema JSON + +Connect compiled schema export/import to `_meta.vlmeta["schema"]`. + +The schema compiler work should provide: + +```python +def schema_to_dict(schema: CompiledSchema) -> dict: ... +def schema_from_dict(data: dict) -> CompiledSchema: ... +``` + +### Step 5: create/open persistent arrays + +Wire `CTable` initialization so that: + +* create path creates `_meta`, `_valid_rows`, and `_cols/` +* open path loads schema first, then opens `_valid_rows` and columns + +### Step 6: enforce read-only behavior + +Add an internal read-only flag so mutating methods fail early when opened with +`mode="r"`. + +Methods to guard first: + +* `append` +* `extend` +* `delete` +* `compact` + +### Step 7: test persistency layout and round-trips + +Add tests covering: + +* create persistent `CTable` +* reopen persistent `CTable` +* schema JSON present in `_meta.vlmeta` +* `_valid_rows` persisted correctly +* column order preserved after reopen +* multiple tables inside one TreeStore +* read-only mode errors on mutation + +--- + +## Proposed tests + +Suggested test file: + +* `tests/ctable/test_persistency.py` + +Suggested test cases: + +* `test_create_persistent_ctable_layout` +* `test_open_persistent_ctable` +* `test_schema_saved_in_meta_vlmeta` +* `test_valid_rows_persisted` +* `test_column_order_roundtrip` +* `test_multiple_ctables_in_same_treestore` +* `test_read_only_mode_rejects_mutation` + +--- + +## Recommendation + +The recommended persistency design is: + +1. 
use `urlpath` to switch between in-memory and persistent `CTable` +2. store one table per TreeStore subtree +3. use: + * `_meta` + * `_valid_rows` + * `_cols/` +4. store schema JSON in `_meta.vlmeta["schema"]` +5. store explicit markers in `_meta.vlmeta`: + * `"kind": "ctable"` + * `"version": 1` +6. preserve column order in the schema JSON as an ordered `columns` list +7. keep the first version limited to append-row persistence, not full schema evolution + +This gives `CTable` a clear persistent layout, keeps `open()` introspection +simple, and stays consistent with the existing columnar design. diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md index cacac6d8..d9cd3fb1 100644 --- a/plans/ctable-schema.md +++ b/plans/ctable-schema.md @@ -1155,6 +1155,17 @@ def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: ... This should remain internal until the persisted format is stable. +The persistency design itself is specified in: + +* [ctable-persistency.md](ctable-persistency.md) + +The schema-layer contract for persistency is: + +* schema must serialize to a versioned JSON-compatible dict +* column order must be preserved explicitly in the serialized `columns` list +* the serialized schema must be sufficient to reconstruct `CompiledSchema` + without requiring the original Python dataclass definition at load time + ### Step 13: delivery order across PRs Recommended PR slicing: @@ -1182,9 +1193,18 @@ PR 3: PR 4: -* persistence groundwork +* persistence groundwork on the schema side * optional compatibility adapter for legacy Pydantic model declarations +PR 5: + +* TreeStore-backed persistency as described in + [ctable-persistency.md](ctable-persistency.md) +* `urlpath` / `mode` constructor semantics +* explicit `open()` support +* `_meta`, `_valid_rows`, `_cols/` storage layout +* persistency tests + ### Step 14: concrete first-PR checklist The smallest coherent first
implementation should be: @@ -1206,6 +1226,13 @@ That first PR gives the project: before touching too much `CTable` mutation logic. +After that first PR lands, follow the later phases in this order: + +1. dataclass-driven `CTable` construction and append path +2. validation and batch-insert behavior +3. schema introspection +4. TreeStore-backed persistency + --- ## Recommendation From e84f7ac6a6ec8dd53252b1833a9e574e51830c3b Mon Sep 17 00:00:00 2001 From: jorge Date: Thu, 26 Mar 2026 13:29:11 +0100 Subject: [PATCH 08/11] _ --- src/blosc2/ctable.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 9af63267..ff5ab440 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -78,6 +78,23 @@ def __getitem__(self, item): return self._table._run_row_logic(item) +class _Row: + def __init__(self, table: CTable, nrow: int): + self._table = table + self._nrow = nrow + self._real_pos = None + + def _get_real_pos(self) -> int: + self._real_pos = _find_physical_index(self._table._valid_rows, self._nrow) + return self._real_pos + + def __getitem__(self, col_name: str): + if self._real_pos is None: + self._get_real_pos() + return self._table._cols[col_name][self._real_pos] + + + def _resolve_field_dtype(field) -> tuple[np.dtype, int]: """Return (numpy dtype, display_width) for a pydantic model field. 
@@ -380,6 +397,10 @@ def __str__(self): def __len__(self): return self._n_rows + def __iter__(self): + for i in range(self.nrows): + yield _Row(self, i) + def view(self, new_valid_rows): if not ( isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr)) From 8de1870ba1cc926065898495446b59e431214bea Mon Sep 17 00:00:00 2001 From: jorge Date: Thu, 26 Mar 2026 13:53:16 +0100 Subject: [PATCH 09/11] _ --- src/blosc2/ctable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index ff5ab440..ddd2719c 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -94,7 +94,6 @@ def __getitem__(self, col_name: str): return self._table._cols[col_name][self._real_pos] - def _resolve_field_dtype(field) -> tuple[np.dtype, int]: """Return (numpy dtype, display_width) for a pydantic model field. @@ -205,6 +204,7 @@ def _raw_col(self): def _valid_rows(self): if self._mask is None: return self._table._valid_rows + return (self._table._valid_rows & self._mask).compute() def __getitem__(self, key: int | slice | list | np.ndarray): From a8db18d17c9334f278a5a0bd9ebbed71aed1f1b3 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 26 Mar 2026 13:55:00 +0100 Subject: [PATCH 10/11] Testing --- test-remove.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test-remove.md diff --git a/test-remove.md b/test-remove.md new file mode 100644 index 00000000..e69de29b From ce656072d699dcb0434d3d7c2cd72cd347f6bf9d Mon Sep 17 00:00:00 2001 From: jorge Date: Thu, 26 Mar 2026 13:57:06 +0100 Subject: [PATCH 11/11] written test --- test-remove.md | 1 + 1 file changed, 1 insertion(+) diff --git a/test-remove.md b/test-remove.md index e69de29b..038d718d 100644 --- a/test-remove.md +++ b/test-remove.md @@ -0,0 +1 @@ +testing