From 6de7c308f3434ccce06bf3ae6b732f24be3825ce Mon Sep 17 00:00:00 2001
From: jorge <al426671@uji.es>
Date: Thu, 26 Mar 2026 11:05:01 +0100
Subject: [PATCH 01/11] feat: add CTable, a columnar in-memory table built on
 top of blosc2

Introduce CTable, a new columnar table class for efficient in-memory
data storage using Blosc2 as the underlying compression engine.

Each column is represented as a Column object wrapping a blosc2.NDArray
with typed, compressed storage. Building on top of blosc2's existing
infrastructure, CTable supports append, iteration and
column-based queries.

This is an early-stage (beta) implementation; the table is always fully
loaded in memory.

New files:
- src/blosc2/ctable.py: CTable and Column class definitions
- tests/ctable/: unit tests covering construction, slicing, deletion,
  compaction and row logic
- bench/ctable/: benchmarks comparing CTable against pandas
---
 bench/ctable/compact.py            |  81 +++
 bench/ctable/ctable_v_panda.py     | 127 +++++
 bench/ctable/delete.py             |  82 +++
 bench/ctable/expected_size.py      |  75 +++
 bench/ctable/extend.py             | 117 +++++
 bench/ctable/extend_vs_apend.py    |  84 +++
 bench/ctable/index.py              |  69 +++
 bench/ctable/iteration_column.py   |  85 ++++
 bench/ctable/print.py              | 112 ++++
 bench/ctable/row_acces.py          |  68 +++
 bench/ctable/slice.py              |  77 +++
 bench/ctable/slice_steps.py        |  67 +++
 bench/ctable/slice_to_array.py     |  77 +++
 bench/ctable/where_chain.py        |  79 +++
 bench/ctable/where_selective.py    |  68 +++
 src/blosc2/__init__.py             |   1 +
 src/blosc2/ctable.py               | 793 +++++++++++++++++++++++++++++
 tests/ctable/test_column.py        | 294 +++++++++++
 tests/ctable/test_compact.py       | 157 ++++++
 tests/ctable/test_construct.py     | 225 ++++++++
 tests/ctable/test_delete_rows.py   | 210 ++++++++
 tests/ctable/test_extend_delete.py | 226 ++++++++
 tests/ctable/test_row_logic.py     | 221 ++++++++
 23 files changed, 3395 insertions(+)
 create mode 100644 bench/ctable/compact.py
 create mode 100644 bench/ctable/ctable_v_panda.py
 create mode 100644 bench/ctable/delete.py
 create mode 100644 bench/ctable/expected_size.py
 create mode 100644 bench/ctable/extend.py
 create mode 100644 bench/ctable/extend_vs_apend.py
 create mode 100644 bench/ctable/index.py
 create mode 100644 bench/ctable/iteration_column.py
 create mode 100644 bench/ctable/print.py
 create mode 100644 bench/ctable/row_acces.py
 create mode 100644 bench/ctable/slice.py
 create mode 100644 bench/ctable/slice_steps.py
 create mode 100644 bench/ctable/slice_to_array.py
 create mode 100644 bench/ctable/where_chain.py
 create mode 100644 bench/ctable/where_selective.py
 create mode 100644 src/blosc2/ctable.py
 create mode 100644 tests/ctable/test_column.py
 create mode 100644 tests/ctable/test_compact.py
 create mode 100644 tests/ctable/test_construct.py
 create mode 100644 tests/ctable/test_delete_rows.py
 create mode 100644 tests/ctable/test_extend_delete.py
 create mode 100644 tests/ctable/test_row_logic.py

diff --git a/bench/ctable/compact.py b/bench/ctable/compact.py
new file mode 100644
index 00000000..f41bb008
--- /dev/null
+++ b/bench/ctable/compact.py
@@ -0,0 +1,81 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring compact() time and memory gain after deletions
+# of varying fractions of the table.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+
+print(f"compact() benchmark  |  N = {N:,}\n")
+
+# Build base data once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9]
+
+print("=" * 75)
+print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}")
+print("-" * 75)
+
+for frac in delete_fractions:
+    ct = blosc2.CTable(RowModel, expected_size=N)
+    ct.extend(DATA)
+
+    n_delete = int(N * frac)
+    ct.delete(list(range(n_delete)))
+
+    cbytes_before = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+
+    t0 = time()
+    ct.compact()
+    t_compact = time() - t0
+
+    cbytes_after = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+
+    print(
+        f"{frac*100:>9.0f}%"
+        f" {N - n_delete:>10,}"
+        f" {t_compact:>12.4f}"
+        f" {cbytes_before / 1024**2:>13.2f} MB"
+        f" {cbytes_after / 1024**2:>12.2f} MB"
+    )
+
+print("-" * 75)
diff --git a/bench/ctable/ctable_v_panda.py b/bench/ctable/ctable_v_panda.py
new file mode 100644
index 00000000..4f7d6c8a
--- /dev/null
+++ b/bench/ctable/ctable_v_panda.py
@@ -0,0 +1,127 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark comparing CTable vs pandas DataFrame for:
+#   1. Creation from a NumPy structured array
+#   2. Column access (full column, both lazy and materialized to NumPy)
+#   3. Filtering (where/query)
+#   4. Iteration over the values of a single column
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+rng = np.random.default_rng(42)
+
+print(f"CTable vs pandas benchmark  |  N = {N:,}\n")
+
+# Build base data once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.empty(N, dtype=np_dtype)
+DATA["id"]     = np.arange(N, dtype=np.int64)
+DATA["c_val"]  = rng.standard_normal(N) + 1j * rng.standard_normal(N)
+DATA["score"]  = rng.uniform(0, 100, N)
+DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)
+
+print("=" * 65)
+print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
+print("-" * 65)
+
+# 1. Creation
+t0 = time()
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(DATA)
+t_ct_create = time() - t0
+
+t0 = time()
+df = pd.DataFrame(DATA)
+t_pd_create = time() - t0
+
+print(f"{'Creation':<30} {t_ct_create:>12.4f} {t_pd_create:>12.4f} {t_pd_create/t_ct_create:>9.2f}x")
+
+# 2. Column access (full column)
+t0 = time()
+arr = ct["score"]
+t_ct_col = time() - t0
+
+t0 = time()
+arr = df["score"]
+t_pd_col = time() - t0
+
+print(f"{'Column access (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.2f}x")
+
+# 2.5 Column access (full column, materialized as a NumPy array)
+t0 = time()
+arr = ct["score"].to_numpy()
+t_ct_col = time() - t0
+
+t0 = time()
+arr = df["score"].to_numpy()
+t_pd_col = time() - t0
+
+print(f"{'Column access to numpy (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.3f}x")
+
+# 3. Filtering
+t0 = time()
+result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000))
+t_ct_filter = time() - t0
+
+t0 = time()
+result_pd = df.query("250000 < id < 750000")
+t_pd_filter = time() - t0
+
+print(f"{'Filter (id 250k-750k)':<30} {t_ct_filter:>12.4f} {t_pd_filter:>12.4f} {t_pd_filter/t_ct_filter:>9.2f}x")
+
+# 4. Iteration over the values of a single column
+t0 = time()
+for val in ct["score"]:
+    pass
+t_ct_iter = time() - t0
+
+t0 = time()
+for val in df["score"]:
+    pass
+t_pd_iter = time() - t0
+
+print(f"{'Row iteration':<30} {t_ct_iter:>12.4f} {t_pd_iter:>12.4f} {t_pd_iter/t_ct_iter:>9.2f}x")
+
+print("-" * 65)
+
+# Memory
+ct_cbytes = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+ct_nbytes = sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes
+pd_nbytes  = df.memory_usage(deep=True).sum()
+
+print(f"\nMemory — CTable compressed:   {ct_cbytes / 1024**2:.2f} MB")
+print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB")
+print(f"Memory — pandas:              {pd_nbytes  / 1024**2:.2f} MB")
+print(f"Compression ratio CTable:     {ct_nbytes / ct_cbytes:.2f}x")
diff --git a/bench/ctable/delete.py b/bench/ctable/delete.py
new file mode 100644
index 00000000..fb147c7c
--- /dev/null
+++ b/bench/ctable/delete.py
@@ -0,0 +1,82 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring delete() performance with different index types:
+# int, slice, and list — with varying sizes.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+
+print(f"delete() benchmark  |  N = {N:,}\n")
+
+# Build base data once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+delete_cases = [
+    ("int",          0),
+    ("slice small",  slice(0, 100)),
+    ("slice large",  slice(0, 100_000)),
+    ("slice full",   slice(0, N)),
+    ("list small",   list(range(100))),
+    ("list large",   list(range(100_000))),
+    ("list full",    list(range(N))),
+]
+
+print("=" * 60)
+print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}")
+print("-" * 60)
+
+for label, key in delete_cases:
+    ct = blosc2.CTable(RowModel, expected_size=N)
+    ct.extend(DATA)
+
+    if isinstance(key, int):
+        n_deleted = 1
+    elif isinstance(key, slice):
+        n_deleted = len(range(*key.indices(N)))
+    else:
+        n_deleted = len(key)
+
+    t0 = time()
+    ct.delete(key)
+    t_delete = time() - t0
+    print(f"{label:<20} {n_deleted:>14,} {t_delete:>12.6f}")
+
+print("-" * 60)
diff --git a/bench/ctable/expected_size.py b/bench/ctable/expected_size.py
new file mode 100644
index 00000000..c4444a62
--- /dev/null
+++ b/bench/ctable/expected_size.py
@@ -0,0 +1,75 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring the overhead of resize() when expected_size
+# is too small (M rows) vs correctly sized (N rows) during extend().
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+
+M = 779
+N = 62_500
+MAX_N = 1_000_000
+print(f"expected_size benchmark  |  wrong expected_size = {M}")
+
+# Pre-generate full dataset once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(MAX_N)
+    ],
+    dtype=np_dtype,
+)
+
+while N <= MAX_N:
+    print("-" * 80)
+    print(f"N = {N:,} rows")
+
+    # 1. extend() with correct expected_size = N
+    ct_correct = blosc2.CTable(RowModel, expected_size=N)
+    t0 = time()
+    ct_correct.extend(DATA[:N])
+    t_correct = time() - t0
+    print(f"extend() expected_size=N  ({N:>8,}):  {t_correct:.4f} s   rows: {len(ct_correct):,}")
+
+    # 2. extend() with wrong expected_size = M (forces resize)
+    ct_wrong = blosc2.CTable(RowModel, expected_size=M)
+    t0 = time()
+    ct_wrong.extend(DATA[:N])
+    t_wrong = time() - t0
+    print(f"extend() expected_size=M  ({M:>8,}):  {t_wrong:.4f} s   rows: {len(ct_wrong):,}")
+
+    # Summary
+    print(f"  Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")
+
+    N *= 2
diff --git a/bench/ctable/extend.py b/bench/ctable/extend.py
new file mode 100644
index 00000000..f294b012
--- /dev/null
+++ b/bench/ctable/extend.py
@@ -0,0 +1,117 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring CTable creation time from three different sources:
+#   1. Python list of lists (1M rows)
+#   2. NumPy structured array (1M rows) — built from a list of tuples
+#   3. An existing CTable (previously created from Python lists, 1M rows)
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+
+
+
+# ---------------------------------------------------------------------------
+# Row model
+# ---------------------------------------------------------------------------
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+print(f"CTable creation benchmark with {N:,} rows\n")
+
+# ---------------------------------------------------------------------------
+# Base data generation (not part of the benchmark timing)
+# ---------------------------------------------------------------------------
+print("Generating base data...")
+
+t0 = time()
+data_list = [
+    [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0]
+    for i in range(N)
+]
+t_gen_list = time() - t0
+print(f"  Python list generated in:         {t_gen_list:.4f} s")
+
+t0 = time()
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+data_np = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+t_gen_np = time() - t0
+print(f"  NumPy structured array generated: {t_gen_np:.4f} s\n")
+
+# ---------------------------------------------------------------------------
+# 1. Creation from a Python list of lists
+# ---------------------------------------------------------------------------
+print("CTable from Python list of lists")
+t0 = time()
+ct_from_list = blosc2.CTable(RowModel, expected_size=N)
+ct_from_list.extend(data_list)
+t_from_list = time() - t0
+print(f"   extend() time (Python list):  {t_from_list:.4f} s")
+print(f"   Rows: {len(ct_from_list):,}")
+
+# ---------------------------------------------------------------------------
+# 2. Creation from a NumPy structured array (built from a list of tuples)
+# ---------------------------------------------------------------------------
+print("CTable from NumPy structured array")
+t0 = time()
+ct_from_np = blosc2.CTable(RowModel, expected_size=N)
+ct_from_np.extend(data_np)
+t_from_np = time() - t0
+print(f"   extend() time (NumPy struct): {t_from_np:.4f} s")
+print(f"   Rows: {len(ct_from_np):,}")
+
+
+# ---------------------------------------------------------------------------
+# 3. Creation from an existing CTable (ct_from_list, already built above)
+# ---------------------------------------------------------------------------
+print("CTable from an existing CTable")
+t0 = time()
+ct_from_ctable = blosc2.CTable(RowModel, expected_size=N)
+ct_from_ctable.extend(ct_from_list)
+t_from_ctable = time() - t0
+print(f"   extend() time (CTable):       {t_from_ctable:.4f} s")
+print(f"   Rows: {len(ct_from_ctable):,}")
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+print("\n")
+print("=" * 60)
+print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}")
+print("-" * 60)
+print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}")
+print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {t_from_list / t_from_np:>17.2f}x")
+print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {t_from_list / t_from_ctable:>17.2f}x")
+
diff --git a/bench/ctable/extend_vs_apend.py b/bench/ctable/extend_vs_apend.py
new file mode 100644
index 00000000..2036755c
--- /dev/null
+++ b/bench/ctable/extend_vs_apend.py
@@ -0,0 +1,84 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for comparing append() (row by row) vs extend() (bulk),
+# to find the crossover point where extend() becomes worth it.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+# Starting row count; N doubles on each of the 6 passes to locate the crossover point
+N = 2
+print(f"append() vs extend() benchmark")
+for i in range(6):
+    print("\n")
+    print("%" * 100)
+
+
+    # Base data generation
+    data_list = [
+        [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N)
+    ]
+
+    # 1. N individual append() calls
+    print(f"{N} individual append() calls")
+    ct_append = blosc2.CTable(RowModel, expected_size=N)
+    t0 = time()
+    for row in data_list:
+        ct_append.append(row)
+    t_append = time() - t0
+    print(f"   Time: {t_append:.6f} s")
+    print(f"   Rows: {len(ct_append):,}")
+
+    # 2. N individual extend() calls (one row at a time)
+    print(f"{N} individual extend() calls (one row at a time)")
+    ct_extend_one = blosc2.CTable(RowModel, expected_size=N)
+    t0 = time()
+    for row in data_list:
+        ct_extend_one.extend([row])
+    t_extend_one = time() - t0
+    print(f"   Time: {t_extend_one:.6f} s")
+    print(f"   Rows: {len(ct_extend_one):,}")
+
+    # 3. Single extend() call with all N rows at once
+    print(f"Single extend() call with all {N} rows at once")
+    ct_extend_bulk = blosc2.CTable(RowModel, expected_size=N)
+    t0 = time()
+    ct_extend_bulk.extend(data_list)
+    t_extend_bulk = time() - t0
+    print(f"   Time: {t_extend_bulk:.6f} s")
+    print(f"   Rows: {len(ct_extend_bulk):,}")
+
+    # Summary
+    print("=" * 70)
+    print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}")
+    print("-" * 70)
+    print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}")
+    print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x")
+    print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x")
+    print("-" * 70)
+
+    N=N*2
diff --git a/bench/ctable/index.py b/bench/ctable/index.py
new file mode 100644
index 00000000..634a68e1
--- /dev/null
+++ b/bench/ctable/index.py
@@ -0,0 +1,69 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring Column[int] access (single row by logical index),
+# which exercises _find_physical_index() traversal over chunk metadata.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1]
+
+print(f"Column[int] access benchmark  |  N = {N:,}\n")
+
+# Build CTable once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(DATA)
+
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * 60)
+print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}")
+print("-" * 60)
+
+col = ct["score"]
+for idx in indices:
+    t0 = time()
+    val = col[idx]
+    t_access = time() - t0
+    position = f"{idx / N * 100:.0f}% into array"
+    print(f"{idx:<15,} {position:>12}   {t_access:.6f}")
+
+print("-" * 60)
diff --git a/bench/ctable/iteration_column.py b/bench/ctable/iteration_column.py
new file mode 100644
index 00000000..5f0efaed
--- /dev/null
+++ b/bench/ctable/iteration_column.py
@@ -0,0 +1,85 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for comparing full column iteration strategies:
+#   1. for val in ct["score"]  — Python iterator via __iter__
+#   2. np.array(list(ct["score"]))  — materialize via list then convert
+#   3. ct["score"][0:N].to_numpy()  — slice view + to_numpy(), then iterate the array
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+
+print(f"Column iteration benchmark  |  N = {N:,}\n")
+
+# Build CTable once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(DATA)
+
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * 60)
+
+col = ct["score"]
+
+# 1. Python iterator
+t0 = time()
+for val in col:
+    pass
+t_iter = time() - t0
+print(f"for val in col:              {t_iter:.4f} s")
+
+# 2. list() + np.array()
+t0 = time()
+arr = np.array(list(col))
+t_list = time() - t0
+print(f"np.array(list(col)):         {t_list:.4f} s")
+
+# 3. slice view + to_numpy(), then iterate the resulting NumPy array
+t0 = time()
+arr = col[0:N].to_numpy()
+for val in arr:
+    pass
+t_toarray = time() - t0
+print(f"col[0:N].to_array():         {t_toarray:.4f} s")
+
+print("=" * 60)
+print(f"Speedup to_array vs iter:    {t_iter / t_toarray:.2f}x")
+print(f"Speedup to_array vs list:    {t_list / t_toarray:.2f}x")
diff --git a/bench/ctable/print.py b/bench/ctable/print.py
new file mode 100644
index 00000000..af352a2a
--- /dev/null
+++ b/bench/ctable/print.py
@@ -0,0 +1,112 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: bulk ingestion and head() printing comparison — pandas vs CTable
+#   Data source: randomly generated numpy structured array
+
+import time
+from typing import Annotated
+
+import numpy as np
+import pandas as pd
+import blosc2
+from pydantic import BaseModel, Field
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id:    Annotated[int,   NumpyDtype(np.int64)]
+    name:  Annotated[str,   NumpyDtype(np.dtype("<U9"))]
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0)
+
+
+NAMES = ["benchmark", "alpha", "beta", "gamma", "delta",
+         "epsilon", "zeta", "eta", "theta", "iota"]
+
+N   = 100_000
+rng = np.random.default_rng(42)
+
+np_dtype = np.dtype([("id", np.int64), ("name", "<U9"), ("score", np.float64)])
+
+
+def make_data(n: int) -> np.ndarray:
+    arr = np.empty(n, dtype=np_dtype)
+    arr["id"]    = np.arange(n, dtype=np.int64)
+    arr["name"]  = np.array([rng.choice(NAMES) for _ in range(n)], dtype="<U9")
+    arr["score"] = rng.uniform(0, 100, n)
+    return arr
+
+
+print(f"=== BENCHMARK: Iterative Ingestion ({N:,} rows) ===\n")
+
+# ─────────────────────────────────────────────────────────────
+# 1. PANDAS
+# ─────────────────────────────────────────────────────────────
+print("--- 1. PANDAS (structured array -> DataFrame) ---")
+data = make_data(N)
+
+t0 = time.perf_counter()
+df = pd.DataFrame(data)
+t_pandas = time.perf_counter() - t0
+
+mem_pandas = df.memory_usage(deep=True).sum() / (1024 ** 2)
+print(f"Total time:   {t_pandas:.4f} s")
+print(f"Memory (RAM): {mem_pandas:.2f} MB")
+
+print("\n--- PANDAS: First 10 rows ---")
+t0_print = time.perf_counter()
+print(df.head(10).to_string())
+t_print_pandas = time.perf_counter() - t0_print
+print(f"\nPrint time: {t_print_pandas:.6f} s")
+
+# ─────────────────────────────────────────────────────────────
+# 2. BLOSC2 CTable
+# ─────────────────────────────────────────────────────────────
+print("\n" + "=" * 60)
+print("--- 2. BLOSC2 CTable (structured array -> extend) ---")
+data = make_data(N)
+
+t0 = time.perf_counter()
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(data)
+t_blosc = time.perf_counter() - t0
+
+fields       = list(RowModel.model_fields.keys())
+mem_blosc_c  = (sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes) / (1024 ** 2)
+mem_blosc_uc = (sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes) / (1024 ** 2)
+
+print(f"Total time:            {t_blosc:.4f} s")
+print(f"Memory (uncompressed): {mem_blosc_uc:.2f} MB")
+print(f"Memory (compressed):   {mem_blosc_c:.2f} MB")
+
+print("\n--- BLOSC2: First 10 rows ---")
+t0_print = time.perf_counter()
+print(ct.head(10))
+t_print_blosc = time.perf_counter() - t0_print
+print(f"\nPrint time: {t_print_blosc:.6f} s")
+
+# ─────────────────────────────────────────────────────────────
+# SUMMARY
+# ─────────────────────────────────────────────────────────────
+print("\n" + "=" * 60)
+print("--- SUMMARY ---")
+speedup   = t_pandas / t_blosc
+direction = "faster" if t_blosc < t_pandas else "slower"
+
+print(f"{'METRIC':<30} {'Pandas':>12} {'Blosc2':>12}")
+print("-" * 55)
+print(f"{'Ingestion time (s)':<30} {t_pandas:>12.4f} {t_blosc:>12.4f}")
+print(f"{'Memory (MB)':<30} {mem_pandas:>12.2f} {mem_blosc_c:>12.2f}")
+print(f"{'Print time (s)':<30} {t_print_pandas:>12.6f} {t_print_blosc:>12.6f}")
+print("-" * 55)
+print(f"\nSpeedup:               {speedup:.2f}x {direction}")
+print(f"Compression ratio:     {mem_blosc_uc / mem_blosc_c:.2f}x")
+print(f"Blosc2 vs Pandas size: {mem_blosc_c / mem_pandas * 100:.1f}%")
diff --git a/bench/ctable/row_acces.py b/bench/ctable/row_acces.py
new file mode 100644
index 00000000..c44439e0
--- /dev/null
+++ b/bench/ctable/row_acces.py
@@ -0,0 +1,68 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring row[int] access (full row via _RowIndexer),
+# testing access at different positions across the array.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+# Row model
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1]
+
+print(f"row[int] access benchmark  |  N = {N:,}\n")
+
+# Build CTable once
+np_dtype = np.dtype([
+    ("id",     np.int64),
+    ("c_val",  np.complex128),
+    ("score",  np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(DATA)
+
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * 60)
+print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}")
+print("-" * 60)
+
+for idx in indices:
+    t0 = time()
+    row = ct.row[idx]
+    t_access = time() - t0
+    position = f"{idx / N * 100:.0f}% into array"
+    print(f"{idx:<15,} {position:>12}   {t_access:.6f}")
+
+print("-" * 60)
diff --git a/bench/ctable/slice.py b/bench/ctable/slice.py
new file mode 100644
index 00000000..4976a8d8
--- /dev/null
+++ b/bench/ctable/slice.py
@@ -0,0 +1,77 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring Column[slice] access with slices of different
+# sizes and positions: small, large, and middle of the array.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+# Use the library's own marker class: _resolve_field_dtype() matches metadata
+# with isinstance(), so a look-alike class defined here would be ignored.
+from blosc2.ctable import NumpyDtype  # noqa: E402
+
+
+# Row model: each field below becomes one CTable column, in declaration order.
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+MID = N // 2
+# (label, logical slice) pairs: small and large windows at the start, middle and end.
+slices = [
+    ("small  — start",  slice(0, 100)),
+    ("small  — middle", slice(MID, MID + 100)),
+    ("small  — end",    slice(N - 100, N)),
+    ("large  — start",  slice(0, 100_000)),
+    ("large  — middle", slice(MID - 50_000, MID + 50_000)),
+    ("large  — end",    slice(N - 100_000, N)),
+    ("full   — all",    slice(0, N)),
+]
+
+print(f"Column[slice] access benchmark  |  N = {N:,}\n")
+
+# Source rows as a structured NumPy array whose fields mirror RowModel.
+record_dtype = np.dtype(
+    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
+)
+records = np.array(
+    [(k, complex(k * 0.1, k * 0.01), 10.0 + (k % 100) * 0.4, k % 3 == 0) for k in range(N)],
+    dtype=record_dtype,
+)
+
+# Build the table once, outside of every timed region.
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(records)
+
+# Report header.
+RULE_WIDTH = 65
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * RULE_WIDTH)
+print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}")
+print("-" * RULE_WIDTH)
+
+col = ct["score"]
+for label, window in slices:
+    # Only the Column[slice] lookup sits between the two clock reads.
+    started = time()
+    val = col[window]
+    elapsed = time() - started
+    n_rows = window.stop - window.start
+    print(f"{label:<25} {n_rows:>8,} {elapsed:>12.6f}")
+
+print("-" * RULE_WIDTH)
diff --git a/bench/ctable/slice_steps.py b/bench/ctable/slice_steps.py
new file mode 100644
index 00000000..311b5f9c
--- /dev/null
+++ b/bench/ctable/slice_steps.py
@@ -0,0 +1,67 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring Column[::step].to_numpy() with varying step sizes.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+# Use the library's own marker class: _resolve_field_dtype() matches metadata
+# with isinstance(), so a look-alike class defined here would be ignored.
+from blosc2.ctable import NumpyDtype  # noqa: E402
+
+
+# Row model: each field below becomes one CTable column, in declaration order.
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+steps = [1, 2, 4, 8, 16, 100, 1000]
+
+# The title names the method that is actually timed below: Column.to_numpy().
+print(f"Column[::step].to_numpy() benchmark  |  N = {N:,}\n")
+
+# Source rows as a structured NumPy array whose fields mirror RowModel.
+record_dtype = np.dtype(
+    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
+)
+records = np.array(
+    [(k, complex(k * 0.1, k * 0.01), 10.0 + (k % 100) * 0.4, k % 3 == 0) for k in range(N)],
+    dtype=record_dtype,
+)
+
+# Build the table once, outside of every timed region.
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(records)
+
+# Report header.
+RULE_WIDTH = 60
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * RULE_WIDTH)
+print(f"{'STEP':<10} {'ROWS RETURNED':>15} {'TIME (s)':>12}")
+print("-" * RULE_WIDTH)
+
+# The same column object is re-sliced for every step size.
+col = ct["score"]
+for step in steps:
+    # Timed region: build the stepped view and gather it with to_numpy().
+    started = time()
+    arr = col[::step].to_numpy()
+    elapsed = time() - started
+    print(f"::{step:<8} {len(arr):>15,} {elapsed:>12.6f}")
+
+print("-" * RULE_WIDTH)
diff --git a/bench/ctable/slice_to_array.py b/bench/ctable/slice_to_array.py
new file mode 100644
index 00000000..2b072c8c
--- /dev/null
+++ b/bench/ctable/slice_to_array.py
@@ -0,0 +1,77 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring Column[slice] + to_numpy() with slices of
+# different sizes and positions: small, large, and middle of the array.
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+# Use the library's own marker class: _resolve_field_dtype() matches metadata
+# with isinstance(), so a look-alike class defined here would be ignored.
+from blosc2.ctable import NumpyDtype  # noqa: E402
+
+
+# Row model: each field below becomes one CTable column, in declaration order.
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+MID = N // 2
+# (label, logical slice) pairs: small and large windows at the start, middle and end.
+slices = [
+    ("small  — start",  slice(0, 100)),
+    ("small  — middle", slice(MID, MID + 100)),
+    ("small  — end",    slice(N - 100, N)),
+    ("large  — start",  slice(0, 100_000)),
+    ("large  — middle", slice(MID - 50_000, MID + 50_000)),
+    ("large  — end",    slice(N - 100_000, N)),
+    ("full   — all",    slice(0, N)),
+]
+
+# The title names the method that is actually timed below: Column.to_numpy().
+print(f"Column[slice].to_numpy() benchmark  |  N = {N:,}\n")
+
+# Source rows as a structured NumPy array whose fields mirror RowModel.
+record_dtype = np.dtype(
+    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
+)
+records = np.array(
+    [(k, complex(k * 0.1, k * 0.01), 10.0 + (k % 100) * 0.4, k % 3 == 0) for k in range(N)],
+    dtype=record_dtype,
+)
+
+# Build the table once, outside of every timed region.
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(records)
+
+RULE_WIDTH = 65
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * RULE_WIDTH)
+print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}")
+print("-" * RULE_WIDTH)
+
+col = ct["score"]
+for label, window in slices:
+    # Timed region: build the sliced view and gather it with to_numpy().
+    started = time()
+    arr = col[window].to_numpy()
+    elapsed = time() - started
+    n_rows = window.stop - window.start
+    print(f"{label:<25} {n_rows:>8,} {elapsed:>12.6f}")
+
+print("-" * RULE_WIDTH)
diff --git a/bench/ctable/where_chain.py b/bench/ctable/where_chain.py
new file mode 100644
index 00000000..ff8b9b30
--- /dev/null
+++ b/bench/ctable/where_chain.py
@@ -0,0 +1,79 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for comparing chained where() calls vs a single combined filter.
+# Filters: 250k < id < 750k, active == False, 25.0 < score < 75.0
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+# Use the library's own marker class: _resolve_field_dtype() matches metadata
+# with isinstance(), so a look-alike class defined here would be ignored.
+from blosc2.ctable import NumpyDtype  # noqa: E402
+
+
+# Row model: each field below becomes one CTable column, in declaration order.
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+
+print(f"where() chained vs combined benchmark  |  N = {N:,}")
+
+# Source rows as a structured NumPy array whose fields mirror RowModel.
+record_dtype = np.dtype(
+    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
+)
+records = np.array(
+    [(k, complex(k * 0.1, k * 0.01), 10.0 + (k % 100) * 0.4, k % 3 == 0) for k in range(N)],
+    dtype=record_dtype,
+)
+
+# Build the table once, outside of every timed region.
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(records)
+
+RULE = "=" * 70
+print(f"CTable built with {len(ct):,} rows\n")
+print(RULE)
+
+# 1. Five chained where() calls, one predicate each; every call is made on the
+#    result of the previous one.
+started = time()
+step1 = ct.where((ct["id"] > 250_000))
+step2 = step1.where((ct["id"] < 750_000))
+step3 = step2.where((ct["score"] > 25.0))
+step4 = step3.where((ct["score"] < 75.0))
+chained = step4.where(ct["active"] == False)
+t_chained = time() - started
+print(f"Chained where() (5 calls):  {t_chained:.6f} s   rows: {len(chained):,}")
+
+# 2. One where() call with the same five predicates AND-ed together
+started = time()
+combined = ct.where(
+    (ct["id"] > 250_000)
+    & (ct["id"] < 750_000)
+    & (ct["active"] == False)
+    & (ct["score"] > 25.0)
+    & (ct["score"] < 75.0)
+)
+t_combined = time() - started
+print(f"Combined where() (1 call):  {t_combined:.6f} s   rows: {len(combined):,}")
+
+print(RULE)
+# A ratio above 1 means the single combined call took less time.
+print(f"Speedup combined vs chained: {t_chained / t_combined:.2f}x")
diff --git a/bench/ctable/where_selective.py b/bench/ctable/where_selective.py
new file mode 100644
index 00000000..77767d45
--- /dev/null
+++ b/bench/ctable/where_selective.py
@@ -0,0 +1,68 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring where() performance with varying selectivity.
+# Filter: id < threshold, with thresholds selecting from 0.001% up to 100% of the rows
+
+from time import time
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+import blosc2
+
+
+# Use the library's own marker class: _resolve_field_dtype() matches metadata
+# with isinstance(), so a look-alike class defined here would be ignored.
+from blosc2.ctable import NumpyDtype  # noqa: E402
+
+
+# Row model: each field below becomes one CTable column, in declaration order.
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+N = 1_000_000
+# Cut-off values for the `id < threshold` filter. The ids built below run from
+# 0 to N - 1, so each threshold is also the number of rows the filter keeps.
+thresholds = [10, 10_000, 100_000, 250_000, 500_000, 750_000, 900_000, 999_990, 1_000_000]
+
+print(f"where() selectivity benchmark  |  N = {N:,}")
+
+# Source rows as a structured NumPy array whose fields mirror RowModel.
+record_dtype = np.dtype(
+    [("id", np.int64), ("c_val", np.complex128), ("score", np.float64), ("active", np.bool_)]
+)
+records = np.array(
+    [(k, complex(k * 0.1, k * 0.01), 10.0 + (k % 100) * 0.4, k % 3 == 0) for k in range(N)],
+    dtype=record_dtype,
+)
+
+# Build the table once, outside of every timed region.
+ct = blosc2.CTable(RowModel, expected_size=N)
+ct.extend(records)
+
+# Report header.
+RULE_WIDTH = 70
+print(f"CTable built with {len(ct):,} rows\n")
+print("=" * RULE_WIDTH)
+print(f"{'THRESHOLD':<15} {'ROWS RETURNED':>15} {'SELECTIVITY':>13} {'TIME (s)':>12}")
+print("-" * RULE_WIDTH)
+
+for threshold in thresholds:
+    # Only the where() call sits between the two clock reads.
+    started = time()
+    result = ct.where(ct["id"] < threshold)
+    elapsed = time() - started
+    selectivity = threshold / N * 100
+    print(f"id < {threshold:<10,} {len(result):>15,} {selectivity:>12.1f}% {elapsed:>12.6f}")
+
+print("-" * RULE_WIDTH)
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index e32b2f48..8afc0653 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -596,6 +596,7 @@ def _raise(exc):
 """
 
 # Delayed imports for avoiding overwriting of python builtins
+from .ctable import CTable, Column
 from .ndarray import (
     abs,
     acos,
diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
new file mode 100644
index 00000000..3acb1731
--- /dev/null
+++ b/src/blosc2/ctable.py
@@ -0,0 +1,793 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+"""Imports for CTable"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any, Generic, TypeVar
+
+import numpy as np
+
+from blosc2 import compute_chunks_blocks
+
+try:
+    from line_profiler import profile
+except ImportError:
+
+    def profile(func):
+        """No-op stand-in for ``line_profiler.profile`` when it is not installed.
+
+        Returns *func* itself, so its metadata (``__doc__``, signature, ...) is
+        preserved and calls do not pay for an extra wrapper frame.
+        """
+        return func
+
+
+from pydantic import BaseModel
+
+import blosc2
+
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:  # Annotated[...] marker; _resolve_field_dtype() reads its .dtype
+    def __init__(self, dtype):
+        self.dtype = dtype  # stored as given; converted with np.dtype() on use
+
+
+class MaxLen:  # Annotated[...] marker giving the fixed width of a str/bytes column
+    def __init__(self, length: int):
+        self.length = int(length)  # coerced to int; no range check is done here
+
+
+#############################
+####  Row model examples  ###
+#############################
+"""
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+class RowModel2(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int16)] = Field(ge=0)
+    name: Annotated[str, MaxLen(10)] = Field(default="unknown")
+    # name: Annotated[bytes, MaxLen(10)] = Field(default=b"unknown")
+    score: Annotated[float, NumpyDtype(np.float32)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+class RowModel3(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int16)] = Field(ge=0)
+    #name: Annotated[str, MaxLen(10)] = Field(default="unknown")
+    name: Annotated[bytes, MaxLen(10)] = Field(default=b"unknown")"""
+
+
+class _RowIndexer:  # helper object that CTable.__init__ stores as ``self.row``
+    def __init__(self, table):
+        self._table = table  # the table that resolves every lookup
+
+    def __getitem__(self, item):
+        return self._table._run_row_logic(item)  # key handling is delegated to the table
+
+
+def _resolve_field_dtype(field) -> tuple[np.dtype, int]:
+    """Return ``(numpy dtype, display_width)`` for a pydantic model field.
+
+    ``MaxLen`` / ``NumpyDtype`` markers are searched in ``field.metadata`` first:
+    pydantic v2 strips ``Annotated[...]`` from ``field.annotation`` and moves the
+    extra arguments there. The annotation's own ``__metadata__`` is consulted as
+    well, so an annotation that still carries ``Annotated`` keeps working.
+
+    * ``str`` / ``bytes``: fixed-width ``U<n>`` / ``S<n>`` dtype, where ``n`` comes
+      from the first ``MaxLen`` marker (32 if there is none).
+    * otherwise the first ``NumpyDtype`` marker decides the dtype.
+    * otherwise a default per Python primitive, or ``object`` for anything else.
+    """
+    annotation = field.annotation
+    origin = getattr(annotation, "__origin__", annotation)
+    markers = list(getattr(field, "metadata", None) or ())
+    markers += getattr(annotation, "__metadata__", ())
+
+    # str / bytes: fixed-width dtype; the display width is clamped to 10..50
+    if origin in (str, bytes) or annotation in (str, bytes):
+        is_bytes = origin is bytes or annotation is bytes
+        max_len = next((m.length for m in markers if isinstance(m, MaxLen)), 32)
+        dt = np.dtype(f"{'S' if is_bytes else 'U'}{max_len}")
+        return dt, max(10, min(max_len, 50))
+
+    # Explicit NumpyDtype marker (overrides the primitive defaults below)
+    for meta in markers:
+        if isinstance(meta, NumpyDtype):
+            return np.dtype(meta.dtype), _default_display_width(origin)
+
+    # Primitive defaults
+    _PRIMITIVE_MAP = {
+        int: (np.int64, 12),
+        float: (np.float64, 15),
+        bool: (np.bool_, 6),
+        complex: (np.complex128, 25),
+    }
+    if origin in _PRIMITIVE_MAP:
+        dt_raw, display_width = _PRIMITIVE_MAP[origin]
+        return np.dtype(dt_raw), display_width
+
+    # Anything else is stored as Python objects
+    return np.dtype(np.object_), 20
+
+
+def _default_display_width(origin) -> int:
+    """Look up the column print width for Python type *origin*; 20 when it has no entry."""
+    return dict(((int, 12), (float, 15), (bool, 6), (complex, 25))).get(origin, 20)
+
+
+def _find_physical_index(arr: blosc2.NDArray, logical_key: int) -> int:
+    """Translate a logical (valid-row) index into a physical array index.
+
+    Walks the chunks of the boolean *arr* (valid_rows) in order, counting True
+    values. Chunks flagged as all-zero or as one repeated value are resolved
+    from their chunk info alone; any other chunk is read with a single slice.
+
+    Returns
+    -------
+    int
+        Physical position of the *logical_key*-th (0-based) True value.
+
+    Raises
+    ------
+    IndexError
+        If *logical_key* is negative or not smaller than the number of True values.
+    """
+    if logical_key < 0:  # a negative offset would index from the end of a chunk below
+        raise IndexError(f"logical index {logical_key} must be non-negative")
+    count = 0  # True values found in the chunks already passed
+    chunk_size = arr.chunks[0]
+
+    for info in arr.iterchunks_info():
+        chunk_start = info.nchunk * chunk_size
+        actual_size = min(chunk_size, arr.shape[0] - chunk_start)  # last chunk may be short
+
+        if info.special == blosc2.SpecialValue.ZERO:
+            continue  # all False
+
+        if info.special == blosc2.SpecialValue.VALUE:
+            if not np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]:
+                continue  # repeated False
+            if count + actual_size <= logical_key:
+                count += actual_size
+                continue
+            return chunk_start + (logical_key - count)  # repeated True: direct offset
+
+        true_offsets = np.flatnonzero(arr[chunk_start : chunk_start + actual_size])
+        if count + true_offsets.size <= logical_key:
+            count += true_offsets.size
+            continue
+        return chunk_start + int(true_offsets[logical_key - count])
+
+    raise IndexError(f"logical index {logical_key} is out of range ({count} valid rows)")
+
+
+class Column:
+    """One column of a CTable, optionally narrowed by a boolean row mask.
+
+    Integer, slice and sequence keys are *logical*: they count only rows that are
+    True in the table's ``_valid_rows`` (AND-ed with this column's mask, if any).
+    Comparisons are forwarded to the raw storage array and cover all physical rows.
+    """
+
+    def __init__(self, table: CTable, col_name: str, mask=None):
+        self._table = table
+        self._col_name = col_name
+        self._mask = mask  # None, or a boolean array over the physical rows
+
+    @property
+    def _raw_col(self):
+        return self._table._cols[self._col_name]
+
+    @property
+    def _valid_rows(self):
+        # With a mask this is recomputed on each access, so methods read it once.
+        if self._mask is None:
+            return self._table._valid_rows
+        return (self._table._valid_rows & self._mask).compute()
+
+    def _physical_positions(self, valid) -> np.ndarray:
+        """Return, as a NumPy array, the physical positions that are True in *valid*."""
+        return blosc2.where(valid, np.arange(len(valid))).compute()[:]
+
+    def _locate(self, key: int, valid) -> int:
+        """Map one logical index (negative values count from the end) to a physical one."""
+        n_rows = int(blosc2.count_nonzero(valid))
+        if key < 0:
+            key += n_rows
+        if not (0 <= key < n_rows):
+            raise IndexError(f"index {key} is out of bounds for column with size {n_rows}")
+        return int(_find_physical_index(valid, key))
+
+    def __getitem__(self, key: int | slice | list | np.ndarray):
+        """Read by logical index.
+
+        int -> element; slice -> new masked ``Column`` (rows keep storage order);
+        list/tuple/array of ints -> the selected elements.
+        """
+        valid = self._valid_rows
+        if isinstance(key, (int, np.integer)):
+            return self._raw_col[self._locate(int(key), valid)]
+
+        if isinstance(key, slice):
+            picked = self._physical_positions(valid)[key]
+            mask = blosc2.zeros(len(self._table._valid_rows), dtype=np.bool_)
+            contiguous = self._mask is None and key.indices(len(valid))[2] == 1
+            if picked.size and contiguous:
+                # Unmasked parent: rows skipped inside the range are invalid and get AND-ed out.
+                mask[int(picked[0]) : int(picked[-1]) + 1] = True
+            elif picked.size:
+                mask[picked] = True
+            return Column(self._table, self._col_name, mask=mask)
+
+        if isinstance(key, (list, tuple, np.ndarray)):
+            wanted = np.asarray(key) if len(key) else np.empty(0, dtype=np.intp)
+            return self._raw_col[self._physical_positions(valid)[wanted]]
+
+        raise TypeError(f"Invalid index type: {type(key)}")
+
+    def __setitem__(self, key: int | slice | list | np.ndarray, value):
+        """Write by logical index; *key* takes the same forms as in ``__getitem__``."""
+        valid = self._valid_rows
+        if isinstance(key, (int, np.integer)):
+            self._raw_col[self._locate(int(key), valid)] = value
+            return
+        if not isinstance(key, (slice, list, tuple, np.ndarray)):
+            raise TypeError(f"Invalid index type: {type(key)}")
+        if not isinstance(key, slice):
+            key = np.asarray(key) if len(key) else np.empty(0, dtype=np.intp)
+        phys_indices = self._physical_positions(valid)[key]
+        if isinstance(value, (list, tuple)):
+            value = np.array(value, dtype=self._raw_col.dtype)
+        self._raw_col[phys_indices] = value
+
+    def __iter__(self):
+        """Yield the valid elements in storage order, reading one chunk at a time."""
+        arr = self._valid_rows
+        chunk_size = arr.chunks[0]
+
+        for info in arr.iterchunks_info():
+            actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+            chunk_start = info.nchunk * chunk_size
+
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue  # chunk holds no valid row
+
+            if info.special == blosc2.SpecialValue.VALUE:
+                val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                if not val:
+                    continue
+                yield from self._raw_col[chunk_start: chunk_start + actual_size]
+                continue
+
+            mask_chunk = arr[chunk_start: chunk_start + actual_size]
+            data_chunk = self._raw_col[chunk_start: chunk_start + actual_size]
+            yield from data_chunk[mask_chunk]
+
+    def __len__(self):
+        return int(blosc2.count_nonzero(self._valid_rows))
+
+    # Comparisons are delegated to the raw array and ignore the validity mask.
+    def __lt__(self, other):
+        return self._raw_col < other
+
+    def __le__(self, other):
+        return self._raw_col <= other
+
+    def __eq__(self, other):
+        return self._raw_col == other
+
+    def __ne__(self, other):
+        return self._raw_col != other
+
+    def __gt__(self, other):
+        return self._raw_col > other
+
+    def __ge__(self, other):
+        return self._raw_col >= other
+
+    @property
+    def dtype(self):
+        return self._raw_col.dtype
+
+    def to_numpy(self):
+        """Return the valid elements as a NumPy array."""
+        return self._raw_col[self._physical_positions(self._valid_rows)]
+
+
+class CTable(Generic[RowT]):
+    def __init__(self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False) -> None:
+        """Allocate zero-filled storage for *expected_size* rows, then load *new_data* if given."""
+        self._row_type = row_type
+        self._n_rows: int = 0
+        self.auto_compact = compact
+        self.base = None
+        self.row = _RowIndexer(self)
+        self._cols: dict[str, blosc2.NDArray] = {}
+        self._col_widths: dict[str, int] = {}
+        self.col_names = []
+
+        chunks, blocks = compute_chunks_blocks((expected_size,))
+        layout = {"shape": (expected_size,), "chunks": chunks, "blocks": blocks}
+        self._valid_rows = blosc2.zeros(dtype=np.bool_, **layout)
+        for name, field in row_type.model_fields.items():
+            dt, display_width = _resolve_field_dtype(field)
+            self.col_names.append(name)
+            self._col_widths[name] = max(len(name), display_width)
+            self._cols[name] = blosc2.zeros(dtype=dt, **layout)
+
+        if new_data is not None:
+            self._load_initial_data(new_data)
+
+    def _load_initial_data(self, new_data) -> None:
+        """Route *new_data* to ``append()`` (one row) or ``extend()`` (many rows).
+
+        Treated as a single row: a NumPy void/record scalar, a 0-d structured
+        array, or a non-empty list whose first element is a Python scalar.
+        """
+        scalar_types = (str, bytes, int, float, bool, complex)
+        if isinstance(new_data, (np.void, np.record)):
+            single_row = True
+        elif isinstance(new_data, np.ndarray):
+            single_row = new_data.dtype.names is not None and new_data.ndim == 0
+        elif isinstance(new_data, list) and new_data:
+            single_row = isinstance(new_data[0], scalar_types)
+        else:
+            single_row = False
+
+        loader = self.append if single_row else self.extend
+        loader(new_data)
+
+    def __str__(self):
+        """Render the valid rows as a text grid.
+
+        Layout: one header line with the centred column names, a dashed rule, then
+        one line per valid row, each followed by another dashed rule.
+        """
+        widths = self._col_widths
+        # Every cell is padded to its column width and closed with " |", hence "+ 2".
+        rule = "-" * sum(widths[name] + 2 for name in self._cols)
+
+        retval = [f"{name:^{widths[name]}} |" for name in self._cols]
+        retval.append("\n")
+        retval.append(rule)
+        retval.append("\n")
+
+        # Physical positions of the valid rows. np.arange builds the index array
+        # directly instead of materialising one Python int per row first.
+        real_poss = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+
+        for j in real_poss:
+            # One read per cell from the column array.
+            for name, col in self._cols.items():
+                retval.append(f"{col[j]:^{widths[name]}} |")
+            retval.append("\n")
+            retval.append(rule)
+            retval.append("\n")
+        return "".join(retval)
+
+    def __len__(self):  # row count as tracked in self._n_rows
+        return self._n_rows
+
+    def view(self, new_valid_rows):
+        """Return a table that shares this one's columns and uses *new_valid_rows* as mask.
+
+        Raises TypeError for a non-boolean / non-blosc2 mask, ValueError for a wrong length.
+        """
+        if not (
+            isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr))
+            and (getattr(new_valid_rows, "dtype", None) == np.bool_)
+        ):
+            raise TypeError(
+                f"Expected boolean blosc2.NDArray or LazyExpr, got {type(new_valid_rows).__name__}"
+            )
+        if isinstance(new_valid_rows, blosc2.LazyExpr):
+            new_valid_rows = new_valid_rows.compute()  # materialise before measuring it
+        n_phys = len(self._valid_rows)
+        if n_phys != len(new_valid_rows):
+            raise ValueError(f"mask has {len(new_valid_rows)} entries, table storage has {n_phys}")
+        retval = CTable(self._row_type, compact=self.auto_compact, expected_size=n_phys)
+        retval._cols = self._cols  # shared with this table, not copied
+        retval._n_rows = blosc2.count_nonzero(new_valid_rows)
+        retval._col_widths = self._col_widths
+        retval.col_names = self.col_names
+        retval.base = self
+        retval._valid_rows = new_valid_rows
+        return retval
+
+    def head(self, N: int = 5) -> CTable:
+        """Return a view holding the first *N* valid rows.
+
+        The validity mask is walked chunk by chunk to find the physical position
+        of the N-th valid row; chunks flagged as all-zero or as one repeated
+        value are resolved from their chunk info, any other chunk is read once.
+        Every valid row up to that position is kept.
+
+        Parameters
+        ----------
+        N : int
+            Number of rows wanted. ``N <= 0`` gives an empty view; if the table
+            has fewer than *N* valid rows, all of them are returned.
+
+        Returns
+        -------
+        CTable
+            A view created with :meth:`view`; it shares this table's columns.
+        """
+        if N <= 0:
+            # If N is 0 or negative, return an empty table
+            return self.view(blosc2.zeros(shape=len(self._valid_rows), dtype=np.bool_))
+
+        arr = self._valid_rows
+        count = 0
+        chunk_size = arr.chunks[0]
+        pos_N_true = -1
+        for info in arr.iterchunks_info():
+            actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size)
+            chunk_start = info.nchunk * chunk_size
+
+            # All False without decompressing -> skip
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue
+
+            # Repeated value -> check if True or False
+            if info.special == blosc2.SpecialValue.VALUE:
+                val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
+                if not val:
+                    continue  # all False, skip
+                # All True: target is at offset (N - count - 1) within the chunk
+                if count + actual_size < N:
+                    count += actual_size
+                    continue
+                pos_N_true = chunk_start + (N - count - 1)
+                break
+
+            # General case: decompress only this chunk
+            chunk_data = arr[chunk_start : chunk_start + actual_size]
+
+            n_true = int(np.count_nonzero(chunk_data))
+            if count + n_true < N:
+                count += n_true
+                continue
+
+            # The N-th True is in this chunk
+            pos_N_true = chunk_start + int(np.flatnonzero(chunk_data)[N - count - 1])
+            break
+
+        if pos_N_true == -1:
+            return self.view(self._valid_rows)
+
+        if pos_N_true < len(self._valid_rows) // 2:
+            mask_arr = blosc2.zeros(shape=len(arr), dtype=np.bool_)
+            mask_arr[: pos_N_true + 1] = True
+        else:
+            mask_arr = blosc2.ones(shape=len(arr), dtype=np.bool_)
+            mask_arr[pos_N_true + 1 :] = False
+
+        mask_arr = (mask_arr & self._valid_rows).compute()
+        return self.view(mask_arr)
+
+    def tail(self, N: int = 5) -> CTable:
+        """Return a view holding the last *N* valid rows.
+
+        Chunks of the validity mask are visited from the last one backwards until
+        *N* valid rows have been counted; everything valid from that row on is kept.
+
+        Parameters
+        ----------
+        N : int
+            Number of rows wanted. ``N <= 0`` gives an empty view; if the table
+            has fewer than *N* valid rows, all of them are returned.
+
+        Returns: a CTable view created with :meth:`view`.
+        """
+        valid = self._valid_rows
+        total = len(valid)
+        if N <= 0:
+            return self.view(blosc2.zeros(shape=total, dtype=np.bool_))
+
+        seen = 0  # valid rows counted so far, scanning from the end
+        chunk_len = valid.chunks[0]
+        cut = -1  # physical position of the N-th valid row from the end
+
+        # The chunk infos are listed first so that they can be walked backwards.
+        for info in reversed(list(valid.iterchunks_info())):
+            first = info.nchunk * chunk_len
+            size = min(chunk_len, valid.shape[0] - first)
+
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue  # no valid row in this chunk
+
+            if info.special == blosc2.SpecialValue.VALUE:
+                if not np.frombuffer(info.repeated_value, dtype=valid.dtype)[0]:
+                    continue  # repeated False
+                # Repeated True: every row of the chunk is valid.
+                if seen + size < N:
+                    seen += size
+                    continue
+                cut = first + size - (N - seen)
+                break
+
+            # Mixed chunk: read it and locate its valid rows.
+            hits = np.flatnonzero(valid[first : first + size])
+            if seen + hits.size < N:
+                seen += hits.size
+                continue
+            cut = first + int(hits[-(N - seen)])
+            break
+
+        if cut == -1:
+            return self.view(self._valid_rows)
+
+        # Start from whichever constant array needs the shorter slice assignment.
+        if cut > total // 2:
+            keep = blosc2.zeros(shape=total, dtype=np.bool_)
+            keep[cut:] = True
+        else:
+            keep = blosc2.ones(shape=total, dtype=np.bool_)
+            if cut > 0:
+                keep[:cut] = False
+
+        return self.view((keep & self._valid_rows).compute())
+
+    def __getitem__(self, s: str):
+        """Return a ``Column`` for column name *s*, or ``None`` if there is no such column."""
+        # NOTE(review): a missing name yields None instead of KeyError — confirm intended.
+        return Column(self, s) if s in self._cols else None
+
+    def __getattr__(self, s: str):
+        """Expose columns as attributes; ``_cols`` is read via ``__dict__`` so lookup cannot recurse."""
+        cols = self.__dict__.get("_cols", ())
+        return Column(self, s) if s in cols else super().__getattribute__(s)
+
+    def compact(self):
+        """Move the valid rows to the front of every column and reset the validity mask."""
+        # NOTE(review): views share _cols with their base (see view()); compacting a view
+        # would rewrite the base's storage too — confirm callers only compact base tables.
+        real_poss = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
+        block_size = self._valid_rows.blocks[0]
+        # Block-sized steps; source positions never precede their target, so in-place is safe.
+        for start in range(0, self._n_rows, block_size):
+            end = min(start + block_size, self._n_rows)
+            for v in self._cols.values():
+                v[start:end] = v[real_poss[start:end]]
+        self._valid_rows[: self._n_rows] = True
+        self._valid_rows[self._n_rows :] = False
+
+    @property
+    def nrows(self) -> int:
+        return self._n_rows  # valid-row counter maintained by append()/delete()/extend()
+
+    @property
+    def ncols(self) -> int:
+        return len(self._cols)  # _cols maps column name -> storage array, one entry per column
+
+    def info(self) -> None:
+        """
+        Prints a concise summary of the CTable, including the column names,
+        their data types, and memory layout.
+
+        One line is emitted per column (position, name, item size, dtype),
+        followed by the compressed size, uncompressed size and their ratio.
+        The summary is written to stdout; nothing is returned.
+        """
+
+        def format_bytes(bytes_size: float) -> str:
+            """Format a byte count with a binary unit (B, KB, MB or GB)."""
+            if bytes_size < 1024:
+                return f"{bytes_size} B"
+            for unit, scale in (("KB", 1024), ("MB", 1024**2)):
+                if bytes_size < scale * 1024:
+                    return f"{bytes_size / scale:.2f} {unit}"
+            return f"{bytes_size / (1024**3):.2f} GB"
+
+        # Global memory usage: every column plus the mask of valid rows.
+        arrays = [*self._cols.values(), self._valid_rows]
+        cbytes = sum(arr.cbytes for arr in arrays)
+        nbytes = sum(arr.nbytes for arr in arrays)
+        ratio = (nbytes / cbytes) if cbytes > 0 else 0.0
+
+        # One format string for header, separator and data rows keeps the columns
+        # aligned (a hand-written separator is easy to get one space off).
+        row_fmt = " {:>3}   {:<15} {:<12} {:<15}"
+        lines = [
+            "<class 'CTable'>",
+            f"nºColumns: {len(self._cols)}",
+            f"nºRows: {len(self)}",
+            "",
+            row_fmt.format("#", "Column", "Itemsize", "Dtype"),
+            row_fmt.format("---", "------", "--------", "-----"),
+        ]
+
+        for i, name in enumerate(self.col_names):
+            dtype = self._cols[name].dtype
+            lines.append(row_fmt.format(i, name, f"{dtype.itemsize} B", str(dtype)))
+
+        lines += [
+            "",
+            f"memory usage: {format_bytes(cbytes)}",
+            f"uncompressed size: {format_bytes(nbytes)}",
+            f"compression ratio: {ratio:.2f}x",
+            "",
+        ]
+
+        print("\n".join(lines))
+
+    def append(self, data: list | tuple | np.void | np.ndarray) -> None:
+        """Append one row right after the last valid row, growing the storage when it is full.
+
+        A list or tuple is read positionally (column order); anything else is indexed by
+        column name (e.g. a structured-array record). Raises ``TypeError`` on a view.
+        """
+        if self.base is not None:
+            raise TypeError("Cannot extend view.")
+
+        arr = self._valid_rows
+        chunk_size = arr.chunks[0]
+        last_true_pos = -1
+
+        # Scan the mask chunks from the end; special (constant) chunks are decided from
+        # their metadata, the others are read to locate their last True entry.
+        for info in reversed(list(arr.iterchunks_info())):
+            chunk_start = info.nchunk * chunk_size
+            actual_size = min(chunk_size, arr.shape[0] - chunk_start)
+
+            if info.special == blosc2.SpecialValue.ZERO:
+                continue
+
+            if info.special == blosc2.SpecialValue.VALUE:
+                if not np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]:
+                    continue
+                last_true_pos = chunk_start + actual_size - 1
+                break
+
+            nonzero = np.flatnonzero(arr[chunk_start : chunk_start + actual_size])
+            if len(nonzero) > 0:
+                last_true_pos = chunk_start + int(nonzero[-1])
+                break
+
+        pos = last_true_pos + 1
+        capacity = len(self._valid_rows)
+        if pos >= capacity:
+            # Double the storage; max() keeps it growing when the capacity is 0 (0 * 2 == 0).
+            new_capacity = max(capacity * 2, 1)
+            for v in self._cols.values():
+                v.resize((new_capacity,))
+            self._valid_rows.resize((new_capacity,))
+
+        positional = isinstance(data, (list, tuple))
+        for i, (name, col_array) in enumerate(self._cols.items()):
+            col_array[pos] = data[i] if positional else data[name]
+
+        self._valid_rows[pos] = True
+        self._n_rows += 1
+
+    def delete(self, ind: int | slice | str | Iterable) -> None:
+        """Mark the rows at logical position(s) ``ind`` as deleted; column data is left untouched.
+
+        ``ind`` indexes the valid rows only (holes are skipped). NumPy integer scalars are
+        accepted like ``int``; any other non-iterable (or a string) raises ``TypeError``.
+        """
+        if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
+            ind = list(ind)
+        elif not isinstance(ind, (int, np.integer, slice)):
+            raise TypeError(f"Invalid type '{type(ind)}'")
+
+        new_mask_np = self._valid_rows[:]  # slicing already yields a fresh NumPy array
+        new_mask_np[np.flatnonzero(new_mask_np)[ind]] = False
+
+        self._valid_rows = blosc2.asarray(new_mask_np)
+        # Count on the NumPy copy so that _n_rows stays a plain int, as nrows promises.
+        self._n_rows = int(np.count_nonzero(new_mask_np))
+
+    def extend(self, data: list | CTable | Any) -> None:
+        """Append many rows at once, right after the last valid row.
+
+        ``data`` may be a CTable (only its valid rows are copied), a structured NumPy array
+        or an iterable of row tuples in column order. Raises ``TypeError`` on a view.
+        """
+        if self.base is not None:
+            raise TypeError("Cannot extend view.")
+        if len(data) <= 0:
+            return
+
+        current_col_names = self.col_names
+        if hasattr(data, "_cols") and hasattr(data, "_n_rows"):
+            # Select through the source mask: once rows were deleted, the valid rows are no
+            # longer the first _n_rows physical rows, so a [:_n_rows] slice copies the wrong ones.
+            src_mask = data._valid_rows[:]
+            columns_to_insert = [data._cols[name][:][src_mask] for name in current_col_names]
+            new_nrows = int(np.count_nonzero(src_mask))
+        elif isinstance(data, np.ndarray) and data.dtype.names is not None:
+            columns_to_insert = [data[name] for name in current_col_names]
+            new_nrows = len(data)
+        else:
+            columns_to_insert = list(zip(*data, strict=False))
+            new_nrows = len(data)
+
+        processed_cols = [
+            blosc2.asarray(raw_col, dtype=self._cols[current_col_names[i]].dtype)
+            for i, raw_col in enumerate(columns_to_insert)
+        ]
+
+        valid_pos = np.flatnonzero(self._valid_rows[:])
+        start_pos = int(valid_pos[-1]) + 1 if len(valid_pos) > 0 else 0
+        end_pos = start_pos + new_nrows
+
+        if self.auto_compact and end_pos >= len(self._valid_rows):
+            self.compact()
+            start_pos = int(self._n_rows)  # compact() packs the valid rows into [0, _n_rows)
+            end_pos = start_pos + new_nrows
+
+        capacity = len(self._valid_rows)
+        if end_pos > capacity:
+            # Grow once, by doubling; max() prevents an endless loop when the capacity is 0.
+            new_capacity = max(capacity, 1)
+            while new_capacity < end_pos:
+                new_capacity *= 2
+            for name in current_col_names:
+                self._cols[name].resize((new_capacity,))
+            self._valid_rows.resize((new_capacity,))
+
+        for j, name in enumerate(current_col_names):
+            self._cols[name][start_pos:end_pos] = processed_cols[j][:]
+
+        self._valid_rows[start_pos:end_pos] = True
+        self._n_rows = blosc2.count_nonzero(self._valid_rows)
+
+    @profile
+    def where(self, expr_result) -> CTable:
+        """Return a view of the valid rows where the boolean mask ``expr_result`` is True.
+
+        A mask shorter than the table is padded with False; a longer one is truncated.
+        """
+        if not (
+            isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr))
+            and (getattr(expr_result, "dtype", None) == np.bool_)
+        ):
+            raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(expr_result).__name__}")
+
+        mask = expr_result.compute() if isinstance(expr_result, blosc2.LazyExpr) else expr_result
+        target_len = len(self._valid_rows)
+        if len(mask) != target_len:
+            # Rebuild at the table's length so both operands of `&` below are blosc2 arrays.
+            n = min(len(mask), target_len)
+            resized = blosc2.zeros(target_len, dtype=np.bool_)
+            resized[:n] = mask[:n]
+            mask = resized
+
+        return self.view((mask & self._valid_rows).compute())
+
+    def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable:
+        """Build a view that keeps only the valid rows selected by logical index ``ind``."""
+        current_mask = self._valid_rows[:]
+        # Physical positions of the valid rows; logical index k maps to live_positions[k].
+        live_positions = np.flatnonzero(current_mask)
+
+        selector = ind
+        if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
+            selector = list(ind)
+
+        selected = np.zeros_like(current_mask, dtype=bool)
+        selected[live_positions[selector]] = True
+
+        return self.view(blosc2.asarray(selected))
+
+    """Save & load are blank for now"""
+
+    def save(self, urlpath: str, group: str = "table") -> None: ...  # stub: empty body, nothing is written
+
+    @classmethod  # stub: the body is empty, so this currently returns None despite the annotation
+    def load(cls, urlpath: str, group: str = "table", row_type: type[RowT] | None = None) -> CTable: ...
diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py
new file mode 100644
index 00000000..4f2e450b
--- /dev/null
+++ b/tests/ctable/test_column.py
@@ -0,0 +1,294 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+import blosc2
+
+from blosc2 import CTable
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype  # NumPy dtype carried as Annotated metadata on the RowModel fields
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)  # required, declared non-negative
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)  # NOTE(review): DATA20 goes to 190
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True  # defaults to True
+
+
+DATA20 = [(i, float(i * 10), True) for i in range(20)]
+
+
+# -------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------
+
+
+def test_column_metadata():
+    """dtype correctness, internal reference consistency, and mask defaults."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    # each column reports the dtype declared through NumpyDtype in RowModel
+    assert tabla.id.dtype == np.int64
+    assert tabla.score.dtype == np.float64
+    assert tabla.active.dtype == np.bool_
+    # the Column wrapper references the table's own storage and validity mask
+    assert tabla.id._raw_col is tabla._cols["id"]
+    assert tabla.id._valid_rows is tabla._valid_rows
+
+    # mask is None by default
+    assert tabla.id._mask is None
+    assert tabla.score._mask is None
+
+
+def test_column_getitem_no_holes():
+    """int, slice, and list indexing on a full table."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    col = tabla.id
+
+    # int: negative indices count from the end
+    assert col[0] == 0
+    assert col[5] == 5
+    assert col[19] == 19
+    assert col[-1] == 19
+    assert col[-5] == 15
+
+    # slice returns a Column view
+    assert isinstance(col[0:5], blosc2.Column)
+    assert isinstance(col[10:15], blosc2.Column)
+
+    # list: values come back in the requested order
+    assert list(col[[0, 5, 10, 15]]) == [0, 5, 10, 15]
+    assert list(col[[19, 0, 10]]) == [19, 0, 10]
+
+
+def test_column_getitem_with_holes():
+    """int, slice, and list indexing after deletions."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    tabla.delete([1, 3, 5, 7, 9])
+    col = tabla.id
+    # logical indices skip the deleted rows: the ids left are 0, 2, 4, 6, 8, 10, 11, ..., 19
+    assert col[0] == 0
+    assert col[1] == 2
+    assert col[2] == 4
+    assert col[3] == 6
+    assert col[4] == 8
+    assert col[-1] == 19
+    assert col[-2] == 18
+
+    assert list(col[[0, 2, 4]]) == [0, 4, 8]
+    assert list(col[[5, 3, 1]]) == [10, 6, 2]
+
+    tabla2 = CTable(RowModel, new_data=DATA20)
+    tabla2.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
+    col2 = tabla2.id
+    # only even ids remain; slices select by logical position
+    assert list(col2[0:5].to_numpy()) == [0, 2, 4, 6, 8]
+    assert list(col2[5:10].to_numpy()) == [10, 12, 14, 16, 18]
+    assert list(col2[::2].to_numpy()) == [0, 4, 8, 12, 16]
+
+
+def test_column_getitem_out_of_range():
+    """int and list indexing raise IndexError when out of bounds."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    tabla.delete([1, 3, 5, 7, 9])
+    col = tabla.id
+    # 15 rows remain, so +/-100 is out of range either way
+    with pytest.raises(IndexError):
+        _ = col[100]
+    with pytest.raises(IndexError):
+        _ = col[-100]
+    with pytest.raises(IndexError):
+        _ = col[[0, 1, 100]]
+
+
+def test_column_setitem_no_holes():
+    """int, slice, and list assignment on a full table."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    col = tabla.id
+    # int assignment, including a negative index
+    col[0] = 999
+    assert col[0] == 999
+    col[10] = 888
+    assert col[10] == 888
+    col[-1] = 777
+    assert col[-1] == 777
+    # slice assignment
+    col[0:5] = [100, 101, 102, 103, 104]
+    assert list(col[0:5].to_numpy()) == [100, 101, 102, 103, 104]
+    # list assignment
+    col[[0, 5, 10]] = [10, 50, 100]
+    assert col[0] == 10
+    assert col[5] == 50
+    assert col[10] == 100
+
+
+def test_column_setitem_with_holes():
+    """int, slice, and list assignment after deletions."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    tabla.delete([1, 3, 5, 7, 9])
+    col = tabla.id
+
+    col[0] = 999
+    assert col[0] == 999
+    assert tabla._cols["id"][0] == 999
+    # logical index 2 maps to physical row 4 (physical rows 1 and 3 are deleted)
+    col[2] = 888
+    assert col[2] == 888
+    assert tabla._cols["id"][4] == 888
+
+    col[-1] = 777
+    assert col[-1] == 777
+
+    col[0:3] = [100, 200, 300]
+    assert col[0] == 100
+    assert col[1] == 200
+    assert col[2] == 300
+
+    col[[0, 2, 4]] = [11, 22, 33]
+    assert col[0] == 11
+    assert col[2] == 22
+    assert col[4] == 33
+
+
+def test_column_iter():
+    """Iteration over full table, with odd-index holes, and on score column."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    assert list(tabla.id) == list(range(20))
+
+    tabla2 = CTable(RowModel, new_data=DATA20)
+    tabla2.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
+    assert list(tabla2.id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+    # scores are id * 10, so deleting ids 0, 5, 10, 15 removes 0, 50, 100 and 150
+    tabla3 = CTable(RowModel, new_data=DATA20)
+    tabla3.delete([0, 5, 10, 15])
+    expected_score = [
+        10.0, 20.0, 30.0, 40.0,
+        60.0, 70.0, 80.0, 90.0,
+        110.0, 120.0, 130.0, 140.0,
+        160.0, 170.0, 180.0, 190.0,
+    ]
+    assert list(tabla3.score) == expected_score
+
+
+def test_column_len():
+    """len() after no deletions, partial deletions, cumulative deletions, and cross-column."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    col = tabla.id
+    assert len(col) == 20
+    # a Column obtained before the deletion reflects it afterwards
+    tabla.delete([1, 3, 5, 7, 9])
+    assert len(col) == 15
+    # deletions are cumulative: 20 -> 17 -> 12 rows
+    tabla2 = CTable(RowModel, new_data=DATA20)
+    col2 = tabla2.id
+    tabla2.delete([0, 1, 2])
+    assert len(col2) == 17
+    tabla2.delete([0, 1, 2, 3, 4])
+    assert len(col2) == 12
+
+    data = [(i, float(i * 10), i % 2 == 0) for i in range(10)]
+    tabla3 = CTable(RowModel, new_data=data, expected_size=10)
+    tabla3.delete([0, 1, 5, 6, 9])
+    assert len(tabla3.id) == len(tabla3.score) == len(tabla3.active) == 5
+    for i in range(len(tabla3.id)):
+        assert tabla3.score[i] == float(tabla3.id[i] * 10)
+
+
+def test_column_edge_cases():
+    """Empty table and fully-deleted table both behave as zero-length columns."""
+    tabla = CTable(RowModel)
+    assert len(tabla.id) == 0
+    assert list(tabla.id) == []
+    # deleting every row must behave like the empty table above
+    data = [(i, float(i * 10), True) for i in range(10)]
+    tabla2 = CTable(RowModel, new_data=data)
+    tabla2.delete(list(range(10)))
+    assert len(tabla2.id) == 0
+    assert list(tabla2.id) == []
+
+
+# -------------------------------------------------------------------
+# Tests for Column slice views (mask) and to_numpy()
+# -------------------------------------------------------------------
+
+
+def test_column_slice_returns_view():
+    """Column[slice] returns a Column instance with a non-None mask."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    col = tabla.id
+    # the view keeps a reference to its table and its column name
+    view = col[0:5]
+    assert isinstance(view, blosc2.Column)
+    assert view._mask is not None
+    assert view._table is tabla
+    assert view._col_name == "id"
+
+
+def test_to_array_no_holes():
+    """to_numpy() on a slice view returns correct data on a full table."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    col = tabla.id
+
+    np.testing.assert_array_equal(col[0:5].to_numpy(), np.array([0, 1, 2, 3, 4], dtype=np.int64))
+    np.testing.assert_array_equal(col[5:10].to_numpy(), np.array([5, 6, 7, 8, 9], dtype=np.int64))
+    np.testing.assert_array_equal(col[15:20].to_numpy(), np.array([15, 16, 17, 18, 19], dtype=np.int64))
+    np.testing.assert_array_equal(col[0:20].to_numpy(), np.arange(20, dtype=np.int64))
+
+
+def test_to_array_with_holes():
+    """to_numpy() on a slice view skips deleted rows correctly."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    tabla.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])  # keep evens: 0,2,4,...,18
+    col = tabla.id
+
+    # logical [0:5] → physical rows 0,2,4,6,8
+    np.testing.assert_array_equal(col[0:5].to_numpy(), np.array([0, 2, 4, 6, 8], dtype=np.int64))
+    # logical [5:10] → physical rows 10,12,14,16,18
+    np.testing.assert_array_equal(col[5:10].to_numpy(), np.array([10, 12, 14, 16, 18], dtype=np.int64))
+
+
+def test_to_array_full_column():
+    """to_numpy() on a slice spanning the whole column returns all valid rows."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    tabla.delete([0, 10, 19])
+    col = tabla.id
+
+    expected = np.array([i for i in range(20) if i not in {0, 10, 19}], dtype=np.int64)
+    np.testing.assert_array_equal(col[0:len(col)].to_numpy(), expected)
+
+
+def test_to_array_mask_does_not_include_deleted():
+    """A slice view never yields deleted rows: the slice is taken over the valid rows only."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    # delete rows 2 and 3, which fall inside slice [0:5]
+    tabla.delete([2, 3])
+    col = tabla.id
+
+    # logical [0:5] should now map to physical rows 0,1,4,5,6
+    result = col[0:5].to_numpy()
+    np.testing.assert_array_equal(result, np.array([0, 1, 4, 5, 6], dtype=np.int64))
+
+
+def test_column_view_mask_is_independent():
+    """Two slice views on the same column have independent masks."""
+    tabla = CTable(RowModel, new_data=DATA20)
+    view_a, view_b = tabla.id[0:5], tabla.id[5:10]
+
+    assert view_a._mask is not view_b._mask
+    np.testing.assert_array_equal(view_a.to_numpy(), np.arange(0, 5, dtype=np.int64))
+    np.testing.assert_array_equal(view_b.to_numpy(), np.arange(5, 10, dtype=np.int64))
+
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_compact.py b/tests/ctable/test_compact.py
new file mode 100644
index 00000000..a0eaebb5
--- /dev/null
+++ b/tests/ctable/test_compact.py
@@ -0,0 +1,157 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated
+
+import numpy as np
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+
+# --- Basic model setup for tests ---
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype  # NumPy dtype carried as Annotated metadata on the RowModel fields
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)  # required, declared non-negative
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)  # declared range 0..100
+
+
+def generate_test_data(n_rows: int) -> list:
+    return list(zip(range(n_rows), map(float, range(n_rows))))
+
+
+def test_compact_empty_table():
+    """Test compact() on a completely empty table (no data)."""
+    table = CTable(RowModel, expected_size=100)
+
+    assert len(table) == 0
+
+    # Should not raise any error
+    table.compact()
+
+    # compact() has no rows to move here; the logical table must remain empty
+    assert len(table) == 0
+    # Verify that if data is added later, it works correctly
+    table.append((1, 10.0))
+    assert len(table) == 1
+    assert table.id[0] == 1
+
+
+def test_compact_full_table():
+    """Test compact() on a completely full table (no holes or free space)."""
+    data = generate_test_data(50)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    assert len(table) == 50
+    initial_capacity = len(table._valid_rows)
+
+    # Should not raise any error or change the logical state
+    table.compact()
+
+    assert len(table) == 50
+    # compact() only moves rows and rewrites the mask, so the capacity must be unchanged
+    assert len(table._valid_rows) == initial_capacity
+
+    # Verify data integrity
+    assert table.id[0] == 0
+    assert table.id[-1] == 49
+
+
+def test_compact_already_compacted_table():
+    """Test compact() on a table that has free space but no holes (contiguous data)."""
+    data = generate_test_data(20)
+    # Large expected_size to ensure free space at the end
+    table = CTable(RowModel, new_data=data, expected_size=100)
+
+    assert len(table) == 20
+
+    # Execute compact. The data is already contiguous, so every row is rewritten
+    # onto itself and neither the contents nor the mask may change.
+    table.compact()
+
+    assert len(table) == 20
+
+    # Verify that data remains in place
+    for i in range(20):
+        assert table.id[i] == i
+
+    # Validate that all True values are consecutive at the beginning
+    mask = table._valid_rows[: len(table._valid_rows)]
+    assert np.all(mask[:20])
+    if len(mask) > 20:
+        assert not np.any(mask[20:])
+
+
+def test_compact_with_holes():
+    """Test compact() on a table with high fragmentation (holes)."""
+    data = generate_test_data(30)
+    table = CTable(RowModel, new_data=data, expected_size=50)
+
+    # Delete sparsely: leave only [0, 5, 10, 15, 20, 25]
+    to_delete = [i for i in range(30) if i % 5 != 0]
+    table.delete(to_delete)
+
+    assert len(table) == 6
+
+    # Execute compact: the 6 survivors must end up in physical rows 0..5
+    table.compact()
+
+    assert len(table) == 6
+
+    # Verify that the correct data survived and moved to the beginning
+    expected_ids = [0, 5, 10, 15, 20, 25]
+    for i, exp_id in enumerate(expected_ids):
+        # Through the logical view (Column wrapper)
+        assert table.id[i] == exp_id
+        # Through the physical blosc2 array (to ensure compact worked)
+        assert table._cols["id"][i] == exp_id
+
+    # Verify physical mask: first 6 must be True, the rest False
+    mask = table._valid_rows[: len(table._valid_rows)]
+    assert np.all(mask[:6])
+    if len(mask) > 6:
+        assert not np.any(mask[6:])
+
+
+def test_compact_all_deleted():
+    """Test compact() on a table where absolutely all rows have been deleted."""
+    data = generate_test_data(20)
+    table = CTable(RowModel, new_data=data, expected_size=20)
+
+    # Delete everything
+    table.delete(list(range(20)))
+    assert len(table) == 0
+
+    # No valid rows remain, so compact() has nothing to move and must not fail
+    table.compact()
+
+    assert len(table) == 0
+
+    # Check that we can write to it again
+    table.append((99, 99.0))
+    assert len(table) == 1
+    assert table.id[0] == 99
+
+
+def test_compact_multiple_times():
+    """Calling compact() multiple times in a row must not corrupt data or crash."""
+    data = generate_test_data(10)
+    table = CTable(RowModel, new_data=data, expected_size=20)
+
+    table.delete([1, 3, 5, 7, 9])  # 5 elements remaining
+
+    # Compact 3 times in a row: the repeated calls must leave the data untouched
+    table.compact()
+    table.compact()
+    table.compact()
+
+    assert len(table) == 5
+    assert list(table.id) == [0, 2, 4, 6, 8]
diff --git a/tests/ctable/test_construct.py b/tests/ctable/test_construct.py
new file mode 100644
index 00000000..63810076
--- /dev/null
+++ b/tests/ctable/test_construct.py
@@ -0,0 +1,225 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated, TypeVar
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+import blosc2
+from blosc2 import CTable
+
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype  # NumPy dtype carried as Annotated metadata on the RowModel fields
+
+
+class MaxLen:
+    def __init__(self, length: int):
+        self.length = int(length)  # coerced to int; NOTE(review): MaxLen is not used by any test in this file
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)  # required, declared non-negative
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)  # defaults to 0j
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)  # declared range 0..100
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True  # defaults to True
+
+
+# -------------------------------------------------------------------
+# Predefined Test Data
+# -------------------------------------------------------------------
+SMALL_DATA = [
+    (1, 1 + 2j, 95.5, True),
+    (2, 3 - 4j, 80.0, False),
+    (3, 0j, 50.2, True),
+    (4, -1 + 1j, 12.3, False),
+    (5, 5j, 99.9, True),
+]
+SMALLEST_DATA = SMALL_DATA[:2]
+
+dtype_struct = [("id", "i8"), ("c_val", "c16"), ("score", "f8"), ("active", "?")]
+SMALL_STRUCT = np.array(SMALL_DATA, dtype=dtype_struct)
+
+
+# -------------------------------------------------------------------
+# Validation Utility
+# -------------------------------------------------------------------
+def assert_table_equals_data(table: CTable, expected_data: list):
+    """Assert that ``table`` holds exactly the rows of ``expected_data``, in order."""
+    n_expected, n_actual = len(expected_data), len(table)
+    assert n_actual == n_expected, f"Expected length {n_expected}, got {n_actual}"
+    names = table.col_names
+    for row_no, wanted_row in enumerate(expected_data):
+        fetched = table.row[row_no]
+        for pos, wanted in enumerate(wanted_row):
+            name = names[pos]
+            got = getattr(fetched, name)[0]
+            if not isinstance(wanted, (float, complex)):
+                assert got == wanted, f"Row {row_no}, col {name}: expected {wanted}, got {got}"
+                continue
+            # Floating-point and complex values are compared with a tolerance.
+            err_msg = f"Discrepancy at row {row_no}, col {name}"
+            np.testing.assert_allclose(got, wanted, err_msg=err_msg)
+
+
+# -------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------
+
+
+def test_empty_table_variants():
+    """Empty table: default, with expected_size, and with compact=True."""
+    table = CTable(RowModel)
+    assert len(table) == 0
+    assert table.nrows == 0
+    assert table.ncols == 4
+    for col_name in ["id", "c_val", "score", "active"]:
+        assert col_name in table._cols
+        assert isinstance(table._cols[col_name], blosc2.NDArray)
+    # expected_size sets the initial capacity of every column
+    table_sized = CTable(RowModel, expected_size=5000)
+    assert len(table_sized) == 0
+    assert all(len(col) == 5000 for col in table_sized._cols.values())
+    # the compact= argument is exposed as the auto_compact attribute
+    table_compact = CTable(RowModel, compact=True)
+    assert len(table_compact) == 0
+    assert table_compact.auto_compact is True
+
+
+def test_empty_data_lifecycle():
+    """Create from [], extend with [], then extend with real data."""
+    table = CTable(RowModel, new_data=[])
+    assert len(table) == 0
+    # extending with an empty list must be a no-op
+    table.extend([])
+    assert len(table) == 0
+
+    table.extend(SMALL_DATA)
+    assert_table_equals_data(table, SMALL_DATA)
+
+
+def test_construction_sources():
+    """A list of tuples and the equivalent structured array (SMALL_STRUCT) give identical tables."""
+    assert_table_equals_data(CTable(RowModel, new_data=SMALL_DATA), SMALL_DATA)
+    assert_table_equals_data(CTable(RowModel, new_data=SMALL_STRUCT), SMALL_DATA)
+
+
+def test_expected_size_variants():
+    """expected_size smaller, exact, and larger than the inserted data."""
+    for es in [1, 5]:
+        assert_table_equals_data(CTable(RowModel, new_data=SMALL_DATA, expected_size=es), SMALL_DATA)
+    # a larger expected_size is kept as the capacity of every column
+    table_large = CTable(RowModel, new_data=SMALL_DATA, expected_size=1000)
+    assert_table_equals_data(table_large, SMALL_DATA)
+    assert all(len(col) == 1000 for col in table_large._cols.values())
+
+
+def test_compact_flag():
+    """compact=False and compact=True both preserve data correctly."""
+    table_false = CTable(RowModel, new_data=SMALL_DATA, compact=False)
+    assert table_false.auto_compact is False
+    assert_table_equals_data(table_false, SMALL_DATA)
+    # same data, this time with automatic compaction enabled
+    table_true = CTable(RowModel, new_data=SMALL_DATA, compact=True)
+    assert table_true.auto_compact is True
+    assert_table_equals_data(table_true, SMALL_DATA)
+
+
+def test_append_and_clone():
+    """Build table row by row, then clone it into a new CTable."""
+    table = CTable(RowModel)
+    for row in SMALLEST_DATA:
+        table.append(row)
+    assert_table_equals_data(table, SMALLEST_DATA)
+    # a CTable is itself accepted as new_data and yields a distinct object
+    cloned = CTable(RowModel, new_data=table)
+    assert_table_equals_data(cloned, SMALLEST_DATA)
+    assert table is not cloned
+
+
+def test_invalid_append():
+    """Rows with too few values or with an incompatible value type raise errors."""
+    table = CTable(RowModel, expected_size=1)
+
+    # Too few values → IndexError (NumPy raises natively after simplification)
+    with pytest.raises((IndexError, ValueError)):
+        table.append([1, 1 + 2j, 95.5])  # missing boolean
+
+    # Incompatible type → TypeError or ValueError from NumPy
+    with pytest.raises((TypeError, ValueError)):
+        table.append(["invalid_text", 1 + 2j, 95.5, True])
+
+
+def test_extreme_values():
+    """Extreme complex, float boundary, and large integer values survive a round trip."""
+    extreme_complex = [
+        (1, complex(1e308, -1e308), 50.0, True),
+        (2, complex(0, 0), 0.0, False),
+        (3, complex(-1e308, 1e308), 100.0, True),
+    ]
+    extreme_float = [
+        (1, 0j, 0.0, True),
+        (2, 0j, 100.0, False),
+        (3, 0j, 0.0001, True),
+        (4, 0j, 99.9999, False),
+    ]
+    extreme_int = [
+        (1, 0j, 50.0, True),
+        (2**32, 0j, 50.0, False),
+        (2**60, 0j, 50.0, True),
+    ]
+    for data in [extreme_complex, extreme_float, extreme_int]:
+        assert_table_equals_data(CTable(RowModel, new_data=data), data)
+
+
+def test_extend_append_and_resize():
+    """Auto-resize via append one-by-one, then extend+append beyond initial size."""
+    # Append beyond expected_size triggers resize (5 rows into a capacity of 2)
+    table = CTable(RowModel, expected_size=2)
+    for row in SMALL_DATA:
+        table.append(row)
+    assert_table_equals_data(table, SMALL_DATA)
+    assert all(len(col) >= 5 for col in table._cols.values())
+
+    # Extend beyond expected_size, then append the last row
+    table2 = CTable(RowModel, expected_size=2)
+    table2.extend(SMALL_DATA[:4])
+    assert len(table2) == 4
+    table2.append(SMALL_DATA[4])
+    assert_table_equals_data(table2, SMALL_DATA)
+
+
+def test_column_integrity():
+    """Column access via [] and getattr, and correct dtypes."""
+    table = CTable(RowModel, new_data=SMALL_DATA)
+    # both access styles must hand back a Column wrapper
+    assert isinstance(table["id"], blosc2.ctable.Column)
+    assert isinstance(table.score, blosc2.ctable.Column)
+    # the underlying storage uses the dtypes declared in RowModel
+    assert table._cols["id"].dtype == np.int64
+    assert table._cols["c_val"].dtype == np.complex128
+    assert table._cols["score"].dtype == np.float64
+    assert table._cols["active"].dtype == np.bool_
+
+
+def test_valid_rows():
+    """_valid_rows has exactly 5 True entries after creation and after extend."""
+    table_direct = CTable(RowModel, new_data=SMALL_DATA)
+    assert blosc2.count_nonzero(table_direct._valid_rows) == 5
+    # same check when the rows arrive through extend() on an empty table
+    table_extended = CTable(RowModel)
+    table_extended.extend(SMALL_DATA)
+    assert blosc2.count_nonzero(table_extended._valid_rows) == 5
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_delete_rows.py b/tests/ctable/test_delete_rows.py
new file mode 100644
index 00000000..b6d04f59
--- /dev/null
+++ b/tests/ctable/test_delete_rows.py
@@ -0,0 +1,210 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated, TypeVar
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def generate_test_data(n_rows: int) -> list:
+    return [(i, complex(i, -i), float((i * 7) % 100), bool(i % 2)) for i in range(1, n_rows + 1)]
+
+
+# -------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------
+
+
+def test_delete_single_element():
+    """First, last, middle deletion once; and repeated deletion from front/back."""
+    data = generate_test_data(50)
+
+    # Delete first
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    t.delete(0)
+    assert len(t) == 49
+    assert not t._valid_rows[0]
+
+    # Delete last
+    t2 = CTable(RowModel, new_data=data, expected_size=50)
+    t2.delete(-1)
+    assert len(t2) == 49
+
+    # Delete middle
+    t3 = CTable(RowModel, new_data=data, expected_size=50)
+    t3.delete(25)
+    assert len(t3) == 49
+
+    # Delete first 10 times in a row
+    t4 = CTable(RowModel, new_data=data, expected_size=50)
+    for i in range(10):
+        t4.delete(0)
+        assert len(t4) == 50 - (i + 1)
+    assert len(t4) == 40
+
+    # Delete last 10 times in a row
+    t5 = CTable(RowModel, new_data=data, expected_size=50)
+    for i in range(10):
+        t5.delete(-1)
+        assert len(t5) == 50 - (i + 1)
+    assert len(t5) == 40
+
+
+def test_delete_list_of_positions():
+    """Scattered, consecutive, even, odd, and slice-equivalent list deletions."""
+    data = generate_test_data(50)
+
+    # Scattered
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    t.delete([0, 10, 20, 30, 40])
+    assert len(t) == 45
+
+    # Consecutive block
+    t2 = CTable(RowModel, new_data=data, expected_size=50)
+    t2.delete([5, 6, 7, 8, 9])
+    assert len(t2) == 45
+
+    # All even positions
+    t3 = CTable(RowModel, new_data=data, expected_size=50)
+    t3.delete(list(range(0, 50, 2)))
+    assert len(t3) == 25
+
+    # All odd positions
+    t4 = CTable(RowModel, new_data=data, expected_size=50)
+    t4.delete(list(range(1, 50, 2)))
+    assert len(t4) == 25
+
+    # Slice-equivalent: range(10, 20)
+    t5 = CTable(RowModel, new_data=data, expected_size=50)
+    t5.delete(list(range(10, 20)))
+    assert len(t5) == 40
+
+    # Slice with step: range(0, 20, 2)
+    t6 = CTable(RowModel, new_data=data, expected_size=50)
+    t6.delete(list(range(0, 20, 2)))
+    assert len(t6) == 40
+
+    # First 10 rows
+    t7 = CTable(RowModel, new_data=data, expected_size=50)
+    t7.delete(list(range(0, 10)))
+    assert len(t7) == 40
+
+    # Last 10 rows
+    t8 = CTable(RowModel, new_data=data, expected_size=50)
+    t8.delete(list(range(40, 50)))
+    assert len(t8) == 40
+
+
+def test_delete_out_of_bounds():
+    """All IndexError scenarios: full table, partial table, empty table, negative."""
+    data = generate_test_data(50)
+
+    # Beyond length on full table
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    with pytest.raises(IndexError):
+        t.delete(60)
+    with pytest.raises(IndexError):
+        t.delete(-60)
+
+    # Beyond nrows on partial table (capacity 50, only 25 rows)
+    t2 = CTable(RowModel, new_data=generate_test_data(25), expected_size=50)
+    assert len(t2) == 25
+    with pytest.raises(IndexError):
+        t2.delete(35)
+
+    # Empty table: positions 0, 25, -1 all raise
+    for pos in [0, 25, -1]:
+        empty = CTable(RowModel, expected_size=50)
+        assert len(empty) == 0
+        with pytest.raises(IndexError):
+            empty.delete(pos)
+
+
+def test_delete_edge_cases():
+    """Same position twice, all rows front/back, negative and mixed indices."""
+    data = generate_test_data(50)
+
+    # Same logical position twice: second delete hits what was position 11
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    t.delete(10)
+    assert len(t) == 49
+    t.delete(10)
+    assert len(t) == 48
+
+    # Delete all rows from the front one by one
+    t2 = CTable(RowModel, new_data=data, expected_size=50)
+    for _ in range(50):
+        t2.delete(0)
+    assert len(t2) == 0
+
+    # Delete all rows from the back one by one
+    t3 = CTable(RowModel, new_data=data, expected_size=50)
+    for _ in range(50):
+        t3.delete(-1)
+    assert len(t3) == 0
+
+    # Negative indices list
+    t4 = CTable(RowModel, new_data=data, expected_size=50)
+    t4.delete([-1, -5, -10])
+    assert len(t4) == 47
+
+    # Mixed positive and negative indices
+    t5 = CTable(RowModel, new_data=data, expected_size=50)
+    t5.delete([0, -1, 25])
+    assert len(t5) == 47
+
+
+def test_delete_invalid_types():
+    """String and float raise TypeError; a list containing a string raises IndexError."""
+    data = generate_test_data(50)
+
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    with pytest.raises(TypeError):
+        t.delete("invalid")
+    with pytest.raises(TypeError):
+        t.delete(10.5)
+    with pytest.raises(IndexError):
+        t.delete([0, "invalid", 10])
+
+
+def test_delete_stress():
+    """Large batch deletion and alternating multi-pass pattern."""
+    data = generate_test_data(50)
+
+    # Delete 40 out of 50 at once
+    t = CTable(RowModel, new_data=data, expected_size=50)
+    t.delete(list(range(0, 40)))
+    assert len(t) == 10
+
+    # Alternating two-pass deletion
+    t2 = CTable(RowModel, new_data=data, expected_size=50)
+    t2.delete(list(range(0, 50, 2)))  # delete all even -> 25 remain
+    assert len(t2) == 25
+    t2.delete(list(range(0, 25, 2)))  # delete every other remaining row (13 of 25) -> 12 remain
+    assert len(t2) == 12
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_extend_delete.py b/tests/ctable/test_extend_delete.py
new file mode 100644
index 00000000..42cf974e
--- /dev/null
+++ b/tests/ctable/test_extend_delete.py
@@ -0,0 +1,226 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated, TypeVar
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+
+RowT = TypeVar("RowT", bound=BaseModel)
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def generate_test_data(n_rows: int, start_id: int = 1) -> list:
+    return [(start_id + i, complex(i, -i), float((i * 7) % 100), bool(i % 2)) for i in range(n_rows)]
+
+
+def get_valid_mask(table: CTable) -> np.ndarray:
+    return np.array(table._valid_rows[: len(table._valid_rows)], dtype=bool)
+
+
+def assert_mask_matches(table: CTable, expected_mask: list):
+    actual = get_valid_mask(table)[: len(expected_mask)]
+    np.testing.assert_array_equal(
+        actual,
+        np.array(expected_mask, dtype=bool),
+        err_msg=f"Mask mismatch.\nExpected: {expected_mask}\nGot: {actual}",
+    )
+
+
+def assert_data_at_positions(table: CTable, positions: list, expected_ids: list):
+    for pos, expected_id in zip(positions, expected_ids, strict=False):
+        actual_id = int(table._cols["id"][pos])
+        assert actual_id == expected_id, f"Position {pos}: expected ID {expected_id}, got {actual_id}"
+
+
+# -------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------
+
+
+def test_gap_fill_mask_and_positions():
+    """extend and append fill from last valid position; mask is updated correctly."""
+    # extend after deletions: mask and physical positions
+    t = CTable(RowModel, new_data=generate_test_data(7, 1), expected_size=10)
+    t.delete([0, 2, 4, 6])
+    assert_mask_matches(t, [False, True, False, True, False, True, False])
+    assert len(t) == 3
+    t.extend(generate_test_data(3, 8))
+    assert_mask_matches(t, [False, True, False, True, False, True, True, True, True])
+    assert len(t) == 6
+    assert_data_at_positions(t, [6, 7, 8], [8, 9, 10])
+
+    # append fills from last valid position, not into holes
+    t2 = CTable(RowModel, new_data=generate_test_data(5, 1), expected_size=10)
+    t2.delete([1, 3])
+    assert_mask_matches(t2, [True, False, True, False, True])
+    t2.append((6, 1j, 50.0, True))
+    assert_mask_matches(t2, [True, False, True, False, True, True])
+    t2.append((7, 2j, 60.0, False))
+    assert_mask_matches(t2, [True, False, True, False, True, True, True])
+
+    # extend fills from last valid position when there's enough capacity
+    t3 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15)
+    t3.delete([2, 4, 6])
+    t3.extend(generate_test_data(3, 20))
+    assert_data_at_positions(t3, [10, 11, 12], [20, 21, 22])
+
+
+def test_resize_behavior():
+    """Resize triggered when capacity is full; compact=True avoids massive resize."""
+    # compact=False: append beyond capacity must resize
+    t = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=10, compact=False)
+    t.delete(list(range(9)))
+    assert len(t) == 1
+    initial_cap = len(t._valid_rows)
+    t.append((11, 5j, 75.0, True))
+    assert len(t._valid_rows) > initial_cap
+
+    # compact=True: no massive resize after deletions + extend
+    t2 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=10, compact=True)
+    t2.delete(list(range(9)))
+    assert len(t2) == 1
+    initial_cap2 = len(t2._valid_rows)
+    t2.extend(generate_test_data(3, 11))
+    assert len(t2._valid_rows) <= initial_cap2 * 2
+
+    # extend exceeding capacity always resizes regardless of compact
+    t3 = CTable(RowModel, new_data=generate_test_data(5, 1), expected_size=10, compact=False)
+    t3.delete([0, 2, 4])
+    initial_cap3 = len(t3._valid_rows)
+    t3.extend(generate_test_data(20, 100))
+    assert len(t3._valid_rows) > initial_cap3
+
+
+def test_mixed_append_extend_with_gaps():
+    """Multiple extends, appends, and deletes interleaved; lengths stay correct."""
+    # Multiple extends with intermediate deletions
+    t = CTable(RowModel, expected_size=20)
+    t.extend(generate_test_data(5, 1))
+    t.extend(generate_test_data(3, 10))
+    assert len(t) == 8
+    t.delete([2, 4, 6])
+    assert len(t) == 5
+    t.extend(generate_test_data(2, 20))
+    assert len(t) == 7
+    t.delete([0, 1])
+    assert len(t) == 5
+    t.extend(generate_test_data(4, 30))
+    assert len(t) == 9
+
+    # append + extend mixed, delete all then re-extend
+    t2 = CTable(RowModel, expected_size=20)
+    for i in range(5):
+        t2.append((i + 1, complex(i), float(i * 10), True))
+    assert len(t2) == 5
+    t2.extend(generate_test_data(5, 10))
+    assert len(t2) == 10
+    t2.delete([1, 3, 5, 7, 9])
+    assert len(t2) == 5
+    t2.append((100, 0j, 50.0, False))
+    assert len(t2) == 6
+    t2.extend(generate_test_data(3, 200))
+    assert len(t2) == 9
+
+    # Fill all gaps then extend; delete all then extend from scratch
+    t3 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15)
+    t3.delete(list(range(0, 10, 2)))
+    assert len(t3) == 5
+    t3.extend(generate_test_data(5, 20))
+    assert len(t3) == 10
+
+    t4 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15)
+    t4.delete(list(range(10)))
+    assert len(t4) == 0
+    t4.extend(generate_test_data(5, 100))
+    assert len(t4) == 5
+
+
+def test_compact_behavior():
+    """Manual compact consolidates mask; auto-compact keeps data correct after extend."""
+    # Manual compact: valid rows packed to front, extend fills after them
+    t = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15, compact=False)
+    t.delete([1, 3, 5, 7, 9])
+    assert len(t) == 5
+    t.compact()
+    assert_mask_matches(t, [True] * 5 + [False] * 10)
+    t.extend(generate_test_data(3, 20))
+    assert len(t) == 8
+
+    # Auto-compact: table stays consistent after heavy deletions + extend
+    t2 = CTable(RowModel, new_data=generate_test_data(10, 1), expected_size=15, compact=True)
+    t2.delete(list(range(0, 8)))
+    assert len(t2) == 2
+    t2.extend(generate_test_data(10, 100))
+    assert len(t2) == 12
+
+
+def test_complex_scenarios():
+    """Sparse gaps, alternating cycles, data integrity, and full workflow."""
+    # Sparse table: many scattered deletions then bulk extend
+    t = CTable(RowModel, new_data=generate_test_data(20, 1), expected_size=30)
+    t.delete([0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18])
+    assert len(t) == 5
+    t.extend(generate_test_data(10, 100))
+    assert len(t) == 15
+
+    # Alternating extend/delete cycles
+    t2 = CTable(RowModel, expected_size=50)
+    for cycle in range(5):
+        t2.extend(generate_test_data(10, cycle * 100))
+        current_len = len(t2)
+        if current_len >= 5:
+            t2.delete(list(range(0, min(5, current_len))))
+
+    # Data integrity: correct row values survive delete + extend
+    t3 = CTable(
+        RowModel, new_data=[(1, 1j, 10.0, True), (2, 2j, 20.0, False), (3, 3j, 30.0, True)], expected_size=10
+    )
+    t3.delete(1)
+    assert t3.row[0].id[0] == 1
+    assert t3.row[1].id[0] == 3
+    t3.extend([(10, 10j, 100.0, True), (11, 11j, 100.0, False)])
+    assert t3.row[0].id[0] == 1
+    assert t3.row[1].id[0] == 3
+    assert t3.row[2].id[0] == 10
+    assert t3.row[3].id[0] == 11
+
+    # Full workflow
+    t4 = CTable(RowModel, expected_size=20, compact=False)
+    t4.extend(generate_test_data(10, 1))
+    assert len(t4) == 10
+    t4.delete([0, 2, 4, 6, 8])
+    assert len(t4) == 5
+    t4.append((100, 0j, 50.0, True))
+    t4.append((101, 1j, 60.0, False))
+    assert len(t4) == 7
+    t4.extend(generate_test_data(5, 200))
+    assert len(t4) == 12
+    t4.delete([3, 7, 10])
+    assert len(t4) == 9
+    t4.extend(generate_test_data(3, 300))
+    assert len(t4) == 12
+    assert t4.nrows == 12
+    assert t4.ncols == 4
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])
diff --git a/tests/ctable/test_row_logic.py b/tests/ctable/test_row_logic.py
new file mode 100644
index 00000000..483b7924
--- /dev/null
+++ b/tests/ctable/test_row_logic.py
@@ -0,0 +1,221 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+from typing import Annotated
+
+import numpy as np
+import pytest
+from pydantic import BaseModel, Field
+
+from blosc2 import CTable
+from blosc2.ctable import Column
+
+
+class NumpyDtype:
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+
+class RowModel(BaseModel):
+    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
+    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
+    active: Annotated[bool, NumpyDtype(np.bool_)] = True
+
+
+def generate_test_data(n_rows: int, start_id: int = 0) -> list:
+    return [(start_id + i, float(i * 10), i % 2 == 0) for i in range(n_rows)]
+
+
+# -------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------
+
+
+def test_row_int_indexing():
+    """int indexing: no holes, with holes, negative indices, and out-of-range."""
+    data = generate_test_data(20)
+
+    # No holes: spot checks
+    t = CTable(RowModel, new_data=data)
+    r = t.row[0]
+    assert isinstance(r, CTable)
+    assert len(r) == 1
+    assert r.id[0] == 0
+    assert r.score[0] == 0.0
+    assert r.active[0]
+    assert t.row[10].id[0] == 10
+    assert t.row[10].score[0] == 100.0
+
+    # Negative indices
+    assert t.row[-1].id[0] == 19
+    assert t.row[-5].id[0] == 15
+
+    # With holes: delete positions 1,3,5,7,9 -> remaining ids: 0,2,4,6,8,10,11,12...
+    t.delete([1, 3, 5, 7, 9])
+    assert t.row[0].id[0] == 0
+    assert t.row[1].id[0] == 2
+    assert t.row[5].id[0] == 10
+
+    # Out of range
+    t2 = CTable(RowModel, new_data=generate_test_data(10))
+    for idx in [10, 100, -11]:
+        with pytest.raises(IndexError):
+            _ = t2.row[idx]
+
+
+def test_row_slice_indexing():
+    """Slice indexing: no holes, with holes, step, negative, beyond bounds, empty/full."""
+    data = generate_test_data(20)
+
+    # No holes
+    t = CTable(RowModel, new_data=data)
+    assert isinstance(t.row[0:5], CTable)
+    assert list(t.row[0:5].id) == [0, 1, 2, 3, 4]
+    assert list(t.row[10:15].id) == [10, 11, 12, 13, 14]
+    assert list(t.row[::2].id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+
+    # With step
+    assert list(t.row[0:10:2].id) == [0, 2, 4, 6, 8]
+    assert list(t.row[1:10:3].id) == [1, 4, 7]
+
+    # Negative indices
+    assert list(t.row[-5:].id) == [15, 16, 17, 18, 19]
+    assert list(t.row[-10:-5].id) == [10, 11, 12, 13, 14]
+
+    # With holes: delete positions 1,3,5,7,9 (the first five odd positions)
+    t.delete([1, 3, 5, 7, 9])
+    assert list(t.row[0:5].id) == [0, 2, 4, 6, 8]
+    assert list(t.row[5:10].id) == [10, 11, 12, 13, 14]
+
+    # Beyond bounds
+    t2 = CTable(RowModel, new_data=generate_test_data(10))
+    assert len(t2.row[11:20]) == 0
+    assert list(t2.row[5:100].id) == [5, 6, 7, 8, 9]
+    assert len(t2.row[100:]) == 0
+
+    # Empty and full slices
+    assert len(t2.row[5:5]) == 0
+    assert len(t2.row[0:0]) == 0
+    result = t2.row[:]
+    assert len(result) == 10
+    assert list(result.id) == list(range(10))
+
+
+def test_row_list_indexing():
+    """List indexing: no holes, with holes, out-of-range, edge cases."""
+    data = generate_test_data(20)
+
+    # No holes
+    t = CTable(RowModel, new_data=data)
+    r = t.row[[0, 5, 10, 15]]
+    assert isinstance(r, CTable)
+    assert len(r) == 4
+    assert set(r.id) == {0, 5, 10, 15}
+    assert set(t.row[[19, 0, 10]].id) == {0, 10, 19}
+
+    # With holes: delete [1,3,5,7,9] -> logical 0->id0, 1->id2, 2->id4...
+    t.delete([1, 3, 5, 7, 9])
+    assert set(t.row[[0, 2, 4]].id) == {0, 4, 8}
+    assert set(t.row[[5, 3, 1]].id) == {2, 6, 10}
+
+    # Negative indices in list
+    t2 = CTable(RowModel, new_data=generate_test_data(10))
+    assert set(t2.row[[0, -1, 5]].id) == {0, 5, 9}
+
+    # Single element
+    assert t2.row[[5]].id[0] == 5
+
+    # Duplicate indices -> deduplicated
+    r_dup = t2.row[[5, 5, 5]]
+    assert len(r_dup) == 1
+    assert r_dup.id[0] == 5
+
+    # Empty list
+    assert len(t2.row[[]]) == 0
+
+    # Out of range
+    for bad in [[0, 5, 100], [0, 1, -11]]:
+        with pytest.raises(IndexError):
+            _ = t2.row[bad]
+
+
+def test_row_view_properties():
+    """View metadata, base chain, mask integrity, column liveness, and chained views."""
+    data = generate_test_data(100)
+    tabla0 = CTable(RowModel, new_data=data)
+
+    # Base is None on root table
+    assert tabla0.base is None
+
+    # View properties are shared with parent
+    v = tabla0.row[0:10]
+    assert v.base is tabla0
+    assert v._row_type == tabla0._row_type
+    assert v._cols is tabla0._cols
+    assert v._col_widths == tabla0._col_widths
+    assert v.col_names == tabla0.col_names
+
+    # Read ops on view
+    view = tabla0.row[5:15]
+    assert view.id[0] == 5
+    assert view.score[0] == 50.0
+    assert not view.active[0]
+    assert list(view.id) == list(range(5, 15))
+
+    # Mask integrity
+    assert np.count_nonzero(view._valid_rows[:]) == 10
+
+    # Column is live (points back to its view)
+    col = view.id
+    assert isinstance(col, Column)
+    assert col._table is view
+
+    # Chained views: base always points to immediate parent
+    tabla1 = tabla0.row[:50]
+    assert tabla1.base is tabla0
+    assert len(tabla1) == 50
+
+    tabla2 = tabla1.row[:10]
+    assert tabla2.base is tabla1
+    assert len(tabla2) == 10
+    assert list(tabla2.id) == list(range(10))
+
+    tabla3 = tabla2.row[5:]
+    assert tabla3.base is tabla2
+    assert len(tabla3) == 5
+    assert list(tabla3.id) == [5, 6, 7, 8, 9]
+
+    # Chained view with holes on parent
+    tabla0.delete([5, 10, 15, 20, 25])
+    tv1 = tabla0.row[:30]
+    assert tv1.base is tabla0
+    assert len(tv1) == 30
+    tv2 = tv1.row[10:20]
+    assert tv2.base is tv1
+    assert len(tv2) == 10
+
+
+def test_row_edge_cases():
+    """Empty table, fully-deleted table: int raises IndexError, slice returns empty."""
+    # Empty table
+    empty = CTable(RowModel)
+    with pytest.raises(IndexError):
+        _ = empty.row[0]
+    assert len(empty.row[:]) == 0
+    assert len(empty.row[0:10]) == 0
+
+    # All rows deleted
+    data = generate_test_data(10)
+    t = CTable(RowModel, new_data=data)
+    t.delete(list(range(10)))
+    with pytest.raises(IndexError):
+        _ = t.row[0]
+    assert len(t.row[:]) == 0
+
+
+if __name__ == "__main__":
+    pytest.main(["-v", __file__])

From c05c2ec87271e466c94bdbdbf974f9fd61bf8f70 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 11:21:43 +0100
Subject: [PATCH 02/11] Add a plan for declaring a simple schema for CTable
 objects

---
 plans/ctable-schema.md | 1269 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1269 insertions(+)
 create mode 100644 plans/ctable-schema.md

diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md
new file mode 100644
index 00000000..bed1c6a7
--- /dev/null
+++ b/plans/ctable-schema.md
@@ -0,0 +1,1269 @@
+# CTable Schema Redesign
+
+## Motivation
+
+The current `CTable` prototype in PR #598 uses `pydantic.BaseModel` plus
+`Annotated[...]` metadata to define table schemas.  That works, but it is not the
+best long-term API for a columnar container in `python-blosc2`.
+
+The main issues with the current shape are:
+
+* It mixes row validation concerns with physical storage concerns.
+* It relies on custom metadata objects (`NumpyDtype`, `MaxLen`) embedded in
+  Pydantic annotations.
+* It is verbose for simple schemas.
+* It does not provide an obvious place for NDArray-specific per-column options
+  such as `cparams`, `dparams`, `chunks`, `blocks`, or future indexing hints.
+
+What we want instead is:
+
+* A schema API that is easy to read and write.
+* A place to attach Blosc2-specific per-column configuration.
+* A way to express logical constraints such as `ge=0`, `le=100`, `max_length=10`.
+* Internal validation without forcing the public API to be Pydantic-shaped.
+* A clean distinction between:
+  * logical field type and constraints
+  * physical storage type
+  * per-column storage options
+
+The proposed solution is a **dataclass-first schema API** with **declarative field
+spec objects** and **optional internal Pydantic-backed validation**.
+
+The intended usage style is:
+
+* canonical form for constrained or storage-tuned columns:
+  `id: int = b2.field(b2.int64(ge=0))`
+* shorthand for simple inferred columns:
+  `id: int`
+* not preferred as a primary style:
+  `id = b2.field(b2.int64(ge=0))`
+
+The reason is that the canonical form preserves normal Python type annotations,
+which are valuable for readability, static tooling, and schema inspection.
+
+---
+
+## Proposed public API
+
+### Schema declaration
+
+The intended schema declaration style is:
+
+```python
+from dataclasses import dataclass
+
+import blosc2 as b2
+
+
+@dataclass
+class Row:
+    id: int = b2.field(b2.int64(ge=0))
+    score: float = b2.field(
+        b2.float64(ge=0, le=100),
+        cparams={"codec": b2.Codec.LZ4, "clevel": 5},
+    )
+    active: bool = b2.field(b2.bool(), default=True)
+```
+
+This is the target user-facing API for `CTable`.
+
+This should be documented as the **canonical** schema declaration style.
+
+For simple unconstrained cases, `CTable` may support an inferred shorthand:
+
+```python
+@dataclass
+class Row:
+    id: int
+    score: float
+    active: bool = True
+```
+
+which is interpreted approximately as:
+
+```python
+@dataclass
+class Row:
+    id: int = b2.field(b2.int64())
+    score: float = b2.field(b2.float64())
+    active: bool = b2.field(b2.bool(), default=True)
+```
+
+This shorthand should be limited to simple built-in Python types where the
+mapping is obvious.
+
+### Naming convention
+
+Use **lowercase names** for schema descriptor objects:
+
+* `b2.int64`
+* `b2.float64`
+* `b2.bool`
+* later: `b2.string(max_length=...)`, `b2.bytes(max_length=...)`, `b2.complex128`
+
+Reason:
+
+* `b2.int64(...)` is not just a dtype; it is a schema descriptor with constraints.
+* The lowercase form keeps the API closer in spirit to NumPy and PyTorch.
+* If plain NumPy dtypes are needed, callers can use `np.int64`, `np.float64`,
+  `np.bool_`, etc.
+* `b2.bool(...)` is preferred over `b2.bool_(...)` for readability, even though
+  NumPy uses `bool_`.  This is closer to PyTorch style and fits better for a
+  schema-builder API.
+
+### Field helper
+
+`b2.field(...)` should be the standard way to attach schema metadata to a
+dataclass field.
+
+Expected shape:
+
+```python
+b2.field(
+    b2.float64(ge=0, le=100),
+    default=...,
+    default_factory=...,
+    cparams=...,
+    dparams=...,
+    chunks=...,
+    blocks=...,
+    title=...,
+    description=...,
+    nullable=...,
+)
+```
+
+At minimum, the first version should support:
+
+* `spec`
+* `default`
+* `default_factory`
+* `cparams`
+* `dparams`
+* `chunks`
+* `blocks`
+
+The implementation should store these in `dataclasses.field(metadata=...)`.
+
+The unannotated form:
+
+```python
+id = b2.field(b2.int64(ge=0))
+```
+
+should not be the primary API.  It may be supported later only if there is a
+strong reason, but the preferred style should retain:
+
+* a Python type annotation in the annotation slot
+* `b2.field(...)` in the field/default slot
+
+That keeps the schema aligned with normal dataclass usage.
+
+---
+
+## Core design
+
+### 1. Dataclass is the schema carrier
+
+The dataclass defines:
+
+* field names
+* Python-level row shape
+* user-visible defaults
+
+Example:
+
+```python
+@dataclass
+class Row:
+    id: int = b2.field(b2.int64(ge=0))
+    score: float = b2.field(b2.float64(ge=0, le=100))
+    active: bool = b2.field(b2.bool(), default=True)
+```
+
+This keeps the declaration small and idiomatic.
+
+The Python annotation should remain part of the design, not be replaced by
+`b2.field(...)` alone.  The annotation provides value independently of the
+Blosc2 schema descriptor.
+
+### 2. Schema spec objects are the source of truth
+
+Each lowercase builder object is a lightweight immutable schema descriptor.
+
+Examples:
+
+```python
+b2.int64(ge=0)
+b2.float64(ge=0, le=100)
+b2.bool()
+b2.string(max_length=32)
+b2.bytes(max_length=64)
+```
+
+Each spec object should carry only schema-level metadata, for example:
+
+* logical kind
+* storage dtype
+* numeric constraints (`ge`, `gt`, `le`, `lt`, `multiple_of`)
+* string constraints (`max_length`, `min_length`, `pattern`)
+* nullability
+* maybe logical annotations later (`categorical`, `timezone`, `unit`)
+
+They should **not** directly carry per-column NDArray instance settings such as
+`cparams` or `chunks`; those belong in `b2.field(...)`.
+
+### 3. Column field metadata carries NDArray-specific configuration
+
+`b2.field(...)` metadata should be the place for:
+
+* column storage options
+* per-column compression settings
+* chunk/block tuning
+* persistence options in future versions
+
+This keeps the separation clean:
+
+* `b2.float64(ge=0, le=100)` answers: "what values are valid?"
+* `b2.field(..., cparams=..., chunks=...)` answers: "how is this column stored?"
+
+### 4. Schema compilation step inside CTable
+
+`CTable` should not consume raw dataclass fields repeatedly.  On construction, it
+should compile the row class into an internal schema representation.
+
+For example:
+
+```python
+compiled = CompiledSchema(
+    row_cls=Row,
+    columns=[
+        CompiledColumn(
+            name="id",
+            py_type=int,
+            spec=b2.int64(ge=0),
+            dtype=np.int64,
+            default=MISSING,
+            cparams=...,
+            dparams=...,
+            chunks=...,
+            blocks=...,
+            validator_info=...,
+        ),
+        ...,
+    ],
+    validator_model=...,
+)
+```
+
+This compiled form should drive:
+
+* NDArray creation
+* row validation
+* bulk validation
+* introspection and future serialization
+
+---
+
+## Validation strategy
+
+### Use Pydantic internally, but do not make it the public schema API
+
+Pydantic is a good fit for validation because it is:
+
+* mature
+* well-tested
+* expressive
+* fast enough for row-level operations
+
+However, it should be an **implementation detail**, not the public schema surface.
+
+The public schema should remain:
+
+* dataclass-based
+* Blosc2-specific
+* independent of any one validation library
+
+### Why not use Pydantic as the schema source directly?
+
+Because storage and validation are overlapping but not identical concerns.
+
+Examples:
+
+* `dtype=np.int16` is both logical and physical.
+* `cparams`, `chunks`, `blocks`, `dparams` are not Pydantic concepts.
+* a future column index, bloom filter, or codec hint is not a validation concept.
+
+Therefore, the internal architecture should be:
+
+* user declares a dataclass + `b2.field(...)`
+* `CTable` compiles it into:
+  * storage schema
+  * validation schema
+
+### Row-level validation
+
+For `append(row)` and other row-wise inserts:
+
+* compile a cached internal Pydantic model once per schema
+* validate incoming rows against that model
+* convert the validated row into column values
+
+This is the simplest and safest path.
+
+Expected behavior:
+
+* `table.append(Row(...))`
+* `table.append({"id": 1, "score": 2.0, "active": True})`
+* `table.append((1, 2.0, True))`
+
+All may be accepted, but internally normalized through one validator path.
+
+### Bulk validation
+
+For `extend(...)`, row-by-row Pydantic validation may be too expensive for large
+batches.  Bulk inserts need a separate strategy.
+
+Recommended modes:
+
+* `validate=True`
+  Full validation.  May use row-wise Pydantic validation for smaller inputs and
+  vectorized checks where available.
+* `validate=False`
+  Trust caller, perform dtype coercion only.
+* optional later: `validate="sample"` or `validate="vectorized"`
+
+For numeric and simple string constraints, vectorized checks are preferable when
+possible:
+
+* `ge`, `gt`, `le`, `lt`
+* `max_length`, `min_length`
+* null checks
+* dtype coercion checks
+
+This means the architecture should support both:
+
+* Pydantic row validation
+* vectorized array validation
+
+The compiled schema should expose enough information for both.
+
+### Performance stance
+
+Pydantic should be treated as:
+
+* a strong default for correctness
+* fast enough for row-wise validation
+* not necessarily the fastest choice for large batch validation
+
+This is important because the performance bottleneck for `extend()` is more about
+per-row Python overhead than about Pydantic specifically.
+
+---
+
+## Detailed API proposal
+
+### Schema spec classes
+
+Add schema descriptor classes under `blosc2`, for example:
+
+* `int8`, `int16`, `int32`, `int64`
+* `uint8`, `uint16`, `uint32`, `uint64`
+* `float32`, `float64`
+* `bool`
+* `complex64`, `complex128`
+* `string`
+* `bytes`
+
+Minimal constructor examples:
+
+```python
+b2.int64(ge=0)
+b2.float64(ge=0, le=100)
+b2.string(max_length=32)
+b2.bytes(max_length=64)
+b2.bool()
+```
+
+Internal common fields:
+
+* `dtype`
+* `nullable`
+* `constraints`
+* `python_type`
+
+### Field helper
+
+`b2.field(spec, **kwargs)` should wrap `dataclasses.field(...)` and return the
+resulting `dataclasses.Field` object with Blosc2 metadata attached.
+
+Example metadata layout:
+
+```python
+{
+    "blosc2": {
+        "spec": ...,
+        "cparams": ...,
+        "dparams": ...,
+        "chunks": ...,
+        "blocks": ...,
+    }
+}
+```
+
+This metadata key should be stable and reserved.
+
+### CTable constructor
+
+The desired constructor remains:
+
+```python
+table = b2.CTable(Row)
+```
+
+Optional overrides:
+
+```python
+table = b2.CTable(
+    Row,
+    expected_size=1_000_000,
+    compact=False,
+    validate=True,
+)
+```
+
+`CTable` should detect that `Row` is a dataclass schema and compile it.
+
+### Possible compatibility layer
+
+If needed temporarily, `CTable` may continue accepting the old Pydantic model
+style during a transition period:
+
+```python
+table = b2.CTable(LegacyPydanticRow)
+```
+
+But that should be documented as legacy or transitional once the dataclass API
+lands.
+
+---
+
+## Internal compilation pipeline
+
+### Step 1. Inspect dataclass fields
+
+For each dataclass field:
+
+* field name
+* Python annotation
+* default or default factory
+* Blosc2 metadata from `b2.field(...)`
+
+Reject invalid shapes early:
+
+* missing `b2.field(...)`
+* missing schema spec
+* incompatible Python annotation vs schema spec
+* unsupported defaults
+
+If inferred shorthand is supported, refine the first two rules to:
+
+* either a supported plain annotation, or an explicit `b2.field(...)`
+* if `b2.field(...)` is present, it must contain a schema spec
+
+### Step 2. Build compiled column descriptors
+
+For each field, produce a `CompiledColumn` object containing:
+
+* `name`
+* `py_type`
+* `spec`
+* `dtype`
+* `default`
+* `default_factory`
+* `nullable`
+* `cparams`
+* `dparams`
+* `chunks`
+* `blocks`
+* validation constraints
+
+### Step 3. Derive physical NDArray creation arguments
+
+From the compiled column descriptor, derive:
+
+* `dtype`
+* shape
+* chunks
+* blocks
+* `cparams`
+* `dparams`
+
+This should happen once during table initialization.
+
+### Step 4. Derive validation model
+
+Translate each schema spec into a Pydantic field definition.
+
+Examples:
+
+* `int64(ge=0)` -> integer field with `ge=0`
+* `float64(ge=0, le=100)` -> float field with `ge=0`, `le=100`
+* `string(max_length=32)` -> string field with `max_length=32`
+
+Cache the compiled Pydantic model class per row schema.
+
+### Step 5. Expose introspection hooks
+
+Expose enough metadata for:
+
+* debugging
+* `table.info()`
+* future schema serialization
+* future schema-driven docs and reprs
+
+Possible user-facing hooks later:
+
+* `table.schema`
+* `table.schema.columns`
+* `table.schema.as_dict()`
+
+---
+
+## Handling defaults
+
+Defaults should follow dataclass semantics as closely as possible.
+
+Examples:
+
+```python
+active: bool = b2.field(b2.bool(), default=True)
+tags: list[str] = b2.field(..., default_factory=list)
+```
+
+For the first implementation, keep this conservative:
+
+* support scalar defaults
+* support `default_factory` only if there is a clear use case
+* reject mutable defaults directly
+
+On insert:
+
+* omitted values should be filled from defaults
+* explicit `None` should be accepted only if the field is nullable
+
+---
+
+## Insert semantics
+
+### append()
+
+`append()` should accept a small set of input shapes:
+
+* dataclass row instance
+* dict-like row
+* tuple/list in schema order
+
+Recommended internal path:
+
+1. normalize the input to a field mapping
+2. validate with cached validator model
+3. coerce to final column values
+4. append into underlying NDArrays
+
+### extend()
+
+`extend()` should accept:
+
+* iterable of row objects
+* dict-of-arrays
+* structured NumPy array
+* maybe another `CTable`
+
+Recommended internal path:
+
+1. normalize to column batches where possible
+2. validate according to `validate=` mode
+3. coerce dtypes
+4. write in bulk
+
+For `dict-of-arrays` and structured arrays, vectorized validation should be the
+preferred long-term path.
+
+---
+
+## Per-column NDArray options
+
+One of the main reasons for `b2.field(...)` is that different columns may want
+different storage settings.
+
+Examples:
+
+* a boolean column may want different compression parameters from a float column
+* a high-cardinality string column may need different chunk sizes
+* a metric column may use a specific codec or filter tuning
+
+So the schema system must allow:
+
+```python
+@dataclass
+class Row:
+    id: int = b2.field(b2.int64(ge=0), cparams={"codec": b2.Codec.ZSTD, "clevel": 1})
+    score: float = b2.field(
+        b2.float64(ge=0, le=100), cparams={"codec": b2.Codec.LZ4HC, "clevel": 9}
+    )
+    active: bool = b2.field(b2.bool(), cparams={"codec": b2.Codec.LZ4})
+```
+
+The implementation should define precedence rules clearly:
+
+* column-level options override table defaults
+* table-level options fill in unspecified values
+
+This implies `CTable(...)` may also take default storage options:
+
+```python
+table = b2.CTable(Row, cparams=..., dparams=...)
+```
+
+Column-level overrides should merge against those defaults, not replace them
+blindly.
+
+---
+
+## Compatibility and migration
+
+### Goal
+
+Move toward the dataclass-based schema API without locking the project into the
+current Pydantic-shaped declaration model.
+
+### Migration path
+
+Phase 1:
+
+* introduce schema spec classes and `b2.field(...)`
+* support dataclass schemas in `CTable`
+* keep existing prototype behavior separate
+
+Phase 2:
+
+* add row validation via cached internal Pydantic model
+* add bulk validation modes
+* document the dataclass schema API as preferred
+
+Phase 3:
+
+* optionally add a compatibility adapter for existing Pydantic models
+* deprecate ad hoc `Annotated[...]` metadata conventions if they remain exposed
+
+### Non-goal
+
+Do not make the first implementation solve every possible schema feature.  The
+first goal is to get the schema shape and internal architecture right.
+
+---
+
+## Serialization implications
+
+Even if `save()` / `load()` are not implemented yet, this schema design should
+anticipate persistence.
+
+Eventually a persisted `CTable` will need to store:
+
+* column names
+* logical schema descriptors
+* per-column defaults
+* per-column NDArray storage options
+* maybe validation constraints
+
+That argues strongly for having a stable compiled schema representation early.
+
+The compiled schema should be serializable to:
+
+* JSON-compatible metadata
+* or a small msgpack payload
+
+The public dataclass itself does not need to be serialized directly.  Only the
+compiled schema matters for persistence.
+
+---
+
+## Open questions
+
+### 1. Should Python annotations be required to match the schema spec?
+
+Example:
+
+```python
+id: int = b2.field(b2.int64(ge=0))
+```
+
+Recommended answer: yes, broadly, with sensible compatibility rules.
+
+Allowed:
+
+* `int` with `int64`
+* `float` with `float64`
+* `bool` with `bool`
+
+Potentially allowed later:
+
+* `str` with `string`
+* `bytes` with `bytes`
+
+Reject obviously inconsistent declarations early.
+
+In other words:
+
+* `id: int = b2.field(b2.int64(ge=0))` is good
+* `id: int` is acceptable shorthand for inferred `b2.int64()`
+* `id = b2.field(b2.int64(ge=0))` is not the preferred style because it drops
+  the Python annotation
+
+### 2. Where should nullability live?
+
+Recommended answer: on the schema spec.
+
+Example:
+
+```python
+name: str | None = b2.field(b2.string(max_length=32, nullable=True))
+```
+
+The Python annotation and schema spec should agree.
+
+### 3. Should `b2.field()` require a spec?
+
+Recommended answer: yes for the first version.
+
+Allowing `b2.field(default=True)` without a spec means we must infer too much
+from the Python annotation and lose clarity.
+
+This still allows fully inferred fields that do not use `b2.field(...)` at all:
+
+```python
+active: bool = True
+```
+
+but once `b2.field(...)` is used, it should carry an explicit schema spec.
+
+### 4. How much should Pydantic-specific behavior leak?
+
+Recommended answer: as little as possible.
+
+Users should not need to know whether validation is backed by Pydantic,
+vectorized NumPy checks, or another mechanism.
+
+---
+
+## Concrete implementation sequence
+
+This section turns the design into a proposed execution order with concrete
+files, class names, and function signatures.
+
+### Step 1: add schema descriptor primitives
+
+Create a new module:
+
+* `src/blosc2/schema.py`
+
+Primary contents:
+
+```python
+from __future__ import annotations
+
+from dataclasses import MISSING, Field as DataclassField, field as dc_field
+from typing import Any
+
+import numpy as np
+```
+
+Proposed public classes and functions:
+
+```python
+class SchemaSpec:
+    dtype: np.dtype
+    python_type: type[Any]
+    nullable: bool
+
+    def to_pydantic_kwargs(self) -> dict[str, Any]: ...
+    def to_metadata_dict(self) -> dict[str, Any]: ...
+
+
+class int64(SchemaSpec):
+    def __init__(
+        self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False
+    ): ...
+
+
+class float64(SchemaSpec):
+    def __init__(
+        self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False
+    ): ...
+
+
+class bool(SchemaSpec):
+    def __init__(self, *, nullable: bool = False): ...
+
+
+class string(SchemaSpec):
+    def __init__(
+        self, *, min_length=None, max_length=None, pattern=None, nullable: bool = False
+    ): ...
+
+
+class bytes(SchemaSpec):
+    def __init__(self, *, min_length=None, max_length=None, nullable: bool = False): ...
+
+
+def field(
+    spec: SchemaSpec,
+    *,
+    default=MISSING,
+    default_factory=MISSING,
+    cparams: dict[str, Any] | None = None,
+    dparams: dict[str, Any] | None = None,
+    chunks: tuple[int, ...] | None = None,
+    blocks: tuple[int, ...] | None = None,
+    title: str | None = None,
+    description: str | None = None,
+) -> DataclassField: ...
+```
+
+Internal helper constants:
+
+```python
+BLOSC2_FIELD_METADATA_KEY = "blosc2"
+```
+
+Notes:
+
+* Start with only the spec classes needed for the first `CTable` iteration:
+  `int64`, `float64`, `bool`.
+* Add `string` and `bytes` only if needed in the same slice of work.
+* Avoid over-generalizing the first implementation.
+
+### Step 2: add schema compiler and compiled representations
+
+Create a new module:
+
+* `src/blosc2/schema_compiler.py`
+
+Primary internal dataclasses:
+
+```python
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass(slots=True)
+class ColumnConfig:
+    cparams: dict[str, Any] | None
+    dparams: dict[str, Any] | None
+    chunks: tuple[int, ...] | None
+    blocks: tuple[int, ...] | None
+    title: str | None
+    description: str | None
+
+
+@dataclass(slots=True)
+class CompiledColumn:
+    name: str
+    py_type: Any
+    spec: Any
+    dtype: np.dtype
+    default: Any
+    default_factory: Any
+    config: ColumnConfig
+
+
+@dataclass(slots=True)
+class CompiledSchema:
+    row_cls: type[Any]
+    columns: list[CompiledColumn]
+    columns_by_name: dict[str, CompiledColumn]
+    validator_model: type[Any] | None = None
+```
+
+Primary internal functions:
+
+```python
+def compile_schema(row_cls: type[Any]) -> CompiledSchema: ...
+def infer_spec_from_annotation(annotation: Any, default: Any = MISSING) -> Any: ...
+def validate_annotation_matches_spec(annotation: Any, spec: Any) -> None: ...
+def get_blosc2_field_metadata(dc_field) -> dict[str, Any] | None: ...
+```
+
+Behavior:
+
+* accept a dataclass type only
+* for explicit `b2.field(...)`, read the spec from metadata
+* for inferred fields like `id: int`, derive `b2.int64()`
+* reject unsupported annotations early
+* normalize all defaults/config into `CompiledSchema`
+
+### Step 3: export the schema API from `blosc2`
+
+Update:
+
+* `src/blosc2/__init__.py`
+
+Exports to add (`bytes` and `string` only once those specs are implemented):
+
+```python
+from .schema import bool, bytes, field, float64, int64, string
+```
+
+And in `__all__`:
+
+```python
+"bool",
+"bytes",
+"field",
+"float64",
+"int64",
+"string",
+```
+
+Notes:
+
+* Be careful with `bool` and `bytes` in `__init__.py` because they shadow
+  builtins within the module namespace.  That is acceptable if done deliberately,
+  but it should be reviewed explicitly.
+* If shadowing proves too awkward internally, keep the implementation names
+  private and re-export the public names only.
+
+### Step 4: refactor `CTable` to consume compiled schemas
+
+Update:
+
+* `src/blosc2/ctable.py`
+
+Primary constructor signature:
+
+```python
+class CTable(Generic[RowT]):
+    def __init__(
+        self,
+        row_type: type[RowT],
+        new_data=None,
+        *,
+        expected_size: int = 1_048_576,
+        compact: bool = False,
+        validate: bool = True,
+        cparams: dict[str, Any] | None = None,
+        dparams: dict[str, Any] | None = None,
+    ) -> None: ...
+```
+
+New internal state:
+
+```python
+self._schema: CompiledSchema
+self._validate: bool
+self._table_cparams: dict[str, Any] | None
+self._table_dparams: dict[str, Any] | None
+```
+
+New internal helper methods:
+
+```python
+def _init_columns(self, expected_size: int) -> None: ...
+def _resolve_column_storage(self, col: CompiledColumn) -> dict[str, Any]: ...
+def _normalize_row_input(self, data: Any) -> dict[str, Any]: ...
+def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: ...
+```
+
+Behavior changes:
+
+* replace direct inspection of `row_type.model_fields`
+* build columns from `self._schema.columns`
+* derive column dtypes from compiled schema
+* merge table-level and field-level storage settings
+
+### Step 5: implement row validation adapter
+
+Create a new internal module:
+
+* `src/blosc2/schema_validation.py`
+
+Primary functions:
+
+```python
+from typing import Any
+
+
+def build_validator_model(schema: CompiledSchema) -> type[Any]: ...
+def validate_row(schema: CompiledSchema, row: dict[str, Any]) -> dict[str, Any]: ...
+def validate_rows_rowwise(
+    schema: CompiledSchema, rows: list[dict[str, Any]]
+) -> list[dict[str, Any]]: ...
+```
+
+Behavior:
+
+* build and cache a Pydantic model per compiled schema
+* map `SchemaSpec` constraints into Pydantic field definitions
+* return normalized Python values ready for storage coercion
+
+Implementation note:
+
+* Cache the generated validator model on `CompiledSchema.validator_model`.
+* Keep all Pydantic-specific logic isolated in this module.
+
+### Step 6: wire validation into `append()`
+
+Update:
+
+* `src/blosc2/ctable.py`
+
+Target signatures:
+
+```python
+def append(self, data: Any) -> None: ...
+def _append_validated_row(self, row: dict[str, Any]) -> None: ...
+```
+
+Concrete behavior:
+
+1. normalize incoming row shape
+2. if `self._validate` is true, validate via `schema_validation.validate_row`
+3. coerce to storage values
+4. append into column NDArrays
+
+Inputs to support in the first cut:
+
+* dataclass row instance
+* dict
+* tuple/list in schema order
+
+Inputs that can wait until later if needed:
+
+* structured NumPy scalar
+* Pydantic model instance
+
+### Step 7: add `extend(..., validate=...)`
+
+Update:
+
+* `src/blosc2/ctable.py`
+
+Proposed signature:
+
+```python
+def extend(self, data: Any, *, validate: bool | None = None) -> None: ...
+```
+
+Supporting internal helpers:
+
+```python
+def _normalize_rows_input(
+    self, data: Any
+) -> tuple[list[dict[str, Any]] | None, dict[str, Any] | None]: ...
+def _extend_rowwise(self, rows: list[dict[str, Any]], *, validate: bool) -> None: ...
+def _extend_columnwise(self, columns: dict[str, Any], *, validate: bool) -> None: ...
+```
+
+First implementation target:
+
+* support iterable of rows via `_extend_rowwise`
+* preserve correctness first, optimize later
+
+Second implementation target:
+
+* add `_extend_columnwise` for structured arrays and dict-of-arrays
+* add vectorized validation for simple constraints
+
+### Step 8: add vectorized validation helpers
+
+Create a new internal module:
+
+* `src/blosc2/schema_vectorized.py`
+
+Primary functions:
+
+```python
+from typing import Any
+
+
+def validate_column_values(col: CompiledColumn, values: Any) -> None: ...
+def validate_column_batch(schema: CompiledSchema, columns: dict[str, Any]) -> None: ...
+```
+
+Initial checks to support:
+
+* numeric `ge`, `gt`, `le`, `lt`
+* string and bytes `min_length`, `max_length`
+* nullability
+* dtype compatibility after coercion
+
+This module should remain optional in the first PR if the rowwise path is enough
+to land the architecture cleanly.
+
+### Step 9: add schema introspection to `CTable`
+
+Update:
+
+* `src/blosc2/ctable.py`
+
+Proposed property:
+
+```python
+@property
+def schema(self) -> CompiledSchema: ...
+```
+
+Optional helper methods:
+
+```python
+def schema_dict(self) -> dict[str, Any]: ...
+def column_schema(self, name: str) -> CompiledColumn: ...
+```
+
+Goal:
+
+* make the new schema layer visible and debuggable
+* provide a stable base for future save/load work
+
+### Step 10: add tests in focused modules
+
+Add:
+
+* `tests/ctable/test_schema_specs.py`
+* `tests/ctable/test_schema_compiler.py`
+* `tests/ctable/test_schema_validation.py`
+* `tests/ctable/test_ctable_dataclass_schema.py`
+
+Test scope by file:
+
+`tests/ctable/test_schema_specs.py`
+
+* spec construction
+* dtype mapping
+* metadata export
+
+`tests/ctable/test_schema_compiler.py`
+
+* explicit `b2.field(...)`
+* inferred shorthand from plain annotations
+* annotation/spec mismatch rejection
+* defaults handling
+
+`tests/ctable/test_schema_validation.py`
+
+* Pydantic validator generation
+* constraint enforcement
+* nullable vs non-nullable behavior
+
+`tests/ctable/test_ctable_dataclass_schema.py`
+
+* `CTable(Row)` construction
+* append with dataclass/dict/tuple
+* extend with iterable of rows
+* per-column `cparams` override plumbing
+
+### Step 11: keep the legacy prototype isolated during transition
+
+Short-term implementation choice:
+
+* if the current `ctable.py` prototype is still in active flux, prefer landing
+  the schema/compiler modules first and then refactoring `CTable` over them
+* do not expand the old Pydantic-specific schema path further
+
+Possible follow-up helper:
+
+```python
+def compile_legacy_pydantic_schema(row_cls: type[Any]) -> CompiledSchema: ...
+```
+
+But only add that if compatibility becomes necessary.
+
+### Step 12: persistence groundwork
+
+No need to implement `save()` / `load()` immediately, but define serialization
+hooks on the schema side now.
+
+Add to `CompiledSchema` or a related helper:
+
+```python
+def schema_to_dict(schema: CompiledSchema) -> dict[str, Any]: ...
+def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: ...
+```
+
+This should remain internal until the persisted format is stable.
+
+### Step 13: delivery order across PRs
+
+Recommended PR slicing:
+
+PR 1:
+
+* `src/blosc2/schema.py`
+* `src/blosc2/schema_compiler.py`
+* exports in `src/blosc2/__init__.py`
+* tests for schema specs and compiler
+
+PR 2:
+
+* `CTable` constructor refactor to use compiled schema
+* `append()` row normalization
+* row-wise validation module
+* `tests/ctable/test_ctable_dataclass_schema.py`
+
+PR 3:
+
+* `extend(..., validate=...)`
+* vectorized validation helpers
+* schema introspection property
+* more tests for batch validation and overrides
+
+PR 4:
+
+* persistence groundwork
+* optional compatibility adapter for legacy Pydantic model declarations
+
+### Step 14: concrete first-PR checklist
+
+The smallest coherent first implementation should be:
+
+1. add `src/blosc2/schema.py`
+2. add `src/blosc2/schema_compiler.py`
+3. export `field`, `int64`, `float64`, `bool`
+4. add tests for:
+   * explicit field specs
+   * inferred shorthand
+   * mismatch rejection
+5. stop there
+
+That first PR gives the project:
+
+* the public schema vocabulary
+* the internal compiled representation
+* confidence in the canonical API shape
+
+before touching much of the `CTable` mutation logic.
+
+---
+
+## Recommendation
+
+The recommended direction is:
+
+1. Make **dataclasses** the public schema declaration mechanism for `CTable`.
+2. Introduce **lowercase schema spec objects** such as `b2.int64(...)`.
+3. Use **`b2.field(...)`** to carry both the schema spec and per-column NDArray
+   configuration.
+4. Compile the schema once into an internal representation.
+5. Use **Pydantic internally for row validation**, but keep it hidden behind the
+   Blosc2 schema API.
+6. Add a separate **bulk validation path** for large inserts so `extend()` does
+   not depend entirely on per-row Pydantic validation.
+
+This design gives the project:
+
+* a cleaner user API
+* a better place for columnar storage configuration
+* a clear boundary between schema, validation, and storage
+* flexibility to evolve validation internals later
+* a strong base for future persistence and schema introspection

From 725c28bfe9d0b613c1de1d324d1793f1e5e3e46e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 11:25:16 +0100
Subject: [PATCH 03/11] Add pydantic as a new dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 6244b0d9..36f42bfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
     "ndindex",
     "msgpack",
     "numexpr>=2.14.1; platform_machine != 'wasm32'",
+    "pydantic",
     "requests",
 ]
 version = "4.1.1.dev0"

From 0efd45049d7314452475468420e4ae47ede5e54c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 11:35:43 +0100
Subject: [PATCH 04/11] Fix small formatting issues

---
 bench/ctable/extend.py      |  1 -
 src/blosc2/__init__.py      |  2 +-
 src/blosc2/ctable.py        | 15 +++++++--------
 tests/ctable/test_column.py |  7 ++++---
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/bench/ctable/extend.py b/bench/ctable/extend.py
index f294b012..b691c03e 100644
--- a/bench/ctable/extend.py
+++ b/bench/ctable/extend.py
@@ -114,4 +114,3 @@ class RowModel(BaseModel):
 print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}")
 print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {t_from_list / t_from_np:>17.2f}x")
 print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {t_from_list / t_from_ctable:>17.2f}x")
-
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
index 8afc0653..11ae5677 100644
--- a/src/blosc2/__init__.py
+++ b/src/blosc2/__init__.py
@@ -596,7 +596,7 @@ def _raise(exc):
 """
 
 # Delayed imports for avoiding overwriting of python builtins
-from .ctable import CTable, Column
+from .ctable import Column, CTable
 from .ndarray import (
     abs,
     acos,
diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
index 3acb1731..9af63267 100644
--- a/src/blosc2/ctable.py
+++ b/src/blosc2/ctable.py
@@ -200,8 +200,6 @@ def __getitem__(self, key: int | slice | list | np.ndarray):
             pos_true = _find_physical_index(self._valid_rows, key)
             return self._raw_col[int(pos_true)]
 
-
-
         elif isinstance(key, slice):
             real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
             start, stop, step = key.indices(len(real_pos))
@@ -209,14 +207,13 @@ def __getitem__(self, key: int | slice | list | np.ndarray):
             if step == 1:
                 phys_start = real_pos[start]
                 phys_stop = real_pos[stop - 1]
-                mask[phys_start: phys_stop + 1] = True
+                mask[phys_start : phys_stop + 1] = True
             else:
                 lindices = np.arange(start, stop, step)
                 phys_indices = real_pos[lindices]
                 mask[phys_indices[:]] = True
             return Column(self._table, self._col_name, mask=mask)
 
-
         elif isinstance(key, (list, tuple, np.ndarray)):
             real_pos = blosc2.where(self._valid_rows, np.arange(len(self._valid_rows))).compute()
             phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64)
@@ -264,11 +261,11 @@ def __iter__(self):
                 val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0]
                 if not val:
                     continue
-                yield from self._raw_col[chunk_start: chunk_start + actual_size]
+                yield from self._raw_col[chunk_start : chunk_start + actual_size]
                 continue
 
-            mask_chunk = arr[chunk_start: chunk_start + actual_size]
-            data_chunk = self._raw_col[chunk_start: chunk_start + actual_size]
+            mask_chunk = arr[chunk_start : chunk_start + actual_size]
+            data_chunk = self._raw_col[chunk_start : chunk_start + actual_size]
             yield from data_chunk[mask_chunk]
 
     def __len__(self):
@@ -308,7 +305,9 @@ def to_numpy(self):
 
 
 class CTable(Generic[RowT]):
-    def __init__(self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False) -> None:
+    def __init__(
+        self, row_type: type[RowT], new_data=None, expected_size: int = 1_048_576, compact: bool = False
+    ) -> None:
         self._row_type = row_type
         self._cols: dict[str, blosc2.NDArray] = {}
         self._n_rows: int = 0
diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py
index 4f2e450b..60f15a5d 100644
--- a/tests/ctable/test_column.py
+++ b/tests/ctable/test_column.py
@@ -10,8 +10,8 @@
 import numpy as np
 import pytest
 from pydantic import BaseModel, Field
-import blosc2
 
+import blosc2
 from blosc2 import CTable
 
 
@@ -171,12 +171,14 @@ def test_column_iter():
 
     tabla3 = CTable(RowModel, new_data=DATA20)
     tabla3.delete([0, 5, 10, 15])
+    # fmt: off
     expected_score = [
         10.0, 20.0, 30.0, 40.0,
         60.0, 70.0, 80.0, 90.0,
         110.0, 120.0, 130.0, 140.0,
         160.0, 170.0, 180.0, 190.0,
     ]
+    # fmt: on
     assert list(tabla3.score) == expected_score
 
 
@@ -264,7 +266,7 @@ def test_to_array_full_column():
     col = tabla.id
 
     expected = np.array([i for i in range(20) if i not in {0, 10, 19}], dtype=np.int64)
-    np.testing.assert_array_equal(col[0:len(col)].to_numpy(), expected)
+    np.testing.assert_array_equal(col[0 : len(col)].to_numpy(), expected)
 
 
 def test_to_array_mask_does_not_include_deleted():
@@ -289,6 +291,5 @@ def test_column_view_mask_is_independent():
     np.testing.assert_array_equal(view_a.to_numpy(), np.arange(0, 5, dtype=np.int64))
 
 
-
 if __name__ == "__main__":
     pytest.main(["-v", __file__])

From f504ad0670e3afca2213f4f465185d260a88ae8c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 12:01:38 +0100
Subject: [PATCH 05/11] Simplify the plan for ctable schema

---
 plans/ctable-schema.md | 52 ++++++------------------------------------
 1 file changed, 7 insertions(+), 45 deletions(-)

diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md
index bed1c6a7..cacac6d8 100644
--- a/plans/ctable-schema.md
+++ b/plans/ctable-schema.md
@@ -122,14 +122,10 @@ Expected shape:
 b2.field(
     b2.float64(ge=0, le=100),
     default=...,
-    default_factory=...,
     cparams=...,
     dparams=...,
     chunks=...,
     blocks=...,
-    title=...,
-    description=...,
-    nullable=...,
 )
 ```
 
@@ -137,7 +133,6 @@ At minimum for the first version:
 
 * `spec`
 * `default`
-* `default_factory`
 * `cparams`
 * `dparams`
 * `chunks`
@@ -388,7 +383,6 @@ b2.bool()
 Internal common fields:
 
 * `dtype`
-* `nullable`
 * `constraints`
 * `python_type`
 
@@ -480,8 +474,6 @@ For each field, produce a `CompiledColumn` object containing:
 * `spec`
 * `dtype`
 * `default`
-* `default_factory`
-* `nullable`
 * `cparams`
 * `dparams`
 * `chunks`
@@ -538,19 +530,16 @@ Examples:
 
 ```python
 active: bool = b2.field(b2.bool(), default=True)
-tags: list[str] = b2.field(..., default_factory=list)
 ```
 
 For the first implementation, keep this conservative:
 
 * support scalar defaults
-* support `default_factory` only if there is a clear use case
 * reject mutable defaults directly
 
 On insert:
 
 * omitted values should be filled from defaults
-* explicit `None` should be accepted only if the field is nullable
 
 ---
 
@@ -721,19 +710,7 @@ In other words:
 * `id = b2.field(b2.int64(ge=0))` is not the preferred style because it drops
   the Python annotation
 
-### 2. Where should nullability live?
-
-Recommended answer: on the schema spec.
-
-Example:
-
-```python
-name: str | None = b2.field(b2.string(max_length=32, nullable=True))
-```
-
-The Python annotation and schema spec should agree.
-
-### 3. Should `b2.field()` require a spec?
+### 2. Should `b2.field()` require a spec?
 
 Recommended answer: yes for the first version.
 
@@ -748,7 +725,7 @@ active: bool = True
 
 but once `b2.field(...)` is used, it should carry an explicit schema spec.
 
-### 4. How much should Pydantic-specific behavior leak?
+### 3. How much should Pydantic-specific behavior leak?
 
 Recommended answer: as little as possible.
 
@@ -785,49 +762,39 @@ Proposed public classes and functions:
 class SchemaSpec:
     dtype: np.dtype
     python_type: type[Any]
-    nullable: bool
 
     def to_pydantic_kwargs(self) -> dict[str, Any]: ...
     def to_metadata_dict(self) -> dict[str, Any]: ...
 
 
 class int64(SchemaSpec):
-    def __init__(
-        self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False
-    ): ...
+    def __init__(self, *, ge=None, gt=None, le=None, lt=None): ...
 
 
 class float64(SchemaSpec):
-    def __init__(
-        self, *, ge=None, gt=None, le=None, lt=None, nullable: bool = False
-    ): ...
+    def __init__(self, *, ge=None, gt=None, le=None, lt=None): ...
 
 
 class bool(SchemaSpec):
-    def __init__(self, *, nullable: bool = False): ...
+    def __init__(self): ...
 
 
 class string(SchemaSpec):
-    def __init__(
-        self, *, min_length=None, max_length=None, pattern=None, nullable: bool = False
-    ): ...
+    def __init__(self, *, min_length=None, max_length=None, pattern=None): ...
 
 
 class bytes(SchemaSpec):
-    def __init__(self, *, min_length=None, max_length=None, nullable: bool = False): ...
+    def __init__(self, *, min_length=None, max_length=None): ...
 
 
 def field(
     spec: SchemaSpec,
     *,
     default=MISSING,
-    default_factory=MISSING,
     cparams: dict[str, Any] | None = None,
     dparams: dict[str, Any] | None = None,
     chunks: tuple[int, ...] | None = None,
     blocks: tuple[int, ...] | None = None,
-    title: str | None = None,
-    description: str | None = None,
 ) -> DataclassField: ...
 ```
 
@@ -863,8 +830,6 @@ class ColumnConfig:
     dparams: dict[str, Any] | None
     chunks: tuple[int, ...] | None
     blocks: tuple[int, ...] | None
-    title: str | None
-    description: str | None
 
 
 @dataclass(slots=True)
@@ -874,7 +839,6 @@ class CompiledColumn:
     spec: Any
     dtype: np.dtype
     default: Any
-    default_factory: Any
     config: ColumnConfig
 
 
@@ -1095,7 +1059,6 @@ Initial checks to support:
 
 * numeric `ge`, `gt`, `le`, `lt`
 * string and bytes `min_length`, `max_length`
-* nullability
 * dtype compatibility after coercion
 
 This module should remain optional in the first PR if the rowwise path is enough
@@ -1154,7 +1117,6 @@ Test scope by file:
 
 * Pydantic validator generation
 * constraint enforcement
-* nullable vs non-nullable behavior
 
 `tests/ctable/test_ctable_dataclass_schema.py`
 

From 46bf2e310e12a5fea0b554f644b8e3263d156226 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 12:05:10 +0100
Subject: [PATCH 06/11] Disable wheel generation for each commit in this branch

---
 .github/workflows/cibuildwheels.yml | 4 ++--
 .github/workflows/wasm.yml          | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
index ed69f764..f7e2bef0 100644
--- a/.github/workflows/cibuildwheels.yml
+++ b/.github/workflows/cibuildwheels.yml
@@ -26,6 +26,7 @@ env:
 jobs:
 
   build_wheels:
+    if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
     name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }}
     runs-on: ${{ matrix.runs-on || matrix.os }}
     permissions:
@@ -128,10 +129,9 @@ jobs:
 
 
   upload_pypi:
+    if: ${{ (github.ref_name != 'ctable3' && github.head_ref != 'ctable3') && startsWith(github.event.ref, 'refs/tags') }}
     needs: [ build_wheels]
     runs-on: ubuntu-latest
-    # Only upload wheels when tagging (typically a release)
-    if: startsWith(github.event.ref, 'refs/tags')
     steps:
       - uses: actions/download-artifact@v8
         with:
diff --git a/.github/workflows/wasm.yml b/.github/workflows/wasm.yml
index f54afad5..3d293274 100644
--- a/.github/workflows/wasm.yml
+++ b/.github/workflows/wasm.yml
@@ -14,6 +14,7 @@ env:
 
 jobs:
   build_wheels_wasm:
+    if: ${{ github.ref_name != 'ctable3' && github.head_ref != 'ctable3' }}
     name: Build and test wheels for WASM on ${{ matrix.os }} for ${{ matrix.p_ver }}
     runs-on: ubuntu-latest
     permissions:

From 43bf562727d727acd6d120a1ff7139043d99d83f Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 13:29:57 +0100
Subject: [PATCH 07/11] Add a new plan on CTable persistence

---
 plans/ctable-persistency.md | 536 ++++++++++++++++++++++++++++++++++++
 plans/ctable-schema.md      |  29 +-
 2 files changed, 564 insertions(+), 1 deletion(-)
 create mode 100644 plans/ctable-persistency.md

diff --git a/plans/ctable-persistency.md b/plans/ctable-persistency.md
new file mode 100644
index 00000000..a2ff6db2
--- /dev/null
+++ b/plans/ctable-persistency.md
@@ -0,0 +1,536 @@
+# CTable Persistency Plan
+
+## Goal
+
+Add persistent `CTable` support on top of `TreeStore` while keeping the public
+API simple:
+
+* in-memory tables when `urlpath is None`
+* persistent tables when `urlpath` is provided
+
+The first persistency iteration should support:
+
+* creating a persistent table
+* opening an existing persistent table
+* reading rows, columns, and views from persisted tables
+* appending rows
+
+The first persistency iteration should **not** promise:
+
+* full schema evolution
+* dropping columns
+* renaming columns
+* transactional multi-entry updates
+
+For now, the supported schema evolution story is:
+
+* append rows only
+
+---
+
+## Storage layout
+
+Each persisted `CTable` lives under a table root inside a `TreeStore`.
+
+Confirmed layout:
+
+* `table_root/_meta`
+* `table_root/_valid_rows`
+* `table_root/_cols/<name>`
+
+Example:
+
+* `people/_meta`
+* `people/_valid_rows`
+* `people/_cols/id`
+* `people/_cols/score`
+* `people/_cols/active`
+
+Rationale:
+
+* `_meta` holds mutable metadata in `vlmeta`
+* `_valid_rows` is real table data and should be stored as a normal persisted array
+* `_cols/<name>` stores one persisted NDArray per column
+
+The underscore-prefixed names form the internal namespace for a table root and
+must be treated as reserved.
+
+---
+
+## `_meta` entry
+
+`_meta` should be a small serialized `SChunk` used primarily to hold mutable
+`vlmeta`.
+
+This is preferable to immutable metalayers because:
+
+* we may want to evolve metadata over time
+* multiple `CTable` objects may live in the same `TreeStore`
+* schema and table metadata should be updateable without rewriting the entire table
+
+For the first version:
+
+* `tree_store["<table_root>/_meta"].vlmeta["kind"] = "ctable"`
+* `tree_store["<table_root>/_meta"].vlmeta["version"] = 1`
+* `tree_store["<table_root>/_meta"].vlmeta["schema"] = {...}`
+
+This gives `open()` a minimal, reliable contract for introspection.
+
+---
+
+## Schema persistence format
+
+The schema should be stored as JSON-compatible data in:
+
+* `tree_store["<table_root>/_meta"].vlmeta["schema"]`
+
+The schema document should be versioned and explicit.
+
+Recommended shape:
+
+```python
+{
+    "version": 1,
+    "columns": [
+        {
+            "name": "id",
+            "py_type": "int",
+            "spec": {"kind": "int64", "ge": 0},
+            "default": None,
+        },
+        {
+            "name": "score",
+            "py_type": "float",
+            "spec": {"kind": "float64", "ge": 0, "le": 100},
+            "default": None,
+        },
+        {
+            "name": "active",
+            "py_type": "bool",
+            "spec": {"kind": "bool"},
+            "default": True,
+        },
+    ],
+}
+```
+
+Notes:
+
+* `columns` must be an ordered list, not a dict.
+* The order of the list is the source of truth for column order.
+* Do not rely on dict ordering or TreeStore iteration order.
+* The schema JSON should capture logical schema information only.
+
+For the first version, do **not** duplicate:
+
+* per-column `cparams`
+* per-column `dparams`
+* array chunk/block layout
+* `expected_size`
+* compaction settings
+
+Those can be introspected directly from the stored arrays when needed.
+
+---
+
+## `_valid_rows` persistence
+
+`_valid_rows` should be stored as a normal persisted boolean NDArray under:
+
+* `table_root/_valid_rows`
+
+This is the correct representation because `_valid_rows` is:
+
+* table data, not metadata
+* potentially large
+* used in normal row visibility semantics
+* already aligned with current delete/view/compaction logic
+
+Do not encode `_valid_rows` into schema JSON or small metadata blobs.
+
+---
+
+## Column persistence
+
+Each column should be stored as its own persisted NDArray under:
+
+* `table_root/_cols/<name>`
+
+This means:
+
+* each column can be opened independently
+* column-level array settings remain attached to the actual stored array
+* persistence layout matches the internal columnar design cleanly
+
+The schema JSON provides the logical order and type constraints; the arrays under
+`_cols` provide the physical stored data.
+
+---
+
+## Constructor semantics
+
+The recommended constructor shape is:
+
+```python
+table = b2.CTable(
+    Row,
+    urlpath=None,
+    mode="a",
+    expected_size=1_048_576,
+    compact=False,
+    validate=True,
+)
+```
+
+Semantics:
+
+* `urlpath is None`
+  create an in-memory `CTable`
+* `urlpath is not None`
+  use persistent storage rooted at that path
+
+Recommended `mode` meanings:
+
+* `mode="w"`
+  create a new persistent table, overwriting any existing table root if the API
+  already supports that pattern elsewhere
+* `mode="a"`
+  open existing or create new
+* `mode="r"`
+  open existing read-only table
+
+The important public signal is:
+
+* `urlpath` chooses persistence
+* `mode` chooses creation/open behavior
+
+Users should not need to pass a `TreeStore` object explicitly for the common path.
+
+---
+
+## `open()` support
+
+An explicit `open()` API should be supported.
+
+Recommended shape:
+
+```python
+table = b2.open(urlpath)
+```
+
+or, if needed for clarity:
+
+```python
+table = b2.CTable.open(urlpath, mode="r")
+```
+
+For `open()` to detect a persisted `CTable`, it should inspect:
+
+* `urlpath/_meta`
+* `vlmeta["kind"]` on `urlpath/_meta`
+
+If:
+
+* `_meta` exists
+* `vlmeta["kind"] == "ctable"`
+
+then the object should be recognized as a persisted `CTable`.
+
+This keeps `urlpath` simple: it points to the table root, and `_meta` provides
+the type marker and schema.
+
+---
+
+## Multiple tables in one TreeStore
+
+The design must support multiple `CTable` objects in the same `TreeStore`.
+
+That is one reason `_meta` is a good choice:
+
+* each table root has its own `_meta`
+* each table root can be introspected independently
+* schema metadata is naturally scoped to one table subtree
+
+Example shared TreeStore:
+
+* `users/_meta`
+* `users/_valid_rows`
+* `users/_cols/id`
+* `orders/_meta`
+* `orders/_valid_rows`
+* `orders/_cols/order_id`
+
+No additional global registry is required in the first version.
+
+---
+
+## Column name validation
+
+Column name validation should be explicit and should be shared between:
+
+* in-memory `CTable`
+* persistent `CTable`
+
+Reason:
+
+* a schema should not be valid in memory and then fail only when persisted
+
+Recommended first-version constraints for column names:
+
+* must be a non-empty string
+* must not contain `/`
+* must not start with `_`
+* must not collide with reserved internal names
+
+Reserved internal names for the table root layout:
+
+* `_meta`
+* `_valid_rows`
+* `_cols`
+
+This validation should happen during schema compilation, not only during
+persistent-table creation.
+
+---
+
+## Column order
+
+Column order should be preserved explicitly in the schema JSON.
+
+The source of truth is:
+
+* the order of `schema["columns"]`
+
+Do not rely on:
+
+* dict ordering as a persistence contract
+* lexical ordering of `_cols/<name>`
+* TreeStore iteration order
+
+On load:
+
+* reconstruct `table.col_names` from the schema list order
+* rebuild any name-to-column map separately
+
+---
+
+## Read-only mode
+
+When `mode="r"`:
+
+Allowed:
+
+* opening the table
+* reading rows
+* reading columns
+* creating non-mutating views
+* `head()`, `tail()`, filtering, and other read-only operations
+
+Disallowed:
+
+* `append()` and `extend()`
+* `delete()`
+* `compact()`
+* any operation that mutates stored arrays or metadata
+
+These should fail immediately with a clear error.
+
+If some existing view path currently requires mutation internally, that should be
+cleaned up rather than weakening the read-only contract.
+
+---
+
+## Failure model
+
+The first persistency version does not need full transactional semantics.
+
+Be explicit in the implementation and docs:
+
+* updates touching multiple entries are not guaranteed to be atomic
+* partial writes are possible if a failure occurs mid-update
+
+That is acceptable for the first version as long as it is not hidden.
+
+The initial goal is a correct and understandable persistent layout, not a full
+transaction layer.
+
+---
+
+## Internal API sketch
+
+This is a proposed internal storage split, not a final public API requirement.
+
+Possible internal helpers:
+
+```python
+class TableStorage:
+    def open_column(self, name: str): ...
+    def create_column(
+        self,
+        name: str,
+        *,
+        dtype,
+        shape,
+        chunks=None,
+        blocks=None,
+        cparams=None,
+        dparams=None
+    ): ...
+    def open_valid_rows(self): ...
+    def create_valid_rows(
+        self, *, shape, chunks=None, blocks=None, cparams=None, dparams=None
+    ): ...
+    def load_schema(self) -> dict: ...
+    def save_schema(self, schema: dict) -> None: ...
+    def exists(self) -> bool: ...
+    def is_read_only(self) -> bool: ...
+
+
+class InMemoryTableStorage(TableStorage): ...
+
+
+class TreeStoreTableStorage(TableStorage): ...
+```
+
+Then `CTable` can route based on `urlpath`:
+
+* `urlpath is None` -> `InMemoryTableStorage`
+* `urlpath is not None` -> `TreeStoreTableStorage`
+
+This keeps persistence a backend concern instead of scattering TreeStore logic
+throughout all of `CTable`.
+
+---
+
+## Concrete implementation sequence
+
+### Step 1: extend constructor/open signatures
+
+Update `src/blosc2/ctable.py` to accept:
+
+```python
+class CTable:
+    def __init__(
+        self,
+        row_type,
+        new_data=None,
+        *,
+        urlpath: str | None = None,
+        mode: str = "a",
+        expected_size: int = 1_048_576,
+        compact: bool = False,
+        validate: bool = True,
+    ) -> None: ...
+```
+
+And add:
+
+```python
+@classmethod
+def open(cls, urlpath: str, *, mode: str = "r") -> "CTable": ...
+```
+
+### Step 2: add storage backend abstraction
+
+Create a new module:
+
+* `src/blosc2/ctable_storage.py`
+
+Add:
+
+* `TableStorage`
+* `InMemoryTableStorage`
+* `TreeStoreTableStorage`
+
+### Step 3: implement TreeStore layout helpers
+
+In `TreeStoreTableStorage`, add helpers for:
+
+* `_meta` path
+* `_valid_rows` path
+* `_cols/<name>` paths
+* reading/writing `vlmeta["kind"]`
+* reading/writing `vlmeta["version"]`
+* reading/writing `vlmeta["schema"]`
+
+### Step 4: persist schema JSON
+
+Connect compiled schema export/import to `_meta.vlmeta["schema"]`.
+
+The schema compiler work should provide:
+
+```python
+def schema_to_dict(schema: CompiledSchema) -> dict: ...
+def schema_from_dict(data: dict) -> CompiledSchema: ...
+```
+
+### Step 5: create/open persistent arrays
+
+Wire `CTable` initialization so that:
+
+* create path creates `_meta`, `_valid_rows`, and `_cols/<name>`
+* open path loads schema first, then opens `_valid_rows` and columns
+
+### Step 6: enforce read-only behavior
+
+Add an internal read-only flag so mutating methods fail early when opened with
+`mode="r"`.
+
+Methods to guard first:
+
+* `append`
+* `extend`
+* `delete`
+* `compact`
+
+### Step 7: test persistency layout and round-trips
+
+Add tests covering:
+
+* create persistent `CTable`
+* reopen persistent `CTable`
+* schema JSON present in `_meta.vlmeta`
+* `_valid_rows` persisted correctly
+* column order preserved after reopen
+* multiple tables inside one TreeStore
+* read-only mode errors on mutation
+
+---
+
+## Proposed tests
+
+Suggested test file:
+
+* `tests/ctable/test_persistency.py`
+
+Suggested test cases:
+
+* `test_create_persistent_ctable_layout`
+* `test_open_persistent_ctable`
+* `test_schema_saved_in_meta_vlmeta`
+* `test_valid_rows_persisted`
+* `test_column_order_roundtrip`
+* `test_multiple_ctables_in_same_treestore`
+* `test_read_only_mode_rejects_mutation`
+
+---
+
+## Recommendation
+
+The recommended persistency design is:
+
+1. use `urlpath` to switch between in-memory and persistent `CTable`
+2. store one table per TreeStore subtree
+3. use:
+   * `_meta`
+   * `_valid_rows`
+   * `_cols/<name>`
+4. store schema JSON in `_meta.vlmeta["schema"]`
+5. store explicit markers in `_meta.vlmeta`:
+   * `"kind": "ctable"`
+   * `"version": 1`
+6. preserve column order in the schema JSON as an ordered `columns` list
+7. keep the first version limited to append-row persistence, not full schema evolution
+
+This gives `CTable` a clear persistent layout, keeps `open()` introspection
+simple, and stays consistent with the existing columnar design.
diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md
index cacac6d8..d9cd3fb1 100644
--- a/plans/ctable-schema.md
+++ b/plans/ctable-schema.md
@@ -1155,6 +1155,17 @@ def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: ...
 
 This should remain internal until the persisted format is stable.
 
+The persistency design itself is specified in:
+
+* [ctable-persistency.md](ctable-persistency.md)
+
+The schema-layer contract for persistency is:
+
+* schema must serialize to a versioned JSON-compatible dict
+* column order must be preserved explicitly in the serialized `columns` list
+* the serialized schema must be sufficient to reconstruct `CompiledSchema`
+  without requiring the original Python dataclass definition at load time
+
 ### Step 13: delivery order across PRs
 
 Recommended PR slicing:
@@ -1182,9 +1193,18 @@ PR 3:
 
 PR 4:
 
-* persistence groundwork
+* persistence groundwork on the schema side
 * optional compatibility adapter for legacy Pydantic model declarations
 
+PR 5:
+
+* TreeStore-backed persistency as described in
+  [ctable-persistency.md](ctable-persistency.md)
+* `urlpath` / `mode` constructor semantics
+* explicit `open()` support
+* `_meta`, `_valid_rows`, `_cols/<name>` storage layout
+* persistency tests
+
 ### Step 14: concrete first-PR checklist
 
 The smallest coherent first implementation should be:
@@ -1206,6 +1226,13 @@ That first PR gives the project:
 
 before touching too much `CTable` mutation logic.
 
+After that first PR lands, follow the later phases in this order:
+
+1. dataclass-driven `CTable` construction and append path
+2. validation and batch-insert behavior
+3. schema introspection
+4. TreeStore-backed persistency
+
 ---
 
 ## Recommendation

From e84f7ac6a6ec8dd53252b1833a9e574e51830c3b Mon Sep 17 00:00:00 2001
From: jorge <al426671@uji.es>
Date: Thu, 26 Mar 2026 13:29:11 +0100
Subject: [PATCH 08/11] _

---
 src/blosc2/ctable.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
index 9af63267..ff5ab440 100644
--- a/src/blosc2/ctable.py
+++ b/src/blosc2/ctable.py
@@ -78,6 +78,23 @@ def __getitem__(self, item):
         return self._table._run_row_logic(item)
 
 
+class _Row:
+    def __init__(self, table: CTable, nrow: int):
+        self._table = table
+        self._nrow = nrow
+        self._real_pos = None
+
+    def _get_real_pos(self) -> int:
+        self._real_pos = _find_physical_index(self._table._valid_rows, self._nrow)
+        return self._real_pos
+
+    def __getitem__(self, col_name: str):
+        if self._real_pos is None:
+            self._get_real_pos()
+        return self._table._cols[col_name][self._real_pos]
+
+
+
 def _resolve_field_dtype(field) -> tuple[np.dtype, int]:
     """Return (numpy dtype, display_width) for a pydantic model field.
 
@@ -380,6 +397,10 @@ def __str__(self):
     def __len__(self):
         return self._n_rows
 
+    def __iter__(self):
+        for i in range(self.nrows):
+            yield _Row(self, i)
+
     def view(self, new_valid_rows):
         if not (
             isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr))

From 8de1870ba1cc926065898495446b59e431214bea Mon Sep 17 00:00:00 2001
From: jorge <al426671@uji.es>
Date: Thu, 26 Mar 2026 13:53:16 +0100
Subject: [PATCH 09/11] _

---
 src/blosc2/ctable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py
index ff5ab440..ddd2719c 100644
--- a/src/blosc2/ctable.py
+++ b/src/blosc2/ctable.py
@@ -94,7 +94,6 @@ def __getitem__(self, col_name: str):
         return self._table._cols[col_name][self._real_pos]
 
 
-
 def _resolve_field_dtype(field) -> tuple[np.dtype, int]:
     """Return (numpy dtype, display_width) for a pydantic model field.
 
@@ -205,6 +204,7 @@ def _raw_col(self):
     def _valid_rows(self):
         if self._mask is None:
             return self._table._valid_rows
+
         return (self._table._valid_rows & self._mask).compute()
 
     def __getitem__(self, key: int | slice | list | np.ndarray):

From a8db18d17c9334f278a5a0bd9ebbed71aed1f1b3 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 26 Mar 2026 13:55:00 +0100
Subject: [PATCH 10/11] Testing

---
 test-remove.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 test-remove.md

diff --git a/test-remove.md b/test-remove.md
new file mode 100644
index 00000000..e69de29b

From ce656072d699dcb0434d3d7c2cd72cd347f6bf9d Mon Sep 17 00:00:00 2001
From: jorge <al426671@uji.es>
Date: Thu, 26 Mar 2026 13:57:06 +0100
Subject: [PATCH 11/11] written test

---
 test-remove.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test-remove.md b/test-remove.md
index e69de29b..038d718d 100644
--- a/test-remove.md
+++ b/test-remove.md
@@ -0,0 +1 @@
+testing