From e4cb0f4d74a0b363e97e00397771172fd8c2e6da Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 19 Mar 2026 06:29:53 +0000
Subject: [PATCH 01/24] Add read_files method to DataPortalDataset and
 DataPortalProject
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a read_files(pattern, file_format=None, **kwargs) method to both
DataPortalDataset and DataPortalProject. The method accepts a standard
glob pattern string (e.g. '*.csv', 'data/**/*.tsv.gz'), filters dataset
files using PurePath.match, and yields (DataPortalFile, content) tuples.

File format is auto-detected from the extension, case-insensitively and
ignoring a trailing .gz/.bz2/.xz/.zst compression suffix (.csv/.tsv →
DataFrame, .h5ad → AnnData, anything else → str), or can be specified
explicitly. Parsing kwargs are forwarded to the underlying read method
(e.g. sep='\t' for read_csv), except for h5ad, whose reader is called
without arguments. Project-level read_files delegates to each dataset in
turn.

https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU
---
 cirro/sdk/dataset.py     |  85 ++++++++++++++++++++-
 cirro/sdk/project.py     |  55 +++++++++++++-
 tests/test_read_files.py | 160 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 298 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_read_files.py

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 10a76aa1..b5a9424c 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -1,12 +1,13 @@
 import datetime
 from pathlib import Path
-from typing import Union, List, Optional
+from typing import Union, List, Optional, Iterator, Tuple, Any
 
 from cirro_api_client.v1.api.processes import validate_file_requirements
 from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
     RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, Executor, ValidateFileRequirementsRequest
 
 from cirro.cirro_client import CirroApi
+from cirro.file_utils import filter_files_by_pattern
 from cirro.models.assets import DatasetAssets
 from cirro.models.file import PathLike
 from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
@@ -17,6 +18,37 @@
 from cirro.sdk.process import DataPortalProcess
 
 
+def _infer_file_format(path: str) -> str:
+    """Infer the file format from the file extension."""
+    path_lower = path.lower()
+    for ext in ('.gz', '.bz2', '.xz', '.zst'):
+        if path_lower.endswith(ext):
+            path_lower = path_lower[:-len(ext)]
+            break
+    if path_lower.endswith('.csv') or path_lower.endswith('.tsv'):
+        return 'csv'
+    elif path_lower.endswith('.h5ad'):
+        return 'h5ad'
+    else:
+        return 'text'
+
+
+def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any:
+    """Read a file using the specified format, or auto-detect from extension."""
+    if file_format is None:
+        file_format = _infer_file_format(file.relative_path)
+    if file_format == 'csv':
+        return file.read_csv(**kwargs)
+    elif file_format == 'h5ad':
+        return file.read_h5ad()
+    elif file_format == 'text':
+        return file.read(**kwargs)
+    else:
+        raise DataPortalInputError(
+            f"Unsupported file_format: '{file_format}'. Supported values: 'csv', 'h5ad', 'text'"
+        )
+
+
 class DataPortalDataset(DataPortalAsset):
     """
     Datasets in the Data Portal are collections of files which have
@@ -199,6 +231,57 @@ def list_files(self) -> DataPortalFiles:
             ]
         )
 
+    def read_files(
+            self,
+            pattern: str,
+            file_format: str = None,
+            **kwargs
+    ) -> Iterator[Tuple[DataPortalFile, Any]]:
+        """
+        Read the contents of files in the dataset matching the given glob pattern.
+
+        Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``).
+        ``*`` matches any sequence of characters within a single path segment;
+        ``**`` matches zero or more path segments.
+
+        Args:
+            pattern (str): Glob pattern used to match file paths within the dataset
+                (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``)
+            file_format (str): File format used to parse each file. Supported values:
+
+                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
+                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
+                - ``'text'``: read as plain text, returns a ``str``
+                - ``None`` (default): infer from file extension
+                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``)
+            **kwargs: Additional keyword arguments forwarded to the file-parsing function.
+                For ``'csv'`` format these are passed to :func:`pandas.read_csv`
+                (e.g., ``sep='\\t'`` for TSV files).
+                For ``'text'`` format these are passed to
+                :meth:`~cirro.sdk.file.DataPortalFile.read`.
+
+        Yields:
+            Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file,
+            where *content* type depends on *file_format*.
+
+        Example:
+            ```python
+            # Read all CSV files in a dataset
+            for file, df in dataset.read_files('*.csv'):
+                print(file.relative_path, df.shape)
+
+            # Read gzip-compressed TSV files using explicit format and separator
+            for file, df in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
+                print(file.relative_path, df.shape)
+
+            # Read plain-text log files
+            for file, text in dataset.read_files('logs/*.log', file_format='text'):
+                print(file.relative_path, text[:200])
+            ```
+        """
+        for file in filter_files_by_pattern(list(self.list_files()), pattern):
+            yield file, _read_file_with_format(file, file_format, **kwargs)
+
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
         Get the artifact of a particular type from the dataset
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index 099872b9..7633de56 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -1,6 +1,6 @@
 from functools import cache
 from time import sleep
-from typing import List, Union
+from typing import List, Union, Iterator, Tuple, Any
 
 from cirro_api_client.v1.models import Project, UploadDatasetRequest, Dataset, Sample, Tag, Status
 
@@ -9,6 +9,7 @@
 from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 from cirro.sdk.dataset import DataPortalDataset, DataPortalDatasets
 from cirro.sdk.exceptions import DataPortalAssetNotFound, DataPortalInputError
+from cirro.sdk.file import DataPortalFile
 from cirro.sdk.helpers import parse_process_name_or_id
 from cirro.sdk.process import DataPortalProcess
 from cirro.sdk.reference import DataPortalReference, DataPortalReferences
@@ -235,6 +236,58 @@ def samples(self, max_items: int = 10000) -> List[Sample]:
         """
         return self._client.metadata.get_project_samples(self.id, max_items)
 
+    def read_files(
+            self,
+            pattern: str,
+            file_format: str = None,
+            **kwargs
+    ) -> Iterator[Tuple[DataPortalFile, Any]]:
+        """
+        Read the contents of files across all datasets in the project that match
+        the given glob pattern.
+
+        Iterates over every dataset in the project and yields matching files from
+        each one in turn. See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
+        for full details on pattern matching and format options.
+
+        Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``).
+        ``*`` matches any sequence of characters within a single path segment;
+        ``**`` matches zero or more path segments.
+
+        Args:
+            pattern (str): Glob pattern used to match file paths within each dataset
+                (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``)
+            file_format (str): File format used to parse each file. Supported values:
+
+                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
+                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
+                - ``'text'``: read as plain text, returns a ``str``
+                - ``None`` (default): infer from file extension
+                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``)
+            **kwargs: Additional keyword arguments forwarded to the file-parsing function.
+                For ``'csv'`` format these are passed to :func:`pandas.read_csv`
+                (e.g., ``sep='\\t'`` for TSV files).
+                For ``'text'`` format these are passed to
+                :meth:`~cirro.sdk.file.DataPortalFile.read`.
+
+        Yields:
+            Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file
+            across all datasets, where *content* type depends on *file_format*.
+
+        Example:
+            ```python
+            # Read all CSV files across every dataset in a project
+            for file, df in project.read_files('*.csv'):
+                print(file.relative_path, df.shape)
+
+            # Read gzip-compressed TSV files with explicit separator
+            for file, df in project.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
+                print(file.relative_path, df.shape)
+            ```
+        """
+        for dataset in self.list_datasets():
+            yield from dataset.read_files(pattern, file_format=file_format, **kwargs)
+
 
 class DataPortalProjects(DataPortalAssets[DataPortalProject]):
     """Collection of DataPortalProject objects"""
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
new file mode 100644
index 00000000..05f4b39f
--- /dev/null
+++ b/tests/test_read_files.py
@@ -0,0 +1,160 @@
+import unittest
+from unittest.mock import Mock, patch, MagicMock
+
+from cirro.models.file import File, FileAccessContext
+from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format
+from cirro.sdk.exceptions import DataPortalInputError
+from cirro.sdk.file import DataPortalFile, DataPortalFiles
+
+
+def _make_mock_file(relative_path: str, content: bytes = b'') -> DataPortalFile:
+    """Create a DataPortalFile with a mocked _get method."""
+    access_context = Mock(spec=FileAccessContext)
+    file = File(relative_path=relative_path, size=len(content), access_context=access_context)
+    client = Mock()
+    client.file.get_file.return_value = content
+    portal_file = DataPortalFile(file=file, client=client)
+    return portal_file
+
+
+def _make_dataset_with_files(files: list) -> DataPortalDataset:
+    """Create a DataPortalDataset whose list_files() returns the given DataPortalFile list."""
+    dataset_data = Mock()
+    dataset_data.id = 'ds-1'
+    dataset_data.project_id = 'proj-1'
+    dataset_data.name = 'Test Dataset'
+
+    client = Mock()
+    dataset = DataPortalDataset(dataset=dataset_data, client=client)
+    dataset.list_files = Mock(return_value=DataPortalFiles(files))
+    return dataset
+
+
+class TestInferFileFormat(unittest.TestCase):
+    def test_csv_extension(self):
+        self.assertEqual(_infer_file_format('data/results.csv'), 'csv')
+
+    def test_tsv_extension(self):
+        self.assertEqual(_infer_file_format('data/results.tsv'), 'csv')
+
+    def test_csv_gz_extension(self):
+        self.assertEqual(_infer_file_format('data/results.csv.gz'), 'csv')
+
+    def test_tsv_gz_extension(self):
+        self.assertEqual(_infer_file_format('data/results.tsv.gz'), 'csv')
+
+    def test_h5ad_extension(self):
+        self.assertEqual(_infer_file_format('data/adata.h5ad'), 'h5ad')
+
+    def test_text_fallback(self):
+        self.assertEqual(_infer_file_format('data/notes.txt'), 'text')
+
+    def test_log_fallback(self):
+        self.assertEqual(_infer_file_format('logs/run.log'), 'text')
+
+    def test_unknown_extension_fallback(self):
+        self.assertEqual(_infer_file_format('data/file.xyz'), 'text')
+
+
+class TestReadFileWithFormat(unittest.TestCase):
+    def setUp(self):
+        self.file = _make_mock_file('data/results.csv', b'a,b\n1,2\n')
+
+    def test_csv_format(self):
+        import pandas as pd
+        df = _read_file_with_format(self.file, 'csv')
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertListEqual(list(df.columns), ['a', 'b'])
+
+    def test_text_format(self):
+        file = _make_mock_file('data/notes.txt', b'hello world')
+        result = _read_file_with_format(file, 'text')
+        self.assertEqual(result, 'hello world')
+
+    def test_auto_infer_csv(self):
+        import pandas as pd
+        result = _read_file_with_format(self.file, None)
+        self.assertIsInstance(result, pd.DataFrame)
+
+    def test_auto_infer_text(self):
+        file = _make_mock_file('data/notes.txt', b'hello')
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, str)
+
+    def test_unsupported_format_raises(self):
+        with self.assertRaises(DataPortalInputError):
+            _read_file_with_format(self.file, 'parquet')
+
+    def test_csv_kwargs_passed_through(self):
+        import pandas as pd
+        file = _make_mock_file('data/data.tsv', b'a\tb\n1\t2\n')
+        df = _read_file_with_format(file, 'csv', sep='\t')
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertListEqual(list(df.columns), ['a', 'b'])
+
+
+class TestDatasetReadFiles(unittest.TestCase):
+    def setUp(self):
+        self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n')
+        self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n')
+        self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n')
+        self.dataset = _make_dataset_with_files([
+            self.csv_file,
+            self.tsv_file,
+            self.txt_file,
+        ])
+
+    def test_pattern_matches_csv(self):
+        results = list(self.dataset.read_files('*.csv'))
+        self.assertEqual(len(results), 1)
+        file, content = results[0]
+        self.assertEqual(file.relative_path, 'data/results.csv')
+
+    def test_pattern_matches_multiple(self):
+        results = list(self.dataset.read_files('data/*'))
+        self.assertEqual(len(results), 2)
+        paths = {f.relative_path for f, _ in results}
+        self.assertIn('data/results.csv', paths)
+        self.assertIn('data/counts.tsv', paths)
+
+    def test_pattern_no_match_returns_empty(self):
+        results = list(self.dataset.read_files('*.parquet'))
+        self.assertEqual(len(results), 0)
+
+    def test_explicit_format_csv(self):
+        import pandas as pd
+        results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t'))
+        self.assertEqual(len(results), 1)
+        _, df = results[0]
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertIn('gene', df.columns)
+
+    def test_explicit_format_text(self):
+        results = list(self.dataset.read_files('logs/*.log', file_format='text'))
+        self.assertEqual(len(results), 1)
+        _, content = results[0]
+        self.assertIsInstance(content, str)
+        self.assertIn('started', content)
+
+    def test_auto_infer_csv_from_extension(self):
+        import pandas as pd
+        results = list(self.dataset.read_files('data/results.csv'))
+        _, content = results[0]
+        self.assertIsInstance(content, pd.DataFrame)
+
+    def test_auto_infer_text_from_extension(self):
+        results = list(self.dataset.read_files('logs/run.log'))
+        _, content = results[0]
+        self.assertIsInstance(content, str)
+
+    def test_yields_file_and_content_tuples(self):
+        results = list(self.dataset.read_files('data/*.csv'))
+        self.assertEqual(len(results), 1)
+        file, content = results[0]
+        self.assertIsInstance(file, DataPortalFile)
+
+    def test_globstar_pattern(self):
+        results = list(self.dataset.read_files('**/*.csv'))
+        self.assertEqual(len(results), 1)
+        file, _ = results[0]
+        self.assertEqual(file.relative_path, 'data/results.csv')

From 57c2e4e7fcd1b286a433bd38106a8fc0fb2a15f4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 19 Mar 2026 09:38:14 +0000
Subject: [PATCH 02/24] Add support for json, parquet, feather, pickle, and
 excel file formats

- Add read_json, read_parquet, read_feather, read_pickle, read_excel methods to DataPortalFile
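- Update _infer_file_format to detect .json, .parquet, .feather, .pkl/.pickle, .xlsx/.xls extensions
- Update _read_file_with_format to dispatch to the new read methods, forwarding **kwargs to each
  (note: .pkl/.pickle files are now auto-detected and loaded with pickle.loads, which can
  execute arbitrary code, so read_files should only be pointed at trusted datasets)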
- Update read_files docstring to document all supported formats
- Add tests for new format inference and reading (parquet/feather tests skip without pyarrow;
  excel is covered by inference tests only)

https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU
---
 cirro/sdk/dataset.py     |  36 +++++++++++++-
 cirro/sdk/file.py        |  40 +++++++++++++++
 tests/test_read_files.py | 105 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 177 insertions(+), 4 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index b5a9424c..edbb6901 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -29,6 +29,16 @@ def _infer_file_format(path: str) -> str:
         return 'csv'
     elif path_lower.endswith('.h5ad'):
         return 'h5ad'
+    elif path_lower.endswith('.json'):
+        return 'json'
+    elif path_lower.endswith('.parquet'):
+        return 'parquet'
+    elif path_lower.endswith('.feather'):
+        return 'feather'
+    elif path_lower.endswith('.pkl') or path_lower.endswith('.pickle'):
+        return 'pickle'
+    elif path_lower.endswith('.xlsx') or path_lower.endswith('.xls'):
+        return 'excel'
     else:
         return 'text'
 
@@ -41,11 +51,22 @@ def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **k
         return file.read_csv(**kwargs)
     elif file_format == 'h5ad':
         return file.read_h5ad()
+    elif file_format == 'json':
+        return file.read_json(**kwargs)
+    elif file_format == 'parquet':
+        return file.read_parquet(**kwargs)
+    elif file_format == 'feather':
+        return file.read_feather(**kwargs)
+    elif file_format == 'pickle':
+        return file.read_pickle(**kwargs)
+    elif file_format == 'excel':
+        return file.read_excel(**kwargs)
     elif file_format == 'text':
         return file.read(**kwargs)
     else:
         raise DataPortalInputError(
-            f"Unsupported file_format: '{file_format}'. Supported values: 'csv', 'h5ad', 'text'"
+            f"Unsupported file_format: '{file_format}'. "
+            f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text'"
         )
 
 
@@ -251,9 +272,20 @@ def read_files(
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                 - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
+                - ``'json'``: parse with :func:`json.loads`, returns a Python object
+                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
+                  (requires ``pyarrow`` or ``fastparquet``)
+                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
+                  (requires ``pyarrow``)
+                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
+                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
+                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
                 - ``'text'``: read as plain text, returns a ``str``
                 - ``None`` (default): infer from file extension
-                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``)
+                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
+                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
+                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
+                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
             **kwargs: Additional keyword arguments forwarded to the file-parsing function.
                 For ``'csv'`` format these are passed to :func:`pandas.read_csv`
                 (e.g., ``sep='\\t'`` for TSV files).
diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py
index 03acd1ea..4b466ca4 100644
--- a/cirro/sdk/file.py
+++ b/cirro/sdk/file.py
@@ -1,4 +1,6 @@
 import gzip
+import json
+import pickle
 from io import BytesIO, StringIO
 from typing import List
 
@@ -141,6 +143,44 @@ def read_h5ad(self) -> 'anndata.AnnData':
         with BytesIO(self._get()) as handle:
             return ad.read_h5ad(handle)
 
+    def read_json(self, **kwargs):
+        """Read the file contents as a parsed JSON object (dict, list, etc.)."""
+        return json.loads(self._get(), **kwargs)
+
+    def read_parquet(self, **kwargs) -> 'DataFrame':
+        """
+        Read a Parquet file as a Pandas DataFrame.
+
+        Requires ``pyarrow`` or ``fastparquet`` to be installed.
+        All keyword arguments are passed to :func:`pandas.read_parquet`.
+        """
+        import pandas
+        return pandas.read_parquet(BytesIO(self._get()), **kwargs)
+
+    def read_feather(self, **kwargs) -> 'DataFrame':
+        """
+        Read a Feather file as a Pandas DataFrame.
+
+        Requires ``pyarrow`` to be installed.
+        All keyword arguments are passed to :func:`pandas.read_feather`.
+        """
+        import pandas
+        return pandas.read_feather(BytesIO(self._get()), **kwargs)
+
+    def read_pickle(self, **kwargs):
+        """Read the file contents as a Python pickle object."""
+        return pickle.loads(self._get(), **kwargs)
+
+    def read_excel(self, **kwargs) -> 'DataFrame':
+        """
+        Read an Excel file (``.xlsx`` / ``.xls``) as a Pandas DataFrame.
+
+        Requires ``openpyxl`` (for ``.xlsx``) or ``xlrd`` (for ``.xls``).
+        All keyword arguments are passed to :func:`pandas.read_excel`.
+        """
+        import pandas
+        return pandas.read_excel(BytesIO(self._get()), **kwargs)
+
     def readlines(self, encoding='utf-8', compression=None) -> List[str]:
         """Read the file contents as a list of lines."""
 
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 05f4b39f..0ffa02f2 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -1,5 +1,10 @@
+import io
+import json
+import pickle
 import unittest
-from unittest.mock import Mock, patch, MagicMock
+from unittest.mock import Mock
+
+import pandas as pd
 
 from cirro.models.file import File, FileAccessContext
 from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format
@@ -46,6 +51,30 @@ def test_tsv_gz_extension(self):
     def test_h5ad_extension(self):
         self.assertEqual(_infer_file_format('data/adata.h5ad'), 'h5ad')
 
+    def test_json_extension(self):
+        self.assertEqual(_infer_file_format('data/results.json'), 'json')
+
+    def test_json_gz_extension(self):
+        self.assertEqual(_infer_file_format('data/results.json.gz'), 'json')
+
+    def test_parquet_extension(self):
+        self.assertEqual(_infer_file_format('data/results.parquet'), 'parquet')
+
+    def test_feather_extension(self):
+        self.assertEqual(_infer_file_format('data/results.feather'), 'feather')
+
+    def test_pickle_pkl_extension(self):
+        self.assertEqual(_infer_file_format('data/results.pkl'), 'pickle')
+
+    def test_pickle_pickle_extension(self):
+        self.assertEqual(_infer_file_format('data/results.pickle'), 'pickle')
+
+    def test_excel_xlsx_extension(self):
+        self.assertEqual(_infer_file_format('data/results.xlsx'), 'excel')
+
+    def test_excel_xls_extension(self):
+        self.assertEqual(_infer_file_format('data/results.xls'), 'excel')
+
     def test_text_fallback(self):
         self.assertEqual(_infer_file_format('data/notes.txt'), 'text')
 
@@ -83,7 +112,79 @@ def test_auto_infer_text(self):
 
     def test_unsupported_format_raises(self):
         with self.assertRaises(DataPortalInputError):
-            _read_file_with_format(self.file, 'parquet')
+            _read_file_with_format(self.file, 'xyz_unknown')
+
+    def test_json_format(self):
+        file = _make_mock_file('data/data.json', b'{"key": "value"}')
+        result = _read_file_with_format(file, 'json')
+        self.assertIsInstance(result, dict)
+        self.assertEqual(result['key'], 'value')
+
+    def test_auto_infer_json(self):
+        file = _make_mock_file('data/data.json', b'[1, 2, 3]')
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, list)
+        self.assertEqual(result, [1, 2, 3])
+
+    def test_pickle_format(self):
+        data = {'hello': 42}
+        file = _make_mock_file('data/data.pkl', pickle.dumps(data))
+        result = _read_file_with_format(file, 'pickle')
+        self.assertEqual(result, data)
+
+    def test_auto_infer_pickle(self):
+        data = [1, 2, 3]
+        file = _make_mock_file('data/data.pkl', pickle.dumps(data))
+        result = _read_file_with_format(file, None)
+        self.assertEqual(result, data)
+
+    def _make_parquet_bytes(self):
+        buf = io.BytesIO()
+        pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_parquet(buf)
+        return buf.getvalue()
+
+    def _make_feather_bytes(self):
+        buf = io.BytesIO()
+        pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_feather(buf)
+        return buf.getvalue()
+
+    @unittest.skipUnless(
+        __import__('importlib').util.find_spec('pyarrow') is not None,
+        'pyarrow not installed'
+    )
+    def test_parquet_format(self):
+        file = _make_mock_file('data/data.parquet', self._make_parquet_bytes())
+        result = _read_file_with_format(file, 'parquet')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertListEqual(list(result.columns), ['a', 'b'])
+
+    @unittest.skipUnless(
+        __import__('importlib').util.find_spec('pyarrow') is not None,
+        'pyarrow not installed'
+    )
+    def test_auto_infer_parquet(self):
+        file = _make_mock_file('data/data.parquet', self._make_parquet_bytes())
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, pd.DataFrame)
+
+    @unittest.skipUnless(
+        __import__('importlib').util.find_spec('pyarrow') is not None,
+        'pyarrow not installed'
+    )
+    def test_feather_format(self):
+        file = _make_mock_file('data/data.feather', self._make_feather_bytes())
+        result = _read_file_with_format(file, 'feather')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertListEqual(list(result.columns), ['a', 'b'])
+
+    @unittest.skipUnless(
+        __import__('importlib').util.find_spec('pyarrow') is not None,
+        'pyarrow not installed'
+    )
+    def test_auto_infer_feather(self):
+        file = _make_mock_file('data/data.feather', self._make_feather_bytes())
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, pd.DataFrame)
 
     def test_csv_kwargs_passed_through(self):
         import pandas as pd

From 40cb5aeeae09a1863c3ca37da3597a1913941c8c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 19 Mar 2026 13:40:18 +0000
Subject: [PATCH 03/24] Add {name} capture syntax to read_files for automatic
 path extraction

- Add _pattern_to_captures_regex() that converts {name} placeholders in
  glob patterns to named regex groups, each matching one or more characters
  within a single path segment (no '/'); the resulting regex is
  suffix-anchored like PurePath.match
- read_files() now always yields (file, content, captures) 3-tuples;
  captures is {} when the pattern has no {name} placeholders
- Patterns with {name} use regex matching; plain glob patterns continue
  to use filter_files_by_pattern / PurePath.match unchanged
- Add TestPatternToRegex suite and TestDatasetReadFiles capture tests;
  update all existing tests to unpack 3-tuples

https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU
---
 cirro/sdk/dataset.py     |  86 ++++++++++++++++++++++++++++----
 tests/test_read_files.py | 105 +++++++++++++++++++++++++++++++++++----
 2 files changed, 171 insertions(+), 20 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index edbb6901..4c47c19f 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -1,6 +1,7 @@
 import datetime
+import re
 from pathlib import Path
-from typing import Union, List, Optional, Iterator, Tuple, Any
+from typing import Union, List, Optional, Iterator, Tuple, Any, Dict
 
 from cirro_api_client.v1.api.processes import validate_file_requirements
 from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
@@ -18,6 +19,39 @@
 from cirro.sdk.process import DataPortalProcess
 
 
+def _pattern_to_captures_regex(pattern: str):
+    """
+    Convert a glob pattern that may contain ``{name}`` capture placeholders into
+    a compiled regex and return ``(compiled_regex, capture_names)``.
+
+    Conversion rules:
+      - ``{name}``  → named group matching a single path segment (no ``/``)
+      - ``*``       → matches any characters within a single path segment
+      - ``**``      → matches any characters including ``/`` (multiple segments)
+      - All other characters are regex-escaped.
+
+    The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``):
+    a pattern without a leading ``/`` will match at any depth in the path.
+    """
+    capture_names = re.findall(r'\{(\w+)\}', pattern)
+    tokens = re.split(r'(\*\*|\*|\{\w+\})', pattern)
+    parts = []
+    for token in tokens:
+        if token == '**':
+            parts.append('.*')
+        elif token == '*':
+            parts.append('[^/]*')
+        elif re.match(r'^\{\w+\}$', token):
+            name = token[1:-1]
+            parts.append(f'(?P<{name}>[^/]+)')
+        else:
+            parts.append(re.escape(token))
+    regex_str = ''.join(parts)
+    if not pattern.startswith('/'):
+        regex_str = r'(?:.+/)?' + regex_str
+    return re.compile('^' + regex_str + '$'), capture_names
+
+
 def _infer_file_format(path: str) -> str:
     """Infer the file format from the file extension."""
     path_lower = path.lower()
@@ -257,7 +291,7 @@ def read_files(
             pattern: str,
             file_format: str = None,
             **kwargs
-    ) -> Iterator[Tuple[DataPortalFile, Any]]:
+    ) -> Iterator[Tuple[DataPortalFile, Any, Dict[str, str]]]:
         """
         Read the contents of files in the dataset matching the given glob pattern.
 
@@ -265,9 +299,18 @@ def read_files(
         ``*`` matches any sequence of characters within a single path segment;
         ``**`` matches zero or more path segments.
 
+        **Named captures** — wrap a segment in ``{name}`` to extract that portion
+        of the path automatically.  For example, ``{sample}.csv`` will match
+        ``sampleA.csv`` and ``sampleB.csv`` and return ``{'sample': 'sampleA'}``
+        / ``{'sample': 'sampleB'}`` respectively in the third element of each
+        yielded tuple.  Multiple captures are supported:
+        ``{condition}/{sample}.csv`` extracts both ``condition`` and ``sample``
+        from a two-level path.
+
         Args:
-            pattern (str): Glob pattern used to match file paths within the dataset
-                (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``)
+            pattern (str): Glob pattern used to match file paths within the dataset.
+                May contain ``{name}`` capture placeholders
+                (e.g., ``'{sample}.csv'``, ``'counts/{sample}/*.tsv.gz'``).
             file_format (str): File format used to parse each file. Supported values:
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
@@ -293,26 +336,47 @@ def read_files(
                 :meth:`~cirro.sdk.file.DataPortalFile.read`.
 
         Yields:
-            Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file,
-            where *content* type depends on *file_format*.
+            Tuple[DataPortalFile, Any, Dict[str, str]]:
+            ``(file, content, captures)`` for each matching file, where:
+
+            - *content* type depends on *file_format*
+            - *captures* is a ``dict`` of values extracted from ``{name}``
+              placeholders in the pattern (empty ``{}`` when the pattern
+              contains no captures)
 
         Example:
             ```python
             # Read all CSV files in a dataset
-            for file, df in dataset.read_files('*.csv'):
+            for file, df, _ in dataset.read_files('*.csv'):
                 print(file.relative_path, df.shape)
 
+            # Extract sample names automatically from filenames
+            for file, df, captures in dataset.read_files('{sample}.csv'):
+                print(captures['sample'], df.shape)
+
+            # Multi-level capture: condition directory + sample filename
+            for file, df, captures in dataset.read_files('{condition}/{sample}.csv'):
+                print(captures['condition'], captures['sample'], df.shape)
+
             # Read gzip-compressed TSV files using explicit format and separator
-            for file, df in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
+            for file, df, _ in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
                 print(file.relative_path, df.shape)
 
             # Read plain-text log files
-            for file, text in dataset.read_files('logs/*.log', file_format='text'):
+            for file, text, _ in dataset.read_files('logs/*.log', file_format='text'):
                 print(file.relative_path, text[:200])
             ```
         """
-        for file in filter_files_by_pattern(list(self.list_files()), pattern):
-            yield file, _read_file_with_format(file, file_format, **kwargs)
+        has_captures = bool(re.search(r'\{\w+\}', pattern))
+        if has_captures:
+            compiled_regex, _ = _pattern_to_captures_regex(pattern)
+            for file in self.list_files():
+                m = compiled_regex.match(file.relative_path)
+                if m is not None:
+                    yield file, _read_file_with_format(file, file_format, **kwargs), m.groupdict()
+        else:
+            for file in filter_files_by_pattern(list(self.list_files()), pattern):
+                yield file, _read_file_with_format(file, file_format, **kwargs), {}
 
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 0ffa02f2..3f330958 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -7,7 +7,7 @@
 import pandas as pd
 
 from cirro.models.file import File, FileAccessContext
-from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format
+from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format, _pattern_to_captures_regex
 from cirro.sdk.exceptions import DataPortalInputError
 from cirro.sdk.file import DataPortalFile, DataPortalFiles
 
@@ -208,13 +208,14 @@ def setUp(self):
     def test_pattern_matches_csv(self):
         results = list(self.dataset.read_files('*.csv'))
         self.assertEqual(len(results), 1)
-        file, content = results[0]
+        file, content, captures = results[0]
         self.assertEqual(file.relative_path, 'data/results.csv')
+        self.assertEqual(captures, {})
 
     def test_pattern_matches_multiple(self):
         results = list(self.dataset.read_files('data/*'))
         self.assertEqual(len(results), 2)
-        paths = {f.relative_path for f, _ in results}
+        paths = {f.relative_path for f, _, _ in results}
         self.assertIn('data/results.csv', paths)
         self.assertIn('data/counts.tsv', paths)
 
@@ -226,36 +227,122 @@ def test_explicit_format_csv(self):
         import pandas as pd
         results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t'))
         self.assertEqual(len(results), 1)
-        _, df = results[0]
+        _, df, _ = results[0]
         self.assertIsInstance(df, pd.DataFrame)
         self.assertIn('gene', df.columns)
 
     def test_explicit_format_text(self):
         results = list(self.dataset.read_files('logs/*.log', file_format='text'))
         self.assertEqual(len(results), 1)
-        _, content = results[0]
+        _, content, _ = results[0]
         self.assertIsInstance(content, str)
         self.assertIn('started', content)
 
     def test_auto_infer_csv_from_extension(self):
         import pandas as pd
         results = list(self.dataset.read_files('data/results.csv'))
-        _, content = results[0]
+        _, content, _ = results[0]
         self.assertIsInstance(content, pd.DataFrame)
 
     def test_auto_infer_text_from_extension(self):
         results = list(self.dataset.read_files('logs/run.log'))
-        _, content = results[0]
+        _, content, _ = results[0]
         self.assertIsInstance(content, str)
 
     def test_yields_file_and_content_tuples(self):
         results = list(self.dataset.read_files('data/*.csv'))
         self.assertEqual(len(results), 1)
-        file, content = results[0]
+        file, content, captures = results[0]
         self.assertIsInstance(file, DataPortalFile)
+        self.assertEqual(captures, {})
 
     def test_globstar_pattern(self):
         results = list(self.dataset.read_files('**/*.csv'))
         self.assertEqual(len(results), 1)
-        file, _ = results[0]
+        file, _, _ = results[0]
         self.assertEqual(file.relative_path, 'data/results.csv')
+
+    # --- capture pattern tests ---
+
+    def test_capture_simple_filename(self):
+        # {sample}.csv should match data/results.csv and capture sample='results'
+        results = list(self.dataset.read_files('{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        file, _, captures = results[0]
+        self.assertEqual(file.relative_path, 'data/results.csv')
+        self.assertEqual(captures['sample'], 'results')
+
+    def test_capture_with_directory(self):
+        # data/{sample}.csv should match data/results.csv
+        results = list(self.dataset.read_files('data/{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, _, captures = results[0]
+        self.assertEqual(captures['sample'], 'results')
+
+    def test_capture_multiple_files(self):
+        # {sample}.csv matches both csv files at depth; capture distinct names
+        dataset = _make_dataset_with_files([
+            _make_mock_file('sampleA.csv', b'a\n1\n'),
+            _make_mock_file('sampleB.csv', b'a\n2\n'),
+            _make_mock_file('notes.txt', b'text'),
+        ])
+        results = list(dataset.read_files('{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        captured = {c['sample'] for _, _, c in results}
+        self.assertSetEqual(captured, {'sampleA', 'sampleB'})
+
+    def test_capture_multi_level(self):
+        # {condition}/{sample}.csv extracts two path segments
+        dataset = _make_dataset_with_files([
+            _make_mock_file('treated/sampleA.csv', b'x\n1\n'),
+            _make_mock_file('control/sampleB.csv', b'x\n2\n'),
+        ])
+        results = list(dataset.read_files('{condition}/{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        by_sample = {c['sample']: c['condition'] for _, _, c in results}
+        self.assertEqual(by_sample['sampleA'], 'treated')
+        self.assertEqual(by_sample['sampleB'], 'control')
+
+    def test_capture_no_match_returns_empty(self):
+        results = list(self.dataset.read_files('{sample}.parquet'))
+        self.assertEqual(len(results), 0)
+
+    def test_capture_returns_empty_dict_when_no_placeholders(self):
+        results = list(self.dataset.read_files('*.csv'))
+        _, _, captures = results[0]
+        self.assertEqual(captures, {})
+
+
+class TestPatternToRegex(unittest.TestCase):
+    def _match(self, pattern, path):
+        compiled, names = _pattern_to_captures_regex(pattern)
+        m = compiled.match(path)
+        return m.groupdict() if m else None
+
+    def test_simple_capture(self):
+        self.assertEqual(self._match('{sample}.csv', 'sampleA.csv'), {'sample': 'sampleA'})
+
+    def test_simple_capture_with_directory(self):
+        self.assertEqual(self._match('{sample}.csv', 'data/sampleA.csv'), {'sample': 'sampleA'})
+
+    def test_directory_capture(self):
+        self.assertEqual(self._match('data/{sample}.csv', 'data/results.csv'), {'sample': 'results'})
+
+    def test_multi_level_capture(self):
+        result = self._match('{condition}/{sample}.csv', 'treated/sampleA.csv')
+        self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'})
+
+    def test_multi_level_capture_with_prefix(self):
+        result = self._match('{condition}/{sample}.csv', 'data/treated/sampleA.csv')
+        self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'})
+
+    def test_no_match_returns_none(self):
+        self.assertIsNone(self._match('{sample}.csv', 'sampleA.tsv'))
+
+    def test_wildcard_mixed_with_capture(self):
+        result = self._match('data/*/{sample}.csv', 'data/subdir/sampleA.csv')
+        self.assertEqual(result, {'sample': 'sampleA'})
+
+    def test_capture_names_returned(self):
+        _, names = _pattern_to_captures_regex('{condition}/{sample}.csv')
+        self.assertListEqual(names, ['condition', 'sample'])

From dec37a0334495e66829df3abb9337bed611ee74e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 19 Mar 2026 14:07:23 +0000
Subject: [PATCH 04/24] Replace positional pattern arg with explicit glob= and
 pattern= kwargs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

read_files() now takes two mutually exclusive keyword arguments:
- glob='*.csv'            → yields content per matching file
- pattern='{sample}.csv' → yields (content, captures) per matching file

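Passing both or neither raises DataPortalInputError. This makes the
return type unambiguous: glob always gives a flat iterator of content,
pattern always gives (content, captures) 2-tuples.

Note: read_files() is a generator, so the DataPortalInputError is raised
when iteration starts (e.g. list(...) or the first next()), not when
read_files() itself is called.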

https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU
---
 cirro/sdk/dataset.py     |  99 ++++++++++++++++---------------
 tests/test_read_files.py | 122 ++++++++++++++++++---------------------
 2 files changed, 109 insertions(+), 112 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 4c47c19f..a5900fcd 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -288,29 +288,38 @@ def list_files(self) -> DataPortalFiles:
 
     def read_files(
             self,
-            pattern: str,
+            glob: str = None,
+            pattern: str = None,
             file_format: str = None,
             **kwargs
-    ) -> Iterator[Tuple[DataPortalFile, Any, Dict[str, str]]]:
+    ):
         """
-        Read the contents of files in the dataset matching the given glob pattern.
+        Read the contents of files in the dataset.
 
-        Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``).
-        ``*`` matches any sequence of characters within a single path segment;
-        ``**`` matches zero or more path segments.
+        Exactly one of ``glob`` or ``pattern`` must be provided.
 
-        **Named captures** — wrap a segment in ``{name}`` to extract that portion
-        of the path automatically.  For example, ``{sample}.csv`` will match
-        ``sampleA.csv`` and ``sampleB.csv`` and return ``{'sample': 'sampleA'}``
-        / ``{'sample': 'sampleB'}`` respectively in the third element of each
-        yielded tuple.  Multiple captures are supported:
-        ``{condition}/{sample}.csv`` extracts both ``condition`` and ``sample``
-        from a two-level path.
+        **glob** — standard wildcard matching; yields the file content for each
+        matching file:
+
+        - ``*`` matches any characters within a single path segment
+        - ``**`` matches zero or more path segments
+        - Matching is suffix-anchored (``*.csv`` matches at any depth)
+
+        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
+        of the path automatically; yields ``(content, captures)`` pairs where
+        *captures* is a ``dict`` of extracted values:
+
+        - ``{name}`` captures one path segment (no ``/``)
+        - ``*`` and ``**`` wildcards work as in ``glob``
 
         Args:
-            pattern (str): Glob pattern used to match file paths within the dataset.
-                May contain ``{name}`` capture placeholders
-                (e.g., ``'{sample}.csv'``, ``'counts/{sample}/*.tsv.gz'``).
+            glob (str): Wildcard expression to match files
+                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
+                Yields one item per matching file: the parsed content.
+            pattern (str): Wildcard expression with ``{name}`` capture
+                placeholders (e.g., ``'{sample}.csv'``,
+                ``'{condition}/{sample}.csv'``).
+                Yields ``(content, captures)`` per matching file.
             file_format (str): File format used to parse each file. Supported values:
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
@@ -329,54 +338,52 @@ def read_files(
                   ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
                   ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
                   ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
-            **kwargs: Additional keyword arguments forwarded to the file-parsing function.
-                For ``'csv'`` format these are passed to :func:`pandas.read_csv`
-                (e.g., ``sep='\\t'`` for TSV files).
-                For ``'text'`` format these are passed to
-                :meth:`~cirro.sdk.file.DataPortalFile.read`.
+            **kwargs: Additional keyword arguments forwarded to the file-parsing
+                function (e.g., ``sep='\\t'`` for CSV/TSV files).
 
         Yields:
-            Tuple[DataPortalFile, Any, Dict[str, str]]:
-            ``(file, content, captures)`` for each matching file, where:
+            - When using ``glob``: *content* for each matching file
+            - When using ``pattern``: ``(content, captures)`` for each matching file,
+              where *captures* is a ``dict`` of values extracted from ``{name}``
+              placeholders
 
-            - *content* type depends on *file_format*
-            - *captures* is a ``dict`` of values extracted from ``{name}``
-              placeholders in the pattern (empty ``{}`` when the pattern
-              contains no captures)
+        Raises:
+            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
+                or if neither is provided.
 
         Example:
             ```python
-            # Read all CSV files in a dataset
-            for file, df, _ in dataset.read_files('*.csv'):
-                print(file.relative_path, df.shape)
+            # Read all CSV files — just the content
+            for df in dataset.read_files(glob='*.csv'):
+                print(df.shape)
 
-            # Extract sample names automatically from filenames
-            for file, df, captures in dataset.read_files('{sample}.csv'):
+            # Extract sample names from filenames automatically
+            for df, captures in dataset.read_files(pattern='{sample}.csv'):
                 print(captures['sample'], df.shape)
 
             # Multi-level capture: condition directory + sample filename
-            for file, df, captures in dataset.read_files('{condition}/{sample}.csv'):
+            for df, captures in dataset.read_files(pattern='{condition}/{sample}.csv'):
                 print(captures['condition'], captures['sample'], df.shape)
 
-            # Read gzip-compressed TSV files using explicit format and separator
-            for file, df, _ in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
-                print(file.relative_path, df.shape)
-
-            # Read plain-text log files
-            for file, text, _ in dataset.read_files('logs/*.log', file_format='text'):
-                print(file.relative_path, text[:200])
+            # Read gzip-compressed TSV files with explicit separator
+            for df in dataset.read_files(glob='**/*.tsv.gz', file_format='csv', sep='\\t'):
+                print(df.shape)
             ```
         """
-        has_captures = bool(re.search(r'\{\w+\}', pattern))
-        if has_captures:
+        if glob is not None and pattern is not None:
+            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
+        if glob is None and pattern is None:
+            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")
+
+        if glob is not None:
+            for file in filter_files_by_pattern(list(self.list_files()), glob):
+                yield _read_file_with_format(file, file_format, **kwargs)
+        else:
             compiled_regex, _ = _pattern_to_captures_regex(pattern)
             for file in self.list_files():
                 m = compiled_regex.match(file.relative_path)
                 if m is not None:
-                    yield file, _read_file_with_format(file, file_format, **kwargs), m.groupdict()
-        else:
-            for file in filter_files_by_pattern(list(self.list_files()), pattern):
-                yield file, _read_file_with_format(file, file_format, **kwargs), {}
+                    yield _read_file_with_format(file, file_format, **kwargs), m.groupdict()
 
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 3f330958..9f74ae88 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -205,112 +205,102 @@ def setUp(self):
             self.txt_file,
         ])
 
-    def test_pattern_matches_csv(self):
-        results = list(self.dataset.read_files('*.csv'))
+    # --- glob mode ---
+
+    def test_glob_matches_csv(self):
+        results = list(self.dataset.read_files(glob='*.csv'))
         self.assertEqual(len(results), 1)
-        file, content, captures = results[0]
-        self.assertEqual(file.relative_path, 'data/results.csv')
-        self.assertEqual(captures, {})
+        self.assertIsInstance(results[0], pd.DataFrame)
 
-    def test_pattern_matches_multiple(self):
-        results = list(self.dataset.read_files('data/*'))
+    def test_glob_matches_multiple(self):
+        results = list(self.dataset.read_files(glob='data/*'))
         self.assertEqual(len(results), 2)
-        paths = {f.relative_path for f, _, _ in results}
-        self.assertIn('data/results.csv', paths)
-        self.assertIn('data/counts.tsv', paths)
 
-    def test_pattern_no_match_returns_empty(self):
-        results = list(self.dataset.read_files('*.parquet'))
+    def test_glob_no_match_returns_empty(self):
+        results = list(self.dataset.read_files(glob='*.parquet'))
         self.assertEqual(len(results), 0)
 
-    def test_explicit_format_csv(self):
-        import pandas as pd
-        results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t'))
+    def test_glob_explicit_format_csv(self):
+        results = list(self.dataset.read_files(glob='data/*.tsv', file_format='csv', sep='\t'))
         self.assertEqual(len(results), 1)
-        _, df, _ = results[0]
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertIn('gene', df.columns)
+        self.assertIsInstance(results[0], pd.DataFrame)
+        self.assertIn('gene', results[0].columns)
 
-    def test_explicit_format_text(self):
-        results = list(self.dataset.read_files('logs/*.log', file_format='text'))
+    def test_glob_explicit_format_text(self):
+        results = list(self.dataset.read_files(glob='logs/*.log', file_format='text'))
         self.assertEqual(len(results), 1)
-        _, content, _ = results[0]
-        self.assertIsInstance(content, str)
-        self.assertIn('started', content)
+        self.assertIsInstance(results[0], str)
+        self.assertIn('started', results[0])
 
-    def test_auto_infer_csv_from_extension(self):
-        import pandas as pd
-        results = list(self.dataset.read_files('data/results.csv'))
-        _, content, _ = results[0]
-        self.assertIsInstance(content, pd.DataFrame)
+    def test_glob_auto_infer_csv_from_extension(self):
+        results = list(self.dataset.read_files(glob='data/results.csv'))
+        self.assertIsInstance(results[0], pd.DataFrame)
 
-    def test_auto_infer_text_from_extension(self):
-        results = list(self.dataset.read_files('logs/run.log'))
-        _, content, _ = results[0]
-        self.assertIsInstance(content, str)
-
-    def test_yields_file_and_content_tuples(self):
-        results = list(self.dataset.read_files('data/*.csv'))
-        self.assertEqual(len(results), 1)
-        file, content, captures = results[0]
-        self.assertIsInstance(file, DataPortalFile)
-        self.assertEqual(captures, {})
+    def test_glob_auto_infer_text_from_extension(self):
+        results = list(self.dataset.read_files(glob='logs/run.log'))
+        self.assertIsInstance(results[0], str)
 
     def test_globstar_pattern(self):
-        results = list(self.dataset.read_files('**/*.csv'))
+        results = list(self.dataset.read_files(glob='**/*.csv'))
         self.assertEqual(len(results), 1)
-        file, _, _ = results[0]
-        self.assertEqual(file.relative_path, 'data/results.csv')
+        self.assertIsInstance(results[0], pd.DataFrame)
 
-    # --- capture pattern tests ---
+    # --- pattern (capture) mode ---
 
-    def test_capture_simple_filename(self):
-        # {sample}.csv should match data/results.csv and capture sample='results'
-        results = list(self.dataset.read_files('{sample}.csv'))
+    def test_pattern_simple_filename(self):
+        results = list(self.dataset.read_files(pattern='{sample}.csv'))
         self.assertEqual(len(results), 1)
-        file, _, captures = results[0]
-        self.assertEqual(file.relative_path, 'data/results.csv')
+        content, captures = results[0]
+        self.assertIsInstance(content, pd.DataFrame)
         self.assertEqual(captures['sample'], 'results')
 
-    def test_capture_with_directory(self):
-        # data/{sample}.csv should match data/results.csv
-        results = list(self.dataset.read_files('data/{sample}.csv'))
+    def test_pattern_with_directory(self):
+        results = list(self.dataset.read_files(pattern='data/{sample}.csv'))
         self.assertEqual(len(results), 1)
-        _, _, captures = results[0]
+        _, captures = results[0]
         self.assertEqual(captures['sample'], 'results')
 
-    def test_capture_multiple_files(self):
-        # {sample}.csv matches both csv files at depth; capture distinct names
+    def test_pattern_multiple_files(self):
         dataset = _make_dataset_with_files([
             _make_mock_file('sampleA.csv', b'a\n1\n'),
             _make_mock_file('sampleB.csv', b'a\n2\n'),
             _make_mock_file('notes.txt', b'text'),
         ])
-        results = list(dataset.read_files('{sample}.csv'))
+        results = list(dataset.read_files(pattern='{sample}.csv'))
         self.assertEqual(len(results), 2)
-        captured = {c['sample'] for _, _, c in results}
+        captured = {c['sample'] for _, c in results}
         self.assertSetEqual(captured, {'sampleA', 'sampleB'})
 
-    def test_capture_multi_level(self):
-        # {condition}/{sample}.csv extracts two path segments
+    def test_pattern_multi_level(self):
         dataset = _make_dataset_with_files([
             _make_mock_file('treated/sampleA.csv', b'x\n1\n'),
             _make_mock_file('control/sampleB.csv', b'x\n2\n'),
         ])
-        results = list(dataset.read_files('{condition}/{sample}.csv'))
+        results = list(dataset.read_files(pattern='{condition}/{sample}.csv'))
         self.assertEqual(len(results), 2)
-        by_sample = {c['sample']: c['condition'] for _, _, c in results}
+        by_sample = {c['sample']: c['condition'] for _, c in results}
         self.assertEqual(by_sample['sampleA'], 'treated')
         self.assertEqual(by_sample['sampleB'], 'control')
 
-    def test_capture_no_match_returns_empty(self):
-        results = list(self.dataset.read_files('{sample}.parquet'))
+    def test_pattern_no_match_returns_empty(self):
+        results = list(self.dataset.read_files(pattern='{sample}.parquet'))
         self.assertEqual(len(results), 0)
 
-    def test_capture_returns_empty_dict_when_no_placeholders(self):
-        results = list(self.dataset.read_files('*.csv'))
-        _, _, captures = results[0]
-        self.assertEqual(captures, {})
+    def test_pattern_yields_content_and_captures_tuple(self):
+        results = list(self.dataset.read_files(pattern='{sample}.csv'))
+        content, captures = results[0]
+        self.assertIsInstance(captures, dict)
+        self.assertIn('sample', captures)
+
+    # --- error cases ---
+
+    def test_both_glob_and_pattern_raises(self):
+        with self.assertRaises(DataPortalInputError):
+            list(self.dataset.read_files(glob='*.csv', pattern='{sample}.csv'))
+
+    def test_neither_glob_nor_pattern_raises(self):
+        with self.assertRaises(DataPortalInputError):
+            list(self.dataset.read_files())
 
 
 class TestPatternToRegex(unittest.TestCase):

From 916ab8a0c15d4400041bb2ccdad4ba5ecf7ff38d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 19 Mar 2026 16:35:56 +0000
Subject: [PATCH 05/24] Require dataset argument on project.read_files()

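Instead of iterating across all datasets, read_files() on
DataPortalProject now requires a dataset argument (name, ID, or
DataPortalDataset object) and delegates to that dataset's read_files().
A string is looked up by ID first and, if that lookup raises, by name.
The glob/pattern/file_format interface is otherwise unchanged.

Note: dataset replaces pattern as the first positional parameter, so
existing calls such as project.read_files('*.csv') must be updated to
pass the dataset first and the wildcard as glob= or pattern=.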

https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU
---
 cirro/sdk/project.py | 80 ++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 37 deletions(-)

diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index 18678026..48a1e722 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -1,6 +1,6 @@
 from functools import cache
 from time import sleep
-from typing import List, Union, Iterator, Tuple, Any
+from typing import List, Union
 
 from cirro_api_client.v1.models import Project, UploadDatasetRequest, Dataset, Sample, Tag, Status
 
@@ -238,55 +238,61 @@ def samples(self, max_items: int = 10000) -> List[Sample]:
 
     def read_files(
             self,
-            pattern: str,
+            dataset: Union[str, DataPortalDataset],
+            glob: str = None,
+            pattern: str = None,
             file_format: str = None,
             **kwargs
-    ) -> Iterator[Tuple[DataPortalFile, Any]]:
+    ):
         """
-        Read the contents of files across all datasets in the project that match
-        the given glob pattern.
-
-        Iterates over every dataset in the project and yields matching files from
-        each one in turn. See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
-        for full details on pattern matching and format options.
+        Read the contents of files from a specific dataset in the project.
 
-        Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``).
-        ``*`` matches any sequence of characters within a single path segment;
-        ``**`` matches zero or more path segments.
+        The dataset can be identified by name, ID, or a
+        :class:`~cirro.sdk.dataset.DataPortalDataset` object.
+        See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
+        for full details on ``glob``/``pattern`` matching and format options.
 
         Args:
-            pattern (str): Glob pattern used to match file paths within each dataset
-                (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``)
-            file_format (str): File format used to parse each file. Supported values:
-
-                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
-                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
-                - ``'text'``: read as plain text, returns a ``str``
-                - ``None`` (default): infer from file extension
-                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``)
-            **kwargs: Additional keyword arguments forwarded to the file-parsing function.
-                For ``'csv'`` format these are passed to :func:`pandas.read_csv`
-                (e.g., ``sep='\\t'`` for TSV files).
-                For ``'text'`` format these are passed to
-                :meth:`~cirro.sdk.file.DataPortalFile.read`.
+            dataset (str | DataPortalDataset): Dataset to read files from,
+                identified by name, ID, or object.
+            glob (str): Wildcard expression to match files.
+                Yields one item per matching file: the parsed content.
+            pattern (str): Wildcard expression with ``{name}`` capture
+                placeholders. Yields ``(content, captures)`` per matching file.
+            file_format (str): File format used to parse each file
+                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
+                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
+                or ``None`` to infer from extension).
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
 
         Yields:
-            Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file
-            across all datasets, where *content* type depends on *file_format*.
+            - When using ``glob``: *content* for each matching file
+            - When using ``pattern``: ``(content, captures)`` for each
+              matching file
 
         Example:
             ```python
-            # Read all CSV files across every dataset in a project
-            for file, df in project.read_files('*.csv'):
-                print(file.relative_path, df.shape)
-
-            # Read gzip-compressed TSV files with explicit separator
-            for file, df in project.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'):
-                print(file.relative_path, df.shape)
+            # Read all CSV files from a dataset identified by name
+            for df in project.read_files('My Dataset', glob='*.csv'):
+                print(df.shape)
+
+            # Extract sample names using pattern captures
+            for df, captures in project.read_files(
+                'My Dataset', pattern='{sample}.csv'
+            ):
+                print(captures['sample'], df.shape)
             ```
         """
-        for dataset in self.list_datasets():
-            yield from dataset.read_files(pattern, file_format=file_format, **kwargs)
+        if isinstance(dataset, DataPortalDataset):
+            ds = dataset
+        else:
+            # Try by ID first, fall back to name
+            try:
+                ds = self.get_dataset_by_id(dataset)
+            except (DataPortalAssetNotFound, Exception):
+                ds = self.get_dataset_by_name(dataset)
+        yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs)
 
 
 class DataPortalProjects(DataPortalAssets[DataPortalProject]):

From 4cf45aa5fde986198f0f834b0aeeb1701760382b Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 11:54:03 -0700
Subject: [PATCH 06/24] Fix flake8

---
 cirro/sdk/dataset.py | 2 +-
 cirro/sdk/project.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 81dfd906..ef2c1707 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -1,7 +1,7 @@
 import datetime
 import re
 from pathlib import Path
-from typing import Union, List, Optional, Iterator, Tuple, Any, Dict
+from typing import Union, List, Optional, Any
 
 from cirro_api_client.v1.api.processes import validate_file_requirements
 from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index 48a1e722..4c3bad43 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -9,7 +9,6 @@
 from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
 from cirro.sdk.dataset import DataPortalDataset, DataPortalDatasets
 from cirro.sdk.exceptions import DataPortalAssetNotFound, DataPortalInputError
-from cirro.sdk.file import DataPortalFile
 from cirro.sdk.helpers import parse_process_name_or_id
 from cirro.sdk.process import DataPortalProcess
 from cirro.sdk.reference import DataPortalReference, DataPortalReferences

From 29e0c4276176736b22928e5ab4592369af5b7392 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 13:23:00 -0700
Subject: [PATCH 07/24] Get dataset by name or id

---
 cirro/sdk/project.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index 4c3bad43..dc29316a 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -89,6 +89,31 @@ def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
             ]
         )
 
+    def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset:
+        """Return the dataset matching the given ID or name.
+
+        Tries to match by ID first, then by name.
+        Raises an error if the name matches multiple datasets.
+        """
+        if force_refresh:
+            self._get_datasets.cache_clear()
+
+        # Try by ID first
+        try:
+            return self.get_dataset_by_id(name_or_id)
+        except (DataPortalAssetNotFound, Exception):
+            pass
+
+        # Fall back to name matching
+        matches = [d for d in self._get_datasets() if d.name == name_or_id]
+        if len(matches) == 0:
+            raise DataPortalAssetNotFound(f'Dataset with name or ID "{name_or_id}" not found')
+        if len(matches) > 1:
+            raise DataPortalInputError(
+                f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead'
+            )
+        return self.get_dataset_by_id(matches[0].id)
+
     def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
         """Return the dataset with the specified name."""
         if force_refresh:

From 7b59277a08ef009b3dea6528750792fdafc26ebe Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 13:24:25 -0700
Subject: [PATCH 08/24] Add singular read_file function

---
 cirro/sdk/dataset.py | 47 ++++++++++++++++++++++++++++++++++++++++++++
 cirro/sdk/project.py | 40 +++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index ef2c1707..05f74959 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -385,6 +385,53 @@ def read_files(
                 if m is not None:
                     yield _read_file_with_format(file, file_format, **kwargs), m.groupdict()
 
+    def read_file(
+            self,
+            path: str = None,
+            glob: str = None,
+            file_format: str = None,
+            **kwargs
+    ) -> Any:
+        """
+        Read the contents of a single file from the dataset.
+
+        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
+        expression). If ``glob`` is used it must match exactly one file.
+
+        Args:
+            path (str): Exact relative path of the file within the dataset.
+            glob (str): Wildcard expression to match a single file.
+            file_format (str): File format used to parse the file. Supported values
+                are the same as :meth:`read_files`.
+            **kwargs: Additional keyword arguments forwarded to the file-parsing
+                function.
+
+        Returns:
+            Parsed file content.
+
+        Raises:
+            DataPortalInputError: if both or neither of ``path``/``glob`` are
+                provided, or if ``glob`` matches zero or more than one file.
+        """
+        if path is not None and glob is not None:
+            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
+        if path is None and glob is None:
+            raise DataPortalInputError("Must specify either 'path' or 'glob'")
+
+        if path is not None:
+            file = self.get_file(path)
+        else:
+            matches = list(filter_files_by_pattern(list(self.list_files()), glob))
+            if len(matches) == 0:
+                raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
+            if len(matches) > 1:
+                raise DataPortalInputError(
+                    f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
+                )
+            file = matches[0]
+
+        return _read_file_with_format(file, file_format, **kwargs)
+
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
         Get the artifact of a particular type from the dataset
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index dc29316a..00d52d9a 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -318,6 +318,46 @@ def read_files(
                 ds = self.get_dataset_by_name(dataset)
         yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs)
 
+    def read_file(
+            self,
+            dataset: Union[str, DataPortalDataset],
+            path: str = None,
+            glob: str = None,
+            file_format: str = None,
+            **kwargs
+    ):
+        """
+        Read the contents of a single file from a specific dataset in the project.
+
+        The dataset can be identified by name, ID, or a
+        :class:`~cirro.sdk.dataset.DataPortalDataset` object.
+        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
+        expression). If ``glob`` is used it must match exactly one file.
+
+        Args:
+            dataset (str | DataPortalDataset): Dataset to read the file from,
+                identified by name, ID, or object.
+            path (str): Exact relative path of the file within the dataset.
+            glob (str): Wildcard expression matching exactly one file.
+            file_format (str): File format used to parse the file
+                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
+                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
+                or ``None`` to infer from extension).
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
+
+        Returns:
+            Parsed file content.
+        """
+        if isinstance(dataset, DataPortalDataset):
+            ds = dataset
+        else:
+            try:
+                ds = self.get_dataset_by_id(dataset)
+            except (DataPortalAssetNotFound, Exception):
+                ds = self.get_dataset_by_name(dataset)
+        return ds.read_file(path=path, glob=glob, file_format=file_format, **kwargs)
+
 
 class DataPortalProjects(DataPortalAssets[DataPortalProject]):
     """Collection of DataPortalProject objects"""

From 52ee650a8e4e44e097cccf851f6f7bb7dec16893 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 13:40:21 -0700
Subject: [PATCH 09/24] Increment version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6224e9f3..50ea289f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cirro"
-version = "1.10.2"
+version = "1.10.3"
 description = "CLI tool and SDK for interacting with the Cirro platform"
 authors = ["Cirro Bio <support@cirro.bio>"]
 license = "MIT"

From 75e4e6abc9418d29f909afdaf06c25788049bc4b Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:03:45 -0700
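Subject: [PATCH 10/24] Bugfixes

- asset: raise DataPortalInputError instead of DataPortalAssetNotFound
  when a name matches more than one item
- dataset: rename the read_files `file_format` argument to `format` in
  the signature and docstring
- dataset, process: keep the requested compute environment name so the
  "not found" error reports it instead of None
- file: use xz (not zstd) compression for `.xz` files in read_csv
- file: append each downloaded path to the result list instead of
  extending the list with it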
---
 cirro/sdk/asset.py   | 2 +-
 cirro/sdk/dataset.py | 9 +++++----
 cirro/sdk/file.py    | 4 ++--
 cirro/sdk/process.py | 3 ++-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cirro/sdk/asset.py b/cirro/sdk/asset.py
index ce1eea00..082200fe 100644
--- a/cirro/sdk/asset.py
+++ b/cirro/sdk/asset.py
@@ -60,7 +60,7 @@ def get_by_name(self, name: str) -> T:
         # Error if multiple projects are found
         msg = f"Multiple {self.asset_name} items found with name '{name}', use ID instead.\n{self.description()}"
         if len(matching_queries) > 1:
-            raise DataPortalAssetNotFound(msg)
+            raise DataPortalInputError(msg)
 
         return matching_queries[0]
 
diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 05f74959..fe8ff48a 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -290,7 +290,7 @@ def read_files(
             self,
             glob: str = None,
             pattern: str = None,
-            file_format: str = None,
+            format: str = None,
             **kwargs
     ):
         """
@@ -320,7 +320,7 @@ def read_files(
                 placeholders (e.g., ``'{sample}.csv'``,
                 ``'{condition}/{sample}.csv'``).
                 Yields ``(content, captures)`` per matching file.
-            file_format (str): File format used to parse each file. Supported values:
+            format (str): File format used to parse each file. Supported values:
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                 - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
@@ -366,7 +366,7 @@ def read_files(
                 print(captures['condition'], captures['sample'], df.shape)
 
             # Read gzip-compressed TSV files with explicit separator
-            for df in dataset.read_files(glob='**/*.tsv.gz', file_format='csv', sep='\\t'):
+            for df in dataset.read_files(glob='**/*.tsv.gz', format='csv', sep='\\t'):
                 print(df.shape)
             ```
         """
@@ -514,6 +514,7 @@ def run_analysis(
         process = parse_process_name_or_id(process, self._client)
 
         if compute_environment:
+            compute_environment_name = compute_environment
             compute_environments = self._client.compute_environments.list_environments_for_project(
                 project_id=self.project_id
             )
@@ -523,7 +524,7 @@ def run_analysis(
                 None
             )
             if compute_environment is None:
-                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
+                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
 
         resp = self._client.execution.run_analysis(
             project_id=self.project_id,
diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py
index f43bd22e..b6c2e1bb 100644
--- a/cirro/sdk/file.py
+++ b/cirro/sdk/file.py
@@ -111,7 +111,7 @@ def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFram
             elif self.relative_path.endswith('.bz2'):
                 compression = dict(method='bz2')
             elif self.relative_path.endswith('.xz'):
-                compression = dict(method='zstd')
+                compression = dict(method='xz')
             elif self.relative_path.endswith('.zst'):
                 compression = dict(method='zstd')
             else:
@@ -280,5 +280,5 @@ def download(self, download_location: str = None) -> List[Path]:
 
         local_paths = []
         for f in self:
-            local_paths += f.download(download_location)
+            local_paths.append(f.download(download_location))
         return local_paths
diff --git a/cirro/sdk/process.py b/cirro/sdk/process.py
index 282924fa..8f4cff9d 100644
--- a/cirro/sdk/process.py
+++ b/cirro/sdk/process.py
@@ -147,6 +147,7 @@ def run_analysis(
         ]
 
         if compute_environment:
+            compute_environment_name = compute_environment
             compute_environments = self._client.compute_environments.list_environments_for_project(
                 project_id=project_id
             )
@@ -156,7 +157,7 @@ def run_analysis(
                 None
             )
             if compute_environment is None:
-                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
+                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
 
         resp = self._client.execution.run_analysis(
             project_id=project_id,

From 30abda98559fabfb74160a55b131021a91ae12d4 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:04:52 -0700
Subject: [PATCH 11/24] Move from project to portal

---
 cirro/sdk/portal.py  | 83 +++++++++++++++++++++++++++++++++++--
 cirro/sdk/project.py | 98 --------------------------------------------
 2 files changed, 79 insertions(+), 102 deletions(-)

diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index ebd5fd96..36f2afdc 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -100,10 +100,85 @@ def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDat
         except DataPortalAssetNotFound:
             project: DataPortalProject = self.get_project_by_name(project)
 
-        try:
-            return project.get_dataset_by_id(dataset)
-        except DataPortalAssetNotFound:
-            return project.get_dataset_by_name(dataset)
+        return project.get_dataset(dataset)
+
+    def read_files(
+            self,
+            project: str,
+            dataset: str,
+            glob: str = None,
+            pattern: str = None,
+            format: str = None,
+            **kwargs
+    ):
+        """
+        Read the contents of files from a dataset.
+
+        The project and dataset can each be identified by name or ID.
+        See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
+        for full details on ``glob``/``pattern`` matching and format options.
+
+        Args:
+            project (str): ID or name of the project.
+            dataset (str): ID or name of the dataset.
+            glob (str): Wildcard expression to match files.
+                Yields one item per matching file: the parsed content.
+            pattern (str): Wildcard expression with ``{name}`` capture
+                placeholders. Yields ``(content, captures)`` per matching file.
+            format (str): File format used to parse each file
+                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
+                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
+                or ``None`` to infer from extension).
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
+
+        Yields:
+            - When using ``glob``: *content* for each matching file
+            - When using ``pattern``: ``(content, captures)`` for each
+              matching file
+
+        Example:
+            ```python
+            for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
+                print(df.shape)
+            ```
+        """
+        ds = self.get_dataset(project=project, dataset=dataset)
+        yield from ds.read_files(glob=glob, pattern=pattern, format=format, **kwargs)
+
+    def read_file(
+            self,
+            project: str,
+            dataset: str,
+            path: str = None,
+            glob: str = None,
+            format: str = None,
+            **kwargs
+    ):
+        """
+        Read the contents of a single file from a dataset.
+
+        The project and dataset can each be identified by name or ID.
+        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
+        expression). If ``glob`` is used it must match exactly one file.
+
+        Args:
+            project (str): ID or name of the project.
+            dataset (str): ID or name of the dataset.
+            path (str): Exact relative path of the file within the dataset.
+            glob (str): Wildcard expression matching exactly one file.
+            format (str): File format used to parse the file
+                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
+                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
+                or ``None`` to infer from extension).
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
+
+        Returns:
+            Parsed file content.
+        """
+        ds = self.get_dataset(project=project, dataset=dataset)
+        return ds.read_file(path=path, glob=glob, format=format, **kwargs)
 
     def list_processes(self, ingest=False) -> DataPortalProcesses:
         """
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index 00d52d9a..b224a15f 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -260,104 +260,6 @@ def samples(self, max_items: int = 10000) -> List[Sample]:
         """
         return self._client.metadata.get_project_samples(self.id, max_items)
 
-    def read_files(
-            self,
-            dataset: Union[str, DataPortalDataset],
-            glob: str = None,
-            pattern: str = None,
-            file_format: str = None,
-            **kwargs
-    ):
-        """
-        Read the contents of files from a specific dataset in the project.
-
-        The dataset can be identified by name, ID, or a
-        :class:`~cirro.sdk.dataset.DataPortalDataset` object.
-        See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
-        for full details on ``glob``/``pattern`` matching and format options.
-
-        Args:
-            dataset (str | DataPortalDataset): Dataset to read files from,
-                identified by name, ID, or object.
-            glob (str): Wildcard expression to match files.
-                Yields one item per matching file: the parsed content.
-            pattern (str): Wildcard expression with ``{name}`` capture
-                placeholders. Yields ``(content, captures)`` per matching file.
-            file_format (str): File format used to parse each file
-                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
-                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
-                or ``None`` to infer from extension).
-            **kwargs: Additional keyword arguments forwarded to the
-                file-parsing function.
-
-        Yields:
-            - When using ``glob``: *content* for each matching file
-            - When using ``pattern``: ``(content, captures)`` for each
-              matching file
-
-        Example:
-            ```python
-            # Read all CSV files from a dataset identified by name
-            for df in project.read_files('My Dataset', glob='*.csv'):
-                print(df.shape)
-
-            # Extract sample names using pattern captures
-            for df, captures in project.read_files(
-                'My Dataset', pattern='{sample}.csv'
-            ):
-                print(captures['sample'], df.shape)
-            ```
-        """
-        if isinstance(dataset, DataPortalDataset):
-            ds = dataset
-        else:
-            # Try by ID first, fall back to name
-            try:
-                ds = self.get_dataset_by_id(dataset)
-            except (DataPortalAssetNotFound, Exception):
-                ds = self.get_dataset_by_name(dataset)
-        yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs)
-
-    def read_file(
-            self,
-            dataset: Union[str, DataPortalDataset],
-            path: str = None,
-            glob: str = None,
-            file_format: str = None,
-            **kwargs
-    ):
-        """
-        Read the contents of a single file from a specific dataset in the project.
-
-        The dataset can be identified by name, ID, or a
-        :class:`~cirro.sdk.dataset.DataPortalDataset` object.
-        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
-        expression). If ``glob`` is used it must match exactly one file.
-
-        Args:
-            dataset (str | DataPortalDataset): Dataset to read the file from,
-                identified by name, ID, or object.
-            path (str): Exact relative path of the file within the dataset.
-            glob (str): Wildcard expression matching exactly one file.
-            file_format (str): File format used to parse the file
-                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
-                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
-                or ``None`` to infer from extension).
-            **kwargs: Additional keyword arguments forwarded to the
-                file-parsing function.
-
-        Returns:
-            Parsed file content.
-        """
-        if isinstance(dataset, DataPortalDataset):
-            ds = dataset
-        else:
-            try:
-                ds = self.get_dataset_by_id(dataset)
-            except (DataPortalAssetNotFound, Exception):
-                ds = self.get_dataset_by_name(dataset)
-        return ds.read_file(path=path, glob=glob, file_format=file_format, **kwargs)
-
 
 class DataPortalProjects(DataPortalAssets[DataPortalProject]):
     """Collection of DataPortalProject objects"""

From 05c78b451d166d89fa549b214ee331fd40c624fb Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:05:03 -0700
Subject: [PATCH 12/24] Change file_format to format

---
 cirro/sdk/dataset.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index fe8ff48a..2587c8f2 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -377,19 +377,19 @@ def read_files(
 
         if glob is not None:
             for file in filter_files_by_pattern(list(self.list_files()), glob):
-                yield _read_file_with_format(file, file_format, **kwargs)
+                yield _read_file_with_format(file, format, **kwargs)
         else:
             compiled_regex, _ = _pattern_to_captures_regex(pattern)
             for file in self.list_files():
                 m = compiled_regex.match(file.relative_path)
                 if m is not None:
-                    yield _read_file_with_format(file, file_format, **kwargs), m.groupdict()
+                    yield _read_file_with_format(file, format, **kwargs), m.groupdict()
 
     def read_file(
             self,
             path: str = None,
             glob: str = None,
-            file_format: str = None,
+            format: str = None,
             **kwargs
     ) -> Any:
         """
@@ -401,7 +401,7 @@ def read_file(
         Args:
             path (str): Exact relative path of the file within the dataset.
             glob (str): Wildcard expression to match a single file.
-            file_format (str): File format used to parse the file. Supported values
+            format (str): File format used to parse the file. Supported values
                 are the same as :meth:`read_files`.
             **kwargs: Additional keyword arguments forwarded to the file-parsing
                 function.
@@ -430,7 +430,7 @@ def read_file(
                 )
             file = matches[0]
 
-        return _read_file_with_format(file, file_format, **kwargs)
+        return _read_file_with_format(file, format, **kwargs)
 
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """

From 84c36bac9f88c91b2fba352c035b3a912a90bba4 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:45:24 -0700
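Subject: [PATCH 13/24] Clean up

- dataset, file: fix the invalid `from cirro import DataPortal()`
  statement in the docstring examples
- portal: fix the example variable name (`Portal` -> `portal`) so the
  following `portal.list_projects()` call refers to it
- project: simplify the redundant
  `except (DataPortalAssetNotFound, Exception)` in get_dataset to
  `except Exception`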
---
 cirro/sdk/dataset.py | 2 +-
 cirro/sdk/file.py    | 2 +-
 cirro/sdk/portal.py  | 2 +-
 cirro/sdk/project.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 2587c8f2..e35c7138 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -118,7 +118,7 @@ def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
         Should be invoked from a top-level constructor, for example:
 
         ```python
-        from cirro import DataPortal()
+        from cirro import DataPortal
         portal = DataPortal()
         dataset = portal.get_dataset(
             project="id-or-name-of-project",
diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py
index b6c2e1bb..3c6850e2 100644
--- a/cirro/sdk/file.py
+++ b/cirro/sdk/file.py
@@ -27,7 +27,7 @@ def __init__(self, file: File, client: CirroApi):
         Instantiate by listing files from a dataset.
 
         ```python
-        from cirro import DataPortal()
+        from cirro import DataPortal
         portal = DataPortal()
         dataset = portal.get_dataset(
             project="id-or-name-of-project",
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index 36f2afdc..0a9a6852 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -28,7 +28,7 @@ def __init__(self, base_url: str = None, client: CirroApi = None):
         ```python
         from cirro import DataPortal
 
-        Portal = DataPortal(base_url="app.cirro.bio")
+        portal = DataPortal(base_url="app.cirro.bio")
         portal.list_projects()
         ```
         """
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index b224a15f..89f58c91 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -101,7 +101,7 @@ def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset
         # Try by ID first
         try:
             return self.get_dataset_by_id(name_or_id)
-        except (DataPortalAssetNotFound, Exception):
+        except Exception:
             pass
 
         # Fall back to name matching

From 96764c2b5e4eb6f4ecaac371763ad6c4a3daa772 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:45:42 -0700
Subject: [PATCH 14/24] Move the primary read_files docs to the DataPortal
 object

---
 cirro/sdk/dataset.py | 86 +++++++-------------------------------------
 cirro/sdk/portal.py  | 82 +++++++++++++++++++++++++++++++++---------
 2 files changed, 78 insertions(+), 90 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index e35c7138..404801a2 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -296,79 +296,22 @@ def read_files(
         """
         Read the contents of files in the dataset.
 
-        Exactly one of ``glob`` or ``pattern`` must be provided.
-
-        **glob** — standard wildcard matching; yields the file content for each
-        matching file:
-
-        - ``*`` matches any characters within a single path segment
-        - ``**`` matches zero or more path segments
-        - Matching is suffix-anchored (``*.csv`` matches at any depth)
-
-        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
-        of the path automatically; yields ``(content, captures)`` pairs where
-        *captures* is a ``dict`` of extracted values:
-
-        - ``{name}`` captures one path segment (no ``/``)
-        - ``*`` and ``**`` wildcards work as in ``glob``
+        See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
+        on ``glob``/``pattern`` matching and format options.
 
         Args:
-            glob (str): Wildcard expression to match files
-                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
+            glob (str): Wildcard expression to match files.
                 Yields one item per matching file: the parsed content.
             pattern (str): Wildcard expression with ``{name}`` capture
-                placeholders (e.g., ``'{sample}.csv'``,
-                ``'{condition}/{sample}.csv'``).
-                Yields ``(content, captures)`` per matching file.
-            format (str): File format used to parse each file. Supported values:
-
-                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
-                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
-                - ``'json'``: parse with :func:`json.loads`, returns a Python object
-                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
-                  (requires ``pyarrow`` or ``fastparquet``)
-                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
-                  (requires ``pyarrow``)
-                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
-                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
-                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
-                - ``'text'``: read as plain text, returns a ``str``
-                - ``None`` (default): infer from file extension
-                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
-                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
-                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
-                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
-            **kwargs: Additional keyword arguments forwarded to the file-parsing
-                function (e.g., ``sep='\\t'`` for CSV/TSV files).
+                placeholders. Yields ``(content, captures)`` per matching file.
+            format (str): File format used to parse each file
+                (or ``None`` to infer from extension).
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
 
         Yields:
             - When using ``glob``: *content* for each matching file
-            - When using ``pattern``: ``(content, captures)`` for each matching file,
-              where *captures* is a ``dict`` of values extracted from ``{name}``
-              placeholders
-
-        Raises:
-            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
-                or if neither is provided.
-
-        Example:
-            ```python
-            # Read all CSV files — just the content
-            for df in dataset.read_files(glob='*.csv'):
-                print(df.shape)
-
-            # Extract sample names from filenames automatically
-            for df, captures in dataset.read_files(pattern='{sample}.csv'):
-                print(captures['sample'], df.shape)
-
-            # Multi-level capture: condition directory + sample filename
-            for df, captures in dataset.read_files(pattern='{condition}/{sample}.csv'):
-                print(captures['condition'], captures['sample'], df.shape)
-
-            # Read gzip-compressed TSV files with explicit separator
-            for df in dataset.read_files(glob='**/*.tsv.gz', format='csv', sep='\\t'):
-                print(df.shape)
-            ```
+            - When using ``pattern``: ``(content, captures)`` for each matching file
         """
         if glob is not None and pattern is not None:
             raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
@@ -395,23 +338,18 @@ def read_file(
         """
         Read the contents of a single file from the dataset.
 
-        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
-        expression). If ``glob`` is used it must match exactly one file.
+        See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.
 
         Args:
             path (str): Exact relative path of the file within the dataset.
-            glob (str): Wildcard expression to match a single file.
+            glob (str): Wildcard expression matching exactly one file.
             format (str): File format used to parse the file. Supported values
-                are the same as :meth:`read_files`.
+                are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
             **kwargs: Additional keyword arguments forwarded to the file-parsing
                 function.
 
         Returns:
             Parsed file content.
-
-        Raises:
-            DataPortalInputError: if both or neither of ``path``/``glob`` are
-                provided, or if ``glob`` matches zero or more than one file.
         """
         if path is not None and glob is not None:
             raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index 0a9a6852..4bb4e5a7 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -115,32 +115,80 @@ def read_files(
         Read the contents of files from a dataset.
 
         The project and dataset can each be identified by name or ID.
-        See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files`
-        for full details on ``glob``/``pattern`` matching and format options.
+        Exactly one of ``glob`` or ``pattern`` must be provided.
+
+        **glob** — standard wildcard matching; yields the file content for each
+        matching file:
+
+        - ``*`` matches any characters within a single path segment
+        - ``**`` matches zero or more path segments
+        - Matching is suffix-anchored (``*.csv`` matches at any depth)
+
+        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
+        of the path automatically; yields ``(content, captures)`` pairs where
+        *captures* is a ``dict`` of extracted values:
+
+        - ``{name}`` captures one path segment (no ``/``)
+        - ``*`` and ``**`` wildcards work as in ``glob``
 
         Args:
             project (str): ID or name of the project.
             dataset (str): ID or name of the dataset.
-            glob (str): Wildcard expression to match files.
+            glob (str): Wildcard expression to match files
+                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
                 Yields one item per matching file: the parsed content.
             pattern (str): Wildcard expression with ``{name}`` capture
-                placeholders. Yields ``(content, captures)`` per matching file.
-            format (str): File format used to parse each file
-                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
-                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
-                or ``None`` to infer from extension).
-            **kwargs: Additional keyword arguments forwarded to the
-                file-parsing function.
+                placeholders (e.g., ``'{sample}.csv'``,
+                ``'{condition}/{sample}.csv'``).
+                Yields ``(content, captures)`` per matching file.
+            format (str): File format used to parse each file. Supported values:
+
+                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
+                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
+                - ``'json'``: parse with :func:`json.loads`, returns a Python object
+                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
+                  (requires ``pyarrow`` or ``fastparquet``)
+                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
+                  (requires ``pyarrow``)
+                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
+                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
+                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
+                - ``'text'``: read as plain text, returns a ``str``
+                - ``None`` (default): infer from file extension
+                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
+                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
+                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
+                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
+            **kwargs: Additional keyword arguments forwarded to the file-parsing
+                function (e.g., ``sep='\\t'`` for CSV/TSV files).
 
         Yields:
             - When using ``glob``: *content* for each matching file
-            - When using ``pattern``: ``(content, captures)`` for each
-              matching file
+            - When using ``pattern``: ``(content, captures)`` for each matching file,
+              where *captures* is a ``dict`` of values extracted from ``{name}``
+              placeholders
+
+        Raises:
+            DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
+                or if neither is provided.
 
         Example:
             ```python
+            # Read all CSV files — just the content
             for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
                 print(df.shape)
+
+            # Extract sample names from filenames automatically
+            for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
+                print(captures['sample'], df.shape)
+
+            # Multi-level capture: condition directory + sample filename
+            for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
+                print(captures['condition'], captures['sample'], df.shape)
+
+            # Read gzip-compressed TSV files with explicit separator
+            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', format='csv', sep='\\t'):
+                print(df.shape)
             ```
         """
         ds = self.get_dataset(project=project, dataset=dataset)
@@ -167,15 +215,17 @@ def read_file(
             dataset (str): ID or name of the dataset.
             path (str): Exact relative path of the file within the dataset.
             glob (str): Wildcard expression matching exactly one file.
-            format (str): File format used to parse the file
-                (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``,
-                ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``,
-                or ``None`` to infer from extension).
+            format (str): File format used to parse the file. Supported values
+                are the same as :meth:`read_files`.
             **kwargs: Additional keyword arguments forwarded to the
                 file-parsing function.
 
         Returns:
             Parsed file content.
+
+        Raises:
+            DataPortalInputError: if both or neither of ``path``/``glob`` are
+                provided, or if ``glob`` matches zero or more than one file.
         """
         ds = self.get_dataset(project=project, dataset=dataset)
         return ds.read_file(path=path, glob=glob, format=format, **kwargs)

From 595b0a2f408fab3f0f7e3d4f95c4e59c7c6614b6 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:52:06 -0700
Subject: [PATCH 15/24] format -> filetype

---
 cirro/sdk/dataset.py     | 16 ++++++++--------
 cirro/sdk/portal.py      | 14 +++++++-------
 tests/test_read_files.py |  4 ++--
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 404801a2..d825360b 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -290,21 +290,21 @@ def read_files(
             self,
             glob: str = None,
             pattern: str = None,
-            format: str = None,
+            filetype: str = None,
             **kwargs
     ):
         """
         Read the contents of files in the dataset.
 
         See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
-        on ``glob``/``pattern`` matching and format options.
+        on ``glob``/``pattern`` matching and filetype options.
 
         Args:
             glob (str): Wildcard expression to match files.
                 Yields one item per matching file: the parsed content.
             pattern (str): Wildcard expression with ``{name}`` capture
                 placeholders. Yields ``(content, captures)`` per matching file.
-            format (str): File format used to parse each file
+            filetype (str): File format used to parse each file
                 (or ``None`` to infer from extension).
             **kwargs: Additional keyword arguments forwarded to the
                 file-parsing function.
@@ -320,19 +320,19 @@ def read_files(
 
         if glob is not None:
             for file in filter_files_by_pattern(list(self.list_files()), glob):
-                yield _read_file_with_format(file, format, **kwargs)
+                yield _read_file_with_format(file, filetype, **kwargs)
         else:
             compiled_regex, _ = _pattern_to_captures_regex(pattern)
             for file in self.list_files():
                 m = compiled_regex.match(file.relative_path)
                 if m is not None:
-                    yield _read_file_with_format(file, format, **kwargs), m.groupdict()
+                    yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()
 
     def read_file(
             self,
             path: str = None,
             glob: str = None,
-            format: str = None,
+            filetype: str = None,
             **kwargs
     ) -> Any:
         """
@@ -343,7 +343,7 @@ def read_file(
         Args:
             path (str): Exact relative path of the file within the dataset.
             glob (str): Wildcard expression matching exactly one file.
-            format (str): File format used to parse the file. Supported values
+            filetype (str): File format used to parse the file. Supported values
                 are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
             **kwargs: Additional keyword arguments forwarded to the file-parsing
                 function.
@@ -368,7 +368,7 @@ def read_file(
                 )
             file = matches[0]
 
-        return _read_file_with_format(file, format, **kwargs)
+        return _read_file_with_format(file, filetype, **kwargs)
 
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index 4bb4e5a7..696e191f 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -108,7 +108,7 @@ def read_files(
             dataset: str,
             glob: str = None,
             pattern: str = None,
-            format: str = None,
+            filetype: str = None,
             **kwargs
     ):
         """
@@ -141,7 +141,7 @@ def read_files(
                 placeholders (e.g., ``'{sample}.csv'``,
                 ``'{condition}/{sample}.csv'``).
                 Yields ``(content, captures)`` per matching file.
-            format (str): File format used to parse each file. Supported values:
+            filetype (str): File format used to parse each file. Supported values:
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
                 - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
@@ -187,12 +187,12 @@ def read_files(
                 print(captures['condition'], captures['sample'], df.shape)
 
             # Read gzip-compressed TSV files with explicit separator
-            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', format='csv', sep='\\t'):
+            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
                 print(df.shape)
             ```
         """
         ds = self.get_dataset(project=project, dataset=dataset)
-        yield from ds.read_files(glob=glob, pattern=pattern, format=format, **kwargs)
+        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
 
     def read_file(
             self,
@@ -200,7 +200,7 @@ def read_file(
             dataset: str,
             path: str = None,
             glob: str = None,
-            format: str = None,
+            filetype: str = None,
             **kwargs
     ):
         """
@@ -215,7 +215,7 @@ def read_file(
             dataset (str): ID or name of the dataset.
             path (str): Exact relative path of the file within the dataset.
             glob (str): Wildcard expression matching exactly one file.
-            format (str): File format used to parse the file. Supported values
+            filetype (str): File format used to parse the file. Supported values
                 are the same as :meth:`read_files`.
             **kwargs: Additional keyword arguments forwarded to the
                 file-parsing function.
@@ -228,7 +228,7 @@ def read_file(
                 provided, or if ``glob`` matches zero or more than one file.
         """
         ds = self.get_dataset(project=project, dataset=dataset)
-        return ds.read_file(path=path, glob=glob, format=format, **kwargs)
+        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
 
     def list_processes(self, ingest=False) -> DataPortalProcesses:
         """
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 9f74ae88..98abe17c 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -221,13 +221,13 @@ def test_glob_no_match_returns_empty(self):
         self.assertEqual(len(results), 0)
 
     def test_glob_explicit_format_csv(self):
-        results = list(self.dataset.read_files(glob='data/*.tsv', file_format='csv', sep='\t'))
+        results = list(self.dataset.read_files(glob='data/*.tsv', filetype='csv', sep='\t'))
         self.assertEqual(len(results), 1)
         self.assertIsInstance(results[0], pd.DataFrame)
         self.assertIn('gene', results[0].columns)
 
     def test_glob_explicit_format_text(self):
-        results = list(self.dataset.read_files(glob='logs/*.log', file_format='text'))
+        results = list(self.dataset.read_files(glob='logs/*.log', filetype='text'))
         self.assertEqual(len(results), 1)
         self.assertIsInstance(results[0], str)
         self.assertIn('started', results[0])

From 5be899843a718e0467068c02a9f7730ba79e9f0f Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:56:49 -0700
Subject: [PATCH 16/24] captures -> meta

---
 cirro/sdk/dataset.py     |  4 ++--
 cirro/sdk/portal.py      | 18 +++++++++---------
 tests/test_read_files.py | 20 ++++++++++----------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index d825360b..ab2fa74a 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -303,7 +303,7 @@ def read_files(
             glob (str): Wildcard expression to match files.
                 Yields one item per matching file: the parsed content.
             pattern (str): Wildcard expression with ``{name}`` capture
-                placeholders. Yields ``(content, captures)`` per matching file.
+                placeholders. Yields ``(content, meta)`` per matching file.
             filetype (str): File format used to parse each file
                 (or ``None`` to infer from extension).
             **kwargs: Additional keyword arguments forwarded to the
@@ -311,7 +311,7 @@ def read_files(
 
         Yields:
             - When using ``glob``: *content* for each matching file
-            - When using ``pattern``: ``(content, captures)`` for each matching file
+            - When using ``pattern``: ``(content, meta)`` for each matching file
         """
         if glob is not None and pattern is not None:
             raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index 696e191f..d811b94b 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -125,8 +125,8 @@ def read_files(
         - Matching is suffix-anchored (``*.csv`` matches at any depth)
 
         **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
-        of the path automatically; yields ``(content, captures)`` pairs where
-        *captures* is a ``dict`` of extracted values:
+        of the path automatically; yields ``(content, meta)`` pairs where
+        *meta* is a ``dict`` of extracted values:
 
         - ``{name}`` captures one path segment (no ``/``)
         - ``*`` and ``**`` wildcards work as in ``glob``
@@ -140,7 +140,7 @@ def read_files(
             pattern (str): Wildcard expression with ``{name}`` capture
                 placeholders (e.g., ``'{sample}.csv'``,
                 ``'{condition}/{sample}.csv'``).
-                Yields ``(content, captures)`` per matching file.
+                Yields ``(content, meta)`` per matching file.
             filetype (str): File format used to parse each file. Supported values:
 
                 - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
@@ -164,8 +164,8 @@ def read_files(
 
         Yields:
             - When using ``glob``: *content* for each matching file
-            - When using ``pattern``: ``(content, captures)`` for each matching file,
-              where *captures* is a ``dict`` of values extracted from ``{name}``
+            - When using ``pattern``: ``(content, meta)`` for each matching file,
+              where *meta* is a ``dict`` of values extracted from ``{name}``
               placeholders
 
         Raises:
@@ -179,12 +179,12 @@ def read_files(
                 print(df.shape)
 
             # Extract sample names from filenames automatically
-            for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
-                print(captures['sample'], df.shape)
+            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
+                print(meta['sample'], df.shape)
 
             # Multi-level capture: condition directory + sample filename
-            for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
-                print(captures['condition'], captures['sample'], df.shape)
+            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
+                print(meta['condition'], meta['sample'], df.shape)
 
             # Read gzip-compressed TSV files with explicit separator
             for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 98abe17c..bbad2229 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -250,15 +250,15 @@ def test_globstar_pattern(self):
     def test_pattern_simple_filename(self):
         results = list(self.dataset.read_files(pattern='{sample}.csv'))
         self.assertEqual(len(results), 1)
-        content, captures = results[0]
+        content, meta = results[0]
         self.assertIsInstance(content, pd.DataFrame)
-        self.assertEqual(captures['sample'], 'results')
+        self.assertEqual(meta['sample'], 'results')
 
     def test_pattern_with_directory(self):
         results = list(self.dataset.read_files(pattern='data/{sample}.csv'))
         self.assertEqual(len(results), 1)
-        _, captures = results[0]
-        self.assertEqual(captures['sample'], 'results')
+        _, meta = results[0]
+        self.assertEqual(meta['sample'], 'results')
 
     def test_pattern_multiple_files(self):
         dataset = _make_dataset_with_files([
@@ -268,7 +268,7 @@ def test_pattern_multiple_files(self):
         ])
         results = list(dataset.read_files(pattern='{sample}.csv'))
         self.assertEqual(len(results), 2)
-        captured = {c['sample'] for _, c in results}
+        captured = {m['sample'] for _, m in results}
         self.assertSetEqual(captured, {'sampleA', 'sampleB'})
 
     def test_pattern_multi_level(self):
@@ -278,7 +278,7 @@ def test_pattern_multi_level(self):
         ])
         results = list(dataset.read_files(pattern='{condition}/{sample}.csv'))
         self.assertEqual(len(results), 2)
-        by_sample = {c['sample']: c['condition'] for _, c in results}
+        by_sample = {m['sample']: m['condition'] for _, m in results}
         self.assertEqual(by_sample['sampleA'], 'treated')
         self.assertEqual(by_sample['sampleB'], 'control')
 
@@ -286,11 +286,11 @@ def test_pattern_no_match_returns_empty(self):
         results = list(self.dataset.read_files(pattern='{sample}.parquet'))
         self.assertEqual(len(results), 0)
 
-    def test_pattern_yields_content_and_captures_tuple(self):
+    def test_pattern_yields_content_and_meta_tuple(self):
         results = list(self.dataset.read_files(pattern='{sample}.csv'))
-        content, captures = results[0]
-        self.assertIsInstance(captures, dict)
-        self.assertIn('sample', captures)
+        content, meta = results[0]
+        self.assertIsInstance(meta, dict)
+        self.assertIn('sample', meta)
 
     # --- error cases ---
 

From adf881491a3c8e189197eada2c84f6fbda34b6e3 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 14:57:03 -0700
Subject: [PATCH 17/24] Update README.md

---
 README.md | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/README.md b/README.md
index ea78bb95..f5ee0bb9 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,49 @@ See the following set of Jupyter notebooks that contain examples on the followin
 | [Using references](samples/Using_references.ipynb)                 | Managing reference data              |
 | [Advanced usage](samples/Advanced_usage.ipynb)                     | Advanced operations                  |
 
+### Reading files
+
+The `read_file` and `read_files` methods provide a convenient way to read dataset files directly into Python objects. The file format is inferred from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified explicitly with the `filetype` argument.
+
+```python
+from cirro import DataPortal
+
+# If not logged in, this will prompt with a login URL
+portal = DataPortal()
+
+# Read a single file from the indicated dataset
+df = portal.read_file(project="My Project", dataset="My Dataset", glob="**/results.csv")
+
+# Iterate over each of the files ending in .csv within a dataset
+for df in portal.read_files(project="My Project", dataset="My Dataset", glob="*.csv"):
+    print(df.shape)
+
+```
+
+You can also call these methods on the `DataPortalDataset` object:
+
+```python
+# Get an object representing a single dataset
+dataset = portal.get_dataset(project="My Project", dataset="My Dataset")
+
+# Read a single file by exact path, or by a glob that matches exactly one file
+df = dataset.read_file(path="data/results.csv")
+df = dataset.read_file(glob="**/results.csv")
+
+# Read multiple files matching a glob — yields the parsed content of each file
+for df in dataset.read_files(glob="**/*.csv"):
+    print(df.shape)
+
+# Extract values from the path using {name} capture placeholders
+for df, meta in dataset.read_files(pattern="{sample}/results.csv"):
+    print(meta["sample"], df.shape)
+
+# Extra keyword arguments are forwarded to the file-parsing function
+for df in dataset.read_files(glob="**/*.tsv.gz", filetype="csv", sep="\t"):
+    print(df.shape)
+```
+
+
 ## R Usage
 
 | Jupyter Notebook                                    | Topic               |

From e51ba84af23f591586de5457973d7adc9f8842e7 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 15:00:12 -0700
Subject: [PATCH 18/24] Add tests

---
 tests/test_read_files.py | 75 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index bbad2229..39974960 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -292,6 +292,51 @@ def test_pattern_yields_content_and_meta_tuple(self):
         self.assertIsInstance(meta, dict)
         self.assertIn('sample', meta)
 
+    # --- special characters in filenames ---
+
+    def test_glob_matches_filename_with_spaces(self):
+        dataset = _make_dataset_with_files([
+            _make_mock_file('data/my sample.csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(glob='*.csv'))
+        self.assertEqual(len(results), 1)
+
+    def test_glob_matches_filename_with_hyphens_and_parens(self):
+        dataset = _make_dataset_with_files([
+            _make_mock_file('data/sample-A (1).csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(glob='*.csv'))
+        self.assertEqual(len(results), 1)
+
+    def test_pattern_captures_filename_with_spaces(self):
+        dataset = _make_dataset_with_files([
+            _make_mock_file('my sample.csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, meta = results[0]
+        self.assertEqual(meta['sample'], 'my sample')
+
+    def test_pattern_captures_directory_with_spaces(self):
+        dataset = _make_dataset_with_files([
+            _make_mock_file('treated group/sampleA.csv', b'a\n1\n'),
+            _make_mock_file('control group/sampleB.csv', b'a\n2\n'),
+        ])
+        results = list(dataset.read_files(pattern='{condition}/{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        by_sample = {m['sample']: m['condition'] for _, m in results}
+        self.assertEqual(by_sample['sampleA'], 'treated group')
+        self.assertEqual(by_sample['sampleB'], 'control group')
+
+    def test_pattern_captures_special_chars(self):
+        dataset = _make_dataset_with_files([
+            _make_mock_file('sample-A_v2 (1).csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, meta = results[0]
+        self.assertEqual(meta['sample'], 'sample-A_v2 (1)')
+
     # --- error cases ---
 
     def test_both_glob_and_pattern_raises(self):
@@ -305,7 +350,7 @@ def test_neither_glob_nor_pattern_raises(self):
 
 class TestPatternToRegex(unittest.TestCase):
     def _match(self, pattern, path):
-        compiled, names = _pattern_to_captures_regex(pattern)
+        compiled, _ = _pattern_to_captures_regex(pattern)
         m = compiled.match(path)
         return m.groupdict() if m else None
 
@@ -336,3 +381,31 @@ def test_wildcard_mixed_with_capture(self):
     def test_capture_names_returned(self):
         _, names = _pattern_to_captures_regex('{condition}/{sample}.csv')
         self.assertListEqual(names, ['condition', 'sample'])
+
+    def test_capture_with_spaces(self):
+        result = self._match('{sample}.csv', 'my sample.csv')
+        self.assertEqual(result, {'sample': 'my sample'})
+
+    def test_capture_with_spaces_in_directory(self):
+        result = self._match('{condition}/{sample}.csv', 'treated group/my sample.csv')
+        self.assertEqual(result, {'condition': 'treated group', 'sample': 'my sample'})
+
+    def test_capture_with_hyphens_and_underscores(self):
+        result = self._match('{sample}.csv', 'sample-A_v2.csv')
+        self.assertEqual(result, {'sample': 'sample-A_v2'})
+
+    def test_capture_with_parentheses(self):
+        result = self._match('{sample}.csv', 'sample (1).csv')
+        self.assertEqual(result, {'sample': 'sample (1)'})
+
+    def test_capture_with_dots_in_name(self):
+        result = self._match('{sample}.csv', 'sample.v2.csv')
+        self.assertEqual(result, {'sample': 'sample.v2'})
+
+    def test_wildcard_matches_spaces(self):
+        compiled, _ = _pattern_to_captures_regex('data/*.csv')
+        self.assertIsNotNone(compiled.match('data/my file.csv'))
+
+    def test_globstar_matches_spaces_across_segments(self):
+        compiled, _ = _pattern_to_captures_regex('**/*.csv')
+        self.assertIsNotNone(compiled.match('some dir/sub dir/my file.csv'))

From 21550d43370d7a3fc668753c8d2ea0879721af19 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 15:02:56 -0700
Subject: [PATCH 19/24] Read file(s) as bytes

---
 cirro/sdk/dataset.py     | 4 +++-
 cirro/sdk/portal.py      | 1 +
 tests/test_read_files.py | 8 +++++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index ab2fa74a..6873d1d3 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -97,10 +97,12 @@ def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **k
         return file.read_excel(**kwargs)
     elif file_format == 'text':
         return file.read(**kwargs)
+    elif file_format == 'bytes':
+        return file._get()
     else:
         raise DataPortalInputError(
             f"Unsupported file_format: '{file_format}'. "
-            f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text'"
+            f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'"
         )
 
 
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index d811b94b..7f4727c4 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -154,6 +154,7 @@ def read_files(
                 - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
                   (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
                 - ``'text'``: read as plain text, returns a ``str``
+                - ``'bytes'``: read as raw bytes, returns ``bytes``
                 - ``None`` (default): infer from file extension
                   (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
                   ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 39974960..8bb56cc6 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -110,6 +110,12 @@ def test_auto_infer_text(self):
         result = _read_file_with_format(file, None)
         self.assertIsInstance(result, str)
 
+    def test_bytes_format(self):
+        file = _make_mock_file('data/blob.bin', b'\x00\x01\x02\x03')
+        result = _read_file_with_format(file, 'bytes')
+        self.assertIsInstance(result, bytes)
+        self.assertEqual(result, b'\x00\x01\x02\x03')
+
     def test_unsupported_format_raises(self):
         with self.assertRaises(DataPortalInputError):
             _read_file_with_format(self.file, 'xyz_unknown')
@@ -288,7 +294,7 @@ def test_pattern_no_match_returns_empty(self):
 
     def test_pattern_yields_content_and_meta_tuple(self):
         results = list(self.dataset.read_files(pattern='{sample}.csv'))
-        content, meta = results[0]
+        _, meta = results[0]
         self.assertIsInstance(meta, dict)
         self.assertIn('sample', meta)
 

From 9847e4d8c467c874cdeb4ae50351079729731067 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 15:41:47 -0700
Subject: [PATCH 20/24] Update example for running analysis

---
 samples/Analyzing_a_dataset.ipynb | 214 ++++++++++++++++++++++++------
 1 file changed, 173 insertions(+), 41 deletions(-)

diff --git a/samples/Analyzing_a_dataset.ipynb b/samples/Analyzing_a_dataset.ipynb
index 1b7d0c62..3cd23ca8 100644
--- a/samples/Analyzing_a_dataset.ipynb
+++ b/samples/Analyzing_a_dataset.ipynb
@@ -21,14 +21,119 @@
    },
    "outputs": [],
    "source": [
+    "# Import the library used to interact with Cirro\n",
     "from cirro import DataPortal\n",
     "\n",
+    "# Create a connection to Cirro with your identity\n",
     "portal = DataPortal()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Option 1: Run analysis using the same set of parameters used previously"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset 'Test dataset for variant calling' contains 2 files\n"
+     ]
+    }
+   ],
+   "source": [
+    "# New dataset with FASTQs\n",
+    "input_dataset = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Test dataset for variant calling\"\n",
+    ")\n",
+    "print(f\"Dataset '{input_dataset.name}' contains {len(input_dataset.list_files()):,} files\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using the 'Align Reads (nf-core/sarek)' process (ID: process-nf-core-sarek-align-3-2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get the process to run on the dataset\n",
+    "process = portal.get_process_by_name('Align Reads (nf-core/sarek)')\n",
+    "print(f\"Using the '{process.name}' process (ID: {process.id})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using parameters from Genomic variant calling - parameter validation\n",
+      "{'WORKFLOW_VERSION': '3.2.3', 'analysis_type': {'genome': 'GATK.GRCh38', 'wes': True, 'analysis_type': 'Germline Variant Calling', 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed', 'tools': ['strelka', 'haplotypecaller']}, 'annotation': {'annotation_tool': []}, 'read_trimming_options': {'trim_fastq': False}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Previous dataset created by the pipeline\n",
+    "previous_run = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Genomic variant calling - parameter validation\"\n",
+    ")\n",
+    "print(f\"Using parameters from {previous_run.name}\")\n",
+    "print(previous_run.params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Started new analysis: ID f7ca7e1b-d64c-4747-b647-0e984db87aa5\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Start a new run, using the parameters from the previous run\n",
+    "new_dataset_id = input_dataset.run_analysis(\n",
+    "    name=\"Genomic variant calling - new run\",\n",
+    "    description='Test from SDK',\n",
+    "    process=process,\n",
+    "    params=previous_run.params\n",
+    ")\n",
+    "print(f\"Started new analysis: ID {new_dataset_id}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Option 2: Build parameters from scratch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -39,24 +144,37 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Project 'Test Project' contains 104 datasets\n",
-      "Dataset 'Test dataset for variant calling' contains 2 files\n",
-      "Using the 'Variant Calling (nf-core/sarek)' process (ID: process-nf-core-sarek-3-0-1)\n"
+      "Project 'Pipeline Development' contains 709 datasets\n"
      ]
     }
    ],
    "source": [
     "# Get the project by name\n",
-    "project = portal.get_project_by_name('Test Project') \n",
-    "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")\n",
-    "\n",
+    "project = portal.get_project_by_name('Pipeline Development') \n",
+    "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset 'Test dataset for variant calling' contains 2 files\n"
+     ]
+    }
+   ],
+   "source": [
     "# Get a particular dataset from that project\n",
     "dataset = project.get_dataset_by_name('Test dataset for variant calling')\n",
-    "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")\n",
-    "\n",
-    "# Get the process to run on the dataset\n",
-    "process = portal.get_process_by_id('process-nf-core-sarek-3-0-1')\n",
-    "print(f\"Using the '{process.name}' process (ID: {process.id})\")"
+    "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")"
    ]
   },
   {
@@ -72,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -84,15 +202,15 @@
      "output_type": "stream",
      "text": [
       "Parameters:\n",
-      "\tExperiment Design (Group)\n",
+      "\tWorkflow Version (key=workflow_version, default=3.6.0, type=string, enum=['3.1', '3.1.1', '3.1.2', '3.2.3', '3.3.2', '3.4.4', '3.5.1', '3.6.0'], description=Select the specific version of nf-core/sarek used for analysis)\n",
+      "\tExperimental Design (Group)\n",
       "\t\tReference Genome (key=genome, default=GATK.GRCh38, type=string, enum=['GATK.GRCh38', 'GATK.GRCh37', 'GRCm38'])\n",
       "\t\tWhole Exome/Targeted Gene Panel Assay (key=wes, type=boolean, description=Please indicate if your data was generated using a capture kit.)\n",
       "\t\tGenomic intervals (key=intervals, type=string, description=Target bed file in case of whole exome or targeted sequencing or intervals file for parallelization.)\n",
-      "\t\tVariant Calling Type (key=analysis_type, default=Germline Variant Calling, enum=['Germline Variant Calling', 'Somatic Variant Calling'])\n",
-      "\tVariant Annotation (Group)\n",
-      "\t\tAnnotation tool(s) (key=annotation_tool, type=array, description=Please select one or both variant annotation tools.)\n",
       "\tRead Trimming Options (Group)\n",
-      "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n"
+      "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n",
+      "\tAdvanced Options (Group)\n",
+      "\t\tMarkDuplicates - Optical Duplicate Pixel Distance (key=optical_duplicate_pixel_distance, default=100, type=integer, description=The `--OPTICAL_DUPLICATE_PIXEL_DISTANCE` parameter is used by MarkDuplicates to set the maximum offset between two duplicate clusters in pixels for them to be considered optical duplicates. A value of 100 is generally appropriate for unpatterned Illumina flowcells and 250 is appropriate for patterned Illumina flow cells.)\n"
      ]
     }
    ],
@@ -114,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 9,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -126,13 +244,15 @@
      "output_type": "stream",
      "text": [
       "The BED references available are:\n",
-      "GRCh38_Chr20\n",
-      " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n",
-      " - wgs_calling_regions.hg19.bed\n",
+      "wgs_calling_regions.hg19.bed\n",
+      " - hg38\n",
+      " - epi2me-labs-wf-human-variation-ref\n",
       " - wgs_calling_regions.hg38.bed\n",
+      " - GRCh38_Chr20\n",
+      " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n",
       "\n",
       "The reference library we are using is: GRCh38_Chr20\n",
-      "The absolute path to the file is: s3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n"
+      "The absolute path to the file is: s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n"
      ]
     }
    ],
@@ -153,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 10,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -163,25 +283,37 @@
     {
      "data": {
       "text/plain": [
-       "{'genome': 'GATK.GRCh38',\n",
-       " 'wes': True,\n",
-       " 'intervals': 's3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
-       " 'trim_fastq': False,\n",
-       " 'annotation_tool': ['cnvkit', 'deepvariant']}"
+       "{'WORKFLOW_VERSION': '3.2.3',\n",
+       " 'analysis_type': {'genome': 'GATK.GRCh38',\n",
+       "  'wes': True,\n",
+       "  'analysis_type': 'Germline Variant Calling',\n",
+       "  'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
+       "  'tools': ['strelka', 'haplotypecaller']},\n",
+       " 'annotation': {'annotation_tool': []},\n",
+       " 'read_trimming_options': {'trim_fastq': False}}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "params = {\n",
-    "    'genome': 'GATK.GRCh38',\n",
-    "    'wes': True,\n",
-    "    'intervals': reference_library.absolute_path,\n",
-    "    'trim_fastq': False,\n",
-    "    'annotation_tool': ['cnvkit', 'deepvariant']\n",
+    "    'WORKFLOW_VERSION': '3.2.3',\n",
+    "    'analysis_type': {\n",
+    "        'genome': 'GATK.GRCh38',\n",
+    "        'wes': True,\n",
+    "        'analysis_type': 'Germline Variant Calling',\n",
+    "        'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
+    "        'tools': ['strelka', 'haplotypecaller']\n",
+    "    },\n",
+    "    'annotation': {\n",
+    "        'annotation_tool': []\n",
+    "    },\n",
+    "    'read_trimming_options': {\n",
+    "        'trim_fastq': False\n",
+    "    }\n",
     "}\n",
     "params"
    ]
@@ -200,7 +332,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -225,7 +357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -236,16 +368,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "71ec598c-368b-47a5-84c8-c209739b050a\n"
+      "ca8eee87-09d9-4abe-ba0e-4e6ba48b33fa\n"
      ]
     }
    ],
    "source": [
     "# Run the analysis, specifying a name and description for the resulting dataset\n",
-    "new_dataset_id = dataset.run_analysis(\n",
+    "new_dataset_id = input_dataset.run_analysis(\n",
     "    name='Variant Calling Analysis',\n",
     "    description='Test from SDK',\n",
-    "    process='process-nf-core-sarek-3-0-1',\n",
+    "    process=process,\n",
     "    params=params\n",
     ")\n",
     "print(new_dataset_id)"
@@ -275,7 +407,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -284,5 +416,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }

From ed9916e2c4104c6379554955e591e95b4458de8f Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 15:56:11 -0700
Subject: [PATCH 21/24] Optionally filter the files downloaded from a dataset

---
 cirro/sdk/dataset.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index 6873d1d3..d96a5540 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -398,16 +398,21 @@ def list_artifacts(self) -> List[DataPortalFile]:
             ]
         )
 
-    def download_files(self, download_location: str = None) -> None:
+    def download_files(self, download_location: str = None, glob: str = None) -> None:
         """
         Download all the files from the dataset to a local directory.
 
         Args:
             download_location (str): Path to local directory
+            glob (str): Optional wildcard expression to filter which files are downloaded
+                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
+                If omitted, all files are downloaded.
         """
 
-        # Alias for internal method
-        self.list_files().download(download_location)
+        files = self.list_files()
+        if glob is not None:
+            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
+        files.download(download_location)
 
     def run_analysis(
             self,

From 220c9ea29b0f277e412d1e0737d88542cf450829 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 16:01:25 -0700
Subject: [PATCH 22/24] Add tests for reading files

---
 tests/test_read_files.py | 46 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/test_read_files.py b/tests/test_read_files.py
index 8bb56cc6..2292d0e9 100644
--- a/tests/test_read_files.py
+++ b/tests/test_read_files.py
@@ -354,6 +354,52 @@ def test_neither_glob_nor_pattern_raises(self):
             list(self.dataset.read_files())
 
 
+class TestDatasetDownloadFiles(unittest.TestCase):
+    def setUp(self):
+        self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n')
+        self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n')
+        self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n')
+        self.dataset = _make_dataset_with_files([
+            self.csv_file,
+            self.tsv_file,
+            self.txt_file,
+        ])
+        for f in [self.csv_file, self.tsv_file, self.txt_file]:
+            f.download = Mock(return_value=None)
+
+    def _downloaded_paths(self):
+        return [
+            f.relative_path
+            for f in [self.csv_file, self.tsv_file, self.txt_file]
+            if f.download.called
+        ]
+
+    def test_no_glob_downloads_all(self):
+        self.dataset.download_files(download_location='/tmp')
+        self.assertEqual(len(self._downloaded_paths()), 3)
+
+    def test_glob_filters_to_matching_files(self):
+        self.dataset.download_files(download_location='/tmp', glob='*.csv')
+        downloaded = self._downloaded_paths()
+        self.assertEqual(downloaded, ['data/results.csv'])
+
+    def test_glob_matches_multiple_files(self):
+        self.dataset.download_files(download_location='/tmp', glob='data/*')
+        downloaded = self._downloaded_paths()
+        self.assertIn('data/results.csv', downloaded)
+        self.assertIn('data/counts.tsv', downloaded)
+        self.assertNotIn('logs/run.log', downloaded)
+
+    def test_glob_no_match_downloads_nothing(self):
+        self.dataset.download_files(download_location='/tmp', glob='*.parquet')
+        self.assertEqual(len(self._downloaded_paths()), 0)
+
+    def test_globstar_filters_by_subdirectory(self):
+        self.dataset.download_files(download_location='/tmp', glob='logs/**')
+        downloaded = self._downloaded_paths()
+        self.assertEqual(downloaded, ['logs/run.log'])
+
+
 class TestPatternToRegex(unittest.TestCase):
     def _match(self, pattern, path):
         compiled, _ = _pattern_to_captures_regex(pattern)

From 3e271bf603cf8813626a5e00680deda4fbf12e8e Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 16:24:16 -0700
Subject: [PATCH 23/24] Add get_trace and get_logs

---
 cirro/sdk/dataset.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index d96a5540..205a14d6 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -372,6 +372,24 @@ def read_file(
 
         return _read_file_with_format(file, filetype, **kwargs)
 
+    def get_trace(self) -> Any:
+        """
+        Read the Nextflow workflow trace file for this dataset as a DataFrame.
+
+        Returns:
+            `pandas.DataFrame`
+        """
+        return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')
+
+    def get_logs(self) -> str:
+        """
+        Read the Nextflow workflow logs for this dataset as a string.
+
+        Returns:
+            str
+        """
+        return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()
+
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
         Get the artifact of a particular type from the dataset

From 96c81a959afeda88367bfc486dbebc27dd0d15d6 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@gmail.com>
Date: Thu, 19 Mar 2026 16:26:19 -0700
Subject: [PATCH 24/24] Update samples

---
 samples/Downloading_a_dataset.ipynb  | 202 +++---------
 samples/Interacting_with_files.ipynb | 466 ++++++++++++++++-----------
 2 files changed, 319 insertions(+), 349 deletions(-)

diff --git a/samples/Downloading_a_dataset.ipynb b/samples/Downloading_a_dataset.ipynb
index 71f372ff..cca284a6 100644
--- a/samples/Downloading_a_dataset.ipynb
+++ b/samples/Downloading_a_dataset.ipynb
@@ -34,7 +34,10 @@
     }
    },
    "source": [
-    "You can get the list of all projects which are available, and select a particular project by name"
+    "If you don't know the exact name or ID of the dataset you want to download,\n",
+    "you can get the list of all projects which are available, and select a particular project by name.\n",
+    "\n",
+    "### Inspecting datasets"
    ]
   },
   {
@@ -46,9 +49,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "There are 3 projects available\n",
-      "Selected the project 'Test Project' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n",
-      "This project contains 104 datasets to choose from\n"
+      "There are 5 projects available\n",
+      "Selected the project 'Pipeline Development' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n",
+      "This project contains 709 datasets to choose from\n"
      ]
     }
    ],
@@ -56,7 +59,7 @@
     "print(f\"There are {len(portal.list_projects()):,} projects available\")\n",
     "# print(portal.list_projects()) # run this line to see all the projects\n",
     "\n",
-    "project = portal.get_project_by_name(\"Test Project\")\n",
+    "project = portal.get_project_by_name(\"Pipeline Development\")\n",
     "print(f\"Selected the project '{project.name}' (ID: {project.id})\")\n",
     "print(f\"This project contains {len(project.list_datasets()):,} datasets to choose from\")"
    ]
@@ -82,17 +85,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Name: Test of mageck-count (updated headnode code 9/22/2022) (3)\n",
-      "Id: bcda3e84-1abe-4d08-86b0-690ea7e1cdad\n",
-      "Description: Test of mageck-count (updated headnode code 9/22/2022)\n",
+      "Name: Genomic variant calling - parameter validation\n",
+      "Id: 3fb7e8f8-b62d-43a6-ad08-eb28f59bd141\n",
+      "Description: None\n",
       "Status: COMPLETED\n"
      ]
     }
    ],
    "source": [
     "# Datasets can be selected by name or by ID\n",
-    "dataset = project.get_dataset_by_id(\"bcda3e84-1abe-4d08-86b0-690ea7e1cdad\")\n",
-    "# dataset = project.get_dataset_by_name(\"Test of mageck-count\")\n",
+    "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n",
     "print(dataset)"
    ]
   },
@@ -104,191 +106,63 @@
     }
    },
    "source": [
-    "Download all of the files from that dataset to a temporary folder"
+    "### Downloading files\n",
+    "\n",
+    "Download all of the files from that dataset (to a temporary folder in this case)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can also just select that dataset in a single call\n",
+    "dataset = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Genomic variant calling - parameter validation\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     },
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading file MO_Brunello_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.46MB/s\n",
-      "Downloading file MO_Brunello_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.83MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 2.16MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.39MB/s\n",
-      "Downloading file multiqc_report.html (1.12 MB) | 100.0%|█████████████████████████ | 1.35MB/s\n",
-      "Downloading file MO_Brunello_1.json (72.07 KB) | 100.0%|█████████████████████████ | 285kB/s\n",
-      "Downloading file MO_Brunello_1_fastqc.html (804.22 KB) | 100.0%|█████████████████████████ | 1.15MB/s\n",
-      "Downloading file MO_Brunello_2.json (72.07 KB) | 100.0%|█████████████████████████ | 349kB/s\n",
-      "Downloading file MO_Brunello_2_fastqc.html (824.26 KB) | 100.0%|█████████████████████████ | 1.19MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.json (72.53 KB) | 100.0%|█████████████████████████ | 319kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1_fastqc.html (824.76 KB) | 100.0%|█████████████████████████ | 2.10MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.json (71.84 KB) | 100.0%|█████████████████████████ | 289kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2_fastqc.html (815.26 KB) | 100.0%|█████████████████████████ | 1.95MB/s\n",
-      "Downloading file MO_Brunello_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.62MB/s\n",
-      "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.09MB/s\n",
-      "Downloading file MO_Brunello_1.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 1.42kB/s\n",
-      "Downloading file MO_Brunello_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.61MB/s\n",
-      "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.72MB/s\n",
-      "Downloading file MO_Brunello_2.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 2.28kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 2.82MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.57MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.countsummary.txt (247.00 B) | 100.0%|█████████████████████████ | 2.57kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.40MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.52MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.countsummary.txt (246.00 B) | 100.0%|█████████████████████████ | 2.33kB/s\n",
-      "Downloading file counts.txt (1.99 MB) | 100.0%|█████████████████████████ | 3.48MB/s\n",
-      "Downloading file sample_names.txt (65.00 B) | 100.0%|█████████████████████████ | 662B/s\n",
-      "Downloading file summary.txt (366.00 B) | 100.0%|█████████████████████████ | 2.41kB/s\n",
-      "Downloading file MO_Brunello_1.log (2.39 KB) | 100.0%|█████████████████████████ | 11.1kB/s\n",
-      "Downloading file MO_Brunello_2.log (2.39 KB) | 100.0%|█████████████████████████ | 16.1kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.log (2.43 KB) | 100.0%|█████████████████████████ | 23.2kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.log (2.43 KB) | 100.0%|█████████████████████████ | 19.4kB/s\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "dataset.download_files(\"/tmp\")"
+    "# dataset.download_files(\"/tmp\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Alternatively, you can inspect and filter the list of files to only what is needed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data/cutadapt/trim/fastq/MO_Brunello_1.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_2.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_gDNA_1.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_gDNA_2.fastq (920000 bytes)\n",
-      "\n",
-      "data/fastqc/multiqc_report.html (1173155 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_1/MO_Brunello_1.json (73803 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_1/MO_Brunello_1_fastqc.html (823526 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_2/MO_Brunello_2.json (73797 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_2/MO_Brunello_2_fastqc.html (844044 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1.json (74268 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1_fastqc.html (844554 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2.json (73563 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2_fastqc.html (834827 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.count.txt (1625955 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.countsummary.txt (237 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count.txt (1625955 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.countsummary.txt (237 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count.txt (1625960 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.countsummary.txt (247 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count.txt (1625960 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.countsummary.txt (246 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/counts.txt (2090653 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/sample_names.txt (65 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/summary.txt (366 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_1.log (2449 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_2.log (2449 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_gDNA_1.log (2489 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_gDNA_2.log (2488 bytes)\n"
-     ]
-    }
-   ],
-   "source": [
-    "files = dataset.list_files()\n",
-    "print(files)"
+    "Alternatively, you can pass a `glob` pattern to download only the files that are needed"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n"
-     ]
-    }
-   ],
-   "source": [
-    "norm_counts = files.filter_by_pattern(\"*.count_normalized.txt\")\n",
-    "print(norm_counts)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.86MB/s\n",
-      "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.78MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.86MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.27MB/s\n"
+      "Downloading file ERR031935.haplotypecaller.filtered.vcf.gz (401.08 KB) | 100.0%|█████████████████████████ | 1.71MB/s\n",
+      "Downloading file ERR031935.haplotypecaller.vcf.gz (357.77 KB) | 100.0%|█████████████████████████ | 1.50MB/s\n",
+      "Downloading file ERR031935.strelka.genome.vcf.gz (12.29 MB) | 100.0%|█████████████████████████ | 6.54MB/s\n",
+      "Downloading file ERR031935.strelka.variants.vcf.gz (970.75 KB) | 100.0%|█████████████████████████ | 2.55MB/s\n"
      ]
     }
    ],
    "source": [
-    "norm_counts.download(\"/tmp\")"
+    "dataset.download_files(\"/tmp\", glob=\"*.vcf.gz\")"
    ]
   },
   {
@@ -315,7 +189,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -324,5 +198,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }
diff --git a/samples/Interacting_with_files.ipynb b/samples/Interacting_with_files.ipynb
index 929d9dfb..91b35b48 100644
--- a/samples/Interacting_with_files.ipynb
+++ b/samples/Interacting_with_files.ipynb
@@ -13,28 +13,37 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    },
     "ExecuteTime": {
      "end_time": "2025-03-25T19:16:07.482109Z",
      "start_time": "2025-03-25T19:16:06.304549Z"
+    },
+    "pycharm": {
+     "name": "#%%\n"
     }
    },
+   "outputs": [],
    "source": [
     "from cirro import DataPortal\n",
     "\n",
-    "portal = DataPortal()"
-   ],
-   "outputs": [],
-   "execution_count": 1
+    "portal = DataPortal(base_url=\"\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Find the file you are looking for by defining the project and dataset, then searching for a particular file of interest based on a pattern using `filter_by_pattern`"
+    "Find the file you are looking for by defining the project and dataset, then using `read_file` or `read_files` to read file contents directly into Python objects.\n",
+    "\n",
+    "The file format is inferred automatically from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified with the `format` parameter."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Inspecting files"
    ]
   },
   {
@@ -50,31 +59,27 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The project Test Project contains 104 datasets\n",
-      "Dataset Test of mageck-count contains 32 files\n",
-      "Selected the file: data/mageck/count/combined/counts.txt (2090653 bytes)\n"
+      "Dataset: Genomic variant calling - parameter validation\n",
+      "Files: 235\n",
+      "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.filtered.vcf.gz\n",
+      "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.vcf.gz\n",
+      "data/variant_calling/strelka/ERR031935/ERR031935.strelka.genome.vcf.gz\n",
+      "data/variant_calling/strelka/ERR031935/ERR031935.strelka.variants.vcf.gz\n"
      ]
     }
    ],
    "source": [
     "# Get the project which contains the dataset\n",
-    "project = portal.get_project_by_name('Test Project')\n",
-    "\n",
-    "# Get the set of datasets within that project\n",
-    "all_datasets = project.list_datasets()\n",
-    "print(f\"The project {project.name} contains {len(all_datasets):,} datasets\")\n",
+    "project = portal.get_project_by_name(\"Pipeline Development\")\n",
     "\n",
     "# Get the dataset of interest based on its name\n",
-    "dataset = all_datasets.get_by_name('Test of mageck-count')\n",
-    "\n",
-    "# Get the complete list of files in that dataset\n",
-    "files = dataset.list_files()\n",
-    "print(f\"Dataset {dataset.name} contains {len(files):,} files\")\n",
-    "\n",
-    "# Filter to just the files named counts.txt (using the wildcard to match the string of folders it is in)\n",
-    "counts = files.filter_by_pattern(\"*/counts.txt\")\n",
+    "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n",
     "\n",
-    "print(f\"Selected the file: {counts.description()}\")"
+    "print(f\"Dataset: {dataset.name}\")\n",
+    "print(f\"Files: {len(dataset.list_files()):,}\")\n",
+    "for file in dataset.list_files():\n",
+    "    if file.name.endswith('.vcf.gz'):\n",
+    "        print(file.name)"
    ]
   },
   {
@@ -85,7 +90,9 @@
     }
    },
    "source": [
-    "Load the contents of that file into a DataFrame (keeping in mind that it is tab-delimited, not the default comma-delimited)"
+    "### Reading a file\n",
+    "\n",
+    "Read a single file into a DataFrame using `read_file`. The tab-separated format is specified explicitly with `sep='\\t'`."
    ]
   },
   {
@@ -118,78 +125,109 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>sgRNA</th>\n",
-       "      <th>Gene</th>\n",
-       "      <th>MO_Brunello_gDNA_2</th>\n",
-       "      <th>MO_Brunello_1</th>\n",
-       "      <th>MO_Brunello_2</th>\n",
-       "      <th>MO_Brunello_gDNA_1</th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>A1BG_0</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>60826</td>\n",
+       "      <td>.</td>\n",
+       "      <td>T</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=17;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>A1BG_1</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>60850</td>\n",
+       "      <td>.</td>\n",
+       "      <td>A</td>\n",
+       "      <td>T</td>\n",
+       "      <td>1</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=4</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>A1BG_2</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62437</td>\n",
+       "      <td>.</td>\n",
+       "      <td>C</td>\n",
+       "      <td>T</td>\n",
+       "      <td>3</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=22;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>A1BG_3</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62467</td>\n",
+       "      <td>.</td>\n",
+       "      <td>C</td>\n",
+       "      <td>A</td>\n",
+       "      <td>4</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>A1CF_36946</td>\n",
-       "      <td>A1CF</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62469</td>\n",
+       "      <td>.</td>\n",
+       "      <td>G</td>\n",
+       "      <td>A</td>\n",
+       "      <td>3</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=3</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "        sgRNA  Gene  MO_Brunello_gDNA_2  MO_Brunello_1  MO_Brunello_2  \\\n",
-       "0      A1BG_0  A1BG                   0              0              0   \n",
-       "1      A1BG_1  A1BG                   0              0              0   \n",
-       "2      A1BG_2  A1BG                   0              0              0   \n",
-       "3      A1BG_3  A1BG                   0              0              2   \n",
-       "4  A1CF_36946  A1CF                   0              0              0   \n",
+       "       0      1  2  3  4  5                                   6  \\\n",
+       "0  chr20  60826  .  T  A  1  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "1  chr20  60850  .  A  T  1  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "2  chr20  62437  .  C  T  3  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "3  chr20  62467  .  C  A  4  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "4  chr20  62469  .  G  A  3  LowDepth;LowGQX;NoPassedVariantGTs   \n",
        "\n",
-       "   MO_Brunello_gDNA_1  \n",
-       "0                   0  \n",
-       "1                   2  \n",
-       "2                   0  \n",
-       "3                   0  \n",
-       "4                   0  "
+       "                 7                                     8  \\\n",
+       "0  MQ=17;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "1  MQ=24;SNVHPOL=4  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "2  MQ=22;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "3  MQ=24;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "4  MQ=24;SNVHPOL=3  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "\n",
+       "                                                   9  \n",
+       "0  0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...  \n",
+       "1  0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...  \n",
+       "2  0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...  \n",
+       "3  0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...  \n",
+       "4  0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...  "
       ]
      },
      "execution_count": 3,
@@ -198,56 +236,72 @@
     }
    ],
    "source": [
-    "df = counts[0].read_csv(sep=\"\\t\")\n",
+    "# Read a single file matched by a glob pattern\n",
+    "df = dataset.read_file(glob=\"*.variants.vcf.gz\", filetype=\"csv\", sep=\"\\t\", comment=\"#\", header=None)\n",
     "df.head()"
    ]
   },
   {
+   "cell_type": "markdown",
    "metadata": {},
+   "source": [
+    "### Reading multiple files\n",
+    "\n",
+    "Use `read_files` to iterate over multiple matching files. With `{name}` capture placeholders in the `pattern`, extracted values are returned alongside each file's content."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'sample': 'ERR031935', 'type': 'genome'} (790381, 10)\n",
+      "{'sample': 'ERR031935', 'type': 'variants'} (36318, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Extract values from each file's path (folder and file name parts) using {name} placeholders\n",
+    "for df, meta in dataset.read_files(\n",
+    "    pattern=\"*/strelka/{sample}/*.strelka.{type}.vcf.gz\",\n",
+    "    filetype=\"csv\",\n",
+    "    sep=\"\\t\",\n",
+    "    comment=\"#\",\n",
+    "    header=None\n",
+    "):\n",
+    "    print(meta, df.shape)"
+   ]
+  },
+  {
    "cell_type": "markdown",
-   "source": "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs."
+   "metadata": {},
+   "source": [
+    "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting metadata"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-03-25T19:16:35.472469Z",
      "start_time": "2025-03-25T19:16:31.215624Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "from cirro_api_client.v1.models import ArtifactType\n",
-    "\n",
-    "# Reading nextflow trace file\n",
-    "trace_file = dataset.get_artifact(ArtifactType.WORKFLOW_TRACE)\n",
-    "trace_df = trace_file.read_csv(sep=\"\\t\")\n",
-    "trace_df.head()"
-   ],
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "   task_id       hash                             native_id  \\\n",
-       "0        7  99/b42c07  826623a0-0ed5-44ff-8a94-e3802cccf531   \n",
-       "1        5  71/8e3d51  ace41478-ba98-403d-a6d1-3e95ad64c36f   \n",
-       "2        8  71/535e08  9d499098-6ed7-422b-9233-9983f775fdee   \n",
-       "3        1  41/c494ef  3a221dd3-7ca8-41e1-8212-856b6154be64   \n",
-       "4        2  25/13b116  94f91d55-1d41-4afd-88b4-743d75817032   \n",
-       "\n",
-       "                     name     status  exit                   submit duration  \\\n",
-       "0  trim:trim_adapters (4)  COMPLETED     0  2022-05-24 16:27:01.413   5m 38s   \n",
-       "1  trim:trim_adapters (3)  COMPLETED     0  2022-05-24 16:27:01.421   5m 38s   \n",
-       "2              fastqc (4)  COMPLETED     0  2022-05-24 16:27:01.464   5m 48s   \n",
-       "3              fastqc (1)  COMPLETED     0  2022-05-24 16:27:01.465   5m 48s   \n",
-       "4  trim:trim_adapters (1)  COMPLETED     0  2022-05-24 16:27:01.476   5m 58s   \n",
-       "\n",
-       "  realtime    %cpu  peak_rss peak_vmem    rchar     wchar  \n",
-       "0       1s   76.6%    3.1 MB    5.4 MB   1.8 MB  900.5 KB  \n",
-       "1       4s    6.4%   11.6 MB   17.3 MB   1.8 MB  900.5 KB  \n",
-       "2       3s  104.8%  152.7 MB    3.2 GB  15.9 MB    4.1 MB  \n",
-       "3       3s  102.5%  140.2 MB    3.2 GB    16 MB    4.1 MB  \n",
-       "4       1s   75.8%    3.1 MB    5.4 MB   1.8 MB  900.5 KB  "
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -287,128 +341,170 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>7</td>\n",
-       "      <td>99/b42c07</td>\n",
-       "      <td>826623a0-0ed5-44ff-8a94-e3802cccf531</td>\n",
-       "      <td>trim:trim_adapters (4)</td>\n",
+       "      <td>fb/18dde6</td>\n",
+       "      <td>4a268ebd-7d6d-42e7-8753-a9ee3f0b1aca</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.413</td>\n",
-       "      <td>5m 38s</td>\n",
-       "      <td>1s</td>\n",
-       "      <td>76.6%</td>\n",
-       "      <td>3.1 MB</td>\n",
+       "      <td>2023-08-29 18:55:47.794</td>\n",
+       "      <td>2m 52s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>79.1%</td>\n",
+       "      <td>3 MB</td>\n",
        "      <td>5.4 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>79.8 KB</td>\n",
+       "      <td>3.6 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>5</td>\n",
-       "      <td>71/8e3d51</td>\n",
-       "      <td>ace41478-ba98-403d-a6d1-3e95ad64c36f</td>\n",
-       "      <td>trim:trim_adapters (3)</td>\n",
+       "      <td>6</td>\n",
+       "      <td>e0/e394ac</td>\n",
+       "      <td>4195506d-60cd-4771-ac03-e801adeb7794</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:CREATE_IN...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.421</td>\n",
-       "      <td>5m 38s</td>\n",
-       "      <td>4s</td>\n",
-       "      <td>6.4%</td>\n",
-       "      <td>11.6 MB</td>\n",
-       "      <td>17.3 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>2023-08-29 18:55:47.807</td>\n",
+       "      <td>2m 53s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>171.4%</td>\n",
+       "      <td>2.9 MB</td>\n",
+       "      <td>11 MB</td>\n",
+       "      <td>43 KB</td>\n",
+       "      <td>1.9 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>8</td>\n",
-       "      <td>71/535e08</td>\n",
-       "      <td>9d499098-6ed7-422b-9233-9983f775fdee</td>\n",
-       "      <td>fastqc (4)</td>\n",
+       "      <td>21</td>\n",
+       "      <td>57/3dfeca</td>\n",
+       "      <td>563917fe-c79c-419c-a4c8-081457e9241a</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.464</td>\n",
-       "      <td>5m 48s</td>\n",
-       "      <td>3s</td>\n",
-       "      <td>104.8%</td>\n",
-       "      <td>152.7 MB</td>\n",
-       "      <td>3.2 GB</td>\n",
-       "      <td>15.9 MB</td>\n",
-       "      <td>4.1 MB</td>\n",
+       "      <td>2023-08-29 18:58:41.177</td>\n",
+       "      <td>38.5s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>82.8%</td>\n",
+       "      <td>3.1 MB</td>\n",
+       "      <td>5.4 MB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>1</td>\n",
-       "      <td>41/c494ef</td>\n",
-       "      <td>3a221dd3-7ca8-41e1-8212-856b6154be64</td>\n",
-       "      <td>fastqc (1)</td>\n",
+       "      <td>23</td>\n",
+       "      <td>37/ebcc21</td>\n",
+       "      <td>27342717-5e4c-4ce3-81bd-5dd04192c7a7</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.465</td>\n",
-       "      <td>5m 48s</td>\n",
-       "      <td>3s</td>\n",
-       "      <td>102.5%</td>\n",
-       "      <td>140.2 MB</td>\n",
-       "      <td>3.2 GB</td>\n",
-       "      <td>16 MB</td>\n",
-       "      <td>4.1 MB</td>\n",
+       "      <td>2023-08-29 18:58:41.319</td>\n",
+       "      <td>38.7s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>86.7%</td>\n",
+       "      <td>3.1 MB</td>\n",
+       "      <td>5.4 MB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>2</td>\n",
-       "      <td>25/13b116</td>\n",
-       "      <td>94f91d55-1d41-4afd-88b4-743d75817032</td>\n",
-       "      <td>trim:trim_adapters (1)</td>\n",
+       "      <td>20</td>\n",
+       "      <td>09/9937ff</td>\n",
+       "      <td>4324507a-80a9-4957-9b99-6de4a949fd62</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.476</td>\n",
-       "      <td>5m 58s</td>\n",
-       "      <td>1s</td>\n",
-       "      <td>75.8%</td>\n",
+       "      <td>2023-08-29 18:58:41.352</td>\n",
+       "      <td>39.1s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>84.7%</td>\n",
        "      <td>3.1 MB</td>\n",
        "      <td>5.4 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "   task_id       hash                             native_id  \\\n",
+       "0        7  fb/18dde6  4a268ebd-7d6d-42e7-8753-a9ee3f0b1aca   \n",
+       "1        6  e0/e394ac  4195506d-60cd-4771-ac03-e801adeb7794   \n",
+       "2       21  57/3dfeca  563917fe-c79c-419c-a4c8-081457e9241a   \n",
+       "3       23  37/ebcc21  27342717-5e4c-4ce3-81bd-5dd04192c7a7   \n",
+       "4       20  09/9937ff  4324507a-80a9-4957-9b99-6de4a949fd62   \n",
+       "\n",
+       "                                                name     status  exit  \\\n",
+       "0  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "1  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:CREATE_IN...  COMPLETED     0   \n",
+       "2  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "3  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "4  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "\n",
+       "                    submit duration realtime    %cpu peak_rss peak_vmem  \\\n",
+       "0  2023-08-29 18:55:47.794   2m 52s      0ms   79.1%     3 MB    5.4 MB   \n",
+       "1  2023-08-29 18:55:47.807   2m 53s      0ms  171.4%   2.9 MB     11 MB   \n",
+       "2  2023-08-29 18:58:41.177    38.5s      0ms   82.8%   3.1 MB    5.4 MB   \n",
+       "3  2023-08-29 18:58:41.319    38.7s      0ms   86.7%   3.1 MB    5.4 MB   \n",
+       "4  2023-08-29 18:58:41.352    39.1s      0ms   84.7%   3.1 MB    5.4 MB   \n",
+       "\n",
+       "     rchar   wchar  \n",
+       "0  79.8 KB  3.6 KB  \n",
+       "1    43 KB  1.9 KB  \n",
+       "2    79 KB  3.2 KB  \n",
+       "3    79 KB  3.2 KB  \n",
+       "4    79 KB  3.2 KB  "
       ]
      },
-     "execution_count": 3,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 3
+   "source": [
+    "# Reading nextflow trace file\n",
+    "trace = dataset.get_trace()\n",
+    "trace.head()"
+   ]
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2025-03-25T19:18:48.517520Z",
-     "start_time": "2025-03-25T19:18:48.161651Z"
-    }
-   },
    "cell_type": "code",
-   "source": [
-    "from IPython.display import display, SVG\n",
-    "\n",
-    "# Displaying the workflow graph\n",
-    "graph = dataset.get_artifact(ArtifactType.WORKFLOW_DAG)\n",
-    "display(SVG(graph.read()))"
-   ],
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<IPython.core.display.SVG object>"
-      ],
-      "image/svg+xml": "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"513pt\" height=\"471pt\" viewBox=\"0.00 0.00 512.90 470.60\">\n<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 466.6)\">\n<title>flowchart</title>\n<polygon fill=\"white\" stroke=\"white\" points=\"-4,5 -4,-466.6 509.9,-466.6 509.9,5 -4,5\"/>\n<!-- p0 -->\n<g id=\"node1\" class=\"node\"><title>p0</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"137.5\" cy=\"-444\" rx=\"3.6\" ry=\"3.6\"/>\n<text text-anchor=\"middle\" x=\"68.9\" y=\"-451.4\" font-family=\"Times,serif\" font-size=\"14.00\">Channel.fromPath</text>\n</g>\n<!-- p2 -->\n<g id=\"node2\" class=\"node\"><title>p2</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"95.5\" cy=\"-370\" rx=\"39.4691\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"95.5\" y=\"-366.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastqc</text>\n</g>\n<!-- p0&#45;&gt;p2 -->\n<g id=\"edge1\" class=\"edge\"><title>p0-&gt;p2</title>\n<path fill=\"none\" stroke=\"black\" d=\"M134.182,-442.465C126.048,-440.956 105.146,-435.836 96.5,-422 92.1927,-415.107 90.8454,-406.601 90.8892,-398.525\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"94.3962,-398.551 91.6163,-388.327 87.4139,-398.053 94.3962,-398.551\"/>\n<text text-anchor=\"middle\" x=\"127\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastq_ch</text>\n</g>\n<!-- p10 -->\n<g id=\"node11\" class=\"node\"><title>p10</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"257.5\" cy=\"-370\" rx=\"99.1619\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"257.5\" y=\"-366.3\" font-family=\"Times,serif\" font-size=\"14.00\">trim:trim_adapters</text>\n</g>\n<!-- p0&#45;&gt;p10 -->\n<g id=\"edge9\" class=\"edge\"><title>p0-&gt;p10</title>\n<path fill=\"none\" stroke=\"black\" d=\"M140.127,-441.252C148.262,-435.856 173.793,-419.038 195.5,-406 203.122,-401.422 211.355,-396.654 219.255,-392.163\"/>\n<polygon 
fill=\"black\" stroke=\"black\" points=\"220.994,-395.2 227.982,-387.236 217.553,-389.105 220.994,-395.2\"/>\n<text text-anchor=\"middle\" x=\"226\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastq_ch</text>\n</g>\n<!-- p3 -->\n<g id=\"node5\" class=\"node\"><title>p3</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"70.5\" cy=\"-282\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p2&#45;&gt;p3 -->\n<g id=\"edge3\" class=\"edge\"><title>p2-&gt;p3</title>\n<path fill=\"none\" stroke=\"black\" d=\"M90.5608,-352.009C85.5572,-334.797 77.9645,-308.678 73.6989,-294.004\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"76.9742,-292.732 70.8218,-284.107 70.2524,-294.686 76.9742,-292.732\"/>\n</g>\n<!-- p6 -->\n<g id=\"node6\" class=\"node\"><title>p6</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"94.5\" cy=\"-282\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"45.5\" y=\"-289.3\" font-family=\"Times,serif\" font-size=\"14.00\">toSortedList</text>\n</g>\n<!-- p2&#45;&gt;p6 -->\n<g id=\"edge4\" class=\"edge\"><title>p2-&gt;p6</title>\n<path fill=\"none\" stroke=\"black\" d=\"M95.2976,-351.597C95.1052,-335.046 94.8202,-310.54 94.6481,-295.737\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.1472,-295.63 94.5311,-285.671 91.1477,-295.711 98.1472,-295.63\"/>\n</g>\n<!-- p4 -->\n<g id=\"node7\" class=\"node\"><title>p4</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"186.5\" cy=\"-282\" rx=\"70.2909\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"186.5\" y=\"-278.3\" font-family=\"Times,serif\" font-size=\"14.00\">parse_fastqc</text>\n</g>\n<!-- p2&#45;&gt;p4 -->\n<g id=\"edge5\" class=\"edge\"><title>p2-&gt;p4</title>\n<path fill=\"none\" stroke=\"black\" d=\"M112.191,-353.226C126.037,-340.141 145.929,-321.342 161.648,-306.487\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"164.121,-308.965 168.985,-299.553 159.313,-303.878 164.121,-308.965\"/>\n</g>\n<!-- p1 -->\n<g id=\"node3\" 
class=\"node\"><title>p1</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"378.5\" cy=\"-370\" rx=\"3.6\" ry=\"3.6\"/>\n<text text-anchor=\"middle\" x=\"439.9\" y=\"-377.4\" font-family=\"Times,serif\" font-size=\"14.00\">Channel.fromPath</text>\n</g>\n<!-- p11 -->\n<g id=\"node4\" class=\"node\"><title>p11</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"281.5\" cy=\"-282\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"308.5\" y=\"-289.3\" font-family=\"Times,serif\" font-size=\"14.00\">combine</text>\n</g>\n<!-- p1&#45;&gt;p11 -->\n<g id=\"edge2\" class=\"edge\"><title>p1-&gt;p11</title>\n<path fill=\"none\" stroke=\"black\" d=\"M376.819,-366.432C374.492,-362.967 370.055,-356.692 365.5,-352 339.969,-325.706 329.686,-323.425 301.5,-300 298.206,-297.262 294.634,-294.243 291.433,-291.518\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"293.462,-288.647 283.589,-284.802 288.909,-293.965 293.462,-288.647\"/>\n<text text-anchor=\"middle\" x=\"407\" y=\"-322.3\" font-family=\"Times,serif\" font-size=\"14.00\">Channel_Library</text>\n</g>\n<!-- p13 -->\n<g id=\"node13\" class=\"node\"><title>p13</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"299.5\" cy=\"-194\" rx=\"78.4642\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"299.5\" y=\"-190.3\" font-family=\"Times,serif\" font-size=\"14.00\">mageck_count</text>\n</g>\n<!-- p11&#45;&gt;p13 -->\n<g id=\"edge12\" class=\"edge\"><title>p11-&gt;p13</title>\n<path fill=\"none\" stroke=\"black\" d=\"M282.019,-278.519C283.735,-270.32 289.366,-243.419 293.778,-222.337\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"297.263,-222.773 295.886,-212.268 290.411,-221.339 297.263,-222.773\"/>\n</g>\n<!-- p7 -->\n<g id=\"node9\" class=\"node\"><title>p7</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"94.5\" cy=\"-194\" rx=\"46.2191\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"94.5\" y=\"-190.3\" font-family=\"Times,serif\" font-size=\"14.00\">multiqc</text>\n</g>\n<!-- p6&#45;&gt;p7 
-->\n<g id=\"edge7\" class=\"edge\"><title>p6-&gt;p7</title>\n<path fill=\"none\" stroke=\"black\" d=\"M94.5,-278.139C94.5,-269.502 94.5,-243.011 94.5,-222.227\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.0001,-222.004 94.5,-212.004 91.0001,-222.004 98.0001,-222.004\"/>\n</g>\n<!-- p5 -->\n<g id=\"node8\" class=\"node\"><title>p5</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"186.5\" cy=\"-194\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p4&#45;&gt;p5 -->\n<g id=\"edge6\" class=\"edge\"><title>p4-&gt;p5</title>\n<path fill=\"none\" stroke=\"black\" d=\"M186.5,-263.597C186.5,-246.516 186.5,-220.965 186.5,-206.348\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"190,-206.095 186.5,-196.095 183,-206.095 190,-206.095\"/>\n</g>\n<!-- p8 -->\n<g id=\"node10\" class=\"node\"><title>p8</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"94.5\" cy=\"-134\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p7&#45;&gt;p8 -->\n<g id=\"edge8\" class=\"edge\"><title>p7-&gt;p8</title>\n<path fill=\"none\" stroke=\"black\" d=\"M94.5,-175.912C94.5,-166.322 94.5,-154.628 94.5,-146.202\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.0001,-146.056 94.5,-136.056 91.0001,-146.056 98.0001,-146.056\"/>\n</g>\n<!-- p10&#45;&gt;p11 -->\n<g id=\"edge11\" class=\"edge\"><title>p10-&gt;p11</title>\n<path fill=\"none\" stroke=\"black\" d=\"M262.242,-352.009C266.891,-335.35 273.868,-310.349 278.024,-295.455\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"281.504,-296.005 280.821,-285.432 274.762,-294.124 281.504,-296.005\"/>\n</g>\n<!-- p9 -->\n<g id=\"node12\" class=\"node\"><title>p9</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"263.5\" cy=\"-444\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p9&#45;&gt;p10 -->\n<g id=\"edge10\" class=\"edge\"><title>p9-&gt;p10</title>\n<path fill=\"none\" stroke=\"black\" d=\"M263.272,-440.265C262.696,-433.354 261.132,-414.578 259.776,-398.316\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"263.247,-397.818 258.929,-388.143 
256.271,-398.399 263.247,-397.818\"/>\n<text text-anchor=\"middle\" x=\"284\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n<!-- p14 -->\n<g id=\"node15\" class=\"node\"><title>p14</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"286.5\" cy=\"-134\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p13&#45;&gt;p14 -->\n<g id=\"edge14\" class=\"edge\"><title>p13-&gt;p14</title>\n<path fill=\"none\" stroke=\"black\" d=\"M295.67,-175.912C293.494,-166.205 290.835,-154.342 288.942,-145.896\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"292.339,-145.049 286.737,-136.056 285.509,-146.58 292.339,-145.049\"/>\n</g>\n<!-- p15 -->\n<g id=\"node16\" class=\"node\"><title>p15</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"311.5\" cy=\"-134\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"262.5\" y=\"-141.3\" font-family=\"Times,serif\" font-size=\"14.00\">toSortedList</text>\n</g>\n<!-- p13&#45;&gt;p15 -->\n<g id=\"edge15\" class=\"edge\"><title>p13-&gt;p15</title>\n<path fill=\"none\" stroke=\"black\" d=\"M303.035,-175.912C304.909,-166.858 307.17,-155.93 308.883,-147.647\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"312.345,-148.193 310.943,-137.691 305.49,-146.774 312.345,-148.193\"/>\n</g>\n<!-- p12 -->\n<g id=\"node14\" class=\"node\"><title>p12</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"314.5\" cy=\"-282\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p12&#45;&gt;p13 -->\n<g id=\"edge13\" class=\"edge\"><title>p12-&gt;p13</title>\n<path fill=\"none\" stroke=\"black\" d=\"M314.001,-278.139C312.488,-269.462 307.832,-242.767 304.199,-221.942\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"307.632,-221.254 302.466,-212.004 300.736,-222.456 307.632,-221.254\"/>\n<text text-anchor=\"middle\" x=\"330\" y=\"-234.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n<!-- p17 -->\n<g id=\"node17\" class=\"node\"><title>p17</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"327.5\" 
cy=\"-60\" rx=\"114.085\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"327.5\" y=\"-56.3\" font-family=\"Times,serif\" font-size=\"14.00\">mageck_merge_single</text>\n</g>\n<!-- p15&#45;&gt;p17 -->\n<g id=\"edge16\" class=\"edge\"><title>p15-&gt;p17</title>\n<path fill=\"none\" stroke=\"black\" d=\"M312.108,-130.265C313.644,-123.354 317.816,-104.578 321.43,-88.3163\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"324.938,-88.6641 323.69,-78.1429 318.104,-87.1455 324.938,-88.6641\"/>\n</g>\n<!-- p19 -->\n<g id=\"node19\" class=\"node\"><title>p19</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"315.5\" cy=\"-2\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p17&#45;&gt;p19 -->\n<g id=\"edge18\" class=\"edge\"><title>p17-&gt;p19</title>\n<path fill=\"none\" stroke=\"black\" d=\"M323.834,-41.8939C321.88,-32.775 319.535,-21.8299 317.831,-13.8767\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"321.231,-13.0404 315.713,-3.99575 314.386,-14.5072 321.231,-13.0404\"/>\n</g>\n<!-- p18 -->\n<g id=\"node20\" class=\"node\"><title>p18</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"340.5\" cy=\"-2\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p17&#45;&gt;p18 -->\n<g id=\"edge19\" class=\"edge\"><title>p17-&gt;p18</title>\n<path fill=\"none\" stroke=\"black\" d=\"M331.471,-41.8939C333.588,-32.775 336.129,-21.8299 337.975,-13.8767\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"341.417,-14.5282 340.269,-3.99575 334.598,-12.9452 341.417,-14.5282\"/>\n</g>\n<!-- p16 -->\n<g id=\"node18\" class=\"node\"><title>p16</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"344.5\" cy=\"-134\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p16&#45;&gt;p17 -->\n<g id=\"edge17\" class=\"edge\"><title>p16-&gt;p17</title>\n<path fill=\"none\" stroke=\"black\" d=\"M343.854,-130.265C342.222,-123.354 337.789,-104.578 333.95,-88.3163\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"337.252,-87.071 331.548,-78.1429 330.439,-88.6796 337.252,-87.071\"/>\n<text text-anchor=\"middle\" 
x=\"361\" y=\"-100.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n</g>\n</svg>"
+       "['PW_WORKFLOW_SCRIPT=main.nf',\n",
+       " 'PW_AWS_REGION=us-west-2',\n",
+       " 'PW_BATCH_JOB_ROLE=arn:aws:iam::523221283927:role/Cirro-BatchJobRole-9a31492a',\n",
+       " 'PW_S3_TRANSFORM_WORKFLOW=s3://pubweb-resources-develop/process/hutch/data-transforms/workflow/',\n",
+       " 'PW_ONDEMAND_JOB_QUEUE=arn:aws:batch:us-west-2:523221283927:job-queue/Cirro-OnDemand-9a31492a',\n",
+       " 'PW_DATASET=3fb7e8f8-b62d-43a6-ad08-eb28f59bd141',\n",
+       " 'PW_WORKFLOW_VERSION=3.2.3',\n",
+       " 'PW_S3_RESTORE_SESSION_DIR=s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7-scratch/workdir/session/acc47b45-df28-4120-ab0d-7106ba7c5fc4',\n",
+       " 'PW_SPOT_JOB_QUEUE=arn:aws:batch:us-west-2:523221283927:job-queue/Cirro-Spot-9a31492a',\n",
+       " 'PW_S3_DATASET=s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/datasets/3fb7e8f8-b62d-43a6-ad08-eb28f59bd141']"
+      ]
      },
+     "execution_count": 7,
      "metadata": {},
-     "output_type": "display_data"
+     "output_type": "execute_result"
     }
    ],
-   "execution_count": 6
+   "source": [
+    "# Get the logs\n",
+    "logs = dataset.get_logs()\n",
+    "logs.split(\"\\n\")[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -427,7 +523,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -436,5 +532,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }