From e4cb0f4d74a0b363e97e00397771172fd8c2e6da Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Mar 2026 06:29:53 +0000 Subject: [PATCH 01/24] Add read_files method to DataPortalDataset and DataPortalProject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a read_files(pattern, file_format=None, **kwargs) method to both DataPortalDataset and DataPortalProject. The method accepts a standard glob pattern string (e.g. '*.csv', 'data/**/*.tsv.gz'), filters dataset files using PurePath.match, and yields (DataPortalFile, content) tuples. File format is auto-detected from the extension (.csv/.tsv → DataFrame, .h5ad → AnnData, anything else → str) or can be specified explicitly. Parsing kwargs are forwarded to the underlying read method (e.g. sep='\t' for read_csv). Project-level read_files delegates to each dataset in turn. https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU --- cirro/sdk/dataset.py | 85 ++++++++++++++++++++- cirro/sdk/project.py | 55 +++++++++++++- tests/test_read_files.py | 160 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 tests/test_read_files.py diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 10a76aa1..b5a9424c 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -1,12 +1,13 @@ import datetime from pathlib import Path -from typing import Union, List, Optional +from typing import Union, List, Optional, Iterator, Tuple, Any from cirro_api_client.v1.api.processes import validate_file_requirements from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \ RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, Executor, ValidateFileRequirementsRequest from cirro.cirro_client import CirroApi +from cirro.file_utils import filter_files_by_pattern from cirro.models.assets import DatasetAssets from cirro.models.file import PathLike from cirro.sdk.asset import DataPortalAssets, DataPortalAsset @@ -17,6 +18,37 @@ from cirro.sdk.process import DataPortalProcess +def _infer_file_format(path: str) -> str: + """Infer the file format from the file extension.""" + path_lower = path.lower() + for ext in ('.gz', '.bz2', '.xz', '.zst'): + if path_lower.endswith(ext): + path_lower = path_lower[:-len(ext)] + break + if path_lower.endswith('.csv') or path_lower.endswith('.tsv'): + return 'csv' + elif path_lower.endswith('.h5ad'): + return 'h5ad' + else: + return 'text' + + +def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any: + """Read a file using the specified format, or auto-detect from extension.""" + if file_format is None: + file_format = _infer_file_format(file.relative_path) + if file_format == 'csv': + return file.read_csv(**kwargs) + elif file_format == 'h5ad': + return file.read_h5ad() + elif file_format == 'text': + return file.read(**kwargs) + else: + raise DataPortalInputError( + f"Unsupported file_format: '{file_format}'. Supported values: 'csv', 'h5ad', 'text'" + ) + + class DataPortalDataset(DataPortalAsset): """ Datasets in the Data Portal are collections of files which have @@ -199,6 +231,57 @@ def list_files(self) -> DataPortalFiles: ] ) + def read_files( + self, + pattern: str, + file_format: str = None, + **kwargs + ) -> Iterator[Tuple[DataPortalFile, Any]]: + """ + Read the contents of files in the dataset matching the given glob pattern. + + Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``). + ``*`` matches any sequence of characters within a single path segment; + ``**`` matches zero or more path segments. + + Args: + pattern (str): Glob pattern used to match file paths within the dataset + (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``) + file_format (str): File format used to parse each file. Supported values: + + - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` + - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) + - ``'text'``: read as plain text, returns a ``str`` + - ``None`` (default): infer from file extension + (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``) + **kwargs: Additional keyword arguments forwarded to the file-parsing function. + For ``'csv'`` format these are passed to :func:`pandas.read_csv` + (e.g., ``sep='\\t'`` for TSV files). + For ``'text'`` format these are passed to + :meth:`~cirro.sdk.file.DataPortalFile.read`. + + Yields: + Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file, + where *content* type depends on *file_format*. + + Example: + ```python + # Read all CSV files in a dataset + for file, df in dataset.read_files('*.csv'): + print(file.relative_path, df.shape) + + # Read gzip-compressed TSV files using explicit format and separator + for file, df in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): + print(file.relative_path, df.shape) + + # Read plain-text log files + for file, text in dataset.read_files('logs/*.log', file_format='text'): + print(file.relative_path, text[:200]) + ``` + """ + for file in filter_files_by_pattern(list(self.list_files()), pattern): + yield file, _read_file_with_format(file, file_format, **kwargs) + def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ Get the artifact of a particular type from the dataset diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index 099872b9..7633de56 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -1,6 +1,6 @@ from functools import cache from time import sleep -from typing import List, Union +from typing import List, Union, Iterator, Tuple, Any from cirro_api_client.v1.models import Project, UploadDatasetRequest, Dataset, Sample, Tag, Status @@ -9,6 +9,7 @@ from cirro.sdk.asset import DataPortalAssets, DataPortalAsset from cirro.sdk.dataset import DataPortalDataset, DataPortalDatasets from cirro.sdk.exceptions import DataPortalAssetNotFound, DataPortalInputError +from cirro.sdk.file import DataPortalFile from cirro.sdk.helpers import parse_process_name_or_id from cirro.sdk.process import DataPortalProcess from cirro.sdk.reference import DataPortalReference, DataPortalReferences @@ -235,6 +236,58 @@ def samples(self, max_items: int = 10000) -> List[Sample]: """ return self._client.metadata.get_project_samples(self.id, max_items) + def read_files( + self, + pattern: str, + file_format: str = None, + **kwargs + ) -> Iterator[Tuple[DataPortalFile, Any]]: + """ + Read the contents of files across all datasets in the project that match + the given glob pattern. + + Iterates over every dataset in the project and yields matching files from + each one in turn. See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` + for full details on pattern matching and format options. + + Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``). + ``*`` matches any sequence of characters within a single path segment; + ``**`` matches zero or more path segments. + + Args: + pattern (str): Glob pattern used to match file paths within each dataset + (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``) + file_format (str): File format used to parse each file. Supported values: + + - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` + - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) + - ``'text'``: read as plain text, returns a ``str`` + - ``None`` (default): infer from file extension + (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``) + **kwargs: Additional keyword arguments forwarded to the file-parsing function. + For ``'csv'`` format these are passed to :func:`pandas.read_csv` + (e.g., ``sep='\\t'`` for TSV files). + For ``'text'`` format these are passed to + :meth:`~cirro.sdk.file.DataPortalFile.read`. + + Yields: + Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file + across all datasets, where *content* type depends on *file_format*. + + Example: + ```python + # Read all CSV files across every dataset in a project + for file, df in project.read_files('*.csv'): + print(file.relative_path, df.shape) + + # Read gzip-compressed TSV files with explicit separator + for file, df in project.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): + print(file.relative_path, df.shape) + ``` + """ + for dataset in self.list_datasets(): + yield from dataset.read_files(pattern, file_format=file_format, **kwargs) + class DataPortalProjects(DataPortalAssets[DataPortalProject]): """Collection of DataPortalProject objects""" diff --git a/tests/test_read_files.py b/tests/test_read_files.py new file mode 100644 index 00000000..05f4b39f --- /dev/null +++ b/tests/test_read_files.py @@ -0,0 +1,160 @@ +import unittest +from unittest.mock import Mock, patch, MagicMock + +from cirro.models.file import File, FileAccessContext +from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format +from cirro.sdk.exceptions import DataPortalInputError +from cirro.sdk.file import DataPortalFile, DataPortalFiles + + +def _make_mock_file(relative_path: str, content: bytes = b'') -> DataPortalFile: + """Create a DataPortalFile with a mocked _get method.""" + access_context = Mock(spec=FileAccessContext) + file = File(relative_path=relative_path, size=len(content), access_context=access_context) + client = Mock() + client.file.get_file.return_value = content + portal_file = DataPortalFile(file=file, client=client) + return portal_file + + +def _make_dataset_with_files(files: list) -> DataPortalDataset: + """Create a DataPortalDataset whose list_files() returns the given DataPortalFile list.""" + dataset_data = Mock() + dataset_data.id = 'ds-1' + dataset_data.project_id = 'proj-1' + dataset_data.name = 'Test Dataset' + + client = Mock() + dataset = DataPortalDataset(dataset=dataset_data, client=client) + dataset.list_files = Mock(return_value=DataPortalFiles(files)) + return dataset + + +class TestInferFileFormat(unittest.TestCase): + def test_csv_extension(self): + self.assertEqual(_infer_file_format('data/results.csv'), 'csv') + + def test_tsv_extension(self): + self.assertEqual(_infer_file_format('data/results.tsv'), 'csv') + + def test_csv_gz_extension(self): + self.assertEqual(_infer_file_format('data/results.csv.gz'), 'csv') + + def test_tsv_gz_extension(self): + self.assertEqual(_infer_file_format('data/results.tsv.gz'), 'csv') + + def test_h5ad_extension(self): + self.assertEqual(_infer_file_format('data/adata.h5ad'), 'h5ad') + + def test_text_fallback(self): + self.assertEqual(_infer_file_format('data/notes.txt'), 'text') + + def test_log_fallback(self): + self.assertEqual(_infer_file_format('logs/run.log'), 'text') + + def test_unknown_extension_fallback(self): + self.assertEqual(_infer_file_format('data/file.xyz'), 'text') + + +class TestReadFileWithFormat(unittest.TestCase): + def setUp(self): + self.file = _make_mock_file('data/results.csv', b'a,b\n1,2\n') + + def test_csv_format(self): + import pandas as pd + df = _read_file_with_format(self.file, 'csv') + self.assertIsInstance(df, pd.DataFrame) + self.assertListEqual(list(df.columns), ['a', 'b']) + + def test_text_format(self): + file = _make_mock_file('data/notes.txt', b'hello world') + result = _read_file_with_format(file, 'text') + self.assertEqual(result, 'hello world') + + def test_auto_infer_csv(self): + import pandas as pd + result = _read_file_with_format(self.file, None) + self.assertIsInstance(result, pd.DataFrame) + + def test_auto_infer_text(self): + file = _make_mock_file('data/notes.txt', b'hello') + result = _read_file_with_format(file, None) + self.assertIsInstance(result, str) + + def test_unsupported_format_raises(self): + with self.assertRaises(DataPortalInputError): + _read_file_with_format(self.file, 'parquet') + + def test_csv_kwargs_passed_through(self): + import pandas as pd + file = _make_mock_file('data/data.tsv', b'a\tb\n1\t2\n') + df = _read_file_with_format(file, 'csv', sep='\t') + self.assertIsInstance(df, pd.DataFrame) + self.assertListEqual(list(df.columns), ['a', 'b']) + + +class TestDatasetReadFiles(unittest.TestCase): + def setUp(self): + self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n') + self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n') + self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n') + self.dataset = _make_dataset_with_files([ + self.csv_file, + self.tsv_file, + self.txt_file, + ]) + + def test_pattern_matches_csv(self): + results = list(self.dataset.read_files('*.csv')) + self.assertEqual(len(results), 1) + file, content = results[0] + self.assertEqual(file.relative_path, 'data/results.csv') + + def test_pattern_matches_multiple(self): + results = list(self.dataset.read_files('data/*')) + self.assertEqual(len(results), 2) + paths = {f.relative_path for f, _ in results} + self.assertIn('data/results.csv', paths) + self.assertIn('data/counts.tsv', paths) + + def test_pattern_no_match_returns_empty(self): + results = list(self.dataset.read_files('*.parquet')) + self.assertEqual(len(results), 0) + + def test_explicit_format_csv(self): + import pandas as pd + results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t')) + self.assertEqual(len(results), 1) + _, df = results[0] + self.assertIsInstance(df, pd.DataFrame) + self.assertIn('gene', df.columns) + + def test_explicit_format_text(self): + results = list(self.dataset.read_files('logs/*.log', file_format='text')) + self.assertEqual(len(results), 1) + _, content = results[0] + self.assertIsInstance(content, str) + self.assertIn('started', content) + + def test_auto_infer_csv_from_extension(self): + import pandas as pd + results = list(self.dataset.read_files('data/results.csv')) + _, content = results[0] + self.assertIsInstance(content, pd.DataFrame) + + def test_auto_infer_text_from_extension(self): + results = list(self.dataset.read_files('logs/run.log')) + _, content = results[0] + self.assertIsInstance(content, str) + + def test_yields_file_and_content_tuples(self): + results = list(self.dataset.read_files('data/*.csv')) + self.assertEqual(len(results), 1) + file, content = results[0] + self.assertIsInstance(file, DataPortalFile) + + def test_globstar_pattern(self): + results = list(self.dataset.read_files('**/*.csv')) + self.assertEqual(len(results), 1) + file, _ = results[0] + self.assertEqual(file.relative_path, 'data/results.csv') From 57c2e4e7fcd1b286a433bd38106a8fc0fb2a15f4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Mar 2026 09:38:14 +0000 Subject: [PATCH 02/24] Add support for json, parquet, feather, pickle, and excel file formats - Add read_json, read_parquet, read_feather, read_pickle, read_excel methods to DataPortalFile - Update _infer_file_format to detect .json, .parquet, .feather, .pkl/.pickle, .xlsx/.xls extensions - Update _read_file_with_format to dispatch to the new read methods - Update read_files docstring to document all supported formats - Add tests for new format inference and reading (parquet/feather tests skip without pyarrow) https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU --- cirro/sdk/dataset.py | 36 +++++++++++++- cirro/sdk/file.py | 40 +++++++++++++++ tests/test_read_files.py | 105 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 4 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index b5a9424c..edbb6901 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -29,6 +29,16 @@ def _infer_file_format(path: str) -> str: return 'csv' elif path_lower.endswith('.h5ad'): return 'h5ad' + elif path_lower.endswith('.json'): + return 'json' + elif path_lower.endswith('.parquet'): + return 'parquet' + elif path_lower.endswith('.feather'): + return 'feather' + elif path_lower.endswith('.pkl') or path_lower.endswith('.pickle'): + return 'pickle' + elif path_lower.endswith('.xlsx') or path_lower.endswith('.xls'): + return 'excel' else: return 'text' @@ -41,11 +51,22 @@ def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **k return file.read_csv(**kwargs) elif file_format == 'h5ad': return file.read_h5ad() + elif file_format == 'json': + return file.read_json(**kwargs) + elif file_format == 'parquet': + return file.read_parquet(**kwargs) + elif file_format == 'feather': + return file.read_feather(**kwargs) + elif file_format == 'pickle': + return file.read_pickle(**kwargs) + elif file_format == 'excel': + return file.read_excel(**kwargs) elif file_format == 'text': return file.read(**kwargs) else: raise DataPortalInputError( - f"Unsupported file_format: '{file_format}'. Supported values: 'csv', 'h5ad', 'text'" + f"Unsupported file_format: '{file_format}'. " + f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text'" ) @@ -251,9 +272,20 @@ def read_files( - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) + - ``'json'``: parse with :func:`json.loads`, returns a Python object + - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame`` + (requires ``pyarrow`` or ``fastparquet``) + - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame`` + (requires ``pyarrow``) + - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object + - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame`` + (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``) - ``'text'``: read as plain text, returns a ``str`` - ``None`` (default): infer from file extension - (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``) + (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, + ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, + ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``, + ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``) **kwargs: Additional keyword arguments forwarded to the file-parsing function. For ``'csv'`` format these are passed to :func:`pandas.read_csv` (e.g., ``sep='\\t'`` for TSV files). diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py index 03acd1ea..4b466ca4 100644 --- a/cirro/sdk/file.py +++ b/cirro/sdk/file.py @@ -1,4 +1,6 @@ import gzip +import json +import pickle from io import BytesIO, StringIO from typing import List @@ -141,6 +143,44 @@ def read_h5ad(self) -> 'anndata.AnnData': with BytesIO(self._get()) as handle: return ad.read_h5ad(handle) + def read_json(self, **kwargs): + """Read the file contents as a parsed JSON object (dict, list, etc.).""" + return json.loads(self._get(), **kwargs) + + def read_parquet(self, **kwargs) -> 'DataFrame': + """ + Read a Parquet file as a Pandas DataFrame. + + Requires ``pyarrow`` or ``fastparquet`` to be installed. + All keyword arguments are passed to :func:`pandas.read_parquet`. + """ + import pandas + return pandas.read_parquet(BytesIO(self._get()), **kwargs) + + def read_feather(self, **kwargs) -> 'DataFrame': + """ + Read a Feather file as a Pandas DataFrame. + + Requires ``pyarrow`` to be installed. + All keyword arguments are passed to :func:`pandas.read_feather`. + """ + import pandas + return pandas.read_feather(BytesIO(self._get()), **kwargs) + + def read_pickle(self, **kwargs): + """Read the file contents as a Python pickle object.""" + return pickle.loads(self._get(), **kwargs) + + def read_excel(self, **kwargs) -> 'DataFrame': + """ + Read an Excel file (``.xlsx`` / ``.xls``) as a Pandas DataFrame. + + Requires ``openpyxl`` (for ``.xlsx``) or ``xlrd`` (for ``.xls``). + All keyword arguments are passed to :func:`pandas.read_excel`. + """ + import pandas + return pandas.read_excel(BytesIO(self._get()), **kwargs) + def readlines(self, encoding='utf-8', compression=None) -> List[str]: """Read the file contents as a list of lines.""" diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 05f4b39f..0ffa02f2 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -1,5 +1,10 @@ +import io +import json +import pickle import unittest -from unittest.mock import Mock, patch, MagicMock +from unittest.mock import Mock + +import pandas as pd from cirro.models.file import File, FileAccessContext from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format @@ -46,6 +51,30 @@ def test_tsv_gz_extension(self): def test_h5ad_extension(self): self.assertEqual(_infer_file_format('data/adata.h5ad'), 'h5ad') + def test_json_extension(self): + self.assertEqual(_infer_file_format('data/results.json'), 'json') + + def test_json_gz_extension(self): + self.assertEqual(_infer_file_format('data/results.json.gz'), 'json') + + def test_parquet_extension(self): + self.assertEqual(_infer_file_format('data/results.parquet'), 'parquet') + + def test_feather_extension(self): + self.assertEqual(_infer_file_format('data/results.feather'), 'feather') + + def test_pickle_pkl_extension(self): + self.assertEqual(_infer_file_format('data/results.pkl'), 'pickle') + + def test_pickle_pickle_extension(self): + self.assertEqual(_infer_file_format('data/results.pickle'), 'pickle') + + def test_excel_xlsx_extension(self): + self.assertEqual(_infer_file_format('data/results.xlsx'), 'excel') + + def test_excel_xls_extension(self): + self.assertEqual(_infer_file_format('data/results.xls'), 'excel') + def test_text_fallback(self): self.assertEqual(_infer_file_format('data/notes.txt'), 'text') @@ -83,7 +112,79 @@ def test_auto_infer_text(self): def test_unsupported_format_raises(self): with self.assertRaises(DataPortalInputError): - _read_file_with_format(self.file, 'parquet') + _read_file_with_format(self.file, 'xyz_unknown') + + def test_json_format(self): + file = _make_mock_file('data/data.json', b'{"key": "value"}') + result = _read_file_with_format(file, 'json') + self.assertIsInstance(result, dict) + self.assertEqual(result['key'], 'value') + + def test_auto_infer_json(self): + file = _make_mock_file('data/data.json', b'[1, 2, 3]') + result = _read_file_with_format(file, None) + self.assertIsInstance(result, list) + self.assertEqual(result, [1, 2, 3]) + + def test_pickle_format(self): + data = {'hello': 42} + file = _make_mock_file('data/data.pkl', pickle.dumps(data)) + result = _read_file_with_format(file, 'pickle') + self.assertEqual(result, data) + + def test_auto_infer_pickle(self): + data = [1, 2, 3] + file = _make_mock_file('data/data.pkl', pickle.dumps(data)) + result = _read_file_with_format(file, None) + self.assertEqual(result, data) + + def _make_parquet_bytes(self): + buf = io.BytesIO() + pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_parquet(buf) + return buf.getvalue() + + def _make_feather_bytes(self): + buf = io.BytesIO() + pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_feather(buf) + return buf.getvalue() + + @unittest.skipUnless( + __import__('importlib').util.find_spec('pyarrow') is not None, + 'pyarrow not installed' + ) + def test_parquet_format(self): + file = _make_mock_file('data/data.parquet', self._make_parquet_bytes()) + result = _read_file_with_format(file, 'parquet') + self.assertIsInstance(result, pd.DataFrame) + self.assertListEqual(list(result.columns), ['a', 'b']) + + @unittest.skipUnless( + __import__('importlib').util.find_spec('pyarrow') is not None, + 'pyarrow not installed' + ) + def test_auto_infer_parquet(self): + file = _make_mock_file('data/data.parquet', self._make_parquet_bytes()) + result = _read_file_with_format(file, None) + self.assertIsInstance(result, pd.DataFrame) + + @unittest.skipUnless( + __import__('importlib').util.find_spec('pyarrow') is not None, + 'pyarrow not installed' + ) + def test_feather_format(self): + file = _make_mock_file('data/data.feather', self._make_feather_bytes()) + result = _read_file_with_format(file, 'feather') + self.assertIsInstance(result, pd.DataFrame) + self.assertListEqual(list(result.columns), ['a', 'b']) + + @unittest.skipUnless( + __import__('importlib').util.find_spec('pyarrow') is not None, + 'pyarrow not installed' + ) + def test_auto_infer_feather(self): + file = _make_mock_file('data/data.feather', self._make_feather_bytes()) + result = _read_file_with_format(file, None) + self.assertIsInstance(result, pd.DataFrame) def test_csv_kwargs_passed_through(self): import pandas as pd From 40cb5aeeae09a1863c3ca37da3597a1913941c8c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Mar 2026 13:40:18 +0000 Subject: [PATCH 03/24] Add {name} capture syntax to read_files for automatic path extraction - Add _pattern_to_captures_regex() that converts {name} placeholders in glob patterns to named regex groups (suffix-anchored like PurePath.match) - read_files() now always yields (file, content, captures) 3-tuples; captures is {} when the pattern has no {name} placeholders - Patterns with {name} use regex matching; plain glob patterns continue to use filter_files_by_pattern / PurePath.match unchanged - Add TestPatternToRegex suite and TestDatasetReadFiles capture tests; update all existing tests to unpack 3-tuples https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU --- cirro/sdk/dataset.py | 86 ++++++++++++++++++++++++++++---- tests/test_read_files.py | 105 +++++++++++++++++++++++++++++++++++---- 2 files changed, 171 insertions(+), 20 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index edbb6901..4c47c19f 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -1,6 +1,7 @@ import datetime +import re from pathlib import Path -from typing import Union, List, Optional, Iterator, Tuple, Any +from typing import Union, List, Optional, Iterator, Tuple, Any, Dict from cirro_api_client.v1.api.processes import validate_file_requirements from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \ @@ -18,6 +19,39 @@ from cirro.sdk.process import DataPortalProcess +def _pattern_to_captures_regex(pattern: str): + """ + Convert a glob pattern that may contain ``{name}`` capture placeholders into + a compiled regex and return ``(compiled_regex, capture_names)``. + + Conversion rules: + - ``{name}`` → named group matching a single path segment (no ``/``) + - ``*`` → matches any characters within a single path segment + - ``**`` → matches any characters including ``/`` (multiple segments) + - All other characters are regex-escaped. + + The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``): + a pattern without a leading ``/`` will match at any depth in the path. + """ + capture_names = re.findall(r'\{(\w+)\}', pattern) + tokens = re.split(r'(\*\*|\*|\{\w+\})', pattern) + parts = [] + for token in tokens: + if token == '**': + parts.append('.*') + elif token == '*': + parts.append('[^/]*') + elif re.match(r'^\{\w+\}$', token): + name = token[1:-1] + parts.append(f'(?P<{name}>[^/]+)') + else: + parts.append(re.escape(token)) + regex_str = ''.join(parts) + if not pattern.startswith('/'): + regex_str = r'(?:.+/)?' + regex_str + return re.compile('^' + regex_str + '$'), capture_names + + def _infer_file_format(path: str) -> str: """Infer the file format from the file extension.""" path_lower = path.lower() @@ -257,7 +291,7 @@ def read_files( pattern: str, file_format: str = None, **kwargs - ) -> Iterator[Tuple[DataPortalFile, Any]]: + ) -> Iterator[Tuple[DataPortalFile, Any, Dict[str, str]]]: """ Read the contents of files in the dataset matching the given glob pattern. @@ -265,9 +299,18 @@ def read_files( ``*`` matches any sequence of characters within a single path segment; ``**`` matches zero or more path segments. + **Named captures** — wrap a segment in ``{name}`` to extract that portion + of the path automatically. For example, ``{sample}.csv`` will match + ``sampleA.csv`` and ``sampleB.csv`` and return ``{'sample': 'sampleA'}`` + / ``{'sample': 'sampleB'}`` respectively in the third element of each + yielded tuple. Multiple captures are supported: + ``{condition}/{sample}.csv`` extracts both ``condition`` and ``sample`` + from a two-level path. + Args: - pattern (str): Glob pattern used to match file paths within the dataset - (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``) + pattern (str): Glob pattern used to match file paths within the dataset. + May contain ``{name}`` capture placeholders + (e.g., ``'{sample}.csv'``, ``'counts/{sample}/*.tsv.gz'``). file_format (str): File format used to parse each file. Supported values: - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` @@ -293,26 +336,47 @@ def read_files( :meth:`~cirro.sdk.file.DataPortalFile.read`. Yields: - Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file, - where *content* type depends on *file_format*. + Tuple[DataPortalFile, Any, Dict[str, str]]: + ``(file, content, captures)`` for each matching file, where: + + - *content* type depends on *file_format* + - *captures* is a ``dict`` of values extracted from ``{name}`` + placeholders in the pattern (empty ``{}`` when the pattern + contains no captures) Example: ```python # Read all CSV files in a dataset - for file, df in dataset.read_files('*.csv'): + for file, df, _ in dataset.read_files('*.csv'): print(file.relative_path, df.shape) + # Extract sample names automatically from filenames + for file, df, captures in dataset.read_files('{sample}.csv'): + print(captures['sample'], df.shape) + + # Multi-level capture: condition directory + sample filename + for file, df, captures in dataset.read_files('{condition}/{sample}.csv'): + print(captures['condition'], captures['sample'], df.shape) + # Read gzip-compressed TSV files using explicit format and separator - for file, df in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): + for file, df, _ in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): print(file.relative_path, df.shape) # Read plain-text log files - for file, text in dataset.read_files('logs/*.log', file_format='text'): + for file, text, _ in dataset.read_files('logs/*.log', file_format='text'): print(file.relative_path, text[:200]) ``` """ - for file in filter_files_by_pattern(list(self.list_files()), pattern): - yield file, _read_file_with_format(file, file_format, **kwargs) + has_captures = bool(re.search(r'\{\w+\}', pattern)) + if has_captures: + compiled_regex, _ = _pattern_to_captures_regex(pattern) + for file in self.list_files(): + m = compiled_regex.match(file.relative_path) + if m is not None: + yield file, _read_file_with_format(file, file_format, **kwargs), m.groupdict() + else: + for file in filter_files_by_pattern(list(self.list_files()), pattern): + yield file, _read_file_with_format(file, file_format, **kwargs), {} def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 0ffa02f2..3f330958 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -7,7 +7,7 @@ import pandas as pd from cirro.models.file import File, FileAccessContext -from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format +from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format, _pattern_to_captures_regex from cirro.sdk.exceptions import DataPortalInputError from cirro.sdk.file import DataPortalFile, DataPortalFiles @@ -208,13 +208,14 @@ def setUp(self): def test_pattern_matches_csv(self): results = list(self.dataset.read_files('*.csv')) self.assertEqual(len(results), 1) - file, content = results[0] + file, content, captures = results[0] self.assertEqual(file.relative_path, 'data/results.csv') + self.assertEqual(captures, {}) def test_pattern_matches_multiple(self): results = list(self.dataset.read_files('data/*')) self.assertEqual(len(results), 2) - paths = {f.relative_path for f, _ in results} + paths = {f.relative_path for f, _, _ in results} self.assertIn('data/results.csv', paths) self.assertIn('data/counts.tsv', paths) @@ -226,36 +227,122 @@ def test_explicit_format_csv(self): import pandas as pd results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t')) self.assertEqual(len(results), 1) - _, df = results[0] + _, df, _ = results[0] self.assertIsInstance(df, pd.DataFrame) self.assertIn('gene', df.columns) def test_explicit_format_text(self): results = list(self.dataset.read_files('logs/*.log', file_format='text')) self.assertEqual(len(results), 1) - _, content = results[0] + _, content, _ = results[0] self.assertIsInstance(content, str) self.assertIn('started', content) def test_auto_infer_csv_from_extension(self): import pandas as pd results = list(self.dataset.read_files('data/results.csv')) - _, content = results[0] + _, content, _ = results[0] self.assertIsInstance(content, pd.DataFrame) def test_auto_infer_text_from_extension(self): results = list(self.dataset.read_files('logs/run.log')) - _, content = results[0] + _, content, _ = results[0] self.assertIsInstance(content, str) def test_yields_file_and_content_tuples(self): results = list(self.dataset.read_files('data/*.csv')) self.assertEqual(len(results), 1) - file, content = results[0] + file, content, captures = results[0] self.assertIsInstance(file, DataPortalFile) + self.assertEqual(captures, {}) def test_globstar_pattern(self): results = list(self.dataset.read_files('**/*.csv')) self.assertEqual(len(results), 1) - file, _ = results[0] + file, _, _ = results[0] self.assertEqual(file.relative_path, 'data/results.csv') + + # --- capture pattern tests --- + + def test_capture_simple_filename(self): + # {sample}.csv should match data/results.csv and capture sample='results' + results = list(self.dataset.read_files('{sample}.csv')) + self.assertEqual(len(results), 1) + file, _, captures = results[0] + self.assertEqual(file.relative_path, 'data/results.csv') + self.assertEqual(captures['sample'], 'results') + + def test_capture_with_directory(self): + # data/{sample}.csv should match data/results.csv + results = list(self.dataset.read_files('data/{sample}.csv')) + self.assertEqual(len(results), 1) + _, _, captures = results[0] + self.assertEqual(captures['sample'], 'results') + + def test_capture_multiple_files(self): + # {sample}.csv matches both csv files at depth; capture distinct names + dataset = _make_dataset_with_files([ + _make_mock_file('sampleA.csv', b'a\n1\n'), + _make_mock_file('sampleB.csv', b'a\n2\n'), + _make_mock_file('notes.txt', b'text'), + ]) + results = list(dataset.read_files('{sample}.csv')) + self.assertEqual(len(results), 2) + captured = {c['sample'] for _, _, c in results} + self.assertSetEqual(captured, {'sampleA', 'sampleB'}) + + def test_capture_multi_level(self): + # {condition}/{sample}.csv extracts two path segments + dataset = _make_dataset_with_files([ + _make_mock_file('treated/sampleA.csv', b'x\n1\n'), + _make_mock_file('control/sampleB.csv', b'x\n2\n'), + ]) + results = list(dataset.read_files('{condition}/{sample}.csv')) + self.assertEqual(len(results), 2) + by_sample = {c['sample']: c['condition'] for _, _, c in results} + self.assertEqual(by_sample['sampleA'], 'treated') + self.assertEqual(by_sample['sampleB'], 'control') + + def test_capture_no_match_returns_empty(self): + results = list(self.dataset.read_files('{sample}.parquet')) + self.assertEqual(len(results), 0) + + def test_capture_returns_empty_dict_when_no_placeholders(self): + results = list(self.dataset.read_files('*.csv')) + _, _, captures = results[0] + self.assertEqual(captures, {}) + + +class TestPatternToRegex(unittest.TestCase): + def _match(self, pattern, path): + compiled, names = _pattern_to_captures_regex(pattern) + m = compiled.match(path) + return m.groupdict() if m else None + + def test_simple_capture(self): + self.assertEqual(self._match('{sample}.csv', 'sampleA.csv'), {'sample': 'sampleA'}) + + def test_simple_capture_with_directory(self): + self.assertEqual(self._match('{sample}.csv', 'data/sampleA.csv'), {'sample': 'sampleA'}) + + def test_directory_capture(self): + self.assertEqual(self._match('data/{sample}.csv', 'data/results.csv'), {'sample': 'results'}) + + def test_multi_level_capture(self): + result = self._match('{condition}/{sample}.csv', 'treated/sampleA.csv') + self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'}) + + def test_multi_level_capture_with_prefix(self): + result = self._match('{condition}/{sample}.csv', 'data/treated/sampleA.csv') + self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'}) + + def test_no_match_returns_none(self): + self.assertIsNone(self._match('{sample}.csv', 'sampleA.tsv')) + + def test_wildcard_mixed_with_capture(self): + result = self._match('data/*/{sample}.csv', 'data/subdir/sampleA.csv') + self.assertEqual(result, {'sample': 'sampleA'}) + + def test_capture_names_returned(self): + _, names = _pattern_to_captures_regex('{condition}/{sample}.csv') + self.assertListEqual(names, ['condition', 'sample']) From dec37a0334495e66829df3abb9337bed611ee74e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Mar 2026 14:07:23 +0000 Subject: [PATCH 04/24] Replace positional pattern arg with explicit glob= and pattern= kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read_files() now takes two mutually exclusive keyword arguments: - glob='*.csv' → yields content per matching file - pattern='{sample}.csv' → yields (content, captures) per matching file Passing both or neither raises DataPortalInputError. This makes the return type unambiguous: glob always gives a flat iterator of content, pattern always gives (content, captures) 2-tuples. https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU --- cirro/sdk/dataset.py | 99 ++++++++++++++++--------------- tests/test_read_files.py | 122 ++++++++++++++++++--------------------- 2 files changed, 109 insertions(+), 112 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 4c47c19f..a5900fcd 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -288,29 +288,38 @@ def list_files(self) -> DataPortalFiles: def read_files( self, - pattern: str, + glob: str = None, + pattern: str = None, file_format: str = None, **kwargs - ) -> Iterator[Tuple[DataPortalFile, Any, Dict[str, str]]]: + ): """ - Read the contents of files in the dataset matching the given glob pattern. + Read the contents of files in the dataset. - Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``). - ``*`` matches any sequence of characters within a single path segment; - ``**`` matches zero or more path segments. + Exactly one of ``glob`` or ``pattern`` must be provided. - **Named captures** — wrap a segment in ``{name}`` to extract that portion - of the path automatically. For example, ``{sample}.csv`` will match - ``sampleA.csv`` and ``sampleB.csv`` and return ``{'sample': 'sampleA'}`` - / ``{'sample': 'sampleB'}`` respectively in the third element of each - yielded tuple. Multiple captures are supported: - ``{condition}/{sample}.csv`` extracts both ``condition`` and ``sample`` - from a two-level path. + **glob** — standard wildcard matching; yields the file content for each + matching file: + + - ``*`` matches any characters within a single path segment + - ``**`` matches zero or more path segments + - Matching is suffix-anchored (``*.csv`` matches at any depth) + + **pattern** — like ``glob`` but ``{name}`` placeholders capture portions + of the path automatically; yields ``(content, captures)`` pairs where + *captures* is a ``dict`` of extracted values: + + - ``{name}`` captures one path segment (no ``/``) + - ``*`` and ``**`` wildcards work as in ``glob`` Args: - pattern (str): Glob pattern used to match file paths within the dataset. - May contain ``{name}`` capture placeholders - (e.g., ``'{sample}.csv'``, ``'counts/{sample}/*.tsv.gz'``). + glob (str): Wildcard expression to match files + (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``). + Yields one item per matching file: the parsed content. + pattern (str): Wildcard expression with ``{name}`` capture + placeholders (e.g., ``'{sample}.csv'``, + ``'{condition}/{sample}.csv'``). + Yields ``(content, captures)`` per matching file. file_format (str): File format used to parse each file. Supported values: - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` @@ -329,54 +338,52 @@ def read_files( ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``, ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``) - **kwargs: Additional keyword arguments forwarded to the file-parsing function. - For ``'csv'`` format these are passed to :func:`pandas.read_csv` - (e.g., ``sep='\\t'`` for TSV files). - For ``'text'`` format these are passed to - :meth:`~cirro.sdk.file.DataPortalFile.read`. + **kwargs: Additional keyword arguments forwarded to the file-parsing + function (e.g., ``sep='\\t'`` for CSV/TSV files). Yields: - Tuple[DataPortalFile, Any, Dict[str, str]]: - ``(file, content, captures)`` for each matching file, where: + - When using ``glob``: *content* for each matching file + - When using ``pattern``: ``(content, captures)`` for each matching file, + where *captures* is a ``dict`` of values extracted from ``{name}`` + placeholders - - *content* type depends on *file_format* - - *captures* is a ``dict`` of values extracted from ``{name}`` - placeholders in the pattern (empty ``{}`` when the pattern - contains no captures) + Raises: + DataPortalInputError: if both ``glob`` and ``pattern`` are provided, + or if neither is provided. Example: ```python - # Read all CSV files in a dataset - for file, df, _ in dataset.read_files('*.csv'): - print(file.relative_path, df.shape) + # Read all CSV files — just the content + for df in dataset.read_files(glob='*.csv'): + print(df.shape) - # Extract sample names automatically from filenames - for file, df, captures in dataset.read_files('{sample}.csv'): + # Extract sample names from filenames automatically + for df, captures in dataset.read_files(pattern='{sample}.csv'): print(captures['sample'], df.shape) # Multi-level capture: condition directory + sample filename - for file, df, captures in dataset.read_files('{condition}/{sample}.csv'): + for df, captures in dataset.read_files(pattern='{condition}/{sample}.csv'): print(captures['condition'], captures['sample'], df.shape) - # Read gzip-compressed TSV files using explicit format and separator - for file, df, _ in dataset.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): - print(file.relative_path, df.shape) - - # Read plain-text log files - for file, text, _ in dataset.read_files('logs/*.log', file_format='text'): - print(file.relative_path, text[:200]) + # Read gzip-compressed TSV files with explicit separator + for df in dataset.read_files(glob='**/*.tsv.gz', file_format='csv', sep='\\t'): + print(df.shape) ``` """ - has_captures = bool(re.search(r'\{\w+\}', pattern)) - if has_captures: + if glob is not None and pattern is not None: + raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other") + if glob is None and pattern is None: + raise DataPortalInputError("Must specify either 'glob' or 'pattern'") + + if glob is not None: + for file in filter_files_by_pattern(list(self.list_files()), glob): + yield _read_file_with_format(file, file_format, **kwargs) + else: compiled_regex, _ = _pattern_to_captures_regex(pattern) for file in self.list_files(): m = compiled_regex.match(file.relative_path) if m is not None: - yield file, _read_file_with_format(file, file_format, **kwargs), m.groupdict() - else: - for file in filter_files_by_pattern(list(self.list_files()), pattern): - yield file, _read_file_with_format(file, file_format, **kwargs), {} + yield _read_file_with_format(file, file_format, **kwargs), m.groupdict() def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 3f330958..9f74ae88 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -205,112 +205,102 @@ def setUp(self): self.txt_file, ]) - def test_pattern_matches_csv(self): - results = list(self.dataset.read_files('*.csv')) + # --- glob mode --- + + def test_glob_matches_csv(self): + results = list(self.dataset.read_files(glob='*.csv')) self.assertEqual(len(results), 1) - file, content, captures = results[0] - self.assertEqual(file.relative_path, 'data/results.csv') - self.assertEqual(captures, {}) + self.assertIsInstance(results[0], pd.DataFrame) - def test_pattern_matches_multiple(self): - results = list(self.dataset.read_files('data/*')) + def test_glob_matches_multiple(self): + results = list(self.dataset.read_files(glob='data/*')) self.assertEqual(len(results), 2) - paths = {f.relative_path for f, _, _ in results} - self.assertIn('data/results.csv', paths) - self.assertIn('data/counts.tsv', paths) - def test_pattern_no_match_returns_empty(self): - results = list(self.dataset.read_files('*.parquet')) + def test_glob_no_match_returns_empty(self): + results = list(self.dataset.read_files(glob='*.parquet')) self.assertEqual(len(results), 0) - def test_explicit_format_csv(self): - import pandas as pd - results = list(self.dataset.read_files('data/*.tsv', file_format='csv', sep='\t')) + def test_glob_explicit_format_csv(self): + results = list(self.dataset.read_files(glob='data/*.tsv', file_format='csv', sep='\t')) self.assertEqual(len(results), 1) - _, df, _ = results[0] - self.assertIsInstance(df, pd.DataFrame) - self.assertIn('gene', df.columns) + self.assertIsInstance(results[0], pd.DataFrame) + self.assertIn('gene', results[0].columns) - def test_explicit_format_text(self): - results = list(self.dataset.read_files('logs/*.log', file_format='text')) + def test_glob_explicit_format_text(self): + results = list(self.dataset.read_files(glob='logs/*.log', file_format='text')) self.assertEqual(len(results), 1) - _, content, _ = results[0] - self.assertIsInstance(content, str) - self.assertIn('started', content) + self.assertIsInstance(results[0], str) + self.assertIn('started', results[0]) - def test_auto_infer_csv_from_extension(self): - import pandas as pd - results = list(self.dataset.read_files('data/results.csv')) - _, content, _ = results[0] - self.assertIsInstance(content, pd.DataFrame) + def test_glob_auto_infer_csv_from_extension(self): + results = list(self.dataset.read_files(glob='data/results.csv')) + self.assertIsInstance(results[0], pd.DataFrame) - def test_auto_infer_text_from_extension(self): - results = list(self.dataset.read_files('logs/run.log')) - _, content, _ = results[0] - self.assertIsInstance(content, str) - - def test_yields_file_and_content_tuples(self): - results = list(self.dataset.read_files('data/*.csv')) - self.assertEqual(len(results), 1) - file, content, captures = results[0] - self.assertIsInstance(file, DataPortalFile) - self.assertEqual(captures, {}) + def test_glob_auto_infer_text_from_extension(self): + results = list(self.dataset.read_files(glob='logs/run.log')) + self.assertIsInstance(results[0], str) def test_globstar_pattern(self): - results = list(self.dataset.read_files('**/*.csv')) + results = list(self.dataset.read_files(glob='**/*.csv')) self.assertEqual(len(results), 1) - file, _, _ = results[0] - self.assertEqual(file.relative_path, 'data/results.csv') + self.assertIsInstance(results[0], pd.DataFrame) - # --- capture pattern tests --- + # --- pattern (capture) mode --- - def test_capture_simple_filename(self): - # {sample}.csv should match data/results.csv and capture sample='results' - results = list(self.dataset.read_files('{sample}.csv')) + def test_pattern_simple_filename(self): + results = list(self.dataset.read_files(pattern='{sample}.csv')) self.assertEqual(len(results), 1) - file, _, captures = results[0] - self.assertEqual(file.relative_path, 'data/results.csv') + content, captures = results[0] + self.assertIsInstance(content, pd.DataFrame) self.assertEqual(captures['sample'], 'results') - def test_capture_with_directory(self): - # data/{sample}.csv should match data/results.csv - results = list(self.dataset.read_files('data/{sample}.csv')) + def test_pattern_with_directory(self): + results = list(self.dataset.read_files(pattern='data/{sample}.csv')) self.assertEqual(len(results), 1) - _, _, captures = results[0] + _, captures = results[0] self.assertEqual(captures['sample'], 'results') - def test_capture_multiple_files(self): - # {sample}.csv matches both csv files at depth; capture distinct names + def test_pattern_multiple_files(self): dataset = _make_dataset_with_files([ _make_mock_file('sampleA.csv', b'a\n1\n'), _make_mock_file('sampleB.csv', b'a\n2\n'), _make_mock_file('notes.txt', b'text'), ]) - results = list(dataset.read_files('{sample}.csv')) + results = list(dataset.read_files(pattern='{sample}.csv')) self.assertEqual(len(results), 2) - captured = {c['sample'] for _, _, c in results} + captured = {c['sample'] for _, c in results} self.assertSetEqual(captured, {'sampleA', 'sampleB'}) - def test_capture_multi_level(self): - # {condition}/{sample}.csv extracts two path segments + def test_pattern_multi_level(self): dataset = _make_dataset_with_files([ _make_mock_file('treated/sampleA.csv', b'x\n1\n'), _make_mock_file('control/sampleB.csv', b'x\n2\n'), ]) - results = list(dataset.read_files('{condition}/{sample}.csv')) + results = list(dataset.read_files(pattern='{condition}/{sample}.csv')) self.assertEqual(len(results), 2) - by_sample = {c['sample']: c['condition'] for _, _, c in results} + by_sample = {c['sample']: c['condition'] for _, c in results} self.assertEqual(by_sample['sampleA'], 'treated') self.assertEqual(by_sample['sampleB'], 'control') - def test_capture_no_match_returns_empty(self): - results = list(self.dataset.read_files('{sample}.parquet')) + def test_pattern_no_match_returns_empty(self): + results = list(self.dataset.read_files(pattern='{sample}.parquet')) self.assertEqual(len(results), 0) - def test_capture_returns_empty_dict_when_no_placeholders(self): - results = list(self.dataset.read_files('*.csv')) - _, _, captures = results[0] - self.assertEqual(captures, {}) + def test_pattern_yields_content_and_captures_tuple(self): + results = list(self.dataset.read_files(pattern='{sample}.csv')) + content, captures = results[0] + self.assertIsInstance(captures, dict) + self.assertIn('sample', captures) + + # --- error cases --- + + def test_both_glob_and_pattern_raises(self): + with self.assertRaises(DataPortalInputError): + list(self.dataset.read_files(glob='*.csv', pattern='{sample}.csv')) + + def test_neither_glob_nor_pattern_raises(self): + with self.assertRaises(DataPortalInputError): + list(self.dataset.read_files()) class TestPatternToRegex(unittest.TestCase): From 916ab8a0c15d4400041bb2ccdad4ba5ecf7ff38d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 19 Mar 2026 16:35:56 +0000 Subject: [PATCH 05/24] Require dataset argument on project.read_files() Instead of iterating across all datasets, read_files() on DataPortalProject now requires a dataset argument (name, ID, or DataPortalDataset object) and delegates to that dataset's read_files(). The glob/pattern/file_format interface is otherwise unchanged. https://claude.ai/code/session_01TANa5jJ1qzDMzoV8qCjpuU --- cirro/sdk/project.py | 80 ++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index 18678026..48a1e722 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -1,6 +1,6 @@ from functools import cache from time import sleep -from typing import List, Union, Iterator, Tuple, Any +from typing import List, Union from cirro_api_client.v1.models import Project, UploadDatasetRequest, Dataset, Sample, Tag, Status @@ -238,55 +238,61 @@ def samples(self, max_items: int = 10000) -> List[Sample]: def read_files( self, - pattern: str, + dataset: Union[str, DataPortalDataset], + glob: str = None, + pattern: str = None, file_format: str = None, **kwargs - ) -> Iterator[Tuple[DataPortalFile, Any]]: + ): """ - Read the contents of files across all datasets in the project that match - the given glob pattern. - - Iterates over every dataset in the project and yields matching files from - each one in turn. See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` - for full details on pattern matching and format options. + Read the contents of files from a specific dataset in the project. - Uses standard glob pattern matching (e.g., ``*.csv``, ``data/**/*.tsv.gz``). - ``*`` matches any sequence of characters within a single path segment; - ``**`` matches zero or more path segments. + The dataset can be identified by name, ID, or a + :class:`~cirro.sdk.dataset.DataPortalDataset` object. + See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` + for full details on ``glob``/``pattern`` matching and format options. Args: - pattern (str): Glob pattern used to match file paths within each dataset - (e.g., ``'*.csv'``, ``'counts/**/*.tsv.gz'``) - file_format (str): File format used to parse each file. Supported values: - - - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` - - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) - - ``'text'``: read as plain text, returns a ``str`` - - ``None`` (default): infer from file extension - (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, otherwise ``'text'``) - **kwargs: Additional keyword arguments forwarded to the file-parsing function. - For ``'csv'`` format these are passed to :func:`pandas.read_csv` - (e.g., ``sep='\\t'`` for TSV files). - For ``'text'`` format these are passed to - :meth:`~cirro.sdk.file.DataPortalFile.read`. + dataset (str | DataPortalDataset): Dataset to read files from, + identified by name, ID, or object. + glob (str): Wildcard expression to match files. + Yields one item per matching file: the parsed content. + pattern (str): Wildcard expression with ``{name}`` capture + placeholders. Yields ``(content, captures)`` per matching file. + file_format (str): File format used to parse each file + (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, + ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, + or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. Yields: - Tuple[DataPortalFile, Any]: ``(file, content)`` for each matching file - across all datasets, where *content* type depends on *file_format*. + - When using ``glob``: *content* for each matching file + - When using ``pattern``: ``(content, captures)`` for each + matching file Example: ```python - # Read all CSV files across every dataset in a project - for file, df in project.read_files('*.csv'): - print(file.relative_path, df.shape) - - # Read gzip-compressed TSV files with explicit separator - for file, df in project.read_files('**/*.tsv.gz', file_format='csv', sep='\\t'): - print(file.relative_path, df.shape) + # Read all CSV files from a dataset identified by name + for df in project.read_files('My Dataset', glob='*.csv'): + print(df.shape) + + # Extract sample names using pattern captures + for df, captures in project.read_files( + 'My Dataset', pattern='{sample}.csv' + ): + print(captures['sample'], df.shape) ``` """ - for dataset in self.list_datasets(): - yield from dataset.read_files(pattern, file_format=file_format, **kwargs) + if isinstance(dataset, DataPortalDataset): + ds = dataset + else: + # Try by ID first, fall back to name + try: + ds = self.get_dataset_by_id(dataset) + except (DataPortalAssetNotFound, Exception): + ds = self.get_dataset_by_name(dataset) + yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs) class DataPortalProjects(DataPortalAssets[DataPortalProject]): From 4cf45aa5fde986198f0f834b0aeeb1701760382b Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 11:54:03 -0700 Subject: [PATCH 06/24] Fix flake8 --- cirro/sdk/dataset.py | 2 +- cirro/sdk/project.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 81dfd906..ef2c1707 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -1,7 +1,7 @@ import datetime import re from pathlib import Path -from typing import Union, List, Optional, Iterator, Tuple, Any, Dict +from typing import Union, List, Optional, Any from cirro_api_client.v1.api.processes import validate_file_requirements from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \ diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index 48a1e722..4c3bad43 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -9,7 +9,6 @@ from cirro.sdk.asset import DataPortalAssets, DataPortalAsset from cirro.sdk.dataset import DataPortalDataset, DataPortalDatasets from cirro.sdk.exceptions import DataPortalAssetNotFound, DataPortalInputError -from cirro.sdk.file import DataPortalFile from cirro.sdk.helpers import parse_process_name_or_id from cirro.sdk.process import DataPortalProcess from cirro.sdk.reference import DataPortalReference, DataPortalReferences From 29e0c4276176736b22928e5ab4592369af5b7392 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 13:23:00 -0700 Subject: [PATCH 07/24] Get dataset by name or id --- cirro/sdk/project.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index 4c3bad43..dc29316a 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -89,6 +89,31 @@ def list_datasets(self, force_refresh=False) -> DataPortalDatasets: ] ) + def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset: + """Return the dataset matching the given ID or name. + + Tries to match by ID first, then by name. + Raises an error if the name matches multiple datasets. + """ + if force_refresh: + self._get_datasets.cache_clear() + + # Try by ID first + try: + return self.get_dataset_by_id(name_or_id) + except (DataPortalAssetNotFound, Exception): + pass + + # Fall back to name matching + matches = [d for d in self._get_datasets() if d.name == name_or_id] + if len(matches) == 0: + raise DataPortalAssetNotFound(f'Dataset with name or ID "{name_or_id}" not found') + if len(matches) > 1: + raise DataPortalInputError( + f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead' + ) + return self.get_dataset_by_id(matches[0].id) + def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset: """Return the dataset with the specified name.""" if force_refresh: From 7b59277a08ef009b3dea6528750792fdafc26ebe Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 13:24:25 -0700 Subject: [PATCH 08/24] Add singular read_file function --- cirro/sdk/dataset.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ cirro/sdk/project.py | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index ef2c1707..05f74959 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -385,6 +385,53 @@ def read_files( if m is not None: yield _read_file_with_format(file, file_format, **kwargs), m.groupdict() + def read_file( + self, + path: str = None, + glob: str = None, + file_format: str = None, + **kwargs + ) -> Any: + """ + Read the contents of a single file from the dataset. + + Provide either ``path`` (exact relative path) or ``glob`` (wildcard + expression). If ``glob`` is used it must match exactly one file. + + Args: + path (str): Exact relative path of the file within the dataset. + glob (str): Wildcard expression to match a single file. + file_format (str): File format used to parse the file. Supported values + are the same as :meth:`read_files`. + **kwargs: Additional keyword arguments forwarded to the file-parsing + function. + + Returns: + Parsed file content. + + Raises: + DataPortalInputError: if both or neither of ``path``/``glob`` are + provided, or if ``glob`` matches zero or more than one file. + """ + if path is not None and glob is not None: + raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other") + if path is None and glob is None: + raise DataPortalInputError("Must specify either 'path' or 'glob'") + + if path is not None: + file = self.get_file(path) + else: + matches = list(filter_files_by_pattern(list(self.list_files()), glob)) + if len(matches) == 0: + raise DataPortalAssetNotFound(f"No files matched glob '{glob}'") + if len(matches) > 1: + raise DataPortalInputError( + f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files" + ) + file = matches[0] + + return _read_file_with_format(file, file_format, **kwargs) + def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ Get the artifact of a particular type from the dataset diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index dc29316a..00d52d9a 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -318,6 +318,46 @@ def read_files( ds = self.get_dataset_by_name(dataset) yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs) + def read_file( + self, + dataset: Union[str, DataPortalDataset], + path: str = None, + glob: str = None, + file_format: str = None, + **kwargs + ): + """ + Read the contents of a single file from a specific dataset in the project. + + The dataset can be identified by name, ID, or a + :class:`~cirro.sdk.dataset.DataPortalDataset` object. + Provide either ``path`` (exact relative path) or ``glob`` (wildcard + expression). If ``glob`` is used it must match exactly one file. + + Args: + dataset (str | DataPortalDataset): Dataset to read the file from, + identified by name, ID, or object. + path (str): Exact relative path of the file within the dataset. + glob (str): Wildcard expression matching exactly one file. + file_format (str): File format used to parse the file + (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, + ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, + or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. + + Returns: + Parsed file content. + """ + if isinstance(dataset, DataPortalDataset): + ds = dataset + else: + try: + ds = self.get_dataset_by_id(dataset) + except (DataPortalAssetNotFound, Exception): + ds = self.get_dataset_by_name(dataset) + return ds.read_file(path=path, glob=glob, file_format=file_format, **kwargs) + class DataPortalProjects(DataPortalAssets[DataPortalProject]): """Collection of DataPortalProject objects""" From 52ee650a8e4e44e097cccf851f6f7bb7dec16893 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 13:40:21 -0700 Subject: [PATCH 09/24] Increment version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6224e9f3..50ea289f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cirro" -version = "1.10.2" +version = "1.10.3" description = "CLI tool and SDK for interacting with the Cirro platform" authors = ["Cirro Bio "] license = "MIT" From 75e4e6abc9418d29f909afdaf06c25788049bc4b Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:03:45 -0700 Subject: [PATCH 10/24] Bugfixes --- cirro/sdk/asset.py | 2 +- cirro/sdk/dataset.py | 9 +++++---- cirro/sdk/file.py | 4 ++-- cirro/sdk/process.py | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cirro/sdk/asset.py b/cirro/sdk/asset.py index ce1eea00..082200fe 100644 --- a/cirro/sdk/asset.py +++ b/cirro/sdk/asset.py @@ -60,7 +60,7 @@ def get_by_name(self, name: str) -> T: # Error if multiple projects are found msg = f"Multiple {self.asset_name} items found with name '{name}', use ID instead.\n{self.description()}" if len(matching_queries) > 1: - raise DataPortalAssetNotFound(msg) + raise DataPortalInputError(msg) return matching_queries[0] diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 05f74959..fe8ff48a 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -290,7 +290,7 @@ def read_files( self, glob: str = None, pattern: str = None, - file_format: str = None, + format: str = None, **kwargs ): """ @@ -320,7 +320,7 @@ def read_files( placeholders (e.g., ``'{sample}.csv'``, ``'{condition}/{sample}.csv'``). Yields ``(content, captures)`` per matching file. - file_format (str): File format used to parse each file. Supported values: + format (str): File format used to parse each file. Supported values: - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) @@ -366,7 +366,7 @@ def read_files( print(captures['condition'], captures['sample'], df.shape) # Read gzip-compressed TSV files with explicit separator - for df in dataset.read_files(glob='**/*.tsv.gz', file_format='csv', sep='\\t'): + for df in dataset.read_files(glob='**/*.tsv.gz', format='csv', sep='\\t'): print(df.shape) ``` """ @@ -514,6 +514,7 @@ def run_analysis( process = parse_process_name_or_id(process, self._client) if compute_environment: + compute_environment_name = compute_environment compute_environments = self._client.compute_environments.list_environments_for_project( project_id=self.project_id ) @@ -523,7 +524,7 @@ def run_analysis( None ) if compute_environment is None: - raise DataPortalInputError(f"Compute environment '{compute_environment}' not found") + raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found") resp = self._client.execution.run_analysis( project_id=self.project_id, diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py index f43bd22e..b6c2e1bb 100644 --- a/cirro/sdk/file.py +++ b/cirro/sdk/file.py @@ -111,7 +111,7 @@ def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFram elif self.relative_path.endswith('.bz2'): compression = dict(method='bz2') elif self.relative_path.endswith('.xz'): - compression = dict(method='zstd') + compression = dict(method='xz') elif self.relative_path.endswith('.zst'): compression = dict(method='zstd') else: @@ -280,5 +280,5 @@ def download(self, download_location: str = None) -> List[Path]: local_paths = [] for f in self: - local_paths += f.download(download_location) + local_paths.append(f.download(download_location)) return local_paths diff --git a/cirro/sdk/process.py b/cirro/sdk/process.py index 282924fa..8f4cff9d 100644 --- a/cirro/sdk/process.py +++ b/cirro/sdk/process.py @@ -147,6 +147,7 @@ def run_analysis( ] if compute_environment: + compute_environment_name = compute_environment compute_environments = self._client.compute_environments.list_environments_for_project( project_id=project_id ) @@ -156,7 +157,7 @@ def run_analysis( None ) if compute_environment is None: - raise DataPortalInputError(f"Compute environment '{compute_environment}' not found") + raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found") resp = self._client.execution.run_analysis( project_id=project_id, From 30abda98559fabfb74160a55b131021a91ae12d4 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:04:52 -0700 Subject: [PATCH 11/24] Move from project to portal --- cirro/sdk/portal.py | 83 +++++++++++++++++++++++++++++++++++-- cirro/sdk/project.py | 98 -------------------------------------------- 2 files changed, 79 insertions(+), 102 deletions(-) diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index ebd5fd96..36f2afdc 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -100,10 +100,85 @@ def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDat except DataPortalAssetNotFound: project: DataPortalProject = self.get_project_by_name(project) - try: - return project.get_dataset_by_id(dataset) - except DataPortalAssetNotFound: - return project.get_dataset_by_name(dataset) + return project.get_dataset(dataset) + + def read_files( + self, + project: str, + dataset: str, + glob: str = None, + pattern: str = None, + format: str = None, + **kwargs + ): + """ + Read the contents of files from a dataset. + + The project and dataset can each be identified by name or ID. + See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` + for full details on ``glob``/``pattern`` matching and format options. + + Args: + project (str): ID or name of the project. + dataset (str): ID or name of the dataset. + glob (str): Wildcard expression to match files. + Yields one item per matching file: the parsed content. + pattern (str): Wildcard expression with ``{name}`` capture + placeholders. Yields ``(content, captures)`` per matching file. + format (str): File format used to parse each file + (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, + ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, + or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. + + Yields: + - When using ``glob``: *content* for each matching file + - When using ``pattern``: ``(content, captures)`` for each + matching file + + Example: + ```python + for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'): + print(df.shape) + ``` + """ + ds = self.get_dataset(project=project, dataset=dataset) + yield from ds.read_files(glob=glob, pattern=pattern, format=format, **kwargs) + + def read_file( + self, + project: str, + dataset: str, + path: str = None, + glob: str = None, + format: str = None, + **kwargs + ): + """ + Read the contents of a single file from a dataset. + + The project and dataset can each be identified by name or ID. + Provide either ``path`` (exact relative path) or ``glob`` (wildcard + expression). If ``glob`` is used it must match exactly one file. + + Args: + project (str): ID or name of the project. + dataset (str): ID or name of the dataset. + path (str): Exact relative path of the file within the dataset. + glob (str): Wildcard expression matching exactly one file. + format (str): File format used to parse the file + (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, + ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, + or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. + + Returns: + Parsed file content. + """ + ds = self.get_dataset(project=project, dataset=dataset) + return ds.read_file(path=path, glob=glob, format=format, **kwargs) def list_processes(self, ingest=False) -> DataPortalProcesses: """ diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index 00d52d9a..b224a15f 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -260,104 +260,6 @@ def samples(self, max_items: int = 10000) -> List[Sample]: """ return self._client.metadata.get_project_samples(self.id, max_items) - def read_files( - self, - dataset: Union[str, DataPortalDataset], - glob: str = None, - pattern: str = None, - file_format: str = None, - **kwargs - ): - """ - Read the contents of files from a specific dataset in the project. - - The dataset can be identified by name, ID, or a - :class:`~cirro.sdk.dataset.DataPortalDataset` object. - See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` - for full details on ``glob``/``pattern`` matching and format options. - - Args: - dataset (str | DataPortalDataset): Dataset to read files from, - identified by name, ID, or object. - glob (str): Wildcard expression to match files. - Yields one item per matching file: the parsed content. - pattern (str): Wildcard expression with ``{name}`` capture - placeholders. Yields ``(content, captures)`` per matching file. - file_format (str): File format used to parse each file - (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, - ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, - or ``None`` to infer from extension). - **kwargs: Additional keyword arguments forwarded to the - file-parsing function. - - Yields: - - When using ``glob``: *content* for each matching file - - When using ``pattern``: ``(content, captures)`` for each - matching file - - Example: - ```python - # Read all CSV files from a dataset identified by name - for df in project.read_files('My Dataset', glob='*.csv'): - print(df.shape) - - # Extract sample names using pattern captures - for df, captures in project.read_files( - 'My Dataset', pattern='{sample}.csv' - ): - print(captures['sample'], df.shape) - ``` - """ - if isinstance(dataset, DataPortalDataset): - ds = dataset - else: - # Try by ID first, fall back to name - try: - ds = self.get_dataset_by_id(dataset) - except (DataPortalAssetNotFound, Exception): - ds = self.get_dataset_by_name(dataset) - yield from ds.read_files(glob=glob, pattern=pattern, file_format=file_format, **kwargs) - - def read_file( - self, - dataset: Union[str, DataPortalDataset], - path: str = None, - glob: str = None, - file_format: str = None, - **kwargs - ): - """ - Read the contents of a single file from a specific dataset in the project. - - The dataset can be identified by name, ID, or a - :class:`~cirro.sdk.dataset.DataPortalDataset` object. - Provide either ``path`` (exact relative path) or ``glob`` (wildcard - expression). If ``glob`` is used it must match exactly one file. - - Args: - dataset (str | DataPortalDataset): Dataset to read the file from, - identified by name, ID, or object. - path (str): Exact relative path of the file within the dataset. - glob (str): Wildcard expression matching exactly one file. - file_format (str): File format used to parse the file - (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, - ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, - or ``None`` to infer from extension). - **kwargs: Additional keyword arguments forwarded to the - file-parsing function. - - Returns: - Parsed file content. - """ - if isinstance(dataset, DataPortalDataset): - ds = dataset - else: - try: - ds = self.get_dataset_by_id(dataset) - except (DataPortalAssetNotFound, Exception): - ds = self.get_dataset_by_name(dataset) - return ds.read_file(path=path, glob=glob, file_format=file_format, **kwargs) - class DataPortalProjects(DataPortalAssets[DataPortalProject]): """Collection of DataPortalProject objects""" From 05c78b451d166d89fa549b214ee331fd40c624fb Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:05:03 -0700 Subject: [PATCH 12/24] Change file_format to format --- cirro/sdk/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index fe8ff48a..2587c8f2 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -377,19 +377,19 @@ def read_files( if glob is not None: for file in filter_files_by_pattern(list(self.list_files()), glob): - yield _read_file_with_format(file, file_format, **kwargs) + yield _read_file_with_format(file, format, **kwargs) else: compiled_regex, _ = _pattern_to_captures_regex(pattern) for file in self.list_files(): m = compiled_regex.match(file.relative_path) if m is not None: - yield _read_file_with_format(file, file_format, **kwargs), m.groupdict() + yield _read_file_with_format(file, format, **kwargs), m.groupdict() def read_file( self, path: str = None, glob: str = None, - file_format: str = None, + format: str = None, **kwargs ) -> Any: """ @@ -401,7 +401,7 @@ def read_file( Args: path (str): Exact relative path of the file within the dataset. glob (str): Wildcard expression to match a single file. - file_format (str): File format used to parse the file. Supported values + format (str): File format used to parse the file. Supported values are the same as :meth:`read_files`. **kwargs: Additional keyword arguments forwarded to the file-parsing function. @@ -430,7 +430,7 @@ def read_file( ) file = matches[0] - return _read_file_with_format(file, file_format, **kwargs) + return _read_file_with_format(file, format, **kwargs) def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ From 84c36bac9f88c91b2fba352c035b3a912a90bba4 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:45:24 -0700 Subject: [PATCH 13/24] Clean up --- cirro/sdk/dataset.py | 2 +- cirro/sdk/file.py | 2 +- cirro/sdk/portal.py | 2 +- cirro/sdk/project.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 2587c8f2..e35c7138 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -118,7 +118,7 @@ def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi): Should be invoked from a top-level constructor, for example: ```python - from cirro import DataPortal() + from cirro import DataPortal portal = DataPortal() dataset = portal.get_dataset( project="id-or-name-of-project", diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py index b6c2e1bb..3c6850e2 100644 --- a/cirro/sdk/file.py +++ b/cirro/sdk/file.py @@ -27,7 +27,7 @@ def __init__(self, file: File, client: CirroApi): Instantiate by listing files from a dataset. ```python - from cirro import DataPortal() + from cirro import DataPortal portal = DataPortal() dataset = portal.get_dataset( project="id-or-name-of-project", diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index 36f2afdc..0a9a6852 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -28,7 +28,7 @@ def __init__(self, base_url: str = None, client: CirroApi = None): ```python from cirro import DataPortal - Portal = DataPortal(base_url="app.cirro.bio") + portal = DataPortal(base_url="app.cirro.bio") portal.list_projects() ``` """ diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py index b224a15f..89f58c91 100644 --- a/cirro/sdk/project.py +++ b/cirro/sdk/project.py @@ -101,7 +101,7 @@ def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset # Try by ID first try: return self.get_dataset_by_id(name_or_id) - except (DataPortalAssetNotFound, Exception): + except Exception: pass # Fall back to name matching From 96764c2b5e4eb6f4ecaac371763ad6c4a3daa772 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:45:42 -0700 Subject: [PATCH 14/24] Move the primary read_files docs to the DataPortal object --- cirro/sdk/dataset.py | 86 +++++++------------------------------------- cirro/sdk/portal.py | 82 +++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 90 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index e35c7138..404801a2 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -296,79 +296,22 @@ def read_files( """ Read the contents of files in the dataset. - Exactly one of ``glob`` or ``pattern`` must be provided. - - **glob** — standard wildcard matching; yields the file content for each - matching file: - - - ``*`` matches any characters within a single path segment - - ``**`` matches zero or more path segments - - Matching is suffix-anchored (``*.csv`` matches at any depth) - - **pattern** — like ``glob`` but ``{name}`` placeholders capture portions - of the path automatically; yields ``(content, captures)`` pairs where - *captures* is a ``dict`` of extracted values: - - - ``{name}`` captures one path segment (no ``/``) - - ``*`` and ``**`` wildcards work as in ``glob`` + See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details + on ``glob``/``pattern`` matching and format options. Args: - glob (str): Wildcard expression to match files - (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``). + glob (str): Wildcard expression to match files. Yields one item per matching file: the parsed content. pattern (str): Wildcard expression with ``{name}`` capture - placeholders (e.g., ``'{sample}.csv'``, - ``'{condition}/{sample}.csv'``). - Yields ``(content, captures)`` per matching file. - format (str): File format used to parse each file. Supported values: - - - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` - - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) - - ``'json'``: parse with :func:`json.loads`, returns a Python object - - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame`` - (requires ``pyarrow`` or ``fastparquet``) - - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame`` - (requires ``pyarrow``) - - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object - - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame`` - (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``) - - ``'text'``: read as plain text, returns a ``str`` - - ``None`` (default): infer from file extension - (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, - ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, - ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``, - ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``) - **kwargs: Additional keyword arguments forwarded to the file-parsing - function (e.g., ``sep='\\t'`` for CSV/TSV files). + placeholders. Yields ``(content, captures)`` per matching file. + format (str): File format used to parse each file + (or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. Yields: - When using ``glob``: *content* for each matching file - - When using ``pattern``: ``(content, captures)`` for each matching file, - where *captures* is a ``dict`` of values extracted from ``{name}`` - placeholders - - Raises: - DataPortalInputError: if both ``glob`` and ``pattern`` are provided, - or if neither is provided. - - Example: - ```python - # Read all CSV files — just the content - for df in dataset.read_files(glob='*.csv'): - print(df.shape) - - # Extract sample names from filenames automatically - for df, captures in dataset.read_files(pattern='{sample}.csv'): - print(captures['sample'], df.shape) - - # Multi-level capture: condition directory + sample filename - for df, captures in dataset.read_files(pattern='{condition}/{sample}.csv'): - print(captures['condition'], captures['sample'], df.shape) - - # Read gzip-compressed TSV files with explicit separator - for df in dataset.read_files(glob='**/*.tsv.gz', format='csv', sep='\\t'): - print(df.shape) - ``` + - When using ``pattern``: ``(content, captures)`` for each matching file """ if glob is not None and pattern is not None: raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other") @@ -395,23 +338,18 @@ def read_file( """ Read the contents of a single file from the dataset. - Provide either ``path`` (exact relative path) or ``glob`` (wildcard - expression). If ``glob`` is used it must match exactly one file. + See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details. Args: path (str): Exact relative path of the file within the dataset. - glob (str): Wildcard expression to match a single file. + glob (str): Wildcard expression matching exactly one file. format (str): File format used to parse the file. Supported values - are the same as :meth:`read_files`. + are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`. **kwargs: Additional keyword arguments forwarded to the file-parsing function. Returns: Parsed file content. - - Raises: - DataPortalInputError: if both or neither of ``path``/``glob`` are - provided, or if ``glob`` matches zero or more than one file. """ if path is not None and glob is not None: raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other") diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index 0a9a6852..4bb4e5a7 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -115,32 +115,80 @@ def read_files( Read the contents of files from a dataset. The project and dataset can each be identified by name or ID. - See :meth:`~cirro.sdk.dataset.DataPortalDataset.read_files` - for full details on ``glob``/``pattern`` matching and format options. + Exactly one of ``glob`` or ``pattern`` must be provided. + + **glob** — standard wildcard matching; yields the file content for each + matching file: + + - ``*`` matches any characters within a single path segment + - ``**`` matches zero or more path segments + - Matching is suffix-anchored (``*.csv`` matches at any depth) + + **pattern** — like ``glob`` but ``{name}`` placeholders capture portions + of the path automatically; yields ``(content, captures)`` pairs where + *captures* is a ``dict`` of extracted values: + + - ``{name}`` captures one path segment (no ``/``) + - ``*`` and ``**`` wildcards work as in ``glob`` Args: project (str): ID or name of the project. dataset (str): ID or name of the dataset. - glob (str): Wildcard expression to match files. + glob (str): Wildcard expression to match files + (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``). Yields one item per matching file: the parsed content. pattern (str): Wildcard expression with ``{name}`` capture - placeholders. Yields ``(content, captures)`` per matching file. - format (str): File format used to parse each file - (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, - ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, - or ``None`` to infer from extension). - **kwargs: Additional keyword arguments forwarded to the - file-parsing function. + placeholders (e.g., ``'{sample}.csv'``, + ``'{condition}/{sample}.csv'``). + Yields ``(content, captures)`` per matching file. + format (str): File format used to parse each file. Supported values: + + - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` + - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) + - ``'json'``: parse with :func:`json.loads`, returns a Python object + - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame`` + (requires ``pyarrow`` or ``fastparquet``) + - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame`` + (requires ``pyarrow``) + - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object + - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame`` + (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``) + - ``'text'``: read as plain text, returns a ``str`` + - ``None`` (default): infer from file extension + (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, + ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, + ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``, + ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``) + **kwargs: Additional keyword arguments forwarded to the file-parsing + function (e.g., ``sep='\\t'`` for CSV/TSV files). Yields: - When using ``glob``: *content* for each matching file - - When using ``pattern``: ``(content, captures)`` for each - matching file + - When using ``pattern``: ``(content, captures)`` for each matching file, + where *captures* is a ``dict`` of values extracted from ``{name}`` + placeholders + + Raises: + DataPortalInputError: if both ``glob`` and ``pattern`` are provided, + or if neither is provided. Example: ```python + # Read all CSV files — just the content for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'): print(df.shape) + + # Extract sample names from filenames automatically + for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'): + print(captures['sample'], df.shape) + + # Multi-level capture: condition directory + sample filename + for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'): + print(captures['condition'], captures['sample'], df.shape) + + # Read gzip-compressed TSV files with explicit separator + for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', format='csv', sep='\\t'): + print(df.shape) ``` """ ds = self.get_dataset(project=project, dataset=dataset) @@ -167,15 +215,17 @@ def read_file( dataset (str): ID or name of the dataset. path (str): Exact relative path of the file within the dataset. glob (str): Wildcard expression matching exactly one file. - format (str): File format used to parse the file - (``'csv'``, ``'h5ad'``, ``'json'``, ``'parquet'``, - ``'feather'``, ``'pickle'``, ``'excel'``, ``'text'``, - or ``None`` to infer from extension). + format (str): File format used to parse the file. Supported values + are the same as :meth:`read_files`. **kwargs: Additional keyword arguments forwarded to the file-parsing function. Returns: Parsed file content. + + Raises: + DataPortalInputError: if both or neither of ``path``/``glob`` are + provided, or if ``glob`` matches zero or more than one file. """ ds = self.get_dataset(project=project, dataset=dataset) return ds.read_file(path=path, glob=glob, format=format, **kwargs) From 595b0a2f408fab3f0f7e3d4f95c4e59c7c6614b6 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:52:06 -0700 Subject: [PATCH 15/24] format -> filetype --- cirro/sdk/dataset.py | 16 ++++++++-------- cirro/sdk/portal.py | 14 +++++++------- tests/test_read_files.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 404801a2..d825360b 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -290,21 +290,21 @@ def read_files( self, glob: str = None, pattern: str = None, - format: str = None, + filetype: str = None, **kwargs ): """ Read the contents of files in the dataset. See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details - on ``glob``/``pattern`` matching and format options. + on ``glob``/``pattern`` matching and filetype options. Args: glob (str): Wildcard expression to match files. Yields one item per matching file: the parsed content. pattern (str): Wildcard expression with ``{name}`` capture placeholders. Yields ``(content, captures)`` per matching file. - format (str): File format used to parse each file + filetype (str): File format used to parse each file (or ``None`` to infer from extension). **kwargs: Additional keyword arguments forwarded to the file-parsing function. @@ -320,19 +320,19 @@ def read_files( if glob is not None: for file in filter_files_by_pattern(list(self.list_files()), glob): - yield _read_file_with_format(file, format, **kwargs) + yield _read_file_with_format(file, filetype, **kwargs) else: compiled_regex, _ = _pattern_to_captures_regex(pattern) for file in self.list_files(): m = compiled_regex.match(file.relative_path) if m is not None: - yield _read_file_with_format(file, format, **kwargs), m.groupdict() + yield _read_file_with_format(file, filetype, **kwargs), m.groupdict() def read_file( self, path: str = None, glob: str = None, - format: str = None, + filetype: str = None, **kwargs ) -> Any: """ @@ -343,7 +343,7 @@ def read_file( Args: path (str): Exact relative path of the file within the dataset. glob (str): Wildcard expression matching exactly one file. - format (str): File format used to parse the file. Supported values + filetype (str): File format used to parse the file. Supported values are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`. **kwargs: Additional keyword arguments forwarded to the file-parsing function. @@ -368,7 +368,7 @@ def read_file( ) file = matches[0] - return _read_file_with_format(file, format, **kwargs) + return _read_file_with_format(file, filetype, **kwargs) def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index 4bb4e5a7..696e191f 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -108,7 +108,7 @@ def read_files( dataset: str, glob: str = None, pattern: str = None, - format: str = None, + filetype: str = None, **kwargs ): """ @@ -141,7 +141,7 @@ def read_files( placeholders (e.g., ``'{sample}.csv'``, ``'{condition}/{sample}.csv'``). Yields ``(content, captures)`` per matching file. - format (str): File format used to parse each file. Supported values: + filetype (str): File format used to parse each file. Supported values: - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` - ``'h5ad'``: parse as AnnData (requires ``anndata`` package) @@ -187,12 +187,12 @@ def read_files( print(captures['condition'], captures['sample'], df.shape) # Read gzip-compressed TSV files with explicit separator - for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', format='csv', sep='\\t'): + for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'): print(df.shape) ``` """ ds = self.get_dataset(project=project, dataset=dataset) - yield from ds.read_files(glob=glob, pattern=pattern, format=format, **kwargs) + yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs) def read_file( self, @@ -200,7 +200,7 @@ def read_file( dataset: str, path: str = None, glob: str = None, - format: str = None, + filetype: str = None, **kwargs ): """ @@ -215,7 +215,7 @@ def read_file( dataset (str): ID or name of the dataset. path (str): Exact relative path of the file within the dataset. glob (str): Wildcard expression matching exactly one file. - format (str): File format used to parse the file. Supported values + filetype (str): File format used to parse the file. Supported values are the same as :meth:`read_files`. **kwargs: Additional keyword arguments forwarded to the file-parsing function. @@ -228,7 +228,7 @@ def read_file( provided, or if ``glob`` matches zero or more than one file. """ ds = self.get_dataset(project=project, dataset=dataset) - return ds.read_file(path=path, glob=glob, format=format, **kwargs) + return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs) def list_processes(self, ingest=False) -> DataPortalProcesses: """ diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 9f74ae88..98abe17c 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -221,13 +221,13 @@ def test_glob_no_match_returns_empty(self): self.assertEqual(len(results), 0) def test_glob_explicit_format_csv(self): - results = list(self.dataset.read_files(glob='data/*.tsv', file_format='csv', sep='\t')) + results = list(self.dataset.read_files(glob='data/*.tsv', filetype='csv', sep='\t')) self.assertEqual(len(results), 1) self.assertIsInstance(results[0], pd.DataFrame) self.assertIn('gene', results[0].columns) def test_glob_explicit_format_text(self): - results = list(self.dataset.read_files(glob='logs/*.log', file_format='text')) + results = list(self.dataset.read_files(glob='logs/*.log', filetype='text')) self.assertEqual(len(results), 1) self.assertIsInstance(results[0], str) self.assertIn('started', results[0]) From 5be899843a718e0467068c02a9f7730ba79e9f0f Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:56:49 -0700 Subject: [PATCH 16/24] captures -> meta --- cirro/sdk/dataset.py | 4 ++-- cirro/sdk/portal.py | 18 +++++++++--------- tests/test_read_files.py | 20 ++++++++++---------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index d825360b..ab2fa74a 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -303,7 +303,7 @@ def read_files( glob (str): Wildcard expression to match files. Yields one item per matching file: the parsed content. pattern (str): Wildcard expression with ``{name}`` capture - placeholders. Yields ``(content, captures)`` per matching file. + placeholders. Yields ``(content, meta)`` per matching file. filetype (str): File format used to parse each file (or ``None`` to infer from extension). **kwargs: Additional keyword arguments forwarded to the @@ -311,7 +311,7 @@ def read_files( Yields: - When using ``glob``: *content* for each matching file - - When using ``pattern``: ``(content, captures)`` for each matching file + - When using ``pattern``: ``(content, meta)`` for each matching file """ if glob is not None and pattern is not None: raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other") diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index 696e191f..d811b94b 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -125,8 +125,8 @@ def read_files( - Matching is suffix-anchored (``*.csv`` matches at any depth) **pattern** — like ``glob`` but ``{name}`` placeholders capture portions - of the path automatically; yields ``(content, captures)`` pairs where - *captures* is a ``dict`` of extracted values: + of the path automatically; yields ``(content, meta)`` pairs where + *meta* is a ``dict`` of extracted values: - ``{name}`` captures one path segment (no ``/``) - ``*`` and ``**`` wildcards work as in ``glob`` @@ -140,7 +140,7 @@ def read_files( pattern (str): Wildcard expression with ``{name}`` capture placeholders (e.g., ``'{sample}.csv'``, ``'{condition}/{sample}.csv'``). - Yields ``(content, captures)`` per matching file. + Yields ``(content, meta)`` per matching file. filetype (str): File format used to parse each file. Supported values: - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame`` @@ -164,8 +164,8 @@ def read_files( Yields: - When using ``glob``: *content* for each matching file - - When using ``pattern``: ``(content, captures)`` for each matching file, - where *captures* is a ``dict`` of values extracted from ``{name}`` + - When using ``pattern``: ``(content, meta)`` for each matching file, + where *meta* is a ``dict`` of values extracted from ``{name}`` placeholders Raises: @@ -179,12 +179,12 @@ def read_files( print(df.shape) # Extract sample names from filenames automatically - for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'): - print(captures['sample'], df.shape) + for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'): + print(meta['sample'], df.shape) # Multi-level capture: condition directory + sample filename - for df, captures in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'): - print(captures['condition'], captures['sample'], df.shape) + for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'): + print(meta['condition'], meta['sample'], df.shape) # Read gzip-compressed TSV files with explicit separator for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'): diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 98abe17c..bbad2229 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -250,15 +250,15 @@ def test_globstar_pattern(self): def test_pattern_simple_filename(self): results = list(self.dataset.read_files(pattern='{sample}.csv')) self.assertEqual(len(results), 1) - content, captures = results[0] + content, meta = results[0] self.assertIsInstance(content, pd.DataFrame) - self.assertEqual(captures['sample'], 'results') + self.assertEqual(meta['sample'], 'results') def test_pattern_with_directory(self): results = list(self.dataset.read_files(pattern='data/{sample}.csv')) self.assertEqual(len(results), 1) - _, captures = results[0] - self.assertEqual(captures['sample'], 'results') + _, meta = results[0] + self.assertEqual(meta['sample'], 'results') def test_pattern_multiple_files(self): dataset = _make_dataset_with_files([ @@ -268,7 +268,7 @@ def test_pattern_multiple_files(self): ]) results = list(dataset.read_files(pattern='{sample}.csv')) self.assertEqual(len(results), 2) - captured = {c['sample'] for _, c in results} + captured = {m['sample'] for _, m in results} self.assertSetEqual(captured, {'sampleA', 'sampleB'}) def test_pattern_multi_level(self): @@ -278,7 +278,7 @@ def test_pattern_multi_level(self): ]) results = list(dataset.read_files(pattern='{condition}/{sample}.csv')) self.assertEqual(len(results), 2) - by_sample = {c['sample']: c['condition'] for _, c in results} + by_sample = {m['sample']: m['condition'] for _, m in results} self.assertEqual(by_sample['sampleA'], 'treated') self.assertEqual(by_sample['sampleB'], 'control') @@ -286,11 +286,11 @@ def test_pattern_no_match_returns_empty(self): results = list(self.dataset.read_files(pattern='{sample}.parquet')) self.assertEqual(len(results), 0) - def test_pattern_yields_content_and_captures_tuple(self): + def test_pattern_yields_content_and_meta_tuple(self): results = list(self.dataset.read_files(pattern='{sample}.csv')) - content, captures = results[0] - self.assertIsInstance(captures, dict) - self.assertIn('sample', captures) + content, meta = results[0] + self.assertIsInstance(meta, dict) + self.assertIn('sample', meta) # --- error cases --- From adf881491a3c8e189197eada2c84f6fbda34b6e3 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 14:57:03 -0700 Subject: [PATCH 17/24] Update README.md --- README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/README.md b/README.md index ea78bb95..f5ee0bb9 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,49 @@ See the following set of Jupyter notebooks that contain examples on the followin | [Using references](samples/Using_references.ipynb) | Managing reference data | | [Advanced usage](samples/Advanced_usage.ipynb) | Advanced operations | +### Reading files + +The `read_file` and `read_files` methods provide a convenient way to read dataset files directly into Python objects. The file format is inferred from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified explicitly. + +```python +from cirro import DataPortal + +# If not logged in, this will prompt with a login URL +portal = DataPortal() + +# Read a single file from the indicated dataset +df = portal.read_file(project="My Project", dataset="My Dataset", glob="**/results.csv") + +# Iterate over each of the files ending in .csv within a dataset +for df in portal.read_files(project="My Project", dataset="My Dataset", glob="*.csv"): + print(df.shape) + +``` + +You can also call these methods on the `DataPortalDataset` object: + +```python +# Get an object representing a single dataset +dataset = portal.get_dataset(project="My Project", dataset="My Dataset") + +# Read a single file by exact path or glob pattern +df = dataset.read_file(path="data/results.csv") +df = dataset.read_file(glob="**/results.csv") + +# Read multiple files matching a pattern — yields one result per file +for df in dataset.read_files(glob="**/*.csv"): + print(df.shape) + +# Extract values from the path using {name} capture placeholders +for df, meta in dataset.read_files(pattern="{sample}/results.csv"): + print(meta["sample"], df.shape) + +# Extra keyword arguments are forwarded to the file-parsing function +for df in dataset.read_files(glob="**/*.tsv.gz", filetype="csv", sep="\t"): + print(df.shape) +``` + + ## R Usage | Jupyter Notebook | Topic | From e51ba84af23f591586de5457973d7adc9f8842e7 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 15:00:12 -0700 Subject: [PATCH 18/24] Add tests --- tests/test_read_files.py | 75 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/tests/test_read_files.py b/tests/test_read_files.py index bbad2229..39974960 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -292,6 +292,51 @@ def test_pattern_yields_content_and_meta_tuple(self): self.assertIsInstance(meta, dict) self.assertIn('sample', meta) + # --- special characters in filenames --- + + def test_glob_matches_filename_with_spaces(self): + dataset = _make_dataset_with_files([ + _make_mock_file('data/my sample.csv', b'a\n1\n'), + ]) + results = list(dataset.read_files(glob='*.csv')) + self.assertEqual(len(results), 1) + + def test_glob_matches_filename_with_hyphens_and_parens(self): + dataset = _make_dataset_with_files([ + _make_mock_file('data/sample-A (1).csv', b'a\n1\n'), + ]) + results = list(dataset.read_files(glob='*.csv')) + self.assertEqual(len(results), 1) + + def test_pattern_captures_filename_with_spaces(self): + dataset = _make_dataset_with_files([ + _make_mock_file('my sample.csv', b'a\n1\n'), + ]) + results = list(dataset.read_files(pattern='{sample}.csv')) + self.assertEqual(len(results), 1) + _, meta = results[0] + self.assertEqual(meta['sample'], 'my sample') + + def test_pattern_captures_directory_with_spaces(self): + dataset = _make_dataset_with_files([ + _make_mock_file('treated group/sampleA.csv', b'a\n1\n'), + _make_mock_file('control group/sampleB.csv', b'a\n2\n'), + ]) + results = list(dataset.read_files(pattern='{condition}/{sample}.csv')) + self.assertEqual(len(results), 2) + by_sample = {m['sample']: m['condition'] for _, m in results} + self.assertEqual(by_sample['sampleA'], 'treated group') + self.assertEqual(by_sample['sampleB'], 'control group') + + def test_pattern_captures_special_chars(self): + dataset = _make_dataset_with_files([ + _make_mock_file('sample-A_v2 (1).csv', b'a\n1\n'), + ]) + results = list(dataset.read_files(pattern='{sample}.csv')) + self.assertEqual(len(results), 1) + _, meta = results[0] + self.assertEqual(meta['sample'], 'sample-A_v2 (1)') + # --- error cases --- def test_both_glob_and_pattern_raises(self): @@ -305,7 +350,7 @@ def test_neither_glob_nor_pattern_raises(self): class TestPatternToRegex(unittest.TestCase): def _match(self, pattern, path): - compiled, names = _pattern_to_captures_regex(pattern) + compiled, _ = _pattern_to_captures_regex(pattern) m = compiled.match(path) return m.groupdict() if m else None @@ -336,3 +381,31 @@ def test_wildcard_mixed_with_capture(self): def test_capture_names_returned(self): _, names = _pattern_to_captures_regex('{condition}/{sample}.csv') self.assertListEqual(names, ['condition', 'sample']) + + def test_capture_with_spaces(self): + result = self._match('{sample}.csv', 'my sample.csv') + self.assertEqual(result, {'sample': 'my sample'}) + + def test_capture_with_spaces_in_directory(self): + result = self._match('{condition}/{sample}.csv', 'treated group/my sample.csv') + self.assertEqual(result, {'condition': 'treated group', 'sample': 'my sample'}) + + def test_capture_with_hyphens_and_underscores(self): + result = self._match('{sample}.csv', 'sample-A_v2.csv') + self.assertEqual(result, {'sample': 'sample-A_v2'}) + + def test_capture_with_parentheses(self): + result = self._match('{sample}.csv', 'sample (1).csv') + self.assertEqual(result, {'sample': 'sample (1)'}) + + def test_capture_with_dots_in_name(self): + result = self._match('{sample}.csv', 'sample.v2.csv') + self.assertEqual(result, {'sample': 'sample.v2'}) + + def test_wildcard_matches_spaces(self): + compiled, _ = _pattern_to_captures_regex('data/*.csv') + self.assertIsNotNone(compiled.match('data/my file.csv')) + + def test_globstar_matches_spaces_across_segments(self): + compiled, _ = _pattern_to_captures_regex('**/*.csv') + self.assertIsNotNone(compiled.match('some dir/sub dir/my file.csv')) From 21550d43370d7a3fc668753c8d2ea0879721af19 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 15:02:56 -0700 Subject: [PATCH 19/24] Read file(s) as bytes --- cirro/sdk/dataset.py | 4 +++- cirro/sdk/portal.py | 1 + tests/test_read_files.py | 8 +++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index ab2fa74a..6873d1d3 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -97,10 +97,12 @@ def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **k return file.read_excel(**kwargs) elif file_format == 'text': return file.read(**kwargs) + elif file_format == 'bytes': + return file._get() else: raise DataPortalInputError( f"Unsupported file_format: '{file_format}'. " - f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text'" + f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'" ) diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py index d811b94b..7f4727c4 100644 --- a/cirro/sdk/portal.py +++ b/cirro/sdk/portal.py @@ -154,6 +154,7 @@ def read_files( - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame`` (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``) - ``'text'``: read as plain text, returns a ``str`` + - ``'bytes'``: read as raw bytes, returns ``bytes`` - ``None`` (default): infer from file extension (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``, ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``, diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 39974960..8bb56cc6 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -110,6 +110,12 @@ def test_auto_infer_text(self): result = _read_file_with_format(file, None) self.assertIsInstance(result, str) + def test_bytes_format(self): + file = _make_mock_file('data/blob.bin', b'\x00\x01\x02\x03') + result = _read_file_with_format(file, 'bytes') + self.assertIsInstance(result, bytes) + self.assertEqual(result, b'\x00\x01\x02\x03') + def test_unsupported_format_raises(self): with self.assertRaises(DataPortalInputError): _read_file_with_format(self.file, 'xyz_unknown') @@ -288,7 +294,7 @@ def test_pattern_no_match_returns_empty(self): def test_pattern_yields_content_and_meta_tuple(self): results = list(self.dataset.read_files(pattern='{sample}.csv')) - content, meta = results[0] + _, meta = results[0] self.assertIsInstance(meta, dict) self.assertIn('sample', meta) From 9847e4d8c467c874cdeb4ae50351079729731067 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 15:41:47 -0700 Subject: [PATCH 20/24] Update example for running analysis --- samples/Analyzing_a_dataset.ipynb | 214 ++++++++++++++++++++++++------ 1 file changed, 173 insertions(+), 41 deletions(-) diff --git a/samples/Analyzing_a_dataset.ipynb b/samples/Analyzing_a_dataset.ipynb index 1b7d0c62..3cd23ca8 100644 --- a/samples/Analyzing_a_dataset.ipynb +++ b/samples/Analyzing_a_dataset.ipynb @@ -21,14 +21,119 @@ }, "outputs": [], "source": [ + "# Import the library used to interact with Cirro\n", "from cirro import DataPortal\n", "\n", + "# Create a connection to Cirro with your identity\n", "portal = DataPortal()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 1 - run analysis using the same set of parameters used previously" + ] + }, { "cell_type": "code", "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'Test dataset for variant calling' contains 2 files\n" + ] + } + ], + "source": [ + "# New dataset with FASTQs\n", + "input_dataset = portal.get_dataset(\n", + " project=\"Pipeline Development\",\n", + " dataset=\"Test dataset for variant calling\"\n", + ")\n", + "print(f\"Dataset '{input_dataset.name}' contains {len(input_dataset.list_files()):,} files\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using the 'Align Reads (nf-core/sarek)' process (ID: process-nf-core-sarek-align-3-2)\n" + ] + } + ], + "source": [ + "# Get the process to run on the dataset\n", + "process = portal.get_process_by_name('Align Reads (nf-core/sarek)')\n", + "print(f\"Using the '{process.name}' process (ID: {process.id})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using parameters from Genomic variant calling - parameter validation\n", + "{'WORKFLOW_VERSION': '3.2.3', 'analysis_type': {'genome': 'GATK.GRCh38', 'wes': True, 'analysis_type': 'Germline Variant Calling', 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed', 'tools': ['strelka', 'haplotypecaller']}, 'annotation': {'annotation_tool': []}, 'read_trimming_options': {'trim_fastq': False}}\n" + ] + } + ], + "source": [ + "# Previous dataset created by the pipeline\n", + "previous_run = portal.get_dataset(\n", + " project=\"Pipeline Development\",\n", + " dataset=\"Genomic variant calling - parameter validation\"\n", + ")\n", + "print(f\"Using parameters from {previous_run.name}\")\n", + "print(previous_run.params)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started new analysis: ID f7ca7e1b-d64c-4747-b647-0e984db87aa5\n" + ] + } + ], + "source": [ + "# Start a new run, using the parameters from the previous run\n", + "new_dataset_id = input_dataset.run_analysis(\n", + " name=\"Genomic variant calling - new run\",\n", + " description='Test from SDK',\n", + " process=process,\n", + " params=previous_run.params\n", + ")\n", + "print(f\"Started new analysis: ID {new_dataset_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 2: Build parameters from scratch" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" @@ -39,24 +144,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "Project 'Test Project' contains 104 datasets\n", - "Dataset 'Test dataset for variant calling' contains 2 files\n", - "Using the 'Variant Calling (nf-core/sarek)' process (ID: process-nf-core-sarek-3-0-1)\n" + "Project 'Pipeline Development' contains 709 datasets\n" ] } ], "source": [ "# Get the project by name\n", - "project = portal.get_project_by_name('Test Project') \n", - "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")\n", - "\n", + "project = portal.get_project_by_name('Pipeline Development') \n", + "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'Test dataset for variant calling' contains 2 files\n" + ] + } + ], + "source": [ "# Get a particular dataset from that project\n", "dataset = project.get_dataset_by_name('Test dataset for variant calling')\n", - "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")\n", - "\n", - "# Get the process to run on the dataset\n", - "process = portal.get_process_by_id('process-nf-core-sarek-3-0-1')\n", - "print(f\"Using the '{process.name}' process (ID: {process.id})\")" + "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")" ] }, { @@ -72,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" @@ -84,15 +202,15 @@ "output_type": "stream", "text": [ "Parameters:\n", - "\tExperiment Design (Group)\n", + "\tWorkflow Version (key=workflow_version, default=3.6.0, type=string, enum=['3.1', '3.1.1', '3.1.2', '3.2.3', '3.3.2', '3.4.4', '3.5.1', '3.6.0'], description=Select the specific version of nf-core/sarek used for analysis)\n", + "\tExperimental Design (Group)\n", "\t\tReference Genome (key=genome, default=GATK.GRCh38, type=string, enum=['GATK.GRCh38', 'GATK.GRCh37', 'GRCm38'])\n", "\t\tWhole Exome/Targeted Gene Panel Assay (key=wes, type=boolean, description=Please indicate if your data was generated using a capture kit.)\n", "\t\tGenomic intervals (key=intervals, type=string, description=Target bed file in case of whole exome or targeted sequencing or intervals file for parallelization.)\n", - "\t\tVariant Calling Type (key=analysis_type, default=Germline Variant Calling, enum=['Germline Variant Calling', 'Somatic Variant Calling'])\n", - "\tVariant Annotation (Group)\n", - "\t\tAnnotation tool(s) (key=annotation_tool, type=array, description=Please select one or both variant annotation tools.)\n", "\tRead Trimming Options (Group)\n", - "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n" + "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n", + "\tAdvanced Options (Group)\n", + "\t\tMarkDuplicates - Optical Duplicate Pixel Distance (key=optical_duplicate_pixel_distance, default=100, type=integer, description=The `--OPTICAL_DUPLICATE_PIXEL_DISTANCE` parameter is used by MarkDuplicates to set the maximum offset between two duplicate clusters in pixels for them to be considered optical duplicates. A value of 100 is generally appropriate for unpatterned Illumina flowcells and 250 is appropriate for patterned Illumina flow cells.)\n" ] } ], @@ -114,7 +232,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" @@ -126,13 +244,15 @@ "output_type": "stream", "text": [ "The BED references available are:\n", - "GRCh38_Chr20\n", - " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n", - " - wgs_calling_regions.hg19.bed\n", + "wgs_calling_regions.hg19.bed\n", + " - hg38\n", + " - epi2me-labs-wf-human-variation-ref\n", " - wgs_calling_regions.hg38.bed\n", + " - GRCh38_Chr20\n", + " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n", "\n", "The reference library we are using is: GRCh38_Chr20\n", - "The absolute path to the file is: s3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n" + "The absolute path to the file is: s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n" ] } ], @@ -153,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" @@ -163,25 +283,37 @@ { "data": { "text/plain": [ - "{'genome': 'GATK.GRCh38',\n", - " 'wes': True,\n", - " 'intervals': 's3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", - " 'trim_fastq': False,\n", - " 'annotation_tool': ['cnvkit', 'deepvariant']}" + "{'WORKFLOW_VERSION': '3.2.3',\n", + " 'analysis_type': {'genome': 'GATK.GRCh38',\n", + " 'wes': True,\n", + " 'analysis_type': 'Germline Variant Calling',\n", + " 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", + " 'tools': ['strelka', 'haplotypecaller']},\n", + " 'annotation': {'annotation_tool': []},\n", + " 'read_trimming_options': {'trim_fastq': False}}" ] }, - "execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "params = {\n", - " 'genome': 'GATK.GRCh38',\n", - " 'wes': True,\n", - " 'intervals': reference_library.absolute_path,\n", - " 'trim_fastq': False,\n", - " 'annotation_tool': ['cnvkit', 'deepvariant']\n", + " 'WORKFLOW_VERSION': '3.2.3',\n", + " 'analysis_type': {\n", + " 'genome': 'GATK.GRCh38',\n", + " 'wes': True,\n", + " 'analysis_type': 'Germline Variant Calling',\n", + " 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", + " 'tools': ['strelka', 'haplotypecaller']\n", + " },\n", + " 'annotation': {\n", + " 'annotation_tool': []\n", + " },\n", + " 'read_trimming_options': {\n", + " 'trim_fastq': False\n", + " }\n", "}\n", "params" ] @@ -200,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -225,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" @@ -236,16 +368,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "71ec598c-368b-47a5-84c8-c209739b050a\n" + "ca8eee87-09d9-4abe-ba0e-4e6ba48b33fa\n" ] } ], "source": [ "# Run the analysis, specifying a name and description for the resulting dataset\n", - "new_dataset_id = dataset.run_analysis(\n", + "new_dataset_id = input_dataset.run_analysis(\n", " name='Variant Calling Analysis',\n", " description='Test from SDK',\n", - " process='process-nf-core-sarek-3-0-1',\n", + " process=process,\n", " params=params\n", ")\n", "print(new_dataset_id)" @@ -275,7 +407,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.7" }, "vscode": { "interpreter": { @@ -284,5 +416,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From ed9916e2c4104c6379554955e591e95b4458de8f Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 15:56:11 -0700 Subject: [PATCH 21/24] Optionally filter the files downloaded from a dataset --- cirro/sdk/dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 6873d1d3..d96a5540 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -398,16 +398,21 @@ def list_artifacts(self) -> List[DataPortalFile]: ] ) - def download_files(self, download_location: str = None) -> None: + def download_files(self, download_location: str = None, glob: str = None) -> None: """ Download all the files from the dataset to a local directory. Args: download_location (str): Path to local directory + glob (str): Optional wildcard expression to filter which files are downloaded + (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``). + If omitted, all files are downloaded. """ - # Alias for internal method - self.list_files().download(download_location) + files = self.list_files() + if glob is not None: + files = DataPortalFiles(filter_files_by_pattern(list(files), glob)) + files.download(download_location) def run_analysis( self, From 220c9ea29b0f277e412d1e0737d88542cf450829 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 16:01:25 -0700 Subject: [PATCH 22/24] Add tests for reading files --- tests/test_read_files.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_read_files.py b/tests/test_read_files.py index 8bb56cc6..2292d0e9 100644 --- a/tests/test_read_files.py +++ b/tests/test_read_files.py @@ -354,6 +354,52 @@ def test_neither_glob_nor_pattern_raises(self): list(self.dataset.read_files()) +class TestDatasetDownloadFiles(unittest.TestCase): + def setUp(self): + self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n') + self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n') + self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n') + self.dataset = _make_dataset_with_files([ + self.csv_file, + self.tsv_file, + self.txt_file, + ]) + for f in [self.csv_file, self.tsv_file, self.txt_file]: + f.download = Mock(return_value=None) + + def _downloaded_paths(self): + return [ + f.relative_path + for f in [self.csv_file, self.tsv_file, self.txt_file] + if f.download.called + ] + + def test_no_glob_downloads_all(self): + self.dataset.download_files(download_location='/tmp') + self.assertEqual(len(self._downloaded_paths()), 3) + + def test_glob_filters_to_matching_files(self): + self.dataset.download_files(download_location='/tmp', glob='*.csv') + downloaded = self._downloaded_paths() + self.assertEqual(downloaded, ['data/results.csv']) + + def test_glob_matches_multiple_files(self): + self.dataset.download_files(download_location='/tmp', glob='data/*') + downloaded = self._downloaded_paths() + self.assertIn('data/results.csv', downloaded) + self.assertIn('data/counts.tsv', downloaded) + self.assertNotIn('logs/run.log', downloaded) + + def test_glob_no_match_downloads_nothing(self): + self.dataset.download_files(download_location='/tmp', glob='*.parquet') + self.assertEqual(len(self._downloaded_paths()), 0) + + def test_globstar_filters_by_subdirectory(self): + self.dataset.download_files(download_location='/tmp', glob='logs/**') + downloaded = self._downloaded_paths() + self.assertEqual(downloaded, ['logs/run.log']) + + class TestPatternToRegex(unittest.TestCase): def _match(self, pattern, path): compiled, _ = _pattern_to_captures_regex(pattern) From 3e271bf603cf8813626a5e00680deda4fbf12e8e Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 16:24:16 -0700 Subject: [PATCH 23/24] Add get_trace and get_logs --- cirro/sdk/dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index d96a5540..205a14d6 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -372,6 +372,24 @@ def read_file( return _read_file_with_format(file, filetype, **kwargs) + def get_trace(self) -> Any: + """ + Read the Nextflow workflow trace file for this dataset as a DataFrame. + + Returns: + `pandas.DataFrame` + """ + return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t') + + def get_logs(self) -> str: + """ + Read the Nextflow workflow logs for this dataset as a string. + + Returns: + str + """ + return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read() + def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile: """ Get the artifact of a particular type from the dataset From 96c81a959afeda88367bfc486dbebc27dd0d15d6 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Thu, 19 Mar 2026 16:26:19 -0700 Subject: [PATCH 24/24] Update samples --- samples/Downloading_a_dataset.ipynb | 202 +++--------- samples/Interacting_with_files.ipynb | 466 ++++++++++++++++----------- 2 files changed, 319 insertions(+), 349 deletions(-) diff --git a/samples/Downloading_a_dataset.ipynb b/samples/Downloading_a_dataset.ipynb index 71f372ff..cca284a6 100644 --- a/samples/Downloading_a_dataset.ipynb +++ b/samples/Downloading_a_dataset.ipynb @@ -34,7 +34,10 @@ } }, "source": [ - "You can get the list of all projects which are available, and select a particular project by name" + "If you don't know exactly what the name or ID is of the dataset you want to download,\n", + "you can get the list of all projects which are available, and select a particular project by name.\n", + "\n", + "### Inspecting datasets" ] }, { @@ -46,9 +49,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "There are 3 projects available\n", - "Selected the project 'Test Project' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n", - "This project contains 104 datasets to choose from\n" + "There are 5 projects available\n", + "Selected the project 'Pipeline Development' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n", + "This project contains 709 datasets to choose from\n" ] } ], @@ -56,7 +59,7 @@ "print(f\"There are {len(portal.list_projects()):,} projects available\")\n", "# print(portal.list_projects()) # run this line to see all the projects\n", "\n", - "project = portal.get_project_by_name(\"Test Project\")\n", + "project = portal.get_project_by_name(\"Pipeline Development\")\n", "print(f\"Selected the project '{project.name}' (ID: {project.id})\")\n", "print(f\"This project contains {len(project.list_datasets()):,} datasets to choose from\")" ] @@ -82,17 +85,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Name: Test of mageck-count (updated headnode code 9/22/2022) (3)\n", - "Id: bcda3e84-1abe-4d08-86b0-690ea7e1cdad\n", - "Description: Test of mageck-count (updated headnode code 9/22/2022)\n", + "Name: Genomic variant calling - parameter validation\n", + "Id: 3fb7e8f8-b62d-43a6-ad08-eb28f59bd141\n", + "Description: None\n", "Status: COMPLETED\n" ] } ], "source": [ "# Datasets can be selected by name or by ID\n", - "dataset = project.get_dataset_by_id(\"bcda3e84-1abe-4d08-86b0-690ea7e1cdad\")\n", - "# dataset = project.get_dataset_by_name(\"Test of mageck-count\")\n", + "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n", "print(dataset)" ] }, @@ -104,191 +106,63 @@ } }, "source": [ - "Download all of the files from that dataset to a temporary folder" + "### Downloading files\n", + "\n", + "Download all of the files from that dataset (to a temporary folder in this case)" ] }, { "cell_type": "code", "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also just select that dataset in a single call\n", + "dataset = portal.get_dataset(\n", + " project=\"Pipeline Development\",\n", + " dataset=\"Genomic variant calling - parameter validation\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" }, "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading file MO_Brunello_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.46MB/s\n", - "Downloading file MO_Brunello_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.83MB/s\n", - "Downloading file MO_Brunello_gDNA_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 2.16MB/s\n", - "Downloading file MO_Brunello_gDNA_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.39MB/s\n", - "Downloading file multiqc_report.html (1.12 MB) | 100.0%|█████████████████████████ | 1.35MB/s\n", - "Downloading file MO_Brunello_1.json (72.07 KB) | 100.0%|█████████████████████████ | 285kB/s\n", - "Downloading file MO_Brunello_1_fastqc.html (804.22 KB) | 100.0%|█████████████████████████ | 1.15MB/s\n", - "Downloading file MO_Brunello_2.json (72.07 KB) | 100.0%|█████████████████████████ | 349kB/s\n", - "Downloading file MO_Brunello_2_fastqc.html (824.26 KB) | 100.0%|█████████████████████████ | 1.19MB/s\n", - "Downloading file MO_Brunello_gDNA_1.json (72.53 KB) | 100.0%|█████████████████████████ | 319kB/s\n", - "Downloading file MO_Brunello_gDNA_1_fastqc.html (824.76 KB) | 100.0%|█████████████████████████ | 2.10MB/s\n", - "Downloading file MO_Brunello_gDNA_2.json (71.84 KB) | 100.0%|█████████████████████████ | 289kB/s\n", - "Downloading file MO_Brunello_gDNA_2_fastqc.html (815.26 KB) | 100.0%|█████████████████████████ | 1.95MB/s\n", - "Downloading file MO_Brunello_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.62MB/s\n", - "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.09MB/s\n", - "Downloading file MO_Brunello_1.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 1.42kB/s\n", - "Downloading file MO_Brunello_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.61MB/s\n", - "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.72MB/s\n", - "Downloading file MO_Brunello_2.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 2.28kB/s\n", - "Downloading file MO_Brunello_gDNA_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 2.82MB/s\n", - "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.57MB/s\n", - "Downloading file MO_Brunello_gDNA_1.countsummary.txt (247.00 B) | 100.0%|█████████████████████████ | 2.57kB/s\n", - "Downloading file MO_Brunello_gDNA_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.40MB/s\n", - "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.52MB/s\n", - "Downloading file MO_Brunello_gDNA_2.countsummary.txt (246.00 B) | 100.0%|█████████████████████████ | 2.33kB/s\n", - "Downloading file counts.txt (1.99 MB) | 100.0%|█████████████████████████ | 3.48MB/s\n", - "Downloading file sample_names.txt (65.00 B) | 100.0%|█████████████████████████ | 662B/s\n", - "Downloading file summary.txt (366.00 B) | 100.0%|█████████████████████████ | 2.41kB/s\n", - "Downloading file MO_Brunello_1.log (2.39 KB) | 100.0%|█████████████████████████ | 11.1kB/s\n", - "Downloading file MO_Brunello_2.log (2.39 KB) | 100.0%|█████████████████████████ | 16.1kB/s\n", - "Downloading file MO_Brunello_gDNA_1.log (2.43 KB) | 100.0%|█████████████████████████ | 23.2kB/s\n", - "Downloading file MO_Brunello_gDNA_2.log (2.43 KB) | 100.0%|█████████████████████████ | 19.4kB/s\n" - ] - } - ], + "outputs": [], "source": [ - "dataset.download_files(\"/tmp\")" + "# dataset.download_files(\"/tmp\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Alternatively, you can inspect and filter the list of files to only what is needed" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/cutadapt/trim/fastq/MO_Brunello_1.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_2.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_gDNA_1.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_gDNA_2.fastq (920000 bytes)\n", - "\n", - "data/fastqc/multiqc_report.html (1173155 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_1/MO_Brunello_1.json (73803 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_1/MO_Brunello_1_fastqc.html (823526 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_2/MO_Brunello_2.json (73797 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_2/MO_Brunello_2_fastqc.html (844044 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1.json (74268 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1_fastqc.html (844554 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2.json (73563 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2_fastqc.html (834827 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_1.count.txt (1625955 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_1.countsummary.txt (237 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count.txt (1625955 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.countsummary.txt (237 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count.txt (1625960 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.countsummary.txt (247 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count.txt (1625960 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.countsummary.txt (246 bytes)\n", - "\n", - "data/mageck/count/combined/counts.txt (2090653 bytes)\n", - "\n", - "data/mageck/count/combined/sample_names.txt (65 bytes)\n", - "\n", - "data/mageck/count/combined/summary.txt (366 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_1.log (2449 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_2.log (2449 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_gDNA_1.log (2489 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_gDNA_2.log (2488 bytes)\n" - ] - } - ], - "source": [ - "files = dataset.list_files()\n", - "print(files)" + "Alternatively, you can filter the list of files to only what is needed" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n" - ] - } - ], - "source": [ - "norm_counts = files.filter_by_pattern(\"*.count_normalized.txt\")\n", - "print(norm_counts)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.86MB/s\n", - "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.78MB/s\n", - "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.86MB/s\n", - "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.27MB/s\n" + "Downloading file ERR031935.haplotypecaller.filtered.vcf.gz (401.08 KB) | 100.0%|█████████████████████████ | 1.71MB/s\n", + "Downloading file ERR031935.haplotypecaller.vcf.gz (357.77 KB) | 100.0%|█████████████████████████ | 1.50MB/s\n", + "Downloading file ERR031935.strelka.genome.vcf.gz (12.29 MB) | 100.0%|█████████████████████████ | 6.54MB/s\n", + "Downloading file ERR031935.strelka.variants.vcf.gz (970.75 KB) | 100.0%|█████████████████████████ | 2.55MB/s\n" ] } ], "source": [ - "norm_counts.download(\"/tmp\")" + "dataset.download_files(\"/tmp\", glob=\"*.vcf.gz\")" ] }, { @@ -315,7 +189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.7" }, "vscode": { "interpreter": { @@ -324,5 +198,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/samples/Interacting_with_files.ipynb b/samples/Interacting_with_files.ipynb index 929d9dfb..91b35b48 100644 --- a/samples/Interacting_with_files.ipynb +++ b/samples/Interacting_with_files.ipynb @@ -13,28 +13,37 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "pycharm": { - "name": "#%%\n" - }, "ExecuteTime": { "end_time": "2025-03-25T19:16:07.482109Z", "start_time": "2025-03-25T19:16:06.304549Z" + }, + "pycharm": { + "name": "#%%\n" } }, + "outputs": [], "source": [ "from cirro import DataPortal\n", "\n", - "portal = DataPortal()" - ], - "outputs": [], - "execution_count": 1 + "portal = DataPortal(base_url=\"\")" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Find the file you are looking for by defining the project and dataset, then searching for a particular file of interest based on a pattern using `filter_by_pattern`" + "Find the file you are looking for by defining the project and dataset, then using `read_file` or `read_files` to read file contents directly into Python objects.\n", + "\n", + "The file format is inferred automatically from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified with the `format` parameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspecting files" ] }, { @@ -50,31 +59,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "The project Test Project contains 104 datasets\n", - "Dataset Test of mageck-count contains 32 files\n", - "Selected the file: data/mageck/count/combined/counts.txt (2090653 bytes)\n" + "Dataset: Genomic variant calling - parameter validation\n", + "Files: 235\n", + "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.filtered.vcf.gz\n", + "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.vcf.gz\n", + "data/variant_calling/strelka/ERR031935/ERR031935.strelka.genome.vcf.gz\n", + "data/variant_calling/strelka/ERR031935/ERR031935.strelka.variants.vcf.gz\n" ] } ], "source": [ "# Get the project which contains the dataset\n", - "project = portal.get_project_by_name('Test Project')\n", - "\n", - "# Get the set of datasets within that project\n", - "all_datasets = project.list_datasets()\n", - "print(f\"The project {project.name} contains {len(all_datasets):,} datasets\")\n", + "project = portal.get_project_by_name(\"Pipeline Development\")\n", "\n", "# Get the dataset of interest based on its name\n", - "dataset = all_datasets.get_by_name('Test of mageck-count')\n", - "\n", - "# Get the complete list of files in that dataset\n", - "files = dataset.list_files()\n", - "print(f\"Dataset {dataset.name} contains {len(files):,} files\")\n", - "\n", - "# Filter to just the files named counts.txt (using the wildcard to match the string of folders it is in)\n", - "counts = files.filter_by_pattern(\"*/counts.txt\")\n", + "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n", "\n", - "print(f\"Selected the file: {counts.description()}\")" + "print(f\"Dataset: {dataset.name}\")\n", + "print(f\"Files: {len(dataset.list_files()):,}\")\n", + "for file in dataset.list_files():\n", + " if file.name.endswith('.vcf.gz'):\n", + " print(file.name)" ] }, { @@ -85,7 +90,9 @@ } }, "source": [ - "Load the contents of that file into a DataFrame (keeping in mind that it is tab-delimited, not the default comma-delimited)" + "### Reading a file\n", + "\n", + "Read a single file into a DataFrame using `read_file`. The tab-separated format is specified explicitly with `sep='\\t'`." ] }, { @@ -118,78 +125,109 @@ " \n", " \n", " \n", - " sgRNA\n", - " Gene\n", - " MO_Brunello_gDNA_2\n", - " MO_Brunello_1\n", - " MO_Brunello_2\n", - " MO_Brunello_gDNA_1\n", + " 0\n", + " 1\n", + " 2\n", + " 3\n", + " 4\n", + " 5\n", + " 6\n", + " 7\n", + " 8\n", + " 9\n", " \n", " \n", " \n", " \n", " 0\n", - " A1BG_0\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 60826\n", + " .\n", + " T\n", + " A\n", + " 1\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=17;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...\n", " \n", " \n", " 1\n", - " A1BG_1\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 2\n", + " chr20\n", + " 60850\n", + " .\n", + " A\n", + " T\n", + " 1\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=4\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...\n", " \n", " \n", " 2\n", - " A1BG_2\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 62437\n", + " .\n", + " C\n", + " T\n", + " 3\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=22;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...\n", " \n", " \n", " 3\n", - " A1BG_3\n", - " A1BG\n", - " 0\n", - " 0\n", - " 2\n", - " 0\n", + " chr20\n", + " 62467\n", + " .\n", + " C\n", + " A\n", + " 4\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...\n", " \n", " \n", " 4\n", - " A1CF_36946\n", - " A1CF\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 62469\n", + " .\n", + " G\n", + " A\n", + " 3\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=3\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sgRNA Gene MO_Brunello_gDNA_2 MO_Brunello_1 MO_Brunello_2 \\\n", - "0 A1BG_0 A1BG 0 0 0 \n", - "1 A1BG_1 A1BG 0 0 0 \n", - "2 A1BG_2 A1BG 0 0 0 \n", - "3 A1BG_3 A1BG 0 0 2 \n", - "4 A1CF_36946 A1CF 0 0 0 \n", + " 0 1 2 3 4 5 6 \\\n", + "0 chr20 60826 . T A 1 LowDepth;LowGQX;NoPassedVariantGTs \n", + "1 chr20 60850 . A T 1 LowDepth;LowGQX;NoPassedVariantGTs \n", + "2 chr20 62437 . C T 3 LowDepth;LowGQX;NoPassedVariantGTs \n", + "3 chr20 62467 . C A 4 LowDepth;LowGQX;NoPassedVariantGTs \n", + "4 chr20 62469 . G A 3 LowDepth;LowGQX;NoPassedVariantGTs \n", "\n", - " MO_Brunello_gDNA_1 \n", - "0 0 \n", - "1 2 \n", - "2 0 \n", - "3 0 \n", - "4 0 " + " 7 8 \\\n", + "0 MQ=17;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "1 MQ=24;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "2 MQ=22;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "3 MQ=24;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "4 MQ=24;SNVHPOL=3 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "\n", + " 9 \n", + "0 0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29... \n", + "1 0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30... \n", + "2 0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35... \n", + "3 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36... \n", + "4 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34... " ] }, "execution_count": 3, @@ -198,56 +236,72 @@ } ], "source": [ - "df = counts[0].read_csv(sep=\"\\t\")\n", + "# Read a single file matched by a glob pattern\n", + "df = dataset.read_file(glob=\"*.variants.vcf.gz\", filetype=\"csv\", sep=\"\\t\", comment=\"#\", header=None)\n", "df.head()" ] }, { + "cell_type": "markdown", "metadata": {}, + "source": [ + "### Reading multiple files\n", + "\n", + "Use `read_files` to iterate over multiple matching files. With `{name}` capture placeholders in the `pattern`, extracted values are returned alongside each file's content." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'sample': 'ERR031935', 'type': 'genome'} (790381, 10)\n", + "{'sample': 'ERR031935', 'type': 'variants'} (36318, 10)\n" + ] + } + ], + "source": [ + "# Extract folder names from the path automatically using {name} placeholders\n", + "for df, meta in dataset.read_files(\n", + " pattern=\"*/strelka/{sample}/*.strelka.{type}.vcf.gz\",\n", + " filetype=\"csv\",\n", + " sep=\"\\t\",\n", + " comment=\"#\",\n", + " header=None\n", + "):\n", + " print(meta, df.shape)" + ] + }, + { "cell_type": "markdown", - "source": "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs." + "metadata": {}, + "source": [ + "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting metadata" + ] }, { + "cell_type": "code", + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2025-03-25T19:16:35.472469Z", "start_time": "2025-03-25T19:16:31.215624Z" } }, - "cell_type": "code", - "source": [ - "from cirro_api_client.v1.models import ArtifactType\n", - "\n", - "# Reading nextflow trace file\n", - "trace_file = dataset.get_artifact(ArtifactType.WORKFLOW_TRACE)\n", - "trace_df = trace_file.read_csv(sep=\"\\t\")\n", - "trace_df.head()" - ], "outputs": [ { "data": { - "text/plain": [ - " task_id hash native_id \\\n", - "0 7 99/b42c07 826623a0-0ed5-44ff-8a94-e3802cccf531 \n", - "1 5 71/8e3d51 ace41478-ba98-403d-a6d1-3e95ad64c36f \n", - "2 8 71/535e08 9d499098-6ed7-422b-9233-9983f775fdee \n", - "3 1 41/c494ef 3a221dd3-7ca8-41e1-8212-856b6154be64 \n", - "4 2 25/13b116 94f91d55-1d41-4afd-88b4-743d75817032 \n", - "\n", - " name status exit submit duration \\\n", - "0 trim:trim_adapters (4) COMPLETED 0 2022-05-24 16:27:01.413 5m 38s \n", - "1 trim:trim_adapters (3) COMPLETED 0 2022-05-24 16:27:01.421 5m 38s \n", - "2 fastqc (4) COMPLETED 0 2022-05-24 16:27:01.464 5m 48s \n", - "3 fastqc (1) COMPLETED 0 2022-05-24 16:27:01.465 5m 48s \n", - "4 trim:trim_adapters (1) COMPLETED 0 2022-05-24 16:27:01.476 5m 58s \n", - "\n", - " realtime %cpu peak_rss peak_vmem rchar wchar \n", - "0 1s 76.6% 3.1 MB 5.4 MB 1.8 MB 900.5 KB \n", - "1 4s 6.4% 11.6 MB 17.3 MB 1.8 MB 900.5 KB \n", - "2 3s 104.8% 152.7 MB 3.2 GB 15.9 MB 4.1 MB \n", - "3 3s 102.5% 140.2 MB 3.2 GB 16 MB 4.1 MB \n", - "4 1s 75.8% 3.1 MB 5.4 MB 1.8 MB 900.5 KB " - ], "text/html": [ "
\n", "