Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e4cb0f4
Add read_files method to DataPortalDataset and DataPortalProject
claude Mar 19, 2026
57c2e4e
Add support for json, parquet, feather, pickle, and excel file formats
claude Mar 19, 2026
40cb5ae
Add {name} capture syntax to read_files for automatic path extraction
claude Mar 19, 2026
dec37a0
Replace positional pattern arg with explicit glob= and pattern= kwargs
claude Mar 19, 2026
96de2ad
Merge branch 'CirroBio:main' into claude/add-read-files-function-5LNXG
sminot Mar 19, 2026
916ab8a
Require dataset argument on project.read_files()
claude Mar 19, 2026
4cf45aa
Fix flake8
sminot Mar 19, 2026
8bf8338
Merge branch 'main' into pr/195
sminot Mar 19, 2026
29e0c42
Get dataset by name or id
sminot Mar 19, 2026
7b59277
Add singular read_file function
sminot Mar 19, 2026
52ee650
Increment version
sminot Mar 19, 2026
75e4e6a
Bugfixes
sminot Mar 19, 2026
30abda9
Move from project to portal
sminot Mar 19, 2026
05c78b4
Change file_format to format
sminot Mar 19, 2026
84c36ba
Clean up
sminot Mar 19, 2026
96764c2
Move the primary read_files docs to the DataPortal object
sminot Mar 19, 2026
595b0a2
format -> filetype
sminot Mar 19, 2026
5be8998
captures -> meta
sminot Mar 19, 2026
adf8814
Update README.md
sminot Mar 19, 2026
e51ba84
Add tests
sminot Mar 19, 2026
21550d4
Read file(s) as bytes
sminot Mar 19, 2026
9847e4d
Update example for running analysis
sminot Mar 19, 2026
ed9916e
Optionally filter the files downloaded from a dataset
sminot Mar 19, 2026
220c9ea
Add tests for reading files
sminot Mar 19, 2026
3e271bf
Add get_trace and get_logs
sminot Mar 19, 2026
96c81a9
Update samples
sminot Mar 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,49 @@ See the following set of Jupyter notebooks that contain examples on the followin
| [Using references](samples/Using_references.ipynb) | Managing reference data |
| [Advanced usage](samples/Advanced_usage.ipynb) | Advanced operations |

### Reading files

The `read_file` and `read_files` methods provide a convenient way to read dataset files directly into Python objects. The file format is inferred from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`/`.pickle`, `.xlsx`/`.xls`, `.h5ad`), including compressed variants such as `.csv.gz`, or can be specified explicitly.

```python
from cirro import DataPortal

# If not logged in, this will prompt with a login URL
portal = DataPortal()

# Read a single file from the indicated dataset
df = portal.read_file(project="My Project", dataset="My Dataset", glob="**/results.csv")

# Iterate over each of the files ending in .csv within a dataset
for df in portal.read_files(project="My Project", dataset="My Dataset", glob="*.csv"):
print(df.shape)

```

You can also call these methods on the `DataPortalDataset` object:

```python
# Get an object representing a single dataset
dataset = portal.get_dataset(project="My Project", dataset="My Dataset")

# Read a single file by exact path or glob pattern
df = dataset.read_file(path="data/results.csv")
df = dataset.read_file(glob="**/results.csv")

# Read multiple files matching a pattern — yields one result per file
for df in dataset.read_files(glob="**/*.csv"):
print(df.shape)

# Extract values from the path using {name} capture placeholders
for df, meta in dataset.read_files(pattern="{sample}/results.csv"):
print(meta["sample"], df.shape)

# Extra keyword arguments are forwarded to the file-parsing function
for df in dataset.read_files(glob="**/*.tsv.gz", filetype="csv", sep="\t"):
print(df.shape)
```


## R Usage

| Jupyter Notebook | Topic |
Expand Down
2 changes: 1 addition & 1 deletion cirro/sdk/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_by_name(self, name: str) -> T:
# Error if multiple projects are found
msg = f"Multiple {self.asset_name} items found with name '{name}', use ID instead.\n{self.description()}"
if len(matching_queries) > 1:
raise DataPortalAssetNotFound(msg)
raise DataPortalInputError(msg)

return matching_queries[0]

Expand Down
209 changes: 203 additions & 6 deletions cirro/sdk/dataset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import datetime
import re
from pathlib import Path
from typing import Union, List, Optional
from typing import Union, List, Optional, Any

from cirro_api_client.v1.api.processes import validate_file_requirements
from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, ValidateFileRequirementsRequest

from cirro.cirro_client import CirroApi
from cirro.file_utils import filter_files_by_pattern
from cirro.models.assets import DatasetAssets
from cirro.models.file import PathLike
from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
Expand All @@ -17,6 +19,93 @@
from cirro.sdk.process import DataPortalProcess


def _pattern_to_captures_regex(pattern: str):
"""
Convert a glob pattern that may contain ``{name}`` capture placeholders into
a compiled regex and return ``(compiled_regex, capture_names)``.

Conversion rules:
- ``{name}`` → named group matching a single path segment (no ``/``)
- ``*`` → matches any characters within a single path segment
- ``**`` → matches any characters including ``/`` (multiple segments)
- All other characters are regex-escaped.

The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``):
a pattern without a leading ``/`` will match at any depth in the path.
"""
capture_names = re.findall(r'\{(\w+)\}', pattern)
tokens = re.split(r'(\*\*|\*|\{\w+\})', pattern)
parts = []
for token in tokens:
if token == '**':
parts.append('.*')
elif token == '*':
parts.append('[^/]*')
elif re.match(r'^\{\w+\}$', token):
name = token[1:-1]
parts.append(f'(?P<{name}>[^/]+)')
else:
parts.append(re.escape(token))
regex_str = ''.join(parts)
if not pattern.startswith('/'):
regex_str = r'(?:.+/)?' + regex_str
return re.compile('^' + regex_str + '$'), capture_names


def _infer_file_format(path: str) -> str:
"""Infer the file format from the file extension."""
path_lower = path.lower()
for ext in ('.gz', '.bz2', '.xz', '.zst'):
if path_lower.endswith(ext):
path_lower = path_lower[:-len(ext)]
break
if path_lower.endswith('.csv') or path_lower.endswith('.tsv'):
return 'csv'
elif path_lower.endswith('.h5ad'):
return 'h5ad'
elif path_lower.endswith('.json'):
return 'json'
elif path_lower.endswith('.parquet'):
return 'parquet'
elif path_lower.endswith('.feather'):
return 'feather'
elif path_lower.endswith('.pkl') or path_lower.endswith('.pickle'):
return 'pickle'
elif path_lower.endswith('.xlsx') or path_lower.endswith('.xls'):
return 'excel'
else:
return 'text'


def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any:
    """Read a file using the specified format, or auto-detect from extension."""
    if file_format is None:
        file_format = _infer_file_format(file.relative_path)
    # Dispatch table: format identifier -> name of the DataPortalFile reader method
    method_by_format = {
        'csv': 'read_csv',
        'h5ad': 'read_h5ad',
        'json': 'read_json',
        'parquet': 'read_parquet',
        'feather': 'read_feather',
        'pickle': 'read_pickle',
        'excel': 'read_excel',
        'text': 'read',
        'bytes': '_get',
    }
    if file_format not in method_by_format:
        raise DataPortalInputError(
            f"Unsupported file_format: '{file_format}'. "
            f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'"
        )
    reader = getattr(file, method_by_format[file_format])
    # 'h5ad' and 'bytes' readers take no extra keyword arguments
    if file_format in ('h5ad', 'bytes'):
        return reader()
    return reader(**kwargs)


class DataPortalDataset(DataPortalAsset):
"""
Datasets in the Data Portal are collections of files which have
Expand All @@ -31,7 +120,7 @@ def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
Should be invoked from a top-level constructor, for example:

```python
from cirro import DataPortal()
from cirro import DataPortal
portal = DataPortal()
dataset = portal.get_dataset(
project="id-or-name-of-project",
Expand Down Expand Up @@ -199,6 +288,108 @@ def list_files(self) -> DataPortalFiles:
]
)

def read_files(
self,
glob: str = None,
pattern: str = None,
filetype: str = None,
**kwargs
):
"""
Read the contents of files in the dataset.

See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details
on ``glob``/``pattern`` matching and filetype options.

Args:
glob (str): Wildcard expression to match files.
Yields one item per matching file: the parsed content.
pattern (str): Wildcard expression with ``{name}`` capture
placeholders. Yields ``(content, meta)`` per matching file.
filetype (str): File format used to parse each file
(or ``None`` to infer from extension).
**kwargs: Additional keyword arguments forwarded to the
file-parsing function.

Yields:
- When using ``glob``: *content* for each matching file
- When using ``pattern``: ``(content, meta)`` for each matching file
"""
if glob is not None and pattern is not None:
raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
if glob is None and pattern is None:
raise DataPortalInputError("Must specify either 'glob' or 'pattern'")

if glob is not None:
for file in filter_files_by_pattern(list(self.list_files()), glob):
yield _read_file_with_format(file, filetype, **kwargs)
else:
compiled_regex, _ = _pattern_to_captures_regex(pattern)
for file in self.list_files():
m = compiled_regex.match(file.relative_path)
if m is not None:
yield _read_file_with_format(file, filetype, **kwargs), m.groupdict()

def read_file(
self,
path: str = None,
glob: str = None,
filetype: str = None,
**kwargs
) -> Any:
"""
Read the contents of a single file from the dataset.

See :meth:`~cirro.sdk.portal.DataPortal.read_file` for full details.

Args:
path (str): Exact relative path of the file within the dataset.
glob (str): Wildcard expression matching exactly one file.
filetype (str): File format used to parse the file. Supported values
are the same as :meth:`~cirro.sdk.portal.DataPortal.read_files`.
**kwargs: Additional keyword arguments forwarded to the file-parsing
function.

Returns:
Parsed file content.
"""
if path is not None and glob is not None:
raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
if path is None and glob is None:
raise DataPortalInputError("Must specify either 'path' or 'glob'")

if path is not None:
file = self.get_file(path)
else:
matches = list(filter_files_by_pattern(list(self.list_files()), glob))
if len(matches) == 0:
raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
if len(matches) > 1:
raise DataPortalInputError(
f"glob '{glob}' matched {len(matches)} files — use read_files() to read multiple files"
)
file = matches[0]

return _read_file_with_format(file, filetype, **kwargs)

def get_trace(self) -> Any:
"""
Read the Nextflow workflow trace file for this dataset as a DataFrame.

Returns:
`pandas.DataFrame`
"""
return self.get_artifact(ArtifactType.WORKFLOW_TRACE).read_csv(sep='\t')

def get_logs(self) -> str:
"""
Read the Nextflow workflow logs for this dataset as a string.

Returns:
str
"""
return self.get_artifact(ArtifactType.WORKFLOW_LOGS).read()

def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
"""
Get the artifact of a particular type from the dataset
Expand All @@ -225,16 +416,21 @@ def list_artifacts(self) -> List[DataPortalFile]:
]
)

def download_files(self, download_location: str = None) -> None:
def download_files(self, download_location: str = None, glob: str = None) -> None:
"""
Download all the files from the dataset to a local directory.

Args:
download_location (str): Path to local directory
glob (str): Optional wildcard expression to filter which files are downloaded
(e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
If omitted, all files are downloaded.
"""

# Alias for internal method
self.list_files().download(download_location)
files = self.list_files()
if glob is not None:
files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
files.download(download_location)

def run_analysis(
self,
Expand Down Expand Up @@ -281,6 +477,7 @@ def run_analysis(
process = parse_process_name_or_id(process, self._client)

if compute_environment:
compute_environment_name = compute_environment
compute_environments = self._client.compute_environments.list_environments_for_project(
project_id=self.project_id
)
Expand All @@ -290,7 +487,7 @@ def run_analysis(
None
)
if compute_environment is None:
raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")

resp = self._client.execution.run_analysis(
project_id=self.project_id,
Expand Down
Loading
Loading