diff --git a/README.md b/README.md index ea78bb9..f5ee0bb 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,49 @@ See the following set of Jupyter notebooks that contain examples on the followin | [Using references](samples/Using_references.ipynb) | Managing reference data | | [Advanced usage](samples/Advanced_usage.ipynb) | Advanced operations | +### Reading files + +The `read_file` and `read_files` methods provide a convenient way to read dataset files directly into Python objects. The file format is inferred from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified explicitly. + +```python +from cirro import DataPortal + +# If not logged in, this will prompt with a login URL +portal = DataPortal() + +# Read a single file from the indicated dataset +df = portal.read_file(project="My Project", dataset="My Dataset", glob="**/results.csv") + +# Iterate over each of the files ending in .csv within a dataset +for df in portal.read_files(project="My Project", dataset="My Dataset", glob="*.csv"): + print(df.shape) + +``` + +You can also call these methods on the `DataPortalDataset` object: + +```python +# Get an object representing a single dataset +dataset = portal.get_dataset(project="My Project", dataset="My Dataset") + +# Read a single file by exact path or glob pattern +df = dataset.read_file(path="data/results.csv") +df = dataset.read_file(glob="**/results.csv") + +# Read multiple files matching a pattern — yields one result per file +for df in dataset.read_files(glob="**/*.csv"): + print(df.shape) + +# Extract values from the path using {name} capture placeholders +for df, meta in dataset.read_files(pattern="{sample}/results.csv"): + print(meta["sample"], df.shape) + +# Extra keyword arguments are forwarded to the file-parsing function +for df in dataset.read_files(glob="**/*.tsv.gz", filetype="csv", sep="\t"): + print(df.shape) +``` + + ## R Usage | Jupyter Notebook | Topic | diff --git 
a/cirro/sdk/asset.py b/cirro/sdk/asset.py index ce1eea0..082200f 100644 --- a/cirro/sdk/asset.py +++ b/cirro/sdk/asset.py @@ -60,7 +60,7 @@ def get_by_name(self, name: str) -> T: # Error if multiple projects are found msg = f"Multiple {self.asset_name} items found with name '{name}', use ID instead.\n{self.description()}" if len(matching_queries) > 1: - raise DataPortalAssetNotFound(msg) + raise DataPortalInputError(msg) return matching_queries[0] diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index ee89247..205a14d 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -1,12 +1,14 @@ import datetime +import re from pathlib import Path -from typing import Union, List, Optional +from typing import Union, List, Optional, Any from cirro_api_client.v1.api.processes import validate_file_requirements from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \ RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, ValidateFileRequirementsRequest from cirro.cirro_client import CirroApi +from cirro.file_utils import filter_files_by_pattern from cirro.models.assets import DatasetAssets from cirro.models.file import PathLike from cirro.sdk.asset import DataPortalAssets, DataPortalAsset @@ -17,6 +19,93 @@ from cirro.sdk.process import DataPortalProcess +def _pattern_to_captures_regex(pattern: str): + """ + Convert a glob pattern that may contain ``{name}`` capture placeholders into + a compiled regex and return ``(compiled_regex, capture_names)``. + + Conversion rules: + - ``{name}`` → named group matching a single path segment (no ``/``) + - ``*`` → matches any characters within a single path segment + - ``**`` → matches any characters including ``/`` (multiple segments) + - All other characters are regex-escaped. + + The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``): + a pattern without a leading ``/`` will match at any depth in the path. 
+ """ + capture_names = re.findall(r'\{(\w+)\}', pattern) + tokens = re.split(r'(\*\*|\*|\{\w+\})', pattern) + parts = [] + for token in tokens: + if token == '**': + parts.append('.*') + elif token == '*': + parts.append('[^/]*') + elif re.match(r'^\{\w+\}$', token): + name = token[1:-1] + parts.append(f'(?P<{name}>[^/]+)') + else: + parts.append(re.escape(token)) + regex_str = ''.join(parts) + if not pattern.startswith('/'): + regex_str = r'(?:.+/)?' + regex_str + return re.compile('^' + regex_str + '$'), capture_names + + +def _infer_file_format(path: str) -> str: + """Infer the file format from the file extension.""" + path_lower = path.lower() + for ext in ('.gz', '.bz2', '.xz', '.zst'): + if path_lower.endswith(ext): + path_lower = path_lower[:-len(ext)] + break + if path_lower.endswith('.csv') or path_lower.endswith('.tsv'): + return 'csv' + elif path_lower.endswith('.h5ad'): + return 'h5ad' + elif path_lower.endswith('.json'): + return 'json' + elif path_lower.endswith('.parquet'): + return 'parquet' + elif path_lower.endswith('.feather'): + return 'feather' + elif path_lower.endswith('.pkl') or path_lower.endswith('.pickle'): + return 'pickle' + elif path_lower.endswith('.xlsx') or path_lower.endswith('.xls'): + return 'excel' + else: + return 'text' + + +def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any: + """Read a file using the specified format, or auto-detect from extension.""" + if file_format is None: + file_format = _infer_file_format(file.relative_path) + if file_format == 'csv': + return file.read_csv(**kwargs) + elif file_format == 'h5ad': + return file.read_h5ad() + elif file_format == 'json': + return file.read_json(**kwargs) + elif file_format == 'parquet': + return file.read_parquet(**kwargs) + elif file_format == 'feather': + return file.read_feather(**kwargs) + elif file_format == 'pickle': + return file.read_pickle(**kwargs) + elif file_format == 'excel': + return file.read_excel(**kwargs) + 
elif file_format == 'text': + return file.read(**kwargs) + elif file_format == 'bytes': + return file._get() + else: + raise DataPortalInputError( + f"Unsupported file_format: '{file_format}'. " + f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'" + ) + + class DataPortalDataset(DataPortalAsset): """ Datasets in the Data Portal are collections of files which have @@ -31,7 +120,7 @@ def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi): Should be invoked from a top-level constructor, for example: ```python - from cirro import DataPortal() + from cirro import DataPortal portal = DataPortal() dataset = portal.get_dataset( project="id-or-name-of-project", @@ -199,6 +288,108 @@ def list_files(self) -> DataPortalFiles: ] ) + def read_files( + self, + glob: str = None, + pattern: str = None, + filetype: str = None, + **kwargs + ): + """ + Read the contents of files in the dataset. + + See :meth:`~cirro.sdk.portal.DataPortal.read_files` for full details + on ``glob``/``pattern`` matching and filetype options. + + Args: + glob (str): Wildcard expression to match files. + Yields one item per matching file: the parsed content. + pattern (str): Wildcard expression with ``{name}`` capture + placeholders. Yields ``(content, meta)`` per matching file. + filetype (str): File format used to parse each file + (or ``None`` to infer from extension). + **kwargs: Additional keyword arguments forwarded to the + file-parsing function. 
def read_file(
    self,
    path: str = None,
    glob: str = None,
    filetype: str = None,
    **kwargs
) -> Any:
    """
    Read and parse exactly one file of this dataset.

    Exactly one of ``path`` or ``glob`` must be given.

    Args:
        path (str): Relative path of the file, resolved with ``get_file``.
        glob (str): Wildcard expression that must match exactly one of the
            files returned by ``list_files``.
        filetype (str): Format used to parse the file, or ``None`` to infer
            it from the file extension.
        **kwargs: Forwarded to the file-parsing function.

    Returns:
        Parsed file content.

    Raises:
        DataPortalInputError: if both or neither of ``path``/``glob`` are
            given, or if ``glob`` matches more than one file.
        DataPortalAssetNotFound: if ``glob`` matches no file.
    """
    # The two argument checks are mutually exclusive, so their order is free.
    if path is None and glob is None:
        raise DataPortalInputError("Must specify either 'path' or 'glob'")
    if path is not None and glob is not None:
        raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")

    if path is not None:
        return _read_file_with_format(self.get_file(path), filetype, **kwargs)

    candidates = list(filter_files_by_pattern(list(self.list_files()), glob))
    if not candidates:
        raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
    if len(candidates) > 1:
        raise DataPortalInputError(
            f"glob '{glob}' matched {len(candidates)} files — use read_files() to read multiple files"
        )
    (only_match,) = candidates
    return _read_file_with_format(only_match, filetype, **kwargs)


def get_trace(self) -> Any:
    """
    Read the workflow trace artifact of this dataset as tab-separated values.

    Returns:
        The result of ``read_csv(sep='\\t')`` on the ``WORKFLOW_TRACE`` artifact.
    """
    trace_file = self.get_artifact(ArtifactType.WORKFLOW_TRACE)
    return trace_file.read_csv(sep='\t')


def get_logs(self) -> str:
    """
    Read the workflow logs artifact of this dataset.

    Returns:
        str: contents of the ``WORKFLOW_LOGS`` artifact.
    """
    log_file = self.get_artifact(ArtifactType.WORKFLOW_LOGS)
    return log_file.read()


def download_files(self, download_location: str = None, glob: str = None) -> None:
    """
    Download files from the dataset to a local directory.

    Args:
        download_location (str): Path to local directory; passed through to
            the file collection's ``download`` method.
        glob (str): Optional wildcard expression selecting which files are
            downloaded (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
            If omitted, all files are downloaded.
    """
    every_file = self.list_files()
    if glob is None:
        selected = every_file
    else:
        selected = DataPortalFiles(filter_files_by_pattern(list(every_file), glob))
    selected.download(download_location)
def read_json(self, **kwargs):
    """
    Read the file contents as a parsed JSON object (dict, list, etc.).

    The raw bytes returned by ``_get`` are handed to :func:`json.loads`
    unchanged; all keyword arguments are passed to :func:`json.loads`.
    """
    return json.loads(self._get(), **kwargs)


def read_parquet(self, **kwargs) -> 'DataFrame':
    """
    Read a Parquet file as a Pandas DataFrame.

    Requires ``pyarrow`` or ``fastparquet`` to be installed.
    All keyword arguments are passed to :func:`pandas.read_parquet`.
    """
    # Imported lazily so pandas is only needed when this reader is used.
    import pandas
    return pandas.read_parquet(BytesIO(self._get()), **kwargs)


def read_feather(self, **kwargs) -> 'DataFrame':
    """
    Read a Feather file as a Pandas DataFrame.

    Requires ``pyarrow`` to be installed.
    All keyword arguments are passed to :func:`pandas.read_feather`.
    """
    # Imported lazily so pandas is only needed when this reader is used.
    import pandas
    return pandas.read_feather(BytesIO(self._get()), **kwargs)


def read_pickle(self, **kwargs):
    """
    Read the file contents as a Python pickle object.

    All keyword arguments are passed to :func:`pickle.loads`.

    .. warning::
        Unpickling can execute arbitrary code. Only use this on files whose
        producer you trust.
    """
    # SECURITY: pickle.loads runs on bytes fetched from the dataset; a
    # malicious pickle executes code on load. Deliberately left in place
    # (flagged, not replaced) — callers must trust the file's origin.
    return pickle.loads(self._get(), **kwargs)
def read_files(
    self,
    project: str,
    dataset: str,
    glob: str = None,
    pattern: str = None,
    filetype: str = None,
    **kwargs
):
    """
    Read the contents of files from a dataset.

    The project and dataset can each be identified by name or ID.
    Exactly one of ``glob`` or ``pattern`` must be provided.

    The project and dataset are resolved when this method is called, so a
    wrong project or dataset is reported immediately rather than on the
    first iteration of the result.

    **glob** — standard wildcard matching; yields the file content for each
    matching file:

    - ``*`` matches any characters within a single path segment
    - ``**`` matches zero or more path segments
    - Matching is suffix-anchored (``*.csv`` matches at any depth)

    **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
    of the path automatically; yields ``(content, meta)`` pairs where
    *meta* is a ``dict`` of extracted values:

    - ``{name}`` captures one path segment (no ``/``)
    - ``*`` and ``**`` wildcards work as in ``glob``

    Args:
        project (str): ID or name of the project.
        dataset (str): ID or name of the dataset.
        glob (str): Wildcard expression to match files
            (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
            Yields one item per matching file: the parsed content.
        pattern (str): Wildcard expression with ``{name}`` capture
            placeholders (e.g., ``'{sample}.csv'``,
            ``'{condition}/{sample}.csv'``).
            Yields ``(content, meta)`` per matching file.
        filetype (str): File format used to parse each file. Supported values:

            - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
            - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
            - ``'json'``: parse with :func:`json.loads`, returns a Python object
            - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
              (requires ``pyarrow`` or ``fastparquet``)
            - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
              (requires ``pyarrow``)
            - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
            - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
              (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
            - ``'text'``: read as plain text, returns a ``str``
            - ``'bytes'``: read as raw bytes, returns ``bytes``
            - ``None`` (default): infer from file extension
              (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
              ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
              ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
              ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
        **kwargs: Additional keyword arguments forwarded to the file-parsing
            function (e.g., ``sep='\\t'`` for CSV/TSV files).

    Returns:
        An iterator (the dataset's ``read_files`` generator) that yields:

        - When using ``glob``: *content* for each matching file
        - When using ``pattern``: ``(content, meta)`` for each matching file,
          where *meta* is a ``dict`` of values extracted from ``{name}``
          placeholders

    Raises:
        DataPortalInputError: if both ``glob`` and ``pattern`` are provided,
            or if neither is provided. This check lives in the dataset-level
            generator, so it is raised when iteration starts.

    Example:
    ```python
    # Read all CSV files — just the content
    for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
        print(df.shape)

    # Extract sample names from filenames automatically
    for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
        print(meta['sample'], df.shape)

    # Multi-level capture: condition directory + sample filename
    for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
        print(meta['condition'], meta['sample'], df.shape)

    # Read gzip-compressed TSV files with explicit separator
    for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
        print(df.shape)
    ```
    """
    # Resolve the dataset eagerly: as a plain function (no ``yield``) this
    # runs at call time instead of on the first ``next()``.
    ds = self.get_dataset(project=project, dataset=dataset)
    return ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
def get_dataset(self, name_or_id: str, force_refresh=False) -> 'DataPortalDataset':
    """Return the dataset matching the given ID or name.

    Tries to match by ID first, then by name.

    Args:
        name_or_id (str): Dataset ID, or exact dataset name.
        force_refresh (bool): Clear the cached dataset listing before
            looking the dataset up.

    Returns:
        The matching dataset.

    Raises:
        DataPortalAssetNotFound: if neither an ID nor a name matches. The
            error from the failed ID lookup is attached as ``__cause__``.
        DataPortalInputError: if the name matches multiple datasets.
    """
    if force_refresh:
        self._get_datasets.cache_clear()

    # Try by ID first. The lookup stays best-effort (any failure falls back
    # to name matching), but the error is kept rather than discarded so the
    # root cause is visible if the name lookup finds nothing either.
    id_lookup_error = None
    try:
        return self.get_dataset_by_id(name_or_id)
    except Exception as err:
        id_lookup_error = err

    # Fall back to name matching
    matches = [d for d in self._get_datasets() if d.name == name_or_id]
    if not matches:
        raise DataPortalAssetNotFound(
            f'Dataset with name or ID "{name_or_id}" not found'
        ) from id_lookup_error
    if len(matches) > 1:
        raise DataPortalInputError(
            f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead'
        )
    return self.get_dataset_by_id(matches[0].id)
variant calling\"\n", + ")\n", + "print(f\"Dataset '{input_dataset.name}' contains {len(input_dataset.list_files()):,} files\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using the 'Align Reads (nf-core/sarek)' process (ID: process-nf-core-sarek-align-3-2)\n" + ] + } + ], + "source": [ + "# Get the process to run on the dataset\n", + "process = portal.get_process_by_name('Align Reads (nf-core/sarek)')\n", + "print(f\"Using the '{process.name}' process (ID: {process.id})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using parameters from Genomic variant calling - parameter validation\n", + "{'WORKFLOW_VERSION': '3.2.3', 'analysis_type': {'genome': 'GATK.GRCh38', 'wes': True, 'analysis_type': 'Germline Variant Calling', 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed', 'tools': ['strelka', 'haplotypecaller']}, 'annotation': {'annotation_tool': []}, 'read_trimming_options': {'trim_fastq': False}}\n" + ] + } + ], + "source": [ + "# Previous dataset created by the pipeline\n", + "previous_run = portal.get_dataset(\n", + " project=\"Pipeline Development\",\n", + " dataset=\"Genomic variant calling - parameter validation\"\n", + ")\n", + "print(f\"Using parameters from {previous_run.name}\")\n", + "print(previous_run.params)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started new analysis: ID f7ca7e1b-d64c-4747-b647-0e984db87aa5\n" + ] + } + ], + "source": [ + "# Start a new run, using the parameters from the previous run\n", + "new_dataset_id = input_dataset.run_analysis(\n", + " name=\"Genomic variant calling - new run\",\n", + " description='Test 
from SDK',\n", + " process=process,\n", + " params=previous_run.params\n", + ")\n", + "print(f\"Started new analysis: ID {new_dataset_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 2: Build parameters from scratch" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" @@ -39,24 +144,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "Project 'Test Project' contains 104 datasets\n", - "Dataset 'Test dataset for variant calling' contains 2 files\n", - "Using the 'Variant Calling (nf-core/sarek)' process (ID: process-nf-core-sarek-3-0-1)\n" + "Project 'Pipeline Development' contains 709 datasets\n" ] } ], "source": [ "# Get the project by name\n", - "project = portal.get_project_by_name('Test Project') \n", - "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")\n", - "\n", + "project = portal.get_project_by_name('Pipeline Development') \n", + "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'Test dataset for variant calling' contains 2 files\n" + ] + } + ], + "source": [ "# Get a particular dataset from that project\n", "dataset = project.get_dataset_by_name('Test dataset for variant calling')\n", - "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")\n", - "\n", - "# Get the process to run on the dataset\n", - "process = portal.get_process_by_id('process-nf-core-sarek-3-0-1')\n", - "print(f\"Using the '{process.name}' process (ID: {process.id})\")" + "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")" ] }, { @@ -72,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": { "pycharm": 
{ "name": "#%%\n" @@ -84,15 +202,15 @@ "output_type": "stream", "text": [ "Parameters:\n", - "\tExperiment Design (Group)\n", + "\tWorkflow Version (key=workflow_version, default=3.6.0, type=string, enum=['3.1', '3.1.1', '3.1.2', '3.2.3', '3.3.2', '3.4.4', '3.5.1', '3.6.0'], description=Select the specific version of nf-core/sarek used for analysis)\n", + "\tExperimental Design (Group)\n", "\t\tReference Genome (key=genome, default=GATK.GRCh38, type=string, enum=['GATK.GRCh38', 'GATK.GRCh37', 'GRCm38'])\n", "\t\tWhole Exome/Targeted Gene Panel Assay (key=wes, type=boolean, description=Please indicate if your data was generated using a capture kit.)\n", "\t\tGenomic intervals (key=intervals, type=string, description=Target bed file in case of whole exome or targeted sequencing or intervals file for parallelization.)\n", - "\t\tVariant Calling Type (key=analysis_type, default=Germline Variant Calling, enum=['Germline Variant Calling', 'Somatic Variant Calling'])\n", - "\tVariant Annotation (Group)\n", - "\t\tAnnotation tool(s) (key=annotation_tool, type=array, description=Please select one or both variant annotation tools.)\n", "\tRead Trimming Options (Group)\n", - "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n" + "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n", + "\tAdvanced Options (Group)\n", + "\t\tMarkDuplicates - Optical Duplicate Pixel Distance (key=optical_duplicate_pixel_distance, default=100, type=integer, description=The `--OPTICAL_DUPLICATE_PIXEL_DISTANCE` parameter is used by MarkDuplicates to set the maximum offset between two duplicate clusters in pixels for them to be considered optical duplicates. 
A value of 100 is generally appropriate for unpatterned Illumina flowcells and 250 is appropriate for patterned Illumina flow cells.)\n" ] } ], @@ -114,7 +232,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" @@ -126,13 +244,15 @@ "output_type": "stream", "text": [ "The BED references available are:\n", - "GRCh38_Chr20\n", - " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n", - " - wgs_calling_regions.hg19.bed\n", + "wgs_calling_regions.hg19.bed\n", + " - hg38\n", + " - epi2me-labs-wf-human-variation-ref\n", " - wgs_calling_regions.hg38.bed\n", + " - GRCh38_Chr20\n", + " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n", "\n", "The reference library we are using is: GRCh38_Chr20\n", - "The absolute path to the file is: s3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n" + "The absolute path to the file is: s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n" ] } ], @@ -153,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" @@ -163,25 +283,37 @@ { "data": { "text/plain": [ - "{'genome': 'GATK.GRCh38',\n", - " 'wes': True,\n", - " 'intervals': 's3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", - " 'trim_fastq': False,\n", - " 'annotation_tool': ['cnvkit', 'deepvariant']}" + "{'WORKFLOW_VERSION': '3.2.3',\n", + " 'analysis_type': {'genome': 'GATK.GRCh38',\n", + " 'wes': True,\n", + " 'analysis_type': 'Germline Variant Calling',\n", + " 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", + " 'tools': ['strelka', 'haplotypecaller']},\n", + " 'annotation': {'annotation_tool': []},\n", + " 'read_trimming_options': {'trim_fastq': False}}" ] }, - 
"execution_count": 5, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "params = {\n", - " 'genome': 'GATK.GRCh38',\n", - " 'wes': True,\n", - " 'intervals': reference_library.absolute_path,\n", - " 'trim_fastq': False,\n", - " 'annotation_tool': ['cnvkit', 'deepvariant']\n", + " 'WORKFLOW_VERSION': '3.2.3',\n", + " 'analysis_type': {\n", + " 'genome': 'GATK.GRCh38',\n", + " 'wes': True,\n", + " 'analysis_type': 'Germline Variant Calling',\n", + " 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n", + " 'tools': ['strelka', 'haplotypecaller']\n", + " },\n", + " 'annotation': {\n", + " 'annotation_tool': []\n", + " },\n", + " 'read_trimming_options': {\n", + " 'trim_fastq': False\n", + " }\n", "}\n", "params" ] @@ -200,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -225,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" @@ -236,16 +368,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "71ec598c-368b-47a5-84c8-c209739b050a\n" + "ca8eee87-09d9-4abe-ba0e-4e6ba48b33fa\n" ] } ], "source": [ "# Run the analysis, specifying a name and description for the resulting dataset\n", - "new_dataset_id = dataset.run_analysis(\n", + "new_dataset_id = input_dataset.run_analysis(\n", " name='Variant Calling Analysis',\n", " description='Test from SDK',\n", - " process='process-nf-core-sarek-3-0-1',\n", + " process=process,\n", " params=params\n", ")\n", "print(new_dataset_id)" @@ -275,7 +407,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.7" }, "vscode": { "interpreter": { @@ -284,5 +416,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/samples/Downloading_a_dataset.ipynb 
b/samples/Downloading_a_dataset.ipynb index 71f372f..cca284a 100644 --- a/samples/Downloading_a_dataset.ipynb +++ b/samples/Downloading_a_dataset.ipynb @@ -34,7 +34,10 @@ } }, "source": [ - "You can get the list of all projects which are available, and select a particular project by name" + "If you don't know exactly what the name or ID is of the dataset you want to download,\n", + "you can get the list of all projects which are available, and select a particular project by name.\n", + "\n", + "### Inspecting datasets" ] }, { @@ -46,9 +49,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "There are 3 projects available\n", - "Selected the project 'Test Project' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n", - "This project contains 104 datasets to choose from\n" + "There are 5 projects available\n", + "Selected the project 'Pipeline Development' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n", + "This project contains 709 datasets to choose from\n" ] } ], @@ -56,7 +59,7 @@ "print(f\"There are {len(portal.list_projects()):,} projects available\")\n", "# print(portal.list_projects()) # run this line to see all the projects\n", "\n", - "project = portal.get_project_by_name(\"Test Project\")\n", + "project = portal.get_project_by_name(\"Pipeline Development\")\n", "print(f\"Selected the project '{project.name}' (ID: {project.id})\")\n", "print(f\"This project contains {len(project.list_datasets()):,} datasets to choose from\")" ] @@ -82,17 +85,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Name: Test of mageck-count (updated headnode code 9/22/2022) (3)\n", - "Id: bcda3e84-1abe-4d08-86b0-690ea7e1cdad\n", - "Description: Test of mageck-count (updated headnode code 9/22/2022)\n", + "Name: Genomic variant calling - parameter validation\n", + "Id: 3fb7e8f8-b62d-43a6-ad08-eb28f59bd141\n", + "Description: None\n", "Status: COMPLETED\n" ] } ], "source": [ "# Datasets can be selected by name or by ID\n", - "dataset = 
project.get_dataset_by_id(\"bcda3e84-1abe-4d08-86b0-690ea7e1cdad\")\n", - "# dataset = project.get_dataset_by_name(\"Test of mageck-count\")\n", + "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n", "print(dataset)" ] }, @@ -104,191 +106,63 @@ } }, "source": [ - "Download all of the files from that dataset to a temporary folder" + "### Downloading files\n", + "\n", + "Download all of the files from that dataset (to a temporary folder in this case)" ] }, { "cell_type": "code", "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also just select that dataset in a single call\n", + "dataset = portal.get_dataset(\n", + " project=\"Pipeline Development\",\n", + " dataset=\"Genomic variant calling - parameter validation\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" }, "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading file MO_Brunello_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.46MB/s\n", - "Downloading file MO_Brunello_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.83MB/s\n", - "Downloading file MO_Brunello_gDNA_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 2.16MB/s\n", - "Downloading file MO_Brunello_gDNA_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.39MB/s\n", - "Downloading file multiqc_report.html (1.12 MB) | 100.0%|█████████████████████████ | 1.35MB/s\n", - "Downloading file MO_Brunello_1.json (72.07 KB) | 100.0%|█████████████████████████ | 285kB/s\n", - "Downloading file MO_Brunello_1_fastqc.html (804.22 KB) | 100.0%|█████████████████████████ | 1.15MB/s\n", - "Downloading file MO_Brunello_2.json (72.07 KB) | 100.0%|█████████████████████████ | 349kB/s\n", - "Downloading file MO_Brunello_2_fastqc.html (824.26 KB) | 100.0%|█████████████████████████ | 1.19MB/s\n", - "Downloading file MO_Brunello_gDNA_1.json (72.53 KB) | 
100.0%|█████████████████████████ | 319kB/s\n", - "Downloading file MO_Brunello_gDNA_1_fastqc.html (824.76 KB) | 100.0%|█████████████████████████ | 2.10MB/s\n", - "Downloading file MO_Brunello_gDNA_2.json (71.84 KB) | 100.0%|█████████████████████████ | 289kB/s\n", - "Downloading file MO_Brunello_gDNA_2_fastqc.html (815.26 KB) | 100.0%|█████████████████████████ | 1.95MB/s\n", - "Downloading file MO_Brunello_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.62MB/s\n", - "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.09MB/s\n", - "Downloading file MO_Brunello_1.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 1.42kB/s\n", - "Downloading file MO_Brunello_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.61MB/s\n", - "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.72MB/s\n", - "Downloading file MO_Brunello_2.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 2.28kB/s\n", - "Downloading file MO_Brunello_gDNA_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 2.82MB/s\n", - "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.57MB/s\n", - "Downloading file MO_Brunello_gDNA_1.countsummary.txt (247.00 B) | 100.0%|█████████████████████████ | 2.57kB/s\n", - "Downloading file MO_Brunello_gDNA_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.40MB/s\n", - "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.52MB/s\n", - "Downloading file MO_Brunello_gDNA_2.countsummary.txt (246.00 B) | 100.0%|█████████████████████████ | 2.33kB/s\n", - "Downloading file counts.txt (1.99 MB) | 100.0%|█████████████████████████ | 3.48MB/s\n", - "Downloading file sample_names.txt (65.00 B) | 100.0%|█████████████████████████ | 662B/s\n", - "Downloading file summary.txt (366.00 B) | 100.0%|█████████████████████████ | 
2.41kB/s\n", - "Downloading file MO_Brunello_1.log (2.39 KB) | 100.0%|█████████████████████████ | 11.1kB/s\n", - "Downloading file MO_Brunello_2.log (2.39 KB) | 100.0%|█████████████████████████ | 16.1kB/s\n", - "Downloading file MO_Brunello_gDNA_1.log (2.43 KB) | 100.0%|█████████████████████████ | 23.2kB/s\n", - "Downloading file MO_Brunello_gDNA_2.log (2.43 KB) | 100.0%|█████████████████████████ | 19.4kB/s\n" - ] - } - ], + "outputs": [], "source": [ - "dataset.download_files(\"/tmp\")" + "# dataset.download_files(\"/tmp\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Alternatively, you can inspect and filter the list of files to only what is needed" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/cutadapt/trim/fastq/MO_Brunello_1.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_2.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_gDNA_1.fastq (920000 bytes)\n", - "\n", - "data/cutadapt/trim/fastq/MO_Brunello_gDNA_2.fastq (920000 bytes)\n", - "\n", - "data/fastqc/multiqc_report.html (1173155 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_1/MO_Brunello_1.json (73803 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_1/MO_Brunello_1_fastqc.html (823526 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_2/MO_Brunello_2.json (73797 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_2/MO_Brunello_2_fastqc.html (844044 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1.json (74268 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1_fastqc.html (844554 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2.json (73563 bytes)\n", - "\n", - "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2_fastqc.html (834827 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_1.count.txt (1625955 bytes)\n", - "\n", - 
"data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_1.countsummary.txt (237 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count.txt (1625955 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.countsummary.txt (237 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count.txt (1625960 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.countsummary.txt (247 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count.txt (1625960 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.countsummary.txt (246 bytes)\n", - "\n", - "data/mageck/count/combined/counts.txt (2090653 bytes)\n", - "\n", - "data/mageck/count/combined/sample_names.txt (65 bytes)\n", - "\n", - "data/mageck/count/combined/summary.txt (366 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_1.log (2449 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_2.log (2449 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_gDNA_1.log (2489 bytes)\n", - "\n", - "data/mageck/count/log/MO_Brunello_gDNA_2.log (2488 bytes)\n" - ] - } - ], - "source": [ - "files = dataset.list_files()\n", - "print(files)" + "Alternatively, you can filter the list of files to only what is needed" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n", - "\n", - "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 
bytes)\n" - ] - } - ], - "source": [ - "norm_counts = files.filter_by_pattern(\"*.count_normalized.txt\")\n", - "print(norm_counts)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.86MB/s\n", - "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.78MB/s\n", - "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.86MB/s\n", - "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.27MB/s\n" + "Downloading file ERR031935.haplotypecaller.filtered.vcf.gz (401.08 KB) | 100.0%|█████████████████████████ | 1.71MB/s\n", + "Downloading file ERR031935.haplotypecaller.vcf.gz (357.77 KB) | 100.0%|█████████████████████████ | 1.50MB/s\n", + "Downloading file ERR031935.strelka.genome.vcf.gz (12.29 MB) | 100.0%|█████████████████████████ | 6.54MB/s\n", + "Downloading file ERR031935.strelka.variants.vcf.gz (970.75 KB) | 100.0%|█████████████████████████ | 2.55MB/s\n" ] } ], "source": [ - "norm_counts.download(\"/tmp\")" + "dataset.download_files(\"/tmp\", glob=\"*.vcf.gz\")" ] }, { @@ -315,7 +189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.7" }, "vscode": { "interpreter": { @@ -324,5 +198,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/samples/Interacting_with_files.ipynb b/samples/Interacting_with_files.ipynb index 929d9df..91b35b4 100644 --- a/samples/Interacting_with_files.ipynb +++ b/samples/Interacting_with_files.ipynb @@ -13,28 +13,37 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "pycharm": { - "name": "#%%\n" - }, "ExecuteTime": { "end_time": "2025-03-25T19:16:07.482109Z", "start_time": 
"2025-03-25T19:16:06.304549Z" + }, + "pycharm": { + "name": "#%%\n" } }, + "outputs": [], "source": [ "from cirro import DataPortal\n", "\n", - "portal = DataPortal()" - ], - "outputs": [], - "execution_count": 1 + "portal = DataPortal(base_url=\"\")" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Find the file you are looking for by defining the project and dataset, then searching for a particular file of interest based on a pattern using `filter_by_pattern`" + "Find the file you are looking for by defining the project and dataset, then using `read_file` or `read_files` to read file contents directly into Python objects.\n", + "\n", + "The file format is inferred automatically from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified with the `filetype` parameter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspecting files" ] }, { @@ -50,31 +59,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "The project Test Project contains 104 datasets\n", - "Dataset Test of mageck-count contains 32 files\n", - "Selected the file: data/mageck/count/combined/counts.txt (2090653 bytes)\n" + "Dataset: Genomic variant calling - parameter validation\n", + "Files: 235\n", + "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.filtered.vcf.gz\n", + "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.vcf.gz\n", + "data/variant_calling/strelka/ERR031935/ERR031935.strelka.genome.vcf.gz\n", + "data/variant_calling/strelka/ERR031935/ERR031935.strelka.variants.vcf.gz\n" ] } ], "source": [ "# Get the project which contains the dataset\n", - "project = portal.get_project_by_name('Test Project')\n", - "\n", - "# Get the set of datasets within that project\n", - "all_datasets = project.list_datasets()\n", - "print(f\"The project {project.name} contains {len(all_datasets):,} datasets\")\n", + "project =
portal.get_project_by_name(\"Pipeline Development\")\n", "\n", "# Get the dataset of interest based on its name\n", - "dataset = all_datasets.get_by_name('Test of mageck-count')\n", - "\n", - "# Get the complete list of files in that dataset\n", - "files = dataset.list_files()\n", - "print(f\"Dataset {dataset.name} contains {len(files):,} files\")\n", - "\n", - "# Filter to just the files named counts.txt (using the wildcard to match the string of folders it is in)\n", - "counts = files.filter_by_pattern(\"*/counts.txt\")\n", + "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n", "\n", - "print(f\"Selected the file: {counts.description()}\")" + "print(f\"Dataset: {dataset.name}\")\n", + "print(f\"Files: {len(dataset.list_files()):,}\")\n", + "for file in dataset.list_files():\n", + " if file.name.endswith('.vcf.gz'):\n", + " print(file.name)" ] }, { @@ -85,7 +90,9 @@ } }, "source": [ - "Load the contents of that file into a DataFrame (keeping in mind that it is tab-delimited, not the default comma-delimited)" + "### Reading a file\n", + "\n", + "Read a single file into a DataFrame using `read_file`. The tab-separated format is specified explicitly with `sep='\\t'`." 
] }, { @@ -118,78 +125,109 @@ " \n", " \n", " \n", - " sgRNA\n", - " Gene\n", - " MO_Brunello_gDNA_2\n", - " MO_Brunello_1\n", - " MO_Brunello_2\n", - " MO_Brunello_gDNA_1\n", + " 0\n", + " 1\n", + " 2\n", + " 3\n", + " 4\n", + " 5\n", + " 6\n", + " 7\n", + " 8\n", + " 9\n", " \n", " \n", " \n", " \n", " 0\n", - " A1BG_0\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 60826\n", + " .\n", + " T\n", + " A\n", + " 1\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=17;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...\n", " \n", " \n", " 1\n", - " A1BG_1\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 2\n", + " chr20\n", + " 60850\n", + " .\n", + " A\n", + " T\n", + " 1\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=4\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...\n", " \n", " \n", " 2\n", - " A1BG_2\n", - " A1BG\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 62437\n", + " .\n", + " C\n", + " T\n", + " 3\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=22;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...\n", " \n", " \n", " 3\n", - " A1BG_3\n", - " A1BG\n", - " 0\n", - " 0\n", - " 2\n", - " 0\n", + " chr20\n", + " 62467\n", + " .\n", + " C\n", + " A\n", + " 4\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=2\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...\n", " \n", " \n", " 4\n", - " A1CF_36946\n", - " A1CF\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " chr20\n", + " 62469\n", + " .\n", + " G\n", + " A\n", + " 3\n", + " LowDepth;LowGQX;NoPassedVariantGTs\n", + " MQ=24;SNVHPOL=3\n", + " GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL\n", + " 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sgRNA Gene 
MO_Brunello_gDNA_2 MO_Brunello_1 MO_Brunello_2 \\\n", - "0 A1BG_0 A1BG 0 0 0 \n", - "1 A1BG_1 A1BG 0 0 0 \n", - "2 A1BG_2 A1BG 0 0 0 \n", - "3 A1BG_3 A1BG 0 0 2 \n", - "4 A1CF_36946 A1CF 0 0 0 \n", + " 0 1 2 3 4 5 6 \\\n", + "0 chr20 60826 . T A 1 LowDepth;LowGQX;NoPassedVariantGTs \n", + "1 chr20 60850 . A T 1 LowDepth;LowGQX;NoPassedVariantGTs \n", + "2 chr20 62437 . C T 3 LowDepth;LowGQX;NoPassedVariantGTs \n", + "3 chr20 62467 . C A 4 LowDepth;LowGQX;NoPassedVariantGTs \n", + "4 chr20 62469 . G A 3 LowDepth;LowGQX;NoPassedVariantGTs \n", "\n", - " MO_Brunello_gDNA_1 \n", - "0 0 \n", - "1 2 \n", - "2 0 \n", - "3 0 \n", - "4 0 " + " 7 8 \\\n", + "0 MQ=17;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "1 MQ=24;SNVHPOL=4 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "2 MQ=22;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "3 MQ=24;SNVHPOL=2 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "4 MQ=24;SNVHPOL=3 GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL \n", + "\n", + " 9 \n", + "0 0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29... \n", + "1 0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30... \n", + "2 0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35... \n", + "3 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36... \n", + "4 0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34... " ] }, "execution_count": 3, @@ -198,56 +236,72 @@ } ], "source": [ - "df = counts[0].read_csv(sep=\"\\t\")\n", + "# Read a single file matched by a glob pattern\n", + "df = dataset.read_file(glob=\"*.variants.vcf.gz\", filetype=\"csv\", sep=\"\\t\", comment=\"#\", header=None)\n", "df.head()" ] }, { + "cell_type": "markdown", "metadata": {}, + "source": [ + "### Reading multiple files\n", + "\n", + "Use `read_files` to iterate over multiple matching files. With `{name}` capture placeholders in the `pattern`, extracted values are returned alongside each file's content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'sample': 'ERR031935', 'type': 'genome'} (790381, 10)\n", + "{'sample': 'ERR031935', 'type': 'variants'} (36318, 10)\n" + ] + } + ], + "source": [ + "# Extract folder names from the path automatically using {name} placeholders\n", + "for df, meta in dataset.read_files(\n", + " pattern=\"*/strelka/{sample}/*.strelka.{type}.vcf.gz\",\n", + " filetype=\"csv\",\n", + " sep=\"\\t\",\n", + " comment=\"#\",\n", + " header=None\n", + "):\n", + " print(meta, df.shape)" + ] + }, + { "cell_type": "markdown", - "source": "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs." + "metadata": {}, + "source": [ + "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting metadata" + ] }, { + "cell_type": "code", + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2025-03-25T19:16:35.472469Z", "start_time": "2025-03-25T19:16:31.215624Z" } }, - "cell_type": "code", - "source": [ - "from cirro_api_client.v1.models import ArtifactType\n", - "\n", - "# Reading nextflow trace file\n", - "trace_file = dataset.get_artifact(ArtifactType.WORKFLOW_TRACE)\n", - "trace_df = trace_file.read_csv(sep=\"\\t\")\n", - "trace_df.head()" - ], "outputs": [ { "data": { - "text/plain": [ - " task_id hash native_id \\\n", - "0 7 99/b42c07 826623a0-0ed5-44ff-8a94-e3802cccf531 \n", - "1 5 71/8e3d51 ace41478-ba98-403d-a6d1-3e95ad64c36f \n", - "2 8 71/535e08 9d499098-6ed7-422b-9233-9983f775fdee \n", - "3 1 41/c494ef 3a221dd3-7ca8-41e1-8212-856b6154be64 \n", - "4 2 25/13b116 94f91d55-1d41-4afd-88b4-743d75817032 \n", - "\n", - " name status exit submit duration \\\n", - "0 trim:trim_adapters (4) COMPLETED 0 2022-05-24 16:27:01.413 5m 38s \n", - 
"1 trim:trim_adapters (3) COMPLETED 0 2022-05-24 16:27:01.421 5m 38s \n", - "2 fastqc (4) COMPLETED 0 2022-05-24 16:27:01.464 5m 48s \n", - "3 fastqc (1) COMPLETED 0 2022-05-24 16:27:01.465 5m 48s \n", - "4 trim:trim_adapters (1) COMPLETED 0 2022-05-24 16:27:01.476 5m 58s \n", - "\n", - " realtime %cpu peak_rss peak_vmem rchar wchar \n", - "0 1s 76.6% 3.1 MB 5.4 MB 1.8 MB 900.5 KB \n", - "1 4s 6.4% 11.6 MB 17.3 MB 1.8 MB 900.5 KB \n", - "2 3s 104.8% 152.7 MB 3.2 GB 15.9 MB 4.1 MB \n", - "3 3s 102.5% 140.2 MB 3.2 GB 16 MB 4.1 MB \n", - "4 1s 75.8% 3.1 MB 5.4 MB 1.8 MB 900.5 KB " - ], "text/html": [ "
\n", "