diff --git a/README.md b/README.md
index ea78bb9..f5ee0bb 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,49 @@ See the following set of Jupyter notebooks that contain examples on the followin
 | [Using references](samples/Using_references.ipynb)                 | Managing reference data              |
 | [Advanced usage](samples/Advanced_usage.ipynb)                     | Advanced operations                  |
 
+### Reading files
+
+The `read_file` and `read_files` methods provide a convenient way to read dataset files directly into Python objects. The file format is inferred from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`/`.pickle`, `.xlsx`/`.xls`, `.h5ad`; any other extension is read as plain text), or can be specified explicitly with the `filetype` argument.
+
+```python
+from cirro import DataPortal
+
+# If not logged in, this will prompt with a login URL
+portal = DataPortal()
+
+# Read a single file from the indicated dataset
+df = portal.read_file(project="My Project", dataset="My Dataset", glob="**/results.csv")
+
+# Iterate over each of the files ending in .csv within a dataset
+for df in portal.read_files(project="My Project", dataset="My Dataset", glob="*.csv"):
+    print(df.shape)
+
+```
+
+You can also call these methods on the `DataPortalDataset` object:
+
+```python
+# Get an object representing a single dataset
+dataset = portal.get_dataset(project="My Project", dataset="My Dataset")
+
+# Read a single file by exact path or glob pattern
+df = dataset.read_file(path="data/results.csv")
+df = dataset.read_file(glob="**/results.csv")
+
+# Read multiple files matching a pattern — yields one result per file
+for df in dataset.read_files(glob="**/*.csv"):
+    print(df.shape)
+
+# Extract values from the path using {name} capture placeholders
+for df, meta in dataset.read_files(pattern="{sample}/results.csv"):
+    print(meta["sample"], df.shape)
+
+# Extra keyword arguments are forwarded to the file-parsing function
+for df in dataset.read_files(glob="**/*.tsv.gz", filetype="csv", sep="\t"):
+    print(df.shape)
+```
+
+
 ## R Usage
 
 | Jupyter Notebook                                    | Topic               |
diff --git a/cirro/sdk/asset.py b/cirro/sdk/asset.py
index ce1eea0..082200f 100644
--- a/cirro/sdk/asset.py
+++ b/cirro/sdk/asset.py
@@ -60,7 +60,7 @@ def get_by_name(self, name: str) -> T:
         # Error if multiple projects are found
         msg = f"Multiple {self.asset_name} items found with name '{name}', use ID instead.\n{self.description()}"
         if len(matching_queries) > 1:
-            raise DataPortalAssetNotFound(msg)
+            raise DataPortalInputError(msg)
 
         return matching_queries[0]
 
diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py
index ee89247..205a14d 100644
--- a/cirro/sdk/dataset.py
+++ b/cirro/sdk/dataset.py
@@ -1,12 +1,14 @@
 import datetime
+import re
 from pathlib import Path
-from typing import Union, List, Optional
+from typing import Union, List, Optional, Any
 
 from cirro_api_client.v1.api.processes import validate_file_requirements
 from cirro_api_client.v1.models import Dataset, DatasetDetail, RunAnalysisRequest, ProcessDetail, Status, \
     RunAnalysisRequestParams, Tag, ArtifactType, NamedItem, ValidateFileRequirementsRequest
 
 from cirro.cirro_client import CirroApi
+from cirro.file_utils import filter_files_by_pattern
 from cirro.models.assets import DatasetAssets
 from cirro.models.file import PathLike
 from cirro.sdk.asset import DataPortalAssets, DataPortalAsset
@@ -17,6 +19,93 @@
 from cirro.sdk.process import DataPortalProcess
 
 
+def _pattern_to_captures_regex(pattern: str):
+    """
+    Convert a glob pattern that may contain ``{name}`` capture placeholders into
+    a compiled regex and return ``(compiled_regex, capture_names)``.
+
+    Conversion rules:
+      - ``{name}``  → named group matching a single path segment (no ``/``)
+      - ``*``       → matches any characters within a single path segment
+      - ``**/``     → at the start or after ``/``: zero or more whole segments
+      - ``**``      → matches any characters including ``/`` (multiple segments)
+      - All other characters are regex-escaped.
+
+    The resulting regex is suffix-anchored (like ``pathlib.PurePath.match``):
+    a pattern without a leading ``/`` will match at any depth in the path.
+    """
+    capture_names = re.findall(r'\{(\w+)\}', pattern)
+    # A segment-level ``**/`` is split out first so ``a/**/b`` also matches ``a/b``.
+    tokens = re.split(r'((?:^|(?<=/))\*\*/|\*\*|\*|\{\w+\})', pattern)
+    wildcards = {'**/': '(?:.*/)?', '**': '.*', '*': '[^/]*'}
+    parts = []
+    for token in tokens:
+        if token in wildcards:
+            parts.append(wildcards[token])
+        elif re.match(r'^\{\w+\}$', token):
+            parts.append(f'(?P<{token[1:-1]}>[^/]+)')
+        else:
+            parts.append(re.escape(token))
+    regex_str = ''.join(parts)
+    if not pattern.startswith('/'):
+        regex_str = r'(?:.+/)?' + regex_str
+    return re.compile('^' + regex_str + '$'), capture_names
+
+
+def _infer_file_format(path: str) -> str:
+    """
+    Guess the parser name for ``path`` from its (case-insensitive) extension.
+
+    One trailing compression suffix (``.gz``, ``.bz2``, ``.xz``, ``.zst``) is
+    ignored before the extension is inspected, so ``a.csv.gz`` maps to ``'csv'``.
+    Paths with an unrecognised extension fall back to ``'text'``.
+    """
+    formats_by_suffix = {
+        '.csv': 'csv', '.tsv': 'csv',
+        '.h5ad': 'h5ad',
+        '.json': 'json',
+        '.parquet': 'parquet',
+        '.feather': 'feather',
+        '.pkl': 'pickle', '.pickle': 'pickle',
+        '.xlsx': 'excel', '.xls': 'excel',
+    }
+    name = path.lower()
+    # At most one compression suffix can match; drop it if present.
+    stripped = [name[:-len(z)] for z in ('.gz', '.bz2', '.xz', '.zst') if name.endswith(z)]
+    name = stripped[0] if stripped else name
+    matched = [fmt for suffix, fmt in formats_by_suffix.items() if name.endswith(suffix)]
+    return matched[0] if matched else 'text'
+
+
+def _read_file_with_format(file: DataPortalFile, file_format: Optional[str], **kwargs) -> Any:
+    """
+    Parse ``file`` with the reader that corresponds to ``file_format``.
+
+    When ``file_format`` is ``None`` it is inferred from ``file.relative_path``.
+    ``kwargs`` are forwarded to the reader, except for ``'h5ad'`` and ``'bytes'``,
+    whose readers are called without arguments.
+
+    Raises:
+        DataPortalInputError: if ``file_format`` is not a supported value.
+    """
+    fmt = _infer_file_format(file.relative_path) if file_format is None else file_format
+    # Formats whose reader is a ``read_<fmt>`` method accepting keyword arguments
+    if fmt in ('csv', 'json', 'parquet', 'feather', 'pickle', 'excel'):
+        return getattr(file, f'read_{fmt}')(**kwargs)
+    # read_h5ad() accepts no keyword arguments, so kwargs are not forwarded
+    if fmt == 'h5ad':
+        return file.read_h5ad()
+    if fmt == 'text':
+        return file.read(**kwargs)
+    # Unparsed content, exactly as returned by the file object
+    if fmt == 'bytes':
+        return file._get()
+    raise DataPortalInputError(
+        f"Unsupported file_format: '{fmt}'. "
+        f"Supported values: 'csv', 'h5ad', 'json', 'parquet', 'feather', 'pickle', 'excel', 'text', 'bytes'"
+    )
+
+
 class DataPortalDataset(DataPortalAsset):
     """
     Datasets in the Data Portal are collections of files which have
@@ -31,7 +120,7 @@ def __init__(self, dataset: Union[Dataset, DatasetDetail], client: CirroApi):
         Should be invoked from a top-level constructor, for example:
 
         ```python
-        from cirro import DataPortal()
+        from cirro import DataPortal
         portal = DataPortal()
         dataset = portal.get_dataset(
             project="id-or-name-of-project",
@@ -199,6 +288,108 @@ def list_files(self) -> DataPortalFiles:
             ]
         )
 
+    def read_files(
+            self,
+            glob: str = None,
+            pattern: str = None,
+            filetype: str = None,
+            **kwargs
+    ):
+        """
+        Lazily read every dataset file selected by ``glob`` or ``pattern``.
+
+        This is a generator: nothing is listed, read or validated until iteration
+        starts, so the input errors below surface on the first ``next()``.
+
+        Args:
+            glob (str): Wildcard expression; each match yields its parsed content.
+            pattern (str): Wildcard expression with ``{name}`` capture
+                placeholders; each match yields ``(content, meta)`` where
+                ``meta`` maps placeholder names to the captured path segments.
+            filetype (str): Parse format; ``None`` infers it from the file extension.
+            **kwargs: Forwarded to the file-parsing function.
+
+        Raises:
+            DataPortalInputError: if both or neither of ``glob``/``pattern`` are given.
+        """
+        use_glob = glob is not None
+        use_pattern = pattern is not None
+        if use_glob and use_pattern:
+            raise DataPortalInputError("Cannot specify both 'glob' and 'pattern' — use one or the other")
+        if not (use_glob or use_pattern):
+            raise DataPortalInputError("Must specify either 'glob' or 'pattern'")
+
+        if use_glob:
+            for matched in filter_files_by_pattern(list(self.list_files()), glob):
+                yield _read_file_with_format(matched, filetype, **kwargs)
+            return
+
+        path_regex, _ = _pattern_to_captures_regex(pattern)
+        for candidate in self.list_files():
+            hit = path_regex.match(candidate.relative_path)
+            if hit is not None:
+                yield _read_file_with_format(candidate, filetype, **kwargs), hit.groupdict()
+
+    def read_file(
+            self,
+            path: str = None,
+            glob: str = None,
+            filetype: str = None,
+            **kwargs
+    ) -> Any:
+        """
+        Read and parse exactly one file from the dataset.
+
+        Args:
+            path (str): Exact relative path of the file within the dataset.
+            glob (str): Wildcard expression matching exactly one file.
+            filetype (str): Format used to parse the file; ``None`` infers it
+                from the file extension.
+            **kwargs: Forwarded to the file-parsing function.
+
+        Returns:
+            Parsed file content.
+
+        Raises:
+            DataPortalInputError: if both or neither of ``path``/``glob`` are
+                given, or if ``glob`` matches more than one file.
+            DataPortalAssetNotFound: if ``glob`` matches no file.
+        """
+        if path is not None and glob is not None:
+            raise DataPortalInputError("Cannot specify both 'path' and 'glob' — use one or the other")
+        if path is None and glob is None:
+            raise DataPortalInputError("Must specify either 'path' or 'glob'")
+
+        if path is not None:
+            return _read_file_with_format(self.get_file(path), filetype, **kwargs)
+
+        candidates = list(filter_files_by_pattern(list(self.list_files()), glob))
+        if not candidates:
+            raise DataPortalAssetNotFound(f"No files matched glob '{glob}'")
+        if len(candidates) > 1:
+            raise DataPortalInputError(
+                f"glob '{glob}' matched {len(candidates)} files — use read_files() to read multiple files"
+            )
+        return _read_file_with_format(candidates[0], filetype, **kwargs)
+
+    def get_trace(self) -> Any:
+        """
+        Parse the dataset's ``WORKFLOW_TRACE`` artifact as a tab-separated table.
+
+        Returns: `pandas.DataFrame`
+        """
+        trace_file = self.get_artifact(ArtifactType.WORKFLOW_TRACE)
+        return trace_file.read_csv(sep='\t')
+
+    def get_logs(self) -> str:
+        """
+        Return the text of the dataset's workflow logs (``WORKFLOW_LOGS`` artifact).
+
+        Returns: str
+        """
+        logs_file = self.get_artifact(ArtifactType.WORKFLOW_LOGS)
+        return logs_file.read()
+
     def get_artifact(self, artifact_type: ArtifactType) -> DataPortalFile:
         """
         Get the artifact of a particular type from the dataset
@@ -225,16 +416,21 @@ def list_artifacts(self) -> List[DataPortalFile]:
             ]
         )
 
-    def download_files(self, download_location: str = None) -> None:
+    def download_files(self, download_location: str = None, glob: str = None) -> None:
         """
         Download all the files from the dataset to a local directory.
 
         Args:
             download_location (str): Path to local directory
+            glob (str): Optional wildcard expression to filter which files are downloaded
+                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
+                If omitted, all files are downloaded.
         """
 
-        # Alias for internal method
-        self.list_files().download(download_location)
+        files = self.list_files()
+        if glob is not None:
+            files = DataPortalFiles(filter_files_by_pattern(list(files), glob))
+        files.download(download_location)
 
     def run_analysis(
             self,
@@ -281,6 +477,7 @@ def run_analysis(
         process = parse_process_name_or_id(process, self._client)
 
         if compute_environment:
+            compute_environment_name = compute_environment
             compute_environments = self._client.compute_environments.list_environments_for_project(
                 project_id=self.project_id
             )
@@ -290,7 +487,7 @@ def run_analysis(
                 None
             )
             if compute_environment is None:
-                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
+                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
 
         resp = self._client.execution.run_analysis(
             project_id=self.project_id,
diff --git a/cirro/sdk/file.py b/cirro/sdk/file.py
index db30119..3c6850e 100644
--- a/cirro/sdk/file.py
+++ b/cirro/sdk/file.py
@@ -1,4 +1,6 @@
 import gzip
+import json
+import pickle
 from io import BytesIO, StringIO
 from pathlib import Path
 from typing import List
@@ -25,7 +27,7 @@ def __init__(self, file: File, client: CirroApi):
         Instantiate by listing files from a dataset.
 
         ```python
-        from cirro import DataPortal()
+        from cirro import DataPortal
         portal = DataPortal()
         dataset = portal.get_dataset(
             project="id-or-name-of-project",
@@ -109,7 +111,7 @@ def read_csv(self, compression='infer', encoding='utf-8', **kwargs) -> 'DataFram
             elif self.relative_path.endswith('.bz2'):
                 compression = dict(method='bz2')
             elif self.relative_path.endswith('.xz'):
-                compression = dict(method='zstd')
+                compression = dict(method='xz')
             elif self.relative_path.endswith('.zst'):
                 compression = dict(method='zstd')
             else:
@@ -142,6 +144,44 @@ def read_h5ad(self) -> 'anndata.AnnData':
         with BytesIO(self._get()) as handle:
             return ad.read_h5ad(handle)
 
+    def read_json(self, **kwargs):
+        """Parse the file contents with :func:`json.loads` (``kwargs`` are forwarded) and return the result."""
+        return json.loads(self._get(), **kwargs)
+
+    def read_parquet(self, **kwargs) -> 'DataFrame':
+        """
+        Parse the file as Parquet and return a Pandas DataFrame.
+
+        Needs ``pyarrow`` or ``fastparquet``; ``kwargs`` go to :func:`pandas.read_parquet`.
+        """
+        import pandas as pd
+        raw = BytesIO(self._get())
+        return pd.read_parquet(raw, **kwargs)
+
+    def read_feather(self, **kwargs) -> 'DataFrame':
+        """
+        Parse the file as Feather and return a Pandas DataFrame.
+
+        Needs ``pyarrow``; ``kwargs`` go to :func:`pandas.read_feather`.
+        """
+        import pandas as pd
+        raw = BytesIO(self._get())
+        return pd.read_feather(raw, **kwargs)
+
+    def read_pickle(self, **kwargs):
+        """Unpickle the file contents. WARNING: unpickling can run arbitrary code; only read trusted files."""
+        return pickle.loads(self._get(), **kwargs)
+
+    def read_excel(self, **kwargs) -> 'DataFrame':
+        """
+        Parse the file as an Excel workbook (``.xlsx`` / ``.xls``) with :func:`pandas.read_excel`.
+
+        Needs ``openpyxl`` (``.xlsx``) or ``xlrd`` (``.xls``); ``kwargs`` are forwarded.
+        """
+        import pandas as pd
+        raw = BytesIO(self._get())
+        return pd.read_excel(raw, **kwargs)
+
     def readlines(self, encoding='utf-8', compression=None) -> List[str]:
         """Read the file contents as a list of lines."""
 
@@ -240,5 +280,5 @@ def download(self, download_location: str = None) -> List[Path]:
 
         local_paths = []
         for f in self:
-            local_paths += f.download(download_location)
+            local_paths.append(f.download(download_location))
         return local_paths
diff --git a/cirro/sdk/portal.py b/cirro/sdk/portal.py
index ebd5fd9..7f4727c 100644
--- a/cirro/sdk/portal.py
+++ b/cirro/sdk/portal.py
@@ -28,7 +28,7 @@ def __init__(self, base_url: str = None, client: CirroApi = None):
         ```python
         from cirro import DataPortal
 
-        Portal = DataPortal(base_url="app.cirro.bio")
+        portal = DataPortal(base_url="app.cirro.bio")
         portal.list_projects()
         ```
         """
@@ -100,10 +100,136 @@ def get_dataset(self, project: str = None, dataset: str = None) -> DataPortalDat
         except DataPortalAssetNotFound:
             project: DataPortalProject = self.get_project_by_name(project)
 
-        try:
-            return project.get_dataset_by_id(dataset)
-        except DataPortalAssetNotFound:
-            return project.get_dataset_by_name(dataset)
+        return project.get_dataset(dataset)
+
+    def read_files(
+            self,
+            project: str,
+            dataset: str,
+            glob: str = None,
+            pattern: str = None,
+            filetype: str = None,
+            **kwargs
+    ):
+        """
+        Read the contents of files from a dataset.
+
+        The project and dataset can each be identified by name or ID.
+        Exactly one of ``glob`` or ``pattern`` must be provided.
+
+        **glob** — standard wildcard matching; yields the file content for each
+        matching file:
+
+        - ``*`` matches any characters within a single path segment
+        - ``**`` matches zero or more path segments
+        - Matching is suffix-anchored (``*.csv`` matches at any depth)
+
+        **pattern** — like ``glob`` but ``{name}`` placeholders capture portions
+        of the path automatically; yields ``(content, meta)`` pairs where
+        *meta* is a ``dict`` of extracted values:
+
+        - ``{name}`` captures one path segment (no ``/``)
+        - ``*`` and ``**`` wildcards work as in ``glob``
+
+        Args:
+            project (str): ID or name of the project.
+            dataset (str): ID or name of the dataset.
+            glob (str): Wildcard expression to match files
+                (e.g., ``'*.csv'``, ``'data/**/*.tsv.gz'``).
+                Yields one item per matching file: the parsed content.
+            pattern (str): Wildcard expression with ``{name}`` capture
+                placeholders (e.g., ``'{sample}.csv'``,
+                ``'{condition}/{sample}.csv'``).
+                Yields ``(content, meta)`` per matching file.
+            filetype (str): File format used to parse each file. Supported values:
+
+                - ``'csv'``: parse with :func:`pandas.read_csv`, returns a ``DataFrame``
+                - ``'h5ad'``: parse as AnnData (requires ``anndata`` package)
+                - ``'json'``: parse with :func:`json.loads`, returns a Python object
+                - ``'parquet'``: parse with :func:`pandas.read_parquet`, returns a ``DataFrame``
+                  (requires ``pyarrow`` or ``fastparquet``)
+                - ``'feather'``: parse with :func:`pandas.read_feather`, returns a ``DataFrame``
+                  (requires ``pyarrow``)
+                - ``'pickle'``: deserialize with :mod:`pickle`, returns a Python object
+                - ``'excel'``: parse with :func:`pandas.read_excel`, returns a ``DataFrame``
+                  (requires ``openpyxl`` for ``.xlsx`` or ``xlrd`` for ``.xls``)
+                - ``'text'``: read as plain text, returns a ``str``
+                - ``'bytes'``: read as raw bytes, returns ``bytes``
+                - ``None`` (default): infer from file extension
+                  (``.csv``/``.tsv`` → ``'csv'``, ``.h5ad`` → ``'h5ad'``,
+                  ``.json`` → ``'json'``, ``.parquet`` → ``'parquet'``,
+                  ``.feather`` → ``'feather'``, ``.pkl``/``.pickle`` → ``'pickle'``,
+                  ``.xlsx``/``.xls`` → ``'excel'``, otherwise ``'text'``)
+            **kwargs: Additional keyword arguments forwarded to the file-parsing
+                function (e.g., ``sep='\\t'`` for CSV/TSV files).
+
+        Yields:
+            - When using ``glob``: *content* for each matching file
+            - When using ``pattern``: ``(content, meta)`` for each matching file,
+              where *meta* is a ``dict`` of values extracted from ``{name}``
+              placeholders
+
+        Raises:
+            DataPortalInputError: if both or neither of ``glob``/``pattern`` are
+                provided. Raised on first iteration, since this is a generator.
+
+        Example:
+            ```python
+            # Read all CSV files — just the content
+            for df in portal.read_files('My Project', 'My Dataset', glob='*.csv'):
+                print(df.shape)
+
+            # Extract sample names from filenames automatically
+            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{sample}.csv'):
+                print(meta['sample'], df.shape)
+
+            # Multi-level capture: condition directory + sample filename
+            for df, meta in portal.read_files('My Project', 'My Dataset', pattern='{condition}/{sample}.csv'):
+                print(meta['condition'], meta['sample'], df.shape)
+
+            # Read gzip-compressed TSV files with explicit separator
+            for df in portal.read_files('My Project', 'My Dataset', glob='**/*.tsv.gz', filetype='csv', sep='\\t'):
+                print(df.shape)
+            ```
+        """
+        ds = self.get_dataset(project=project, dataset=dataset)
+        yield from ds.read_files(glob=glob, pattern=pattern, filetype=filetype, **kwargs)
+
+    def read_file(
+            self,
+            project: str,
+            dataset: str,
+            path: str = None,
+            glob: str = None,
+            filetype: str = None,
+            **kwargs
+    ):
+        """
+        Read the contents of a single file from a dataset.
+
+        The project and dataset can each be identified by name or ID.
+        Provide either ``path`` (exact relative path) or ``glob`` (wildcard
+        expression). If ``glob`` is used it must match exactly one file.
+
+        Args:
+            project (str): ID or name of the project.
+            dataset (str): ID or name of the dataset.
+            path (str): Exact relative path of the file within the dataset.
+            glob (str): Wildcard expression matching exactly one file.
+            filetype (str): File format used to parse the file. Supported values
+                are the same as :meth:`read_files`.
+            **kwargs: Additional keyword arguments forwarded to the
+                file-parsing function.
+
+        Returns:
+            Parsed file content.
+
+        Raises:
+            DataPortalInputError: if both or neither of ``path``/``glob`` are given, or ``glob`` matches multiple files.
+            DataPortalAssetNotFound: if ``glob`` matches no files.
+        """
+        ds = self.get_dataset(project=project, dataset=dataset)
+        return ds.read_file(path=path, glob=glob, filetype=filetype, **kwargs)
 
     def list_processes(self, ingest=False) -> DataPortalProcesses:
         """
diff --git a/cirro/sdk/process.py b/cirro/sdk/process.py
index 282924f..8f4cff9 100644
--- a/cirro/sdk/process.py
+++ b/cirro/sdk/process.py
@@ -147,6 +147,7 @@ def run_analysis(
         ]
 
         if compute_environment:
+            compute_environment_name = compute_environment
             compute_environments = self._client.compute_environments.list_environments_for_project(
                 project_id=project_id
             )
@@ -156,7 +157,7 @@ def run_analysis(
                 None
             )
             if compute_environment is None:
-                raise DataPortalInputError(f"Compute environment '{compute_environment}' not found")
+                raise DataPortalInputError(f"Compute environment '{compute_environment_name}' not found")
 
         resp = self._client.execution.run_analysis(
             project_id=project_id,
diff --git a/cirro/sdk/project.py b/cirro/sdk/project.py
index ae85c87..89f58c9 100644
--- a/cirro/sdk/project.py
+++ b/cirro/sdk/project.py
@@ -89,6 +89,31 @@ def list_datasets(self, force_refresh=False) -> DataPortalDatasets:
             ]
         )
 
+    def get_dataset(self, name_or_id: str, force_refresh=False) -> DataPortalDataset:
+        """Return the dataset matching the given ID or name (ID is tried first).
+
+        Raises:
+            DataPortalAssetNotFound: if no dataset has this ID or name.
+            DataPortalInputError: if the name matches multiple datasets.
+        """
+        if force_refresh:
+            self._get_datasets.cache_clear()
+
+        try:
+            return self.get_dataset_by_id(name_or_id)
+        except DataPortalAssetNotFound:
+            # Fall back to name lookup only on "not found"; don't mask other failures.
+            pass
+
+        matches = [d for d in self._get_datasets() if d.name == name_or_id]
+        if not matches:
+            raise DataPortalAssetNotFound(f'Dataset with name or ID "{name_or_id}" not found')
+        if len(matches) > 1:
+            raise DataPortalInputError(
+                f'Multiple datasets found with the name "{name_or_id}" — use get_dataset_by_id instead'
+            )
+        return self.get_dataset_by_id(matches[0].id)
+
     def get_dataset_by_name(self, name: str, force_refresh=False) -> DataPortalDataset:
         """Return the dataset with the specified name."""
         if force_refresh:
diff --git a/pyproject.toml b/pyproject.toml
index 6224e9f..50ea289 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cirro"
-version = "1.10.2"
+version = "1.10.3"
 description = "CLI tool and SDK for interacting with the Cirro platform"
 authors = ["Cirro Bio <support@cirro.bio>"]
 license = "MIT"
diff --git a/samples/Analyzing_a_dataset.ipynb b/samples/Analyzing_a_dataset.ipynb
index 1b7d0c6..3cd23ca 100644
--- a/samples/Analyzing_a_dataset.ipynb
+++ b/samples/Analyzing_a_dataset.ipynb
@@ -21,14 +21,119 @@
    },
    "outputs": [],
    "source": [
+    "# Import the library used to interact with Cirro\n",
     "from cirro import DataPortal\n",
     "\n",
+    "# Create a connection to Cirro with your identity\n",
     "portal = DataPortal()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Option 1 - run analysis using the same set of parameters used previously"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset 'Test dataset for variant calling' contains 2 files\n"
+     ]
+    }
+   ],
+   "source": [
+    "# New dataset with FASTQs\n",
+    "input_dataset = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Test dataset for variant calling\"\n",
+    ")\n",
+    "print(f\"Dataset '{input_dataset.name}' contains {len(input_dataset.list_files()):,} files\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using the 'Align Reads (nf-core/sarek)' process (ID: process-nf-core-sarek-align-3-2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get the process to run on the dataset\n",
+    "process = portal.get_process_by_name('Align Reads (nf-core/sarek)')\n",
+    "print(f\"Using the '{process.name}' process (ID: {process.id})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using parameters from Genomic variant calling - parameter validation\n",
+      "{'WORKFLOW_VERSION': '3.2.3', 'analysis_type': {'genome': 'GATK.GRCh38', 'wes': True, 'analysis_type': 'Germline Variant Calling', 'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed', 'tools': ['strelka', 'haplotypecaller']}, 'annotation': {'annotation_tool': []}, 'read_trimming_options': {'trim_fastq': False}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Previous dataset created by the pipeline\n",
+    "previous_run = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Genomic variant calling - parameter validation\"\n",
+    ")\n",
+    "print(f\"Using parameters from {previous_run.name}\")\n",
+    "print(previous_run.params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Started new analysis: ID f7ca7e1b-d64c-4747-b647-0e984db87aa5\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Start a new run, using the parameters from the previous run\n",
+    "new_dataset_id = input_dataset.run_analysis(\n",
+    "    name=\"Genomic variant calling - new run\",\n",
+    "    description='Test from SDK',\n",
+    "    process=process,\n",
+    "    params=previous_run.params\n",
+    ")\n",
+    "print(f\"Started new analysis: ID {new_dataset_id}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Option 2: Build parameters from scratch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -39,24 +144,37 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Project 'Test Project' contains 104 datasets\n",
-      "Dataset 'Test dataset for variant calling' contains 2 files\n",
-      "Using the 'Variant Calling (nf-core/sarek)' process (ID: process-nf-core-sarek-3-0-1)\n"
+      "Project 'Pipeline Development' contains 709 datasets\n"
      ]
     }
    ],
    "source": [
     "# Get the project by name\n",
-    "project = portal.get_project_by_name('Test Project') \n",
-    "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")\n",
-    "\n",
+    "project = portal.get_project_by_name('Pipeline Development') \n",
+    "print(f\"Project '{project.name}' contains {len(project.list_datasets()):,} datasets\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset 'Test dataset for variant calling' contains 2 files\n"
+     ]
+    }
+   ],
+   "source": [
     "# Get a particular dataset from that project\n",
     "dataset = project.get_dataset_by_name('Test dataset for variant calling')\n",
-    "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")\n",
-    "\n",
-    "# Get the process to run on the dataset\n",
-    "process = portal.get_process_by_id('process-nf-core-sarek-3-0-1')\n",
-    "print(f\"Using the '{process.name}' process (ID: {process.id})\")"
+    "print(f\"Dataset '{dataset.name}' contains {len(dataset.list_files()):,} files\")"
    ]
   },
   {
@@ -72,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -84,15 +202,15 @@
      "output_type": "stream",
      "text": [
       "Parameters:\n",
-      "\tExperiment Design (Group)\n",
+      "\tWorkflow Version (key=workflow_version, default=3.6.0, type=string, enum=['3.1', '3.1.1', '3.1.2', '3.2.3', '3.3.2', '3.4.4', '3.5.1', '3.6.0'], description=Select the specific version of nf-core/sarek used for analysis)\n",
+      "\tExperimental Design (Group)\n",
       "\t\tReference Genome (key=genome, default=GATK.GRCh38, type=string, enum=['GATK.GRCh38', 'GATK.GRCh37', 'GRCm38'])\n",
       "\t\tWhole Exome/Targeted Gene Panel Assay (key=wes, type=boolean, description=Please indicate if your data was generated using a capture kit.)\n",
       "\t\tGenomic intervals (key=intervals, type=string, description=Target bed file in case of whole exome or targeted sequencing or intervals file for parallelization.)\n",
-      "\t\tVariant Calling Type (key=analysis_type, default=Germline Variant Calling, enum=['Germline Variant Calling', 'Somatic Variant Calling'])\n",
-      "\tVariant Annotation (Group)\n",
-      "\t\tAnnotation tool(s) (key=annotation_tool, type=array, description=Please select one or both variant annotation tools.)\n",
       "\tRead Trimming Options (Group)\n",
-      "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n"
+      "\t\tTrim reads using Trim-Galore? (key=trim_fastq, type=boolean)\n",
+      "\tAdvanced Options (Group)\n",
+      "\t\tMarkDuplicates - Optical Duplicate Pixel Distance (key=optical_duplicate_pixel_distance, default=100, type=integer, description=The `--OPTICAL_DUPLICATE_PIXEL_DISTANCE` parameter is used by MarkDuplicates to set the maximum offset between two duplicate clusters in pixels for them to be considered optical duplicates. A value of 100 is generally appropriate for unpatterned Illumina flowcells and 250 is appropriate for patterned Illumina flow cells.)\n"
      ]
     }
    ],
@@ -114,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 9,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -126,13 +244,15 @@
      "output_type": "stream",
      "text": [
       "The BED references available are:\n",
-      "GRCh38_Chr20\n",
-      " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n",
-      " - wgs_calling_regions.hg19.bed\n",
+      "wgs_calling_regions.hg19.bed\n",
+      " - hg38\n",
+      " - epi2me-labs-wf-human-variation-ref\n",
       " - wgs_calling_regions.hg38.bed\n",
+      " - GRCh38_Chr20\n",
+      " - NimbleGen_SeqCap_EZ_Exome_primary-capture_hg19_chr17\n",
       "\n",
       "The reference library we are using is: GRCh38_Chr20\n",
-      "The absolute path to the file is: s3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n"
+      "The absolute path to the file is: s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed\n"
      ]
     }
    ],
@@ -153,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 10,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -163,25 +283,37 @@
     {
      "data": {
       "text/plain": [
-       "{'genome': 'GATK.GRCh38',\n",
-       " 'wes': True,\n",
-       " 'intervals': 's3://z-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
-       " 'trim_fastq': False,\n",
-       " 'annotation_tool': ['cnvkit', 'deepvariant']}"
+       "{'WORKFLOW_VERSION': '3.2.3',\n",
+       " 'analysis_type': {'genome': 'GATK.GRCh38',\n",
+       "  'wes': True,\n",
+       "  'analysis_type': 'Germline Variant Calling',\n",
+       "  'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
+       "  'tools': ['strelka', 'haplotypecaller']},\n",
+       " 'annotation': {'annotation_tool': []},\n",
+       " 'read_trimming_options': {'trim_fastq': False}}"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "params = {\n",
-    "    'genome': 'GATK.GRCh38',\n",
-    "    'wes': True,\n",
-    "    'intervals': reference_library.absolute_path,\n",
-    "    'trim_fastq': False,\n",
-    "    'annotation_tool': ['cnvkit', 'deepvariant']\n",
+    "    'WORKFLOW_VERSION': '3.2.3',\n",
+    "    'analysis_type': {\n",
+    "        'genome': 'GATK.GRCh38',\n",
+    "        'wes': True,\n",
+    "        'analysis_type': 'Germline Variant Calling',\n",
+    "        'intervals': 's3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/resources/data/references/genome_bed/GRCh38_Chr20/regions.bed',\n",
+    "        'tools': ['strelka', 'haplotypecaller']\n",
+    "    },\n",
+    "    'annotation': {\n",
+    "        'annotation_tool': []\n",
+    "    },\n",
+    "    'read_trimming_options': {\n",
+    "        'trim_fastq': False\n",
+    "    }\n",
     "}\n",
     "params"
    ]
@@ -200,7 +332,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -225,7 +357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -236,16 +368,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "71ec598c-368b-47a5-84c8-c209739b050a\n"
+      "ca8eee87-09d9-4abe-ba0e-4e6ba48b33fa\n"
      ]
     }
    ],
    "source": [
     "# Run the analysis, specifying a name and description for the resulting dataset\n",
-    "new_dataset_id = dataset.run_analysis(\n",
+    "new_dataset_id = input_dataset.run_analysis(\n",
     "    name='Variant Calling Analysis',\n",
     "    description='Test from SDK',\n",
-    "    process='process-nf-core-sarek-3-0-1',\n",
+    "    process=process,\n",
     "    params=params\n",
     ")\n",
     "print(new_dataset_id)"
@@ -275,7 +407,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -284,5 +416,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }
diff --git a/samples/Downloading_a_dataset.ipynb b/samples/Downloading_a_dataset.ipynb
index 71f372f..cca284a 100644
--- a/samples/Downloading_a_dataset.ipynb
+++ b/samples/Downloading_a_dataset.ipynb
@@ -34,7 +34,10 @@
     }
    },
    "source": [
-    "You can get the list of all projects which are available, and select a particular project by name"
+    "If you don't know the exact name or ID of the dataset you want to download,\n",
+    "you can get the list of all projects which are available, and select a particular project by name.\n",
+    "\n",
+    "### Inspecting datasets"
    ]
   },
   {
@@ -46,9 +49,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "There are 3 projects available\n",
-      "Selected the project 'Test Project' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n",
-      "This project contains 104 datasets to choose from\n"
+      "There are 5 projects available\n",
+      "Selected the project 'Pipeline Development' (ID: 9a31492a-e679-43ce-9f06-d84213c8f7f7)\n",
+      "This project contains 709 datasets to choose from\n"
      ]
     }
    ],
@@ -56,7 +59,7 @@
     "print(f\"There are {len(portal.list_projects()):,} projects available\")\n",
     "# print(portal.list_projects()) # run this line to see all the projects\n",
     "\n",
-    "project = portal.get_project_by_name(\"Test Project\")\n",
+    "project = portal.get_project_by_name(\"Pipeline Development\")\n",
     "print(f\"Selected the project '{project.name}' (ID: {project.id})\")\n",
     "print(f\"This project contains {len(project.list_datasets()):,} datasets to choose from\")"
    ]
@@ -82,17 +85,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Name: Test of mageck-count (updated headnode code 9/22/2022) (3)\n",
-      "Id: bcda3e84-1abe-4d08-86b0-690ea7e1cdad\n",
-      "Description: Test of mageck-count (updated headnode code 9/22/2022)\n",
+      "Name: Genomic variant calling - parameter validation\n",
+      "Id: 3fb7e8f8-b62d-43a6-ad08-eb28f59bd141\n",
+      "Description: None\n",
       "Status: COMPLETED\n"
      ]
     }
    ],
    "source": [
     "# Datasets can be selected by name or by ID\n",
-    "dataset = project.get_dataset_by_id(\"bcda3e84-1abe-4d08-86b0-690ea7e1cdad\")\n",
-    "# dataset = project.get_dataset_by_name(\"Test of mageck-count\")\n",
+    "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n",
     "print(dataset)"
    ]
   },
@@ -104,191 +106,63 @@
     }
    },
    "source": [
-    "Download all of the files from that dataset to a temporary folder"
+    "### Downloading files\n",
+    "\n",
+    "Download all of the files from that dataset (to a temporary folder in this case)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can also just select that dataset in a single call\n",
+    "dataset = portal.get_dataset(\n",
+    "    project=\"Pipeline Development\",\n",
+    "    dataset=\"Genomic variant calling - parameter validation\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
     },
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading file MO_Brunello_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.46MB/s\n",
-      "Downloading file MO_Brunello_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.83MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.fastq (898.44 KB) | 100.0%|█████████████████████████ | 2.16MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.fastq (898.44 KB) | 100.0%|█████████████████████████ | 1.39MB/s\n",
-      "Downloading file multiqc_report.html (1.12 MB) | 100.0%|█████████████████████████ | 1.35MB/s\n",
-      "Downloading file MO_Brunello_1.json (72.07 KB) | 100.0%|█████████████████████████ | 285kB/s\n",
-      "Downloading file MO_Brunello_1_fastqc.html (804.22 KB) | 100.0%|█████████████████████████ | 1.15MB/s\n",
-      "Downloading file MO_Brunello_2.json (72.07 KB) | 100.0%|█████████████████████████ | 349kB/s\n",
-      "Downloading file MO_Brunello_2_fastqc.html (824.26 KB) | 100.0%|█████████████████████████ | 1.19MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.json (72.53 KB) | 100.0%|█████████████████████████ | 319kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1_fastqc.html (824.76 KB) | 100.0%|█████████████████████████ | 2.10MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.json (71.84 KB) | 100.0%|█████████████████████████ | 289kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2_fastqc.html (815.26 KB) | 100.0%|█████████████████████████ | 1.95MB/s\n",
-      "Downloading file MO_Brunello_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.62MB/s\n",
-      "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.09MB/s\n",
-      "Downloading file MO_Brunello_1.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 1.42kB/s\n",
-      "Downloading file MO_Brunello_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.61MB/s\n",
-      "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.72MB/s\n",
-      "Downloading file MO_Brunello_2.countsummary.txt (237.00 B) | 100.0%|█████████████████████████ | 2.28kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 2.82MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.57MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.countsummary.txt (247.00 B) | 100.0%|█████████████████████████ | 2.57kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count.txt (1.55 MB) | 100.0%|█████████████████████████ | 3.40MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.52MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.countsummary.txt (246.00 B) | 100.0%|█████████████████████████ | 2.33kB/s\n",
-      "Downloading file counts.txt (1.99 MB) | 100.0%|█████████████████████████ | 3.48MB/s\n",
-      "Downloading file sample_names.txt (65.00 B) | 100.0%|█████████████████████████ | 662B/s\n",
-      "Downloading file summary.txt (366.00 B) | 100.0%|█████████████████████████ | 2.41kB/s\n",
-      "Downloading file MO_Brunello_1.log (2.39 KB) | 100.0%|█████████████████████████ | 11.1kB/s\n",
-      "Downloading file MO_Brunello_2.log (2.39 KB) | 100.0%|█████████████████████████ | 16.1kB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.log (2.43 KB) | 100.0%|█████████████████████████ | 23.2kB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.log (2.43 KB) | 100.0%|█████████████████████████ | 19.4kB/s\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "dataset.download_files(\"/tmp\")"
+    "# dataset.download_files(\"/tmp\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Alternatively, you can inspect and filter the list of files to only what is needed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data/cutadapt/trim/fastq/MO_Brunello_1.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_2.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_gDNA_1.fastq (920000 bytes)\n",
-      "\n",
-      "data/cutadapt/trim/fastq/MO_Brunello_gDNA_2.fastq (920000 bytes)\n",
-      "\n",
-      "data/fastqc/multiqc_report.html (1173155 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_1/MO_Brunello_1.json (73803 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_1/MO_Brunello_1_fastqc.html (823526 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_2/MO_Brunello_2.json (73797 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_2/MO_Brunello_2_fastqc.html (844044 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1.json (74268 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_1/MO_Brunello_gDNA_1_fastqc.html (844554 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2.json (73563 bytes)\n",
-      "\n",
-      "data/fastqc/MO_Brunello_gDNA_2/MO_Brunello_gDNA_2_fastqc.html (834827 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.count.txt (1625955 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_1.countsummary.txt (237 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count.txt (1625955 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.countsummary.txt (237 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count.txt (1625960 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.countsummary.txt (247 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count.txt (1625960 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.countsummary.txt (246 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/counts.txt (2090653 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/sample_names.txt (65 bytes)\n",
-      "\n",
-      "data/mageck/count/combined/summary.txt (366 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_1.log (2449 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_2.log (2449 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_gDNA_1.log (2489 bytes)\n",
-      "\n",
-      "data/mageck/count/log/MO_Brunello_gDNA_2.log (2488 bytes)\n"
-     ]
-    }
-   ],
-   "source": [
-    "files = dataset.list_files()\n",
-    "print(files)"
+    "Alternatively, you can filter the list of files to only what is needed"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data/mageck/count/MO_Brunello_1.count_normalized.txt (1638475 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_2.count_normalized.txt (1638372 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_1.count_normalized.txt (1638522 bytes)\n",
-      "\n",
-      "data/mageck/count/MO_Brunello_gDNA_2.count_normalized.txt (1638905 bytes)\n"
-     ]
-    }
-   ],
-   "source": [
-    "norm_counts = files.filter_by_pattern(\"*.count_normalized.txt\")\n",
-    "print(norm_counts)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Downloading file MO_Brunello_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 1.86MB/s\n",
-      "Downloading file MO_Brunello_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.78MB/s\n",
-      "Downloading file MO_Brunello_gDNA_1.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 2.86MB/s\n",
-      "Downloading file MO_Brunello_gDNA_2.count_normalized.txt (1.56 MB) | 100.0%|█████████████████████████ | 3.27MB/s\n"
+      "Downloading file ERR031935.haplotypecaller.filtered.vcf.gz (401.08 KB) | 100.0%|█████████████████████████ | 1.71MB/s\n",
+      "Downloading file ERR031935.haplotypecaller.vcf.gz (357.77 KB) | 100.0%|█████████████████████████ | 1.50MB/s\n",
+      "Downloading file ERR031935.strelka.genome.vcf.gz (12.29 MB) | 100.0%|█████████████████████████ | 6.54MB/s\n",
+      "Downloading file ERR031935.strelka.variants.vcf.gz (970.75 KB) | 100.0%|█████████████████████████ | 2.55MB/s\n"
      ]
     }
    ],
    "source": [
-    "norm_counts.download(\"/tmp\")"
+    "dataset.download_files(\"/tmp\", glob=\"*.vcf.gz\")"
    ]
   },
   {
@@ -315,7 +189,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -324,5 +198,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }
diff --git a/samples/Interacting_with_files.ipynb b/samples/Interacting_with_files.ipynb
index 929d9df..91b35b4 100644
--- a/samples/Interacting_with_files.ipynb
+++ b/samples/Interacting_with_files.ipynb
@@ -13,28 +13,37 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    },
     "ExecuteTime": {
      "end_time": "2025-03-25T19:16:07.482109Z",
      "start_time": "2025-03-25T19:16:06.304549Z"
+    },
+    "pycharm": {
+     "name": "#%%\n"
     }
    },
+   "outputs": [],
    "source": [
     "from cirro import DataPortal\n",
     "\n",
-    "portal = DataPortal()"
-   ],
-   "outputs": [],
-   "execution_count": 1
+    "portal = DataPortal(base_url=\"\")"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Find the file you are looking for by defining the project and dataset, then searching for a particular file of interest based on a pattern using `filter_by_pattern`"
+    "Find the file you are looking for by defining the project and dataset, then using `read_file` or `read_files` to read file contents directly into Python objects.\n",
+    "\n",
+    "The file format is inferred automatically from the extension (`.csv`, `.tsv`, `.json`, `.parquet`, `.feather`, `.pkl`, `.xlsx`, `.h5ad`), or can be specified with the `filetype` parameter."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Inspecting files"
    ]
   },
   {
@@ -50,31 +59,27 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The project Test Project contains 104 datasets\n",
-      "Dataset Test of mageck-count contains 32 files\n",
-      "Selected the file: data/mageck/count/combined/counts.txt (2090653 bytes)\n"
+      "Dataset: Genomic variant calling - parameter validation\n",
+      "Files: 235\n",
+      "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.filtered.vcf.gz\n",
+      "data/variant_calling/haplotypecaller/ERR031935/ERR031935.haplotypecaller.vcf.gz\n",
+      "data/variant_calling/strelka/ERR031935/ERR031935.strelka.genome.vcf.gz\n",
+      "data/variant_calling/strelka/ERR031935/ERR031935.strelka.variants.vcf.gz\n"
      ]
     }
    ],
    "source": [
     "# Get the project which contains the dataset\n",
-    "project = portal.get_project_by_name('Test Project')\n",
-    "\n",
-    "# Get the set of datasets within that project\n",
-    "all_datasets = project.list_datasets()\n",
-    "print(f\"The project {project.name} contains {len(all_datasets):,} datasets\")\n",
+    "project = portal.get_project_by_name(\"Pipeline Development\")\n",
     "\n",
     "# Get the dataset of interest based on its name\n",
-    "dataset = all_datasets.get_by_name('Test of mageck-count')\n",
-    "\n",
-    "# Get the complete list of files in that dataset\n",
-    "files = dataset.list_files()\n",
-    "print(f\"Dataset {dataset.name} contains {len(files):,} files\")\n",
-    "\n",
-    "# Filter to just the files named counts.txt (using the wildcard to match the string of folders it is in)\n",
-    "counts = files.filter_by_pattern(\"*/counts.txt\")\n",
+    "dataset = project.get_dataset(\"Genomic variant calling - parameter validation\")\n",
     "\n",
-    "print(f\"Selected the file: {counts.description()}\")"
+    "print(f\"Dataset: {dataset.name}\")\n",
+    "print(f\"Files: {len(dataset.list_files()):,}\")\n",
+    "for file in dataset.list_files():\n",
+    "    if file.name.endswith('.vcf.gz'):\n",
+    "        print(file.name)"
    ]
   },
   {
@@ -85,7 +90,9 @@
     }
    },
    "source": [
-    "Load the contents of that file into a DataFrame (keeping in mind that it is tab-delimited, not the default comma-delimited)"
+    "### Reading a file\n",
+    "\n",
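+    "Read a single file into a DataFrame using `read_file`. Because `.vcf.gz` is not one of the recognized extensions, the file type is given explicitly with `filetype='csv'`; the extra keyword arguments (`sep='\\t'`, `comment='#'`, `header=None`) are forwarded to the parsing function."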
    ]
   },
   {
@@ -118,78 +125,109 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>sgRNA</th>\n",
-       "      <th>Gene</th>\n",
-       "      <th>MO_Brunello_gDNA_2</th>\n",
-       "      <th>MO_Brunello_1</th>\n",
-       "      <th>MO_Brunello_2</th>\n",
-       "      <th>MO_Brunello_gDNA_1</th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>A1BG_0</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>60826</td>\n",
+       "      <td>.</td>\n",
+       "      <td>T</td>\n",
+       "      <td>A</td>\n",
+       "      <td>1</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=17;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>A1BG_1</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>60850</td>\n",
+       "      <td>.</td>\n",
+       "      <td>A</td>\n",
+       "      <td>T</td>\n",
+       "      <td>1</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=4</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>A1BG_2</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62437</td>\n",
+       "      <td>.</td>\n",
+       "      <td>C</td>\n",
+       "      <td>T</td>\n",
+       "      <td>3</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=22;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>A1BG_3</td>\n",
-       "      <td>A1BG</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62467</td>\n",
+       "      <td>.</td>\n",
+       "      <td>C</td>\n",
+       "      <td>A</td>\n",
+       "      <td>4</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=2</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>A1CF_36946</td>\n",
-       "      <td>A1CF</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>chr20</td>\n",
+       "      <td>62469</td>\n",
+       "      <td>.</td>\n",
+       "      <td>G</td>\n",
+       "      <td>A</td>\n",
+       "      <td>3</td>\n",
+       "      <td>LowDepth;LowGQX;NoPassedVariantGTs</td>\n",
+       "      <td>MQ=24;SNVHPOL=3</td>\n",
+       "      <td>GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL</td>\n",
+       "      <td>0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "        sgRNA  Gene  MO_Brunello_gDNA_2  MO_Brunello_1  MO_Brunello_2  \\\n",
-       "0      A1BG_0  A1BG                   0              0              0   \n",
-       "1      A1BG_1  A1BG                   0              0              0   \n",
-       "2      A1BG_2  A1BG                   0              0              0   \n",
-       "3      A1BG_3  A1BG                   0              0              2   \n",
-       "4  A1CF_36946  A1CF                   0              0              0   \n",
+       "       0      1  2  3  4  5                                   6  \\\n",
+       "0  chr20  60826  .  T  A  1  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "1  chr20  60850  .  A  T  1  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "2  chr20  62437  .  C  T  3  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "3  chr20  62467  .  C  A  4  LowDepth;LowGQX;NoPassedVariantGTs   \n",
+       "4  chr20  62469  .  G  A  3  LowDepth;LowGQX;NoPassedVariantGTs   \n",
        "\n",
-       "   MO_Brunello_gDNA_1  \n",
-       "0                   0  \n",
-       "1                   2  \n",
-       "2                   0  \n",
-       "3                   0  \n",
-       "4                   0  "
+       "                 7                                     8  \\\n",
+       "0  MQ=17;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "1  MQ=24;SNVHPOL=4  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "2  MQ=22;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "3  MQ=24;SNVHPOL=2  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "4  MQ=24;SNVHPOL=3  GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL   \n",
+       "\n",
+       "                                                   9  \n",
+       "0  0/1:3:0:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:29...  \n",
+       "1  0/1:3:1:1:0:0,1:0,1:0,0:0.0:LowGQX;LowDepth:30...  \n",
+       "2  0/1:3:0:1:1:0,1:0,0:0,1:0.0:LowGQX;LowDepth:35...  \n",
+       "3  0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:36...  \n",
+       "4  0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:34...  "
       ]
      },
      "execution_count": 3,
@@ -198,56 +236,72 @@
     }
    ],
    "source": [
-    "df = counts[0].read_csv(sep=\"\\t\")\n",
+    "# Read a single file matched by a glob pattern\n",
+    "df = dataset.read_file(glob=\"*.variants.vcf.gz\", filetype=\"csv\", sep=\"\\t\", comment=\"#\", header=None)\n",
     "df.head()"
    ]
   },
   {
+   "cell_type": "markdown",
    "metadata": {},
+   "source": [
+    "### Reading multiple files\n",
+    "\n",
+    "Use `read_files` to iterate over multiple matching files. With `{name}` capture placeholders in the `pattern`, extracted values are returned alongside each file's content."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'sample': 'ERR031935', 'type': 'genome'} (790381, 10)\n",
+      "{'sample': 'ERR031935', 'type': 'variants'} (36318, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Extract values from the file path using {name} capture placeholders\n",
+    "for df, meta in dataset.read_files(\n",
+    "    pattern=\"*/strelka/{sample}/*.strelka.{type}.vcf.gz\",\n",
+    "    filetype=\"csv\",\n",
+    "    sep=\"\\t\",\n",
+    "    comment=\"#\",\n",
+    "    header=None\n",
+    "):\n",
+    "    print(meta, df.shape)"
+   ]
+  },
+  {
    "cell_type": "markdown",
-   "source": "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs."
+   "metadata": {},
+   "source": [
+    "You can also view any artifacts produced by running the analysis, such as the workflow report, graph, or logs."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting metadata"
+   ]
   },
   {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2025-03-25T19:16:35.472469Z",
      "start_time": "2025-03-25T19:16:31.215624Z"
     }
    },
-   "cell_type": "code",
-   "source": [
-    "from cirro_api_client.v1.models import ArtifactType\n",
-    "\n",
-    "# Reading nextflow trace file\n",
-    "trace_file = dataset.get_artifact(ArtifactType.WORKFLOW_TRACE)\n",
-    "trace_df = trace_file.read_csv(sep=\"\\t\")\n",
-    "trace_df.head()"
-   ],
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "   task_id       hash                             native_id  \\\n",
-       "0        7  99/b42c07  826623a0-0ed5-44ff-8a94-e3802cccf531   \n",
-       "1        5  71/8e3d51  ace41478-ba98-403d-a6d1-3e95ad64c36f   \n",
-       "2        8  71/535e08  9d499098-6ed7-422b-9233-9983f775fdee   \n",
-       "3        1  41/c494ef  3a221dd3-7ca8-41e1-8212-856b6154be64   \n",
-       "4        2  25/13b116  94f91d55-1d41-4afd-88b4-743d75817032   \n",
-       "\n",
-       "                     name     status  exit                   submit duration  \\\n",
-       "0  trim:trim_adapters (4)  COMPLETED     0  2022-05-24 16:27:01.413   5m 38s   \n",
-       "1  trim:trim_adapters (3)  COMPLETED     0  2022-05-24 16:27:01.421   5m 38s   \n",
-       "2              fastqc (4)  COMPLETED     0  2022-05-24 16:27:01.464   5m 48s   \n",
-       "3              fastqc (1)  COMPLETED     0  2022-05-24 16:27:01.465   5m 48s   \n",
-       "4  trim:trim_adapters (1)  COMPLETED     0  2022-05-24 16:27:01.476   5m 58s   \n",
-       "\n",
-       "  realtime    %cpu  peak_rss peak_vmem    rchar     wchar  \n",
-       "0       1s   76.6%    3.1 MB    5.4 MB   1.8 MB  900.5 KB  \n",
-       "1       4s    6.4%   11.6 MB   17.3 MB   1.8 MB  900.5 KB  \n",
-       "2       3s  104.8%  152.7 MB    3.2 GB  15.9 MB    4.1 MB  \n",
-       "3       3s  102.5%  140.2 MB    3.2 GB    16 MB    4.1 MB  \n",
-       "4       1s   75.8%    3.1 MB    5.4 MB   1.8 MB  900.5 KB  "
-      ],
       "text/html": [
        "<div>\n",
        "<style scoped>\n",
@@ -287,128 +341,170 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>7</td>\n",
-       "      <td>99/b42c07</td>\n",
-       "      <td>826623a0-0ed5-44ff-8a94-e3802cccf531</td>\n",
-       "      <td>trim:trim_adapters (4)</td>\n",
+       "      <td>fb/18dde6</td>\n",
+       "      <td>4a268ebd-7d6d-42e7-8753-a9ee3f0b1aca</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.413</td>\n",
-       "      <td>5m 38s</td>\n",
-       "      <td>1s</td>\n",
-       "      <td>76.6%</td>\n",
-       "      <td>3.1 MB</td>\n",
+       "      <td>2023-08-29 18:55:47.794</td>\n",
+       "      <td>2m 52s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>79.1%</td>\n",
+       "      <td>3 MB</td>\n",
        "      <td>5.4 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>79.8 KB</td>\n",
+       "      <td>3.6 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>5</td>\n",
-       "      <td>71/8e3d51</td>\n",
-       "      <td>ace41478-ba98-403d-a6d1-3e95ad64c36f</td>\n",
-       "      <td>trim:trim_adapters (3)</td>\n",
+       "      <td>6</td>\n",
+       "      <td>e0/e394ac</td>\n",
+       "      <td>4195506d-60cd-4771-ac03-e801adeb7794</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:CREATE_IN...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.421</td>\n",
-       "      <td>5m 38s</td>\n",
-       "      <td>4s</td>\n",
-       "      <td>6.4%</td>\n",
-       "      <td>11.6 MB</td>\n",
-       "      <td>17.3 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>2023-08-29 18:55:47.807</td>\n",
+       "      <td>2m 53s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>171.4%</td>\n",
+       "      <td>2.9 MB</td>\n",
+       "      <td>11 MB</td>\n",
+       "      <td>43 KB</td>\n",
+       "      <td>1.9 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>8</td>\n",
-       "      <td>71/535e08</td>\n",
-       "      <td>9d499098-6ed7-422b-9233-9983f775fdee</td>\n",
-       "      <td>fastqc (4)</td>\n",
+       "      <td>21</td>\n",
+       "      <td>57/3dfeca</td>\n",
+       "      <td>563917fe-c79c-419c-a4c8-081457e9241a</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.464</td>\n",
-       "      <td>5m 48s</td>\n",
-       "      <td>3s</td>\n",
-       "      <td>104.8%</td>\n",
-       "      <td>152.7 MB</td>\n",
-       "      <td>3.2 GB</td>\n",
-       "      <td>15.9 MB</td>\n",
-       "      <td>4.1 MB</td>\n",
+       "      <td>2023-08-29 18:58:41.177</td>\n",
+       "      <td>38.5s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>82.8%</td>\n",
+       "      <td>3.1 MB</td>\n",
+       "      <td>5.4 MB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>1</td>\n",
-       "      <td>41/c494ef</td>\n",
-       "      <td>3a221dd3-7ca8-41e1-8212-856b6154be64</td>\n",
-       "      <td>fastqc (1)</td>\n",
+       "      <td>23</td>\n",
+       "      <td>37/ebcc21</td>\n",
+       "      <td>27342717-5e4c-4ce3-81bd-5dd04192c7a7</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.465</td>\n",
-       "      <td>5m 48s</td>\n",
-       "      <td>3s</td>\n",
-       "      <td>102.5%</td>\n",
-       "      <td>140.2 MB</td>\n",
-       "      <td>3.2 GB</td>\n",
-       "      <td>16 MB</td>\n",
-       "      <td>4.1 MB</td>\n",
+       "      <td>2023-08-29 18:58:41.319</td>\n",
+       "      <td>38.7s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>86.7%</td>\n",
+       "      <td>3.1 MB</td>\n",
+       "      <td>5.4 MB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>2</td>\n",
-       "      <td>25/13b116</td>\n",
-       "      <td>94f91d55-1d41-4afd-88b4-743d75817032</td>\n",
-       "      <td>trim:trim_adapters (1)</td>\n",
+       "      <td>20</td>\n",
+       "      <td>09/9937ff</td>\n",
+       "      <td>4324507a-80a9-4957-9b99-6de4a949fd62</td>\n",
+       "      <td>NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...</td>\n",
        "      <td>COMPLETED</td>\n",
        "      <td>0</td>\n",
-       "      <td>2022-05-24 16:27:01.476</td>\n",
-       "      <td>5m 58s</td>\n",
-       "      <td>1s</td>\n",
-       "      <td>75.8%</td>\n",
+       "      <td>2023-08-29 18:58:41.352</td>\n",
+       "      <td>39.1s</td>\n",
+       "      <td>0ms</td>\n",
+       "      <td>84.7%</td>\n",
        "      <td>3.1 MB</td>\n",
        "      <td>5.4 MB</td>\n",
-       "      <td>1.8 MB</td>\n",
-       "      <td>900.5 KB</td>\n",
+       "      <td>79 KB</td>\n",
+       "      <td>3.2 KB</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
+      ],
+      "text/plain": [
+       "   task_id       hash                             native_id  \\\n",
+       "0        7  fb/18dde6  4a268ebd-7d6d-42e7-8753-a9ee3f0b1aca   \n",
+       "1        6  e0/e394ac  4195506d-60cd-4771-ac03-e801adeb7794   \n",
+       "2       21  57/3dfeca  563917fe-c79c-419c-a4c8-081457e9241a   \n",
+       "3       23  37/ebcc21  27342717-5e4c-4ce3-81bd-5dd04192c7a7   \n",
+       "4       20  09/9937ff  4324507a-80a9-4957-9b99-6de4a949fd62   \n",
+       "\n",
+       "                                                name     status  exit  \\\n",
+       "0  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "1  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:CREATE_IN...  COMPLETED     0   \n",
+       "2  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "3  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "4  NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZ...  COMPLETED     0   \n",
+       "\n",
+       "                    submit duration realtime    %cpu peak_rss peak_vmem  \\\n",
+       "0  2023-08-29 18:55:47.794   2m 52s      0ms   79.1%     3 MB    5.4 MB   \n",
+       "1  2023-08-29 18:55:47.807   2m 53s      0ms  171.4%   2.9 MB     11 MB   \n",
+       "2  2023-08-29 18:58:41.177    38.5s      0ms   82.8%   3.1 MB    5.4 MB   \n",
+       "3  2023-08-29 18:58:41.319    38.7s      0ms   86.7%   3.1 MB    5.4 MB   \n",
+       "4  2023-08-29 18:58:41.352    39.1s      0ms   84.7%   3.1 MB    5.4 MB   \n",
+       "\n",
+       "     rchar   wchar  \n",
+       "0  79.8 KB  3.6 KB  \n",
+       "1    43 KB  1.9 KB  \n",
+       "2    79 KB  3.2 KB  \n",
+       "3    79 KB  3.2 KB  \n",
+       "4    79 KB  3.2 KB  "
       ]
      },
-     "execution_count": 3,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
-   "execution_count": 3
+   "source": [
+    "# Reading nextflow trace file\n",
+    "trace = dataset.get_trace()\n",
+    "trace.head()"
+   ]
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2025-03-25T19:18:48.517520Z",
-     "start_time": "2025-03-25T19:18:48.161651Z"
-    }
-   },
    "cell_type": "code",
-   "source": [
-    "from IPython.display import display, SVG\n",
-    "\n",
-    "# Displaying the workflow graph\n",
-    "graph = dataset.get_artifact(ArtifactType.WORKFLOW_DAG)\n",
-    "display(SVG(graph.read()))"
-   ],
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<IPython.core.display.SVG object>"
-      ],
-      "image/svg+xml": "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"513pt\" height=\"471pt\" viewBox=\"0.00 0.00 512.90 470.60\">\n<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 466.6)\">\n<title>flowchart</title>\n<polygon fill=\"white\" stroke=\"white\" points=\"-4,5 -4,-466.6 509.9,-466.6 509.9,5 -4,5\"/>\n<!-- p0 -->\n<g id=\"node1\" class=\"node\"><title>p0</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"137.5\" cy=\"-444\" rx=\"3.6\" ry=\"3.6\"/>\n<text text-anchor=\"middle\" x=\"68.9\" y=\"-451.4\" font-family=\"Times,serif\" font-size=\"14.00\">Channel.fromPath</text>\n</g>\n<!-- p2 -->\n<g id=\"node2\" class=\"node\"><title>p2</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"95.5\" cy=\"-370\" rx=\"39.4691\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"95.5\" y=\"-366.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastqc</text>\n</g>\n<!-- p0&#45;&gt;p2 -->\n<g id=\"edge1\" class=\"edge\"><title>p0-&gt;p2</title>\n<path fill=\"none\" stroke=\"black\" d=\"M134.182,-442.465C126.048,-440.956 105.146,-435.836 96.5,-422 92.1927,-415.107 90.8454,-406.601 90.8892,-398.525\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"94.3962,-398.551 91.6163,-388.327 87.4139,-398.053 94.3962,-398.551\"/>\n<text text-anchor=\"middle\" x=\"127\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastq_ch</text>\n</g>\n<!-- p10 -->\n<g id=\"node11\" class=\"node\"><title>p10</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"257.5\" cy=\"-370\" rx=\"99.1619\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"257.5\" y=\"-366.3\" font-family=\"Times,serif\" font-size=\"14.00\">trim:trim_adapters</text>\n</g>\n<!-- p0&#45;&gt;p10 -->\n<g id=\"edge9\" class=\"edge\"><title>p0-&gt;p10</title>\n<path fill=\"none\" stroke=\"black\" d=\"M140.127,-441.252C148.262,-435.856 173.793,-419.038 195.5,-406 203.122,-401.422 211.355,-396.654 219.255,-392.163\"/>\n<polygon 
fill=\"black\" stroke=\"black\" points=\"220.994,-395.2 227.982,-387.236 217.553,-389.105 220.994,-395.2\"/>\n<text text-anchor=\"middle\" x=\"226\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">fastq_ch</text>\n</g>\n<!-- p3 -->\n<g id=\"node5\" class=\"node\"><title>p3</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"70.5\" cy=\"-282\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p2&#45;&gt;p3 -->\n<g id=\"edge3\" class=\"edge\"><title>p2-&gt;p3</title>\n<path fill=\"none\" stroke=\"black\" d=\"M90.5608,-352.009C85.5572,-334.797 77.9645,-308.678 73.6989,-294.004\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"76.9742,-292.732 70.8218,-284.107 70.2524,-294.686 76.9742,-292.732\"/>\n</g>\n<!-- p6 -->\n<g id=\"node6\" class=\"node\"><title>p6</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"94.5\" cy=\"-282\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"45.5\" y=\"-289.3\" font-family=\"Times,serif\" font-size=\"14.00\">toSortedList</text>\n</g>\n<!-- p2&#45;&gt;p6 -->\n<g id=\"edge4\" class=\"edge\"><title>p2-&gt;p6</title>\n<path fill=\"none\" stroke=\"black\" d=\"M95.2976,-351.597C95.1052,-335.046 94.8202,-310.54 94.6481,-295.737\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.1472,-295.63 94.5311,-285.671 91.1477,-295.711 98.1472,-295.63\"/>\n</g>\n<!-- p4 -->\n<g id=\"node7\" class=\"node\"><title>p4</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"186.5\" cy=\"-282\" rx=\"70.2909\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"186.5\" y=\"-278.3\" font-family=\"Times,serif\" font-size=\"14.00\">parse_fastqc</text>\n</g>\n<!-- p2&#45;&gt;p4 -->\n<g id=\"edge5\" class=\"edge\"><title>p2-&gt;p4</title>\n<path fill=\"none\" stroke=\"black\" d=\"M112.191,-353.226C126.037,-340.141 145.929,-321.342 161.648,-306.487\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"164.121,-308.965 168.985,-299.553 159.313,-303.878 164.121,-308.965\"/>\n</g>\n<!-- p1 -->\n<g id=\"node3\" 
class=\"node\"><title>p1</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"378.5\" cy=\"-370\" rx=\"3.6\" ry=\"3.6\"/>\n<text text-anchor=\"middle\" x=\"439.9\" y=\"-377.4\" font-family=\"Times,serif\" font-size=\"14.00\">Channel.fromPath</text>\n</g>\n<!-- p11 -->\n<g id=\"node4\" class=\"node\"><title>p11</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"281.5\" cy=\"-282\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"308.5\" y=\"-289.3\" font-family=\"Times,serif\" font-size=\"14.00\">combine</text>\n</g>\n<!-- p1&#45;&gt;p11 -->\n<g id=\"edge2\" class=\"edge\"><title>p1-&gt;p11</title>\n<path fill=\"none\" stroke=\"black\" d=\"M376.819,-366.432C374.492,-362.967 370.055,-356.692 365.5,-352 339.969,-325.706 329.686,-323.425 301.5,-300 298.206,-297.262 294.634,-294.243 291.433,-291.518\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"293.462,-288.647 283.589,-284.802 288.909,-293.965 293.462,-288.647\"/>\n<text text-anchor=\"middle\" x=\"407\" y=\"-322.3\" font-family=\"Times,serif\" font-size=\"14.00\">Channel_Library</text>\n</g>\n<!-- p13 -->\n<g id=\"node13\" class=\"node\"><title>p13</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"299.5\" cy=\"-194\" rx=\"78.4642\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"299.5\" y=\"-190.3\" font-family=\"Times,serif\" font-size=\"14.00\">mageck_count</text>\n</g>\n<!-- p11&#45;&gt;p13 -->\n<g id=\"edge12\" class=\"edge\"><title>p11-&gt;p13</title>\n<path fill=\"none\" stroke=\"black\" d=\"M282.019,-278.519C283.735,-270.32 289.366,-243.419 293.778,-222.337\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"297.263,-222.773 295.886,-212.268 290.411,-221.339 297.263,-222.773\"/>\n</g>\n<!-- p7 -->\n<g id=\"node9\" class=\"node\"><title>p7</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"94.5\" cy=\"-194\" rx=\"46.2191\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"94.5\" y=\"-190.3\" font-family=\"Times,serif\" font-size=\"14.00\">multiqc</text>\n</g>\n<!-- p6&#45;&gt;p7 
-->\n<g id=\"edge7\" class=\"edge\"><title>p6-&gt;p7</title>\n<path fill=\"none\" stroke=\"black\" d=\"M94.5,-278.139C94.5,-269.502 94.5,-243.011 94.5,-222.227\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.0001,-222.004 94.5,-212.004 91.0001,-222.004 98.0001,-222.004\"/>\n</g>\n<!-- p5 -->\n<g id=\"node8\" class=\"node\"><title>p5</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"186.5\" cy=\"-194\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p4&#45;&gt;p5 -->\n<g id=\"edge6\" class=\"edge\"><title>p4-&gt;p5</title>\n<path fill=\"none\" stroke=\"black\" d=\"M186.5,-263.597C186.5,-246.516 186.5,-220.965 186.5,-206.348\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"190,-206.095 186.5,-196.095 183,-206.095 190,-206.095\"/>\n</g>\n<!-- p8 -->\n<g id=\"node10\" class=\"node\"><title>p8</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"94.5\" cy=\"-134\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p7&#45;&gt;p8 -->\n<g id=\"edge8\" class=\"edge\"><title>p7-&gt;p8</title>\n<path fill=\"none\" stroke=\"black\" d=\"M94.5,-175.912C94.5,-166.322 94.5,-154.628 94.5,-146.202\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"98.0001,-146.056 94.5,-136.056 91.0001,-146.056 98.0001,-146.056\"/>\n</g>\n<!-- p10&#45;&gt;p11 -->\n<g id=\"edge11\" class=\"edge\"><title>p10-&gt;p11</title>\n<path fill=\"none\" stroke=\"black\" d=\"M262.242,-352.009C266.891,-335.35 273.868,-310.349 278.024,-295.455\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"281.504,-296.005 280.821,-285.432 274.762,-294.124 281.504,-296.005\"/>\n</g>\n<!-- p9 -->\n<g id=\"node12\" class=\"node\"><title>p9</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"263.5\" cy=\"-444\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p9&#45;&gt;p10 -->\n<g id=\"edge10\" class=\"edge\"><title>p9-&gt;p10</title>\n<path fill=\"none\" stroke=\"black\" d=\"M263.272,-440.265C262.696,-433.354 261.132,-414.578 259.776,-398.316\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"263.247,-397.818 258.929,-388.143 
256.271,-398.399 263.247,-397.818\"/>\n<text text-anchor=\"middle\" x=\"284\" y=\"-410.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n<!-- p14 -->\n<g id=\"node15\" class=\"node\"><title>p14</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"286.5\" cy=\"-134\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p13&#45;&gt;p14 -->\n<g id=\"edge14\" class=\"edge\"><title>p13-&gt;p14</title>\n<path fill=\"none\" stroke=\"black\" d=\"M295.67,-175.912C293.494,-166.205 290.835,-154.342 288.942,-145.896\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"292.339,-145.049 286.737,-136.056 285.509,-146.58 292.339,-145.049\"/>\n</g>\n<!-- p15 -->\n<g id=\"node16\" class=\"node\"><title>p15</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"311.5\" cy=\"-134\" rx=\"3.25\" ry=\"3.5\"/>\n<text text-anchor=\"middle\" x=\"262.5\" y=\"-141.3\" font-family=\"Times,serif\" font-size=\"14.00\">toSortedList</text>\n</g>\n<!-- p13&#45;&gt;p15 -->\n<g id=\"edge15\" class=\"edge\"><title>p13-&gt;p15</title>\n<path fill=\"none\" stroke=\"black\" d=\"M303.035,-175.912C304.909,-166.858 307.17,-155.93 308.883,-147.647\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"312.345,-148.193 310.943,-137.691 305.49,-146.774 312.345,-148.193\"/>\n</g>\n<!-- p12 -->\n<g id=\"node14\" class=\"node\"><title>p12</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"314.5\" cy=\"-282\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p12&#45;&gt;p13 -->\n<g id=\"edge13\" class=\"edge\"><title>p12-&gt;p13</title>\n<path fill=\"none\" stroke=\"black\" d=\"M314.001,-278.139C312.488,-269.462 307.832,-242.767 304.199,-221.942\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"307.632,-221.254 302.466,-212.004 300.736,-222.456 307.632,-221.254\"/>\n<text text-anchor=\"middle\" x=\"330\" y=\"-234.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n<!-- p17 -->\n<g id=\"node17\" class=\"node\"><title>p17</title>\n<ellipse fill=\"none\" stroke=\"black\" cx=\"327.5\" 
cy=\"-60\" rx=\"114.085\" ry=\"18\"/>\n<text text-anchor=\"middle\" x=\"327.5\" y=\"-56.3\" font-family=\"Times,serif\" font-size=\"14.00\">mageck_merge_single</text>\n</g>\n<!-- p15&#45;&gt;p17 -->\n<g id=\"edge16\" class=\"edge\"><title>p15-&gt;p17</title>\n<path fill=\"none\" stroke=\"black\" d=\"M312.108,-130.265C313.644,-123.354 317.816,-104.578 321.43,-88.3163\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"324.938,-88.6641 323.69,-78.1429 318.104,-87.1455 324.938,-88.6641\"/>\n</g>\n<!-- p19 -->\n<g id=\"node19\" class=\"node\"><title>p19</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"315.5\" cy=\"-2\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p17&#45;&gt;p19 -->\n<g id=\"edge18\" class=\"edge\"><title>p17-&gt;p19</title>\n<path fill=\"none\" stroke=\"black\" d=\"M323.834,-41.8939C321.88,-32.775 319.535,-21.8299 317.831,-13.8767\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"321.231,-13.0404 315.713,-3.99575 314.386,-14.5072 321.231,-13.0404\"/>\n</g>\n<!-- p18 -->\n<g id=\"node20\" class=\"node\"><title>p18</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"340.5\" cy=\"-2\" rx=\"1.8\" ry=\"1.8\"/>\n</g>\n<!-- p17&#45;&gt;p18 -->\n<g id=\"edge19\" class=\"edge\"><title>p17-&gt;p18</title>\n<path fill=\"none\" stroke=\"black\" d=\"M331.471,-41.8939C333.588,-32.775 336.129,-21.8299 337.975,-13.8767\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"341.417,-14.5282 340.269,-3.99575 334.598,-12.9452 341.417,-14.5282\"/>\n</g>\n<!-- p16 -->\n<g id=\"node18\" class=\"node\"><title>p16</title>\n<ellipse fill=\"black\" stroke=\"black\" cx=\"344.5\" cy=\"-134\" rx=\"3.6\" ry=\"3.6\"/>\n</g>\n<!-- p16&#45;&gt;p17 -->\n<g id=\"edge17\" class=\"edge\"><title>p16-&gt;p17</title>\n<path fill=\"none\" stroke=\"black\" d=\"M343.854,-130.265C342.222,-123.354 337.789,-104.578 333.95,-88.3163\"/>\n<polygon fill=\"black\" stroke=\"black\" points=\"337.252,-87.071 331.548,-78.1429 330.439,-88.6796 337.252,-87.071\"/>\n<text text-anchor=\"middle\" 
x=\"361\" y=\"-100.3\" font-family=\"Times,serif\" font-size=\"14.00\">prefix</text>\n</g>\n</g>\n</svg>"
+       "['PW_WORKFLOW_SCRIPT=main.nf',\n",
+       " 'PW_AWS_REGION=us-west-2',\n",
+       " 'PW_BATCH_JOB_ROLE=arn:aws:iam::523221283927:role/Cirro-BatchJobRole-9a31492a',\n",
+       " 'PW_S3_TRANSFORM_WORKFLOW=s3://pubweb-resources-develop/process/hutch/data-transforms/workflow/',\n",
+       " 'PW_ONDEMAND_JOB_QUEUE=arn:aws:batch:us-west-2:523221283927:job-queue/Cirro-OnDemand-9a31492a',\n",
+       " 'PW_DATASET=3fb7e8f8-b62d-43a6-ad08-eb28f59bd141',\n",
+       " 'PW_WORKFLOW_VERSION=3.2.3',\n",
+       " 'PW_S3_RESTORE_SESSION_DIR=s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7-scratch/workdir/session/acc47b45-df28-4120-ab0d-7106ba7c5fc4',\n",
+       " 'PW_SPOT_JOB_QUEUE=arn:aws:batch:us-west-2:523221283927:job-queue/Cirro-Spot-9a31492a',\n",
+       " 'PW_S3_DATASET=s3://project-9a31492a-e679-43ce-9f06-d84213c8f7f7/datasets/3fb7e8f8-b62d-43a6-ad08-eb28f59bd141']"
+      ]
      },
+     "execution_count": 7,
      "metadata": {},
-     "output_type": "display_data"
+     "output_type": "execute_result"
     }
    ],
-   "execution_count": 6
+   "source": [
+    "# Get the logs\n",
+    "logs = dataset.get_logs()\n",
+    "logs.split(\"\\n\")[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -427,7 +523,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.12.7"
   },
   "vscode": {
    "interpreter": {
@@ -436,5 +532,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
 }
diff --git a/tests/test_read_files.py b/tests/test_read_files.py
new file mode 100644
index 0000000..2292d0e
--- /dev/null
+++ b/tests/test_read_files.py
@@ -0,0 +1,463 @@
+import io
+import json
+import pickle
+import unittest
+from unittest.mock import Mock
+
+import pandas as pd
+
+from cirro.models.file import File, FileAccessContext
+from cirro.sdk.dataset import DataPortalDataset, _infer_file_format, _read_file_with_format, _pattern_to_captures_regex
+from cirro.sdk.exceptions import DataPortalInputError
+from cirro.sdk.file import DataPortalFile, DataPortalFiles
+
+
+def _make_mock_file(relative_path: str, content: bytes = b'') -> DataPortalFile:
+    """Build a DataPortalFile whose Mock client returns ``content`` from file.get_file."""
+    stub_client = Mock()
+    stub_client.file.get_file.return_value = content
+    backing_file = File(
+        relative_path=relative_path, size=len(content), access_context=Mock(spec=FileAccessContext)
+    )
+    return DataPortalFile(file=backing_file, client=stub_client)
+
+
+def _make_dataset_with_files(files: list) -> DataPortalDataset:
+    """Build a DataPortalDataset over Mock data with list_files() stubbed out.
+
+    The stub returns ``files`` wrapped in a DataPortalFiles collection.
+    """
+    record = Mock()
+    # configure_mock rather than Mock(name=...): the constructor's ``name`` labels the mock itself
+    record.configure_mock(id='ds-1', project_id='proj-1', name='Test Dataset')
+    portal_dataset = DataPortalDataset(dataset=record, client=Mock())
+    portal_dataset.list_files = Mock(return_value=DataPortalFiles(files))
+    return portal_dataset
+
+
+class TestInferFileFormat(unittest.TestCase):  # expected format name per file extension
+    def test_csv_extension(self):
+        self.assertEqual(_infer_file_format('data/results.csv'), 'csv')
+
+    def test_tsv_extension(self):  # .tsv is expected to map to 'csv' as well
+        self.assertEqual(_infer_file_format('data/results.tsv'), 'csv')
+
+    def test_csv_gz_extension(self):  # a trailing .gz is expected not to change the result
+        self.assertEqual(_infer_file_format('data/results.csv.gz'), 'csv')
+
+    def test_tsv_gz_extension(self):
+        self.assertEqual(_infer_file_format('data/results.tsv.gz'), 'csv')
+
+    def test_h5ad_extension(self):
+        self.assertEqual(_infer_file_format('data/adata.h5ad'), 'h5ad')
+
+    def test_json_extension(self):
+        self.assertEqual(_infer_file_format('data/results.json'), 'json')
+
+    def test_json_gz_extension(self):
+        self.assertEqual(_infer_file_format('data/results.json.gz'), 'json')
+
+    def test_parquet_extension(self):
+        self.assertEqual(_infer_file_format('data/results.parquet'), 'parquet')
+
+    def test_feather_extension(self):
+        self.assertEqual(_infer_file_format('data/results.feather'), 'feather')
+
+    def test_pickle_pkl_extension(self):  # .pkl and .pickle are both expected as 'pickle'
+        self.assertEqual(_infer_file_format('data/results.pkl'), 'pickle')
+
+    def test_pickle_pickle_extension(self):
+        self.assertEqual(_infer_file_format('data/results.pickle'), 'pickle')
+
+    def test_excel_xlsx_extension(self):  # .xlsx and .xls are both expected as 'excel'
+        self.assertEqual(_infer_file_format('data/results.xlsx'), 'excel')
+
+    def test_excel_xls_extension(self):
+        self.assertEqual(_infer_file_format('data/results.xls'), 'excel')
+
+    def test_text_fallback(self):  # from here on: extensions expected to yield 'text'
+        self.assertEqual(_infer_file_format('data/notes.txt'), 'text')
+
+    def test_log_fallback(self):
+        self.assertEqual(_infer_file_format('logs/run.log'), 'text')
+
+    def test_unknown_extension_fallback(self):  # an arbitrary extension (.xyz) also yields 'text'
+        self.assertEqual(_infer_file_format('data/file.xyz'), 'text')
+
+
+class TestReadFileWithFormat(unittest.TestCase):
+    """Exercise _read_file_with_format with explicit formats and with inference (None)."""
+
+    # The parquet/feather tests are skipped unless pyarrow can be found. Importing
+    # 'importlib.util' by its dotted name loads the submodule, so the `.util`
+    # attribute no longer depends on some other module having imported it first.
+    _requires_pyarrow = unittest.skipUnless(
+        __import__('importlib.util').util.find_spec('pyarrow') is not None,
+        'pyarrow not installed'
+    )
+
+    def setUp(self):
+        self.file = _make_mock_file('data/results.csv', b'a,b\n1,2\n')
+
+    def test_csv_format(self):
+        df = _read_file_with_format(self.file, 'csv')
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertListEqual(list(df.columns), ['a', 'b'])
+
+    def test_text_format(self):
+        file = _make_mock_file('data/notes.txt', b'hello world')
+        result = _read_file_with_format(file, 'text')
+        self.assertEqual(result, 'hello world')
+
+    def test_auto_infer_csv(self):
+        result = _read_file_with_format(self.file, None)
+        self.assertIsInstance(result, pd.DataFrame)
+
+    def test_auto_infer_text(self):
+        file = _make_mock_file('data/notes.txt', b'hello')
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, str)
+
+    def test_bytes_format(self):
+        file = _make_mock_file('data/blob.bin', b'\x00\x01\x02\x03')
+        result = _read_file_with_format(file, 'bytes')
+        self.assertIsInstance(result, bytes)
+        self.assertEqual(result, b'\x00\x01\x02\x03')
+
+    def test_unsupported_format_raises(self):
+        with self.assertRaises(DataPortalInputError):
+            _read_file_with_format(self.file, 'xyz_unknown')
+
+    def test_json_format(self):
+        file = _make_mock_file('data/data.json', b'{"key": "value"}')
+        result = _read_file_with_format(file, 'json')
+        self.assertIsInstance(result, dict)
+        self.assertEqual(result['key'], 'value')
+
+    def test_auto_infer_json(self):
+        file = _make_mock_file('data/data.json', b'[1, 2, 3]')
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, list)
+        self.assertEqual(result, [1, 2, 3])
+
+    def test_pickle_format(self):
+        # The payload is pickled right here, so nothing untrusted is involved.
+        data = {'hello': 42}
+        file = _make_mock_file('data/data.pkl', pickle.dumps(data))
+        result = _read_file_with_format(file, 'pickle')
+        self.assertEqual(result, data)
+
+    def test_auto_infer_pickle(self):
+        # Same locally built payload, with the format left to extension inference.
+        data = [1, 2, 3]
+        file = _make_mock_file('data/data.pkl', pickle.dumps(data))
+        result = _read_file_with_format(file, None)
+        self.assertEqual(result, data)
+
+    def _make_parquet_bytes(self):
+        """Return a two-column DataFrame serialized to parquet bytes in memory."""
+        buf = io.BytesIO()
+        pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_parquet(buf)
+        return buf.getvalue()
+
+    def _make_feather_bytes(self):
+        """Return a two-column DataFrame serialized to feather bytes in memory."""
+        buf = io.BytesIO()
+        pd.DataFrame({'a': [1, 2], 'b': [3, 4]}).to_feather(buf)
+        return buf.getvalue()
+
+    @_requires_pyarrow
+    def test_parquet_format(self):
+        file = _make_mock_file('data/data.parquet', self._make_parquet_bytes())
+        result = _read_file_with_format(file, 'parquet')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertListEqual(list(result.columns), ['a', 'b'])
+
+    @_requires_pyarrow
+    def test_auto_infer_parquet(self):
+        file = _make_mock_file('data/data.parquet', self._make_parquet_bytes())
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, pd.DataFrame)
+
+    @_requires_pyarrow
+    def test_feather_format(self):
+        file = _make_mock_file('data/data.feather', self._make_feather_bytes())
+        result = _read_file_with_format(file, 'feather')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertListEqual(list(result.columns), ['a', 'b'])
+
+    @_requires_pyarrow
+    def test_auto_infer_feather(self):
+        file = _make_mock_file('data/data.feather', self._make_feather_bytes())
+        result = _read_file_with_format(file, None)
+        self.assertIsInstance(result, pd.DataFrame)
+
+    def test_csv_kwargs_passed_through(self):
+        # sep is an extra keyword argument; tab-split columns show it was honored.
+        file = _make_mock_file('data/data.tsv', b'a\tb\n1\t2\n')
+        df = _read_file_with_format(file, 'csv', sep='\t')
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertListEqual(list(df.columns), ['a', 'b'])
+
+
+class TestDatasetReadFiles(unittest.TestCase):
+    def setUp(self):
+        self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n')
+        self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n')
+        self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n')
+        self.dataset = _make_dataset_with_files([
+            self.csv_file,
+            self.tsv_file,
+            self.txt_file,
+        ])
+
+    # --- glob mode ---
+
+    def test_glob_matches_csv(self):
+        results = list(self.dataset.read_files(glob='*.csv'))
+        self.assertEqual(len(results), 1)
+        self.assertIsInstance(results[0], pd.DataFrame)
+
+    def test_glob_matches_multiple(self):
+        results = list(self.dataset.read_files(glob='data/*'))
+        self.assertEqual(len(results), 2)
+
+    def test_glob_no_match_returns_empty(self):
+        results = list(self.dataset.read_files(glob='*.parquet'))
+        self.assertEqual(len(results), 0)
+
+    def test_glob_explicit_format_csv(self):
+        results = list(self.dataset.read_files(glob='data/*.tsv', filetype='csv', sep='\t'))
+        self.assertEqual(len(results), 1)
+        self.assertIsInstance(results[0], pd.DataFrame)
+        self.assertIn('gene', results[0].columns)
+
+    def test_glob_explicit_format_text(self):
+        results = list(self.dataset.read_files(glob='logs/*.log', filetype='text'))
+        self.assertEqual(len(results), 1)
+        self.assertIsInstance(results[0], str)
+        self.assertIn('started', results[0])
+
+    def test_glob_auto_infer_csv_from_extension(self):
+        results = list(self.dataset.read_files(glob='data/results.csv'))
+        self.assertIsInstance(results[0], pd.DataFrame)
+
+    def test_glob_auto_infer_text_from_extension(self):
+        results = list(self.dataset.read_files(glob='logs/run.log'))
+        self.assertIsInstance(results[0], str)
+
+    def test_globstar_pattern(self):
+        results = list(self.dataset.read_files(glob='**/*.csv'))
+        self.assertEqual(len(results), 1)
+        self.assertIsInstance(results[0], pd.DataFrame)
+
+    # --- pattern (capture) mode ---
+
+    def test_pattern_simple_filename(self):  # pattern mode: each result unpacks into (content, captures)
+        results = list(self.dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        content, meta = results[0]
+        self.assertIsInstance(content, pd.DataFrame)
+        self.assertEqual(meta['sample'], 'results')  # presumably the stem of data/results.csv, without 'data/'
+
+    def test_pattern_with_directory(self):  # a literal 'data/' prefix in the pattern still captures the stem
+        results = list(self.dataset.read_files(pattern='data/{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, meta = results[0]  # content is not inspected here, only the captures
+        self.assertEqual(meta['sample'], 'results')
+
+    def test_pattern_multiple_files(self):  # one (content, meta) pair per matching file; notes.txt is skipped
+        dataset = _make_dataset_with_files([  # local dataset: two top-level CSVs plus a non-matching text file
+            _make_mock_file('sampleA.csv', b'a\n1\n'),
+            _make_mock_file('sampleB.csv', b'a\n2\n'),
+            _make_mock_file('notes.txt', b'text'),
+        ])
+        results = list(dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        captured = {m['sample'] for _, m in results}  # compared as a set: yield order is not asserted
+        self.assertSetEqual(captured, {'sampleA', 'sampleB'})
+
+    def test_pattern_multi_level(self):  # two placeholders capture the directory and the file stem separately
+        dataset = _make_dataset_with_files([
+            _make_mock_file('treated/sampleA.csv', b'x\n1\n'),
+            _make_mock_file('control/sampleB.csv', b'x\n2\n'),
+        ])
+        results = list(dataset.read_files(pattern='{condition}/{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        by_sample = {m['sample']: m['condition'] for _, m in results}  # keyed by sample; order-independent
+        self.assertEqual(by_sample['sampleA'], 'treated')
+        self.assertEqual(by_sample['sampleB'], 'control')
+
+    def test_pattern_no_match_returns_empty(self):  # a pattern matching no file must yield nothing, not raise
+        results = list(self.dataset.read_files(pattern='{sample}.parquet'))
+        self.assertEqual(len(results), 0)
+
+    def test_pattern_yields_content_and_meta_tuple(self):  # meta must be a dict keyed by placeholder name
+        results = list(self.dataset.read_files(pattern='{sample}.csv'))
+        _, meta = results[0]  # unpacking fails unless the item has exactly two elements
+        self.assertIsInstance(meta, dict)
+        self.assertIn('sample', meta)
+
+    # --- special characters in filenames ---
+
+    def test_glob_matches_filename_with_spaces(self):  # a space in the file name must not break '*.csv'
+        dataset = _make_dataset_with_files([
+            _make_mock_file('data/my sample.csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(glob='*.csv'))
+        self.assertEqual(len(results), 1)
+
+    def test_glob_matches_filename_with_hyphens_and_parens(self):  # '-', ' ' and '()' must not break '*.csv'
+        dataset = _make_dataset_with_files([
+            _make_mock_file('data/sample-A (1).csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(glob='*.csv'))
+        self.assertEqual(len(results), 1)
+
+    def test_pattern_captures_filename_with_spaces(self):  # the captured value keeps the embedded space
+        dataset = _make_dataset_with_files([
+            _make_mock_file('my sample.csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, meta = results[0]
+        self.assertEqual(meta['sample'], 'my sample')
+
+    def test_pattern_captures_directory_with_spaces(self):  # directory captures keep embedded spaces too
+        dataset = _make_dataset_with_files([
+            _make_mock_file('treated group/sampleA.csv', b'a\n1\n'),
+            _make_mock_file('control group/sampleB.csv', b'a\n2\n'),
+        ])
+        results = list(dataset.read_files(pattern='{condition}/{sample}.csv'))
+        self.assertEqual(len(results), 2)
+        by_sample = {m['sample']: m['condition'] for _, m in results}  # keyed by sample; order-independent
+        self.assertEqual(by_sample['sampleA'], 'treated group')
+        self.assertEqual(by_sample['sampleB'], 'control group')
+
+    def test_pattern_captures_special_chars(self):  # '-', '_', space and '()' are all kept in the capture
+        dataset = _make_dataset_with_files([
+            _make_mock_file('sample-A_v2 (1).csv', b'a\n1\n'),
+        ])
+        results = list(dataset.read_files(pattern='{sample}.csv'))
+        self.assertEqual(len(results), 1)
+        _, meta = results[0]
+        self.assertEqual(meta['sample'], 'sample-A_v2 (1)')
+
+    # --- error cases ---
+
+    def test_both_glob_and_pattern_raises(self):  # passing glob and pattern together must be rejected
+        with self.assertRaises(DataPortalInputError):
+            list(self.dataset.read_files(glob='*.csv', pattern='{sample}.csv'))  # list() forces iteration
+
+    def test_neither_glob_nor_pattern_raises(self):  # calling with neither glob nor pattern must be rejected
+        with self.assertRaises(DataPortalInputError):
+            list(self.dataset.read_files())  # list() forces iteration in case the error is raised lazily
+
+
+class TestDatasetDownloadFiles(unittest.TestCase):  # glob filtering in dataset.download_files()
+    def setUp(self):  # three mock files under data/ and logs/, each with download() replaced by a Mock
+        self.csv_file = _make_mock_file('data/results.csv', b'x,y\n3,4\n')
+        self.tsv_file = _make_mock_file('data/counts.tsv', b'gene\tcount\nTP53\t100\n')
+        self.txt_file = _make_mock_file('logs/run.log', b'started\nfinished\n')
+        self.dataset = _make_dataset_with_files([
+            self.csv_file,
+            self.tsv_file,
+            self.txt_file,
+        ])
+        for f in [self.csv_file, self.tsv_file, self.txt_file]:
+            f.download = Mock(return_value=None)  # stub: records the call and returns None
+
+    def _downloaded_paths(self):  # relative_path of each fixture file whose download stub was called
+        return [
+            f.relative_path
+            for f in [self.csv_file, self.tsv_file, self.txt_file]  # fixed order, so list equality is stable
+            if f.download.called
+        ]
+
+    def test_no_glob_downloads_all(self):  # without a glob, all three files are expected to be downloaded
+        self.dataset.download_files(download_location='/tmp')  # presumably not written to: download is stubbed
+        self.assertEqual(len(self._downloaded_paths()), 3)
+
+    def test_glob_filters_to_matching_files(self):  # bare '*.csv' should select only data/results.csv
+        self.dataset.download_files(download_location='/tmp', glob='*.csv')
+        downloaded = self._downloaded_paths()
+        self.assertEqual(downloaded, ['data/results.csv'])
+
+    def test_glob_matches_multiple_files(self):  # 'data/*' should select both data/ files and skip logs/
+        self.dataset.download_files(download_location='/tmp', glob='data/*')
+        downloaded = self._downloaded_paths()
+        self.assertIn('data/results.csv', downloaded)
+        self.assertIn('data/counts.tsv', downloaded)
+        self.assertNotIn('logs/run.log', downloaded)
+
+    def test_glob_no_match_downloads_nothing(self):  # a glob matching no file triggers no download call
+        self.dataset.download_files(download_location='/tmp', glob='*.parquet')
+        self.assertEqual(len(self._downloaded_paths()), 0)
+
+    def test_globstar_filters_by_subdirectory(self):  # 'logs/**' should select only the file under logs/
+        self.dataset.download_files(download_location='/tmp', glob='logs/**')
+        downloaded = self._downloaded_paths()
+        self.assertEqual(downloaded, ['logs/run.log'])
+
+
+class TestPatternToRegex(unittest.TestCase):
+    def _match(self, pattern, path):  # helper: named captures as a dict, or None when path does not match
+        compiled, _ = _pattern_to_captures_regex(pattern)  # second item (capture names) is not needed here
+        m = compiled.match(path)
+        return m.groupdict() if m else None
+
+    def test_simple_capture(self):  # '{sample}' captures the file stem in front of the literal '.csv'
+        self.assertEqual(self._match('{sample}.csv', 'sampleA.csv'), {'sample': 'sampleA'})
+
+    def test_simple_capture_with_directory(self):  # a leading directory is allowed and is not captured
+        self.assertEqual(self._match('{sample}.csv', 'data/sampleA.csv'), {'sample': 'sampleA'})
+
+    def test_directory_capture(self):  # a literal directory in the pattern matches the same path prefix
+        self.assertEqual(self._match('data/{sample}.csv', 'data/results.csv'), {'sample': 'results'})
+
+    def test_multi_level_capture(self):  # placeholders on both sides of '/' capture directory and stem
+        result = self._match('{condition}/{sample}.csv', 'treated/sampleA.csv')
+        self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'})
+
+    def test_multi_level_capture_with_prefix(self):  # an extra leading 'data/' is ignored by the captures
+        result = self._match('{condition}/{sample}.csv', 'data/treated/sampleA.csv')
+        self.assertEqual(result, {'condition': 'treated', 'sample': 'sampleA'})
+
+    def test_no_match_returns_none(self):  # a different extension ('.tsv' vs '.csv') must not match
+        self.assertIsNone(self._match('{sample}.csv', 'sampleA.tsv'))
+
+    def test_wildcard_mixed_with_capture(self):  # '*' matches a segment but adds no entry to the captures
+        result = self._match('data/*/{sample}.csv', 'data/subdir/sampleA.csv')
+        self.assertEqual(result, {'sample': 'sampleA'})
+
+    def test_capture_names_returned(self):  # second return value lists placeholder names in pattern order
+        _, names = _pattern_to_captures_regex('{condition}/{sample}.csv')
+        self.assertListEqual(names, ['condition', 'sample'])
+
+    def test_capture_with_spaces(self):  # a space inside the file stem is part of the captured value
+        result = self._match('{sample}.csv', 'my sample.csv')
+        self.assertEqual(result, {'sample': 'my sample'})
+
+    def test_capture_with_spaces_in_directory(self):  # spaces are kept in both directory and stem captures
+        result = self._match('{condition}/{sample}.csv', 'treated group/my sample.csv')
+        self.assertEqual(result, {'condition': 'treated group', 'sample': 'my sample'})
+
+    def test_capture_with_hyphens_and_underscores(self):  # '-' and '_' are part of the captured value
+        result = self._match('{sample}.csv', 'sample-A_v2.csv')
+        self.assertEqual(result, {'sample': 'sample-A_v2'})
+
+    def test_capture_with_parentheses(self):  # '(' and ')' in the path are captured as literal text
+        result = self._match('{sample}.csv', 'sample (1).csv')
+        self.assertEqual(result, {'sample': 'sample (1)'})
+
+    def test_capture_with_dots_in_name(self):  # inner dots stay in the capture; only the last '.csv' is literal
+        result = self._match('{sample}.csv', 'sample.v2.csv')
+        self.assertEqual(result, {'sample': 'sample.v2'})
+
+    def test_wildcard_matches_spaces(self):  # '*' must match a file name that contains a space
+        compiled, _ = _pattern_to_captures_regex('data/*.csv')  # no placeholders: only match/no-match matters
+        self.assertIsNotNone(compiled.match('data/my file.csv'))
+
+    def test_globstar_matches_spaces_across_segments(self):  # '**' spans directories whose names have spaces
+        compiled, _ = _pattern_to_captures_regex('**/*.csv')
+        self.assertIsNotNone(compiled.match('some dir/sub dir/my file.csv'))