From 8cee77684530a3e86fa555b57878e809bdfb97cd Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 4 Feb 2026 22:27:23 +0000 Subject: [PATCH 1/2] Supporting storage locations in SYNPY --- .gitignore | 1 + .../storage_location_architecture.md | 785 ++++++++ docs/js/mermaid-init.js | 12 + docs/reference/experimental/async/folder.md | 6 + docs/reference/experimental/async/project.md | 6 + .../experimental/async/storage_location.md | 23 + .../mixins/manifest_generatable.md | 69 + .../mixins/storage_location_configurable.md | 54 + docs/reference/experimental/sync/folder.md | 6 + docs/reference/experimental/sync/project.md | 6 + .../experimental/sync/storage_location.md | 24 + docs/tutorials/python/manifest_operations.md | 328 ++++ docs/tutorials/python/storage_location.md | 135 ++ .../tutorial_scripts/storage_location.py | 86 + mkdocs.yml | 11 + synapseclient/api/__init__.py | 15 + .../api/storage_location_services.py | 169 ++ synapseclient/client.py | 51 +- .../core/constants/concrete_types.py | 26 +- synapseclient/models/__init__.py | 19 + synapseclient/models/folder.py | 6 + synapseclient/models/mixins/__init__.py | 12 + .../models/mixins/asynchronous_job.py | 2 + synapseclient/models/mixins/manifest.py | 950 ++++++++++ .../models/mixins/storable_container.py | 28 +- .../models/mixins/storage_location_mixin.py | 450 +++++ synapseclient/models/project.py | 6 + .../protocols/download_list_protocol.py | 97 + .../models/protocols/manifest_protocol.py | 240 +++ .../protocols/storable_container_protocol.py | 13 +- .../storage_location_mixin_protocol.py | 279 +++ .../protocols/storage_location_protocol.py | 159 ++ synapseclient/models/services/__init__.py | 29 +- synapseclient/models/services/migration.py | 1650 +++++++++++++++++ .../models/services/migration_types.py | 371 ++++ synapseclient/models/storage_location.py | 600 ++++++ .../unit_test_storage_location_services.py | 215 +++ .../models/unit_test_manifest.py | 499 +++++ .../models/unit_test_storage_location.py | 355 ++++ 39 files changed, 7777 insertions(+), 16 deletions(-) create mode 100644 docs/explanations/storage_location_architecture.md create mode 100644 docs/js/mermaid-init.js create mode 100644 docs/reference/experimental/async/storage_location.md create mode 100644 docs/reference/experimental/mixins/manifest_generatable.md create mode 100644 docs/reference/experimental/mixins/storage_location_configurable.md create mode 100644 docs/reference/experimental/sync/storage_location.md create mode 100644 docs/tutorials/python/manifest_operations.md create mode 100644 docs/tutorials/python/storage_location.md create mode 100644 docs/tutorials/python/tutorial_scripts/storage_location.py create mode 100644 synapseclient/api/storage_location_services.py create mode 100644 synapseclient/models/mixins/manifest.py create mode 100644 synapseclient/models/mixins/storage_location_mixin.py create mode 100644 synapseclient/models/protocols/download_list_protocol.py create mode 100644 synapseclient/models/protocols/manifest_protocol.py create mode 100644 synapseclient/models/protocols/storage_location_mixin_protocol.py create mode 100644 synapseclient/models/protocols/storage_location_protocol.py create mode 100644 synapseclient/models/services/migration.py create mode 100644 synapseclient/models/services/migration_types.py create mode 100644 synapseclient/models/storage_location.py create mode 100644 tests/unit/synapseclient/api/unit_test_storage_location_services.py create mode 100644 
tests/unit/synapseclient/models/unit_test_manifest.py create mode 100644 tests/unit/synapseclient/models/unit_test_storage_location.py diff --git a/.gitignore b/.gitignore index fa4e7f520..19eb11079 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ jenkins/ .idea/* docs/build/doctrees/* docs/build/html/_sources/* +docs_site/* build/* /venv diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md new file mode 100644 index 000000000..612ef7d21 --- /dev/null +++ b/docs/explanations/storage_location_architecture.md @@ -0,0 +1,785 @@ +# Storage Location Architecture + +This document provides an in-depth architectural overview of the StorageLocation +system in the Synapse Python Client. It explains the design decisions, class +relationships, and data flows that enable flexible storage configuration. + +--- + +## On This Page + +
+ +- **[Domain Model](#domain-model)** + + Core classes, enums, and their relationships + +- **[Storage Types](#storage-type-mapping)** + + How storage types map to REST API types and choosing the right one + +- **[Entity Inheritance](#entity-inheritance-hierarchy)** + + How Projects and Folders gain storage capabilities + +- **[Operation Flows](#operation-flows)** + + Sequence diagrams for store, setup, and STS operations + +- **[Settings & API](#project-setting-lifecycle)** + + Project settings lifecycle and REST API architecture + +- **[Migration](#migration-flow)** + + Two-phase file migration process + +
+ +--- + +## Overview + +The StorageLocation system enables Synapse users to configure where uploaded files +are stored. By default, Synapse stores files in its internal S3 storage, but +users can configure projects and folders to use external storage backends such as +AWS S3 buckets, Google Cloud Storage, SFTP servers, or proxy servers. + +!!! info "Key Concepts" + - **StorageLocation**: A configuration describing where files are stored + - **Project Setting**: Links a storage location to a Project or Folder + - **STS Credentials**: Temporary AWS credentials for direct S3 access + - **Storage Migration**: Moving files between storage locations + +--- + +
+ +# Part 1: Data Model + +This section covers the core classes, enumerations, and type mappings. + +
+
+## Domain Model
+
+The following class diagram shows the core classes and their relationships in the
+StorageLocation system.
+
+```mermaid
+classDiagram
+    direction TB
+
+    class StorageLocation {
+        +int storage_location_id
+        +StorageLocationType storage_type
+        +UploadType upload_type
+        +str bucket
+        +str base_key
+        +bool sts_enabled
+        +str banner
+        +str description
+        +str etag
+        +str created_on
+        +int created_by
+        +str url
+        +bool supports_subfolders
+        +str endpoint_url
+        +str proxy_url
+        +str secret_key
+        +str benefactor_id
+        +store() StorageLocation
+        +get() StorageLocation
+        +setup_s3() Tuple~Folder, StorageLocation~
+        +fill_from_dict(dict) StorageLocation
+    }
+
+    class StorageLocationType {
+        <<enumeration>>
+        SYNAPSE_S3
+        EXTERNAL_S3
+        EXTERNAL_GOOGLE_CLOUD
+        EXTERNAL_SFTP
+        EXTERNAL_OBJECT_STORE
+        PROXY
+    }
+
+    class UploadType {
+        <<enumeration>>
+        S3
+        GOOGLE_CLOUD_STORAGE
+        SFTP
+        HTTPS
+        NONE
+    }
+
+    class StorageLocationConfigurable {
+        <<mixin>>
+        +set_storage_location(storage_location_id)
+        +get_project_setting(setting_type)
+        +delete_project_setting(setting_id)
+        +get_sts_storage_token(permission, output_format)
+        +index_files_for_migration(dest_storage_location_id, db_path)
+        +migrate_indexed_files(db_path)
+    }
+
+    class Project {
+        +str id
+        +str name
+        +str description
+    }
+
+    class Folder {
+        +str id
+        +str name
+        +str parent_id
+    }
+
+    StorageLocation --> StorageLocationType : storage_type
+    StorageLocation --> UploadType : upload_type
+    StorageLocationConfigurable <|-- Project : implements
+    StorageLocationConfigurable <|-- Folder : implements
+```
+
+ +### Key Components + +| Component | Purpose | +|-----------|---------| +| [StorageLocation][synapseclient.models.StorageLocation] | Data model representing a storage location setting in Synapse | +| [StorageLocationType][synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types | +| [UploadType][synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type | +| [StorageLocationConfigurable][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities | + +--- + +
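+
+To make the relationships above concrete, here is a minimal sketch that builds and
+stores a `StorageLocation` and reads back its enum-typed fields. The bucket name and
+base key are placeholders, and the example assumes the configuration shown above.
+
+```python
+import synapseclient
+from synapseclient.models import StorageLocation, StorageLocationType
+
+synapseclient.login()
+
+# Describe an external S3 location; store() persists it in Synapse and fills in
+# server-assigned fields such as storage_location_id.
+location = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_S3,
+    bucket="my-example-bucket",  # placeholder bucket name
+    base_key="synapse-data",     # placeholder key prefix
+).store()
+
+print(location.storage_location_id)  # server-assigned integer ID
+print(location.storage_type)         # StorageLocationType.EXTERNAL_S3
+print(location.upload_type)          # S3-backed types resolve to UploadType.S3
+```
+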
+ +## Storage Type Mapping + +Each `StorageLocationType` maps to a specific REST API `concreteType` and has a +default `UploadType`. This mapping is bidirectional, allowing the system to parse +responses from the API and construct requests. + +```mermaid +flowchart LR + subgraph StorageLocationType + SYNAPSE_S3["SYNAPSE_S3"] + EXTERNAL_S3["EXTERNAL_S3"] + EXTERNAL_GOOGLE_CLOUD["EXTERNAL_GOOGLE_CLOUD"] + EXTERNAL_SFTP["EXTERNAL_SFTP"] + EXTERNAL_OBJECT_STORE["EXTERNAL_OBJECT_STORE"] + PROXY["PROXY"] + end + + subgraph concreteType + S3SLS["S3StorageLocationSetting"] + ExtS3SLS["ExternalS3StorageLocationSetting"] + ExtGCSSLS["ExternalGoogleCloudStorageLocationSetting"] + ExtSLS["ExternalStorageLocationSetting"] + ExtObjSLS["ExternalObjectStorageLocationSetting"] + ProxySLS["ProxyStorageLocationSettings"] + end + + subgraph UploadType + S3["S3"] + GCS["GOOGLECLOUDSTORAGE"] + SFTP["SFTP"] + HTTPS["HTTPS"] + end + + SYNAPSE_S3 --> S3SLS --> S3 + EXTERNAL_S3 --> ExtS3SLS --> S3 + EXTERNAL_GOOGLE_CLOUD --> ExtGCSSLS --> GCS + EXTERNAL_SFTP --> ExtSLS --> SFTP + EXTERNAL_OBJECT_STORE --> ExtObjSLS --> S3 + PROXY --> ProxySLS --> HTTPS +``` + +
+ +### Type-Specific Attributes + +Different storage types support different configuration attributes: + +| Attribute | SYNAPSE | EXT_S3 | EXT_GCS | EXT_SFTP | EXT_OBJ | PROXY | +|-----------|:-------:|:------:|:-------:|:--------:|:-------:|:-----:| +| `bucket` | ✓ | ✓ | ✓ | | ✓ | | +| `base_key` | ✓ | ✓ | ✓ | | | | +| `sts_enabled` | ✓ | ✓ | | | | | +| `endpoint_url` | | ✓ | | | ✓ | | +| `url` | | | | ✓ | | | +| `supports_subfolders` | | | | ✓ | | | +| `proxy_url` | | | | | | ✓ | +| `secret_key` | | | | | | ✓ | +| `benefactor_id` | | | | | | ✓ | + +**Legend:** SYNAPSE = SYNAPSE_S3, EXT_S3 = EXTERNAL_S3, EXT_GCS = EXTERNAL_GOOGLE_CLOUD, EXT_SFTP = EXTERNAL_SFTP, EXT_OBJ = EXTERNAL_OBJECT_STORE + +
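+
+As an illustrative sketch of the table above, attributes that do not apply to a
+given type are simply left unset. The SFTP and proxy endpoints and the secret below
+are placeholders; only the fields marked for each type are supplied.
+
+```python
+import synapseclient
+from synapseclient.models import StorageLocation, StorageLocationType
+
+synapseclient.login()
+
+# An SFTP location is described by its server URL (plus optional subfolder support).
+sftp_location = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_SFTP,
+    url="sftp://sftp.example.org/synapse-data",  # placeholder endpoint
+    supports_subfolders=True,
+).store()
+
+# A proxy location instead needs the proxy URL and a shared secret.
+proxy_location = StorageLocation(
+    storage_type=StorageLocationType.PROXY,
+    proxy_url="https://proxy.example.org/synapse",  # placeholder endpoint
+    secret_key="replace-with-a-long-shared-secret",
+).store()
+
+print(sftp_location.storage_location_id, proxy_location.storage_location_id)
+```
+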
+
+### Choosing a Storage Type
+
+Use this decision tree to select the appropriate storage type for your use case:
+
+```mermaid
+flowchart TB
+    Start([Need custom storage?]) --> Q1{Want Synapse to<br/>manage storage?}
+
+    Q1 -->|Yes| SYNAPSE_S3[Use SYNAPSE_S3]
+    Q1 -->|No| Q2{What storage<br/>backend?}
+
+    Q2 -->|AWS S3| Q3{Synapse accesses<br/>bucket directly?}
+    Q2 -->|Google Cloud| EXTERNAL_GOOGLE_CLOUD[Use EXTERNAL_GOOGLE_CLOUD]
+    Q2 -->|SFTP Server| EXTERNAL_SFTP[Use EXTERNAL_SFTP]
+    Q2 -->|Proxy Server| PROXY[Use PROXY]
+    Q2 -->|S3-compatible<br/>non-AWS| EXTERNAL_OBJECT_STORE[Use EXTERNAL_OBJECT_STORE]
+
+    Q3 -->|Yes| Q4{Need STS<br/>credentials?}
+    Q3 -->|No| EXTERNAL_OBJECT_STORE
+
+    Q4 -->|Yes| EXTERNAL_S3_STS[Use EXTERNAL_S3<br/>with sts_enabled=True]
+    Q4 -->|No| EXTERNAL_S3[Use EXTERNAL_S3]
+
+    SYNAPSE_S3 --> Benefits1[Benefits:<br/>- Zero configuration<br/>- Managed by Synapse<br/>- STS available]
+    EXTERNAL_S3 --> Benefits2[Benefits:<br/>- Use your own bucket<br/>- Control access & costs<br/>- Optional STS]
+    EXTERNAL_S3_STS --> Benefits2
+    EXTERNAL_GOOGLE_CLOUD --> Benefits3[Benefits:<br/>- GCP native<br/>- Use existing GCS buckets]
+    EXTERNAL_SFTP --> Benefits4[Benefits:<br/>- Legacy systems<br/>- Synapse never touches data]
+    EXTERNAL_OBJECT_STORE --> Benefits5[Benefits:<br/>- OpenStack, MinIO, etc<br/>- Synapse never touches data]
+    PROXY --> Benefits6[Benefits:<br/>- Custom access control<br/>- Data transformation]
+```
+
+---
+
+
+## Entity Inheritance Hierarchy
+
+Projects and Folders inherit storage configuration capabilities through the
+`StorageLocationConfigurable` mixin. This pattern allows consistent storage
+management across container entities.
+
+```mermaid
+classDiagram
+    direction TB
+
+    class AccessControllable {
+        <<mixin>>
+        +get_permissions()
+        +set_permissions()
+        +delete_permissions()
+    }
+
+    class StorableContainer {
+        <<mixin>>
+        +sync()
+        +get_children()
+    }
+
+    class StorageLocationConfigurable {
+        <<mixin>>
+        +set_storage_location()
+        +get_project_setting()
+        +delete_project_setting()
+        +get_sts_storage_token()
+        +index_files_for_migration()
+        +migrate_indexed_files()
+    }
+
+    class Project {
+        +str id
+        +str name
+        +str description
+        +str etag
+    }
+
+    class Folder {
+        +str id
+        +str name
+        +str parent_id
+        +str etag
+    }
+
+    AccessControllable <|-- Project
+    AccessControllable <|-- Folder
+    StorableContainer <|-- Project
+    StorableContainer <|-- Folder
+    StorageLocationConfigurable <|-- Project
+    StorageLocationConfigurable <|-- Folder
+```
+
+!!! tip "Mixin Pattern"
+    The mixin pattern allows `Project` and `Folder` to share storage location
+    functionality without code duplication. Both classes inherit the same
+    methods from `StorageLocationConfigurable`.
+
+---
+
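+
+Because both container types implement the same mixin, storage configuration code is
+interchangeable between them. A minimal sketch (the Synapse IDs and storage location
+ID are placeholders, and the parameter names follow the class diagram above):
+
+```python
+import synapseclient
+from synapseclient.models import Folder, Project
+
+synapseclient.login()
+
+project = Project(id="syn123").get()  # placeholder IDs
+folder = Folder(id="syn456").get()
+
+# The same StorageLocationConfigurable methods exist on both classes.
+for container in (project, folder):
+    container.set_storage_location(storage_location_id=12345)
+    print(container.get_project_setting(setting_type="upload"))
+```
+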
+
+ +# Part 2: Operation Flows + +This section contains sequence diagrams for key operations. + +
+ +## Operation Flows + +### Store Operation + +The `store()` method creates a new storage location in Synapse. + +```mermaid +sequenceDiagram + participant User + participant StorageLocation + participant _to_synapse_request as _to_synapse_request() + participant API as storage_location_services + participant Synapse as Synapse REST API + + User->>StorageLocation: store() + activate StorageLocation + + StorageLocation->>_to_synapse_request: Build request body + activate _to_synapse_request + + Note over _to_synapse_request: Validate storage_type is set + Note over _to_synapse_request: Build concreteType from storage_type + Note over _to_synapse_request: Determine uploadType + Note over _to_synapse_request: Add type-specific fields + + _to_synapse_request-->>StorageLocation: Request body dict + deactivate _to_synapse_request + + StorageLocation->>API: create_storage_location_setting(body) + activate API + + API->>Synapse: POST /storageLocation + activate Synapse + + Synapse-->>API: Response with storageLocationId + deactivate Synapse + + API-->>StorageLocation: Response dict + deactivate API + + StorageLocation->>StorageLocation: fill_from_dict(response) + Note over StorageLocation: Parse storageLocationId + Note over StorageLocation: Parse concreteType → storage_type + Note over StorageLocation: Parse uploadType → upload_type + Note over StorageLocation: Extract type-specific fields + + StorageLocation-->>User: StorageLocation (populated) + deactivate StorageLocation +``` + +!!! note "Idempotent Behavior" + Storage locations are immutable once created. If you call `store()` with + identical parameters, Synapse returns the existing storage location rather + than creating a duplicate. + +
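+
+A short sketch of the idempotent behavior described above (placeholder bucket name,
+logged-in session assumed): calling `store()` twice with the same configuration
+should resolve to the same `storage_location_id`.
+
+```python
+import synapseclient
+from synapseclient.models import StorageLocation, StorageLocationType
+
+synapseclient.login()
+
+config = dict(
+    storage_type=StorageLocationType.EXTERNAL_S3,
+    bucket="my-example-bucket",  # placeholder
+    base_key="synapse-data",
+)
+
+first = StorageLocation(**config).store()
+second = StorageLocation(**config).store()
+
+# Identical parameters resolve to the existing storage location for this user.
+assert first.storage_location_id == second.storage_location_id
+```
+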
+ +### Setup S3 Convenience Flow + +The `setup_s3()` class method creates a folder with S3 storage in a single call. + +??? example "Click to expand sequence diagram" + ```mermaid + sequenceDiagram + participant User + participant setup_s3 as StorageLocation.setup_s3() + participant StorageLocation + participant Folder + participant Mixin as StorageLocationConfigurable + participant API as storage_location_services + participant Synapse as Synapse REST API + + User->>setup_s3: setup_s3(parent, folder_name, bucket_name) + activate setup_s3 + + Note over setup_s3: Validate: folder_name XOR folder + + alt folder_name provided + setup_s3->>Folder: Folder(name, parent_id).store() + activate Folder + Folder->>Synapse: POST /entity + Synapse-->>Folder: Folder response + Folder-->>setup_s3: New Folder + deactivate Folder + else folder ID provided + setup_s3->>Folder: Folder(id).get() + activate Folder + Folder->>Synapse: GET /entity/{id} + Synapse-->>Folder: Folder response + Folder-->>setup_s3: Existing Folder + deactivate Folder + end + + alt bucket_name provided + Note over setup_s3: storage_type = EXTERNAL_S3 + else bucket_name is None + Note over setup_s3: storage_type = SYNAPSE_S3 + end + + setup_s3->>StorageLocation: StorageLocation(...).store() + activate StorageLocation + StorageLocation->>Synapse: POST /storageLocation + Synapse-->>StorageLocation: StorageLocation response + StorageLocation-->>setup_s3: StorageLocation + deactivate StorageLocation + + setup_s3->>Mixin: folder.set_storage_location(storage_location_id) + activate Mixin + + Mixin->>API: get_project_setting(project_id, "upload") + API->>Synapse: GET /projectSettings/{id}/type/upload + Synapse-->>API: Setting or empty + + alt Setting exists + API-->>Mixin: Existing setting + Mixin->>API: update_project_setting(body) + API->>Synapse: PUT /projectSettings + else No setting + Mixin->>API: create_project_setting(body) + API->>Synapse: POST /projectSettings + end + + Synapse-->>API: Project setting response + API-->>Mixin: Updated setting + deactivate Mixin + + setup_s3-->>User: (Folder, StorageLocation) + deactivate setup_s3 + ``` + +
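+
+In code, the whole flow above collapses into a single call. A sketch with placeholder
+project and bucket names; omitting `bucket_name` falls back to Synapse-managed S3.
+
+```python
+import synapseclient
+from synapseclient.models import StorageLocation
+
+synapseclient.login()
+
+# Creates (or fetches) the folder, creates the storage location, and attaches
+# the upload project setting in one call.
+folder, storage = StorageLocation.setup_s3(
+    parent="syn123",                  # placeholder project ID
+    folder_name="external-storage",
+    bucket_name="my-example-bucket",  # placeholder; omit for SYNAPSE_S3 storage
+    sts_enabled=True,
+)
+
+print(folder.id, storage.storage_location_id)
+```
+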
+ +### STS Token Retrieval + +STS (AWS Security Token Service) enables direct S3 access using temporary credentials. + +```mermaid +sequenceDiagram + participant User + participant Entity as Folder/Project + participant Mixin as StorageLocationConfigurable + participant STS as sts_transfer module + participant Client as Synapse Client + participant Synapse as Synapse REST API + + User->>Entity: get_sts_storage_token(permission, output_format) + activate Entity + + Entity->>Mixin: get_sts_storage_token_async() + activate Mixin + + Mixin->>Client: Synapse.get_client() + Client-->>Mixin: Synapse client instance + + Mixin->>STS: sts_transfer.get_sts_credentials() + activate STS + + STS->>Synapse: GET /entity/{id}/sts?permission={permission} + activate Synapse + + Synapse-->>STS: STS credentials response + deactivate Synapse + + Note over STS: Parse credentials + + alt output_format == "boto" + Note over STS: Format for boto3 client kwargs + STS-->>Mixin: {aws_access_key_id, aws_secret_access_key, aws_session_token} + else output_format == "json" + Note over STS: Return JSON string + STS-->>Mixin: JSON credentials string + else output_format == "shell" / "bash" + Note over STS: Format as export commands + STS-->>Mixin: Shell export commands + else output_format == "dictionary" + Note over STS: Return raw dict + STS-->>Mixin: Dictionary + end + deactivate STS + + Mixin-->>Entity: Formatted credentials + deactivate Mixin + + Entity-->>User: Credentials + deactivate Entity +``` + +
+ +#### Credential Output Formats + +| Format | Description | Use Case | +|--------|-------------|----------| +| `boto` | Dict with `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` | Pass directly to `boto3.client('s3', **creds)` | +| `json` | JSON string | Store or pass to external tools | +| `shell` / `bash` | `export AWS_ACCESS_KEY_ID=...` format | Execute in shell | +| `cmd` | Windows SET commands | Windows command prompt | +| `powershell` | PowerShell variable assignments | PowerShell scripts | +| `dictionary` | Raw Python dict | Custom processing | + +--- + +
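+
+Putting the output formats above to use, a sketch that assumes `syn456` is a
+placeholder ID for an STS-enabled folder and that `boto3` is installed:
+
+```python
+import boto3
+import synapseclient
+from synapseclient.models import Folder
+
+synapseclient.login()
+
+folder = Folder(id="syn456").get()  # placeholder ID of an STS-enabled folder
+
+# "boto" output unpacks directly into a boto3 client.
+credentials = folder.get_sts_storage_token(
+    permission="read_only",
+    output_format="boto",
+)
+s3_client = boto3.client("s3", **credentials)
+
+# "shell" output can be written to a file and sourced by other tooling.
+print(folder.get_sts_storage_token(permission="read_only", output_format="shell"))
+```
+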
+
+ +# Part 3: Settings & Infrastructure + +This section covers project settings, API architecture, and the async/sync pattern. + +
+ +## Project Setting Lifecycle + +Project settings control which storage location(s) are used for uploads to an +entity. The following state diagram shows the lifecycle of a project setting. + +```mermaid +stateDiagram-v2 + [*] --> NoSetting: Entity created + + NoSetting --> Created: set_storage_location() + Note right of NoSetting: Inherits from parent\nor uses Synapse default + + Created --> Updated: set_storage_location()\nwith different locations + Updated --> Updated: set_storage_location()\nwith different locations + + Created --> Deleted: delete_project_setting() + Updated --> Deleted: delete_project_setting() + + Deleted --> NoSetting: Returns to default + + state Created { + [*] --> Active + Active: locations = [storage_location_id] + Active: settingsType = "upload" + } + + state Updated { + [*] --> Modified + Modified: locations = [new_id, ...] + Modified: settingsType = "upload" + } +``` + +
+ +### Setting Types + +| Type | Purpose | +|------|---------| +| `upload` | Configures upload destination storage location(s) | +| `external_sync` | Configures external sync settings | +| `requester_pays` | Configures requester-pays bucket access | + +--- + +
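+
+The lifecycle above, sketched in code with placeholder IDs. It assumes the setting is
+returned in the dictionary form of the REST response, with an `id` key.
+
+```python
+import synapseclient
+from synapseclient.models import Folder
+
+synapseclient.login()
+
+folder = Folder(id="syn456").get()  # placeholder ID
+
+# NoSetting -> Created/Updated: point the "upload" setting at a storage location.
+folder.set_storage_location(storage_location_id=12345)
+
+# Inspect the setting that now backs uploads to this folder.
+setting = folder.get_project_setting(setting_type="upload")
+print(setting)
+
+# Created/Updated -> Deleted: remove the setting to return to the default storage.
+if setting:
+    folder.delete_project_setting(setting_id=setting["id"])
+```
+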
+ +## API Layer Architecture + +The storage location services module provides async functions that wrap the +Synapse REST API endpoints. This layer handles serialization and error handling. + +```mermaid +flowchart TB + subgraph "Model Layer" + SL[StorageLocation] + SLCM[StorageLocationConfigurable Mixin] + end + + subgraph "API Layer (storage_location_services.py)" + create_sls[create_storage_location_setting] + get_sls[get_storage_location_setting] + get_ps[get_project_setting] + create_ps[create_project_setting] + update_ps[update_project_setting] + delete_ps[delete_project_setting] + end + + subgraph "REST Endpoints" + POST_SL["POST /storageLocation"] + GET_SL["GET /storageLocation/{id}"] + GET_PS["GET /projectSettings/{id}/type/{type}"] + POST_PS["POST /projectSettings"] + PUT_PS["PUT /projectSettings"] + DELETE_PS["DELETE /projectSettings/{id}"] + end + + SL --> create_sls --> POST_SL + SL --> get_sls --> GET_SL + + SLCM --> get_ps --> GET_PS + SLCM --> create_ps --> POST_PS + SLCM --> update_ps --> PUT_PS + SLCM --> delete_ps --> DELETE_PS +``` + +
+ +### REST API Reference + +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/storageLocation` | Create a new storage location setting | +| GET | `/storageLocation/{id}` | Retrieve a storage location by ID | +| GET | `/projectSettings/{projectId}/type/{type}` | Get project settings for an entity | +| POST | `/projectSettings` | Create a new project setting | +| PUT | `/projectSettings` | Update an existing project setting | +| DELETE | `/projectSettings/{id}` | Delete a project setting | + +--- + +
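+
+The API layer can also be called directly when the object models are not needed. A
+sketch using the async wrappers; the entity ID is a placeholder, and the
+`concreteType` value assumes the fully qualified Synapse REST name for the
+`S3StorageLocationSetting` shown in the type mapping above.
+
+```python
+import asyncio
+
+import synapseclient
+from synapseclient.api import create_storage_location_setting, get_project_setting
+
+
+async def main() -> None:
+    # The wrappers pick up the cached, logged-in client by default.
+    created = await create_storage_location_setting(
+        body={
+            "concreteType": (
+                "org.sagebionetworks.repo.model.project.S3StorageLocationSetting"
+            ),
+            "uploadType": "S3",
+        }
+    )
+    print(created["storageLocationId"])
+
+    # Returns None when the entity has no "upload" setting of its own.
+    setting = await get_project_setting(project_id="syn123", setting_type="upload")
+    print(setting)
+
+
+synapseclient.login()
+asyncio.run(main())
+```
+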
+ +## Async/Sync Pattern + +The StorageLocation system follows the Python client's `@async_to_sync` pattern, +providing both async and sync versions of all methods. + +```mermaid +flowchart LR + subgraph "User Code" + SyncCall["folder.set_storage_location()"] + AsyncCall["await folder.set_storage_location_async()"] + end + + subgraph "@async_to_sync Decorator" + Wrapper["Sync wrapper"] + AsyncMethod["Async implementation"] + end + + subgraph "Event Loop" + RunSync["wrap_async_to_sync()"] + AsyncIO["asyncio"] + end + + SyncCall --> Wrapper + Wrapper --> RunSync + RunSync --> AsyncIO + AsyncIO --> AsyncMethod + + AsyncCall --> AsyncMethod +``` + +
+ +### Method Pairs + +| Sync Method | Async Method | +|-------------|--------------| +| `StorageLocation.store()` | `StorageLocation.store_async()` | +| `StorageLocation.get()` | `StorageLocation.get_async()` | +| `StorageLocation.setup_s3()` | `StorageLocation.setup_s3_async()` | +| `folder.set_storage_location()` | `folder.set_storage_location_async()` | +| `folder.get_project_setting()` | `folder.get_project_setting_async()` | +| `folder.delete_project_setting()` | `folder.delete_project_setting_async()` | +| `folder.get_sts_storage_token()` | `folder.get_sts_storage_token_async()` | +| `folder.index_files_for_migration()` | `folder.index_files_for_migration_async()` | +| `folder.migrate_indexed_files()` | `folder.migrate_indexed_files_async()` | + +--- + +
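+
+Both halves of a pair are interchangeable. A minimal sketch of the two calling
+styles, using the Synapse-managed S3 type so no extra configuration is needed:
+
+```python
+import asyncio
+
+import synapseclient
+from synapseclient.models import StorageLocation, StorageLocationType
+
+synapseclient.login()
+
+# Sync style: the wrapper drives the async implementation on an event loop for you.
+location = StorageLocation(storage_type=StorageLocationType.SYNAPSE_S3).store()
+print(location.storage_location_id)
+
+
+async def main() -> None:
+    # Async style: await the *_async variant inside your own event loop.
+    async_location = await StorageLocation(
+        storage_type=StorageLocationType.SYNAPSE_S3
+    ).store_async()
+    print(async_location.storage_location_id)
+
+
+asyncio.run(main())
+```
+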
+
+ +# Part 4: Migration + +This section covers the file migration system. + +
+ +## Migration Flow + +File migration is a two-phase process that moves files from one storage location +to another while preserving Synapse metadata. + +```mermaid +sequenceDiagram + participant User + participant Entity as Project/Folder + participant IndexFn as index_files_for_migration + participant DB as SQLite Database + participant MigrateFn as migrate_indexed_files + participant Synapse as Synapse REST API + + rect rgb(240, 248, 255) + Note over User,Synapse: Phase 1: Index Files + User->>Entity: index_files_for_migration(dest_id, db_path) + activate Entity + + Entity->>IndexFn: Start indexing + activate IndexFn + + IndexFn->>Synapse: Query entity tree + Synapse-->>IndexFn: File list + + loop For each file + IndexFn->>Synapse: Get file metadata + Synapse-->>IndexFn: File info + IndexFn->>DB: Record file for migration + end + + IndexFn-->>Entity: MigrationResult (indexed counts) + deactivate IndexFn + + Entity-->>User: MigrationResult + deactivate Entity + end + + rect rgb(255, 248, 240) + Note over User,Synapse: Phase 2: Migrate Files + User->>Entity: migrate_indexed_files(db_path) + activate Entity + + Entity->>MigrateFn: Start migration + activate MigrateFn + + MigrateFn->>DB: Read indexed files + + loop For each indexed file + MigrateFn->>Synapse: Copy file to new storage + Synapse-->>MigrateFn: Success/Failure + MigrateFn->>DB: Update status + end + + MigrateFn-->>Entity: MigrationResult (migrated counts) + deactivate MigrateFn + + Entity-->>User: MigrationResult + deactivate Entity + end +``` + +
+ +### Migration Strategies + +| Strategy | Description | +|----------|-------------| +| `new` | Create new file versions in destination (default) | +| `all` | Migrate all versions of each file | +| `latest` | Only migrate the latest version | +| `skip` | Skip if file already exists in destination | + +--- + +
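+
+A sketch of the two phases from a container entity. The IDs and database path are
+placeholders, and the parameter names follow the mixin methods shown earlier.
+
+```python
+import synapseclient
+from synapseclient.models import Project
+
+synapseclient.login()
+
+project = Project(id="syn123").get()  # placeholder ID
+
+# Phase 1: record which files need to move into a local SQLite index.
+index_result = project.index_files_for_migration(
+    dest_storage_location_id=12345,    # placeholder destination location
+    db_path="/tmp/storage_migration.db",
+)
+print(index_result)
+
+# Phase 2: copy the indexed files into the destination storage location.
+migrate_result = project.migrate_indexed_files(db_path="/tmp/storage_migration.db")
+print(migrate_result)
+```
+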
+
+ +# Learn More + +| Resource | Description | +|----------|-------------| +| [Storage Location Tutorial](../tutorials/python/storage_location.md) | Step-by-step guide to using storage locations | +| [StorageLocation API Reference][synapseclient.models.StorageLocation] | Complete API documentation | +| [StorageLocationConfigurable Mixin][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin methods for Projects and Folders | +| [Custom Storage Locations (Synapse Docs)](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) | Official Synapse documentation | diff --git a/docs/js/mermaid-init.js b/docs/js/mermaid-init.js new file mode 100644 index 000000000..823cbce57 --- /dev/null +++ b/docs/js/mermaid-init.js @@ -0,0 +1,12 @@ +// Initialize Mermaid diagrams +document.addEventListener("DOMContentLoaded", function() { + mermaid.initialize({ + startOnLoad: true, + theme: "default", + securityLevel: "loose", + flowchart: { + useMaxWidth: true, + htmlLabels: true + } + }); +}); diff --git a/docs/reference/experimental/async/folder.md b/docs/reference/experimental/async/folder.md index 7b29f84ea..fd74e65dd 100644 --- a/docs/reference/experimental/async/folder.md +++ b/docs/reference/experimental/async/folder.md @@ -30,3 +30,9 @@ at your own risk. - get_schema_derived_keys_async - get_schema_validation_statistics_async - get_invalid_validation_async + - set_storage_location_async + - get_project_setting_async + - delete_project_setting_async + - get_sts_storage_token_async + - index_files_for_migration_async + - migrate_indexed_files_async diff --git a/docs/reference/experimental/async/project.md b/docs/reference/experimental/async/project.md index e3adfa9fc..42803e871 100644 --- a/docs/reference/experimental/async/project.md +++ b/docs/reference/experimental/async/project.md @@ -29,3 +29,9 @@ at your own risk. - get_schema_derived_keys_async - get_schema_validation_statistics_async - get_invalid_validation_async + - set_storage_location_async + - get_project_setting_async + - delete_project_setting_async + - get_sts_storage_token_async + - index_files_for_migration_async + - migrate_indexed_files_async diff --git a/docs/reference/experimental/async/storage_location.md b/docs/reference/experimental/async/storage_location.md new file mode 100644 index 000000000..00e03fc47 --- /dev/null +++ b/docs/reference/experimental/async/storage_location.md @@ -0,0 +1,23 @@ +# StorageLocation + +Contained within this file are experimental interfaces for working with the Synapse Python +Client. Unless otherwise noted these interfaces are subject to change at any time. Use +at your own risk. + +## API Reference + +::: synapseclient.models.StorageLocation + options: + inherited_members: true + members: + - store_async + - get_async + - setup_s3_async + +--- + +::: synapseclient.models.StorageLocationType + +--- + +::: synapseclient.models.UploadType diff --git a/docs/reference/experimental/mixins/manifest_generatable.md b/docs/reference/experimental/mixins/manifest_generatable.md new file mode 100644 index 000000000..47aac2a4c --- /dev/null +++ b/docs/reference/experimental/mixins/manifest_generatable.md @@ -0,0 +1,69 @@ +# ManifestGeneratable Mixin + +The `ManifestGeneratable` mixin provides manifest TSV file generation and reading capabilities for container entities (Projects and Folders). 
+ +## Overview + +This mixin enables: + +- Generating manifest TSV files after syncing from Synapse +- Uploading files from manifest TSV files +- Validating manifest files before upload + +## Usage + +The mixin is automatically available on `Project` and `Folder` classes: + +```python +from synapseclient.models import Project, Folder + +# Project and Folder both have manifest capabilities +project = Project(id="syn123") +folder = Folder(id="syn456") +``` + +## API Reference + +::: synapseclient.models.mixins.manifest.ManifestGeneratable + options: + show_root_heading: true + show_source: false + members: + - generate_manifest + - generate_manifest_async + - from_manifest + - from_manifest_async + - validate_manifest + - validate_manifest_async + - get_manifest_data + - get_manifest_data_async + +## Constants + +### MANIFEST_FILENAME + +The default filename for generated manifests: `SYNAPSE_METADATA_MANIFEST.tsv` + +```python +from synapseclient.models import MANIFEST_FILENAME + +print(MANIFEST_FILENAME) # "SYNAPSE_METADATA_MANIFEST.tsv" +``` + +### DEFAULT_GENERATED_MANIFEST_KEYS + +The default columns included in generated manifest files: + +```python +from synapseclient.models import DEFAULT_GENERATED_MANIFEST_KEYS + +print(DEFAULT_GENERATED_MANIFEST_KEYS) +# ['path', 'parent', 'name', 'id', 'synapseStore', 'contentType', +# 'used', 'executed', 'activityName', 'activityDescription'] +``` + +## See Also + +- [Manifest Operations Tutorial](../../../tutorials/python/manifest_operations.md) +- [StorableContainer Mixin](storable_container.md) +- [Manifest TSV Format](../../../explanations/manifest_tsv.md) diff --git a/docs/reference/experimental/mixins/storage_location_configurable.md b/docs/reference/experimental/mixins/storage_location_configurable.md new file mode 100644 index 000000000..3cf29d81a --- /dev/null +++ b/docs/reference/experimental/mixins/storage_location_configurable.md @@ -0,0 +1,54 @@ +# StorageLocationConfigurable + +The `StorageLocationConfigurable` mixin provides methods for managing storage locations +on entities (Projects and Folders). + +For architecture diagrams and design documentation, see +[Storage Location Architecture](../../../explanations/storage_location_architecture.md). + +This mixin includes: + +- Setting upload storage locations +- Getting and deleting project settings +- Obtaining STS credentials for direct S3 access +- Migrating files to new storage locations + +## Methods Overview + +| Method | Description | +|--------|-------------| +| `set_storage_location` | Set the upload storage location for this entity | +| `get_project_setting` | Get project settings (upload, external_sync, etc.) 
| +| `delete_project_setting` | Delete a project setting | +| `get_sts_storage_token` | Get STS credentials for direct S3 access | +| `index_files_for_migration` | Index files for migration to a new storage location | +| `migrate_indexed_files` | Migrate previously indexed files | + +## Usage Example + +```python +from synapseclient.models import Folder, StorageLocation, StorageLocationType + +# Create a storage location +storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket="my-bucket", + sts_enabled=True, +).store() + +# Set storage location on a folder +folder = Folder(id="syn123").get() +folder.set_storage_location(storage_location_id=storage.storage_location_id) + +# Get STS credentials +credentials = folder.get_sts_storage_token( + permission="read_write", + output_format="boto", +) +``` + +::: synapseclient.models.mixins.StorageLocationConfigurable + +--- + +::: synapseclient.models.protocols.storage_location_mixin_protocol.StorageLocationConfigurableSynchronousProtocol diff --git a/docs/reference/experimental/sync/folder.md b/docs/reference/experimental/sync/folder.md index 43272ea30..c866a727e 100644 --- a/docs/reference/experimental/sync/folder.md +++ b/docs/reference/experimental/sync/folder.md @@ -41,3 +41,9 @@ at your own risk. - get_schema_derived_keys - get_schema_validation_statistics - get_invalid_validation + - set_storage_location + - get_project_setting + - delete_project_setting + - get_sts_storage_token + - index_files_for_migration + - migrate_indexed_files diff --git a/docs/reference/experimental/sync/project.md b/docs/reference/experimental/sync/project.md index 4e2f35a26..1bb859795 100644 --- a/docs/reference/experimental/sync/project.md +++ b/docs/reference/experimental/sync/project.md @@ -40,3 +40,9 @@ at your own risk. - get_schema_derived_keys - get_schema_validation_statistics - get_invalid_validation + - set_storage_location + - get_project_setting + - delete_project_setting + - get_sts_storage_token + - index_files_for_migration + - migrate_indexed_files diff --git a/docs/reference/experimental/sync/storage_location.md b/docs/reference/experimental/sync/storage_location.md new file mode 100644 index 000000000..a764c9d7d --- /dev/null +++ b/docs/reference/experimental/sync/storage_location.md @@ -0,0 +1,24 @@ +[](){ #storage-location-reference-sync } +# StorageLocation + +Contained within this file are experimental interfaces for working with the Synapse Python +Client. Unless otherwise noted these interfaces are subject to change at any time. Use +at your own risk. + +## API Reference + +::: synapseclient.models.StorageLocation + options: + inherited_members: true + members: + - store + - get + - setup_s3 + +--- + +::: synapseclient.models.StorageLocationType + +--- + +::: synapseclient.models.UploadType diff --git a/docs/tutorials/python/manifest_operations.md b/docs/tutorials/python/manifest_operations.md new file mode 100644 index 000000000..25362a347 --- /dev/null +++ b/docs/tutorials/python/manifest_operations.md @@ -0,0 +1,328 @@ +# Manifest Operations + +This tutorial covers how to work with manifest TSV files for bulk file operations in Synapse. Manifest files provide a way to track file metadata, download files with their annotations, and upload files with provenance information. + +## Overview + +A manifest file is a tab-separated values (TSV) file that contains metadata about files in Synapse. 
The manifest includes: + +- File paths and Synapse IDs +- Parent container IDs +- Annotations +- Provenance information (used/executed references) + +## Generating Manifests During Download + +When syncing files from Synapse, you can automatically generate a manifest file that captures all file metadata. + +### Using sync_from_synapse with Manifest Generation + +```python +from synapseclient.models import Project +import synapseclient + +synapseclient.login() + +# Download a project with manifest generation at each directory level +project = Project(id="syn123456").sync_from_synapse( + path="/path/to/download", + generate_manifest="all" +) + +# Or generate a single manifest at the root level only +project = Project(id="syn123456").sync_from_synapse( + path="/path/to/download", + generate_manifest="root" +) +``` + +### Manifest Generation Options + +The `generate_manifest` parameter accepts three values: + +| Value | Description | +|-------|-------------| +| `"suppress"` | (Default) Do not create any manifest files | +| `"root"` | Create a single manifest at the root download path | +| `"all"` | Create a manifest in each directory level | + +### Generating Manifest Separately + +You can also generate a manifest after syncing: + +```python +from synapseclient.models import Project +import synapseclient + +synapseclient.login() + +# First sync without manifest +project = Project(id="syn123456").sync_from_synapse( + path="/path/to/download" +) + +# Then generate manifest separately +manifest_path = project.generate_manifest( + path="/path/to/download", + manifest_scope="root" +) +print(f"Manifest created at: {manifest_path}") +``` + +## Manifest File Format + +The generated manifest file (`SYNAPSE_METADATA_MANIFEST.tsv`) contains the following columns: + +| Column | Description | +|--------|-------------| +| `path` | Local file path | +| `parent` | Synapse ID of the parent container | +| `name` | File name in Synapse | +| `id` | Synapse file ID | +| `synapseStore` | Whether the file is stored in Synapse | +| `contentType` | MIME type of the file | +| `used` | Provenance - entities used to create this file | +| `executed` | Provenance - code/scripts executed | +| `activityName` | Name of the provenance activity | +| `activityDescription` | Description of the provenance activity | +| *custom columns* | Any annotations on the files | + +### Example Manifest + +```tsv +path parent name id synapseStore contentType used executed activityName activityDescription study dataType +/data/file1.csv syn123 file1.csv syn456 True text/csv Data Processing Study1 RNA-seq +/data/file2.csv syn123 file2.csv syn789 True text/csv syn456 Analysis Processed from file1 Study1 RNA-seq +``` + +## Uploading Files from a Manifest + +You can upload files to Synapse using a manifest file: + +```python +from synapseclient.models import Project +import synapseclient + +synapseclient.login() + +# Upload files from a manifest +files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123456" +) + +for file in files: + print(f"Uploaded: {file.name} ({file.id})") +``` + +### Dry Run Validation + +Before uploading, you can validate the manifest: + +```python +from synapseclient.models import Project + +# Validate without uploading +is_valid, errors = Project.validate_manifest( + manifest_path="/path/to/manifest.tsv" +) + +if is_valid: + print("Manifest is valid, ready for upload") +else: + for error in errors: + print(f"Error: {error}") +``` + +Or use the `dry_run` option to validate the manifest and 
see what would be uploaded without making changes: + +```python +# Dry run - validates and returns what would be uploaded, but doesn't upload +files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123456", + dry_run=True # Validate only, no actual upload +) +print(f"Would upload {len(files)} files") +``` + +The `dry_run` parameter is useful for: + +- Validating manifest format before committing to an upload +- Testing your manifest configuration +- Previewing which files will be affected + +## Working with Annotations + +Annotations in the manifest are automatically handled: + +### On Download + +When generating a manifest, all file annotations are included as additional columns: + +```python +project = Project(id="syn123456").sync_from_synapse( + path="/path/to/download", + generate_manifest="root" +) +# Annotations appear as columns in the manifest +``` + +### On Upload + +Any columns in the manifest that aren't standard fields become annotations: + +```tsv +path parent study dataType specimenType +/data/file1.csv syn123 Study1 RNA-seq tissue +``` + +```python +files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123456", + merge_existing_annotations=True # Merge with existing annotations +) +``` + +## Working with Provenance + +### On Download + +Provenance information is captured in the `used`, `executed`, `activityName`, and `activityDescription` columns: + +```python +project = Project(id="syn123456").sync_from_synapse( + path="/path/to/download", + include_activity=True, # Include provenance + generate_manifest="root" +) +``` + +### On Upload + +You can specify provenance in the manifest: + +```tsv +path parent used executed activityName activityDescription +/data/output.csv syn123 syn456;syn789 https://github.com/repo/script.py Analysis Generated from input files +``` + +- Multiple references are separated by semicolons (`;`) +- References can be Synapse IDs, URLs, or local file paths + +## Synapse Download List Integration + +The manifest functionality integrates with Synapse's Download List feature. You can generate a manifest directly from your Synapse download list, which is useful for exporting metadata about files you've queued for download in the Synapse web interface. 
+ +### Generating Manifest from Download List + +```python +from synapseclient.models import Project +import synapseclient + +synapseclient.login() + +# Generate a manifest from your Synapse download list +manifest_path = Project.generate_download_list_manifest( + download_path="/path/to/save/manifest" +) +print(f"Manifest downloaded to: {manifest_path}") +``` + +### Custom CSV Formatting + +You can customize the manifest format: + +```python +from synapseclient.models import Project +import synapseclient + +synapseclient.login() + +# Generate a tab-separated manifest +manifest_path = Project.generate_download_list_manifest( + download_path="/path/to/save/manifest", + csv_separator="\t", # Tab-separated + include_header=True +) +``` + +### Using DownloadListManifestRequest Directly + +For more control over the manifest generation process, use the `DownloadListManifestRequest` class directly: + +```python +from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor +import synapseclient + +synapseclient.login() + +# Create a request with custom CSV formatting +request = DownloadListManifestRequest( + csv_table_descriptor=CsvTableDescriptor( + separator="\t", + quote_character='"', + is_first_line_header=True + ) +) + +# Send the job and wait for completion +request.send_job_and_wait() + +# Download the generated manifest +manifest_path = request.download_manifest(download_path="/path/to/download") +print(f"Manifest file handle: {request.result_file_handle_id}") +``` + +## Best Practices + +1. **Use `generate_manifest="root"` for simple cases** - Creates a single manifest at the root level, easier to manage. + +2. **Use `generate_manifest="all"` for complex hierarchies** - Creates manifests at each directory level, useful for large projects with many subdirectories. + +3. **Validate manifests before upload** - Use `validate_manifest()` or `dry_run=True` to catch errors early. + +4. **Include provenance information** - Set `include_activity=True` when syncing to capture provenance in the manifest. + +5. **Backup your manifest** - The manifest is a valuable record of your data and its metadata. + +## Async API + +All manifest operations are available as async methods: + +```python +import asyncio +from synapseclient.models import Project +import synapseclient + +async def main(): + synapseclient.login() + + # Async sync with manifest + project = Project(id="syn123456") + await project.sync_from_synapse_async( + path="/path/to/download", + generate_manifest="root" + ) + + # Async manifest generation + manifest_path = await project.generate_manifest_async( + path="/path/to/download", + manifest_scope="root" + ) + + # Async upload from manifest + files = await Project.from_manifest_async( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123456" + ) + +asyncio.run(main()) +``` + +## See Also + +- [Download Data in Bulk](download_data_in_bulk.md) +- [Upload Data in Bulk](upload_data_in_bulk.md) +- [Manifest TSV Format](../../explanations/manifest_tsv.md) diff --git a/docs/tutorials/python/storage_location.md b/docs/tutorials/python/storage_location.md new file mode 100644 index 000000000..41dd5036c --- /dev/null +++ b/docs/tutorials/python/storage_location.md @@ -0,0 +1,135 @@ +# Storage Locations in Synapse + +Storage locations allow you to configure where files uploaded to Synapse are +stored. 
By default, files are stored in Synapse's internal S3 storage, but you +can configure projects or folders to use your own AWS S3 buckets, Google Cloud +Storage buckets, or other external storage. + +This tutorial demonstrates how to use the Python client to manage storage +locations using the new object-oriented models. + +[Read more about Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) + +## Tutorial Purpose +In this tutorial you will: + +1. Create an external S3 storage location +2. Set up a folder backed by external S3 storage +3. Create an STS-enabled storage location for direct S3 access +4. Use STS credentials with boto3 +5. Retrieve and inspect storage location settings + +## Prerequisites + +* Make sure that you have completed the [Installation](../installation.md) and + [Authentication](../authentication.md) setup. +* You must have a [Project](./project.md) created and replace the one used in + this tutorial. +* An AWS S3 bucket properly configured for use with Synapse, including an + `owner.txt` file. See + [Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html). +* (Optional) `boto3` installed for STS credential examples. + +## Understanding Storage Location Types + +Synapse supports several types of storage locations: + +- **SYNAPSE_S3**: Synapse-managed S3 storage (default) +- **EXTERNAL_S3**: User-owned Amazon S3 bucket accessed by Synapse +- **EXTERNAL_GOOGLE_CLOUD**: User-owned Google Cloud Storage bucket +- **EXTERNAL_SFTP**: External SFTP server not accessed by Synapse +- **EXTERNAL_OBJECT_STORE**: S3-like bucket (e.g., OpenStack) not accessed by Synapse +- **PROXY**: A proxy server that controls access to storage + +## STS-Enabled Storage + +STS (AWS Security Token Service) enabled storage locations allow users to get +temporary AWS credentials for direct S3 access. This is useful for: + +- Uploading large files directly to S3 +- Using AWS tools like the AWS CLI or boto3 +- Performing bulk operations on files + +## 1. Set up and get project + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=5-12} +``` + +## 2. Create an external S3 storage location + +Create a storage location backed by your own S3 bucket. The bucket must be +properly configured with an `owner.txt` file. + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=14-27} +``` + +
+ You'll notice the output looks like: + +``` +Created storage location: 12345 +Type: StorageLocationType.EXTERNAL_S3 +Bucket: my-synapse-bucket +``` +
+ +## 3. Set up a folder with external S3 storage + +The `setup_s3` convenience method handles creating the folder, storage location, +and project settings in a single call. + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=29-38} +``` + +## 4. Create an STS-enabled storage location + +STS-enabled storage locations allow you to get temporary AWS credentials for +direct S3 access. + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=40-50} +``` + +## 5. Use STS credentials with boto3 + +Once you have an STS-enabled folder, you can get temporary credentials to +access the underlying S3 bucket directly. + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=52-72} +``` + +## 6. Retrieve and inspect storage location settings + +You can retrieve your storage location settings and inspect their configuration. + +```python +{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=74-86} +``` + +## Source code for this tutorial + +
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!}
+```
+
+ +## References used in this tutorial + +- [StorageLocation][synapseclient.models.StorageLocation] +- [StorageLocationType][synapseclient.models.StorageLocationType] +- [Folder][synapseclient.models.Folder] +- [Project][synapseclient.models.Project] +- [syn.login][synapseclient.Synapse.login] +- [Custom Storage Locations Documentation](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) + +## See also + +- [Storage Location Architecture](../../explanations/storage_location_architecture.md) - + In-depth architecture diagrams and design documentation diff --git a/docs/tutorials/python/tutorial_scripts/storage_location.py b/docs/tutorials/python/tutorial_scripts/storage_location.py new file mode 100644 index 000000000..9fe81ff6e --- /dev/null +++ b/docs/tutorials/python/tutorial_scripts/storage_location.py @@ -0,0 +1,86 @@ +""" +Here is where you'll find the code for the Storage Location tutorial. +""" + +# Step 1: Create an External S3 Storage Location +import synapseclient +from synapseclient.models import Project, StorageLocation, StorageLocationType + +syn = synapseclient.login() + +# Retrieve the project +my_project = Project(name="My uniquely named project about Alzheimer's Disease").get() + +# Step 2: Create an External S3 Storage Location +# Replace with your S3 bucket name (must have owner.txt configured) +MY_BUCKET_NAME = "my-synapse-bucket" +MY_BASE_KEY = "synapse-data" + +storage_location = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket=MY_BUCKET_NAME, + base_key=MY_BASE_KEY, +).store() + +print(f"Created storage location: {storage_location.storage_location_id}") +print(f"Type: {storage_location.storage_type}") +print(f"Bucket: {storage_location.bucket}") + +# Step 3: Set up a folder with external S3 storage +folder, storage = StorageLocation.setup_s3( + folder_name="my-external-storage-folder", + parent=my_project.id, + bucket_name=MY_BUCKET_NAME, + base_key="folder-specific-prefix", +) + +print(f"Created folder: {folder.id}") +print(f"Storage location ID: {storage.storage_location_id}") + +# Step 4: Create an STS-enabled storage location +sts_folder, sts_storage = StorageLocation.setup_s3( + folder_name="my-sts-enabled-folder", + parent=my_project.id, + bucket_name=MY_BUCKET_NAME, + base_key="sts-data", + sts_enabled=True, +) + +print(f"Created STS-enabled folder: {sts_folder.id}") +print(f"STS enabled: {sts_storage.sts_enabled}") + +# Step 5: Use STS credentials with boto3 +credentials = sts_folder.get_sts_storage_token( + permission="read_write", + output_format="boto", +) + +print(f"AWS Access Key ID: {credentials['aws_access_key_id'][:10]}...") +print("Credentials expire: check 'expiration' in json format") + +try: + import boto3 + + s3_client = boto3.client("s3", **credentials) + response = s3_client.list_objects_v2( + Bucket=MY_BUCKET_NAME, + Prefix="sts-data/", + MaxKeys=10, + ) + print(f"Found {response.get('KeyCount', 0)} objects") +except ImportError: + print("boto3 not installed, skipping S3 client example") + +# Step 6: Retrieve and inspect storage location settings +retrieved_storage = StorageLocation( + storage_location_id=storage_location.storage_location_id +).get() + +print("Retrieved storage location:") +print(f" ID: {retrieved_storage.storage_location_id}") +print(f" Type: {retrieved_storage.storage_type}") +print(f" Bucket: {retrieved_storage.bucket}") +print(f" Base Key: {retrieved_storage.base_key}") +print(f" STS Enabled: {retrieved_storage.sts_enabled}") +print(f" Created By: {retrieved_storage.created_by}") 
+print(f" Created On: {retrieved_storage.created_on}") diff --git a/mkdocs.yml b/mkdocs.yml index 85a237d0c..7461f91f3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,8 +45,10 @@ nav: # - Team: tutorials/python/team.md - Upload data in bulk: tutorials/python/upload_data_in_bulk.md - Download data in bulk: tutorials/python/download_data_in_bulk.md + - Manifest Operations: tutorials/python/manifest_operations.md - Creating JSON Schema: tutorials/python/schema_operations.md - Working with JSON Schema: tutorials/python/json_schema.md + - Storage Location: tutorials/python/storage_location.md # - Move Files and Folders: tutorials/python/move_files_and_folders.md # - Migrate data to other storage locations: tutorials/python/migrate_data_to_other_storage_locations.md - Working with the Command Line Client: tutorials/command_line_client.md @@ -111,6 +113,7 @@ nav: - JSONSchema: reference/experimental/sync/json_schema.md - Wiki: reference/experimental/sync/wiki.md - FormGroup and Form: reference/experimental/sync/form.md + - StorageLocation: reference/experimental/sync/storage_location.md - Extensions: - Curator: reference/extensions/curator.md - Asynchronous: @@ -139,15 +142,18 @@ nav: - JSONSchema: reference/experimental/async/json_schema.md - Wiki: reference/experimental/async/wiki.md - FormGroup and Form: reference/experimental/async/form.md + - StorageLocation: reference/experimental/async/storage_location.md - Mixins: - AccessControllable: reference/experimental/mixins/access_controllable.md - StorableContainer: reference/experimental/mixins/storable_container.md + - ManifestGeneratable: reference/experimental/mixins/manifest_generatable.md - AsynchronousCommunicator: reference/experimental/mixins/asynchronous_communicator.md - FailureStrategy: reference/experimental/mixins/failure_strategy.md - BaseJSONSchema: reference/experimental/mixins/base_json_schema.md - ContainerEntityJSONSchema: reference/experimental/mixins/container_json_schema.md - FormData: reference/experimental/mixins/form_data.md - FormGroup: reference/experimental/mixins/form_group.md + - StorageLocationConfigurable: reference/experimental/mixins/storage_location_configurable.md - Further Reading: - Home: explanations/home.md @@ -159,6 +165,7 @@ nav: - Structuring Your Project: explanations/structuring_your_project.md - Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md - Curator Data model: explanations/curator_data_model.md + - Storage Location Architecture: explanations/storage_location_architecture.md - News: - news.md - Contact Us: https://sagebionetworks.jira.com/servicedesk/customer/portal/9/group/16/create/206 @@ -201,6 +208,10 @@ theme: extra_css: - css/custom.css +extra_javascript: + - https://unpkg.com/mermaid@10/dist/mermaid.min.js + - js/mermaid-init.js + plugins: - search - mkdocstrings: diff --git a/synapseclient/api/__init__.py b/synapseclient/api/__init__.py index 6b0961677..13e97c701 100644 --- a/synapseclient/api/__init__.py +++ b/synapseclient/api/__init__.py @@ -130,6 +130,14 @@ update_organization_acl, validate_entity_with_json_schema, ) +from .storage_location_services import ( + create_project_setting, + create_storage_location_setting, + delete_project_setting, + get_project_setting, + get_storage_location_setting, + update_project_setting, +) from .table_services import ( ViewEntityType, ViewTypeMask, @@ -357,4 +365,11 @@ "create_form_data", "list_form_data", "list_form_data_sync", + # storage_location_services + "create_storage_location_setting", + 
"get_storage_location_setting", + "get_project_setting", + "create_project_setting", + "update_project_setting", + "delete_project_setting", ] diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py new file mode 100644 index 000000000..c73c7e8cc --- /dev/null +++ b/synapseclient/api/storage_location_services.py @@ -0,0 +1,169 @@ +"""Services for interacting with storage location settings and project settings in Synapse. + +This module provides async REST wrappers for creating, retrieving, and managing +storage location settings and their associated project settings. +""" + +import json +from typing import TYPE_CHECKING, Any, Dict, Optional + +if TYPE_CHECKING: + from synapseclient import Synapse + + +async def create_storage_location_setting( + body: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a new storage location setting in Synapse. + + Storage location creation is idempotent per user - if the same user creates + a storage location with identical properties, the existing one is returned. + + Arguments: + body: The storage location setting request body containing concreteType + and other type-specific fields. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created or existing storage location setting as a dictionary. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/storageLocation", + body=json.dumps(body), + ) + + +async def get_storage_location_setting( + storage_location_id: int, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Retrieve a storage location setting by its ID. + + Only the creator of a StorageLocationSetting can retrieve it by its ID. + + Arguments: + storage_location_id: The ID of the storage location setting to retrieve. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The storage location setting as a dictionary. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async( + uri=f"/storageLocation/{storage_location_id}", + ) + + +async def get_project_setting( + project_id: str, + setting_type: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Optional[Dict[str, Any]]: + """Get the project setting for an entity. + + Arguments: + project_id: The Synapse ID of the project or folder. + setting_type: The type of setting to retrieve. One of: + 'upload', 'external_sync', 'requester_pays'. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The project setting as a dictionary, or None if no setting exists. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + response = await client.rest_get_async( + uri=f"/projectSettings/{project_id}/type/{setting_type}", + ) + # If no project setting, an empty string is returned as the response + return response if response else None + + +async def create_project_setting( + body: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a new project setting. + + Arguments: + body: The project setting request body. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created project setting as a dictionary. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/projectSettings", + body=json.dumps(body), + ) + + +async def update_project_setting( + body: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Update an existing project setting. + + Arguments: + body: The project setting request body including the id field. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The updated project setting as a dictionary. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_put_async( + uri="/projectSettings", + body=json.dumps(body), + ) + + +async def delete_project_setting( + setting_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> None: + """Delete a project setting. + + Arguments: + setting_id: The ID of the project setting to delete. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + None + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + await client.rest_delete_async( + uri=f"/projectSettings/{setting_id}", + ) diff --git a/synapseclient/client.py b/synapseclient/client.py index 2e9c543cb..35d521a27 100644 --- a/synapseclient/client.py +++ b/synapseclient/client.py @@ -5512,6 +5512,11 @@ def _createExternalObjectStoreFileHandle( "/externalFileHandle", json.dumps(file_handle), self.fileHandleEndpoint ) + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `synapseclient.api.post_external_s3_file_handle()` instead.", + ) def create_external_s3_file_handle( self, bucket_name, @@ -5650,7 +5655,11 @@ def _getUserCredentials( # Project/Folder storage location settings # ############################################ - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `StorageLocation(...).store()` from synapseclient.models instead.", + ) def createStorageLocationSetting(self, storage_type, **kwargs): """ Creates an IMMUTABLE storage location based on the specified type. 
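+# A minimal sketch of the full flow using the new service wrappers: create an
+# external S3 storage location, then point a project's upload setting at it.
+# The bucket name, base key, and "syn123" are placeholders, and the request/response
+# field names (e.g. "storageLocationId") are assumed to follow the Synapse REST models.
+import asyncio
+
+from synapseclient import Synapse
+from synapseclient.api import (
+    create_project_setting,
+    create_storage_location_setting,
+)
+from synapseclient.core.constants import concrete_types
+
+syn = Synapse()
+syn.login()
+
+
+async def main():
+    # Create (or return the existing, identical) storage location setting
+    location = await create_storage_location_setting(
+        body={
+            "concreteType": concrete_types.EXTERNAL_S3_STORAGE_LOCATION_SETTING,
+            "bucket": "my-example-bucket",
+            "baseKey": "my/base/key",
+            "uploadType": "S3",
+        }
+    )
+    # Point the project's upload destination at the new storage location
+    await create_project_setting(
+        body={
+            "concreteType": concrete_types.UPLOAD_DESTINATION_LIST_SETTING,
+            "settingsType": "upload",
+            "locations": [location["storageLocationId"]],
+            "projectId": "syn123",
+        }
+    )
+
+
+asyncio.run(main())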
@@ -5707,7 +5716,12 @@ def createStorageLocationSetting(self, storage_type, **kwargs): return self.restPOST("/storageLocation", body=json.dumps(kwargs)) - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `StorageLocation(storage_location_id=id).get()` from " + "synapseclient.models instead.", + ) def getMyStorageLocationSetting(self, storage_location_id): """ Get a StorageLocationSetting by its id. @@ -5721,7 +5735,12 @@ def getMyStorageLocationSetting(self, storage_location_id): """ return self.restGET("/storageLocation/%s" % storage_location_id) - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `Folder(id=...).set_storage_location(...)` or " + "`Project(id=...).set_storage_location(...)` from synapseclient.models instead.", + ) def setStorageLocation(self, entity, storage_location_id): """ Sets the storage location for a Project or Folder @@ -5759,7 +5778,12 @@ def setStorageLocation(self, entity, storage_location_id): "/projectSettings", body=json.dumps(project_destination) ) - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `Folder(id=...).get_project_setting(...)` or " + "`Project(id=...).get_project_setting(...)` from synapseclient.models instead.", + ) def getProjectSetting(self, project, setting_type): """ Gets the ProjectSetting for a project. @@ -5787,7 +5811,12 @@ def getProjectSetting(self, project, setting_type): response if response else None ) # if no project setting, a empty string is returned as the response - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `Folder(id=...).get_sts_storage_token(...)` or " + "`Project(id=...).get_sts_storage_token(...)` from synapseclient.models instead.", + ) def get_sts_storage_token( self, entity, permission, *, output_format="json", min_remaining_life=None ): @@ -5820,7 +5849,11 @@ def get_sts_storage_token( min_remaining_life=min_remaining_life, ) - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. " + "Use `StorageLocation.setup_s3(...)` from synapseclient.models instead.", + ) def create_s3_storage_location( self, *, @@ -5862,7 +5895,11 @@ def create_s3_storage_location( ) ) - # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441 + @deprecated( + version="4.12.0", + reason="To be removed in 5.0.0. 
" + "Use `StorageLocation.setup_s3_async(...)` from synapseclient.models instead.", + ) async def create_s3_storage_location_async( self, *, diff --git a/synapseclient/core/constants/concrete_types.py b/synapseclient/core/constants/concrete_types.py index fba11dbdb..f34fc3887 100644 --- a/synapseclient/core/constants/concrete_types.py +++ b/synapseclient/core/constants/concrete_types.py @@ -9,7 +9,23 @@ EXTERNAL_S3_STORAGE_LOCATION_SETTING = ( "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting" ) -# EXTERNAL_GCP_STORAGE_LOCATION_SETTING = 'org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting' # noqa: E501 +EXTERNAL_GCP_STORAGE_LOCATION_SETTING = ( + "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting" +) +EXTERNAL_STORAGE_LOCATION_SETTING = ( + "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting" +) +EXTERNAL_OBJECT_STORAGE_LOCATION_SETTING = ( + "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting" +) +PROXY_STORAGE_LOCATION_SETTINGS = ( + "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings" +) + +# Concrete types for ProjectSettings +UPLOAD_DESTINATION_LIST_SETTING = ( + "org.sagebionetworks.repo.model.project.UploadDestinationListSetting" +) # Concrete types for UploadDestinations SYNAPSE_S3_UPLOAD_DESTINATION = ( @@ -117,6 +133,14 @@ "org.sagebionetworks.repo.model.curation.metadata.RecordBasedMetadataTaskProperties" ) +# Download List Types +DOWNLOAD_LIST_MANIFEST_REQUEST = ( + "org.sagebionetworks.repo.model.download.DownloadListManifestRequest" +) +DOWNLOAD_LIST_MANIFEST_RESPONSE = ( + "org.sagebionetworks.repo.model.download.DownloadListManifestResponse" +) + # Grid Session Types CREATE_GRID_REQUEST = "org.sagebionetworks.repo.model.grid.CreateGridRequest" GRID_RECORD_SET_EXPORT_REQUEST = ( diff --git a/synapseclient/models/__init__.py b/synapseclient/models/__init__.py index 554de0bc2..9d5bc90b0 100644 --- a/synapseclient/models/__init__.py +++ b/synapseclient/models/__init__.py @@ -14,6 +14,7 @@ RecordBasedMetadataTaskProperties, ) from synapseclient.models.dataset import Dataset, DatasetCollection, EntityRef +from synapseclient.models.download_list import DownloadListManifestRequest from synapseclient.models.entityview import EntityView, ViewTypeMask from synapseclient.models.evaluation import Evaluation from synapseclient.models.file import File, FileHandle @@ -21,11 +22,20 @@ from synapseclient.models.form import FormData, FormGroup from synapseclient.models.link import Link from synapseclient.models.materializedview import MaterializedView +from synapseclient.models.mixins.manifest import ( + DEFAULT_GENERATED_MANIFEST_KEYS, + MANIFEST_FILENAME, +) from synapseclient.models.mixins.table_components import QueryMixin from synapseclient.models.project import Project from synapseclient.models.recordset import RecordSet from synapseclient.models.schema_organization import JSONSchema, SchemaOrganization from synapseclient.models.services import FailureStrategy +from synapseclient.models.storage_location import ( + StorageLocation, + StorageLocationType, + UploadType, +) from synapseclient.models.submission import Submission from synapseclient.models.submission_bundle import SubmissionBundle from synapseclient.models.submission_status import SubmissionStatus @@ -153,6 +163,15 @@ # Form models "FormGroup", "FormData", + # Storage Location models + "StorageLocation", + "StorageLocationType", + "UploadType", + # Manifest constants + 
"MANIFEST_FILENAME", + "DEFAULT_GENERATED_MANIFEST_KEYS", + # Download List models + "DownloadListManifestRequest", ] # Static methods to expose as functions diff --git a/synapseclient/models/folder.py b/synapseclient/models/folder.py index a0658f521..c4d4e0718 100644 --- a/synapseclient/models/folder.py +++ b/synapseclient/models/folder.py @@ -18,6 +18,10 @@ ContainerEntityJSONSchema, StorableContainer, ) +from synapseclient.models.mixins.manifest import ManifestGeneratable +from synapseclient.models.mixins.storage_location_mixin import ( + StorageLocationConfigurable, +) from synapseclient.models.protocols.folder_protocol import FolderSynchronousProtocol from synapseclient.models.services.search import get_id from synapseclient.models.services.storable_entity import store_entity @@ -47,6 +51,8 @@ class Folder( AccessControllable, StorableContainer, ContainerEntityJSONSchema, + StorageLocationConfigurable, + ManifestGeneratable, ): """Folder is a hierarchical container for organizing data in Synapse. diff --git a/synapseclient/models/mixins/__init__.py b/synapseclient/models/mixins/__init__.py index 62ddcf017..491ea9616 100644 --- a/synapseclient/models/mixins/__init__.py +++ b/synapseclient/models/mixins/__init__.py @@ -20,11 +20,20 @@ JSONSchemaValidationStatistics, ValidationException, ) +from synapseclient.models.mixins.manifest import ( + DEFAULT_GENERATED_MANIFEST_KEYS, + MANIFEST_FILENAME, + ManifestGeneratable, +) from synapseclient.models.mixins.storable_container import StorableContainer +from synapseclient.models.mixins.storage_location_mixin import ( + StorageLocationConfigurable, +) __all__ = [ "AccessControllable", "StorableContainer", + "StorageLocationConfigurable", "AsynchronousCommunicator", "BaseJSONSchema", "ContainerEntityJSONSchema", @@ -40,4 +49,7 @@ "FormChangeRequest", "FormSubmissionStatus", "StateEnum", + "ManifestGeneratable", + "MANIFEST_FILENAME", + "DEFAULT_GENERATED_MANIFEST_KEYS", ] diff --git a/synapseclient/models/mixins/asynchronous_job.py b/synapseclient/models/mixins/asynchronous_job.py index fd3649bc1..407babe92 100644 --- a/synapseclient/models/mixins/asynchronous_job.py +++ b/synapseclient/models/mixins/asynchronous_job.py @@ -14,6 +14,7 @@ AGENT_CHAT_REQUEST, CREATE_GRID_REQUEST, CREATE_SCHEMA_REQUEST, + DOWNLOAD_LIST_MANIFEST_REQUEST, GET_VALIDATION_SCHEMA_REQUEST, GRID_RECORD_SET_EXPORT_REQUEST, QUERY_BUNDLE_REQUEST, @@ -29,6 +30,7 @@ ASYNC_JOB_URIS = { AGENT_CHAT_REQUEST: "/agent/chat/async", CREATE_GRID_REQUEST: "/grid/session/async", + DOWNLOAD_LIST_MANIFEST_REQUEST: "/download/list/manifest/async", GRID_RECORD_SET_EXPORT_REQUEST: "/grid/export/recordset/async", TABLE_UPDATE_TRANSACTION_REQUEST: "/entity/{entityId}/table/transaction/async", GET_VALIDATION_SCHEMA_REQUEST: "/schema/type/validation/async", diff --git a/synapseclient/models/mixins/manifest.py b/synapseclient/models/mixins/manifest.py new file mode 100644 index 000000000..785a9c7b9 --- /dev/null +++ b/synapseclient/models/mixins/manifest.py @@ -0,0 +1,950 @@ +"""Mixin for objects that can generate and read manifest TSV files.""" + +import csv +import datetime +import io +import os +import re +import sys +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from synapseclient import Synapse +from synapseclient.core import utils +from synapseclient.core.async_utils import async_to_sync, otel_trace_method +from synapseclient.core.utils import is_synapse_id_str, is_url, topolgical_sort +from synapseclient.models.protocols.manifest_protocol import ( + 
ManifestGeneratableSynchronousProtocol, +) + +if TYPE_CHECKING: + from synapseclient.models import File + +# When new fields are added to the manifest they will also need to be added to +# file.py#_determine_fields_to_ignore_in_merge +REQUIRED_FIELDS = ["path", "parent"] +FILE_CONSTRUCTOR_FIELDS = ["name", "id", "synapseStore", "contentType"] +STORE_FUNCTION_FIELDS = ["activityName", "activityDescription", "forceVersion"] +PROVENANCE_FIELDS = ["used", "executed"] +MANIFEST_FILENAME = "SYNAPSE_METADATA_MANIFEST.tsv" +DEFAULT_GENERATED_MANIFEST_KEYS = [ + "path", + "parent", + "name", + "id", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", +] +ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$") +SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[") +SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$") +# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes +COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") + + +def _manifest_filename(path: str) -> str: + """Get the full path to the manifest file. + + Arguments: + path: The directory where the manifest file will be created. + + Returns: + The full path to the manifest file. + """ + return os.path.join(path, MANIFEST_FILENAME) + + +def _convert_manifest_data_items_to_string_list( + items: List[Union[str, datetime.datetime, bool, int, float]], +) -> str: + """ + Handle converting an individual key that contains a possible list of data into a + list of strings or objects that can be written to the manifest file. + + This has specific logic around how to handle datetime fields. + + When working with datetime fields we are printing the ISO 8601 UTC representation of + the datetime. + + When working with non strings we are printing the non-quoted version of the object. + + Example: Examples + Several examples of how this function works. + + >>> _convert_manifest_data_items_to_string_list(["a", "b", "c"]) + '[a,b,c]' + >>> _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"]) + '["string,with,commas",string without commas]' + >>> _convert_manifest_data_items_to_string_list(["string,with,commas"]) + 'string,with,commas' + >>> _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) + '2020-01-01T00:00:00Z' + >>> _convert_manifest_data_items_to_string_list([True]) + 'True' + >>> _convert_manifest_data_items_to_string_list([1]) + '1' + >>> _convert_manifest_data_items_to_string_list([1.0]) + '1.0' + >>> _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) + '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]' + + + Args: + items: The list of items to convert. + + Returns: + The list of items converted to strings. + """ + items_to_write = [] + for item in items: + if isinstance(item, datetime.datetime): + items_to_write.append( + utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False) + ) + else: + # If a string based annotation has a comma in it + # this will wrap the string in quotes so it won't be parsed + # as multiple values. 
For example this is an annotation with 2 values: + # [my first annotation, "my, second, annotation"] + # This is an annotation with 4 value: + # [my first annotation, my, second, annotation] + if isinstance(item, str): + if len(items) > 1 and "," in item: + items_to_write.append(f'"{item}"') + else: + items_to_write.append(item) + else: + items_to_write.append(repr(item)) + + if len(items_to_write) > 1: + return f'[{",".join(items_to_write)}]' + elif len(items_to_write) == 1: + return items_to_write[0] + else: + return "" + + +def _convert_manifest_data_row_to_dict(row: dict, keys: List[str]) -> dict: + """ + Convert a row of data to a dict that can be written to a manifest file. + + Args: + row: The row of data to convert. + keys: The keys of the manifest. Used to select the rows of data. + + Returns: + The dict representation of the row. + """ + data_to_write = {} + for key in keys: + data_for_key = row.get(key, "") + if isinstance(data_for_key, list): + items_to_write = _convert_manifest_data_items_to_string_list(data_for_key) + data_to_write[key] = items_to_write + else: + data_to_write[key] = data_for_key + return data_to_write + + +def _write_manifest_data(filename: str, keys: List[str], data: List[dict]) -> None: + """ + Write a number of keys and a list of data to a manifest file. This will write + the data out as a tab separated file. + + For the data we are writing to the TSV file we are not quoting the content with any + characters. This is because the syncToSynapse function does not require strings to + be quoted. When quote characters were included extra double quotes were being added + to the strings when they were written to the manifest file. This was not causing + errors, however, it was changing the content of the manifest file when changes + were not required. + + Args: + filename: The name of the file to write to. + keys: The keys of the manifest. + data: The data to write to the manifest. This should be a list of dicts where + each dict represents a row of data. 
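+
+    Example: Write two rows to a manifest
+        A minimal sketch; the paths and the "syn123" parent ID are placeholders.
+
+            keys = ["path", "parent", "species"]
+            data = [
+                {"path": "/tmp/a.txt", "parent": "syn123", "species": "mouse"},
+                {"path": "/tmp/b.txt", "parent": "syn123", "species": ["mouse", "rat"]},
+            ]
+            _write_manifest_data(
+                filename="/tmp/SYNAPSE_METADATA_MANIFEST.tsv", keys=keys, data=data
+            )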
+ """ + with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp: + csv_writer = csv.DictWriter( + fp, + keys, + restval="", + extrasaction="ignore", + delimiter="\t", + quotechar=None, + quoting=csv.QUOTE_NONE, + ) + csv_writer.writeheader() + for row in data: + csv_writer.writerow(rowdict=_convert_manifest_data_row_to_dict(row, keys)) + + +def _extract_entity_metadata_for_file( + all_files: List["File"], +) -> Tuple[List[str], List[Dict[str, str]]]: + """ + Extracts metadata from the list of File Entities and returns them in a form + usable by csv.DictWriter + + Arguments: + all_files: an iterable that provides File entities + + Returns: + keys: a list column headers + data: a list of dicts containing data from each row + """ + keys = list(DEFAULT_GENERATED_MANIFEST_KEYS) + annotation_keys = set() + data = [] + for entity in all_files: + row = { + "parent": entity.parent_id, + "path": entity.path, + "name": entity.name, + "id": entity.id, + "synapseStore": entity.synapse_store, + "contentType": entity.content_type, + } + + if entity.annotations: + annotation_keys.update(set(entity.annotations.keys())) + row.update( + { + key: (val if len(val) > 0 else "") + for key, val in entity.annotations.items() + } + ) + + row_provenance = _get_entity_provenance_dict_for_file(entity=entity) + row.update(row_provenance) + + data.append(row) + keys.extend(annotation_keys) + return keys, data + + +def _get_entity_provenance_dict_for_file(entity: "File") -> Dict[str, str]: + """ + Arguments: + entity: File entity object + + Returns: + dict: a dict with a subset of the provenance metadata for the entity. + An empty dict is returned if the metadata does not have a provenance record. + """ + if not entity.activity: + return {} + + used_activities = [] + for used_activity in entity.activity.used: + used_activities.append(used_activity.format_for_manifest()) + + executed_activities = [] + for executed_activity in entity.activity.executed: + executed_activities.append(executed_activity.format_for_manifest()) + + return { + "used": ";".join(used_activities), + "executed": ";".join(executed_activities), + "activityName": entity.activity.name or "", + "activityDescription": entity.activity.description or "", + } + + +def _validate_manifest_required_fields( + manifest_path: str, +) -> Tuple[bool, List[str]]: + """ + Validate that a manifest file exists and has the required fields. + + Args: + manifest_path: Path to the manifest file. + + Returns: + Tuple of (is_valid, list_of_error_messages). 
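+
+    Example: Validate a manifest before upload
+        A minimal sketch; the manifest path is a placeholder.
+
+            is_valid, errors = _validate_manifest_required_fields(
+                manifest_path="/path/to/SYNAPSE_METADATA_MANIFEST.tsv"
+            )
+            if not is_valid:
+                for error in errors:
+                    print(error)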
+ """ + errors = [] + + if not os.path.isfile(manifest_path): + errors.append(f"Manifest file not found: {manifest_path}") + return (False, errors) + + try: + with io.open(manifest_path, "r", encoding="utf8") as fp: + reader = csv.DictReader(fp, delimiter="\t") + headers = reader.fieldnames or [] + + # Check for required fields + for field in REQUIRED_FIELDS: + if field not in headers: + errors.append(f"Missing required field: {field}") + + # Validate each row + row_num = 1 + for row in reader: + row_num += 1 + path = row.get("path", "") + parent = row.get("parent", "") + + if not path: + errors.append(f"Row {row_num}: 'path' is empty") + + if not parent: + errors.append(f"Row {row_num}: 'parent' is empty") + elif not is_synapse_id_str(parent) and not is_url(parent): + errors.append( + f"Row {row_num}: 'parent' is not a valid Synapse ID: {parent}" + ) + + # Check if path exists (skip URLs) + if path and not is_url(path): + expanded_path = os.path.abspath( + os.path.expandvars(os.path.expanduser(path)) + ) + if not os.path.isfile(expanded_path): + errors.append(f"Row {row_num}: File not found: {path}") + + except Exception as e: + errors.append(f"Error reading manifest file: {str(e)}") + + return (len(errors) == 0, errors) + + +@async_to_sync +class ManifestGeneratable(ManifestGeneratableSynchronousProtocol): + """ + Mixin for objects that can generate and read manifest TSV files. + + In order to use this mixin, the class must have the following attributes: + + - `id` + - `name` + - `_synced_from_synapse` + + The class must also inherit from `StorableContainer` mixin which provides: + + - `flatten_file_list()` + - `map_directory_to_all_contained_files()` + """ + + id: Optional[str] = None + name: Optional[str] = None + _synced_from_synapse: bool = False + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_generate_manifest: {self.id}" + ) + async def generate_manifest_async( + self, + path: str, + manifest_scope: str = "all", + *, + synapse_client: Optional[Synapse] = None, + ) -> Optional[str]: + """ + Generate a manifest TSV file for all files in this container. + + This method should be called after `sync_from_synapse()` to generate + a manifest of all downloaded files with their metadata. + + Arguments: + path: The directory where the manifest file(s) will be written. + manifest_scope: Controls manifest file generation: + + - "all": Create a manifest in each directory level + - "root": Create a single manifest at the root path only + - "suppress": Do not create any manifest files + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The path to the root manifest file if created, or None if suppressed. + + Raises: + ValueError: If the container has not been synced from Synapse. + ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'. 
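+
+        Example: Generate a manifest asynchronously
+            A minimal sketch of the async variant; "syn123" and the download
+            path are placeholders.
+
+                import asyncio
+
+                from synapseclient.models import Project
+
+                import synapseclient
+                synapseclient.login()
+
+                async def main():
+                    project = await Project(id="syn123").sync_from_synapse_async(
+                        path="/path/to/download"
+                    )
+                    manifest_path = await project.generate_manifest_async(
+                        path="/path/to/download",
+                        manifest_scope="root"
+                    )
+                    print(f"Manifest created at: {manifest_path}")
+
+                asyncio.run(main())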
+ + Example: Generate manifest after sync + Generate a manifest file after syncing from Synapse: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").sync_from_synapse( + path="/path/to/download" + ) + manifest_path = project.generate_manifest( + path="/path/to/download", + manifest_scope="root" + ) + print(f"Manifest created at: {manifest_path}") + """ + if manifest_scope not in ("all", "root", "suppress"): + raise ValueError( + 'Value of manifest_scope should be one of ("all", "root", "suppress")' + ) + + if manifest_scope == "suppress": + return None + + if not self._synced_from_synapse: + raise ValueError( + "Container has not been synced from Synapse. " + "Call sync_from_synapse() before generating a manifest." + ) + + syn = Synapse.get_client(synapse_client=synapse_client) + + # Expand the path + path = os.path.expanduser(path) if path else None + if not path: + raise ValueError("A path must be provided to generate a manifest.") + + # Get all files from this container + all_files = self.flatten_file_list() + + if not all_files: + syn.logger.info( + f"[{self.id}:{self.name}]: No files found in container, " + "skipping manifest generation." + ) + return None + + root_manifest_path = None + + if manifest_scope == "root": + # Generate a single manifest at the root + keys, data = _extract_entity_metadata_for_file(all_files=all_files) + manifest_path = _manifest_filename(path) + _write_manifest_data(manifest_path, keys, data) + root_manifest_path = manifest_path + syn.logger.info( + f"[{self.id}:{self.name}]: Created manifest at {manifest_path}" + ) + elif manifest_scope == "all": + # Generate a manifest at each directory level + directory_map = self.map_directory_to_all_contained_files(root_path=path) + + for directory_path, files_in_directory in directory_map.items(): + if files_in_directory: + keys, data = _extract_entity_metadata_for_file( + all_files=files_in_directory + ) + manifest_path = _manifest_filename(directory_path) + _write_manifest_data(manifest_path, keys, data) + + # Track the root manifest path + if directory_path == path: + root_manifest_path = manifest_path + + syn.logger.info( + f"[{self.id}:{self.name}]: Created manifest at {manifest_path}" + ) + + return root_manifest_path + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_get_manifest_data: {self.id}" + ) + async def get_manifest_data_async( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> Tuple[List[str], List[Dict[str, str]]]: + """ + Get manifest data for all files in this container. + + This method extracts metadata from all files that have been synced + to this container. The data can be used to generate a manifest file + or for other purposes. + + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Tuple of (keys, data) where keys is a list of column headers + and data is a list of dictionaries, one per file, containing + the file metadata. + + Raises: + ValueError: If the container has not been synced from Synapse. 
+ + Example: Get manifest data + Get manifest data for all files in a project: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").sync_from_synapse( + path="/path/to/download" + ) + keys, data = project.get_manifest_data() + for row in data: + print(f"File: {row['name']} at {row['path']}") + """ + if not self._synced_from_synapse: + raise ValueError( + "Container has not been synced from Synapse. " + "Call sync_from_synapse() before getting manifest data." + ) + + all_files = self.flatten_file_list() + return _extract_entity_metadata_for_file(all_files=all_files) + + @classmethod + @otel_trace_method( + method_to_trace_name=lambda cls, **kwargs: f"{cls.__name__}_from_manifest" + ) + async def from_manifest_async( + cls, + manifest_path: str, + parent_id: str, + dry_run: bool = False, + merge_existing_annotations: bool = True, + associate_activity_to_new_version: bool = False, + *, + synapse_client: Optional[Synapse] = None, + ) -> List["File"]: + """ + Upload files to Synapse from a manifest TSV file. + + This method reads a manifest TSV file and uploads all files defined in it + to Synapse. The manifest file must contain at minimum the 'path' and 'parent' + columns. + + Arguments: + manifest_path: Path to the manifest TSV file. + parent_id: The Synapse ID of the parent container (Project or Folder) + where files will be uploaded if not specified in the manifest. + dry_run: If True, validate the manifest but do not upload. + merge_existing_annotations: If True, merge annotations with existing + annotations on the file. If False, replace existing annotations. + associate_activity_to_new_version: If True, copy the activity + (provenance) from the previous version to the new version. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + List of File objects that were uploaded. + + Raises: + ValueError: If the manifest file does not exist. + ValueError: If the manifest file is missing required fields. + IOError: If a file path in the manifest does not exist. 
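+
+        Example: Minimal manifest layout
+            A sketch of writing a two-row manifest by hand; the local paths,
+            the "syn123" parent ID, and the "species" annotation column are
+            placeholders. Columns beyond the reserved fields become annotations.
+
+                with open("/local/data/manifest.tsv", "w") as fp:
+                    fp.write("path\tparent\tspecies\n")
+                    fp.write("/local/data/a.txt\tsyn123\tmouse\n")
+                    fp.write("/local/data/b.txt\tsyn123\trat\n")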
+ + Example: Upload files from a manifest + Upload files from a manifest TSV file: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123" + ) + for file in files: + print(f"Uploaded: {file.name} ({file.id})") + + Example: Dry run validation + Validate a manifest without uploading: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123", + dry_run=True + ) + print("Manifest is valid, ready for upload") + """ + from synapseclient.models import Activity, File + + syn = Synapse.get_client(synapse_client=synapse_client) + + # Validate the manifest + is_valid, errors = _validate_manifest_required_fields(manifest_path) + if not is_valid: + raise ValueError( + "Invalid manifest file:\n" + "\n".join(f" - {e}" for e in errors) + ) + + # Read the manifest + rows = [] + with io.open(manifest_path, "r", encoding="utf8") as fp: + reader = csv.DictReader(fp, delimiter="\t") + for row in reader: + rows.append(row) + + if dry_run: + syn.logger.info( + f"Dry run: {len(rows)} files would be uploaded from manifest" + ) + return [] + + # Build dependency graph for provenance ordering + path_to_row = {} + upload_order = {} + + for row in rows: + path = row.get("path", "") + if path and not is_url(path): + path = os.path.abspath(os.path.expandvars(os.path.expanduser(path))) + path_to_row[path] = row + + # Collect provenance references + all_refs = [] + used = row.get("used", "") + if used and used.strip(): + for item in used.split(";"): + item = item.strip() + if item: + if os.path.isfile( + os.path.abspath( + os.path.expandvars(os.path.expanduser(item)) + ) + ): + all_refs.append( + os.path.abspath( + os.path.expandvars(os.path.expanduser(item)) + ) + ) + + executed = row.get("executed", "") + if executed and executed.strip(): + for item in executed.split(";"): + item = item.strip() + if item: + if os.path.isfile( + os.path.abspath( + os.path.expandvars(os.path.expanduser(item)) + ) + ): + all_refs.append( + os.path.abspath( + os.path.expandvars(os.path.expanduser(item)) + ) + ) + + upload_order[path] = all_refs + + # Topologically sort based on provenance dependencies + sorted_paths = topolgical_sort(upload_order) + sorted_paths = [p[0] for p in sorted_paths] + + # Track uploaded files for provenance resolution + path_to_synapse_id: Dict[str, str] = {} + uploaded_files: List["File"] = [] + + for path in sorted_paths: + row = path_to_row[path] + + # Get parent - use manifest value or fall back to provided parent_id + file_parent = row.get("parent", "").strip() or parent_id + + # Build the File object + file = File( + path=path, + parent_id=file_parent, + name=row.get("name", "").strip() or None, + id=row.get("id", "").strip() or None, + synapse_store=( + row.get("synapseStore", "").strip().lower() != "false" + if row.get("synapseStore", "").strip() + else True + ), + content_type=row.get("contentType", "").strip() or None, + merge_existing_annotations=merge_existing_annotations, + associate_activity_to_new_version=associate_activity_to_new_version, + ) + + # Build annotations from extra columns + annotations = {} + skip_keys = set( + REQUIRED_FIELDS + + FILE_CONSTRUCTOR_FIELDS + + STORE_FUNCTION_FIELDS + + PROVENANCE_FIELDS + ) + for key, value in row.items(): + if key not in skip_keys and value and value.strip(): + annotations[key] = 
_parse_manifest_value(value.strip()) + if annotations: + file.annotations = annotations + + # Build provenance/activity + used_items = [] + executed_items = [] + + used_str = row.get("used", "") + if used_str and used_str.strip(): + for item in used_str.split(";"): + item = item.strip() + if item: + used_items.append( + _resolve_provenance_item(item, path_to_synapse_id) + ) + + executed_str = row.get("executed", "") + if executed_str and executed_str.strip(): + for item in executed_str.split(";"): + item = item.strip() + if item: + executed_items.append( + _resolve_provenance_item(item, path_to_synapse_id) + ) + + if used_items or executed_items: + activity = Activity( + name=row.get("activityName", "").strip() or None, + description=row.get("activityDescription", "").strip() or None, + used=used_items, + executed=executed_items, + ) + file.activity = activity + + # Upload the file + file = await file.store_async(synapse_client=syn) + + # Track for provenance resolution + path_to_synapse_id[path] = file.id + uploaded_files.append(file) + + syn.logger.info(f"Uploaded: {file.name} ({file.id})") + + return uploaded_files + + @staticmethod + @otel_trace_method(method_to_trace_name=lambda **kwargs: "validate_manifest") + async def validate_manifest_async( + manifest_path: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> Tuple[bool, List[str]]: + """ + Validate a manifest TSV file without uploading. + + This method validates a manifest file to ensure it is properly formatted + and all paths exist. + + Arguments: + manifest_path: Path to the manifest TSV file. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Tuple of (is_valid, list_of_error_messages). If the manifest is valid, + is_valid will be True and the list will be empty. + + Example: Validate a manifest file + Validate a manifest file before uploading: + + from synapseclient.models import Project + + is_valid, errors = Project.validate_manifest( + manifest_path="/path/to/manifest.tsv" + ) + if is_valid: + print("Manifest is valid") + else: + for error in errors: + print(f"Error: {error}") + """ + return _validate_manifest_required_fields(manifest_path) + + @staticmethod + async def generate_download_list_manifest_async( + download_path: str, + csv_separator: str = ",", + include_header: bool = True, + timeout: int = 120, + *, + synapse_client: Optional[Synapse] = None, + ) -> str: + """ + Generate a manifest file from the current user's download list using the + Synapse REST API. + + This method creates a CSV manifest containing metadata about all files in + the user's download list. The manifest is generated server-side by Synapse + and then downloaded to the specified path. + + This is interoperable with the Synapse download list feature and provides + a way to export the download list as a manifest file that can be used for + bulk operations. + + Arguments: + download_path: The local directory path where the manifest will be saved. + csv_separator: The delimiter character for the CSV file. + Defaults to "," for comma-separated values. Use "\t" for tab-separated. + include_header: Whether to include column headers in the first row. + Defaults to True. + timeout: The number of seconds to wait for the job to complete. + Defaults to 120 seconds. 
+ synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The full path to the downloaded manifest file. + + Example: Generate manifest from download list + Generate a manifest from your Synapse download list: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + # Generate manifest from download list + manifest_path = Project.generate_download_list_manifest( + download_path="/path/to/download" + ) + print(f"Manifest downloaded to: {manifest_path}") + + Example: Generate tab-separated manifest + Generate a TSV manifest from your download list: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + manifest_path = Project.generate_download_list_manifest( + download_path="/path/to/download", + csv_separator="\t" + ) + + See Also: + - `DownloadListManifestRequest`: The underlying request class for more + fine-grained control over the manifest generation process. + """ + from synapseclient.models.download_list import DownloadListManifestRequest + from synapseclient.models.table_components import CsvTableDescriptor + + # Create the request with CSV formatting options + request = DownloadListManifestRequest( + csv_table_descriptor=CsvTableDescriptor( + separator=csv_separator, + is_first_line_header=include_header, + ) + ) + + # Send the job and wait for completion + await request.send_job_and_wait_async( + timeout=timeout, + synapse_client=synapse_client, + ) + + # Download the manifest + manifest_file_path = await request.download_manifest_async( + download_path=download_path, + synapse_client=synapse_client, + ) + + return manifest_file_path + + +def _resolve_provenance_item( + item: str, + path_to_synapse_id: Dict[str, str], +) -> Any: + """ + Resolve a provenance item to a UsedEntity or UsedURL. + + Args: + item: The provenance item string (could be a path, Synapse ID, or URL). + path_to_synapse_id: Mapping of local file paths to their Synapse IDs. + + Returns: + UsedEntity or UsedURL object. + """ + from synapseclient.models import UsedEntity, UsedURL + + # Check if it's a local file path that was uploaded + expanded_path = os.path.abspath(os.path.expandvars(os.path.expanduser(item))) + if expanded_path in path_to_synapse_id: + return UsedEntity(target_id=path_to_synapse_id[expanded_path]) + + # Check if it's a URL + if is_url(item): + return UsedURL(url=item) + + # Check if it's a Synapse ID + if is_synapse_id_str(item): + return UsedEntity(target_id=item) + + # Assume it's a Synapse ID + return UsedEntity(target_id=item) + + +def _parse_manifest_value(value: str) -> Any: + """ + Parse a manifest cell value into an appropriate Python type. + + Handles: + - List syntax: [a,b,c] -> ['a', 'b', 'c'] + - Boolean strings: 'true', 'false' -> True, False + - Numeric strings: '123' -> 123, '1.5' -> 1.5 + - Everything else: returned as string + + Args: + value: The string value from the manifest. + + Returns: + The parsed value. 
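+
+    Example: Parsing manifest cell values
+        A few illustrative conversions.
+
+            _parse_manifest_value("[a,b,c]")  # -> ["a", "b", "c"]
+            _parse_manifest_value("true")     # -> True
+            _parse_manifest_value("42")       # -> 42
+            _parse_manifest_value("1.5")      # -> 1.5
+            _parse_manifest_value("hello")    # -> "hello"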
+ """ + # Check for list syntax + if ARRAY_BRACKET_PATTERN.match(value): + # Remove brackets + inner = value[1:-1] + # Split on commas outside quotes + items = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(inner) + result = [] + for item in items: + item = item.strip() + # Remove surrounding quotes if present + if item.startswith('"') and item.endswith('"'): + item = item[1:-1] + result.append(item) + return result + + # Check for boolean + if value.lower() == "true": + return True + if value.lower() == "false": + return False + + # Check for integer + try: + return int(value) + except ValueError: + pass + + # Check for float + try: + return float(value) + except ValueError: + pass + + # Return as string + return value diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 25432a6b9..1a2d557f2 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -159,6 +159,7 @@ async def sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + generate_manifest: str = "suppress", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -170,9 +171,8 @@ async def sync_from_synapse_async( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][] and supports + generating a manifest TSV file with file metadata. Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -208,6 +208,13 @@ async def sync_from_synapse_async( `["folder", "file", "table", "entityview", "dockerrepo", "submissionview", "dataset", "datasetcollection", "materializedview", "virtualtable"]`. + generate_manifest: Controls manifest file generation. Options: + + - "all": Create a manifest in each directory level + - "root": Create a single manifest at the root path only + - "suppress": (Default) Do not create any manifest files + + A path must be specified for manifest generation. synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. 
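+# A minimal sketch of the new generate_manifest option; "syn123" and the local
+# path are placeholders, and the synchronous sync_from_synapse wrapper is the
+# one generated by @async_to_sync.
+from synapseclient.models import Project
+
+import synapseclient
+
+synapseclient.login()
+
+project = Project(id="syn123").sync_from_synapse(
+    path="/path/to/download",
+    generate_manifest="root",  # write one SYNAPSE_METADATA_MANIFEST.tsv at the root
+)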
@@ -386,7 +393,7 @@ async def my_function(): file_size=1, synapse_client=syn, custom_message=custom_message ): self._synced_from_synapse = True - return await self._sync_from_synapse_async( + await self._sync_from_synapse_async( path=path, recursive=recursive, download_file=download_file, @@ -400,6 +407,19 @@ async def my_function(): synapse_client=syn, ) + # Generate manifest if requested and path is provided + if generate_manifest != "suppress" and path: + # The manifest generation is handled by ManifestGeneratable mixin + # which provides generate_manifest_async method + if hasattr(self, "generate_manifest_async"): + await self.generate_manifest_async( + path=path, + manifest_scope=generate_manifest, + synapse_client=syn, + ) + + return self + async def _sync_from_synapse_async( self: Self, path: Optional[str] = None, diff --git a/synapseclient/models/mixins/storage_location_mixin.py b/synapseclient/models/mixins/storage_location_mixin.py new file mode 100644 index 000000000..db3c509a8 --- /dev/null +++ b/synapseclient/models/mixins/storage_location_mixin.py @@ -0,0 +1,450 @@ +"""Mixin for entities that can have their storage location configured.""" + +import asyncio +from typing import Any, Dict, List, Optional, Union + +from synapseclient import Synapse +from synapseclient.api.storage_location_services import ( + create_project_setting, + delete_project_setting, + get_project_setting, + update_project_setting, +) +from synapseclient.core.async_utils import async_to_sync, otel_trace_method +from synapseclient.core.constants import concrete_types +from synapseclient.models.protocols.storage_location_mixin_protocol import ( + StorageLocationConfigurableSynchronousProtocol, +) +from synapseclient.models.services.migration import ( + index_files_for_migration_async as _index_files_for_migration_async, +) +from synapseclient.models.services.migration import ( + migrate_indexed_files_async as _migrate_indexed_files_async, +) +from synapseclient.models.services.migration_types import MigrationResult + +# Default storage location ID used by Synapse +DEFAULT_STORAGE_LOCATION_ID = 1 + + +@async_to_sync +class StorageLocationConfigurable(StorageLocationConfigurableSynchronousProtocol): + """Mixin for objects that can have their storage location configured. + + In order to use this mixin, the class must have an `id` attribute. + + This mixin provides methods for: + - Setting and getting the upload storage location for an entity + - Getting STS (AWS Security Token Service) credentials for direct S3 access + - Migrating files to a new storage location + """ + + id: Optional[str] = None + """The unique immutable ID for this entity.""" + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_SetStorageLocation: {self.id}" + ) + async def set_storage_location_async( + self, + storage_location_id: Optional[Union[int, List[int]]] = None, + *, + synapse_client: Optional[Synapse] = None, + ) -> Dict[str, Any]: + """Set the upload storage location for this entity. This configures where + files uploaded to this entity will be stored. + + Arguments: + storage_location_id: The storage location ID(s) to set. Can be a single + ID, a list of IDs (first is default, max 10), or None to use + Synapse default storage. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The project setting dict returned from Synapse. 
+ + Raises: + ValueError: If the entity does not have an id set. + + Example: Using this function + Set storage location on a folder: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import Folder + + syn = Synapse() + syn.login() + + async def main(): + folder = await Folder(id="syn123").get_async() + setting = await folder.set_storage_location_async( + storage_location_id=12345 + ) + print(setting) + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + if storage_location_id is None: + storage_location_id = DEFAULT_STORAGE_LOCATION_ID + + locations = ( + storage_location_id + if isinstance(storage_location_id, list) + else [storage_location_id] + ) + + existing_setting = await get_project_setting( + project_id=self.id, + setting_type="upload", + synapse_client=synapse_client, + ) + + if existing_setting is not None: + existing_setting["locations"] = locations + await update_project_setting( + body=existing_setting, + synapse_client=synapse_client, + ) + return await get_project_setting( + project_id=self.id, + setting_type="upload", + synapse_client=synapse_client, + ) + else: + project_destination = { + "concreteType": concrete_types.UPLOAD_DESTINATION_LIST_SETTING, + "settingsType": "upload", + "locations": locations, + "projectId": self.id, + } + return await create_project_setting( + body=project_destination, + synapse_client=synapse_client, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_GetProjectSetting: {self.id}" + ) + async def get_project_setting_async( + self, + setting_type: str = "upload", + *, + synapse_client: Optional[Synapse] = None, + ) -> Optional[Dict[str, Any]]: + """Get the project setting for this entity. + + Arguments: + setting_type: The type of setting to retrieve. One of: + 'upload', 'external_sync', 'requester_pays'. Default: 'upload'. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The project setting as a dictionary, or None if no setting exists. + + Raises: + ValueError: If the entity does not have an id set. + + Example: Using this function + Get the upload settings for a folder: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import Folder + + syn = Synapse() + syn.login() + + async def main(): + folder = await Folder(id="syn123").get_async() + setting = await folder.get_project_setting_async(setting_type="upload") + if setting: + print(f"Storage locations: {setting.get('locations')}") + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + if setting_type not in {"upload", "external_sync", "requester_pays"}: + raise ValueError(f"Invalid setting_type: {setting_type}") + + return await get_project_setting( + project_id=self.id, + setting_type=setting_type, + synapse_client=synapse_client, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_DeleteProjectSetting: {self.id}" + ) + async def delete_project_setting_async( + self, + setting_id: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> None: + """Delete a project setting by its setting ID. + + Arguments: + setting_id: The ID of the project setting to delete. 
+ synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If the entity does not have an id set. + + Example: Using this function + Delete the upload settings for a folder: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import Folder + + syn = Synapse() + syn.login() + + async def main(): + folder = await Folder(id="syn123").get_async() + setting = await folder.get_project_setting_async(setting_type="upload") + if setting: + await folder.delete_project_setting_async(setting_id=setting['id']) + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + await delete_project_setting( + setting_id=setting_id, + synapse_client=synapse_client, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_GetStsStorageToken: {self.id}" + ) + async def get_sts_storage_token_async( + self, + permission: str, + *, + output_format: str = "json", + min_remaining_life: Optional[int] = None, + synapse_client: Optional[Synapse] = None, + ) -> Any: + """Get STS (AWS Security Token Service) credentials for direct access to + the storage location backing this entity. These credentials can be used + with AWS tools like awscli and boto3. + + Arguments: + permission: The permission level for the token. Must be 'read_only' + or 'read_write'. + output_format: The output format for the credentials. Options: + 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'. + min_remaining_life: The minimum remaining life (in seconds) for a + cached token before a new one is fetched. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The STS credentials in the requested format. + + Raises: + ValueError: If the entity does not have an id set. + + Example: Using credentials with boto3 + Get STS credentials for an STS-enabled folder and use with boto3: + + import asyncio + import boto3 + from synapseclient import Synapse + from synapseclient.models import Folder + + syn = Synapse() + syn.login() + + async def main(): + folder = await Folder(id="syn123").get_async() + credentials = await folder.get_sts_storage_token_async( + permission="read_write", + output_format="boto", + ) + s3_client = boto3.client('s3', **credentials) + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + from synapseclient.core import sts_transfer + + client = Synapse.get_client(synapse_client=synapse_client) + + return await asyncio.to_thread( + sts_transfer.get_sts_credentials, + client, + self.id, + permission, + output_format=output_format, + min_remaining_life=min_remaining_life, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_IndexFilesForMigration: {self.id}" + ) + async def index_files_for_migration_async( + self, + dest_storage_location_id: int, + db_path: Optional[str] = None, + *, + source_storage_location_ids: Optional[List[int]] = None, + file_version_strategy: str = "new", + include_table_files: bool = False, + continue_on_error: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> MigrationResult: + """Index files in this entity for migration to a new storage location. 
+ + This is the first step in migrating files to a new storage location. + After indexing, use `migrate_indexed_files` to perform the actual migration. + + Arguments: + dest_storage_location_id: The destination storage location ID. + db_path: Path to the SQLite database file for tracking migration state. + If not provided, a temporary directory will be used. The path + can be retrieved from the returned MigrationResult.db_path. + source_storage_location_ids: Optional list of source storage location IDs + to filter which files to migrate. If None, all files are indexed. + file_version_strategy: Strategy for handling file versions. Options: + 'new' (default) - create new versions, 'all' - migrate all versions, + 'latest' - only migrate latest version, 'skip' - skip if file exists. + include_table_files: Whether to include files attached to tables. + continue_on_error: Whether to continue indexing if an error occurs. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + A MigrationResult object containing indexing statistics and the database + path (accessible via result.db_path). + + Example: Indexing files for migration + Index files in a project for migration: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + syn = Synapse() + syn.login() + + async def main(): + project = await Project(id="syn123").get_async() + result = await project.index_files_for_migration_async( + dest_storage_location_id=12345, + ) + print(f"Database path: {result.db_path}") + print(f"Indexed {result.counts_by_status}") + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + return await _index_files_for_migration_async( + entity_id=self.id, + dest_storage_location_id=str(dest_storage_location_id), + db_path=db_path, + source_storage_location_ids=( + [str(s) for s in source_storage_location_ids] + if source_storage_location_ids + else None + ), + file_version_strategy=file_version_strategy, + include_table_files=include_table_files, + continue_on_error=continue_on_error, + synapse_client=synapse_client, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"Entity_MigrateIndexedFiles: {self.id}" + ) + async def migrate_indexed_files_async( + self, + db_path: str, + *, + create_table_snapshots: bool = True, + continue_on_error: bool = False, + force: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> Optional[MigrationResult]: + """Migrate files that have been indexed with `index_files_for_migration`. + + This is the second step in migrating files to a new storage location. + Files must first be indexed using `index_files_for_migration`. + + Arguments: + db_path: Path to the SQLite database file created by + `index_files_for_migration`. You can get this from the + MigrationResult.db_path returned by index_files_for_migration. + create_table_snapshots: Whether to create table snapshots before + migrating table files. + continue_on_error: Whether to continue migration if an error occurs. + force: Whether to force migration of files that have already been + migrated. Also bypasses interactive confirmation. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. 
+ + Returns: + A MigrationResult object containing migration statistics, or None + if the user declined the confirmation prompt. + + Example: Migrating indexed files + Migrate previously indexed files: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + syn = Synapse() + syn.login() + + async def main(): + project = await Project(id="syn123").get_async() + + # Index first + index_result = await project.index_files_for_migration_async( + dest_storage_location_id=12345, + ) + + # Then migrate using the db_path from index result + result = await project.migrate_indexed_files_async( + db_path=index_result.db_path, + force=True, # Skip interactive confirmation + ) + print(f"Migrated {result.counts_by_status}") + + asyncio.run(main()) + """ + if not self.id: + raise ValueError("The entity must have an id set.") + + return await _migrate_indexed_files_async( + db_path=db_path, + create_table_snapshots=create_table_snapshots, + continue_on_error=continue_on_error, + force=force, + synapse_client=synapse_client, + ) diff --git a/synapseclient/models/project.py b/synapseclient/models/project.py index a1a6a1c21..6686c8ac5 100644 --- a/synapseclient/models/project.py +++ b/synapseclient/models/project.py @@ -18,6 +18,10 @@ ContainerEntityJSONSchema, StorableContainer, ) +from synapseclient.models.mixins.manifest import ManifestGeneratable +from synapseclient.models.mixins.storage_location_mixin import ( + StorageLocationConfigurable, +) from synapseclient.models.protocols.project_protocol import ProjectSynchronousProtocol from synapseclient.models.services.search import get_id from synapseclient.models.services.storable_entity import store_entity @@ -46,6 +50,8 @@ class Project( AccessControllable, StorableContainer, ContainerEntityJSONSchema, + StorageLocationConfigurable, + ManifestGeneratable, ): """A Project is a top-level container for organizing data in Synapse. diff --git a/synapseclient/models/protocols/download_list_protocol.py b/synapseclient/models/protocols/download_list_protocol.py new file mode 100644 index 000000000..7152d4bf1 --- /dev/null +++ b/synapseclient/models/protocols/download_list_protocol.py @@ -0,0 +1,97 @@ +"""Protocol for the specific methods of download list classes that have synchronous counterparts +generated at runtime.""" + +from typing import Any, Dict, Optional, Protocol + +from typing_extensions import Self + +from synapseclient import Synapse + + +class DownloadListManifestRequestSynchronousProtocol(Protocol): + """ + The protocol for methods that are asynchronous but also + have a synchronous counterpart that may also be called. + """ + + def send_job_and_wait( + self, + post_exchange_args: Optional[Dict[str, Any]] = None, + timeout: int = 120, + *, + synapse_client: Optional[Synapse] = None, + ) -> Self: + """Send the job to the Asynchronous Job service and wait for it to complete. + + This method sends the manifest generation request to Synapse and waits + for the job to complete. After completion, the `result_file_handle_id` + attribute will be populated. + + Arguments: + post_exchange_args: Additional arguments to pass to the request. + timeout: The number of seconds to wait for the job to complete or progress + before raising a SynapseTimeoutError. Defaults to 120. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. 
+ + Returns: + This instance with `result_file_handle_id` populated. + + Raises: + SynapseTimeoutError: If the job does not complete within the timeout. + SynapseError: If the job fails. + + Example: Generate a manifest + Generate a manifest from the download list: + + from synapseclient.models import DownloadListManifestRequest + import synapseclient + + synapseclient.login() + + request = DownloadListManifestRequest() + request.send_job_and_wait() + print(f"Manifest file handle: {request.result_file_handle_id}") + """ + return self + + def download_manifest( + self, + download_path: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> str: + """ + Download the generated manifest file to a local path. + + This method should be called after `send_job_and_wait()` has completed + successfully and `result_file_handle_id` is populated. + + Arguments: + download_path: The local directory path where the manifest will be saved. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The full path to the downloaded manifest file. + + Raises: + ValueError: If the manifest has not been generated yet (no result_file_handle_id). + + Example: Download the manifest after generation + Generate and download a manifest: + + from synapseclient.models import DownloadListManifestRequest + import synapseclient + + synapseclient.login() + + request = DownloadListManifestRequest() + request.send_job_and_wait() + + manifest_path = request.download_manifest(download_path="/path/to/download") + print(f"Manifest downloaded to: {manifest_path}") + """ + return "" diff --git a/synapseclient/models/protocols/manifest_protocol.py b/synapseclient/models/protocols/manifest_protocol.py new file mode 100644 index 000000000..1da447da0 --- /dev/null +++ b/synapseclient/models/protocols/manifest_protocol.py @@ -0,0 +1,240 @@ +"""Protocol for the specific methods of ManifestGeneratable mixin that have +synchronous counterparts generated at runtime.""" + +from typing import Dict, List, Optional, Protocol, Tuple + +from synapseclient import Synapse + + +class ManifestGeneratableSynchronousProtocol(Protocol): + """ + The protocol for methods that are asynchronous but also + have a synchronous counterpart that may also be called. + """ + + def generate_manifest( + self, + path: str, + manifest_scope: str = "all", + *, + synapse_client: Optional[Synapse] = None, + ) -> Optional[str]: + """Generate a manifest TSV file for all files in this container. + + This method should be called after `sync_from_synapse()` to generate + a manifest of all downloaded files with their metadata. + + Arguments: + path: The directory where the manifest file(s) will be written. + manifest_scope: Controls manifest file generation: + + - "all": Create a manifest in each directory level + - "root": Create a single manifest at the root path only + - "suppress": Do not create any manifest files + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The path to the root manifest file if created, or None if suppressed. + + Raises: + ValueError: If the container has not been synced from Synapse. + ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'. 
+ + Example: Generate manifest after sync + Generate a manifest file after syncing from Synapse: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").sync_from_synapse( + path="/path/to/download" + ) + manifest_path = project.generate_manifest( + path="/path/to/download", + manifest_scope="root" + ) + print(f"Manifest created at: {manifest_path}") + """ + return None + + @classmethod + def from_manifest( + cls, + manifest_path: str, + parent_id: str, + dry_run: bool = False, + merge_existing_annotations: bool = True, + associate_activity_to_new_version: bool = False, + *, + synapse_client: Optional[Synapse] = None, + ) -> List: + """Upload files to Synapse from a manifest TSV file. + + This method reads a manifest TSV file and uploads all files defined in it + to Synapse. The manifest file must contain at minimum the 'path' and 'parent' + columns. + + Arguments: + manifest_path: Path to the manifest TSV file. + parent_id: The Synapse ID of the parent container (Project or Folder) + where files will be uploaded if not specified in the manifest. + dry_run: If True, validate the manifest but do not upload. + merge_existing_annotations: If True, merge annotations with existing + annotations on the file. If False, replace existing annotations. + associate_activity_to_new_version: If True, copy the activity + (provenance) from the previous version to the new version. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + List of File objects that were uploaded. + + Example: Upload files from a manifest + Upload files from a manifest TSV file: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + files = Project.from_manifest( + manifest_path="/path/to/manifest.tsv", + parent_id="syn123" + ) + for file in files: + print(f"Uploaded: {file.name} ({file.id})") + """ + return [] + + @staticmethod + def validate_manifest( + manifest_path: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> Tuple[bool, List[str]]: + """Validate a manifest TSV file without uploading. + + This method validates a manifest file to ensure it is properly formatted + and all paths exist. + + Arguments: + manifest_path: Path to the manifest TSV file. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Tuple of (is_valid, list_of_error_messages). If the manifest is valid, + is_valid will be True and the list will be empty. + + Example: Validate a manifest file + Validate a manifest file before uploading: + + from synapseclient.models import Project + + is_valid, errors = Project.validate_manifest( + manifest_path="/path/to/manifest.tsv" + ) + if is_valid: + print("Manifest is valid") + else: + for error in errors: + print(f"Error: {error}") + """ + return (True, []) + + def get_manifest_data( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> Tuple[List[str], List[Dict[str, str]]]: + """Get manifest data for all files in this container. + + This method extracts metadata from all files that have been synced + to this container. The data can be used to generate a manifest file + or for other purposes. 
+ + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Tuple of (keys, data) where keys is a list of column headers + and data is a list of dictionaries, one per file, containing + the file metadata. + + Raises: + ValueError: If the container has not been synced from Synapse. + + Example: Get manifest data + Get manifest data for all files in a project: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").sync_from_synapse( + path="/path/to/download" + ) + keys, data = project.get_manifest_data() + for row in data: + print(f"File: {row['name']} at {row['path']}") + """ + return ([], []) + + @staticmethod + def generate_download_list_manifest( + download_path: str, + csv_separator: str = ",", + include_header: bool = True, + timeout: int = 120, + *, + synapse_client: Optional[Synapse] = None, + ) -> str: + """Generate a manifest file from the current user's download list. + + This method creates a CSV manifest containing metadata about all files in + the user's download list. The manifest is generated server-side by Synapse + and then downloaded to the specified path. + + This is interoperable with the Synapse download list feature and provides + a way to export the download list as a manifest file that can be used for + bulk operations. + + Arguments: + download_path: The local directory path where the manifest will be saved. + csv_separator: The delimiter character for the CSV file. + Defaults to "," for comma-separated values. Use "\t" for tab-separated. + include_header: Whether to include column headers in the first row. + Defaults to True. + timeout: The number of seconds to wait for the job to complete. + Defaults to 120 seconds. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The full path to the downloaded manifest file. + + Example: Generate manifest from download list + Generate a manifest from your Synapse download list: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + # Generate manifest from download list + manifest_path = Project.generate_download_list_manifest( + download_path="/path/to/download" + ) + print(f"Manifest downloaded to: {manifest_path}") + """ + return "" diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index 0352132d1..245836adf 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -29,6 +29,7 @@ def sync_from_synapse( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + generate_manifest: str = "suppress", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -40,9 +41,8 @@ def sync_from_synapse( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][] and supports + generating a manifest TSV file with file metadata. 
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -74,6 +74,13 @@ def sync_from_synapse( include_types: Must be a list of entity types (ie. ["folder","file"]) which can be found [here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html) + generate_manifest: Controls manifest file generation. Options: + + - "all": Create a manifest in each directory level + - "root": Create a single manifest at the root path only + - "suppress": (Default) Do not create any manifest files + + A path must be specified for manifest generation. synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. diff --git a/synapseclient/models/protocols/storage_location_mixin_protocol.py b/synapseclient/models/protocols/storage_location_mixin_protocol.py new file mode 100644 index 000000000..7403972a6 --- /dev/null +++ b/synapseclient/models/protocols/storage_location_mixin_protocol.py @@ -0,0 +1,279 @@ +"""Protocol for the specific methods of StorageLocationConfigurable mixin that have +synchronous counterparts generated at runtime.""" + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Protocol, Union + +from synapseclient import Synapse + +if TYPE_CHECKING: + from synapseclient.models.services.migration_types import MigrationResult + + +class StorageLocationConfigurableSynchronousProtocol(Protocol): + """ + The protocol for methods that are asynchronous but also + have a synchronous counterpart that may also be called. + """ + + def set_storage_location( + self, + storage_location_id: Optional[Union[int, List[int]]] = None, + *, + synapse_client: Optional[Synapse] = None, + ) -> Dict[str, Any]: + """Set the upload storage location for this entity. This configures where + files uploaded to this entity will be stored. + + Arguments: + storage_location_id: The storage location ID(s) to set. Can be a single + ID, a list of IDs (first is default, max 10), or None to use + Synapse default storage. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The project setting dict returned from Synapse. + + Raises: + ValueError: If the entity does not have an id set. + + Example: Setting storage location on a folder + Set storage location on a folder: + + from synapseclient.models import Folder + + import synapseclient + synapseclient.login() + + folder = Folder(id="syn123").get() + setting = folder.set_storage_location(storage_location_id=12345) + print(setting) + """ + return {} + + def get_project_setting( + self, + setting_type: str = "upload", + *, + synapse_client: Optional[Synapse] = None, + ) -> Optional[Dict[str, Any]]: + """Get the project setting for this entity. + + Arguments: + setting_type: The type of setting to retrieve. One of: + 'upload', 'external_sync', 'requester_pays'. Default: 'upload'. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The project setting as a dictionary, or None if no setting exists. + + Raises: + ValueError: If the entity does not have an id set. 
+ + Example: Getting project settings + Get the upload settings for a folder: + + from synapseclient.models import Folder + + import synapseclient + synapseclient.login() + + folder = Folder(id="syn123").get() + setting = folder.get_project_setting(setting_type="upload") + if setting: + print(f"Storage locations: {setting.get('locations')}") + """ + return {} + + def delete_project_setting( + self, + setting_id: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> None: + """Delete a project setting by its setting ID. + + Arguments: + setting_id: The ID of the project setting to delete. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + None + + Raises: + ValueError: If the entity does not have an id set. + + Example: Deleting a project setting + Delete the upload settings for a folder: + + from synapseclient.models import Folder + + import synapseclient + synapseclient.login() + + folder = Folder(id="syn123").get() + setting = folder.get_project_setting(setting_type="upload") + if setting: + folder.delete_project_setting(setting_id=setting['id']) + """ + return None + + def get_sts_storage_token( + self, + permission: str, + *, + output_format: str = "json", + min_remaining_life: Optional[int] = None, + synapse_client: Optional[Synapse] = None, + ) -> Any: + """Get STS (AWS Security Token Service) credentials for direct access to + the storage location backing this entity. These credentials can be used + with AWS tools like awscli and boto3. + + Arguments: + permission: The permission level for the token. Must be 'read_only' + or 'read_write'. + output_format: The output format for the credentials. Options: + 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'. + min_remaining_life: The minimum remaining life (in seconds) for a + cached token before a new one is fetched. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The STS credentials in the requested format. + + Raises: + ValueError: If the entity does not have an id set. + + Example: Using credentials with boto3 + Get STS credentials for an STS-enabled folder and use with boto3: + + import boto3 + from synapseclient.models import Folder + + import synapseclient + synapseclient.login() + + folder = Folder(id="syn123").get() + credentials = folder.get_sts_storage_token( + permission="read_write", + output_format="boto", + ) + s3_client = boto3.client('s3', **credentials) + """ + return {} + + def index_files_for_migration( + self, + dest_storage_location_id: int, + db_path: Optional[str] = None, + *, + source_storage_location_ids: Optional[List[int]] = None, + file_version_strategy: str = "new", + include_table_files: bool = False, + continue_on_error: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> "MigrationResult": + """Index files in this entity for migration to a new storage location. + + This is the first step in migrating files to a new storage location. + After indexing, use `migrate_indexed_files` to perform the actual migration. + + Arguments: + dest_storage_location_id: The destination storage location ID. + db_path: Path to the SQLite database file for tracking migration state. + If not provided, a temporary directory will be used. 
The path + can be retrieved from the returned MigrationResult.db_path. + source_storage_location_ids: Optional list of source storage location IDs + to filter which files to migrate. If None, all files are indexed. + file_version_strategy: Strategy for handling file versions. Options: + 'new' (default) - create new versions, 'all' - migrate all versions, + 'latest' - only migrate latest version, 'skip' - skip if file exists. + include_table_files: Whether to include files attached to tables. + continue_on_error: Whether to continue indexing if an error occurs. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + A MigrationResult object containing indexing statistics and the database + path (accessible via result.db_path). + + Example: Indexing files for migration + Index files in a project for migration: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").get() + result = project.index_files_for_migration( + dest_storage_location_id=12345, + ) + print(f"Database path: {result.db_path}") + print(f"Indexed {result.counts_by_status}") + """ + return None + + def migrate_indexed_files( + self, + db_path: str, + *, + create_table_snapshots: bool = True, + continue_on_error: bool = False, + force: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> Optional["MigrationResult"]: + """Migrate files that have been indexed with `index_files_for_migration`. + + This is the second step in migrating files to a new storage location. + Files must first be indexed using `index_files_for_migration`. + + Arguments: + db_path: Path to the SQLite database file created by + `index_files_for_migration`. You can get this from the + MigrationResult.db_path returned by index_files_for_migration. + create_table_snapshots: Whether to create table snapshots before + migrating table files. + continue_on_error: Whether to continue migration if an error occurs. + force: Whether to force migration of files that have already been + migrated. Also bypasses interactive confirmation. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + A MigrationResult object containing migration statistics, or None + if the user declined the confirmation prompt. 
+ + Example: Migrating indexed files + Migrate previously indexed files: + + from synapseclient.models import Project + + import synapseclient + synapseclient.login() + + project = Project(id="syn123").get() + + # Index first + index_result = project.index_files_for_migration( + dest_storage_location_id=12345, + ) + + # Then migrate using the db_path from index result + result = project.migrate_indexed_files( + db_path=index_result.db_path, + force=True, # Skip interactive confirmation + ) + print(f"Migrated {result.counts_by_status}") + """ + return None diff --git a/synapseclient/models/protocols/storage_location_protocol.py b/synapseclient/models/protocols/storage_location_protocol.py new file mode 100644 index 000000000..e602daaa6 --- /dev/null +++ b/synapseclient/models/protocols/storage_location_protocol.py @@ -0,0 +1,159 @@ +"""Protocol for the specific methods of StorageLocation that have synchronous counterparts +generated at runtime.""" + +from typing import TYPE_CHECKING, Optional, Protocol, Tuple + +from synapseclient import Synapse + +if TYPE_CHECKING: + from synapseclient.models import Folder + from synapseclient.models.storage_location import StorageLocation + + +class StorageLocationSynchronousProtocol(Protocol): + """ + The protocol for methods that are asynchronous but also + have a synchronous counterpart that may also be called. + """ + + def store( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> "StorageLocation": + """Create this storage location in Synapse. Storage locations are immutable; + this always creates a new one. If a storage location with identical properties + already exists for this user, the existing one is returned (idempotent). + + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The StorageLocation object with server-assigned fields populated. + + Raises: + ValueError: If `storage_type` is not set. + + Example: Creating an external S3 storage location + Create a storage location backed by your own S3 bucket: + + from synapseclient.models import StorageLocation, StorageLocationType + + import synapseclient + synapseclient.login() + + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket="my-external-synapse-bucket", + base_key="path/within/bucket", + ).store() + + print(f"Storage location ID: {storage.storage_location_id}") + """ + return self + + def get( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> "StorageLocation": + """Retrieve this storage location from Synapse by its ID. Only the creator of + a StorageLocationSetting can retrieve it by its id. + + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The StorageLocation object populated with data from Synapse. + + Raises: + ValueError: If `storage_location_id` is not set. 
+ + Example: Retrieving a storage location + Retrieve a storage location by ID: + + from synapseclient.models import StorageLocation + + import synapseclient + synapseclient.login() + + storage = StorageLocation(storage_location_id=12345).get() + print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}") + """ + return self + + @classmethod + def setup_s3( + cls, + *, + parent: str, + folder_name: Optional[str] = None, + folder: Optional["Folder"] = None, + bucket_name: Optional[str] = None, + base_key: Optional[str] = None, + sts_enabled: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> Tuple["Folder", "StorageLocation"]: + """Convenience method to create a folder backed by S3 storage. This will: + + 1. Create or retrieve the folder + 2. Create the storage location setting + 3. Apply the storage location to the folder via project settings + + Arguments: + parent: The parent project or folder ID (e.g., "syn123"). + folder_name: Name for a new folder. Either `folder_name` or `folder` + must be provided. + folder: An existing Folder object or Synapse ID. Either `folder_name` + or `folder` must be provided. + bucket_name: The S3 bucket name. If None, uses Synapse default storage. + base_key: The base key (prefix) within the bucket. Optional. + sts_enabled: Whether to enable STS credentials for this storage location. + Default: False. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + A tuple of (Folder, StorageLocation). + + Raises: + ValueError: If neither `folder_name` nor `folder` is provided, or if both + are provided. + + Example: Creating an STS-enabled folder with external S3 storage + Create a folder with STS-enabled storage: + + from synapseclient.models import StorageLocation + + import synapseclient + synapseclient.login() + + folder, storage = StorageLocation.setup_s3( + folder_name="my-sts-folder", + parent="syn123", + bucket_name="my-external-synapse-bucket", + base_key="path/within/bucket", + sts_enabled=True, + ) + print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}") + + Example: Using an existing folder + Apply S3 storage to an existing folder: + + from synapseclient.models import StorageLocation, Folder + + import synapseclient + synapseclient.login() + + existing_folder = Folder(id="syn456").get() + folder, storage = StorageLocation.setup_s3( + folder=existing_folder, + bucket_name="my-bucket", + ) + """ + return None diff --git a/synapseclient/models/services/__init__.py b/synapseclient/models/services/__init__.py index d1e7227ca..fea05d199 100644 --- a/synapseclient/models/services/__init__.py +++ b/synapseclient/models/services/__init__.py @@ -1,3 +1,16 @@ +from synapseclient.models.services.migration import ( + index_files_for_migration_async, + migrate_indexed_files_async, +) +from synapseclient.models.services.migration_types import ( + MigrationEntry, + MigrationError, + MigrationKey, + MigrationResult, + MigrationSettings, + MigrationStatus, + MigrationType, +) from synapseclient.models.services.search import get_id from synapseclient.models.services.storable_entity import store_entity from synapseclient.models.services.storable_entity_components import ( @@ -5,4 +18,18 @@ store_entity_components, ) -__all__ = ["store_entity_components", "store_entity", "FailureStrategy", "get_id"] +__all__ = [ + "store_entity_components", + "store_entity", + "FailureStrategy", + "get_id", + 
"index_files_for_migration_async", + "migrate_indexed_files_async", + "MigrationResult", + "MigrationStatus", + "MigrationType", + "MigrationKey", + "MigrationEntry", + "MigrationSettings", + "MigrationError", +] diff --git a/synapseclient/models/services/migration.py b/synapseclient/models/services/migration.py new file mode 100644 index 000000000..0186e8b77 --- /dev/null +++ b/synapseclient/models/services/migration.py @@ -0,0 +1,1650 @@ +""" +Async migration service for migrating files between storage locations. + +This module provides native async implementations of the migration functionality, +replacing the threading-based approach in synapseutils.migrate_functions. +""" + +import asyncio +import collections.abc +import json +import logging +import os +import sys +import tempfile +import traceback +from typing import ( + TYPE_CHECKING, + Any, + AsyncGenerator, + Dict, + List, + Optional, + Set, + Tuple, + Union, +) + +from synapseclient.api.entity_services import get_children +from synapseclient.api.file_services import get_file_handle_for_download_async +from synapseclient.api.table_services import create_table_snapshot, get_columns +from synapseclient.core import utils +from synapseclient.core.constants import concrete_types +from synapseclient.core.upload.multipart_upload import ( + MAX_NUMBER_OF_PARTS, + multipart_copy, +) +from synapseclient.models.table_components import ( + AppendableRowSetRequest, + PartialRow, + PartialRowSet, + TableUpdateTransaction, +) + +from .migration_types import ( + IndexingError, + MigrationError, + MigrationKey, + MigrationResult, + MigrationSettings, + MigrationStatus, + MigrationType, +) + +if TYPE_CHECKING: + from synapseclient import Synapse + +# Default part size for multipart copy (100 MB) +DEFAULT_PART_SIZE = 100 * utils.MB + +# Batch size for database operations +BATCH_SIZE = 500 + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Temp Directory Helpers +# ============================================================================= + + +def _get_default_db_path(entity_id: str) -> str: + """Generate a default temp database path for migration tracking. + + Arguments: + entity_id: The Synapse entity ID being migrated. + + Returns: + Path to a SQLite database file in a temp directory. + """ + temp_dir = tempfile.mkdtemp(prefix="synapse_migration_") + return os.path.join(temp_dir, f"migration_{entity_id}.db") + + +# ============================================================================= +# Column Name Helpers (replaces legacy synapseclient.table functions) +# ============================================================================= + + +def _escape_column_name(column: Union[str, collections.abc.Mapping]) -> str: + """Escape a column name for use in a Synapse table query statement. + + Arguments: + column: A string column name or a dictionary with a 'name' key. + + Returns: + Escaped column name wrapped in double quotes. + """ + col_name = ( + column["name"] if isinstance(column, collections.abc.Mapping) else str(column) + ) + escaped_name = col_name.replace('"', '""') + return f'"{escaped_name}"' + + +def _join_column_names(columns: List[Any]) -> str: + """Join column names into a comma-delimited list for table queries. + + Arguments: + columns: A list of column names or column objects with 'name' keys. + + Returns: + Comma-separated string of escaped column names. 
+ """ + return ",".join(_escape_column_name(c) for c in columns) + + +# ============================================================================= +# Database Helper Functions (Synchronous - wrapped with asyncio.to_thread) +# ============================================================================= + + +def _ensure_schema(cursor) -> None: + """Ensure the SQLite database has the required schema.""" + # Settings table - stores JSON configuration + cursor.execute( + "CREATE TABLE IF NOT EXISTS migration_settings (settings TEXT NOT NULL)" + ) + + # Main migrations table + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS migrations ( + id TEXT NOT NULL, + type INTEGER NOT NULL, + version INTEGER NULL, + row_id INTEGER NULL, + col_id INTEGER NULL, + parent_id NULL, + status INTEGER NOT NULL, + exception TEXT NULL, + from_storage_location_id NULL, + from_file_handle_id TEXT NULL, + to_file_handle_id TEXT NULL, + file_size INTEGER NULL, + PRIMARY KEY (id, type, row_id, col_id, version) + ) + """ + ) + + # Indexes for common queries + cursor.execute("CREATE INDEX IF NOT EXISTS ix_status ON migrations(status)") + cursor.execute( + "CREATE INDEX IF NOT EXISTS ix_file_handle_ids " + "ON migrations(from_file_handle_id, to_file_handle_id)" + ) + + +def _initialize_database( + db_path: str, + root_id: str, + dest_storage_location_id: str, + source_storage_location_ids: List[str], + file_version_strategy: str, + include_table_files: bool, +) -> None: + """Initialize the migration database with schema and settings. + + Arguments: + db_path: Path to the SQLite database file. + root_id: The root entity ID being migrated. + dest_storage_location_id: Destination storage location ID. + source_storage_location_ids: List of source storage location IDs to filter. + file_version_strategy: Strategy for handling file versions. + include_table_files: Whether to include table-attached files. 
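+
+    Example: Initializing a migration database
+        A minimal illustrative sketch of how the indexing step might call this
+        helper; all IDs below are hypothetical:
+
+            db_path = _get_default_db_path("syn123")
+            _initialize_database(
+                db_path=db_path,
+                root_id="syn123",
+                dest_storage_location_id="12345",
+                source_storage_location_ids=["11111"],
+                file_version_strategy="new",
+                include_table_files=False,
+            )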
+ """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + _ensure_schema(cursor) + + # Check if settings already exist + existing = cursor.execute("SELECT settings FROM migration_settings").fetchone() + + settings = MigrationSettings( + root_id=root_id, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + file_version_strategy=file_version_strategy, + include_table_files=include_table_files, + ) + + if existing: + # Verify settings match + existing_settings = json.loads(existing[0]) + if existing_settings.get("root_id") != root_id: + raise ValueError( + f"Root entity ID mismatch: database has {existing_settings.get('root_id')}, " + f"but {root_id} was provided" + ) + if ( + existing_settings.get("dest_storage_location_id") + != dest_storage_location_id + ): + raise ValueError( + f"Destination storage location mismatch: database has " + f"{existing_settings.get('dest_storage_location_id')}, " + f"but {dest_storage_location_id} was provided" + ) + else: + # Insert new settings + settings_json = json.dumps( + { + "root_id": settings.root_id, + "dest_storage_location_id": settings.dest_storage_location_id, + "source_storage_location_ids": settings.source_storage_location_ids, + "file_version_strategy": settings.file_version_strategy, + "include_table_files": settings.include_table_files, + } + ) + cursor.execute( + "INSERT INTO migration_settings (settings) VALUES (?)", + (settings_json,), + ) + + conn.commit() + + +def _retrieve_index_settings(db_path: str) -> Optional[Dict[str, Any]]: + """Retrieve index settings from the database. + + Arguments: + db_path: Path to the SQLite database file. + + Returns: + Dictionary of settings or None if not found. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + _ensure_schema(cursor) + + row = cursor.execute("SELECT settings FROM migration_settings").fetchone() + if row: + return json.loads(row[0]) + return None + + +def _check_indexed(db_path: str, entity_id: str) -> bool: + """Check if an entity has already been indexed. + + Arguments: + db_path: Path to the SQLite database file. + entity_id: The entity ID to check. + + Returns: + True if the entity is already indexed, False otherwise. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + row = cursor.execute( + "SELECT 1 FROM migrations WHERE id = ? LIMIT 1", + (entity_id,), + ).fetchone() + return row is not None + + +def _mark_container_indexed( + db_path: str, + entity_id: str, + parent_id: Optional[str], + migration_type: MigrationType, +) -> None: + """Mark a container (Project or Folder) as indexed. + + Arguments: + db_path: Path to the SQLite database file. + entity_id: The entity ID. + parent_id: The parent entity ID. + migration_type: The type of container. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR IGNORE INTO migrations (id, type, parent_id, status) + VALUES (?, ?, ?, ?) + """, + ( + entity_id, + migration_type.value, + parent_id, + MigrationStatus.INDEXED.value, + ), + ) + conn.commit() + + +def _insert_file_migration( + db_path: str, + entity_id: str, + version: Optional[int], + parent_id: Optional[str], + from_storage_location_id: int, + from_file_handle_id: str, + file_size: int, + status: MigrationStatus, +) -> None: + """Insert a file migration entry. + + Arguments: + db_path: Path to the SQLite database file. 
+ entity_id: The file entity ID. + version: The file version (None for new version). + parent_id: The parent entity ID. + from_storage_location_id: Source storage location ID. + from_file_handle_id: Source file handle ID. + file_size: File size in bytes. + status: Migration status. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR IGNORE INTO migrations ( + id, type, version, parent_id, + from_storage_location_id, from_file_handle_id, + file_size, status + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + entity_id, + MigrationType.FILE.value, + version, + parent_id, + from_storage_location_id, + from_file_handle_id, + file_size, + status.value, + ), + ) + conn.commit() + + +def _insert_table_file_migration( + db_path: str, + entity_id: str, + row_id: int, + col_id: int, + row_version: int, + parent_id: Optional[str], + from_storage_location_id: int, + from_file_handle_id: str, + file_size: int, + status: MigrationStatus, +) -> None: + """Insert a table-attached file migration entry. + + Arguments: + db_path: Path to the SQLite database file. + entity_id: The table entity ID. + row_id: The table row ID. + col_id: The table column ID. + row_version: The row version. + parent_id: The parent entity ID. + from_storage_location_id: Source storage location ID. + from_file_handle_id: Source file handle ID. + file_size: File size in bytes. + status: Migration status. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR IGNORE INTO migrations ( + id, type, row_id, col_id, version, parent_id, + from_storage_location_id, from_file_handle_id, + file_size, status + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + entity_id, + MigrationType.TABLE_ATTACHED_FILE.value, + row_id, + col_id, + row_version, + parent_id, + from_storage_location_id, + from_file_handle_id, + file_size, + status.value, + ), + ) + conn.commit() + + +def _record_indexing_error( + db_path: str, + entity_id: str, + parent_id: Optional[str], + exception: Exception, +) -> None: + """Record an indexing error in the database. + + Arguments: + db_path: Path to the SQLite database file. + entity_id: The entity ID that failed. + parent_id: The parent entity ID. + exception: The exception that occurred. + """ + import sqlite3 + + tb_str = "".join( + traceback.format_exception(type(exception), exception, exception.__traceback__) + ) + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + cursor.execute( + """ + INSERT OR IGNORE INTO migrations ( + id, type, parent_id, status, exception + ) VALUES (?, ?, ?, ?, ?) + """, + ( + entity_id, + MigrationType.FILE.value, # Default type for errors + parent_id, + MigrationStatus.ERRORED.value, + tb_str, + ), + ) + conn.commit() + + +def _check_file_handle_exists(db_path: str, from_file_handle_id: str) -> Optional[str]: + """Check if a file handle has already been copied. + + Arguments: + db_path: Path to the SQLite database file. + from_file_handle_id: The source file handle ID. + + Returns: + The destination file handle ID if found, None otherwise. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + row = cursor.execute( + """ + SELECT to_file_handle_id FROM migrations + WHERE from_file_handle_id = ? 
AND to_file_handle_id IS NOT NULL + """, + (from_file_handle_id,), + ).fetchone() + return row[0] if row else None + + +def _query_migration_batch( + db_path: str, + last_id: str, + last_version: int, + last_row_id: int, + last_col_id: int, + pending_file_handles: Set[str], + completed_file_handles: Set[str], + limit: int, +) -> List[Dict[str, Any]]: + """Query the next batch of items to migrate. + + This matches the original synapseutils query logic: + - Forward progress through entities ordered by id, type, row_id, col_id, version + - Backtracking to pick up files with completed file handles that were skipped + + Arguments: + db_path: Path to the SQLite database file. + last_id: Last processed entity ID. + last_version: Last processed version. + last_row_id: Last processed row ID. + last_col_id: Last processed column ID. + pending_file_handles: Set of file handles currently being processed. + completed_file_handles: Set of file handles already completed. + limit: Maximum number of items to return. + + Returns: + List of migration entries as dictionaries. + """ + import sqlite3 + + if limit <= 0: + return [] + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + + file_type = MigrationType.FILE.value + table_type = MigrationType.TABLE_ATTACHED_FILE.value + indexed_status = MigrationStatus.INDEXED.value + + # Build the IN clauses for file handles + # We use string formatting for the IN clause since sqlite3 doesn't support array parameters + pending_in = ( + "('" + "','".join(pending_file_handles) + "')" + if pending_file_handles + else "('')" + ) + completed_in = ( + "('" + "','".join(completed_file_handles) + "')" + if completed_file_handles + else "('')" + ) + + # Match the original synapseutils query structure exactly + # This handles: + # 1. Forward progress: entities after the current position + # 2. Backtracking: entities before current position that share completed file handles + query = f""" + SELECT + id, + type, + version, + row_id, + col_id, + from_file_handle_id, + file_size + FROM migrations + WHERE + status = :indexed_status + AND ( + ( + ((id > :id AND type IN (:file_type, :table_type)) + OR (id = :id AND type = :file_type AND version IS NOT NULL AND version > :version) + OR (id = :id AND type = :table_type AND (row_id > :row_id OR (row_id = :row_id AND col_id > :col_id)))) + AND from_file_handle_id NOT IN {pending_in} + ) OR + ( + id <= :id + AND from_file_handle_id IN {completed_in} + ) + ) + ORDER BY + id, + type, + row_id, + col_id, + version + LIMIT :limit + """ + + params = { + "indexed_status": indexed_status, + "id": last_id, + "file_type": file_type, + "table_type": table_type, + "version": last_version, + "row_id": last_row_id, + "col_id": last_col_id, + "limit": limit, + } + + results = cursor.execute(query, params) + + batch = [] + for row in results: + batch.append( + { + "id": row[0], + "type": MigrationType(row[1]), + "version": row[2], + "row_id": row[3], + "col_id": row[4], + "from_file_handle_id": row[5], + "file_size": row[6], + } + ) + return batch + + +def _update_migration_success( + db_path: str, + key: MigrationKey, + to_file_handle_id: str, +) -> None: + """Update a migration entry as successful. + + Arguments: + db_path: Path to the SQLite database file. + key: The migration key. + to_file_handle_id: The destination file handle ID. + """ + import sqlite3 + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + + update_sql = """ + UPDATE migrations SET status = ?, to_file_handle_id = ? + WHERE id = ? AND type = ? 
+ """ + params = [ + MigrationStatus.MIGRATED.value, + to_file_handle_id, + key.id, + key.type.value, + ] + + if key.version is not None: + update_sql += " AND version = ?" + params.append(key.version) + else: + update_sql += " AND version IS NULL" + + if key.row_id is not None: + update_sql += " AND row_id = ?" + params.append(key.row_id) + + if key.col_id is not None: + update_sql += " AND col_id = ?" + params.append(key.col_id) + + cursor.execute(update_sql, tuple(params)) + conn.commit() + + +def _update_migration_error( + db_path: str, + key: MigrationKey, + exception: Exception, +) -> None: + """Update a migration entry with an error. + + Arguments: + db_path: Path to the SQLite database file. + key: The migration key. + exception: The exception that occurred. + """ + import sqlite3 + + tb_str = "".join( + traceback.format_exception(type(exception), exception, exception.__traceback__) + ) + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + + update_sql = """ + UPDATE migrations SET status = ?, exception = ? + WHERE id = ? AND type = ? + """ + params = [MigrationStatus.ERRORED.value, tb_str, key.id, key.type.value] + + if key.version is not None: + update_sql += " AND version = ?" + params.append(key.version) + else: + update_sql += " AND version IS NULL" + + if key.row_id is not None: + update_sql += " AND row_id = ?" + params.append(key.row_id) + + if key.col_id is not None: + update_sql += " AND col_id = ?" + params.append(key.col_id) + + cursor.execute(update_sql, tuple(params)) + conn.commit() + + +def _confirm_migration( + db_path: str, dest_storage_location_id: str, force: bool +) -> bool: + """Confirm migration with user if in interactive mode. + + Arguments: + db_path: Path to the SQLite database file. + dest_storage_location_id: Destination storage location ID. + force: Whether to skip confirmation. + + Returns: + True if migration should proceed, False otherwise. + """ + import sqlite3 + + if force: + return True + + with sqlite3.connect(db_path) as conn: + cursor = conn.cursor() + count = cursor.execute( + "SELECT count(*) FROM migrations WHERE status = ?", + (MigrationStatus.INDEXED.value,), + ).fetchone()[0] + + if count == 0: + logger.info("No items for migration.") + return False + + if sys.stdout.isatty(): + user_input = input( + f"{count} items for migration to {dest_storage_location_id}. Proceed? (y/n)? " + ) + return user_input.strip().lower() == "y" + else: + logger.info( + "%s items for migration. " + "force option not used, and console input not available to confirm migration, aborting. " + "Use the force option or run from an interactive shell to proceed with migration.", + count, + ) + return False + + +def _get_part_size(file_size: int) -> int: + """Calculate the part size for multipart copy. + + Arguments: + file_size: The file size in bytes. + + Returns: + The part size in bytes. + """ + import math + + # Ensure we don't exceed max parts + min_part_size = math.ceil(file_size / MAX_NUMBER_OF_PARTS) + return max(DEFAULT_PART_SIZE, min_part_size) + + +# ============================================================================= +# Storage Location Validation +# ============================================================================= + + +async def _verify_storage_location_ownership_async( + storage_location_id: str, + *, + synapse_client: "Synapse", +) -> None: + """Verify the user owns the destination storage location. + + Arguments: + storage_location_id: The storage location ID to verify. + synapse_client: The Synapse client. 
+ + Raises: + ValueError: If the user does not own the storage location. + """ + try: + await synapse_client.rest_get_async(f"/storageLocation/{storage_location_id}") + except Exception as ex: + raise ValueError( + f"Unable to verify ownership of storage location {storage_location_id}. " + f"You must be the creator of the destination storage location. Error: {ex}" + ) from ex + + +def _include_file_in_migration( + file_handle: Dict[str, Any], + source_storage_location_ids: List[str], + dest_storage_location_id: str, +) -> Optional[MigrationStatus]: + """Determine if a file should be included in migration. + + Only S3 file handles can be migrated. External URLs and other file handle types + are skipped. + + Arguments: + file_handle: The file handle metadata. + source_storage_location_ids: List of source storage locations to filter. + dest_storage_location_id: Destination storage location ID. + + Returns: + MigrationStatus if file should be included, None otherwise. + """ + # Only S3 file handles can be migrated + if file_handle.get("concreteType") != concrete_types.S3_FILE_HANDLE: + return None + + from_storage_location_id = str(file_handle.get("storageLocationId", 1)) + + # Check if file matches the migration criteria: + # - If source_storage_location_ids is specified, from_storage_location must be in it + # OR already at the destination + # - If not specified, include all files not already at destination + if source_storage_location_ids: + if ( + from_storage_location_id not in source_storage_location_ids + and from_storage_location_id != dest_storage_location_id + ): + return None + + # Already at destination - mark as already migrated + if from_storage_location_id == dest_storage_location_id: + return MigrationStatus.ALREADY_MIGRATED + + return MigrationStatus.INDEXED + + +# ============================================================================= +# Public API Functions +# ============================================================================= + + +async def index_files_for_migration_async( + entity_id: str, + dest_storage_location_id: str, + db_path: Optional[str] = None, + *, + source_storage_location_ids: Optional[List[str]] = None, + file_version_strategy: str = "new", + include_table_files: bool = False, + continue_on_error: bool = False, + synapse_client: Optional["Synapse"] = None, +) -> MigrationResult: + """Index files for migration to a new storage location. + + This is the first step in migrating files to a new storage location. + After indexing, use `migrate_indexed_files_async` to perform the actual migration. + + Arguments: + entity_id: The Synapse entity ID to migrate (Project, Folder, File, or Table). + dest_storage_location_id: The destination storage location ID. + db_path: Path to create SQLite database. If None, uses temp directory. + source_storage_location_ids: Optional list of source storage locations to filter. + file_version_strategy: Strategy for file versions: "new", "all", "latest", "skip". + include_table_files: Whether to include files attached to tables. + continue_on_error: Whether to continue on individual errors. + synapse_client: Optional Synapse client instance. + + Returns: + MigrationResult object for inspecting the index. 
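+
+    Example: Indexing a container for migration
+        A minimal illustrative sketch of calling this service function
+        directly; the entity and storage location IDs are hypothetical:
+
+            import asyncio
+            from synapseclient import Synapse
+            from synapseclient.models.services import (
+                index_files_for_migration_async,
+            )
+
+            syn = Synapse()
+            syn.login()
+
+            async def main():
+                result = await index_files_for_migration_async(
+                    entity_id="syn123",
+                    dest_storage_location_id="12345",
+                )
+                print(f"Database path: {result.db_path}")
+                print(f"Indexed {result.counts_by_status}")
+
+            asyncio.run(main())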
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + + # Validate parameters + valid_strategies = {"new", "all", "latest", "skip"} + if file_version_strategy not in valid_strategies: + raise ValueError( + f"Invalid file_version_strategy: {file_version_strategy}, " + f"must be one of {valid_strategies}" + ) + + if file_version_strategy == "skip" and not include_table_files: + raise ValueError( + "Skipping both file entities and table attached files, nothing to migrate" + ) + + # Convert to strings + dest_storage_location_id = str(dest_storage_location_id) + source_storage_location_ids = [str(s) for s in (source_storage_location_ids or [])] + + # Verify ownership + await _verify_storage_location_ownership_async( + storage_location_id=dest_storage_location_id, + synapse_client=client, + ) + + # Create database path if not provided + if db_path is None: + db_path = _get_default_db_path(entity_id) + + # Initialize database + await asyncio.to_thread( + _initialize_database, + db_path, + entity_id, + dest_storage_location_id, + source_storage_location_ids, + file_version_strategy, + include_table_files, + ) + + # Get entity and start indexing + entity = await client.get_async(entity_id, downloadFile=False) + + try: + await _index_entity_async( + entity=entity, + parent_id=None, + db_path=db_path, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + file_version_strategy=file_version_strategy, + include_table_files=include_table_files, + continue_on_error=continue_on_error, + synapse_client=client, + ) + except IndexingError as ex: + logger.exception( + "Aborted due to failure to index entity %s of type %s. " + "Use continue_on_error=True to skip individual failures.", + ex.entity_id, + ex.concrete_type, + ) + raise ex + + return MigrationResult(db_path=db_path, synapse_client=client) + + +async def migrate_indexed_files_async( + db_path: str, + *, + create_table_snapshots: bool = True, + continue_on_error: bool = False, + force: bool = False, + max_concurrent_copies: Optional[int] = None, + synapse_client: Optional["Synapse"] = None, +) -> Optional[MigrationResult]: + """Migrate files that have been indexed. + + This is the second step in migrating files to a new storage location. + Files must first be indexed using `index_files_for_migration_async`. + + Arguments: + db_path: Path to SQLite database created by index_files_for_migration_async. + create_table_snapshots: Whether to create table snapshots before migrating. + continue_on_error: Whether to continue on individual migration errors. + force: Whether to skip interactive confirmation. + max_concurrent_copies: Maximum concurrent file copy operations. + synapse_client: Optional Synapse client instance. + + Returns: + MigrationResult object or None if migration was aborted. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + + # Retrieve settings + settings = await asyncio.to_thread(_retrieve_index_settings, db_path) + if settings is None: + raise ValueError( + f"Unable to retrieve existing index settings from '{db_path}'. " + "Either this path does not represent a previously created migration index " + "or the file is corrupt." 
+ ) + + dest_storage_location_id = settings["dest_storage_location_id"] + + # Confirm migration + confirmed = await asyncio.to_thread( + _confirm_migration, db_path, dest_storage_location_id, force + ) + if not confirmed: + logger.info("Migration aborted.") + return None + + # Determine concurrency + max_concurrent = max_concurrent_copies or max(client.max_threads // 2, 1) + + # Execute migration + await _execute_migration_async( + db_path=db_path, + dest_storage_location_id=dest_storage_location_id, + create_table_snapshots=create_table_snapshots, + continue_on_error=continue_on_error, + max_concurrent=max_concurrent, + synapse_client=client, + ) + + return MigrationResult(db_path=db_path, synapse_client=client) + + +# ============================================================================= +# Indexing Implementation +# ============================================================================= + + +async def _index_entity_async( + entity: Any, + parent_id: Optional[str], + db_path: str, + dest_storage_location_id: str, + source_storage_location_ids: List[str], + file_version_strategy: str, + include_table_files: bool, + continue_on_error: bool, + *, + synapse_client: "Synapse", +) -> None: + """Recursively index an entity and its children. + + Arguments: + entity: The Synapse entity object. + parent_id: The parent entity ID. + db_path: Path to the SQLite database. + dest_storage_location_id: Destination storage location ID. + source_storage_location_ids: List of source storage locations to filter. + file_version_strategy: Strategy for file versions. + include_table_files: Whether to include table-attached files. + continue_on_error: Whether to continue on errors. + synapse_client: The Synapse client. + """ + entity_id = utils.id_of(entity) + concrete_type = utils.concrete_type_of(entity) + + # Check if already indexed + is_indexed = await asyncio.to_thread(_check_indexed, db_path, entity_id) + if is_indexed: + return + + try: + if concrete_type == concrete_types.FILE_ENTITY: + if file_version_strategy != "skip": + await _index_file_entity_async( + entity_id=entity_id, + parent_id=parent_id, + db_path=db_path, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + file_version_strategy=file_version_strategy, + synapse_client=synapse_client, + ) + + elif concrete_type == concrete_types.TABLE_ENTITY: + if include_table_files: + await _index_table_entity_async( + entity_id=entity_id, + parent_id=parent_id, + db_path=db_path, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + synapse_client=synapse_client, + ) + + elif concrete_type in ( + concrete_types.FOLDER_ENTITY, + concrete_types.PROJECT_ENTITY, + ): + await _index_container_async( + entity_id=entity_id, + parent_id=parent_id, + db_path=db_path, + concrete_type=concrete_type, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + file_version_strategy=file_version_strategy, + include_table_files=include_table_files, + continue_on_error=continue_on_error, + synapse_client=synapse_client, + ) + + except IndexingError: + raise + except Exception as ex: + if continue_on_error: + logger.warning("Error indexing entity %s: %s", entity_id, ex) + await asyncio.to_thread( + _record_indexing_error, db_path, entity_id, parent_id, ex + ) + else: + raise IndexingError(entity_id, concrete_type) from ex + + +async def _index_file_entity_async( + entity_id: str, + 
parent_id: Optional[str], + db_path: str, + dest_storage_location_id: str, + source_storage_location_ids: List[str], + file_version_strategy: str, + *, + synapse_client: "Synapse", +) -> None: + """Index a file entity for migration. + + Arguments: + entity_id: The file entity ID. + parent_id: The parent entity ID. + db_path: Path to the SQLite database. + dest_storage_location_id: Destination storage location ID. + source_storage_location_ids: List of source storage locations to filter. + file_version_strategy: Strategy for file versions. + synapse_client: The Synapse client. + """ + logger.info("Indexing file entity %s", entity_id) + + entity_versions: List[Tuple[Any, Optional[int]]] = [] + + if file_version_strategy == "new": + entity = await synapse_client.get_async(entity_id, downloadFile=False) + entity_versions.append((entity, None)) + + elif file_version_strategy == "all": + # Get all versions + async for version in _get_version_numbers_async(entity_id, synapse_client): + entity = await synapse_client.get_async( + entity_id, version=version, downloadFile=False + ) + entity_versions.append((entity, version)) + + elif file_version_strategy == "latest": + entity = await synapse_client.get_async(entity_id, downloadFile=False) + entity_versions.append((entity, entity.versionNumber)) + + for entity, version in entity_versions: + file_handle = entity._file_handle + status = _include_file_in_migration( + file_handle, source_storage_location_ids, dest_storage_location_id + ) + if status: + await asyncio.to_thread( + _insert_file_migration, + db_path, + entity_id, + version, + parent_id, + file_handle["storageLocationId"], + entity.dataFileHandleId, + file_handle["contentSize"], + status, + ) + + +async def _get_version_numbers_async( + entity_id: str, + synapse_client: "Synapse", +) -> AsyncGenerator[int, None]: + """Get all version numbers for an entity. + + Arguments: + entity_id: The entity ID. + synapse_client: The Synapse client. + + Yields: + Version numbers. + """ + offset = 0 + limit = 100 + + while True: + response = await synapse_client.rest_get_async( + f"/entity/{entity_id}/version?offset={offset}&limit={limit}" + ) + results = response.get("results", []) + + for version_info in results: + yield version_info["versionNumber"] + + if len(results) < limit: + break + offset += limit + + +async def _index_table_entity_async( + entity_id: str, + parent_id: Optional[str], + db_path: str, + dest_storage_location_id: str, + source_storage_location_ids: List[str], + *, + synapse_client: "Synapse", +) -> None: + """Index a table entity's file attachments for migration. + + Arguments: + entity_id: The table entity ID. + parent_id: The parent entity ID. + db_path: Path to the SQLite database. + dest_storage_location_id: Destination storage location ID. + source_storage_location_ids: List of source storage locations to filter. + synapse_client: The Synapse client. 
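+
+    Example: Query issued for table-attached files
+        An illustrative sketch of the table query this helper builds. For a
+        table with two FILEHANDLEID columns named "raw" and "processed"
+        (hypothetical names), the generated statement would be roughly:
+
+            SELECT "raw","processed" FROM syn123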
+ """ + logger.info("Indexing table entity %s", entity_id) + + # Get file handle columns using the async API + columns = await get_columns(table_id=entity_id, synapse_client=synapse_client) + file_handle_columns = [c for c in columns if c.column_type == "FILEHANDLEID"] + + if not file_handle_columns: + return + + # Query table for file handles using local helper + file_column_select = _join_column_names(file_handle_columns) + + # tableQuery is still a synchronous method on the Synapse client + results = await asyncio.to_thread( + synapse_client.tableQuery, + f"SELECT {file_column_select} FROM {entity_id}", + ) + + for row in results: + row_id, row_version = row[:2] + file_handle_ids = row[2:] + + for i, file_handle_id in enumerate(file_handle_ids): + if not file_handle_id: + continue + + col_id = file_handle_columns[i].id + + # Get file handle metadata using the async API + fh_response = await get_file_handle_for_download_async( + file_handle_id=str(file_handle_id), + synapse_id=entity_id, + entity_type="TableEntity", + synapse_client=synapse_client, + ) + file_handle = fh_response["fileHandle"] + + status = _include_file_in_migration( + file_handle, source_storage_location_ids, dest_storage_location_id + ) + if status: + await asyncio.to_thread( + _insert_table_file_migration, + db_path, + entity_id, + row_id, + int(col_id), + row_version, + parent_id, + file_handle["storageLocationId"], + file_handle_id, + file_handle["contentSize"], + status, + ) + + +async def _index_container_async( + entity_id: str, + parent_id: Optional[str], + db_path: str, + concrete_type: str, + dest_storage_location_id: str, + source_storage_location_ids: List[str], + file_version_strategy: str, + include_table_files: bool, + continue_on_error: bool, + *, + synapse_client: "Synapse", +) -> None: + """Index a container (Project or Folder) and its children. + + Arguments: + entity_id: The container entity ID. + parent_id: The parent entity ID. + db_path: Path to the SQLite database. + concrete_type: The concrete type of the container. + dest_storage_location_id: Destination storage location ID. + source_storage_location_ids: List of source storage locations to filter. + file_version_strategy: Strategy for file versions. + include_table_files: Whether to include table-attached files. + continue_on_error: Whether to continue on errors. + synapse_client: The Synapse client. 
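+
+    Note:
+        Children are indexed with bounded concurrency (a semaphore of 10), and
+        the container itself is only marked as indexed after all of its
+        children have been processed, so an interrupted run will revisit a
+        partially indexed container.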
+ """ + logger.info("Indexing container %s", entity_id) + + # Determine included types + include_types = [] + if file_version_strategy != "skip": + include_types.extend(["folder", "file"]) + if include_table_files: + include_types.append("table") + + # Get children using the async API + children = [] + async for child in get_children( + parent=entity_id, + include_types=include_types, + synapse_client=synapse_client, + ): + children.append(child) + + # Use bounded concurrency for indexing children + semaphore = asyncio.Semaphore(10) + + async def index_child(child: Dict[str, Any]) -> None: + async with semaphore: + child_entity = await synapse_client.get_async( + child["id"], downloadFile=False + ) + await _index_entity_async( + entity=child_entity, + parent_id=entity_id, + db_path=db_path, + dest_storage_location_id=dest_storage_location_id, + source_storage_location_ids=source_storage_location_ids, + file_version_strategy=file_version_strategy, + include_table_files=include_table_files, + continue_on_error=continue_on_error, + synapse_client=synapse_client, + ) + + # Process children with as_completed for progress tracking + tasks = [asyncio.create_task(index_child(child)) for child in children] + for task in asyncio.as_completed(tasks): + await task + + # Mark container as indexed + migration_type = ( + MigrationType.PROJECT + if concrete_type == concrete_types.PROJECT_ENTITY + else MigrationType.FOLDER + ) + await asyncio.to_thread( + _mark_container_indexed, db_path, entity_id, parent_id, migration_type + ) + + +# ============================================================================= +# Migration Execution +# ============================================================================= + + +async def _execute_migration_async( + db_path: str, + dest_storage_location_id: str, + create_table_snapshots: bool, + continue_on_error: bool, + max_concurrent: int, + *, + synapse_client: "Synapse", +) -> None: + """Execute the actual file migration. + + Arguments: + db_path: Path to the SQLite database. + dest_storage_location_id: Destination storage location ID. + create_table_snapshots: Whether to create table snapshots. + continue_on_error: Whether to continue on errors. + max_concurrent: Maximum concurrent operations. + synapse_client: The Synapse client. 
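+
+    Note:
+        Pending items are read from the tracking database in keyset-paginated
+        batches (ordered by id, version, row and column) and migrated with at
+        most max_concurrent copies in flight. Each completed task updates the
+        database immediately, so progress is preserved if the run stops early.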
+ """ + pending_file_handles: Set[str] = set() + completed_file_handles: Set[str] = set() + pending_keys: Set[MigrationKey] = set() + table_snapshots_created: Set[str] = set() + + semaphore = asyncio.Semaphore(max_concurrent) + active_tasks: Set[asyncio.Task] = set() + + last_id = "" + last_version = -1 + last_row_id = -1 + last_col_id = -1 + + while True: + # Query next batch + batch = await asyncio.to_thread( + _query_migration_batch, + db_path, + last_id, + last_version, + last_row_id, + last_col_id, + pending_file_handles, + completed_file_handles, + min(BATCH_SIZE, max_concurrent - len(active_tasks)), + ) + + if not batch and not active_tasks: + break + + # Process batch items + for item in batch: + key = MigrationKey( + id=item["id"], + type=item["type"], + version=item["version"], + row_id=item["row_id"], + col_id=item["col_id"], + ) + + if key in pending_keys: + continue + + pending_keys.add(key) + from_file_handle_id = item["from_file_handle_id"] + + # Check for existing copy + to_file_handle_id = await asyncio.to_thread( + _check_file_handle_exists, db_path, from_file_handle_id + ) + + if not to_file_handle_id: + pending_file_handles.add(from_file_handle_id) + + # Create table snapshot if needed using the async API + if ( + item["type"] == MigrationType.TABLE_ATTACHED_FILE + and create_table_snapshots + and item["id"] not in table_snapshots_created + ): + await create_table_snapshot( + table_id=item["id"], + synapse_client=synapse_client, + ) + table_snapshots_created.add(item["id"]) + + # Create migration task + task = asyncio.create_task( + _migrate_item_async( + key=key, + from_file_handle_id=from_file_handle_id, + to_file_handle_id=to_file_handle_id, + file_size=item["file_size"] or 0, + dest_storage_location_id=dest_storage_location_id, + semaphore=semaphore, + synapse_client=synapse_client, + ) + ) + active_tasks.add(task) + + # Update tracking for next batch + last_id = item["id"] + last_version = item["version"] if item["version"] is not None else -1 + last_row_id = item["row_id"] if item["row_id"] is not None else -1 + last_col_id = item["col_id"] if item["col_id"] is not None else -1 + + # Wait for tasks if at capacity or end of batch + if active_tasks and ( + len(active_tasks) >= max_concurrent or len(batch) < BATCH_SIZE + ): + done, active_tasks = await asyncio.wait( + active_tasks, + return_when=asyncio.FIRST_COMPLETED, + ) + + for completed_task in done: + try: + result = completed_task.result() + key = result["key"] + from_fh_id = result["from_file_handle_id"] + to_fh_id = result["to_file_handle_id"] + + # Update database + await asyncio.to_thread( + _update_migration_success, db_path, key, to_fh_id + ) + + completed_file_handles.add(from_fh_id) + pending_file_handles.discard(from_fh_id) + pending_keys.discard(key) + + except Exception as ex: + if hasattr(ex, "key"): + key = ex.key + await asyncio.to_thread( + _update_migration_error, db_path, key, ex.__cause__ or ex + ) + pending_keys.discard(key) + + if not continue_on_error: + # Cancel remaining tasks + for task in active_tasks: + task.cancel() + raise + + # Wait for any remaining tasks + if active_tasks: + done, _ = await asyncio.wait(active_tasks) + for completed_task in done: + try: + result = completed_task.result() + await asyncio.to_thread( + _update_migration_success, + db_path, + result["key"], + result["to_file_handle_id"], + ) + except Exception as ex: + if hasattr(ex, "key"): + await asyncio.to_thread( + _update_migration_error, db_path, ex.key, ex.__cause__ or ex + ) + if not continue_on_error: + 
raise + + +async def _migrate_item_async( + key: MigrationKey, + from_file_handle_id: str, + to_file_handle_id: Optional[str], + file_size: int, + dest_storage_location_id: str, + semaphore: asyncio.Semaphore, + *, + synapse_client: "Synapse", +) -> Dict[str, Any]: + """Migrate a single item. + + Arguments: + key: The migration key. + from_file_handle_id: Source file handle ID. + to_file_handle_id: Destination file handle ID (if already copied). + file_size: File size in bytes. + dest_storage_location_id: Destination storage location ID. + semaphore: Concurrency semaphore. + synapse_client: The Synapse client. + + Returns: + Dictionary with key, from_file_handle_id, to_file_handle_id. + """ + async with semaphore: + try: + # Copy file handle if needed + if not to_file_handle_id: + source_association = { + "fileHandleId": from_file_handle_id, + "associateObjectId": key.id, + "associateObjectType": ( + "FileEntity" + if key.type == MigrationType.FILE + else "TableEntity" + ), + } + + # Use thread for multipart_copy (it uses threading internally) + to_file_handle_id = await asyncio.to_thread( + multipart_copy, + synapse_client, + source_association, + dest_storage_location_id, + part_size=_get_part_size(file_size), + ) + + # Update entity with new file handle + if key.type == MigrationType.FILE: + if key.version is None: + await _create_new_file_version_async( + entity_id=key.id, + to_file_handle_id=to_file_handle_id, + synapse_client=synapse_client, + ) + else: + await _update_file_version_async( + entity_id=key.id, + version=key.version, + from_file_handle_id=from_file_handle_id, + to_file_handle_id=to_file_handle_id, + synapse_client=synapse_client, + ) + elif key.type == MigrationType.TABLE_ATTACHED_FILE: + await _update_table_file_async( + entity_id=key.id, + row_id=key.row_id, + col_id=key.col_id, + to_file_handle_id=to_file_handle_id, + synapse_client=synapse_client, + ) + + return { + "key": key, + "from_file_handle_id": from_file_handle_id, + "to_file_handle_id": to_file_handle_id, + } + + except Exception as ex: + error = MigrationError(key, from_file_handle_id, to_file_handle_id) + error.__cause__ = ex + raise error + + +async def _create_new_file_version_async( + entity_id: str, + to_file_handle_id: str, + *, + synapse_client: "Synapse", +) -> None: + """Create a new version of a file entity with the new file handle. + + Arguments: + entity_id: The file entity ID. + to_file_handle_id: The new file handle ID. + synapse_client: The Synapse client. + """ + entity = await synapse_client.get_async(entity_id, downloadFile=False) + entity.dataFileHandleId = to_file_handle_id + await synapse_client.store_async(entity) + + +async def _update_file_version_async( + entity_id: str, + version: int, + from_file_handle_id: str, + to_file_handle_id: str, + *, + synapse_client: "Synapse", +) -> None: + """Update an existing file version's file handle. + + Arguments: + entity_id: The file entity ID. + version: The version number. + from_file_handle_id: The original file handle ID. + to_file_handle_id: The new file handle ID. + synapse_client: The Synapse client. + """ + await synapse_client.rest_put_async( + f"/entity/{entity_id}/version/{version}/filehandle", + body=json.dumps( + { + "oldFileHandleId": from_file_handle_id, + "newFileHandleId": to_file_handle_id, + } + ), + ) + + +async def _update_table_file_async( + entity_id: str, + row_id: int, + col_id: int, + to_file_handle_id: str, + *, + synapse_client: "Synapse", +) -> None: + """Update a table cell with a new file handle. 
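+
+    The change is expressed as a partial row update targeting only the single
+    cell, and is submitted through a table update transaction rather than
+    rewriting the whole row.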
+ + Arguments: + entity_id: The table entity ID. + row_id: The row ID. + col_id: The column ID. + to_file_handle_id: The new file handle ID. + synapse_client: The Synapse client. + """ + # Create the partial row update using new OOP models + partial_row = PartialRow( + row_id=str(row_id), + values=[{"key": str(col_id), "value": to_file_handle_id}], + ) + partial_row_set = PartialRowSet( + table_id=entity_id, + rows=[partial_row], + ) + appendable_request = AppendableRowSetRequest( + entity_id=entity_id, + to_append=partial_row_set, + ) + + # Execute the update using TableUpdateTransaction + transaction = TableUpdateTransaction( + entity_id=entity_id, + changes=[appendable_request], + ) + await transaction.send_job_and_wait_async(synapse_client=synapse_client) diff --git a/synapseclient/models/services/migration_types.py b/synapseclient/models/services/migration_types.py new file mode 100644 index 000000000..a20cc008d --- /dev/null +++ b/synapseclient/models/services/migration_types.py @@ -0,0 +1,371 @@ +""" +Data classes and enums for the async migration service. + +These types are used to track the state of file migrations between storage locations. +""" + +import asyncio +import csv +from dataclasses import dataclass, field +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional + +from synapseclient.core.constants import concrete_types + +if TYPE_CHECKING: + from synapseclient import Synapse + + +class MigrationStatus(Enum): + """Status of a migration entry in the tracking database.""" + + INDEXED = 1 + """The file has been indexed and is ready to be migrated.""" + + MIGRATED = 2 + """The file has been successfully migrated to the new storage location.""" + + ALREADY_MIGRATED = 3 + """The file was already at the destination storage location.""" + + ERRORED = 4 + """An error occurred during indexing or migration.""" + + +class MigrationType(Enum): + """Type of entity being tracked in the migration database.""" + + PROJECT = 1 + """A project container (used for tracking indexed containers).""" + + FOLDER = 2 + """A folder container (used for tracking indexed containers).""" + + FILE = 3 + """A file entity.""" + + TABLE_ATTACHED_FILE = 4 + """A file attached to a table column.""" + + @classmethod + def from_concrete_type(cls, concrete_type: str) -> "MigrationType": + """Convert a Synapse concrete type string to a MigrationType. + + Arguments: + concrete_type: The concrete type string from Synapse API. + + Returns: + The corresponding MigrationType enum value. + + Raises: + ValueError: If the concrete type is not recognized. + """ + if concrete_type == concrete_types.PROJECT_ENTITY: + return cls.PROJECT + elif concrete_type == concrete_types.FOLDER_ENTITY: + return cls.FOLDER + elif concrete_type == concrete_types.FILE_ENTITY: + return cls.FILE + elif concrete_type == concrete_types.TABLE_ENTITY: + return cls.TABLE_ATTACHED_FILE + + raise ValueError(f"Unhandled concrete type: {concrete_type}") + + +@dataclass +class MigrationKey: + """Unique identifier for a migration entry in the tracking database. + + Attributes: + id: The Synapse entity ID. + type: The type of entity being migrated. + version: The file version number (None for new versions or containers). + row_id: The table row ID (for table attached files). + col_id: The table column ID (for table attached files). 
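+
+    Example: Keys for tracked items (illustrative)
+        A plain file version and a table-attached file would be keyed roughly
+        as follows:
+
+            MigrationKey(id="syn123", type=MigrationType.FILE, version=3)
+            MigrationKey(
+                id="syn456",
+                type=MigrationType.TABLE_ATTACHED_FILE,
+                row_id=1,
+                col_id=42,
+            )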
+ """ + + id: str + type: MigrationType + version: Optional[int] = None + row_id: Optional[int] = None + col_id: Optional[int] = None + + def __hash__(self) -> int: + return hash((self.id, self.type, self.version, self.row_id, self.col_id)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, MigrationKey): + return False + return ( + self.id == other.id + and self.type == other.type + and self.version == other.version + and self.row_id == other.row_id + and self.col_id == other.col_id + ) + + +@dataclass +class MigrationEntry: + """A single migration entry with full details. + + Attributes: + key: The unique identifier for this migration entry. + parent_id: The parent entity ID. + from_storage_location_id: The original storage location ID. + from_file_handle_id: The original file handle ID. + to_file_handle_id: The new file handle ID after migration. + file_size: The file size in bytes. + status: The current migration status. + exception: Stack trace if an error occurred. + """ + + key: MigrationKey + parent_id: Optional[str] = None + from_storage_location_id: Optional[int] = None + from_file_handle_id: Optional[str] = None + to_file_handle_id: Optional[str] = None + file_size: Optional[int] = None + status: MigrationStatus = MigrationStatus.INDEXED + exception: Optional[str] = None + + +@dataclass +class MigrationSettings: + """Settings for a migration index stored in the database. + + Attributes: + root_id: The root entity ID being migrated. + dest_storage_location_id: The destination storage location ID. + source_storage_location_ids: List of source storage location IDs to filter. + file_version_strategy: Strategy for handling file versions. + include_table_files: Whether to include files attached to tables. + """ + + root_id: str + dest_storage_location_id: str + source_storage_location_ids: List[str] = field(default_factory=list) + file_version_strategy: str = "new" + include_table_files: bool = False + + +@dataclass +class MigrationResult: + """Result of a migration operation - proxy to the SQLite tracking database. + + This class provides methods to query the migration database for status counts, + individual migration entries, and CSV export. + + Attributes: + db_path: Path to the SQLite database file. + synapse_client: Optional Synapse client for column name lookups. + """ + + db_path: str + synapse_client: Optional["Synapse"] = None + + @property + def counts_by_status(self) -> Dict[str, int]: + """Get counts by migration status (synchronous). + + Returns: + Dictionary mapping status names to counts. + """ + return self.get_counts_by_status() + + def get_counts_by_status(self) -> Dict[str, int]: + """Get counts by migration status (synchronous). + + Returns: + Dictionary mapping status names to counts. + """ + import sqlite3 + + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + + # Only count FILE and TABLE_ATTACHED_FILE entries + result = cursor.execute( + "SELECT status, count(*) FROM migrations " + "WHERE type IN (?, ?) GROUP BY status", + (MigrationType.FILE.value, MigrationType.TABLE_ATTACHED_FILE.value), + ) + + counts = {status.name: 0 for status in MigrationStatus} + for row in result: + status_value = row[0] + count = row[1] + counts[MigrationStatus(status_value).name] = count + + return counts + + async def get_counts_by_status_async(self) -> Dict[str, int]: + """Get counts by migration status (asynchronous). + + Returns: + Dictionary mapping status names to counts. 
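+
+        Example: Checking progress (illustrative sketch)
+            Assuming `result` is the MigrationResult returned after indexing
+            or migration, the per-status counts can be inspected like this:
+
+                counts = await result.get_counts_by_status_async()
+                print(counts["MIGRATED"], counts["ERRORED"])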
+ """ + return await asyncio.to_thread(self.get_counts_by_status) + + def get_migrations(self) -> Iterator[Dict[str, Any]]: + """Iterate over all migration entries (synchronous). + + Yields: + Dictionary for each migration entry with keys: + id, type, version, row_id, col_name, from_storage_location_id, + from_file_handle_id, to_file_handle_id, file_size, status, exception. + """ + import sqlite3 + + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + + batch_size = 500 + rowid = -1 + column_names_cache: Dict[int, str] = {} + + while True: + results = cursor.execute( + """ + SELECT + rowid, + id, + type, + version, + row_id, + col_id, + from_storage_location_id, + from_file_handle_id, + to_file_handle_id, + file_size, + status, + exception + FROM migrations + WHERE + rowid > ? + AND type IN (?, ?) + ORDER BY rowid + LIMIT ? + """, + ( + rowid, + MigrationType.FILE.value, + MigrationType.TABLE_ATTACHED_FILE.value, + batch_size, + ), + ) + + rows = results.fetchall() + if not rows: + break + + for row in rows: + rowid = row[0] + col_id = row[5] + + # Resolve column name if needed + col_name = None + if col_id is not None and self.synapse_client: + if col_id not in column_names_cache: + try: + col_info = self.synapse_client.restGET( + f"/column/{col_id}" + ) + column_names_cache[col_id] = col_info.get("name", "") + except Exception: + column_names_cache[col_id] = "" + col_name = column_names_cache[col_id] + + yield { + "id": row[1], + "type": ( + "file" if row[2] == MigrationType.FILE.value else "table" + ), + "version": row[3], + "row_id": row[4], + "col_name": col_name, + "from_storage_location_id": row[6], + "from_file_handle_id": row[7], + "to_file_handle_id": row[8], + "file_size": row[9], + "status": MigrationStatus(row[10]).name, + "exception": row[11], + } + + async def get_migrations_async(self) -> List[Dict[str, Any]]: + """Get all migration entries (asynchronous). + + Returns: + List of dictionaries for each migration entry. + """ + # Convert to list since generators can't be returned from to_thread + return await asyncio.to_thread(lambda: list(self.get_migrations())) + + def as_csv(self, path: str) -> None: + """Export migration results to a CSV file (synchronous). + + Arguments: + path: Path to write the CSV file. + """ + fieldnames = [ + "id", + "type", + "version", + "row_id", + "col_name", + "from_storage_location_id", + "from_file_handle_id", + "to_file_handle_id", + "file_size", + "status", + "exception", + ] + + with open(path, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for migration in self.get_migrations(): + writer.writerow(migration) + + async def as_csv_async(self, path: str) -> None: + """Export migration results to a CSV file (asynchronous). + + Arguments: + path: Path to write the CSV file. + """ + await asyncio.to_thread(self.as_csv, path) + + +class MigrationError(Exception): + """Error during a migration operation. + + Attributes: + key: The migration key that failed. + from_file_handle_id: The source file handle ID. + to_file_handle_id: The destination file handle ID (if partially complete). + """ + + def __init__( + self, + key: MigrationKey, + from_file_handle_id: str, + to_file_handle_id: Optional[str] = None, + ): + self.key = key + self.from_file_handle_id = from_file_handle_id + self.to_file_handle_id = to_file_handle_id + super().__init__(f"Migration failed for {key.id}") + + +class IndexingError(Exception): + """Error during an indexing operation. 
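+
+    Raised while walking the entity hierarchy when an entity cannot be indexed
+    and continue_on_error is not set; the original exception is attached as
+    the cause.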
+ + Attributes: + entity_id: The entity ID that failed to index. + concrete_type: The concrete type of the entity. + """ + + def __init__(self, entity_id: str, concrete_type: str): + self.entity_id = entity_id + self.concrete_type = concrete_type + super().__init__(f"Indexing failed for {entity_id} ({concrete_type})") diff --git a/synapseclient/models/storage_location.py b/synapseclient/models/storage_location.py new file mode 100644 index 000000000..664276855 --- /dev/null +++ b/synapseclient/models/storage_location.py @@ -0,0 +1,600 @@ +"""StorageLocation model for managing storage location settings in Synapse.""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +from synapseclient import Synapse +from synapseclient.api.storage_location_services import ( + create_storage_location_setting, + get_storage_location_setting, +) +from synapseclient.core.async_utils import async_to_sync, otel_trace_method +from synapseclient.models.protocols.storage_location_protocol import ( + StorageLocationSynchronousProtocol, +) + +if TYPE_CHECKING: + from synapseclient.models import Folder + + +class StorageLocationType(str, Enum): + """Enumeration of storage location types supported by Synapse. + + Each type maps to a specific concreteType suffix in the REST API. + + Attributes: + SYNAPSE_S3: Synapse-managed S3 storage (default). + EXTERNAL_S3: User-owned Amazon S3 bucket accessed by Synapse. + EXTERNAL_GOOGLE_CLOUD: User-owned Google Cloud Storage bucket. + EXTERNAL_SFTP: External SFTP server not accessed by Synapse. + EXTERNAL_OBJECT_STORE: S3-like bucket (e.g., AWS S3 or OpenStack) not + accessed by Synapse. + PROXY: A proxy server that controls access to storage. + """ + + SYNAPSE_S3 = "S3StorageLocationSetting" + EXTERNAL_S3 = "ExternalS3StorageLocationSetting" + EXTERNAL_GOOGLE_CLOUD = "ExternalGoogleCloudStorageLocationSetting" + EXTERNAL_SFTP = "ExternalStorageLocationSetting" + EXTERNAL_OBJECT_STORE = "ExternalObjectStorageLocationSetting" + PROXY = "ProxyStorageLocationSettings" + + +class UploadType(str, Enum): + """Enumeration of upload types for storage locations. + + Attributes: + S3: Amazon S3 compatible upload. + GOOGLE_CLOUD_STORAGE: Google Cloud Storage upload. + SFTP: SFTP upload. + HTTPS: HTTPS upload (typically used with proxy storage). + NONE: No upload type specified. + """ + + S3 = "S3" + GOOGLE_CLOUD_STORAGE = "GOOGLECLOUDSTORAGE" + SFTP = "SFTP" + HTTPS = "HTTPS" + NONE = "NONE" + + +# Mapping from StorageLocationType to default UploadType +_STORAGE_TYPE_TO_UPLOAD_TYPE: Dict[StorageLocationType, UploadType] = { + StorageLocationType.SYNAPSE_S3: UploadType.S3, + StorageLocationType.EXTERNAL_S3: UploadType.S3, + StorageLocationType.EXTERNAL_GOOGLE_CLOUD: UploadType.GOOGLE_CLOUD_STORAGE, + StorageLocationType.EXTERNAL_SFTP: UploadType.SFTP, + StorageLocationType.EXTERNAL_OBJECT_STORE: UploadType.S3, + StorageLocationType.PROXY: UploadType.HTTPS, +} + +# Mapping from concreteType suffix to StorageLocationType +_CONCRETE_TYPE_TO_STORAGE_TYPE: Dict[str, StorageLocationType] = { + storage_type.value: storage_type for storage_type in StorageLocationType +} + + +@dataclass() +@async_to_sync +class StorageLocation(StorageLocationSynchronousProtocol): + """A storage location setting describes where files are uploaded to and + downloaded from via Synapse. 
Storage location settings may be created for + external locations, such as user-owned Amazon S3 buckets, Google Cloud + Storage buckets, SFTP servers, or proxy storage. + + Attributes: + storage_location_id: (Read Only) The unique ID for this storage location, + assigned by the server on creation. + storage_type: The type of storage location. Required when creating a new + storage location via `store()`. Determines the `concreteType` sent to + the Synapse REST API. + banner: The banner text to display to a user every time a file is uploaded. + This field is optional. + description: A description of the storage location. This description is + shown when a user has to choose which upload destination to use. + + Attributes: + bucket: The name of the S3 or Google Cloud Storage bucket. Applicable to + SYNAPSE_S3, EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and + EXTERNAL_OBJECT_STORE types. + base_key: The optional base key (prefix/folder) within the bucket. + Applicable to SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types. + sts_enabled: Whether STS (AWS Security Token Service) is enabled on this + storage location. Applicable to SYNAPSE_S3 and EXTERNAL_S3 types. + endpoint_url: The endpoint URL of the S3 service. Applicable to + EXTERNAL_S3 (default: https://s3.amazonaws.com) and + EXTERNAL_OBJECT_STORE types. + + Attributes: + url: The base URL for uploading to the external destination. Applicable to + EXTERNAL_SFTP type. + supports_subfolders: Whether the destination supports creating subfolders + under the base url. Applicable to EXTERNAL_SFTP type. Default: False. + + Attributes: + proxy_url: The HTTPS URL of the proxy used for upload and download. + Applicable to PROXY type. + secret_key: The encryption key used to sign all pre-signed URLs used to + communicate with the proxy. Applicable to PROXY type. + benefactor_id: An Entity ID (such as a Project ID). When set, any user with + the 'create' permission on the given benefactorId will be allowed to + create ProxyFileHandle using its storage location ID. Applicable to + PROXY type. + + Attributes: + upload_type: (Read Only) The upload type for this storage location. + Automatically derived from `storage_type`. + etag: (Read Only) Synapse employs an Optimistic Concurrency Control (OCC) + scheme. The E-Tag changes every time the setting is updated. + created_on: (Read Only) The date this storage location setting was created. + created_by: (Read Only) The ID of the user that created this storage + location setting. 
+ + Example: Creating an external S3 storage location + Create a storage location backed by your own S3 bucket: + + from synapseclient.models import StorageLocation, StorageLocationType + + import synapseclient + synapseclient.login() + + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket="my-external-synapse-bucket", + base_key="path/within/bucket", + ).store() + + print(f"Storage location ID: {storage.storage_location_id}") + + Example: Creating an STS-enabled S3 storage location with a folder + Use the convenience classmethod to create a folder with STS-enabled + storage: + + from synapseclient.models import StorageLocation + + import synapseclient + synapseclient.login() + + folder, storage = StorageLocation.setup_s3( + folder_name="my-sts-folder", + parent="syn123", + bucket_name="my-external-synapse-bucket", + base_key="path/within/bucket", + sts_enabled=True, + ) + print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}") + + Example: Creating a Google Cloud storage location + Create a storage location backed by your own GCS bucket: + + from synapseclient.models import StorageLocation, StorageLocationType + + import synapseclient + synapseclient.login() + + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD, + bucket="my-gcs-bucket", + base_key="path/within/bucket", + ).store() + """ + + # Core fields - present on all storage locations + storage_location_id: Optional[int] = None + """(Read Only) The unique ID for this storage location, assigned by the server + on creation.""" + + storage_type: Optional[StorageLocationType] = None + """The type of storage location. Required when creating a new storage location + via `store()`. Determines the `concreteType` sent to the Synapse REST API.""" + + banner: Optional[str] = None + """The banner text to display to a user every time a file is uploaded.""" + + description: Optional[str] = None + """A description of the storage location. This description is shown when a user + has to choose which upload destination to use.""" + + # S3/GCS specific fields + bucket: Optional[str] = None + """The name of the S3 or Google Cloud Storage bucket. Applicable to SYNAPSE_S3, + EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and EXTERNAL_OBJECT_STORE types.""" + + base_key: Optional[str] = None + """The optional base key (prefix/folder) within the bucket. Applicable to + SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types.""" + + sts_enabled: Optional[bool] = None + """Whether STS (AWS Security Token Service) is enabled on this storage location. + Applicable to SYNAPSE_S3 and EXTERNAL_S3 types.""" + + endpoint_url: Optional[str] = None + """The endpoint URL of the S3 service. Applicable to EXTERNAL_S3 + (default: https://s3.amazonaws.com) and EXTERNAL_OBJECT_STORE types.""" + + # SFTP specific fields + url: Optional[str] = None + """The base URL for uploading to the external destination. Applicable to + EXTERNAL_SFTP type.""" + + supports_subfolders: Optional[bool] = None + """Whether the destination supports creating subfolders under the base url. + Applicable to EXTERNAL_SFTP type. Default: False.""" + + # Proxy specific fields + proxy_url: Optional[str] = None + """The HTTPS URL of the proxy used for upload and download. Applicable to + PROXY type.""" + + secret_key: Optional[str] = None + """The encryption key used to sign all pre-signed URLs used to communicate + with the proxy. 
Applicable to PROXY type.""" + + benefactor_id: Optional[str] = None + """An Entity ID (such as a Project ID). When set, any user with the 'create' + permission on the given benefactorId will be allowed to create ProxyFileHandle + using its storage location ID. Applicable to PROXY type.""" + + # Read-only fields + upload_type: Optional[UploadType] = field(default=None, repr=False, compare=False) + """(Read Only) The upload type for this storage location. Automatically derived + from `storage_type`.""" + + etag: Optional[str] = field(default=None, compare=False) + """(Read Only) Synapse employs an Optimistic Concurrency Control (OCC) scheme. + The E-Tag changes every time the setting is updated.""" + + created_on: Optional[str] = field(default=None, compare=False) + """(Read Only) The date this storage location setting was created.""" + + created_by: Optional[int] = field(default=None, compare=False) + """(Read Only) The ID of the user that created this storage location setting.""" + + def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation": + """Converts a response from the REST API into this dataclass. + + Arguments: + synapse_response: The response from the REST API. + + Returns: + The StorageLocation object. + """ + self.storage_location_id = synapse_response.get("storageLocationId", None) + self.banner = synapse_response.get("banner", None) + self.description = synapse_response.get("description", None) + self.etag = synapse_response.get("etag", None) + self.created_on = synapse_response.get("createdOn", None) + self.created_by = synapse_response.get("createdBy", None) + + # Parse upload type + upload_type_str = synapse_response.get("uploadType", None) + if upload_type_str: + try: + self.upload_type = UploadType(upload_type_str) + except ValueError: + self.upload_type = None + + # Parse storage type from concreteType + concrete_type = synapse_response.get("concreteType", "") + if concrete_type: + # Extract the suffix after the last dot + type_suffix = concrete_type.split(".")[-1] if "." in concrete_type else "" + if type_suffix in _CONCRETE_TYPE_TO_STORAGE_TYPE: + self.storage_type = _CONCRETE_TYPE_TO_STORAGE_TYPE[type_suffix] + + # S3/GCS fields + self.bucket = synapse_response.get("bucket", None) + self.base_key = synapse_response.get("baseKey", None) + self.sts_enabled = synapse_response.get("stsEnabled", None) + self.endpoint_url = synapse_response.get("endpointUrl", None) + + # SFTP fields + self.url = synapse_response.get("url", None) + self.supports_subfolders = synapse_response.get("supportsSubfolders", None) + + # Proxy fields + self.proxy_url = synapse_response.get("proxyUrl", None) + self.secret_key = synapse_response.get("secretKey", None) + self.benefactor_id = synapse_response.get("benefactorId", None) + + return self + + def _to_synapse_request(self) -> Dict[str, Any]: + """Convert this dataclass to a request body for the REST API. + + Returns: + A dictionary suitable for the REST API. 
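+
+        Example: Resulting request body (illustrative)
+            For an EXTERNAL_S3 storage location with a bucket and base key,
+            the generated body would resemble:
+
+                {
+                    "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+                    "uploadType": "S3",
+                    "bucket": "my-bucket",
+                    "baseKey": "my/prefix",
+                }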
+ """ + if not self.storage_type: + raise ValueError( + "storage_type is required when creating a storage location" + ) + + # Build the concrete type + concrete_type = ( + f"org.sagebionetworks.repo.model.project.{self.storage_type.value}" + ) + + # Determine upload type + upload_type = self.upload_type or _STORAGE_TYPE_TO_UPLOAD_TYPE.get( + self.storage_type, UploadType.S3 + ) + + body: Dict[str, Any] = { + "concreteType": concrete_type, + "uploadType": upload_type.value, + } + + # Add optional common fields + if self.banner is not None: + body["banner"] = self.banner + if self.description is not None: + body["description"] = self.description + + # Add type-specific fields + if self.storage_type in ( + StorageLocationType.SYNAPSE_S3, + StorageLocationType.EXTERNAL_S3, + StorageLocationType.EXTERNAL_GOOGLE_CLOUD, + StorageLocationType.EXTERNAL_OBJECT_STORE, + ): + if self.bucket is not None: + body["bucket"] = self.bucket + if self.base_key is not None: + body["baseKey"] = self.base_key + + if self.storage_type in ( + StorageLocationType.SYNAPSE_S3, + StorageLocationType.EXTERNAL_S3, + ): + if self.sts_enabled is not None: + body["stsEnabled"] = self.sts_enabled + + if self.storage_type in ( + StorageLocationType.EXTERNAL_S3, + StorageLocationType.EXTERNAL_OBJECT_STORE, + ): + if self.endpoint_url is not None: + body["endpointUrl"] = self.endpoint_url + + if self.storage_type == StorageLocationType.EXTERNAL_SFTP: + if self.url is not None: + body["url"] = self.url + if self.supports_subfolders is not None: + body["supportsSubfolders"] = self.supports_subfolders + + if self.storage_type == StorageLocationType.PROXY: + if self.proxy_url is not None: + body["proxyUrl"] = self.proxy_url + if self.secret_key is not None: + body["secretKey"] = self.secret_key + if self.benefactor_id is not None: + body["benefactorId"] = self.benefactor_id + + return body + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Store: {self.storage_type}" + ) + async def store_async( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> "StorageLocation": + """Create this storage location in Synapse. Storage locations are immutable; + this always creates a new one. If a storage location with identical properties + already exists for this user, the existing one is returned (idempotent). + + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The StorageLocation object with server-assigned fields populated. + + Raises: + ValueError: If `storage_type` is not set. 
+ + Example: Using this function + Create an external S3 storage location: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import StorageLocation, StorageLocationType + + syn = Synapse() + syn.login() + + async def main(): + storage = await StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket="my-bucket", + base_key="my/prefix", + ).store_async() + print(f"Created storage location: {storage.storage_location_id}") + + asyncio.run(main()) + """ + body = self._to_synapse_request() + response = await create_storage_location_setting( + body=body, + synapse_client=synapse_client, + ) + self.fill_from_dict(response) + return self + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Get: {self.storage_location_id}" + ) + async def get_async( + self, + *, + synapse_client: Optional[Synapse] = None, + ) -> "StorageLocation": + """Retrieve this storage location from Synapse by its ID. Only the creator of + a StorageLocationSetting can retrieve it by its id. + + Arguments: + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The StorageLocation object populated with data from Synapse. + + Raises: + ValueError: If `storage_location_id` is not set. + + Example: Using this function + Retrieve a storage location by ID: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import StorageLocation + + syn = Synapse() + syn.login() + + async def main(): + storage = await StorageLocation(storage_location_id=12345).get_async() + print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}") + + asyncio.run(main()) + """ + if not self.storage_location_id: + raise ValueError( + "storage_location_id is required to retrieve a storage location" + ) + + response = await get_storage_location_setting( + storage_location_id=self.storage_location_id, + synapse_client=synapse_client, + ) + self.fill_from_dict(response) + return self + + @classmethod + async def setup_s3_async( + cls, + *, + parent: str, + folder_name: Optional[str] = None, + folder: Optional[Union["Folder", str]] = None, + bucket_name: Optional[str] = None, + base_key: Optional[str] = None, + sts_enabled: bool = False, + synapse_client: Optional[Synapse] = None, + ) -> Tuple["Folder", "StorageLocation"]: + """Convenience method to create a folder backed by S3 storage. This will: + + 1. Create or retrieve the folder + 2. Create the storage location setting + 3. Apply the storage location to the folder via project settings + + Arguments: + parent: The parent project or folder ID (e.g., "syn123"). + folder_name: Name for a new folder. Either `folder_name` or `folder` + must be provided. + folder: An existing Folder object or Synapse ID. Either `folder_name` + or `folder` must be provided. + bucket_name: The S3 bucket name. If None, uses Synapse default storage. + base_key: The base key (prefix) within the bucket. Optional. + sts_enabled: Whether to enable STS credentials for this storage location. + Default: False. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + A tuple of (Folder, StorageLocation). + + Raises: + ValueError: If neither `folder_name` nor `folder` is provided, or if both + are provided. 
+ + Example: Using this function + Create an STS-enabled folder with external S3 storage: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import StorageLocation + + syn = Synapse() + syn.login() + + async def main(): + folder, storage = await StorageLocation.setup_s3_async( + folder_name="my-sts-folder", + parent="syn123", + bucket_name="my-external-synapse-bucket", + base_key="path/within/bucket", + sts_enabled=True, + ) + print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}") + + asyncio.run(main()) + + Example: Using existing folder + Apply S3 storage to an existing folder: + + import asyncio + from synapseclient import Synapse + from synapseclient.models import StorageLocation + + syn = Synapse() + syn.login() + + async def main(): + folder, storage = await StorageLocation.setup_s3_async( + folder="syn456", + bucket_name="my-bucket", + ) + + asyncio.run(main()) + """ + # Import here to avoid circular imports + from synapseclient.models import Folder as FolderModel + + # Validate parameters + if folder_name and folder: + raise ValueError( + "folder and folder_name are mutually exclusive, only one should be passed" + ) + if not folder_name and not folder: + raise ValueError("Either folder or folder_name is required") + + # Create or get the folder + if folder_name: + target_folder = await FolderModel( + name=folder_name, parent_id=parent + ).store_async(synapse_client=synapse_client) + elif isinstance(folder, str): + target_folder = await FolderModel(id=folder).get_async( + synapse_client=synapse_client + ) + else: + target_folder = folder + + # Determine storage type + if bucket_name: + storage_type = StorageLocationType.EXTERNAL_S3 + else: + storage_type = StorageLocationType.SYNAPSE_S3 + + # Create the storage location + storage_location = await cls( + storage_type=storage_type, + bucket=bucket_name, + base_key=base_key, + sts_enabled=sts_enabled, + ).store_async(synapse_client=synapse_client) + + # Apply the storage location to the folder + await target_folder.set_storage_location_async( + storage_location_id=storage_location.storage_location_id, + synapse_client=synapse_client, + ) + + return target_folder, storage_location diff --git a/tests/unit/synapseclient/api/unit_test_storage_location_services.py b/tests/unit/synapseclient/api/unit_test_storage_location_services.py new file mode 100644 index 000000000..bebc80d50 --- /dev/null +++ b/tests/unit/synapseclient/api/unit_test_storage_location_services.py @@ -0,0 +1,215 @@ +"""Unit tests for storage_location_services utility functions.""" + +from unittest.mock import AsyncMock, patch + +import pytest + +import synapseclient.api.storage_location_services as storage_location_services + + +class TestCreateStorageLocationSetting: + """Tests for create_storage_location_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_create_storage_location_setting(self, mock_synapse): + """Test create_storage_location_setting creates a storage location.""" + # GIVEN a mock client that returns a storage location + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_post_async.return_value = { + "storageLocationId": 12345, + "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting", + "uploadType": "S3", + "bucket": "my-bucket", + } + + # WHEN I call create_storage_location_setting + body = { + "concreteType": 
"org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting", + "uploadType": "S3", + "bucket": "my-bucket", + } + result = await storage_location_services.create_storage_location_setting( + body=body, + synapse_client=None, + ) + + # THEN I expect the storage location to be returned + assert result["storageLocationId"] == 12345 + assert result["bucket"] == "my-bucket" + mock_client.rest_post_async.assert_awaited_once() + + +class TestGetStorageLocationSetting: + """Tests for get_storage_location_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_get_storage_location_setting(self, mock_synapse): + """Test get_storage_location_setting retrieves a storage location.""" + # GIVEN a mock client that returns a storage location + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_get_async.return_value = { + "storageLocationId": 12345, + "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting", + "uploadType": "S3", + "bucket": "my-bucket", + } + + # WHEN I call get_storage_location_setting + result = await storage_location_services.get_storage_location_setting( + storage_location_id=12345, + synapse_client=None, + ) + + # THEN I expect the storage location to be returned + assert result["storageLocationId"] == 12345 + assert result["bucket"] == "my-bucket" + mock_client.rest_get_async.assert_awaited_once_with( + uri="/storageLocation/12345", + ) + + +class TestGetProjectSetting: + """Tests for get_project_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_get_project_setting_exists(self, mock_synapse): + """Test get_project_setting when setting exists.""" + # GIVEN a mock client that returns a project setting + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_get_async.return_value = { + "id": "setting123", + "projectId": "syn456", + "settingsType": "upload", + "locations": [12345], + } + + # WHEN I call get_project_setting + result = await storage_location_services.get_project_setting( + project_id="syn456", + setting_type="upload", + synapse_client=None, + ) + + # THEN I expect the project setting to be returned + assert result["id"] == "setting123" + assert result["locations"] == [12345] + mock_client.rest_get_async.assert_awaited_once_with( + uri="/projectSettings/syn456/type/upload", + ) + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_get_project_setting_not_exists(self, mock_synapse): + """Test get_project_setting when setting does not exist.""" + # GIVEN a mock client that returns empty response + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_get_async.return_value = "" + + # WHEN I call get_project_setting + result = await storage_location_services.get_project_setting( + project_id="syn456", + setting_type="upload", + synapse_client=None, + ) + + # THEN I expect None to be returned + assert result is None + + +class TestCreateProjectSetting: + """Tests for create_project_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_create_project_setting(self, mock_synapse): + """Test create_project_setting creates a project setting.""" + # GIVEN a mock client that returns a project setting + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_post_async.return_value = { + "id": "setting123", + 
"projectId": "syn456", + "settingsType": "upload", + "locations": [12345], + } + + # WHEN I call create_project_setting + body = { + "concreteType": "org.sagebionetworks.repo.model.project.UploadDestinationListSetting", + "settingsType": "upload", + "locations": [12345], + "projectId": "syn456", + } + result = await storage_location_services.create_project_setting( + body=body, + synapse_client=None, + ) + + # THEN I expect the project setting to be returned + assert result["id"] == "setting123" + mock_client.rest_post_async.assert_awaited_once() + + +class TestUpdateProjectSetting: + """Tests for update_project_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_update_project_setting(self, mock_synapse): + """Test update_project_setting updates a project setting.""" + # GIVEN a mock client that returns an updated project setting + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_put_async.return_value = { + "id": "setting123", + "projectId": "syn456", + "settingsType": "upload", + "locations": [12345, 67890], + } + + # WHEN I call update_project_setting + body = { + "id": "setting123", + "projectId": "syn456", + "settingsType": "upload", + "locations": [12345, 67890], + } + result = await storage_location_services.update_project_setting( + body=body, + synapse_client=None, + ) + + # THEN I expect the updated project setting to be returned + assert result["locations"] == [12345, 67890] + mock_client.rest_put_async.assert_awaited_once() + + +class TestDeleteProjectSetting: + """Tests for delete_project_setting function.""" + + @pytest.mark.asyncio + @patch("synapseclient.Synapse") + async def test_delete_project_setting(self, mock_synapse): + """Test delete_project_setting deletes a project setting.""" + # GIVEN a mock client + mock_client = AsyncMock() + mock_synapse.get_client.return_value = mock_client + mock_client.rest_delete_async.return_value = None + + # WHEN I call delete_project_setting + await storage_location_services.delete_project_setting( + setting_id="setting123", + synapse_client=None, + ) + + # THEN I expect the delete to be called + mock_client.rest_delete_async.assert_awaited_once_with( + uri="/projectSettings/setting123", + ) diff --git a/tests/unit/synapseclient/models/unit_test_manifest.py b/tests/unit/synapseclient/models/unit_test_manifest.py new file mode 100644 index 000000000..4c65ac7c3 --- /dev/null +++ b/tests/unit/synapseclient/models/unit_test_manifest.py @@ -0,0 +1,499 @@ +"""Unit tests for the synapseclient.models.mixins.manifest module.""" + +import datetime +import os +import tempfile + +import pytest + +from synapseclient.models.mixins.manifest import ( + DEFAULT_GENERATED_MANIFEST_KEYS, + MANIFEST_FILENAME, + _convert_manifest_data_items_to_string_list, + _convert_manifest_data_row_to_dict, + _extract_entity_metadata_for_file, + _get_entity_provenance_dict_for_file, + _manifest_filename, + _parse_manifest_value, + _validate_manifest_required_fields, + _write_manifest_data, +) + + +class TestManifestConstants: + """Tests for manifest constants.""" + + def test_manifest_filename_constant(self): + """Test the MANIFEST_FILENAME constant.""" + assert MANIFEST_FILENAME == "SYNAPSE_METADATA_MANIFEST.tsv" + + def test_default_manifest_keys(self): + """Test the DEFAULT_GENERATED_MANIFEST_KEYS constant.""" + expected_keys = [ + "path", + "parent", + "name", + "id", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", + ] 
+ assert DEFAULT_GENERATED_MANIFEST_KEYS == expected_keys + + +class TestManifestFilename: + """Tests for _manifest_filename function.""" + + def test_manifest_filename(self): + """Test generating manifest filename.""" + # GIVEN a path + path = "/path/to/directory" + + # WHEN we generate the manifest filename + result = _manifest_filename(path) + + # THEN it should be the path joined with MANIFEST_FILENAME + assert result == os.path.join(path, MANIFEST_FILENAME) + + +class TestConvertManifestDataItemsToStringList: + """Tests for _convert_manifest_data_items_to_string_list function.""" + + def test_single_string(self): + """Test converting a single string.""" + # GIVEN a list with a single string + items = ["hello"] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return the string directly + assert result == "hello" + + def test_multiple_strings(self): + """Test converting multiple strings.""" + # GIVEN a list with multiple strings + items = ["a", "b", "c"] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return a bracketed list + assert result == "[a,b,c]" + + def test_string_with_comma(self): + """Test converting a string with comma.""" + # GIVEN a single item with comma (no quotes needed for single item) + items = ["hello,world"] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return the string directly + assert result == "hello,world" + + def test_multiple_strings_with_comma(self): + """Test converting multiple strings where one has a comma.""" + # GIVEN multiple strings where one contains commas + items = ["string,with,commas", "string without commas"] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN the comma-containing string should be quoted + assert result == '["string,with,commas",string without commas]' + + def test_datetime(self): + """Test converting a datetime.""" + # GIVEN a datetime value + dt = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list([dt]) + + # THEN it should return ISO format + assert result == "2020-01-01T00:00:00Z" + + def test_multiple_datetimes(self): + """Test converting multiple datetimes.""" + # GIVEN multiple datetime values + dt1 = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + dt2 = datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list([dt1, dt2]) + + # THEN it should return a bracketed list of ISO dates + assert result == "[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]" + + def test_boolean_true(self): + """Test converting True.""" + # GIVEN a True value + items = [True] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return "True" + assert result == "True" + + def test_boolean_false(self): + """Test converting False.""" + # GIVEN a False value + items = [False] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return "False" + assert result == "False" + + def test_integer(self): + """Test converting an integer.""" + # GIVEN an integer value + items = [1] + + # WHEN we convert to string + result = 
_convert_manifest_data_items_to_string_list(items) + + # THEN it should return the string representation + assert result == "1" + + def test_float(self): + """Test converting a float.""" + # GIVEN a float value + items = [1.5] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return the string representation + assert result == "1.5" + + def test_empty_list(self): + """Test converting an empty list.""" + # GIVEN an empty list + items = [] + + # WHEN we convert to string + result = _convert_manifest_data_items_to_string_list(items) + + # THEN it should return an empty string + assert result == "" + + +class TestConvertManifestDataRowToDict: + """Tests for _convert_manifest_data_row_to_dict function.""" + + def test_simple_row(self): + """Test converting a simple row.""" + # GIVEN a row with simple values + row = {"path": "/path/to/file", "name": "file.txt"} + keys = ["path", "name"] + + # WHEN we convert it + result = _convert_manifest_data_row_to_dict(row, keys) + + # THEN it should return the same values + assert result == {"path": "/path/to/file", "name": "file.txt"} + + def test_row_with_list(self): + """Test converting a row with a list value.""" + # GIVEN a row with a list value + row = {"annotations": ["a", "b", "c"]} + keys = ["annotations"] + + # WHEN we convert it + result = _convert_manifest_data_row_to_dict(row, keys) + + # THEN the list should be converted to a string + assert result == {"annotations": "[a,b,c]"} + + def test_missing_key(self): + """Test converting a row with a missing key.""" + # GIVEN a row missing a key + row = {"path": "/path/to/file"} + keys = ["path", "name"] + + # WHEN we convert it + result = _convert_manifest_data_row_to_dict(row, keys) + + # THEN the missing key should be empty string + assert result == {"path": "/path/to/file", "name": ""} + + +class TestParseManifestValue: + """Tests for _parse_manifest_value function.""" + + def test_simple_string(self): + """Test parsing a simple string.""" + assert _parse_manifest_value("hello") == "hello" + + def test_list_syntax(self): + """Test parsing list syntax.""" + assert _parse_manifest_value("[a,b,c]") == ["a", "b", "c"] + + def test_list_with_quoted_string(self): + """Test parsing list with quoted string containing comma.""" + result = _parse_manifest_value('["hello,world",other]') + assert result == ["hello,world", "other"] + + def test_boolean_true(self): + """Test parsing 'true' string.""" + assert _parse_manifest_value("true") is True + assert _parse_manifest_value("True") is True + assert _parse_manifest_value("TRUE") is True + + def test_boolean_false(self): + """Test parsing 'false' string.""" + assert _parse_manifest_value("false") is False + assert _parse_manifest_value("False") is False + assert _parse_manifest_value("FALSE") is False + + def test_integer(self): + """Test parsing an integer string.""" + assert _parse_manifest_value("123") == 123 + + def test_float(self): + """Test parsing a float string.""" + assert _parse_manifest_value("1.5") == 1.5 + + def test_non_numeric_string(self): + """Test that non-numeric strings stay as strings.""" + assert _parse_manifest_value("hello123") == "hello123" + + +class TestWriteManifestData: + """Tests for _write_manifest_data function.""" + + def test_write_simple_manifest(self): + """Test writing a simple manifest file.""" + # GIVEN simple data + keys = ["path", "name", "id"] + data = [ + {"path": "/path/to/file1.txt", "name": "file1.txt", "id": "syn123"}, + {"path": 
"/path/to/file2.txt", "name": "file2.txt", "id": "syn456"}, + ] + + # WHEN we write it to a temp file + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f: + filename = f.name + + try: + _write_manifest_data(filename, keys, data) + + # THEN the file should contain the expected content + with open(filename, "r") as f: + content = f.read() + + lines = content.strip().split("\n") + assert len(lines) == 3 # header + 2 data rows + assert lines[0] == "path\tname\tid" + assert lines[1] == "/path/to/file1.txt\tfile1.txt\tsyn123" + assert lines[2] == "/path/to/file2.txt\tfile2.txt\tsyn456" + finally: + os.unlink(filename) + + +class TestValidateManifestRequiredFields: + """Tests for _validate_manifest_required_fields function.""" + + def test_valid_manifest(self): + """Test validating a valid manifest file.""" + # GIVEN a valid manifest file + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f: + f.write("path\tparent\n") + f.write(f"{f.name}\tsyn123\n") + filename = f.name + + try: + # Create the file referenced in path column + with open(filename, "a") as f: + pass # File already exists + + # WHEN we validate it + is_valid, errors = _validate_manifest_required_fields(filename) + + # THEN it should be valid + assert is_valid is True + assert errors == [] + finally: + os.unlink(filename) + + def test_missing_file(self): + """Test validating a non-existent manifest file.""" + # WHEN we validate a non-existent file + is_valid, errors = _validate_manifest_required_fields("/nonexistent/file.tsv") + + # THEN it should be invalid + assert is_valid is False + assert len(errors) == 1 + assert "not found" in errors[0] + + def test_missing_required_field(self): + """Test validating a manifest missing a required field.""" + # GIVEN a manifest missing the 'parent' field + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f: + f.write("path\tname\n") + f.write("/path/to/file.txt\tfile.txt\n") + filename = f.name + + try: + # WHEN we validate it + is_valid, errors = _validate_manifest_required_fields(filename) + + # THEN it should be invalid + assert is_valid is False + assert any("parent" in e for e in errors) + finally: + os.unlink(filename) + + def test_empty_path(self): + """Test validating a manifest with empty path.""" + # GIVEN a manifest with empty path + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f: + f.write("path\tparent\n") + f.write("\tsyn123\n") + filename = f.name + + try: + # WHEN we validate it + is_valid, errors = _validate_manifest_required_fields(filename) + + # THEN it should be invalid + assert is_valid is False + assert any("'path' is empty" in e for e in errors) + finally: + os.unlink(filename) + + def test_invalid_parent_id(self): + """Test validating a manifest with invalid parent ID.""" + # GIVEN a manifest with invalid parent ID + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f: + f.write("path\tparent\n") + f.write(f"{f.name}\tinvalid_parent\n") + filename = f.name + + try: + # WHEN we validate it + is_valid, errors = _validate_manifest_required_fields(filename) + + # THEN it should be invalid + assert is_valid is False + assert any("not a valid Synapse ID" in e for e in errors) + finally: + os.unlink(filename) + + +class TestExtractEntityMetadataForFile: + """Tests for _extract_entity_metadata_for_file function.""" + + def test_extract_basic_metadata(self): + """Test extracting basic file metadata.""" + + # GIVEN a mock File object + class 
MockFile: + def __init__(self): + self.parent_id = "syn123" + self.path = "/path/to/file.txt" + self.name = "file.txt" + self.id = "syn456" + self.synapse_store = True + self.content_type = "text/plain" + self.annotations = None + self.activity = None + + file = MockFile() + + # WHEN we extract metadata + keys, data = _extract_entity_metadata_for_file([file]) + + # THEN we should get the expected data + assert "path" in keys + assert "parent" in keys + assert "name" in keys + assert "id" in keys + assert len(data) == 1 + assert data[0]["path"] == "/path/to/file.txt" + assert data[0]["parent"] == "syn123" + assert data[0]["name"] == "file.txt" + assert data[0]["id"] == "syn456" + + def test_extract_with_annotations(self): + """Test extracting metadata with annotations.""" + + # GIVEN a mock File object with annotations + class MockFile: + def __init__(self): + self.parent_id = "syn123" + self.path = "/path/to/file.txt" + self.name = "file.txt" + self.id = "syn456" + self.synapse_store = True + self.content_type = "text/plain" + self.annotations = {"study": ["Study1"], "dataType": ["RNA-seq"]} + self.activity = None + + file = MockFile() + + # WHEN we extract metadata + keys, data = _extract_entity_metadata_for_file([file]) + + # THEN annotation keys should be included + assert "study" in keys + assert "dataType" in keys + assert data[0]["study"] == ["Study1"] + assert data[0]["dataType"] == ["RNA-seq"] + + +class TestGetEntityProvenanceDictForFile: + """Tests for _get_entity_provenance_dict_for_file function.""" + + def test_no_activity(self): + """Test extracting provenance when there is no activity.""" + + # GIVEN a mock File object with no activity + class MockFile: + def __init__(self): + self.activity = None + + file = MockFile() + + # WHEN we extract provenance + result = _get_entity_provenance_dict_for_file(file) + + # THEN we should get an empty dict + assert result == {} + + def test_with_activity(self): + """Test extracting provenance when there is an activity.""" + + # GIVEN mock objects + class MockUsedEntity: + def format_for_manifest(self): + return "syn789" + + class MockActivity: + def __init__(self): + self.name = "Analysis" + self.description = "Processing data" + self.used = [MockUsedEntity()] + self.executed = [] + + class MockFile: + def __init__(self): + self.activity = MockActivity() + + file = MockFile() + + # WHEN we extract provenance + result = _get_entity_provenance_dict_for_file(file) + + # THEN we should get the expected dict + assert result["activityName"] == "Analysis" + assert result["activityDescription"] == "Processing data" + assert result["used"] == "syn789" + assert result["executed"] == "" diff --git a/tests/unit/synapseclient/models/unit_test_storage_location.py b/tests/unit/synapseclient/models/unit_test_storage_location.py new file mode 100644 index 000000000..400e28566 --- /dev/null +++ b/tests/unit/synapseclient/models/unit_test_storage_location.py @@ -0,0 +1,355 @@ +"""Unit tests for the synapseclient.models.StorageLocation class.""" + +import pytest + +from synapseclient.models import StorageLocation, StorageLocationType, UploadType + + +class TestStorageLocation: + """Unit tests for basic StorageLocation model functionality.""" + + def test_storage_location_type_enum_values(self): + """Test that StorageLocationType enum has correct values.""" + assert StorageLocationType.SYNAPSE_S3.value == "S3StorageLocationSetting" + assert ( + StorageLocationType.EXTERNAL_S3.value == "ExternalS3StorageLocationSetting" + ) + assert ( + 
StorageLocationType.EXTERNAL_GOOGLE_CLOUD.value + == "ExternalGoogleCloudStorageLocationSetting" + ) + assert ( + StorageLocationType.EXTERNAL_SFTP.value == "ExternalStorageLocationSetting" + ) + assert ( + StorageLocationType.EXTERNAL_OBJECT_STORE.value + == "ExternalObjectStorageLocationSetting" + ) + assert StorageLocationType.PROXY.value == "ProxyStorageLocationSettings" + + def test_upload_type_enum_values(self): + """Test that UploadType enum has correct values.""" + assert UploadType.S3.value == "S3" + assert UploadType.GOOGLE_CLOUD_STORAGE.value == "GOOGLECLOUDSTORAGE" + assert UploadType.SFTP.value == "SFTP" + assert UploadType.HTTPS.value == "HTTPS" + assert UploadType.NONE.value == "NONE" + + def test_to_synapse_request_external_s3(self): + """Test generating a request body for EXTERNAL_S3 storage location.""" + # GIVEN an EXTERNAL_S3 storage location + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_S3, + bucket="my-bucket", + base_key="my/prefix", + sts_enabled=True, + banner="Upload banner", + description="Test storage location", + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting", + "uploadType": "S3", + "bucket": "my-bucket", + "baseKey": "my/prefix", + "stsEnabled": True, + "banner": "Upload banner", + "description": "Test storage location", + } + + def test_to_synapse_request_synapse_s3(self): + """Test generating a request body for SYNAPSE_S3 storage location.""" + # GIVEN a SYNAPSE_S3 storage location + storage = StorageLocation( + storage_type=StorageLocationType.SYNAPSE_S3, + sts_enabled=False, + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting", + "uploadType": "S3", + "stsEnabled": False, + } + + def test_to_synapse_request_google_cloud(self): + """Test generating a request body for EXTERNAL_GOOGLE_CLOUD storage location.""" + # GIVEN an EXTERNAL_GOOGLE_CLOUD storage location + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD, + bucket="my-gcs-bucket", + base_key="gcs/prefix", + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting", + "uploadType": "GOOGLECLOUDSTORAGE", + "bucket": "my-gcs-bucket", + "baseKey": "gcs/prefix", + } + + def test_to_synapse_request_sftp(self): + """Test generating a request body for EXTERNAL_SFTP storage location.""" + # GIVEN an EXTERNAL_SFTP storage location + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_SFTP, + url="sftp://example.com/path", + supports_subfolders=True, + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting", + "uploadType": "SFTP", + "url": "sftp://example.com/path", + "supportsSubfolders": True, + } + + def test_to_synapse_request_proxy(self): + """Test generating a request body for PROXY storage location.""" + # GIVEN a PROXY storage location
+ storage = StorageLocation( + storage_type=StorageLocationType.PROXY, + proxy_url="https://proxy.example.com", + secret_key="my-secret-key", + benefactor_id="syn123", + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings", + "uploadType": "HTTPS", + "proxyUrl": "https://proxy.example.com", + "secretKey": "my-secret-key", + "benefactorId": "syn123", + } + + def test_to_synapse_request_external_object_store(self): + """Test generating a request body for EXTERNAL_OBJECT_STORE storage location.""" + # GIVEN an EXTERNAL_OBJECT_STORE storage location + storage = StorageLocation( + storage_type=StorageLocationType.EXTERNAL_OBJECT_STORE, + bucket="my-s3-like-bucket", + endpoint_url="https://s3.custom.com", + ) + + # WHEN we generate a request body + request_body = storage._to_synapse_request() + + # THEN it should have the correct structure + assert request_body == { + "concreteType": "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting", + "uploadType": "S3", + "bucket": "my-s3-like-bucket", + "endpointUrl": "https://s3.custom.com", + } + + def test_to_synapse_request_missing_storage_type(self): + """Test that _to_synapse_request raises ValueError when storage_type is missing.""" + # GIVEN a storage location without a storage_type + storage = StorageLocation( + bucket="my-bucket", + ) + + # THEN it should raise ValueError + with pytest.raises(ValueError, match="storage_type is required"): + storage._to_synapse_request() + + def test_fill_from_dict_external_s3(self): + """Test filling from a REST API response for EXTERNAL_S3.""" + # GIVEN a storage location + storage = StorageLocation() + + # AND a response from the REST API + response = { + "storageLocationId": 12345, + "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting", + "uploadType": "S3", + "bucket": "my-bucket", + "baseKey": "my/prefix", + "stsEnabled": True, + "banner": "Upload banner", + "description": "Test storage location", + "etag": "abc123", + "createdOn": "2024-01-01T00:00:00.000Z", + "createdBy": 123456, + } + + # WHEN we fill from the response + storage.fill_from_dict(response) + + # THEN the storage location should be populated correctly + assert storage.storage_location_id == 12345 + assert storage.storage_type == StorageLocationType.EXTERNAL_S3 + assert storage.upload_type == UploadType.S3 + assert storage.bucket == "my-bucket" + assert storage.base_key == "my/prefix" + assert storage.sts_enabled is True + assert storage.banner == "Upload banner" + assert storage.description == "Test storage location" + assert storage.etag == "abc123" + assert storage.created_on == "2024-01-01T00:00:00.000Z" + assert storage.created_by == 123456 + + def test_fill_from_dict_synapse_s3(self): + """Test filling from a REST API response for SYNAPSE_S3.""" + # GIVEN a storage location + storage = StorageLocation() + + # AND a response from the REST API + response = { + "storageLocationId": 1, + "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting", + "uploadType": "S3", + } + + # WHEN we fill from the response + storage.fill_from_dict(response) + + # THEN the storage location should be populated correctly + assert storage.storage_location_id == 1 + assert storage.storage_type == StorageLocationType.SYNAPSE_S3 + + def test_fill_from_dict_google_cloud(self): + """Test 
filling from a REST API response for EXTERNAL_GOOGLE_CLOUD.""" + # GIVEN a storage location + storage = StorageLocation() + + # AND a response from the REST API + response = { + "storageLocationId": 67890, + "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting", + "uploadType": "GOOGLECLOUDSTORAGE", + "bucket": "my-gcs-bucket", + } + + # WHEN we fill from the response + storage.fill_from_dict(response) + + # THEN the storage location should be populated correctly + assert storage.storage_location_id == 67890 + assert storage.storage_type == StorageLocationType.EXTERNAL_GOOGLE_CLOUD + assert storage.upload_type == UploadType.GOOGLE_CLOUD_STORAGE + assert storage.bucket == "my-gcs-bucket" + + def test_fill_from_dict_sftp(self): + """Test filling from a REST API response for EXTERNAL_SFTP.""" + # GIVEN a storage location + storage = StorageLocation() + + # AND a response from the REST API + response = { + "storageLocationId": 11111, + "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting", + "uploadType": "SFTP", + "url": "sftp://example.com/path", + "supportsSubfolders": True, + } + + # WHEN we fill from the response + storage.fill_from_dict(response) + + # THEN the storage location should be populated correctly + assert storage.storage_location_id == 11111 + assert storage.storage_type == StorageLocationType.EXTERNAL_SFTP + assert storage.upload_type == UploadType.SFTP + assert storage.url == "sftp://example.com/path" + assert storage.supports_subfolders is True + + def test_fill_from_dict_proxy(self): + """Test filling from a REST API response for PROXY.""" + # GIVEN a storage location + storage = StorageLocation() + + # AND a response from the REST API + response = { + "storageLocationId": 22222, + "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings", + "uploadType": "HTTPS", + "proxyUrl": "https://proxy.example.com", + "secretKey": "my-secret-key", + "benefactorId": "syn123", + } + + # WHEN we fill from the response + storage.fill_from_dict(response) + + # THEN the storage location should be populated correctly + assert storage.storage_location_id == 22222 + assert storage.storage_type == StorageLocationType.PROXY + assert storage.upload_type == UploadType.HTTPS + assert storage.proxy_url == "https://proxy.example.com" + assert storage.secret_key == "my-secret-key" + assert storage.benefactor_id == "syn123" + + +class TestStorageLocationAsync: + """Async unit tests for StorageLocation model.""" + + @pytest.mark.asyncio + async def test_get_async_missing_id(self): + """Test that get_async raises ValueError when storage_location_id is missing.""" + # GIVEN a storage location without an ID + storage = StorageLocation() + + # THEN it should raise ValueError + with pytest.raises(ValueError, match="storage_location_id is required"): + await storage.get_async() + + @pytest.mark.asyncio + async def test_store_async_missing_storage_type(self): + """Test that store_async raises ValueError when storage_type is missing.""" + # GIVEN a storage location without a storage_type + storage = StorageLocation(bucket="my-bucket") + + # THEN it should raise ValueError + with pytest.raises(ValueError, match="storage_type is required"): + await storage.store_async() + + +class TestSetupS3: + """Tests for the setup_s3 convenience method.""" + + @pytest.mark.asyncio + async def test_setup_s3_async_requires_folder_or_folder_name(self): + """Test that setup_s3_async raises ValueError when neither folder 
nor folder_name is provided.""" + # WHEN I call setup_s3_async without folder or folder_name + # THEN it should raise ValueError + with pytest.raises( + ValueError, match="Either folder or folder_name is required" + ): + await StorageLocation.setup_s3_async(parent="syn123") + + @pytest.mark.asyncio + async def test_setup_s3_async_folder_and_folder_name_mutually_exclusive(self): + """Test that setup_s3_async raises ValueError when both folder and folder_name are provided.""" + from synapseclient.models import Folder + + # GIVEN both folder and folder_name + folder = Folder(id="syn456") + + # WHEN I call setup_s3_async with both + # THEN it should raise ValueError + with pytest.raises( + ValueError, match="folder and folder_name are mutually exclusive" + ): + await StorageLocation.setup_s3_async( + parent="syn123", folder_name="test", folder=folder + ) From e264f1a47b704dd0bfe7b02bfcc2ff2e440c2871 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 4 Feb 2026 22:31:00 +0000 Subject: [PATCH 2/2] Add download list file --- synapseclient/models/download_list.py | 224 ++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 synapseclient/models/download_list.py diff --git a/synapseclient/models/download_list.py b/synapseclient/models/download_list.py new file mode 100644 index 000000000..e1c0eb866 --- /dev/null +++ b/synapseclient/models/download_list.py @@ -0,0 +1,224 @@ +"""Models for interacting with Synapse's Download List functionality. + +This module provides classes for generating manifest files from a user's download list +using the Synapse Asynchronous Job service. + +See: https://rest-docs.synapse.org/rest/POST/download/list/manifest/async/start.html +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from typing_extensions import Self + +from synapseclient import Synapse +from synapseclient.core.async_utils import async_to_sync, otel_trace_method +from synapseclient.core.constants.concrete_types import DOWNLOAD_LIST_MANIFEST_REQUEST +from synapseclient.core.download import download_by_file_handle +from synapseclient.core.utils import delete_none_keys +from synapseclient.models.mixins.asynchronous_job import AsynchronousCommunicator +from synapseclient.models.protocols.download_list_protocol import ( + DownloadListManifestRequestSynchronousProtocol, +) +from synapseclient.models.table_components import CsvTableDescriptor + + +@dataclass +@async_to_sync +class DownloadListManifestRequest( + DownloadListManifestRequestSynchronousProtocol, AsynchronousCommunicator +): + """ + A request to generate a manifest file (CSV) of the current user's download list. + + This class uses the Synapse Asynchronous Job service to generate a manifest file + containing metadata about files in the user's download list. The manifest can be + used to download files or for record-keeping purposes. + + See: https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/download/DownloadListManifestRequest.html + + Attributes: + csv_table_descriptor: Optional CSV formatting options for the manifest. + result_file_handle_id: The file handle ID of the generated manifest (populated after completion). 
+ + Example: Generate a manifest from download list + Generate a CSV manifest from your download list: + + from synapseclient.models import DownloadListManifestRequest + import synapseclient + + synapseclient.login() + + # Create and send the request + request = DownloadListManifestRequest() + request.send_job_and_wait() + + print(f"Manifest file handle: {request.result_file_handle_id}") + + Example: Generate manifest with custom CSV formatting + Use custom separator and quote characters: + + from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor + import synapseclient + + synapseclient.login() + + request = DownloadListManifestRequest( + csv_table_descriptor=CsvTableDescriptor( + separator="\t", # Tab-separated + is_first_line_header=True + ) + ) + request.send_job_and_wait() + """ + + concrete_type: str = field( + default=DOWNLOAD_LIST_MANIFEST_REQUEST, repr=False, compare=False + ) + """The concrete type of this request.""" + + csv_table_descriptor: Optional[CsvTableDescriptor] = None + """Optional CSV formatting options for the manifest file.""" + + result_file_handle_id: Optional[str] = None + """The file handle ID of the generated manifest file. Populated after the job completes.""" + + def to_synapse_request(self) -> Dict[str, Any]: + """ + Convert this request to the format expected by the Synapse REST API. + + Returns: + A dictionary containing the request body for the Synapse API. + """ + request = { + "concreteType": self.concrete_type, + } + if self.csv_table_descriptor: + request[ + "csvTableDescriptor" + ] = self.csv_table_descriptor.to_synapse_request() + delete_none_keys(request) + return request + + def fill_from_dict(self, synapse_response: Dict[str, Any]) -> Self: + """ + Populate this object from a Synapse REST API response. + + Arguments: + synapse_response: The response from the REST API. + + Returns: + This object with fields populated from the response. + """ + self.result_file_handle_id = synapse_response.get("resultFileHandleId", None) + return self + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_send_job_and_wait" + ) + async def send_job_and_wait_async( + self, + post_exchange_args: Optional[Dict[str, Any]] = None, + timeout: int = 120, + *, + synapse_client: Optional[Synapse] = None, + ) -> Self: + """Send the job to the Asynchronous Job service and wait for it to complete. + + This method sends the manifest generation request to Synapse and waits + for the job to complete. After completion, the `result_file_handle_id` + attribute will be populated. + + Arguments: + post_exchange_args: Additional arguments to pass to the request. + timeout: The number of seconds to wait for the job to complete or progress + before raising a SynapseTimeoutError. Defaults to 120. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + This instance with `result_file_handle_id` populated. + + Raises: + SynapseTimeoutError: If the job does not complete within the timeout. + SynapseError: If the job fails. 
+ + Example: Generate a manifest + Generate a manifest from the download list: + + from synapseclient.models import DownloadListManifestRequest + import synapseclient + + synapseclient.login() + + request = DownloadListManifestRequest() + request.send_job_and_wait() + print(f"Manifest file handle: {request.result_file_handle_id}") + """ + return await super().send_job_and_wait_async( + post_exchange_args=post_exchange_args, + timeout=timeout, + synapse_client=synapse_client, + ) + + @otel_trace_method( + method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_download_manifest" + ) + async def download_manifest_async( + self, + download_path: str, + *, + synapse_client: Optional[Synapse] = None, + ) -> str: + """ + Download the generated manifest file to a local path. + + This method should be called after `send_job_and_wait()` has completed + successfully and `result_file_handle_id` is populated. + + Arguments: + download_path: The local directory path where the manifest will be saved. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The full path to the downloaded manifest file. + + Raises: + ValueError: If the manifest has not been generated yet (no result_file_handle_id). + + Example: Download the manifest after generation + Generate and download a manifest: + + from synapseclient.models import DownloadListManifestRequest + import synapseclient + + synapseclient.login() + + request = DownloadListManifestRequest() + request.send_job_and_wait() + + manifest_path = request.download_manifest(download_path="/path/to/download") + print(f"Manifest downloaded to: {manifest_path}") + """ + if not self.result_file_handle_id: + raise ValueError( + "Manifest has not been generated yet. " + "Call send_job_and_wait() before downloading." + ) + + # Download the file handle using the download module + # For download list manifests, the synapse_id parameter is set to the file handle ID + # because these manifests are not associated with a specific entity. The download + # service handles this case by using the file handle directly. + downloaded_path = await download_by_file_handle( + file_handle_id=self.result_file_handle_id, + synapse_id=self.result_file_handle_id, + entity_type="FileEntity", + destination=download_path, + synapse_client=synapse_client, + ) + + return downloaded_path