Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/_util/backup_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import os

# Shared snapshot settings. Each value can be overridden through an
# environment variable of the same name; the fallback is the literal default
# given here. The two numeric settings are parsed with int(), so a
# non-numeric override raises ValueError at import time.
SNAPSHOT_TIMEOUT_SEC = int(os.getenv("SNAPSHOT_TIMEOUT_SEC", "120"))
SNAPSHOT_POLL_INTERVAL_SEC = int(os.getenv("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
# Default volume snapshot class name (kept as a plain string).
VOLUME_SNAPSHOT_CLASS = os.getenv("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
21 changes: 7 additions & 14 deletions src/api/_util/backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@

from ...models.backups import BackupEntry, BackupSchedule, BackupScheduleRow, NextBackup
from ...models.branch import Branch
from ..backup_snapshots import (
delete_branch_snapshot,
)
from ..backup_snapshots import build_snapshot_metadata, delete_snapshot

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -114,16 +112,11 @@ async def delete_branch_backups(session: SessionDep, branch_id: Identifier) -> N
return

for backup in backups:
snapshot = build_snapshot_metadata(backup)
if snapshot is None:
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
continue
try:
await delete_branch_snapshot(
name=backup.snapshot_name,
namespace=backup.snapshot_namespace,
content_name=backup.snapshot_content_name,
)
await delete_snapshot(snapshot)
except Exception:
logger.exception(
"Failed to delete snapshot %s/%s for backup %s",
backup.snapshot_namespace,
backup.snapshot_name,
backup.id,
)
logger.exception("Failed to delete snapshots for branch %s", branch_id)
28 changes: 9 additions & 19 deletions src/api/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@
from ..models.project import Project
from ._util.backups import _remove_existing_schedule, _validate_project_retention_budget
from .auth import authenticated_user
from .backup_snapshots import (
SNAPSHOT_POLL_INTERVAL_SEC,
create_branch_snapshot,
delete_branch_snapshot,
)
from .backup_snapshots import build_snapshot_metadata, create_branch_db_snapshot, delete_snapshot
from .db import SessionDep
from .dependencies import OrganizationDep

Expand All @@ -46,7 +42,6 @@
# ---------------------------
# Constants
# ---------------------------
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
MANUAL_BACKUP_TIMEOUT_SEC = int(os.environ.get("MANUAL_BACKUP_TIMEOUT_SEC", "10"))

UNIT_MULTIPLIER = {
Expand Down Expand Up @@ -539,13 +534,10 @@ async def manual_backup(session: SessionDep, branch_id: Identifier) -> BackupCre
recorded_at = datetime.now(UTC)

try:
snapshot = await create_branch_snapshot(
snapshot = await create_branch_db_snapshot(
branch.id,
backup_id=backup_id,
snapshot_class=VOLUME_SNAPSHOT_CLASS,
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
label="manual",
time_limit=MANUAL_BACKUP_TIMEOUT_SEC,
)
except Exception as exc:
logger.exception("Manual backup failed for branch %s within timeout", branch.id)
Expand Down Expand Up @@ -586,15 +578,13 @@ async def delete_backup(session: SessionDep, backup_id: Identifier) -> BackupDel
if not backup:
raise HTTPException(status_code=404, detail="Backup not found")

try:
await delete_branch_snapshot(
name=backup.snapshot_name,
namespace=backup.snapshot_namespace,
content_name=backup.snapshot_content_name,
)
except Exception as exc:
logger.exception("Failed to delete snapshot for backup %s", backup_id)
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc
metadata = build_snapshot_metadata(backup)
if metadata is not None:
try:
await delete_snapshot(metadata)
except Exception as exc:
logger.exception("Failed to delete snapshot for backup %s", backup_id)
raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc

await session.delete(backup)

Expand Down
115 changes: 81 additions & 34 deletions src/api/backup_snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,22 @@

import asyncio
import logging
import os
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING

from pydantic import BaseModel, Field

from .._util import Identifier, quantity_to_bytes
from ..deployment import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity
from .._util.backup_config import (
SNAPSHOT_POLL_INTERVAL_SEC,
SNAPSHOT_TIMEOUT_SEC,
VOLUME_SNAPSHOT_CLASS,
)
from ..deployment import (
AUTOSCALER_PVC_SUFFIX,
get_autoscaler_vm_identity,
)
from ..deployment.kubernetes.snapshot import (
create_snapshot_from_pvc,
ensure_snapshot_absent,
Expand All @@ -21,12 +30,38 @@
if TYPE_CHECKING:
from ulid import ULID

logger = logging.getLogger(__name__)
from ..models.backups import BackupEntry

SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120"))
SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5"))
logger = logging.getLogger(__name__)

_K8S_NAME_MAX_LENGTH = 63
DEFAULT_SNAPSHOT_TIMEOUT_SEC = float(SNAPSHOT_TIMEOUT_SEC)
DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC = float(SNAPSHOT_POLL_INTERVAL_SEC)


class SnapshotMetadata(BaseModel):
    """Identifiers of a VolumeSnapshot: its name, namespace and content name.

    ``name`` and ``namespace`` are required and must be non-empty strings.
    ``content_name`` may be ``None`` and may be omitted entirely.
    """

    name: str = Field(..., min_length=1)
    namespace: str = Field(..., min_length=1)
    # content_name stays optional because there are runtime scenarios where the
    # VolumeSnapshotContent hasn't been bound yet. The explicit None default
    # makes the field genuinely optional at construction time instead of
    # "required but nullable".
    content_name: str | None = None


def build_snapshot_metadata(backup: BackupEntry) -> SnapshotMetadata | None:
    """Build a ``SnapshotMetadata`` from the snapshot fields of *backup*.

    Returns ``None`` (after emitting a debug log line) when the backup's
    snapshot name or namespace is missing or empty; otherwise returns the
    metadata, carrying over ``backup.snapshot_content_name`` unchanged.
    """
    snapshot_name = backup.snapshot_name
    snapshot_namespace = backup.snapshot_namespace

    if snapshot_name and snapshot_namespace:
        return SnapshotMetadata(
            name=snapshot_name,
            namespace=snapshot_namespace,
            content_name=backup.snapshot_content_name,
        )

    # Either identifier is falsy: there is nothing to address, so report and bail.
    logger.debug(
        "Skipping metadata for missing snapshot identifiers (name=%r namespace=%r)",
        snapshot_name,
        snapshot_namespace,
    )
    return None


@dataclass(frozen=True)
Expand Down Expand Up @@ -59,20 +94,18 @@ def _build_snapshot_name(*, label: str, backup_id: ULID) -> str:
return f"{label_component}{separator}{backup_component}"


async def create_branch_snapshot(
branch_id: Identifier,
async def _create_snapshot_from_pvc(
*,
namespace: str,
pvc_name: str,
backup_id: ULID,
snapshot_class: str,
poll_interval: float,
label: str,
poll_interval: float,
time_limit: float,
) -> SnapshotDetails:
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
snapshot_name = _build_snapshot_name(label=label, backup_id=backup_id)

logger.info("Creating VolumeSnapshot %s/%s for branch %s", namespace, snapshot_name, branch_id)
logger.info("Creating VolumeSnapshot %s/%s for branch PVC %s", namespace, snapshot_name, pvc_name)
try:
async with asyncio.timeout(time_limit):
await create_snapshot_from_pvc(
Expand All @@ -89,14 +122,14 @@ async def create_branch_snapshot(
)
except TimeoutError as exc:
logger.exception(
"Timed out creating VolumeSnapshot %s/%s for branch %s within %s seconds",
"Timed out creating VolumeSnapshot %s/%s for PVC %s within %s seconds",
namespace,
snapshot_name,
branch_id,
pvc_name,
time_limit,
)
raise VelaSnapshotTimeoutError(
f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for branch {branch_id}"
f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for namespace {namespace}"
) from exc

status = snapshot.get("status") or {}
Expand All @@ -118,29 +151,43 @@ async def create_branch_snapshot(
)


async def delete_branch_snapshot(
async def create_branch_db_snapshot(
branch_id: Identifier,
*,
name: str | None,
namespace: str | None,
content_name: str | None,
time_limit: float = SNAPSHOT_TIMEOUT_SEC,
poll_interval: float = SNAPSHOT_POLL_INTERVAL_SEC,
) -> None:
if not name or not namespace:
logger.debug(
"Skipping deletion for VolumeSnapshot with missing metadata (name=%s namespace=%s)",
name,
namespace,
)
return
backup_id: ULID,
snapshot_class: str = VOLUME_SNAPSHOT_CLASS,
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
label: str,
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
) -> SnapshotDetails:
namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id)
pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}"
return await _create_snapshot_from_pvc(
namespace=namespace,
pvc_name=pvc_name,
backup_id=backup_id,
snapshot_class=snapshot_class,
poll_interval=poll_interval,
label=label,
time_limit=time_limit,
)


derived_content_name = content_name
async def delete_snapshot(
metadata: SnapshotMetadata,
*,
time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC,
poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC,
) -> None:
name = metadata.name
namespace = metadata.namespace
content_name = metadata.content_name
try:
async with asyncio.timeout(time_limit):
snapshot = await read_snapshot(namespace, name)
if snapshot is not None:
status = snapshot.get("status") or {}
derived_content_name = derived_content_name or status.get("boundVolumeSnapshotContentName")
content_name = content_name or status.get("boundVolumeSnapshotContentName")
logger.info("Deleting VolumeSnapshot %s/%s", namespace, name)
await ensure_snapshot_absent(
namespace,
Expand All @@ -151,10 +198,10 @@ async def delete_branch_snapshot(
else:
logger.info("VolumeSnapshot %s/%s already absent", namespace, name)

if derived_content_name:
logger.info("Ensuring VolumeSnapshotContent %s is absent", derived_content_name)
if content_name:
logger.info("Ensuring VolumeSnapshotContent %s is absent", content_name)
await ensure_snapshot_content_absent(
derived_content_name,
content_name,
timeout=time_limit,
poll_interval=poll_interval,
)
Expand Down
25 changes: 13 additions & 12 deletions src/api/backupmonitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sqlmodel import SQLModel, asc, delete, select
from ulid import ULID

from .._util.backup_config import SNAPSHOT_POLL_INTERVAL_SEC, SNAPSHOT_TIMEOUT_SEC, VOLUME_SNAPSHOT_CLASS
from ..models.backups import (
BackupEntry,
BackupLog,
Expand All @@ -21,18 +22,16 @@
from ..models.organization import Organization
from ..models.project import Project
from .backup_snapshots import (
SNAPSHOT_POLL_INTERVAL_SEC,
SNAPSHOT_TIMEOUT_SEC,
create_branch_snapshot,
delete_branch_snapshot,
build_snapshot_metadata,
create_branch_db_snapshot,
delete_snapshot,
)
from .organization.project.branch import refresh_branch_status
from .settings import get_settings

# ---------------------------
# Config
# ---------------------------
VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass")
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60"))

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -204,11 +203,15 @@ async def _delete_many(

deleted_ids: list[ULID] = []
for backup in backups:
metadata = build_snapshot_metadata(backup)
if metadata is None:
logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id)
continue
try:
await delete_branch_snapshot(
name=backup.snapshot_name,
namespace=backup.snapshot_namespace,
content_name=backup.snapshot_content_name,
await delete_snapshot(
metadata,
time_limit=SNAPSHOT_TIMEOUT_SEC,
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
)
except Exception:
context = {
Expand Down Expand Up @@ -303,13 +306,11 @@ async def execute_backup(self, db: AsyncSession, branch: Branch, row: BackupSche
backup_id = ULID()

try:
snapshot = await create_branch_snapshot(
snapshot = await create_branch_db_snapshot(
branch.id,
backup_id=backup_id,
snapshot_class=VOLUME_SNAPSHOT_CLASS,
poll_interval=SNAPSHOT_POLL_INTERVAL_SEC,
label=f"row-{row.row_index}",
time_limit=SNAPSHOT_TIMEOUT_SEC,
)
except Exception:
nb.next_at = next_due
Expand Down
1 change: 1 addition & 0 deletions src/deployment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
# PVC name suffixes. Each suffix is defined exactly once; a second assignment
# of AUTOSCALER_PVC_SUFFIX (same value) was redundant and has been dropped so
# the two copies can never drift apart.
DATABASE_PVC_SUFFIX = "-db-pvc"
AUTOSCALER_PVC_SUFFIX = "-block-data"
AUTOSCALER_WAL_PVC_SUFFIX = "-pg-wal"
# Timeout and poll-interval settings; units are seconds, per the names.
_LOAD_BALANCER_TIMEOUT_SECONDS = float(600)
_LOAD_BALANCER_POLL_INTERVAL_SECONDS = float(2)
_OVERLAY_IP_TIMEOUT_SECONDS = float(300)
Expand Down