From 0dd7f5ccbeb0faa75a13e867c5883eb350fdbc12 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 29 Jan 2026 23:45:11 +0100 Subject: [PATCH 1/5] [Services] Add default probes if model is set #3522 --- .../_internal/core/models/configurations.py | 39 ++++++++++++++-- .../core/models/test_configurations.py | 45 +++++++++++++++++++ 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 3b2c7812b9..f6c1b385a2 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -56,6 +56,8 @@ DEFAULT_PROBE_METHOD = "get" MAX_PROBE_URL_LEN = 2048 DEFAULT_REPLICA_GROUP_NAME = "0" +DEFAULT_MODEL_PROBE_TIMEOUT = 30 +DEFAULT_MODEL_PROBE_URL = "/v1/chat/completions" class RunConfigurationType(str, Enum): @@ -851,9 +853,9 @@ class ServiceConfigurationParams(CoreModel): ] = None rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = [] probes: Annotated[ - list[ProbeConfig], + Optional[list[ProbeConfig]], Field(description="List of probes used to determine job health"), - ] = [] + ] = None # None = omitted (may get default when model is set); [] = explicit empty replicas: Annotated[ Optional[Union[List[ReplicaGroup], Range[int]]], @@ -895,7 +897,9 @@ def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]: return v @validator("probes") - def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]: + def validate_probes(cls, v: Optional[list[ProbeConfig]]) -> Optional[list[ProbeConfig]]: + if v is None: + return v if has_duplicates(v): # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug: # https://github.com/pydantic/pydantic/issues/3765 @@ -932,6 +936,35 @@ def validate_replicas( ) return v + @root_validator() + def set_default_probes_for_model(cls, values): + model = values.get("model") + probes = values.get("probes") + if model is not None and probes is None: + body = orjson.dumps( + { + "model": model.name, + "messages": [{"role": "user", "content": "hi"}], + "max_tokens": 1, + } + ).decode("utf-8") + values["probes"] = [ + ProbeConfig( + type="http", + method="post", + url=DEFAULT_MODEL_PROBE_URL, + headers=[ + HTTPHeaderSpec(name="Content-Type", value="application/json"), + ], + body=body, + timeout=DEFAULT_MODEL_PROBE_TIMEOUT, + ) + ] + elif probes is None: + # Probes omitted and model not set: normalize to empty list for downstream. 
+ values["probes"] = [] + return values + @root_validator() def validate_scaling(cls, values): scaling = values.get("scaling") diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 65eec62642..1ff025ea2f 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -5,6 +5,8 @@ from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( + DEFAULT_MODEL_PROBE_TIMEOUT, + DEFAULT_MODEL_PROBE_URL, DevEnvironmentConfigurationParams, RepoSpec, parse_run_configuration, @@ -13,6 +15,49 @@ class TestParseConfiguration: + def test_service_model_sets_default_probes_when_probes_omitted(self): + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + } + parsed = parse_run_configuration(conf) + assert len(parsed.probes) == 1 + probe = parsed.probes[0] + assert probe.type == "http" + assert probe.method == "post" + assert probe.url == DEFAULT_MODEL_PROBE_URL + assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT + assert len(probe.headers) == 1 + assert probe.headers[0].name == "Content-Type" + assert probe.headers[0].value == "application/json" + assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in (probe.body or "") + assert "max_tokens" in (probe.body or "") + + def test_service_model_does_not_override_explicit_probes(self): + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "probes": [{"type": "http", "url": "/health"}], + } + parsed = parse_run_configuration(conf) + assert len(parsed.probes) == 1 + assert parsed.probes[0].url == "/health" + + def test_service_model_explicit_empty_probes_no_default(self): + conf = { + "type": "service", + "commands": ["python3 -m http.server"], + "port": 8000, + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "probes": [], + } + parsed = parse_run_configuration(conf) + assert len(parsed.probes) == 0 + def test_services_replicas_and_scaling(self): def test_conf(replicas: Any, scaling: Optional[Any] = None): conf = { From a10565cd4f8edddfef658090cebb0112ce6ee2d8 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 29 Jan 2026 23:53:13 +0100 Subject: [PATCH 2/5] Fixing `pyright` --- src/dstack/_internal/cli/services/configurators/run.py | 2 +- .../_internal/server/services/jobs/configurators/base.py | 2 +- src/dstack/_internal/server/services/runs/spec.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index fc76fe43ed..1077eff8a9 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -354,7 +354,7 @@ def interpolate_env(self, conf: RunConfigurationT): password=interpolator.interpolate_or_error(conf.registry_auth.password), ) if isinstance(conf, ServiceConfiguration): - for probe in conf.probes: + for probe in conf.probes or []: for header in probe.headers: header.value = interpolator.interpolate_or_error(header.value) if probe.url: diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index df6738a774..44579bf81c 100644 --- 
a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -394,7 +394,7 @@ def _service_port(self) -> Optional[int]: def _probes(self) -> list[ProbeSpec]: if isinstance(self.run_spec.configuration, ServiceConfiguration): - return list(map(_probe_config_to_spec, self.run_spec.configuration.probes)) + return list(map(_probe_config_to_spec, self.run_spec.configuration.probes or [])) return [] diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py index db81eb724a..ad2fcef1ff 100644 --- a/src/dstack/_internal/server/services/runs/spec.py +++ b/src/dstack/_internal/server/services/runs/spec.py @@ -94,13 +94,13 @@ def validate_run_spec_and_set_defaults( raise ServerClientError( "Scheduled services with autoscaling to zero are not supported" ) - if len(run_spec.configuration.probes) > settings.MAX_PROBES_PER_JOB: + if len(run_spec.configuration.probes or []) > settings.MAX_PROBES_PER_JOB: raise ServerClientError( f"Cannot configure more than {settings.MAX_PROBES_PER_JOB} probes" ) if any( p.timeout is not None and p.timeout > settings.MAX_PROBE_TIMEOUT - for p in run_spec.configuration.probes + for p in (run_spec.configuration.probes or []) ): raise ServerClientError( f"Probe timeout cannot be longer than {settings.MAX_PROBE_TIMEOUT}s" From c4058228fcb8e8f06ad318201575e4ff90ded407 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 2 Feb 2026 20:35:40 +0100 Subject: [PATCH 3/5] PR review feedback --- docs/docs/concepts/services.md | 19 +++- .../_internal/core/models/configurations.py | 29 ------ .../services/jobs/configurators/base.py | 34 ++++++- .../core/models/test_configurations.py | 24 ++--- .../jobs/configurators/test_service.py | 98 +++++++++++++++++++ 5 files changed, 159 insertions(+), 45 deletions(-) create mode 100644 src/tests/_internal/server/services/jobs/configurators/test_service.py diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index fed9f7cb39..9f4e5900dc 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -233,6 +233,16 @@ Setting the minimum number of replicas to `0` allows the service to scale down t ??? info "Disaggregated serving" Native support for disaggregated prefill and decode, allowing both worker types to run within a single service, is coming soon. +### Model + +If the service is running a chat model with an OpenAI-compatible interface (i.e., `/v1/chat/completions`), +set the [`model`](../reference/dstack.yml/service.md#model) property to make the model accessible via `dstack`'s +global OpenAI-compatible endpoint, and also accessible via `dstack`'s UI. + +When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health. +To customize or disable this, set `probes` explicitly. + + ### Authorization By default, the service enables authorization, meaning the service endpoint requires a `dstack` user token. @@ -290,7 +300,7 @@ $ dstack ps --verbose -??? info "Probe statuses" +??? info "Status" The following symbols are used for probe statuses: - `×` — the last probe execution failed. @@ -328,6 +338,13 @@ Probes are executed for each service replica while the replica is `running`. A p +??? info "Model" + If you set the [`model`](#model) property but don't explicitly configure `probes`, + `dstack` automatically configures a default probe that tests the model using the `/v1/chat/completions` API. 
+    This default probe sends a minimal chat completion request to verify the model is responding correctly.
+
+    To disable probes entirely when `model` is set, explicitly set `probes` to an empty list.
+
 See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options.
 
 ### Path prefix { #path-prefix }
diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py
index f6c1b385a2..7c5d442e44 100644
--- a/src/dstack/_internal/core/models/configurations.py
+++ b/src/dstack/_internal/core/models/configurations.py
@@ -936,35 +936,6 @@ def validate_replicas(
             )
         return v
 
-    @root_validator()
-    def set_default_probes_for_model(cls, values):
-        model = values.get("model")
-        probes = values.get("probes")
-        if model is not None and probes is None:
-            body = orjson.dumps(
-                {
-                    "model": model.name,
-                    "messages": [{"role": "user", "content": "hi"}],
-                    "max_tokens": 1,
-                }
-            ).decode("utf-8")
-            values["probes"] = [
-                ProbeConfig(
-                    type="http",
-                    method="post",
-                    url=DEFAULT_MODEL_PROBE_URL,
-                    headers=[
-                        HTTPHeaderSpec(name="Content-Type", value="application/json"),
-                    ],
-                    body=body,
-                    timeout=DEFAULT_MODEL_PROBE_TIMEOUT,
-                )
-            ]
-        elif probes is None:
-            # Probes omitted and model not set: normalize to empty list for downstream.
-            values["probes"] = []
-        return values
-
     @root_validator()
     def validate_scaling(cls, values):
         scaling = values.get("scaling")
diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py
index 44579bf81c..cb914fe94d 100644
--- a/src/dstack/_internal/server/services/jobs/configurators/base.py
+++ b/src/dstack/_internal/server/services/jobs/configurators/base.py
@@ -5,18 +5,22 @@
 from pathlib import PurePosixPath
 from typing import Dict, List, Optional
 
+import orjson
 from cachetools import TTLCache, cached
 
 from dstack._internal import settings
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
 from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_MODEL_PROBE_TIMEOUT,
+    DEFAULT_MODEL_PROBE_URL,
     DEFAULT_PROBE_INTERVAL,
     DEFAULT_PROBE_METHOD,
     DEFAULT_PROBE_READY_AFTER,
     DEFAULT_PROBE_URL,
     DEFAULT_REPLICA_GROUP_NAME,
     LEGACY_REPO_DIR,
+    HTTPHeaderSpec,
     PortMapping,
     ProbeConfig,
     PythonVersion,
@@ -394,7 +398,13 @@ def _service_port(self) -> Optional[int]:
 
     def _probes(self) -> list[ProbeSpec]:
         if isinstance(self.run_spec.configuration, ServiceConfiguration):
-            return list(map(_probe_config_to_spec, self.run_spec.configuration.probes or []))
+            probes = self.run_spec.configuration.probes
+            if probes is not None:
+                return list(map(_probe_config_to_spec, probes))
+            # Generate default probe if model is set
+            model = self.run_spec.configuration.model
+            if model is not None:
+                return [_default_model_probe_spec(model.name)]
         return []
 
 
@@ -447,6 +457,28 @@ def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
     )
 
 
+def _default_model_probe_spec(model_name: str) -> ProbeSpec:
+    body = orjson.dumps(
+        {
+            "model": model_name,
+            "messages": [{"role": "user", "content": "hi"}],
+            "max_tokens": 1,
+        }
+    ).decode("utf-8")
+    return ProbeSpec(
+        type="http",
+        method="post",
+        url=DEFAULT_MODEL_PROBE_URL,
+        headers=[
+            HTTPHeaderSpec(name="Content-Type", value="application/json"),
+        ],
+        body=body,
+        timeout=DEFAULT_MODEL_PROBE_TIMEOUT,
+        interval=DEFAULT_PROBE_INTERVAL,
+        ready_after=DEFAULT_PROBE_READY_AFTER,
+    )
+
+
 def 
_join_shell_commands(commands: List[str]) -> str: for i, cmd in enumerate(commands): cmd = cmd.strip() diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 1ff025ea2f..44c31f5cbb 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -5,17 +5,18 @@ from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( - DEFAULT_MODEL_PROBE_TIMEOUT, - DEFAULT_MODEL_PROBE_URL, DevEnvironmentConfigurationParams, RepoSpec, + ServiceConfiguration, parse_run_configuration, ) from dstack._internal.core.models.resources import Range class TestParseConfiguration: - def test_service_model_sets_default_probes_when_probes_omitted(self): + def test_service_model_probes_none_when_omitted(self): + """When model is set but probes omitted, probes should remain None. + The default probe is generated server-side in the job configurator.""" conf = { "type": "service", "commands": ["python3 -m http.server"], @@ -23,17 +24,8 @@ def test_service_model_sets_default_probes_when_probes_omitted(self): "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", } parsed = parse_run_configuration(conf) - assert len(parsed.probes) == 1 - probe = parsed.probes[0] - assert probe.type == "http" - assert probe.method == "post" - assert probe.url == DEFAULT_MODEL_PROBE_URL - assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT - assert len(probe.headers) == 1 - assert probe.headers[0].name == "Content-Type" - assert probe.headers[0].value == "application/json" - assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in (probe.body or "") - assert "max_tokens" in (probe.body or "") + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is None def test_service_model_does_not_override_explicit_probes(self): conf = { @@ -44,6 +36,8 @@ def test_service_model_does_not_override_explicit_probes(self): "probes": [{"type": "http", "url": "/health"}], } parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is not None assert len(parsed.probes) == 1 assert parsed.probes[0].url == "/health" @@ -56,6 +50,8 @@ def test_service_model_explicit_empty_probes_no_default(self): "probes": [], } parsed = parse_run_configuration(conf) + assert isinstance(parsed, ServiceConfiguration) + assert parsed.probes is not None assert len(parsed.probes) == 0 def test_services_replicas_and_scaling(self): diff --git a/src/tests/_internal/server/services/jobs/configurators/test_service.py b/src/tests/_internal/server/services/jobs/configurators/test_service.py new file mode 100644 index 0000000000..b52ee297a5 --- /dev/null +++ b/src/tests/_internal/server/services/jobs/configurators/test_service.py @@ -0,0 +1,98 @@ +import pytest + +from dstack._internal.core.models.configurations import ( + DEFAULT_MODEL_PROBE_TIMEOUT, + DEFAULT_MODEL_PROBE_URL, + ProbeConfig, + ServiceConfiguration, +) +from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator +from dstack._internal.server.testing.common import get_run_spec + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +class TestProbes: + async def test_default_probe_when_model_set(self): + """When model is set but probes omitted, a default model probe should be generated.""" + 
configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + probes = job_specs[0].probes + assert len(probes) == 1 + probe = probes[0] + assert probe.type == "http" + assert probe.method == "post" + assert probe.url == DEFAULT_MODEL_PROBE_URL + assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT + assert len(probe.headers) == 1 + assert probe.headers[0].name == "Content-Type" + assert probe.headers[0].value == "application/json" + assert "meta-llama/Meta-Llama-3.1-8B-Instruct" in (probe.body or "") + assert "max_tokens" in (probe.body or "") + + async def test_explicit_probes_not_overridden(self): + """When probes are explicitly set, they should be used as-is.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + probes=[ProbeConfig(type="http", url="/health")], + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + probes = job_specs[0].probes + assert len(probes) == 1 + assert probes[0].url == "/health" + + async def test_explicit_empty_probes(self): + """When probes is explicitly set to empty list, no probes should be generated.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + model=OpenAIChatModel( + name="meta-llama/Meta-Llama-3.1-8B-Instruct", + format="openai", + ), + probes=[], + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + assert len(job_specs[0].probes) == 0 + + async def test_no_probe_when_no_model(self): + """When neither model nor probes are set, no probes should be generated.""" + configuration = ServiceConfiguration( + port=80, + image="debian", + ) + run_spec = get_run_spec(run_name="run", repo_id="id", configuration=configuration) + configurator = ServiceJobConfigurator(run_spec) + + job_specs = await configurator.get_job_specs(replica_num=0) + + assert len(job_specs) == 1 + assert len(job_specs[0].probes) == 0 From 5f81c696625d298e0d9736dec671aa42453be8db Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 2 Feb 2026 20:39:05 +0100 Subject: [PATCH 4/5] PR review feedback --- src/dstack/_internal/core/models/configurations.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 7c5d442e44..601cf11a81 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -854,7 +854,11 @@ class ServiceConfigurationParams(CoreModel): rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = [] probes: Annotated[ Optional[list[ProbeConfig]], - Field(description="List of probes used to determine job health"), + Field( + description="The list of probes to determine service health. 
" + "If `model` is set, defaults to a `/v1/chat/completions` probe. " + "Set explicitly to override." + ), ] = None # None = omitted (may get default when model is set); [] = explicit empty replicas: Annotated[ From 3e2586747ee26fd8765a14dff15c5c6518b35558 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 2 Feb 2026 20:48:10 +0100 Subject: [PATCH 5/5] Resolved conflict --- docs/docs/concepts/services.md | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 9f4e5900dc..d40984866b 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -233,16 +233,6 @@ Setting the minimum number of replicas to `0` allows the service to scale down t ??? info "Disaggregated serving" Native support for disaggregated prefill and decode, allowing both worker types to run within a single service, is coming soon. -### Model - -If the service is running a chat model with an OpenAI-compatible interface (i.e., `/v1/chat/completions`), -set the [`model`](../reference/dstack.yml/service.md#model) property to make the model accessible via `dstack`'s -global OpenAI-compatible endpoint, and also accessible via `dstack`'s UI. - -When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health. -To customize or disable this, set `probes` explicitly. - - ### Authorization By default, the service enables authorization, meaning the service endpoint requires a `dstack` user token. @@ -341,8 +331,6 @@ Probes are executed for each service replica while the replica is `running`. A p ??? info "Model" If you set the [`model`](#model) property but don't explicitly configure `probes`, `dstack` automatically configures a default probe that tests the model using the `/v1/chat/completions` API. - This default probe sends a minimal chat completion request to verify the model is responding correctly. - To disable probes entirely when `model` is set, explicitly set `probes` to an empty list. See the [reference](../reference/dstack.yml/service.md#probes) for more probe configuration options. @@ -442,6 +430,9 @@ Limits apply to the whole service (all replicas) and per client (by IP). Clients If the service runs a model with an OpenAI-compatible interface, you can set the [`model`](#model) property to make the model accessible through `dstack`'s chat UI on the `Models` page. In this case, `dstack` will use the service's `/v1/chat/completions` service. +When `model` is set, `dstack` automatically configures [`probes`](#probes) to verify model health. +To customize or disable this, set `probes` explicitly. + ### Resources If you specify memory size, you can either specify an explicit size (e.g. `24GB`) or a