From e521efe91bd265a53359c2cdeca0dcc8d2f76b90 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:22:44 +0000 Subject: [PATCH 1/5] feat(gcp): add cache_redis_gcp_url for GCP Memorystore support Add `cache_redis_gcp_url` field to `HostedModelInferenceServiceConfig` and a GCP branch in `cache_redis_url` property. Previously GCP operators had to use `cache_redis_onprem_url` as a workaround; this closes the gap cleanly. Also updates cloud-matrix.md docs to reflect the proper field. Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 4 +- .../model_engine_server/common/config.py | 7 ++ .../tests/unit/api/test_dependencies.py | 70 +++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index 2d92615a5..ce7406ea3 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,10 +355,10 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_onprem_url: redis://my-memorystore-instance.redis.cache.googleapis.com:6379/0 + cache_redis_gcp_url: redis://MEMORYSTORE_PRIVATE_IP:6379/0 ``` -Alternatively, `cache_redis_aws_url` is also accepted (the on-prem fallback path in `config.py` accepts it with a log warning). The same Redis instance serves as both the caching layer and the Celery broker. +`cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. 
#### Service Account (GCP Workload Identity) diff --git a/model-engine/model_engine_server/common/config.py b/model-engine/model_engine_server/common/config.py index 902c1a898..9e6e39b81 100644 --- a/model-engine/model_engine_server/common/config.py +++ b/model-engine/model_engine_server/common/config.py @@ -77,6 +77,9 @@ class HostedModelInferenceServiceConfig: None # Not an env var because the redis cache info is already here ) cache_redis_onprem_url: Optional[str] = None # For on-prem Redis (e.g., redis://redis:6379/0) + cache_redis_gcp_url: Optional[str] = ( + None # For GCP Memorystore (e.g., redis://MEMORYSTORE_HOST:6379/0) + ) sglang_repository: Optional[str] = None @classmethod @@ -106,6 +109,10 @@ def cache_redis_url(self) -> str: redis_port = getattr(infra_config(), "redis_port", 6379) return f"redis://{redis_host}:{redis_port}/0" + if cloud_provider == "gcp": + assert self.cache_redis_gcp_url, "cache_redis_gcp_url required for GCP" + return self.cache_redis_gcp_url + if self.cache_redis_aws_url: assert cloud_provider == "aws", "cache_redis_aws_url is only for AWS" if self.cache_redis_aws_secret_name: diff --git a/model-engine/tests/unit/api/test_dependencies.py b/model-engine/tests/unit/api/test_dependencies.py index 15491c368..a2712827e 100644 --- a/model-engine/tests/unit/api/test_dependencies.py +++ b/model-engine/tests/unit/api/test_dependencies.py @@ -1,6 +1,8 @@ from unittest.mock import MagicMock, patch +import pytest from model_engine_server.api.dependencies import _get_external_interfaces +from model_engine_server.common.config import HostedModelInferenceServiceConfig from model_engine_server.infra.gateways import ( GCSFileStorageGateway, GCSFilesystemGateway, @@ -119,3 +121,71 @@ def test_gcp_provider_selects_gcp_implementations(): external_interfaces.resource_gateway.queue_delegate, RedisQueueEndpointResourceDelegate, ) + + +def test_gcp_cache_redis_url_returns_gcp_url(): + """Test that cache_redis_url returns cache_redis_gcp_url when 
cloud_provider is gcp.""" + with patch("model_engine_server.common.config.infra_config") as mock_infra_config: + mock_infra = MagicMock() + mock_infra.cloud_provider = "gcp" + mock_infra_config.return_value = mock_infra + + config = HostedModelInferenceServiceConfig( + gateway_namespace="ns", + endpoint_namespace="ns", + billing_queue_arn="arn", + sqs_profile="default", + sqs_queue_policy_template="{}", + sqs_queue_tag_template="{}", + model_primitive_host="localhost", + cloud_file_llm_fine_tune_repository="gs://bucket/ft", + hf_user_fine_tuned_weights_prefix="gs://bucket/weights", + istio_enabled=False, + dd_trace_enabled=False, + tgi_repository="repo/tgi", + vllm_repository="repo/vllm", + lightllm_repository="repo/lightllm", + tensorrt_llm_repository="repo/tensorrt", + batch_inference_vllm_repository="repo/batch", + user_inference_base_repository="repo/base", + user_inference_pytorch_repository="repo/pytorch", + user_inference_tensorflow_repository="repo/tf", + docker_image_layer_cache_repository="repo/cache", + sensitive_log_mode=False, + cache_redis_gcp_url="redis://10.0.0.1:6379/0", + ) + assert config.cache_redis_url == "redis://10.0.0.1:6379/0" + + +def test_gcp_cache_redis_url_raises_when_not_set(): + """Test that cache_redis_url raises AssertionError for GCP when cache_redis_gcp_url is not set.""" + with patch("model_engine_server.common.config.infra_config") as mock_infra_config: + mock_infra = MagicMock() + mock_infra.cloud_provider = "gcp" + mock_infra_config.return_value = mock_infra + + config = HostedModelInferenceServiceConfig( + gateway_namespace="ns", + endpoint_namespace="ns", + billing_queue_arn="arn", + sqs_profile="default", + sqs_queue_policy_template="{}", + sqs_queue_tag_template="{}", + model_primitive_host="localhost", + cloud_file_llm_fine_tune_repository="gs://bucket/ft", + hf_user_fine_tuned_weights_prefix="gs://bucket/weights", + istio_enabled=False, + dd_trace_enabled=False, + tgi_repository="repo/tgi", + vllm_repository="repo/vllm", 
+ lightllm_repository="repo/lightllm", + tensorrt_llm_repository="repo/tensorrt", + batch_inference_vllm_repository="repo/batch", + user_inference_base_repository="repo/base", + user_inference_pytorch_repository="repo/pytorch", + user_inference_tensorflow_repository="repo/tf", + docker_image_layer_cache_repository="repo/cache", + sensitive_log_mode=False, + ) + with pytest.raises(AssertionError, match="cache_redis_gcp_url required for GCP"): + _ = config.cache_redis_url From 73e6b39b2ffb671768035b5f8d6cb0b99e1e64ee Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:32:25 +0000 Subject: [PATCH 2/5] docs: use example IP in GCP Memorystore cache_redis_gcp_url sample Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index ce7406ea3..2e67e915a 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,7 +355,7 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_gcp_url: redis://MEMORYSTORE_PRIVATE_IP:6379/0 + cache_redis_gcp_url: redis://10.0.0.3:6379/0 ``` `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. From 965c8ccbfeb82c391647e72aeef56388d5c071cb Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:40:32 +0000 Subject: [PATCH 3/5] feat(vllm): update standard vllm image to 0.17.0 to support Qwen3.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pin inferenceFramework.vllm tag to "0.17.0" (was "latest" → 0.11.1) to support qwen3_5 architecture (Qwen3.5 0.8B, etc.) 
- Allow empty USER_TAG in build_and_upload_image.sh to produce clean version-only image tags (e.g. "0.17.0" instead of "0.17.0-") Build command for the new image: AWS_PROFILE=ml-admin ./build_and_upload_image.sh "" vllm --account=692474966980 Co-Authored-By: Claude Sonnet 4.6 --- charts/model-engine/values.yaml | 2 +- docs/internal/cloud-matrix.md | 2 +- .../inference/vllm/build_and_upload_image.sh | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml index eb2afaae8..90cc0ea18 100644 --- a/charts/model-engine/values.yaml +++ b/charts/model-engine/values.yaml @@ -47,7 +47,7 @@ keda: # Configurable inference framework image tags inferenceFramework: - vllm: "latest" + vllm: "0.17.0" deepspeed: "latest" text_generation_inference: "latest" lightllm: "latest" diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index 2e67e915a..ef4346059 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,7 +355,7 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_gcp_url: redis://10.0.0.3:6379/0 + cache_redis_gcp_url: redis://10.0.0.3:6379/0 ``` `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. diff --git a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh index 82f2b9567..ea7124c24 100755 --- a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh +++ b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh @@ -38,7 +38,8 @@ set -eo pipefail # Examples: # # # 1. 
Published versions (base image and pip version match) -# ./build_and_upload_image.sh my-tag vllm +# ./build_and_upload_image.sh my-tag vllm # → tag: 0.17.0-my-tag +# ./build_and_upload_image.sh "" vllm # → tag: 0.17.0 (version-only) # ./build_and_upload_image.sh my-tag vllm --vllm-version=0.15.1 # # # 2. Newer pip version on older base image (e.g. 0.16.0 wheel on 0.15.1 base) @@ -105,8 +106,8 @@ VLLM_OMNI_VERSION=${VLLM_OMNI_VERSION:-"0.16.0"} VLLM_OMNI_SOURCE_DIR="" VLLM_OMNI_SOURCE_REF="" -if [ -z "$1" ]; then - echo "Must supply the user-provided tag" +if [ "$#" -lt 1 ]; then + echo "Must supply the user-provided tag (pass empty string \"\" for a version-only tag)" exit 1; fi @@ -264,9 +265,9 @@ fi # Construct image tag based on vllm version and user tag if [ "$BUILD_TARGET" == "vllm_omni" ]; then - IMAGE_TAG="${VLLM_VERSION}-omni-${USER_TAG}" + IMAGE_TAG="${VLLM_VERSION}-omni${USER_TAG:+-$USER_TAG}" else - IMAGE_TAG="${VLLM_VERSION}-${USER_TAG}" + IMAGE_TAG="${VLLM_VERSION}${USER_TAG:+-$USER_TAG}" fi # if build target = vllm use vllm otherwise use vllm_batch From 15470db826ac9b24a455425fb27f171b896ab467 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:48:25 +0000 Subject: [PATCH 4/5] feat(gcp): add GCP branch in get_engine_url for Cloud SQL via env vars On GCP, DB credentials are injected as env vars (DB_HOST, DB_USER, DB_PASSWORD, DB_PORT, DB_NAME) via a Kubernetes secret. Previously get_engine_url fell through to the AWS Secrets Manager path, causing NoCredentialsError on GCP deployments running alembic migrations. Pattern mirrors the existing onprem branch. Also documents the required K8s secret keys in cloud-matrix.md. 
Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 11 +++++++++++ model-engine/model_engine_server/db/base.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index ef4346059..d17b43afe 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -360,6 +360,17 @@ config: `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. +#### Database (Cloud SQL) + +GCP uses environment variables for DB credentials injected via a Kubernetes secret — the same pattern as on-prem. There is no GCP Secret Manager integration; use `kubernetesDatabaseSecretName` to mount the secret: + +```yaml +secrets: + kubernetesDatabaseSecretName: llm-engine-postgres-credentials +``` + +The K8s secret must contain keys that the chart injects as env vars: `DB_HOST`, `DB_HOST_RO`, `DB_PORT`, `DB_USER`, `DB_PASSWORD`, `DB_NAME`. `get_engine_url` in `db/base.py` reads these directly when `cloud_provider == "gcp"`. 
+ #### Service Account (GCP Workload Identity) ```yaml diff --git a/model-engine/model_engine_server/db/base.py b/model-engine/model_engine_server/db/base.py index f5ea49e7c..3d2f884a3 100644 --- a/model-engine/model_engine_server/db/base.py +++ b/model-engine/model_engine_server/db/base.py @@ -69,6 +69,20 @@ def get_engine_url( engine_url = f"postgresql://{user}:{password}@{host}:{port}/{dbname}" + elif infra_config().cloud_provider == "gcp": + user = os.environ.get("DB_USER", "postgres") + password = os.environ.get("DB_PASSWORD", "postgres") + host = ( + os.environ.get("DB_HOST_RO") + if read_only + else os.environ.get("DB_HOST", "localhost") + ) + port = os.environ.get("DB_PORT", "5432") + dbname = os.environ.get("DB_NAME", "llm_engine") + logger.info(f"Connecting to db {host}:{port}, name {dbname}") + + engine_url = f"postgresql://{user}:{password}@{host}:{port}/{dbname}" + elif infra_config().cloud_provider == "azure": client = SecretClient( vault_url=f"https://{os.environ.get('KEYVAULT_NAME')}.vault.azure.net", From 9e18d0a46034ce41992f8fd4fae3c8c22862435f Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 01:22:28 +0000 Subject: [PATCH 5/5] revert(charts): keep vllm default tag as latest in values.yaml Individual deployments should pin specific versions; not all deployments may have the 0.17.0 tag available. --- charts/model-engine/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml index 90cc0ea18..eb2afaae8 100644 --- a/charts/model-engine/values.yaml +++ b/charts/model-engine/values.yaml @@ -47,7 +47,7 @@ keda: # Configurable inference framework image tags inferenceFramework: - vllm: "0.17.0" + vllm: "latest" deepspeed: "latest" text_generation_inference: "latest" lightllm: "latest"