From e521efe91bd265a53359c2cdeca0dcc8d2f76b90 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:22:44 +0000 Subject: [PATCH 1/5] feat(gcp): add cache_redis_gcp_url for GCP Memorystore support Add `cache_redis_gcp_url` field to `HostedModelInferenceServiceConfig` and a GCP branch in `cache_redis_url` property. Previously GCP operators had to use `cache_redis_onprem_url` as a workaround; this closes the gap cleanly. Also updates cloud-matrix.md docs to reflect the proper field. Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 4 +- .../model_engine_server/common/config.py | 7 ++ .../tests/unit/api/test_dependencies.py | 70 +++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index 2d92615a5..ce7406ea3 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,10 +355,10 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_onprem_url: redis://my-memorystore-instance.redis.cache.googleapis.com:6379/0 + cache_redis_gcp_url: redis://MEMORYSTORE_PRIVATE_IP:6379/0 ``` -Alternatively, `cache_redis_aws_url` is also accepted (the on-prem fallback path in `config.py` accepts it with a log warning). The same Redis instance serves as both the caching layer and the Celery broker. +`cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. 
#### Service Account (GCP Workload Identity) diff --git a/model-engine/model_engine_server/common/config.py b/model-engine/model_engine_server/common/config.py index 902c1a898..9e6e39b81 100644 --- a/model-engine/model_engine_server/common/config.py +++ b/model-engine/model_engine_server/common/config.py @@ -77,6 +77,9 @@ class HostedModelInferenceServiceConfig: None # Not an env var because the redis cache info is already here ) cache_redis_onprem_url: Optional[str] = None # For on-prem Redis (e.g., redis://redis:6379/0) + cache_redis_gcp_url: Optional[str] = ( + None # For GCP Memorystore (e.g., redis://MEMORYSTORE_HOST:6379/0) + ) sglang_repository: Optional[str] = None @classmethod @@ -106,6 +109,10 @@ def cache_redis_url(self) -> str: redis_port = getattr(infra_config(), "redis_port", 6379) return f"redis://{redis_host}:{redis_port}/0" + if cloud_provider == "gcp": + assert self.cache_redis_gcp_url, "cache_redis_gcp_url required for GCP" + return self.cache_redis_gcp_url + if self.cache_redis_aws_url: assert cloud_provider == "aws", "cache_redis_aws_url is only for AWS" if self.cache_redis_aws_secret_name: diff --git a/model-engine/tests/unit/api/test_dependencies.py b/model-engine/tests/unit/api/test_dependencies.py index 15491c368..a2712827e 100644 --- a/model-engine/tests/unit/api/test_dependencies.py +++ b/model-engine/tests/unit/api/test_dependencies.py @@ -1,6 +1,8 @@ from unittest.mock import MagicMock, patch +import pytest from model_engine_server.api.dependencies import _get_external_interfaces +from model_engine_server.common.config import HostedModelInferenceServiceConfig from model_engine_server.infra.gateways import ( GCSFileStorageGateway, GCSFilesystemGateway, @@ -119,3 +121,71 @@ def test_gcp_provider_selects_gcp_implementations(): external_interfaces.resource_gateway.queue_delegate, RedisQueueEndpointResourceDelegate, ) + + +def test_gcp_cache_redis_url_returns_gcp_url(): + """Test that cache_redis_url returns cache_redis_gcp_url when 
cloud_provider is gcp.""" + with patch("model_engine_server.common.config.infra_config") as mock_infra_config: + mock_infra = MagicMock() + mock_infra.cloud_provider = "gcp" + mock_infra_config.return_value = mock_infra + + config = HostedModelInferenceServiceConfig( + gateway_namespace="ns", + endpoint_namespace="ns", + billing_queue_arn="arn", + sqs_profile="default", + sqs_queue_policy_template="{}", + sqs_queue_tag_template="{}", + model_primitive_host="localhost", + cloud_file_llm_fine_tune_repository="gs://bucket/ft", + hf_user_fine_tuned_weights_prefix="gs://bucket/weights", + istio_enabled=False, + dd_trace_enabled=False, + tgi_repository="repo/tgi", + vllm_repository="repo/vllm", + lightllm_repository="repo/lightllm", + tensorrt_llm_repository="repo/tensorrt", + batch_inference_vllm_repository="repo/batch", + user_inference_base_repository="repo/base", + user_inference_pytorch_repository="repo/pytorch", + user_inference_tensorflow_repository="repo/tf", + docker_image_layer_cache_repository="repo/cache", + sensitive_log_mode=False, + cache_redis_gcp_url="redis://10.0.0.1:6379/0", + ) + assert config.cache_redis_url == "redis://10.0.0.1:6379/0" + + +def test_gcp_cache_redis_url_raises_when_not_set(): + """Test that cache_redis_url raises AssertionError for GCP when cache_redis_gcp_url is not set.""" + with patch("model_engine_server.common.config.infra_config") as mock_infra_config: + mock_infra = MagicMock() + mock_infra.cloud_provider = "gcp" + mock_infra_config.return_value = mock_infra + + config = HostedModelInferenceServiceConfig( + gateway_namespace="ns", + endpoint_namespace="ns", + billing_queue_arn="arn", + sqs_profile="default", + sqs_queue_policy_template="{}", + sqs_queue_tag_template="{}", + model_primitive_host="localhost", + cloud_file_llm_fine_tune_repository="gs://bucket/ft", + hf_user_fine_tuned_weights_prefix="gs://bucket/weights", + istio_enabled=False, + dd_trace_enabled=False, + tgi_repository="repo/tgi", + vllm_repository="repo/vllm", 
+ lightllm_repository="repo/lightllm", + tensorrt_llm_repository="repo/tensorrt", + batch_inference_vllm_repository="repo/batch", + user_inference_base_repository="repo/base", + user_inference_pytorch_repository="repo/pytorch", + user_inference_tensorflow_repository="repo/tf", + docker_image_layer_cache_repository="repo/cache", + sensitive_log_mode=False, + ) + with pytest.raises(AssertionError, match="cache_redis_gcp_url required for GCP"): + _ = config.cache_redis_url From 73e6b39b2ffb671768035b5f8d6cb0b99e1e64ee Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:32:25 +0000 Subject: [PATCH 2/5] docs: use example IP in GCP Memorystore cache_redis_gcp_url sample Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index ce7406ea3..2e67e915a 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,7 +355,7 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_gcp_url: redis://MEMORYSTORE_PRIVATE_IP:6379/0 + cache_redis_gcp_url: redis://10.0.0.3:6379/0 ``` `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. From 965c8ccbfeb82c391647e72aeef56388d5c071cb Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:40:32 +0000 Subject: [PATCH 3/5] feat(vllm): update standard vllm image to 0.17.0 to support Qwen3.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pin inferenceFramework.vllm tag to "0.17.0" (was "latest" → 0.11.1) to support qwen3_5 architecture (Qwen3.5 0.8B, etc.) 
- Allow empty USER_TAG in build_and_upload_image.sh to produce clean version-only image tags (e.g. "0.17.0" instead of "0.17.0-") Build command for the new image: AWS_PROFILE=ml-admin ./build_and_upload_image.sh "" vllm --account=692474966980 Co-Authored-By: Claude Sonnet 4.6 --- charts/model-engine/values.yaml | 2 +- docs/internal/cloud-matrix.md | 2 +- .../inference/vllm/build_and_upload_image.sh | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml index eb2afaae8..90cc0ea18 100644 --- a/charts/model-engine/values.yaml +++ b/charts/model-engine/values.yaml @@ -47,7 +47,7 @@ keda: # Configurable inference framework image tags inferenceFramework: - vllm: "latest" + vllm: "0.17.0" deepspeed: "latest" text_generation_inference: "latest" lightllm: "latest" diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index 2e67e915a..ef4346059 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -355,7 +355,7 @@ Storage auth uses GCP Workload Identity via Application Default Credentials (ADC config: values: launch: - cache_redis_gcp_url: redis://10.0.0.3:6379/0 + cache_redis_gcp_url: redis://10.0.0.3:6379/0 ``` `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. diff --git a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh index 82f2b9567..ea7124c24 100755 --- a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh +++ b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh @@ -38,7 +38,8 @@ set -eo pipefail # Examples: # # # 1. 
Published versions (base image and pip version match) -# ./build_and_upload_image.sh my-tag vllm +# ./build_and_upload_image.sh my-tag vllm # → tag: 0.17.0-my-tag +# ./build_and_upload_image.sh "" vllm # → tag: 0.17.0 (version-only) # ./build_and_upload_image.sh my-tag vllm --vllm-version=0.15.1 # # # 2. Newer pip version on older base image (e.g. 0.16.0 wheel on 0.15.1 base) @@ -105,8 +106,8 @@ VLLM_OMNI_VERSION=${VLLM_OMNI_VERSION:-"0.16.0"} VLLM_OMNI_SOURCE_DIR="" VLLM_OMNI_SOURCE_REF="" -if [ -z "$1" ]; then - echo "Must supply the user-provided tag" +if [ "$#" -lt 1 ]; then + echo "Must supply the user-provided tag (pass empty string \"\" for a version-only tag)" exit 1; fi @@ -264,9 +265,9 @@ fi # Construct image tag based on vllm version and user tag if [ "$BUILD_TARGET" == "vllm_omni" ]; then - IMAGE_TAG="${VLLM_VERSION}-omni-${USER_TAG}" + IMAGE_TAG="${VLLM_VERSION}-omni${USER_TAG:+-$USER_TAG}" else - IMAGE_TAG="${VLLM_VERSION}-${USER_TAG}" + IMAGE_TAG="${VLLM_VERSION}${USER_TAG:+-$USER_TAG}" fi # if build target = vllm use vllm otherwise use vllm_batch From 15470db826ac9b24a455425fb27f171b896ab467 Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 00:48:25 +0000 Subject: [PATCH 4/5] feat(gcp): add GCP branch in get_engine_url for Cloud SQL via env vars On GCP, DB credentials are injected as env vars (DB_HOST, DB_USER, DB_PASSWORD, DB_PORT, DB_NAME) via a Kubernetes secret. Previously get_engine_url fell through to the AWS Secrets Manager path, causing NoCredentialsError on GCP deployments running alembic migrations. Pattern mirrors the existing onprem branch. Also documents the required K8s secret keys in cloud-matrix.md. 
Co-Authored-By: Claude Sonnet 4.6 --- docs/internal/cloud-matrix.md | 11 +++++++++++ model-engine/model_engine_server/db/base.py | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/docs/internal/cloud-matrix.md b/docs/internal/cloud-matrix.md index ef4346059..d17b43afe 100644 --- a/docs/internal/cloud-matrix.md +++ b/docs/internal/cloud-matrix.md @@ -360,6 +360,17 @@ config: `cache_redis_gcp_url` is the preferred field for GCP — it maps directly to GCP Memorystore and triggers the GCP branch in `HostedModelInferenceServiceConfig.cache_redis_url`. The same Redis instance serves as both the caching layer and the Celery broker. +#### Database (Cloud SQL) + +GCP uses environment variables for DB credentials injected via a Kubernetes secret — the same pattern as on-prem. There is no GCP Secret Manager integration; use `kubernetesDatabaseSecretName` to mount the secret: + +```yaml +secrets: + kubernetesDatabaseSecretName: llm-engine-postgres-credentials +``` + +The K8s secret must contain keys that the chart injects as env vars: `DB_HOST`, `DB_HOST_RO`, `DB_PORT`, `DB_USER`, `DB_PASSWORD`, `DB_NAME`. `get_engine_url` in `db/base.py` reads these directly when `cloud_provider == "gcp"`. 
+ #### Service Account (GCP Workload Identity) ```yaml diff --git a/model-engine/model_engine_server/db/base.py b/model-engine/model_engine_server/db/base.py index f5ea49e7c..3d2f884a3 100644 --- a/model-engine/model_engine_server/db/base.py +++ b/model-engine/model_engine_server/db/base.py @@ -69,6 +69,20 @@ def get_engine_url( engine_url = f"postgresql://{user}:{password}@{host}:{port}/{dbname}" + elif infra_config().cloud_provider == "gcp": + user = os.environ.get("DB_USER", "postgres") + password = os.environ.get("DB_PASSWORD", "postgres") + host = ( + os.environ.get("DB_HOST_RO") + if read_only + else os.environ.get("DB_HOST", "localhost") + ) + port = os.environ.get("DB_PORT", "5432") + dbname = os.environ.get("DB_NAME", "llm_engine") + logger.info(f"Connecting to db {host}:{port}, name {dbname}") + + engine_url = f"postgresql://{user}:{password}@{host}:{port}/{dbname}" + elif infra_config().cloud_provider == "azure": client = SecretClient( vault_url=f"https://{os.environ.get('KEYVAULT_NAME')}.vault.azure.net", From 9e18d0a46034ce41992f8fd4fae3c8c22862435f Mon Sep 17 00:00:00 2001 From: lilyz-ai Date: Thu, 26 Mar 2026 01:22:28 +0000 Subject: [PATCH 5/5] revert(charts): keep vllm default tag as latest in values.yaml Individual deployments should pin specific versions; not all deployments may have the 0.17.0 tag available. --- charts/model-engine/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml index 90cc0ea18..eb2afaae8 100644 --- a/charts/model-engine/values.yaml +++ b/charts/model-engine/values.yaml @@ -47,7 +47,7 @@ keda: # Configurable inference framework image tags inferenceFramework: - vllm: "0.17.0" + vllm: "latest" deepspeed: "latest" text_generation_inference: "latest" lightllm: "latest"