From dfd04b309244917c68bc0e859ad6478e5f33e34f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:12:56 +0000 Subject: [PATCH 1/2] Initial plan From 88024f89a970433a9eb66098345099016943110d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:25:13 +0000 Subject: [PATCH 2/2] feat: sync vector store dimensions with embedding output Co-authored-by: sharpninja <16146732+sharpninja@users.noreply.github.com> --- .../patch-20260315024056229023.json | 4 ++ docs/upstream-sync/upstream-3502c222.md | 15 ++++- dotnet/src/GraphRag.Vectors/IndexSchema.cs | 12 ++++ .../src/GraphRag.Vectors/VectorStoreConfig.cs | 17 ++++++ .../GraphRag/Config/Models/GraphRagConfig.cs | 32 +++++++++++ .../Config/GraphRagConfigMethodTests.cs | 55 +++++++++++++++++++ .../Vectors/IndexSchemaTests.cs | 12 ++++ .../Vectors/VectorStoreConfigTests.cs | 33 +++++++++++ .../graphrag/index/validate_config.py | 33 ++++++++++- 9 files changed, 210 insertions(+), 3 deletions(-) create mode 100644 .semversioner/next-release/patch-20260315024056229023.json create mode 100644 dotnet/tests/GraphRag.Tests.Unit/Vectors/VectorStoreConfigTests.cs diff --git a/.semversioner/next-release/patch-20260315024056229023.json b/.semversioner/next-release/patch-20260315024056229023.json new file mode 100644 index 0000000000..84a7316047 --- /dev/null +++ b/.semversioner/next-release/patch-20260315024056229023.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "reconfigure vector store size by embedding model" +} diff --git a/docs/upstream-sync/upstream-3502c222.md b/docs/upstream-sync/upstream-3502c222.md index bcfc668a34..537a3181d5 100644 --- a/docs/upstream-sync/upstream-3502c222.md +++ b/docs/upstream-sync/upstream-3502c222.md @@ -6,6 +6,17 @@ --- -Analysis unavailable: HTTP Error 401: Unauthorized +Manual review complete. -Manual review of upstream commit `3502c222` is required. \ No newline at end of file +## Summary + +Upstream commit `3502c222` updates Python config validation so that, after probing the configured embedding model, GraphRAG automatically realigns `vector_store.vector_size` and each index schema vector dimension to the actual embedding width. + +## Dotnet parity + +The dotnet codebase does not have a direct `validate_config.py` equivalent yet, so parity is implemented in the immutable configuration models: + +- `GraphRagConfig.SyncVectorStoreDimensions(...)` now realigns vector-store dimensions when the configured embed-text model returns a different embedding width. +- `VectorStoreConfig.WithVectorSize(...)` and `IndexSchema.WithVectorSize(...)` propagate the updated dimension consistently. + +No additional missed Python parity changes were identified in this upstream commit beyond the vector-size synchronization behavior. diff --git a/dotnet/src/GraphRag.Vectors/IndexSchema.cs b/dotnet/src/GraphRag.Vectors/IndexSchema.cs index 9509591797..a7d26162fd 100644 --- a/dotnet/src/GraphRag.Vectors/IndexSchema.cs +++ b/dotnet/src/GraphRag.Vectors/IndexSchema.cs @@ -32,4 +32,16 @@ public sealed record IndexSchema /// Gets the mapping of field names to their types. /// public Dictionary? Fields { get; init; } + + /// + /// Returns a copy of the schema with the specified vector size. + /// + /// The vector dimension to apply. + /// A copy of the schema with the updated vector size. + /// Thrown when is less than or equal to zero. + public IndexSchema WithVectorSize(int vectorSize) + { + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(vectorSize); + return this with { VectorSize = vectorSize }; + } } diff --git a/dotnet/src/GraphRag.Vectors/VectorStoreConfig.cs b/dotnet/src/GraphRag.Vectors/VectorStoreConfig.cs index 2589fb0c5f..69159c93be 100644 --- a/dotnet/src/GraphRag.Vectors/VectorStoreConfig.cs +++ b/dotnet/src/GraphRag.Vectors/VectorStoreConfig.cs @@ -52,4 +52,21 @@ public sealed record VectorStoreConfig /// Gets the index schema configuration. /// public IndexSchema? IndexSchema { get; init; } + + /// + /// Returns a copy of the vector store configuration with the specified vector size. + /// + /// The vector dimension to apply. + /// A copy of the vector store configuration with the updated vector size. + /// Thrown when is less than or equal to zero. + public VectorStoreConfig WithVectorSize(int vectorSize) + { + ArgumentOutOfRangeException.ThrowIfNegativeOrZero(vectorSize); + + return this with + { + VectorSize = vectorSize, + IndexSchema = IndexSchema?.WithVectorSize(vectorSize), + }; + } } diff --git a/dotnet/src/GraphRag/Config/Models/GraphRagConfig.cs b/dotnet/src/GraphRag/Config/Models/GraphRagConfig.cs index b1acbca52d..92802014f0 100644 --- a/dotnet/src/GraphRag/Config/Models/GraphRagConfig.cs +++ b/dotnet/src/GraphRag/Config/Models/GraphRagConfig.cs @@ -6,6 +6,7 @@ using GraphRag.Config.Enums; using GraphRag.Input; using GraphRag.Llm.Config; +using GraphRag.Llm.Types; using GraphRag.Storage; using GraphRag.Storage.Tables; using GraphRag.Vectors; @@ -131,4 +132,35 @@ public ModelConfig GetEmbeddingModelConfig(string? modelId = null) throw new KeyNotFoundException($"Embedding model '{key}' not found in configuration."); } + + /// + /// Returns a copy of the configuration with vector store dimensions synchronized to an embedding response. + /// + /// The embedding model that produced the response. + /// The embedding response to inspect. + /// + /// The current configuration when the response is empty, already aligned, or produced by a different embedding model; + /// otherwise a copy with the vector store dimensions updated to match the response. + /// + public GraphRagConfig SyncVectorStoreDimensions(string embeddingModelId, LlmEmbeddingResponse response) + { + ArgumentNullException.ThrowIfNull(embeddingModelId); + ArgumentNullException.ThrowIfNull(response); + + if (!string.Equals(embeddingModelId, EmbedText.EmbeddingModelId, StringComparison.Ordinal)) + { + return this; + } + + var detectedVectorSize = response.FirstEmbedding.Count; + if (detectedVectorSize == 0 || detectedVectorSize == VectorStore.VectorSize) + { + return this; + } + + return this with + { + VectorStore = VectorStore.WithVectorSize(detectedVectorSize), + }; + } } diff --git a/dotnet/tests/GraphRag.Tests.Unit/Config/GraphRagConfigMethodTests.cs b/dotnet/tests/GraphRag.Tests.Unit/Config/GraphRagConfigMethodTests.cs index cdb38a56f4..fc60e88456 100644 --- a/dotnet/tests/GraphRag.Tests.Unit/Config/GraphRagConfigMethodTests.cs +++ b/dotnet/tests/GraphRag.Tests.Unit/Config/GraphRagConfigMethodTests.cs @@ -6,6 +6,8 @@ using GraphRag.Config.Errors; using GraphRag.Config.Models; using GraphRag.Llm.Config; +using GraphRag.Llm.Types; +using GraphRag.Vectors; namespace GraphRag.Tests.Unit.Config; @@ -133,4 +135,57 @@ public void Workflows_CanBeSet() config.Workflows.Should().BeEquivalentTo(workflows); } + + [Fact] + public void SyncVectorStoreDimensions_UpdatesVectorStoreAndSchema_ForConfiguredEmbeddingModel() + { + var config = new GraphRagConfig + { + EmbedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" }, + VectorStore = new VectorStoreConfig + { + Type = "azure_ai_search", + VectorSize = 3072, + IndexSchema = new IndexSchema { IndexName = "entities", VectorSize = 3072 }, + }, + }; + var response = new LlmEmbeddingResponse([[1.0f, 2.0f, 3.0f]]); + + var result = config.SyncVectorStoreDimensions("embed-model", response); + + result.Should().NotBeSameAs(config); + result.VectorStore.VectorSize.Should().Be(3); + result.VectorStore.IndexSchema.Should().NotBeNull(); + result.VectorStore.IndexSchema!.VectorSize.Should().Be(3); + config.VectorStore.VectorSize.Should().Be(3072); + config.VectorStore.IndexSchema!.VectorSize.Should().Be(3072); + } + + [Fact] + public void SyncVectorStoreDimensions_ReturnsSameConfig_WhenEmbeddingModelDoesNotMatch() + { + var config = new GraphRagConfig + { + EmbedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" }, + }; + var response = new LlmEmbeddingResponse([[1.0f, 2.0f, 3.0f]]); + + var result = config.SyncVectorStoreDimensions("different-model", response); + + result.Should().BeSameAs(config); + } + + [Fact] + public void SyncVectorStoreDimensions_ReturnsSameConfig_WhenResponseIsEmpty() + { + var config = new GraphRagConfig + { + EmbedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" }, + }; + var response = new LlmEmbeddingResponse([]); + + var result = config.SyncVectorStoreDimensions("embed-model", response); + + result.Should().BeSameAs(config); + } } diff --git a/dotnet/tests/GraphRag.Tests.Unit/Vectors/IndexSchemaTests.cs b/dotnet/tests/GraphRag.Tests.Unit/Vectors/IndexSchemaTests.cs index 56821e1130..c1b77281ba 100644 --- a/dotnet/tests/GraphRag.Tests.Unit/Vectors/IndexSchemaTests.cs +++ b/dotnet/tests/GraphRag.Tests.Unit/Vectors/IndexSchemaTests.cs @@ -21,4 +21,16 @@ public void DefaultValues_AreCorrect() schema.VectorSize.Should().Be(3072); schema.Fields.Should().BeNull(); } + + [Fact] + public void WithVectorSize_ReturnsUpdatedCopy() + { + var schema = new IndexSchema { IndexName = "test", VectorSize = 3072 }; + + var updated = schema.WithVectorSize(1536); + + updated.Should().NotBeSameAs(schema); + updated.VectorSize.Should().Be(1536); + schema.VectorSize.Should().Be(3072); + } } diff --git a/dotnet/tests/GraphRag.Tests.Unit/Vectors/VectorStoreConfigTests.cs b/dotnet/tests/GraphRag.Tests.Unit/Vectors/VectorStoreConfigTests.cs new file mode 100644 index 0000000000..104a598b3a --- /dev/null +++ b/dotnet/tests/GraphRag.Tests.Unit/Vectors/VectorStoreConfigTests.cs @@ -0,0 +1,33 @@ +// Copyright (c) 2025 Microsoft Corporation. +// Licensed under the MIT License + +using FluentAssertions; +using GraphRag.Vectors; + +namespace GraphRag.Tests.Unit.Vectors; + +/// +/// Unit tests for . +/// +public class VectorStoreConfigTests +{ + [Fact] + public void WithVectorSize_ReturnsUpdatedCopy_AndSynchronizesSchema() + { + var config = new VectorStoreConfig + { + Type = "azure_ai_search", + VectorSize = 3072, + IndexSchema = new IndexSchema { IndexName = "entities", VectorSize = 3072 }, + }; + + var updated = config.WithVectorSize(1536); + + updated.Should().NotBeSameAs(config); + updated.VectorSize.Should().Be(1536); + updated.IndexSchema.Should().NotBeNull(); + updated.IndexSchema!.VectorSize.Should().Be(1536); + config.VectorSize.Should().Be(3072); + config.IndexSchema!.VectorSize.Should().Be(3072); + } +} diff --git a/packages/graphrag/graphrag/index/validate_config.py b/packages/graphrag/graphrag/index/validate_config.py index 4062b8de9a..691837a5b3 100644 --- a/packages/graphrag/graphrag/index/validate_config.py +++ b/packages/graphrag/graphrag/index/validate_config.py @@ -6,12 +6,16 @@ import asyncio import logging import sys +from typing import TYPE_CHECKING from graphrag_llm.completion import create_completion from graphrag_llm.embedding import create_embedding from graphrag.config.models.graph_rag_config import GraphRagConfig +if TYPE_CHECKING: + from graphrag_llm.types import LLMEmbeddingResponse + logger = logging.getLogger(__name__) @@ -29,13 +33,40 @@ def validate_config_names(parameters: GraphRagConfig) -> None: for id, config in parameters.embedding_models.items(): embed_llm = create_embedding(config) try: - asyncio.run( + response = asyncio.run( embed_llm.embedding_async( input=["This is an LLM Embedding Test String"] ) ) logger.info("Embedding LLM Config Params Validated") + + if id == parameters.embed_text.embedding_model_id: + _sync_vector_store_dimensions(parameters, response) except Exception as e: # noqa: BLE001 logger.error(f"Embedding configuration error detected.\n{e}") # noqa print(f"Failed to validate embedding model ({id}) params", e) # noqa: T201 sys.exit(1) + + +def _sync_vector_store_dimensions( + parameters: GraphRagConfig, + response: "LLMEmbeddingResponse", +) -> None: + """Sync vector store dimensions to match the actual embedding model output.""" + detected = len(response.first_embedding) + if detected == 0: + return + + configured = parameters.vector_store.vector_size + if detected == configured: + return + + logger.warning( + "Embedding model produces %d-dimensional vectors but vector_size is " + "configured as %d. Overriding vector_size to match the model.", + detected, + configured, + ) + parameters.vector_store.vector_size = detected + for schema in parameters.vector_store.index_schema.values(): + schema.vector_size = detected