Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260315024056229023.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "reconfigure vector store size by embedding model"
}
15 changes: 13 additions & 2 deletions docs/upstream-sync/upstream-3502c222.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@

---

Analysis unavailable: HTTP Error 401: Unauthorized
Manual review complete.

Manual review of upstream commit `3502c222` is required.
## Summary

Upstream commit `3502c222` updates Python config validation so that, after probing the configured embedding model, GraphRAG automatically realigns `vector_store.vector_size` and each index schema vector dimension to the actual embedding width.

## Dotnet parity

The .NET codebase does not yet have a direct equivalent of `validate_config.py`, so parity is implemented in the immutable configuration models:

- `GraphRagConfig.SyncVectorStoreDimensions(...)` now realigns vector-store dimensions when the configured embed-text model returns a different embedding width.
- `VectorStoreConfig.WithVectorSize(...)` and `IndexSchema.WithVectorSize(...)` propagate the updated dimension consistently.

Beyond the vector-size synchronization behavior, no other Python changes in this upstream commit were found to require parity work.
12 changes: 12 additions & 0 deletions dotnet/src/GraphRag.Vectors/IndexSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,16 @@ public sealed record IndexSchema
/// Gets the mapping of field names to their types.
/// </summary>
public Dictionary<string, string>? Fields { get; init; }

/// <summary>
/// Creates a new schema that matches this one except for its vector dimension.
/// </summary>
/// <param name="vectorSize">The vector dimension the new schema should carry; must be greater than zero.</param>
/// <returns>A new <see cref="IndexSchema"/> whose vector size equals <paramref name="vectorSize"/>; this instance is left unchanged.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="vectorSize"/> is zero or negative.</exception>
public IndexSchema WithVectorSize(int vectorSize)
{
    // Reject non-positive dimensions before any copy is made.
    ArgumentOutOfRangeException.ThrowIfNegativeOrZero(vectorSize);

    var resized = this with
    {
        VectorSize = vectorSize,
    };

    return resized;
}
}
17 changes: 17 additions & 0 deletions dotnet/src/GraphRag.Vectors/VectorStoreConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,21 @@ public sealed record VectorStoreConfig
/// Gets the index schema configuration.
/// </summary>
public IndexSchema? IndexSchema { get; init; }

/// <summary>
/// Creates a new vector store configuration that carries the given vector dimension,
/// applying the same dimension to the index schema when one is configured.
/// </summary>
/// <param name="vectorSize">The vector dimension to apply; must be greater than zero.</param>
/// <returns>
/// A new <see cref="VectorStoreConfig"/> with the updated vector size. If this configuration has no
/// index schema, the copy has none either; this instance is left unchanged.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="vectorSize"/> is zero or negative.</exception>
public VectorStoreConfig WithVectorSize(int vectorSize)
{
    ArgumentOutOfRangeException.ThrowIfNegativeOrZero(vectorSize);

    // A missing schema stays missing; an existing schema is resized in step with the store.
    var resizedSchema = IndexSchema is { } currentSchema
        ? currentSchema.WithVectorSize(vectorSize)
        : null;

    return this with { VectorSize = vectorSize, IndexSchema = resizedSchema };
}
}
32 changes: 32 additions & 0 deletions dotnet/src/GraphRag/Config/Models/GraphRagConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using GraphRag.Config.Enums;
using GraphRag.Input;
using GraphRag.Llm.Config;
using GraphRag.Llm.Types;
using GraphRag.Storage;
using GraphRag.Storage.Tables;
using GraphRag.Vectors;
Expand Down Expand Up @@ -131,4 +132,35 @@ public ModelConfig GetEmbeddingModelConfig(string? modelId = null)

throw new KeyNotFoundException($"Embedding model '{key}' not found in configuration.");
}

/// <summary>
/// Produces a configuration whose vector store dimensions match the width of the first embedding in a response.
/// </summary>
/// <param name="embeddingModelId">The identifier of the embedding model that produced <paramref name="response"/>.</param>
/// <param name="response">The embedding response whose first embedding is measured.</param>
/// <returns>
/// This instance when <paramref name="embeddingModelId"/> is not the embed-text model (ordinal comparison),
/// when the first embedding is empty, or when its width already equals the configured vector size;
/// otherwise a copy whose vector store has been resized to the detected width.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="embeddingModelId"/> or <paramref name="response"/> is <see langword="null"/>.</exception>
public GraphRagConfig SyncVectorStoreDimensions(string embeddingModelId, LlmEmbeddingResponse response)
{
    ArgumentNullException.ThrowIfNull(embeddingModelId);
    ArgumentNullException.ThrowIfNull(response);

    // Only the model used for text embedding determines the vector store dimensions.
    var isEmbedTextModel = string.Equals(embeddingModelId, EmbedText.EmbeddingModelId, StringComparison.Ordinal);
    if (!isEmbedTextModel)
    {
        return this;
    }

    var actualWidth = response.FirstEmbedding.Count;

    // An empty embedding carries no dimension information; a matching width needs no change.
    var requiresResize = actualWidth != 0 && actualWidth != VectorStore.VectorSize;

    return requiresResize
        ? this with { VectorStore = VectorStore.WithVectorSize(actualWidth) }
        : this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
using GraphRag.Config.Errors;
using GraphRag.Config.Models;
using GraphRag.Llm.Config;
using GraphRag.Llm.Types;
using GraphRag.Vectors;

namespace GraphRag.Tests.Unit.Config;

Expand Down Expand Up @@ -133,4 +135,57 @@ public void Workflows_CanBeSet()

config.Workflows.Should().BeEquivalentTo(workflows);
}

[Fact]
public void SyncVectorStoreDimensions_UpdatesVectorStoreAndSchema_ForConfiguredEmbeddingModel()
{
    // Arrange: store and schema are both configured for 3072 dimensions; the model returns 3.
    var embedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" };
    var store = new VectorStoreConfig
    {
        Type = "azure_ai_search",
        VectorSize = 3072,
        IndexSchema = new IndexSchema { IndexName = "entities", VectorSize = 3072 },
    };
    var original = new GraphRagConfig { EmbedText = embedText, VectorStore = store };
    var threeWideResponse = new LlmEmbeddingResponse([[1.0f, 2.0f, 3.0f]]);

    // Act
    var synced = original.SyncVectorStoreDimensions("embed-model", threeWideResponse);

    // Assert: the copy carries the detected width on both the store and its schema.
    synced.Should().NotBeSameAs(original);
    synced.VectorStore.VectorSize.Should().Be(3);
    var syncedSchema = synced.VectorStore.IndexSchema;
    syncedSchema.Should().NotBeNull();
    syncedSchema!.VectorSize.Should().Be(3);

    // Assert: the source configuration still holds its original dimensions.
    original.VectorStore.VectorSize.Should().Be(3072);
    original.VectorStore.IndexSchema!.VectorSize.Should().Be(3072);
}

[Fact]
public void SyncVectorStoreDimensions_ReturnsSameConfig_WhenEmbeddingModelDoesNotMatch()
{
    // Arrange: the configuration names "embed-model" as its embed-text model.
    var embedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" };
    var original = new GraphRagConfig { EmbedText = embedText };
    var threeWideResponse = new LlmEmbeddingResponse([[1.0f, 2.0f, 3.0f]]);

    // Act: the response is attributed to a model other than the configured one.
    var synced = original.SyncVectorStoreDimensions("different-model", threeWideResponse);

    // Assert: the very same instance comes back.
    synced.Should().BeSameAs(original);
}

[Fact]
public void SyncVectorStoreDimensions_ReturnsSameConfig_WhenResponseIsEmpty()
{
    // Arrange: a matching model id, but a response built from an empty collection.
    var embedText = new EmbedTextConfig { EmbeddingModelId = "embed-model" };
    var original = new GraphRagConfig { EmbedText = embedText };
    var emptyResponse = new LlmEmbeddingResponse([]);

    // Act
    var synced = original.SyncVectorStoreDimensions("embed-model", emptyResponse);

    // Assert: the very same instance comes back.
    synced.Should().BeSameAs(original);
}
}
12 changes: 12 additions & 0 deletions dotnet/tests/GraphRag.Tests.Unit/Vectors/IndexSchemaTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,16 @@ public void DefaultValues_AreCorrect()
schema.VectorSize.Should().Be(3072);
schema.Fields.Should().BeNull();
}

[Fact]
public void WithVectorSize_ReturnsUpdatedCopy()
{
    // Arrange
    var original = new IndexSchema { IndexName = "test", VectorSize = 3072 };

    // Act
    var resized = original.WithVectorSize(1536);

    // Assert: a distinct instance carries the new size while the source keeps its own.
    resized.Should().NotBeSameAs(original);
    resized.VectorSize.Should().Be(1536);
    original.VectorSize.Should().Be(3072);
}
}
33 changes: 33 additions & 0 deletions dotnet/tests/GraphRag.Tests.Unit/Vectors/VectorStoreConfigTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) 2025 Microsoft Corporation.
// Licensed under the MIT License

using FluentAssertions;
using GraphRag.Vectors;

namespace GraphRag.Tests.Unit.Vectors;

/// <summary>
/// Verifies the copy-and-resize behavior of <see cref="VectorStoreConfig"/>.
/// </summary>
public class VectorStoreConfigTests
{
    [Fact]
    public void WithVectorSize_ReturnsUpdatedCopy_AndSynchronizesSchema()
    {
        // Arrange: store and nested schema both start at 3072 dimensions.
        var schema = new IndexSchema { IndexName = "entities", VectorSize = 3072 };
        var original = new VectorStoreConfig
        {
            Type = "azure_ai_search",
            VectorSize = 3072,
            IndexSchema = schema,
        };

        // Act
        var resized = original.WithVectorSize(1536);

        // Assert: the copy and its schema both carry the new size.
        resized.Should().NotBeSameAs(original);
        resized.VectorSize.Should().Be(1536);
        var resizedSchema = resized.IndexSchema;
        resizedSchema.Should().NotBeNull();
        resizedSchema!.VectorSize.Should().Be(1536);

        // Assert: the source configuration and its schema are untouched.
        original.VectorSize.Should().Be(3072);
        original.IndexSchema!.VectorSize.Should().Be(3072);
    }
}
33 changes: 32 additions & 1 deletion packages/graphrag/graphrag/index/validate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
import asyncio
import logging
import sys
from typing import TYPE_CHECKING

from graphrag_llm.completion import create_completion
from graphrag_llm.embedding import create_embedding

from graphrag.config.models.graph_rag_config import GraphRagConfig

if TYPE_CHECKING:
from graphrag_llm.types import LLMEmbeddingResponse

logger = logging.getLogger(__name__)


Expand All @@ -29,13 +33,40 @@ def validate_config_names(parameters: GraphRagConfig) -> None:
for id, config in parameters.embedding_models.items():
embed_llm = create_embedding(config)
try:
asyncio.run(
response = asyncio.run(
embed_llm.embedding_async(
input=["This is an LLM Embedding Test String"]
)
)
logger.info("Embedding LLM Config Params Validated")

if id == parameters.embed_text.embedding_model_id:
_sync_vector_store_dimensions(parameters, response)
except Exception as e: # noqa: BLE001
logger.error(f"Embedding configuration error detected.\n{e}") # noqa
print(f"Failed to validate embedding model ({id}) params", e) # noqa: T201
sys.exit(1)


def _sync_vector_store_dimensions(
    parameters: GraphRagConfig,
    response: "LLMEmbeddingResponse",
) -> None:
    """Align the configured vector dimensions with the embedding model's actual output.

    The width is taken from ``len(response.first_embedding)``. Nothing is changed
    when that width is zero or already equals ``parameters.vector_store.vector_size``.
    Otherwise a warning is logged and the detected width is written, in place, to
    ``parameters.vector_store.vector_size`` and to ``vector_size`` on every schema
    in ``parameters.vector_store.index_schema.values()``.
    """
    embedding_width = len(response.first_embedding)
    if not embedding_width:
        # An empty embedding gives no dimension to sync to.
        return

    store = parameters.vector_store
    stale_size = store.vector_size
    if embedding_width == stale_size:
        return

    logger.warning(
        "Embedding model produces %d-dimensional vectors but vector_size is "
        "configured as %d. Overriding vector_size to match the model.",
        embedding_width,
        stale_size,
    )

    # Update the store first, then every index schema, so they all agree.
    store.vector_size = embedding_width
    for index_schema in store.index_schema.values():
        index_schema.vector_size = embedding_width
Loading