diff --git a/changelog/unreleased/llm-document-enrichment.yml b/changelog/unreleased/llm-document-enrichment.yml new file mode 100644 index 000000000000..fd6e55d6249f --- /dev/null +++ b/changelog/unreleased/llm-document-enrichment.yml @@ -0,0 +1,4 @@ +title: Add DocumentEnrichmentUpdateProcessorFactory for LLM-based document enrichment at index time +type: added # added, changed, fixed, deprecated, removed, dependency_update, security, other +authors: +- name: Nicolò Rinaldi diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 0bcff24a4457..bf0a43939202 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -449,11 +449,14 @@ ktor-client-serialization-json = { module = "io.ktor:ktor-serialization-kotlinx- ktor-server-cio = { module = "io.ktor:ktor-server-cio", version.ref = "ktor" } ktor-server-core = { module = "io.ktor:ktor-server-core", version.ref = "ktor" } ktor-server-htmlBuilder = { module = "io.ktor:ktor-server-html-builder", version.ref = "ktor" } +langchain4j-anthropic = { module = "dev.langchain4j:langchain4j-anthropic" } langchain4j-bom = { module = "dev.langchain4j:langchain4j-bom", version.ref = "langchain4j-bom" } langchain4j-cohere = { module = "dev.langchain4j:langchain4j-cohere" } langchain4j-core = { module = "dev.langchain4j:langchain4j-core" } +langchain4j-google-ai-gemini = { module = "dev.langchain4j:langchain4j-google-ai-gemini" } langchain4j-hugging-face = { module = "dev.langchain4j:langchain4j-hugging-face" } langchain4j-mistral-ai = { module = "dev.langchain4j:langchain4j-mistral-ai" } +langchain4j-ollama = { module = "dev.langchain4j:langchain4j-ollama" } langchain4j-open-ai = { module = "dev.langchain4j:langchain4j-open-ai" } lmax-disruptor = { module = "com.lmax:disruptor", version.ref = "lmax-disruptor" } locationtech-spatial4j = { module = "org.locationtech.spatial4j:spatial4j", version.ref = "spatial4j" } diff --git a/solr/modules/language-models/build.gradle 
b/solr/modules/language-models/build.gradle index a4dc82fc15cb..17d9716cfd20 100644 --- a/solr/modules/language-models/build.gradle +++ b/solr/modules/language-models/build.gradle @@ -29,9 +29,12 @@ dependencies { implementation libs.apache.lucene.core implementation libs.langchain4j.core + runtimeOnly libs.langchain4j.anthropic runtimeOnly libs.langchain4j.cohere + runtimeOnly libs.langchain4j.google.ai.gemini runtimeOnly libs.langchain4j.hugging.face runtimeOnly libs.langchain4j.mistral.ai + runtimeOnly libs.langchain4j.ollama runtimeOnly libs.langchain4j.open.ai implementation libs.slf4j.api diff --git a/solr/modules/language-models/gradle.lockfile b/solr/modules/language-models/gradle.lockfile index f9973282836f..1f6a427c6d95 100644 --- a/solr/modules/language-models/gradle.lockfile +++ b/solr/modules/language-models/gradle.lockfile @@ -40,13 +40,16 @@ com.tdunning:t-digest:3.3=jarValidation,runtimeClasspath,runtimeLibs,solrPlatfor commons-cli:commons-cli:1.10.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath commons-codec:commons-codec:1.19.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath commons-io:commons-io:2.20.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath +dev.langchain4j:langchain4j-anthropic:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-bom:1.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath dev.langchain4j:langchain4j-cohere:1.9.1-beta17=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-core:1.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +dev.langchain4j:langchain4j-google-ai-gemini:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath 
dev.langchain4j:langchain4j-http-client-jdk:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-http-client:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-hugging-face:1.9.1-beta17=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-mistral-ai:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath +dev.langchain4j:langchain4j-ollama:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath dev.langchain4j:langchain4j-open-ai:1.9.1=jarValidation,runtimeClasspath,runtimeLibs,testRuntimeClasspath io.dropwizard.metrics:metrics-annotation:4.2.26=jarValidation,testRuntimeClasspath io.dropwizard.metrics:metrics-core:4.2.26=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/SolrChatModel.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/SolrChatModel.java new file mode 100644 index 000000000000..afd45d11ca07 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/SolrChatModel.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.model; + +import dev.langchain4j.data.message.UserMessage; +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.chat.request.ChatRequest; +import dev.langchain4j.model.chat.request.ResponseFormat; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Method; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Map; +import java.util.Objects; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.Utils; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.languagemodels.documentenrichment.store.ChatModelException; +import org.apache.solr.languagemodels.documentenrichment.store.rest.ManagedChatModelStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This object wraps a {@link ChatModel} to produce the content of new fields from another. 
+ * It's meant to be used as a managed resource with the {@link + * ManagedChatModelStore} + */ +public class SolrChatModel implements Accountable { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final long BASE_RAM_BYTES = + RamUsageEstimator.shallowSizeOfInstance(SolrChatModel.class); + // timeout is type Duration + private static final String TIMEOUT_PARAM = "timeout"; + + // the followings are Integer type + private static final String MAX_RETRIES_PARAM = "maxRetries"; + private static final String THINKING_BUDGET_TOKENS = "thinkingBudgetTokens"; + private static final String RANDOM_SEED = "randomSeed"; + + private final String name; + private final Map params; + private final ChatModel chatModel; + private final int hashCode; + + public static SolrChatModel getInstance( + SolrResourceLoader solrResourceLoader, + String className, + String name, + Map params) + throws ChatModelException { + try { + /* + * The idea here is to build a {@link dev.langchain4j.model.chat.ChatModel} using inversion + * of control. + * Each model has its own list of parameters we don't know beforehand, but each {@link dev.langchain4j.model.chat.ChatModel} class + * has its own builder that uses setters with the same name of the parameter in input. + * */ + ChatModel textToTextModel; + Class modelClass = solrResourceLoader.findClass(className, ChatModel.class); + var builder = modelClass.getMethod("builder").invoke(null); + if (params != null) { + /* + * This block of code has the responsibility of instantiate a {@link + * dev.langchain4j.model.chat.ChatModel} using the params provided.classes have + * params of The specific implementation of {@link + * dev.langchain4j.model.chat.ChatModel} is not known beforehand. So we benefit of + * the design choice in langchain4j that each subclass implementing {@link + * dev.langchain4j.model.chat.ChatModel} uses setters with the same name of the + * param. 
+ */ + for (String paramName : params.keySet()) { + /* + * When a param is not primitive, we need to instantiate the object explicitly and then call the + * setter method. + * N.B. when adding support to new models, pay attention to all the parameters they + * support, some of them may require to be handled in here as separate switch cases + */ + switch (paramName) { + case TIMEOUT_PARAM -> builder + .getClass() + .getMethod(paramName, Duration.class) + .invoke(builder, Duration.ofSeconds((Long) params.get(paramName))); + + case MAX_RETRIES_PARAM, THINKING_BUDGET_TOKENS, RANDOM_SEED -> builder + .getClass() + .getMethod(paramName, Integer.class) + .invoke(builder, ((Long) params.get(paramName)).intValue()); + + /* + * For primitive params if there's only one setter available, we call it. + * If there's choice we default to the string one + */ + default -> { + ArrayList paramNameMatches = new ArrayList<>(); + for (var method : builder.getClass().getMethods()) { + if (paramName.equals(method.getName()) && method.getParameterCount() == 1) { + paramNameMatches.add(method); + } + } + if (paramNameMatches.size() == 1) { + paramNameMatches.getFirst().invoke(builder, params.get(paramName)); + } else { + try { + builder + .getClass() + .getMethod(paramName, String.class) + .invoke(builder, params.get(paramName).toString()); + } catch (NoSuchMethodException e) { + log.error("Parameter {} not supported by model {}", paramName, className); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e); + } + } + } + } + } + } + textToTextModel = (ChatModel) builder.getClass().getMethod("build").invoke(builder); + return new SolrChatModel(name, textToTextModel, params); + } catch (final Exception e) { + throw new ChatModelException("Model loading failed for " + className, e); + } + } + + public SolrChatModel( + String name, ChatModel chatModel, Map params) { + this.name = name; + this.chatModel = chatModel; + this.params = params; + this.hashCode = 
calculateHashCode(); + } + + /** + * Sends a structured chat request and returns the parsed value from the {@code {"value": ...}} + * JSON object that the model is instructed to produce via {@code responseFormat}. + * + * @return the extracted value: a {@link String}, {@link Number}, {@link Boolean}, or {@link + * java.util.List} depending on the Solr output field type + */ + public Object chat(String text, ResponseFormat responseFormat) { + ChatRequest chatRequest = + ChatRequest.builder() + .responseFormat(responseFormat) + .messages(UserMessage.from(text)) + .build(); + String rawJson = chatModel.chat(chatRequest).aiMessage().text(); + Object parsed = Utils.fromJSONString(rawJson); + if (!(parsed instanceof Map map) || !map.containsKey("value")) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "LLM response is missing the 'value' key: " + rawJson); + } + return map.get("value"); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(name=" + getName() + ")"; + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES + + RamUsageEstimator.sizeOfObject(name) + + RamUsageEstimator.sizeOfObject(chatModel); + } + + @Override + public int hashCode() { + return hashCode; + } + + private int calculateHashCode() { + final int prime = 31; + int result = 1; + result = (prime * result) + Objects.hashCode(name); + result = (prime * result) + Objects.hashCode(chatModel); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof SolrChatModel)) return false; + final SolrChatModel other = (SolrChatModel) obj; + return Objects.equals(chatModel, other.chatModel) && Objects.equals(name, other.name); + } + + public String getName() { + return name; + } + + public String getChatModelClassName() { + return chatModel.getClass().getName(); + } + + public Map getParams() { + return params; + } +} diff --git 
a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/package-info.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/package-info.java new file mode 100644 index 000000000000..9b1575f35d58 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/model/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** APIs and classes wrapping the chat models used for document enrichment. */ +package org.apache.solr.languagemodels.documentenrichment.model; diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/ChatModelException.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/ChatModelException.java new file mode 100644 index 000000000000..a3315faaa234 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/ChatModelException.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
/**
 * Unchecked exception raised when a chat model cannot be loaded or cannot be registered in the
 * model store.
 */
public class ChatModelException extends RuntimeException {

  private static final long serialVersionUID = 1L;

  /**
   * @param message description of the failure
   */
  public ChatModelException(String message) {
    super(message);
  }

  /**
   * @param message description of the failure
   * @param cause the underlying failure; any {@link Throwable} is accepted so that errors raised
   *     while loading a model class can be wrapped as well
   */
  public ChatModelException(String message, Throwable cause) {
    super(message, cause);
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.store; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel; + +/** Simple store to manage CRUD operations on the {@link SolrChatModel} */ +public class ChatModelStore { + + private final Map availableModels; + + public ChatModelStore() { + availableModels = Collections.synchronizedMap(new LinkedHashMap<>()); + } + + public SolrChatModel getModel(String name) { + return availableModels.get(name); + } + + public void clear() { + availableModels.clear(); + } + + public List getModels() { + synchronized (availableModels) { + final List availableModelsValues = + new ArrayList<>(availableModels.values()); + return Collections.unmodifiableList(availableModelsValues); + } + } + + @Override + public String toString() { + return "ChatModelStore [availableModels=" + availableModels.keySet() + "]"; + } + + public SolrChatModel delete(String modelName) { + return availableModels.remove(modelName); + } + + public void addModel(SolrChatModel modeldata) throws ChatModelException { + final String name = modeldata.getName(); + if (availableModels.putIfAbsent(modeldata.getName(), modeldata) != null) { + throw new ChatModelException( + "model '" + name + "' already exists. 
Please use a different name"); + } + } +} diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/package-info.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/package-info.java new file mode 100644 index 000000000000..ec20da4f87ee --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Contains model store related classes. 
*/ +package org.apache.solr.languagemodels.documentenrichment.store; diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/ManagedChatModelStore.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/ManagedChatModelStore.java new file mode 100644 index 000000000000..f8c6414354d8 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/ManagedChatModelStore.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.languagemodels.documentenrichment.store.rest; + +import java.lang.invoke.MethodHandles; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import net.jcip.annotations.ThreadSafe; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel; +import org.apache.solr.languagemodels.documentenrichment.store.ChatModelException; +import org.apache.solr.languagemodels.documentenrichment.store.ChatModelStore; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.rest.BaseSolrResource; +import org.apache.solr.rest.ManagedResource; +import org.apache.solr.rest.ManagedResourceObserver; +import org.apache.solr.rest.ManagedResourceStorage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Managed Resource wrapper for the {@link ChatModelStore} to expose it via REST */ +@ThreadSafe +public class ManagedChatModelStore extends ManagedResource + implements ManagedResource.ChildResourceSupport { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + /** the model store rest endpoint */ + public static final String REST_END_POINT = "/schema/chat-model-store"; + + /** Managed model store: the name of the attribute containing all the models of a model store */ + private static final String MODELS_JSON_FIELD = "models"; + + /** name of the attribute containing a class */ + static final String CLASS_KEY = "class"; + + /** name of the attribute containing a name */ + static final String NAME_KEY = "name"; + + /** name of the attribute containing parameters */ + static final String PARAMS_KEY = "params"; + + public static void registerManagedChatModelStore( + SolrResourceLoader solrResourceLoader, 
ManagedResourceObserver managedResourceObserver) { + solrResourceLoader + .getManagedResourceRegistry() + .registerManagedResource( + REST_END_POINT, ManagedChatModelStore.class, managedResourceObserver); + } + + public static ManagedChatModelStore getManagedModelStore(SolrCore core) { + return (ManagedChatModelStore) core.getRestManager().getManagedResource(REST_END_POINT); + } + + /** + * Returns the available models as a list of Maps objects. After an update the managed resources + * needs to return the resources in this format in order to store in json somewhere (zookeeper, + * disk...) + * + * @return the available models as a list of Maps objects + */ + private static List modelsAsManagedResources(List models) { + return models.stream() + .map(ManagedChatModelStore::toModelMap) + .collect(Collectors.toList()); + } + + @SuppressWarnings("unchecked") + public static SolrChatModel fromModelMap( + SolrResourceLoader solrResourceLoader, Map chatModel) { + return SolrChatModel.getInstance( + solrResourceLoader, + (String) chatModel.get(CLASS_KEY), // modelClassName + (String) chatModel.get(NAME_KEY), // modelName + (Map) chatModel.get(PARAMS_KEY)); + } + + private static LinkedHashMap toModelMap(SolrChatModel model) { + final LinkedHashMap modelMap = new LinkedHashMap<>(5, 1.0f); + modelMap.put(NAME_KEY, model.getName()); + modelMap.put(CLASS_KEY, model.getChatModelClassName()); + modelMap.put(PARAMS_KEY, model.getParams()); + return modelMap; + } + + private final ChatModelStore store; + private Object managedData; + + public ManagedChatModelStore( + String resourceId, SolrResourceLoader loader, ManagedResourceStorage.StorageIO storageIO) + throws SolrException { + super(resourceId, loader, storageIO); + store = new ChatModelStore(); + } + + @Override + protected ManagedResourceStorage createStorage( + ManagedResourceStorage.StorageIO storageIO, SolrResourceLoader loader) throws SolrException { + return new ManagedResourceStorage.JsonStorage(storageIO, loader, 
-1); + } + + @Override + protected void onManagedDataLoadedFromStorage(NamedList managedInitArgs, Object managedData) + throws SolrException { + store.clear(); + this.managedData = managedData; + } + + public void loadStoredModels() { + log.info("------ managed models ~ loading ------"); + + if ((managedData != null) && (managedData instanceof List)) { + @SuppressWarnings({"unchecked"}) + final List> chatModels = (List>) managedData; + for (final Map chatModel : chatModels) { + addModelFromMap(chatModel); + } + } + } + + private void addModelFromMap(Map modelMap) { + try { + addModel(fromModelMap(solrResourceLoader, modelMap)); + } catch (final ChatModelException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + } + + public void addModel(SolrChatModel model) throws SolrException { + try { + if (log.isInfoEnabled()) { + log.info("adding model {}", model.getName()); + } + store.addModel(model); + } catch (final ChatModelException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + } + + @SuppressWarnings("unchecked") + @Override + protected Object applyUpdatesToManagedData(Object updates) { + if (updates instanceof List) { + final List> chatModels = (List>) updates; + for (final Map chatModel : chatModels) { + addModelFromMap(chatModel); + } + } + + if (updates instanceof Map) { + final Map map = (Map) updates; + addModelFromMap(map); + } + + return modelsAsManagedResources(store.getModels()); + } + + @Override + public void doDeleteChild(BaseSolrResource endpoint, String childId) { + store.delete(childId); + storeManagedData(applyUpdatesToManagedData(null)); + } + + /** + * Called to retrieve a named part (the given childId) of the resource at the given endpoint. + * Note: since we have a unique child managed store we ignore the childId. 
+ */ + @Override + public void doGet(BaseSolrResource endpoint, String childId) { + final SolrQueryResponse response = endpoint.getSolrResponse(); + response.add(MODELS_JSON_FIELD, modelsAsManagedResources(store.getModels())); + } + + public SolrChatModel getModel(String modelName) { + return store.getModel(modelName); + } + + @Override + public String toString() { + return "ManagedChatModelStore [store=" + store + "]"; + } +} diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/package-info.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/package-info.java new file mode 100644 index 000000000000..dfb013a8a902 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/store/rest/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Contains the {@link org.apache.solr.rest.ManagedResource} that encapsulate the model stores. 
*/ +package org.apache.solr.languagemodels.documentenrichment.store.rest; diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessor.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessor.java new file mode 100644 index 000000000000..3f90fd8eb580 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessor.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.languagemodels.documentenrichment.update.processor; + +import dev.langchain4j.model.chat.request.ResponseFormat; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.List; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; +import org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class DocumentEnrichmentUpdateProcessor extends UpdateRequestProcessor { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final IndexSchema schema; + private final List inputFields; + private final String outputField; + private final String prompt; + private final SolrChatModel chatModel; + private final boolean multiValued; + private final ResponseFormat responseFormat; + + public DocumentEnrichmentUpdateProcessor( + List inputFields, + String outputField, + String prompt, + SolrChatModel chatModel, + boolean multiValued, + ResponseFormat responseFormat, + SolrQueryRequest req, + UpdateRequestProcessor next) { + super(next); + this.schema = req.getSchema(); + this.inputFields = inputFields; + this.outputField = outputField; + this.prompt = prompt; + this.chatModel = chatModel; + this.multiValued = multiValued; + this.responseFormat = responseFormat; + } + + /** + * @param cmd the update command in input containing the Document to process + * @throws IOException If there is a low-level I/O error + */ + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument doc = cmd.getSolrInputDocument(); + + // Collect all field values; skip enrichment if any 
declared field is null or empty + String injectedPrompt = prompt; + for (String fieldName : inputFields) { + SolrInputField field = doc.get(fieldName); + if (isNullOrEmpty(field)) { + super.processAdd(cmd); + return; + } + injectedPrompt = injectedPrompt.replace("{" + fieldName + "}", field.getValue().toString()); + } + + try { + // as for now, only a plain text as prompt is sent to the model (no support for tools/skills/agents) + // chatModel.chat returns the parsed value from the structured JSON response + Object value = chatModel.chat(injectedPrompt, responseFormat); + if (multiValued && value instanceof List list) { + for (Object item : list) { + doc.addField(outputField, item); + } + } else { + doc.setField(outputField, value); + } + } catch (RuntimeException chatModelFailure) { + if (log.isErrorEnabled()) { + SchemaField uniqueKeyField = schema.getUniqueKeyField(); + String uniqueKeyFieldName = uniqueKeyField.getName(); + log.error( + "Could not process fields {} for the document with {}: {}", + inputFields, + uniqueKeyFieldName, + doc.getFieldValue(uniqueKeyFieldName), + chatModelFailure); + } + } + super.processAdd(cmd); + } + + protected boolean isNullOrEmpty(SolrInputField inputFieldContent) { + return (inputFieldContent == null + || inputFieldContent.getValue() == null + || inputFieldContent.getValue().toString().isEmpty()); + } +} diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactory.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactory.java new file mode 100644 index 000000000000..93d730c852f7 --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactory.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.languagemodels.documentenrichment.update.processor; + +import dev.langchain4j.model.chat.request.ResponseFormat; +import dev.langchain4j.model.chat.request.ResponseFormatType; +import dev.langchain4j.model.chat.request.json.JsonArraySchema; +import dev.langchain4j.model.chat.request.json.JsonBooleanSchema; +import dev.langchain4j.model.chat.request.json.JsonIntegerSchema; +import dev.langchain4j.model.chat.request.json.JsonNumberSchema; +import dev.langchain4j.model.chat.request.json.JsonObjectSchema; +import dev.langchain4j.model.chat.request.json.JsonSchema; +import dev.langchain4j.model.chat.request.json.JsonSchemaElement; +import dev.langchain4j.model.chat.request.json.JsonStringSchema; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.RequiredSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import 
org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel; +import org.apache.solr.languagemodels.documentenrichment.store.rest.ManagedChatModelStore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.rest.ManagedResource; +import org.apache.solr.rest.ManagedResourceObserver; +import org.apache.solr.schema.BoolField; +import org.apache.solr.schema.DatePointField; +import org.apache.solr.schema.DenseVectorField; +import org.apache.solr.schema.DoublePointField; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.FloatPointField; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.IntPointField; +import org.apache.solr.schema.LongPointField; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.StrField; +import org.apache.solr.schema.TextField; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.apache.solr.update.processor.UpdateRequestProcessorFactory; +import org.apache.solr.util.plugin.SolrCoreAware; + +/** + * Insert in an existing field the output of the model coming from one or more textual field values. + * + *

One or more {@code inputField} parameters specify the Solr fields to use as input. Each field + * name must appear as a {@code {fieldName}} placeholder in the prompt. Exactly one of {@code + * prompt} or {@code promptFile} must be provided. + * + *

+ * <processor class="solr.languagemodels.documentenrichment.update.processor.DocumentEnrichmentUpdateProcessorFactory">
+ *   <str name="inputField">title_field</str>
+ *   <str name="inputField">body_field</str>
+ *   <str name="outputField">enriched_field</str>
+ *   <str name="prompt">Title: {title_field}. Body: {body_field}.</str>
+ *   <str name="model">ChatModel</str>
+ * </processor>
+ * 
+ * + *

Alternatively, the prompt can be loaded from a text file using {@code promptFile}: + * + *

+ * <processor class="solr.languagemodels.documentenrichment.update.processor.DocumentEnrichmentUpdateProcessorFactory">
+ *   <str name="inputField">title_field</str>
+ *   <str name="outputField">enriched_field</str>
+ *   <str name="promptFile">prompt.txt</str>
+ *   <str name="model">ChatModel</str>
+ * </processor>
+ * 
+ * + *

Validation rules: + * + *

    + *
  • At least one {@code inputField} must be declared. + *
  • Exactly one of {@code prompt} or {@code promptFile} must be provided. + *
  • Every declared {@code inputField} must have a corresponding {@code {fieldName}} placeholder + * in the prompt. + *
  • Every {@code {placeholder}} in the prompt must correspond to a declared {@code inputField}. + *
+ */ +public class DocumentEnrichmentUpdateProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware, ManagedResourceObserver { + private static final String INPUT_FIELD_PARAM = "inputField"; + private static final String OUTPUT_FIELD_PARAM = "outputField"; + private static final String PROMPT = "prompt"; + private static final String PROMPT_FILE = "promptFile"; + private static final String MODEL_NAME = "model"; + private static final Pattern PLACEHOLDER_PATTERN = Pattern.compile("\\{([^}]+)\\}"); + + private List inputFields; + private String outputField; + private String promptText; + private String promptFile; + private String modelName; + + @Override + public void init(final NamedList args) { + // removeConfigArgs handles both multiple and + // and must be called before toSolrParams() since it mutates args in place + Collection fieldNames = args.removeConfigArgs(INPUT_FIELD_PARAM); + if (fieldNames.isEmpty()) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "At least one 'inputField' must be provided"); + } + inputFields = List.copyOf(fieldNames); + + SolrParams params = args.toSolrParams(); + RequiredSolrParams required = params.required(); + outputField = required.get(OUTPUT_FIELD_PARAM); + modelName = required.get(MODEL_NAME); + + String inlinePrompt = params.get(PROMPT); + String promptFilePath = params.get(PROMPT_FILE); + + if (inlinePrompt == null && promptFilePath == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Either 'prompt' or 'promptFile' must be provided"); + } + if (inlinePrompt != null && promptFilePath != null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Only one of 'prompt' or 'promptFile' can be provided, not both"); + } + if (inlinePrompt != null) { + validatePromptPlaceholders(inlinePrompt, inputFields); + this.promptText = inlinePrompt; + } + this.promptFile = promptFilePath; + } + + @Override + public void inform(SolrCore core) { + 
final SolrResourceLoader solrResourceLoader = core.getResourceLoader(); + ManagedChatModelStore.registerManagedChatModelStore(solrResourceLoader, this); + if (promptFile != null) { + try (InputStream is = solrResourceLoader.openResource(promptFile)) { + promptText = new String(is.readAllBytes(), StandardCharsets.UTF_8).trim(); + } catch (IOException e) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Cannot read prompt file: " + promptFile, + e); + } + validatePromptPlaceholders(promptText, inputFields); + } + } + + @Override + public void onManagedResourceInitialized(NamedList args, ManagedResource res) + throws SolrException { + if (res instanceof ManagedChatModelStore store) { + store.loadStoredModels(); + } + } + + @Override + public UpdateRequestProcessor getInstance( + SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + IndexSchema latestSchema = req.getCore().getLatestSchema(); + + for (String fieldName : inputFields) { + if (!latestSchema.isDynamicField(fieldName) && !latestSchema.hasExplicitField(fieldName)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "undefined field: \"" + fieldName + "\""); + } + } + + final SchemaField outputFieldSchema = latestSchema.getField(outputField); + + ResponseFormat responseFormat = buildResponseFormat(outputFieldSchema); + boolean multiValued = outputFieldSchema.multiValued(); + + ManagedChatModelStore store = ManagedChatModelStore.getManagedModelStore(req.getCore()); + SolrChatModel chatModel = store.getModel(modelName); + if (chatModel == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "The model configured in the Update Request Processor '" + + modelName + + "' can't be found in the store: " + + ManagedChatModelStore.REST_END_POINT); + } + + return new DocumentEnrichmentUpdateProcessor( + inputFields, outputField, promptText, chatModel, multiValued, responseFormat, req, next); + } + + /** + * Builds a {@link 
ResponseFormat} that instructs the model to return a JSON object {@code + * {"value": ...}} whose value type matches the Solr field type. For multivalued fields the value + * is wrapped in a {@link JsonArraySchema} nested inside the root {@link JsonObjectSchema}. + * + *

Nesting {@link JsonArraySchema} inside a {@link JsonObjectSchema} property is supported by + * all langchain4j providers that implement structured outputs with {@link JsonObjectSchema} (OpenAI, Azure OpenAI, + * Google AI, Gemini, Mistral, Ollama, Amazon Bedrock, Watsonx). + */ + static ResponseFormat buildResponseFormat(SchemaField schemaField) { + JsonSchemaElement valueElement = toJsonSchemaElement(schemaField.getType()); + JsonSchemaElement valueSchema = + schemaField.multiValued() + ? JsonArraySchema.builder().items(valueElement).build() + : valueElement; + return ResponseFormat.builder() + .type(ResponseFormatType.JSON) + .jsonSchema( + JsonSchema.builder() + .name("output") + .rootElement( + JsonObjectSchema.builder() + .addProperty("value", valueSchema) + .required("value") + .build()) + .build()) + .build(); + } + + private static JsonSchemaElement toJsonSchemaElement(FieldType fieldType) { + // DenseVectorField extends FloatPointField, so it must be rejected before the numeric checks + if (fieldType instanceof DenseVectorField) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "field type is not supported by Document Enrichment: " + + fieldType.getClass().getSimpleName()); + } + if (fieldType instanceof StrField + || fieldType instanceof TextField + || fieldType instanceof DatePointField) { + return new JsonStringSchema(); + } else if (fieldType instanceof IntPointField || fieldType instanceof LongPointField) { + return new JsonIntegerSchema(); + } else if (fieldType instanceof FloatPointField || fieldType instanceof DoublePointField) { + return new JsonNumberSchema(); + } else if (fieldType instanceof BoolField) { + return new JsonBooleanSchema(); + } else { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "field type is not supported by Document Enrichment: " + + fieldType.getClass().getSimpleName()); + } + } + + private static void validatePromptPlaceholders(String prompt, List fieldNames) { + Set 
promptPlaceholders = new LinkedHashSet<>(); + Matcher m = PLACEHOLDER_PATTERN.matcher(prompt); + while (m.find()) { + promptPlaceholders.add(m.group(1)); + } + + Set missingInPrompt = new LinkedHashSet<>(fieldNames); + missingInPrompt.removeAll(promptPlaceholders); + if (!missingInPrompt.isEmpty()) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "prompt is missing placeholders for inputField(s): " + missingInPrompt); + } + + Set unknownInPrompt = new LinkedHashSet<>(promptPlaceholders); + unknownInPrompt.removeAll(new HashSet<>(fieldNames)); + if (!unknownInPrompt.isEmpty()) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "prompt contains placeholders not declared as inputField(s): " + unknownInPrompt); + } + } + + public List getInputFields() { + return inputFields; + } + + public String getOutputField() { + return outputField; + } + + public String getPrompt() { + return promptText; + } + + public String getModelName() { + return modelName; + } + + public String getPromptFile() { + return promptFile; + } +} diff --git a/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/package-info.java b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/package-info.java new file mode 100644 index 000000000000..1aaedcf004fd --- /dev/null +++ b/solr/modules/language-models/src/java/org/apache/solr/languagemodels/documentenrichment/update/processor/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Contains update request processor related classes. */ +package org.apache.solr.languagemodels.documentenrichment.update.processor; diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/anthropic-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/anthropic-chat-model.json new file mode 100644 index 000000000000..c4bd85ada4bb --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/anthropic-chat-model.json @@ -0,0 +1,13 @@ +{ + "class": "dev.langchain4j.model.anthropic.AnthropicChatModel", + "name": "anthropic-chat-1", + "params": { + "baseUrl": "https://api.anthropic.com/v1", + "apiKey": "apiKey-anthropic", + "modelName": "claude-3-5-haiku-latest", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-ambiguous.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-ambiguous.json new file mode 100644 index 000000000000..1d737c9ae9d2 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-ambiguous.json @@ -0,0 +1,8 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-1", + "params": { + "response": "enriched content", + "ambiguous": 10 + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-malformed-json.json 
b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-malformed-json.json new file mode 100644 index 000000000000..bdc8394add3b --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-malformed-json.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-1", + "params": { + "response": "not valid json at all" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-missing-value-key.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-missing-value-key.json new file mode 100644 index 000000000000..42a52faf650a --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-missing-value-key.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-1", + "params": { + "response": "{\"result\": \"some value\"}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-boolean.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-boolean.json new file mode 100644 index 000000000000..7ba22888cb2b --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-boolean.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-boolean-multi", + "params": { + "response": "{\"value\": [true, false]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-date.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-date.json new file mode 100644 index 
000000000000..f159e3334614 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-date.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-date-multi", + "params": { + "response": "{\"value\": [\"2024-01-15T00:00:00Z\", \"2025-06-30T00:00:00Z\"]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-double.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-double.json new file mode 100644 index 000000000000..8b01495e474e --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-double.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-double-multi", + "params": { + "response": "{\"value\": [3.14, 2.71]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-float.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-float.json new file mode 100644 index 000000000000..0415048c1315 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-float.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-float-multi", + "params": { + "response": "{\"value\": [1.5, 2.5]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-int.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-int.json new file mode 100644 index 000000000000..ff15d3f0b584 --- /dev/null +++ 
b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-int.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-int-multi", + "params": { + "response": "{\"value\": [1, 2]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-long.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-long.json new file mode 100644 index 000000000000..03c06eb0f5d3 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-long.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-long-multi", + "params": { + "response": "{\"value\": [10, 20, 30]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-scalar.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-scalar.json new file mode 100644 index 000000000000..2deb27259554 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-scalar.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-multivalued-1", + "params": { + "response": "{\"value\": \"a single string\"}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-string.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-string.json new file mode 100644 index 000000000000..b482ef654211 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-multivalued-string.json @@ -0,0 +1,7 @@ +{ + 
"class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-multivalued-1", + "params": { + "response": "{\"value\": [\"tag1\", \"tag2\"]}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-boolean.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-boolean.json new file mode 100644 index 000000000000..caca167287a6 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-boolean.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-boolean", + "params": { + "response": "{\"value\": true}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-date.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-date.json new file mode 100644 index 000000000000..b98eb53cf506 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-date.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-date", + "params": { + "response": "{\"value\": \"2024-01-15T00:00:00Z\"}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-double.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-double.json new file mode 100644 index 000000000000..5301937628f7 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-double.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-double", + "params": { + "response": "{\"value\": 2.5}" + } +} \ No 
newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-float.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-float.json new file mode 100644 index 000000000000..8f0c63512a35 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-float.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-float", + "params": { + "response": "{\"value\": 1.5}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-int.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-int.json new file mode 100644 index 000000000000..664d846e1260 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-int.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-int", + "params": { + "response": "{\"value\": 7}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-long.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-long.json new file mode 100644 index 000000000000..6d58cab102fa --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-single-long.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-long", + "params": { + "response": "{\"value\": 42}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-unsupported.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-unsupported.json new file 
mode 100644 index 000000000000..5f3404982b90 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model-unsupported.json @@ -0,0 +1,8 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-1", + "params": { + "response": "enriched content", + "unsupported": 10 + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model.json new file mode 100644 index 000000000000..169cbc710450 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/dummy-chat-model.json @@ -0,0 +1,7 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.DummyChatModel", + "name": "dummy-chat-1", + "params": { + "response": "{\"value\": \"enriched content\"}" + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/exception-throwing-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/exception-throwing-chat-model.json new file mode 100644 index 000000000000..3fad70744ff5 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/exception-throwing-chat-model.json @@ -0,0 +1,6 @@ +{ + "class": "org.apache.solr.languagemodels.documentenrichment.model.ExceptionThrowingChatModel", + "name": "exception-throwing-chat-model", + "params": { + } +} diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/gemini-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/gemini-chat-model.json new file mode 100644 index 000000000000..0ac0a612daa2 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/gemini-chat-model.json @@ -0,0 +1,12 @@ +{ + "class": "dev.langchain4j.model.googleai.GoogleAiGeminiChatModel", + "name": "gemini-chat-1", + "params": { + 
"apiKey": "apiKey-gemini", + "modelName": "gemini-2.0-flash", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/mistralai-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/mistralai-chat-model.json new file mode 100644 index 000000000000..b8a130191ceb --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/mistralai-chat-model.json @@ -0,0 +1,13 @@ +{ + "class": "dev.langchain4j.model.mistralai.MistralAiChatModel", + "name": "mistralai-chat-1", + "params": { + "baseUrl": "https://api.mistral.ai/v1", + "apiKey": "apiKey-mistralAI", + "modelName": "mistral-small-latest", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/ollama-chat-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/ollama-chat-model.json new file mode 100644 index 000000000000..411a6468452b --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/ollama-chat-model.json @@ -0,0 +1,11 @@ +{ + "class": "dev.langchain4j.model.ollama.OllamaChatModel", + "name": "ollama-chat-1", + "params": { + "baseUrl": "http://localhost:11434", + "modelName": "llama3.2", + "timeout": 60, + "logRequests": true, + "logResponses": true + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/modelChatExamples/openai-model.json b/solr/modules/language-models/src/test-files/modelChatExamples/openai-model.json new file mode 100644 index 000000000000..74ffde65e3b6 --- /dev/null +++ b/solr/modules/language-models/src/test-files/modelChatExamples/openai-model.json @@ -0,0 +1,13 @@ +{ + "class": "dev.langchain4j.model.openai.OpenAiChatModel", + "name": "openai-1", + "params": { + "baseUrl": 
"https://api.openai.com/v1", + "apiKey": "apiKey-openAI", + "modelName": "gpt-5.4-nano", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} diff --git a/solr/modules/language-models/src/test-files/modelExamples/cohere-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/cohere-model.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/cohere-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/cohere-model.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/dummy-model-ambiguous.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model-ambiguous.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/dummy-model-ambiguous.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model-ambiguous.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/dummy-model-unsupported.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model-unsupported.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/dummy-model-unsupported.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model-unsupported.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/dummy-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/dummy-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/dummy-model.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/exception-throwing-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/exception-throwing-model.json similarity index 100% 
rename from solr/modules/language-models/src/test-files/modelExamples/exception-throwing-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/exception-throwing-model.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/huggingface-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/huggingface-model.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/huggingface-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/huggingface-model.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/mistralai-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/mistralai-model.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/mistralai-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/mistralai-model.json diff --git a/solr/modules/language-models/src/test-files/modelExamples/openai-model.json b/solr/modules/language-models/src/test-files/modelEmbeddingExamples/openai-model.json similarity index 100% rename from solr/modules/language-models/src/test-files/modelExamples/openai-model.json rename to solr/modules/language-models/src/test-files/modelEmbeddingExamples/openai-model.json diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-multi-field.txt b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-multi-field.txt new file mode 100644 index 000000000000..65c2f125e36c --- /dev/null +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-multi-field.txt @@ -0,0 +1 @@ +Title: {string_field}. Body: {body_field}. 
\ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-no-placeholder.txt b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-no-placeholder.txt new file mode 100644 index 000000000000..c43c5399dc07 --- /dev/null +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt-no-placeholder.txt @@ -0,0 +1 @@ +Summarize this content without the placeholder. \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt.txt b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt.txt new file mode 100644 index 000000000000..502449a5cf5d --- /dev/null +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/prompt.txt @@ -0,0 +1 @@ +Summarize this content: {string_field} \ No newline at end of file diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml index ef93fbc057dd..a7d329e1a88f 100644 --- a/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/schema-language-models.xml @@ -25,7 +25,11 @@ - + + + + + @@ -36,11 +40,31 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment-update-request-processor-only.xml b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment-update-request-processor-only.xml new file mode 100644 index 000000000000..7aa85a8b362a --- /dev/null +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment-update-request-processor-only.xml @@ -0,0 +1,62 @@ + + + + + ${tests.luceneMatchVersion:LATEST} + ${solr.data.dir:} + + + + + + + + + + 
+ + + + + 15000 + false + + + 1000 + + + ${solr.data.dir:} + + + + + + explicit + json + true + id + + + + + + string_field + enriched_field + Summarize this content: {string_field} + dummy-chat-1 + + + + + diff --git a/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment.xml b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment.xml new file mode 100644 index 000000000000..f9b82c153d9e --- /dev/null +++ b/solr/modules/language-models/src/test-files/solr/collection1/conf/solrconfig-document-enrichment.xml @@ -0,0 +1,235 @@ + + + + + ${tests.luceneMatchVersion:LATEST} + ${solr.data.dir:} + + + + + + + + + + + + + + + 15000 + false + + + 1000 + + + ${solr.data.dir:} + + + + + + explicit + json + true + id + + + + + + string_field + enriched_field + Summarize this content: {string_field} + dummy-chat-1 + + + + + + + string_field + enriched_field + Summarize this content: {string_field} + exception-throwing-chat-model + + + + + + + + string_field + enriched_field + Summarize this content: {string_field} + dummy-chat-1 + + + + + + + string_field + body_field + enriched_field + Title: {string_field}. Body: {body_field}. + dummy-chat-1 + + + + + + + string_field + enriched_field_multi + Extract tags from: {string_field} + dummy-chat-multivalued-1 + + + + + + + string_field + body_field + enriched_field + Title: {string_field}. Body: {body_field}. 
+ exception-throwing-chat-model + + + + + + + string_field + output_long + Extract a number from: {string_field} + dummy-long + + + + + + + string_field + output_int + Extract a number from: {string_field} + dummy-int + + + + + + + string_field + output_float + Extract a number from: {string_field} + dummy-float + + + + + + + string_field + output_double + Extract a number from: {string_field} + dummy-double + + + + + + + string_field + output_boolean + Is this true or false: {string_field} + dummy-boolean + + + + + + + string_field + output_date + Extract a date from: {string_field} + dummy-date + + + + + + + string_field + output_long_multi + Extract numbers from: {string_field} + dummy-long-multi + + + + + + + string_field + output_int_multi + Extract numbers from: {string_field} + dummy-int-multi + + + + + + + string_field + output_float_multi + Extract numbers from: {string_field} + dummy-float-multi + + + + + + + string_field + output_double_multi + Extract numbers from: {string_field} + dummy-double-multi + + + + + + + string_field + output_boolean_multi + Extract boolean values from: {string_field} + dummy-boolean-multi + + + + + + + string_field + output_date_multi + Extract dates from: {string_field} + dummy-date-multi + + + + + \ No newline at end of file diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/TestLanguageModelBase.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/TestLanguageModelBase.java index aaf3143e3513..d7a4ac9b8c96 100644 --- a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/TestLanguageModelBase.java +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/TestLanguageModelBase.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.commons.io.file.PathUtils; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.languagemodels.documentenrichment.store.rest.ManagedChatModelStore; import 
org.apache.solr.languagemodels.textvectorisation.store.rest.ManagedTextToVectorModelStore; import org.apache.solr.util.RestTestBase; import org.slf4j.Logger; @@ -38,11 +39,13 @@ public class TestLanguageModelBase extends RestTestBase { protected static Path tmpSolrHome; protected static Path tmpConfDir; - public static final String MODEL_FILE_NAME = "_schema_text-to-vector-model-store.json"; + public static final String EMBEDDING_MODEL_FILE_NAME = "_schema_text-to-vector-model-store.json"; + public static final String CHAT_MODEL_FILE_NAME = "_schema_chat-model-store.json"; protected static final String COLLECTION = "collection1"; protected static final String CONF_DIR = COLLECTION + "/conf"; protected static Path embeddingModelStoreFile = null; + protected static Path chatModelStoreFile = null; protected static String IDField = "id"; protected static String vectorField = "vector"; @@ -61,17 +64,26 @@ protected static void initFolders(boolean isPersistent) throws Exception { tmpSolrHome = createTempDir(); tmpConfDir = tmpSolrHome.resolve(CONF_DIR); PathUtils.copyDirectory(TEST_PATH(), tmpSolrHome.toAbsolutePath()); - final Path modelStore = tmpConfDir.resolve(MODEL_FILE_NAME); + final Path embeddingStore = tmpConfDir.resolve(EMBEDDING_MODEL_FILE_NAME); + final Path chatStore = tmpConfDir.resolve(CHAT_MODEL_FILE_NAME); if (isPersistent) { - embeddingModelStoreFile = modelStore; + embeddingModelStoreFile = embeddingStore; + chatModelStoreFile = chatStore; } - if (Files.exists(modelStore)) { + if (Files.exists(embeddingStore)) { if (log.isInfoEnabled()) { - log.info("remove model store config file in {}", modelStore.toAbsolutePath()); + log.info("remove model store config file in {}", embeddingStore.toAbsolutePath()); } - Files.delete(modelStore); + Files.delete(embeddingStore); + } + + if (Files.exists(chatStore)) { + if (log.isInfoEnabled()) { + log.info("remove chat model store config file in {}", chatStore.toAbsolutePath()); + } + Files.delete(chatStore); } 
System.setProperty("managed.schema.mutable", "true"); @@ -90,7 +102,7 @@ protected static void afterTest() throws Exception { } public static void loadModel(String fileName, String status) throws Exception { - final URL url = TestLanguageModelBase.class.getResource("/modelExamples/" + fileName); + final URL url = TestLanguageModelBase.class.getResource("/modelEmbeddingExamples/" + fileName); final String multipleModels = Files.readString(Path.of(url.toURI()), StandardCharsets.UTF_8); assertJPut( @@ -100,13 +112,29 @@ public static void loadModel(String fileName, String status) throws Exception { } public static void loadModel(String fileName) throws Exception { - final URL url = TestLanguageModelBase.class.getResource("/modelExamples/" + fileName); + final URL url = TestLanguageModelBase.class.getResource("/modelEmbeddingExamples/" + fileName); final String multipleModels = Files.readString(Path.of(url.toURI()), StandardCharsets.UTF_8); assertJPut( ManagedTextToVectorModelStore.REST_END_POINT, multipleModels, "/responseHeader/status==0"); } + public static void loadChatModel(String fileName, String status) throws Exception { + final URL url = TestLanguageModelBase.class.getResource("/modelChatExamples/" + fileName); + final String model = Files.readString(Path.of(url.toURI()), StandardCharsets.UTF_8); + + assertJPut( + ManagedChatModelStore.REST_END_POINT, model, "/responseHeader/status==" + status); + } + + public static void loadChatModel(String fileName) throws Exception { + final URL url = TestLanguageModelBase.class.getResource("/modelChatExamples/" + fileName); + final String model = Files.readString(Path.of(url.toURI()), StandardCharsets.UTF_8); + + assertJPut( + ManagedChatModelStore.REST_END_POINT, model, "/responseHeader/status==0"); + } + protected static void prepareIndex() throws Exception { List docsToIndex = prepareDocs(); for (SolrInputDocument doc : docsToIndex) { diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModel.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModel.java new file mode 100644 index 000000000000..42987b1d69ce --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModel.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.model; + +import dev.langchain4j.data.message.AiMessage; +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.chat.request.ChatRequest; +import dev.langchain4j.model.chat.response.ChatResponse; + +/** + * A deterministic {@link ChatModel} for testing. It returns a fixed response string regardless of + * the input, allowing tests to assert exact enriched-field values without real API calls. + * + *

The builder also exposes {@code unsupported} and {@code ambiguous} setter methods to exercise + * the reflection-based parameter handling in {@link + * org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel#getInstance}. + */ +public class DummyChatModel implements ChatModel { + + private final String response; + + public DummyChatModel(String response) { + this.response = response; + } + + @Override + public ChatResponse chat(ChatRequest chatRequest) { + return ChatResponse.builder().aiMessage(AiMessage.from(response)).build(); + } + + public static DummyChatModelBuilder builder() { + return new DummyChatModelBuilder(); + } + + public static class DummyChatModelBuilder { + private String response = "dummy response"; + private int intValue; + + public DummyChatModelBuilder() {} + + public DummyChatModelBuilder response(String response) { + this.response = response; + return this; + } + + /** Intentionally has no String overload so the reflection code raises a BAD_REQUEST error. */ + public DummyChatModelBuilder unsupported(Integer input) { + return this; + } + + /** Two overloads make this param "ambiguous": the reflection code should default to String. 
*/ + public DummyChatModelBuilder ambiguous(int input) { + this.intValue = input; + return this; + } + + public DummyChatModelBuilder ambiguous(String input) { + this.intValue = Integer.valueOf(input); + return this; + } + + public DummyChatModel build() { + return new DummyChatModel(this.response); + } + } +} diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModelTest.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModelTest.java new file mode 100644 index 000000000000..6449b7b2f55c --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/DummyChatModelTest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.languagemodels.documentenrichment.model; + +import dev.langchain4j.data.message.UserMessage; +import dev.langchain4j.model.chat.request.ChatRequest; +import org.apache.solr.SolrTestCase; +import org.junit.Test; + +public class DummyChatModelTest extends SolrTestCase { + + @Test + public void constructAndChat() throws Exception { + assertEquals( + "hello world", + new DummyChatModel("hello world") + .chat(ChatRequest.builder().messages(UserMessage.from("any input")).build()) + .aiMessage() + .text()); + assertEquals( + "fixed response", + new DummyChatModel("fixed response") + .chat(ChatRequest.builder().messages(UserMessage.from("another input")).build()) + .aiMessage() + .text()); + assertEquals( + "dummy response", + DummyChatModel.builder() + .build() + .chat(ChatRequest.builder().messages(UserMessage.from("default")).build()) + .aiMessage() + .text()); + } +} \ No newline at end of file diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/ExceptionThrowingChatModel.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/ExceptionThrowingChatModel.java new file mode 100644 index 000000000000..e5eda8d493f1 --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/model/ExceptionThrowingChatModel.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.model; + +import dev.langchain4j.model.chat.ChatModel; +import dev.langchain4j.model.chat.request.ChatRequest; +import dev.langchain4j.model.chat.response.ChatResponse; + +/** + * A {@link ChatModel} that always throws a {@link RuntimeException}. Used to verify that {@link + * org.apache.solr.languagemodels.documentenrichment.update.processor.DocumentEnrichmentUpdateProcessor} + * handles chat-model failures gracefully (logs the error and continues indexing without the + * enriched field). + */ +public class ExceptionThrowingChatModel implements ChatModel { + + @Override + public ChatResponse chat(ChatRequest chatRequest) { + throw new RuntimeException("Failed to enrich"); + } + + public static ExceptionThrowingChatModelBuilder builder() { + return new ExceptionThrowingChatModelBuilder(); + } + + public static class ExceptionThrowingChatModelBuilder { + + public ExceptionThrowingChatModelBuilder() {} + + public ExceptionThrowingChatModel build() { + return new ExceptionThrowingChatModel(); + } + } +} diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManager.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManager.java new file mode 100644 index 000000000000..25880eecbcd6 --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManager.java @@ -0,0 +1,239 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.store.rest; + +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.languagemodels.TestLanguageModelBase; +import org.apache.solr.languagemodels.documentenrichment.update.processor.DocumentEnrichmentUpdateProcessorFactory; +import org.apache.solr.rest.ManagedResource; +import org.apache.solr.rest.ManagedResourceStorage; +import org.apache.solr.rest.RestManager; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestChatModelManager extends TestLanguageModelBase { + + @BeforeClass + public static void init() throws Exception { + setupTest("solrconfig-document-enrichment.xml", "schema-language-models.xml", false, false); + } + + @AfterClass + public static void cleanup() throws Exception { + afterTest(); + } + + @Test + public void test() throws Exception { + final SolrResourceLoader loader = new SolrResourceLoader(tmpSolrHome); + + final RestManager.Registry registry = loader.getManagedResourceRegistry(); + assertNotNull( + "Expected a non-null RestManager.Registry from the SolrResourceLoader!", registry); + + 
final String resourceId = "/schema/mstore1"; + registry.registerManagedResource( + resourceId, ManagedChatModelStore.class, new DocumentEnrichmentUpdateProcessorFactory()); + + final NamedList initArgs = new NamedList<>(); + + final RestManager restManager = new RestManager(); + restManager.init(loader, initArgs, new ManagedResourceStorage.InMemoryStorageIO()); + + final ManagedResource res = restManager.getManagedResource(resourceId); + assertTrue(res instanceof ManagedChatModelStore); + assertEquals(res.getResourceId(), resourceId); + } + + @Test + public void testRestManagerEndpoints() throws Exception { + assertJQ("/schema/managed", "/responseHeader/status==0"); + + final String openAiClassName = "dev.langchain4j.model.openai.OpenAiChatModel"; + + // success + String model = + "{ name:\"testChatModel2\", class:\"" + + openAiClassName + + "\"," + + "params:{" + + "baseUrl:\"https://api.openai.com/v1\"," + + "apiKey:\"testApiKey2\"," + + "modelName:\"gpt-4o-mini\"," + + "logRequests:true," + + "logResponses:false" + + "}}"; + assertJPut(ManagedChatModelStore.REST_END_POINT, model, "/responseHeader/status==0"); + + // success — multiple models in one PUT + final String multipleModels = + "[{ name:\"testChatModel3\", class:\"" + + openAiClassName + + "\"," + + "params:{baseUrl:\"https://api.openai.com/v1\"," + + "apiKey:\"testApiKey3\"," + + "modelName:\"gpt-4o-mini\"," + + "logRequests:true," + + "logResponses:false" + + "}}\n" + + ",{ name:\"testChatModel4\", class:\"" + + openAiClassName + + "\"," + + "params:{baseUrl:\"https://api.openai.com/v1\"," + + "apiKey:\"testApiKey4\"," + + "modelName:\"gpt-4o-mini\"," + + "logRequests:true," + + "logResponses:false" + + "}}]"; + assertJPut(ManagedChatModelStore.REST_END_POINT, multipleModels, "/responseHeader/status==0"); + + final String qryResult = JQ(ManagedChatModelStore.REST_END_POINT); + assertTrue( + qryResult.contains("\"name\":\"testChatModel2\"") + && qryResult.contains("\"name\":\"testChatModel3\"") + && 
qryResult.contains("\"name\":\"testChatModel4\"")); + + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='testChatModel2'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[1]/name=='testChatModel3'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[2]/name=='testChatModel4'"); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/testChatModel2"); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/testChatModel3"); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/testChatModel4"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models==[]"); + } + + @Test + public void loadChatModel_openAi_shouldLoadModelConfig() throws Exception { + loadChatModel("openai-model.json"); + + final String modelName = "openai-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='" + modelName + "'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/baseUrl=='https://api.openai.com/v1'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/apiKey=='apiKey-openAI'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/modelName=='gpt-5.4-nano'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/timeout==60"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logRequests==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logResponses==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/maxRetries==5"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } + + @Test + public void loadChatModel_mistralAi_shouldLoadModelConfig() throws Exception { + loadChatModel("mistralai-chat-model.json"); + + final String modelName = "mistralai-chat-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='" + modelName + "'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + 
"/models/[0]/params/baseUrl=='https://api.mistral.ai/v1'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/apiKey=='apiKey-mistralAI'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/modelName=='mistral-small-latest'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/timeout==60"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logRequests==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logResponses==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/maxRetries==5"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } + + @Test + public void loadChatModel_anthropic_shouldLoadModelConfig() throws Exception { + loadChatModel("anthropic-chat-model.json"); + + final String modelName = "anthropic-chat-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='" + modelName + "'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/baseUrl=='https://api.anthropic.com/v1'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/apiKey=='apiKey-anthropic'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/modelName=='claude-3-5-haiku-latest'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/timeout==60"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logRequests==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logResponses==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/maxRetries==5"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } + + @Test + public void loadChatModel_ollama_shouldLoadModelConfig() throws Exception { + loadChatModel("ollama-chat-model.json"); + + final String modelName = "ollama-chat-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, 
"/models/[0]/name=='" + modelName + "'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/baseUrl=='http://localhost:11434'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/modelName=='llama3.2'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/timeout==60"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logRequests==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logResponses==true"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } + + @Test + public void loadChatModel_gemini_shouldLoadModelConfig() throws Exception { + loadChatModel("gemini-chat-model.json"); + + final String modelName = "gemini-chat-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='" + modelName + "'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/apiKey=='apiKey-gemini'"); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/models/[0]/params/modelName=='gemini-2.0-flash'"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/timeout==60"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logRequests==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/logResponses==true"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/maxRetries==5"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } + + @Test + public void loadChatModel_dummyUnsupportedParam_shouldRaiseError() throws Exception { + loadChatModel("dummy-chat-model-unsupported.json", "400"); + } + + @Test + public void loadChatModel_dummyAmbiguousParam_shouldDefaultToString() throws Exception { + loadChatModel("dummy-chat-model-ambiguous.json"); + + final String modelName = "dummy-chat-1"; + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/name=='" + modelName + "'"); + 
assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/params/ambiguous==10"); + + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + modelName); + } +} diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManagerPersistence.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManagerPersistence.java new file mode 100644 index 000000000000..654c98556ab4 --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/store/rest/TestChatModelManagerPersistence.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.languagemodels.documentenrichment.store.rest;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import org.apache.solr.common.util.Utils;
import org.apache.solr.languagemodels.TestLanguageModelBase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Verifies that chat models stored through {@link ManagedChatModelStore} are written to disk in
 * compact JSON form and are still present (or still absent, after deletion) once the core has
 * been reloaded and Jetty has been restarted. Each test gets a freshly set-up persistent store.
 */
public class TestChatModelManagerPersistence extends TestLanguageModelBase {

  private static final String OPENAI_MODEL_FILE = "openai-model.json";
  private static final String OPENAI_MODEL_NAME = "openai-1";

  @Before
  public void init() throws Exception {
    setupTest("solrconfig-document-enrichment.xml", "schema-language-models.xml", false, true);
  }

  @After
  public void cleanup() throws Exception {
    afterTest();
  }

  /** The store file must equal its own re-serialisation with indentation disabled. */
  @Test
  public void testModelAreStoredCompact() throws Exception {
    loadChatModel(OPENAI_MODEL_FILE);

    final String storedJson = Files.readString(chatModelStoreFile, StandardCharsets.UTF_8);
    final Object parsed = Utils.fromJSONString(storedJson);
    final String compactJson = new String(Utils.toJSON(parsed, -1), UTF_8);
    assertEquals(compactJson, storedJson);
  }

  @Test
  public void testModelStorePersistence() throws Exception {
    // check store is empty at start
    assertStoreIsEmpty();

    // load a model
    loadChatModel(OPENAI_MODEL_FILE);
    assertFirstModel(
        "name=='" + OPENAI_MODEL_NAME + "'",
        "params/baseUrl=='https://api.openai.com/v1'",
        "params/apiKey=='apiKey-openAI'",
        "params/modelName=='gpt-5.4-nano'",
        "params/timeout==60",
        "params/logRequests==true",
        "params/logResponses==true",
        "params/maxRetries==5");

    // check persistence after reload
    restTestHarness.reload();
    assertFirstModel(
        "name=='" + OPENAI_MODEL_NAME + "'",
        "params/baseUrl=='https://api.openai.com/v1'",
        "params/apiKey=='apiKey-openAI'",
        "params/modelName=='gpt-5.4-nano'");

    // check persistence after restart
    restartJetty();
    assertFirstModel("name=='" + OPENAI_MODEL_NAME + "'", "params/modelName=='gpt-5.4-nano'");

    // delete model and verify persistence of the empty state
    restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + OPENAI_MODEL_NAME);
    assertStoreIsEmpty();

    restTestHarness.reload();
    assertStoreIsEmpty();

    restartJetty();
    assertStoreIsEmpty();
  }

  /** Checks each condition (a path relative to {@code /models/[0]/}) with its own request. */
  private void assertFirstModel(String... conditions) throws Exception {
    for (String condition : conditions) {
      assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/[0]/" + condition);
    }
  }

  /** Asserts that the store endpoint reports no models. */
  private void assertStoreIsEmpty() throws Exception {
    assertJQ(ManagedChatModelStore.REST_END_POINT, "/models/==[]");
  }

  /** Stops and then starts the Jetty instance used by the test. */
  private void restartJetty() throws Exception {
    getJetty().stop();
    getJetty().start();
  }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.store.rest; + +import org.apache.solr.languagemodels.TestLanguageModelBase; +import org.junit.After; +import org.junit.Test; + +public class TestManagedChatModelStoreInitialization extends TestLanguageModelBase { + + @After + public void cleanUp() throws Exception { + afterTest(); + } + + @Test + public void managedChatModelStore_whenUpdateRequestComponentConfigured_shouldBeInitialized() + throws Exception { + setupTest( + "solrconfig-document-enrichment-update-request-processor-only.xml", + "schema-language-models.xml", + false, + false); + + assertJQ(ManagedChatModelStore.REST_END_POINT, "/responseHeader/status==0"); + assertJQ(ManagedChatModelStore.REST_END_POINT, "/models==[]"); + } + + @Test + public void managedChatModelStore_whenNoComponents_shouldNotBeInitialized() throws Exception { + setupTest( + "solrconfig-language-models-no-components.xml", "schema-language-models.xml", false, false); + assertJQ( + ManagedChatModelStore.REST_END_POINT, + "/responseHeader/status==400", + "/error/msg=='No REST managed resource registered for path " + + ManagedChatModelStore.REST_END_POINT + + "'"); + } +} \ No newline at end of file diff --git 
a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactoryTest.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactoryTest.java new file mode 100644 index 000000000000..e92bded3c75e --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorFactoryTest.java @@ -0,0 +1,422 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.languagemodels.documentenrichment.update.processor; + +import java.util.List; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.languagemodels.TestLanguageModelBase; +import org.apache.solr.languagemodels.documentenrichment.model.SolrChatModel; +import org.apache.solr.languagemodels.documentenrichment.store.rest.ManagedChatModelStore; +import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +public class DocumentEnrichmentUpdateProcessorFactoryTest extends TestLanguageModelBase { + + @BeforeClass + public static void init() throws Exception { + setupTest("solrconfig-document-enrichment.xml", "schema-language-models.xml", false, false); + } + + @AfterClass + public static void cleanup() throws Exception { + afterTest(); + } + + SolrCore collection1; + + @Before + public void setup() { + collection1 = solrTestRule.getCoreContainer().getCore("collection1"); + } + + @After + public void after() { + collection1.close(); + } + + @Test + public void init_fullArgs_shouldInitAllParams() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + + assertEquals(List.of("string_field"), factory.getInputFields()); + assertEquals("enriched_field", factory.getOutputField()); + assertEquals("Summarize: {string_field}", factory.getPrompt()); + assertEquals("model1", factory.getModelName()); + } + + @Test + public void 
init_multipleInputFields_shouldInitAllFields() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("inputField", "body_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Title: {string_field}. Body: {body_field}."); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + + assertEquals(List.of("string_field", "body_field"), factory.getInputFields()); + } + + @Test + public void init_noInputField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("At least one 'inputField' must be provided", e.getMessage()); + } + + @Test + public void init_nullOutputField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("Missing required parameter: outputField", e.getMessage()); + } + + @Test + public void init_neitherPromptNorPromptFile_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> 
factory.init(args)); + assertEquals("Either 'prompt' or 'promptFile' must be provided", e.getMessage()); + } + + @Test + public void init_bothPromptAndPromptFile_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize: {string_field}"); + args.add("promptFile", "prompt.txt"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("Only one of 'prompt' or 'promptFile' can be provided, not both", e.getMessage()); + } + + @Test + public void init_promptMissingPlaceholderForDeclaredField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize:"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("prompt is missing placeholders for inputField(s): [string_field]", e.getMessage()); + } + + @Test + public void init_promptMissingOnePlaceholderOfMultipleFields_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("inputField", "body_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Title: {string_field}."); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("prompt is missing placeholders for inputField(s): [body_field]", e.getMessage()); + } + + @Test + 
public void init_promptHasExtraPlaceholderNotDeclaredAsInputField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Title: {string_field}. Extra: {unknown_field}."); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals( + "prompt contains placeholders not declared as inputField(s): [unknown_field]", + e.getMessage()); + } + + @Test + public void init_nullModel_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize: {string_field}"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + + SolrException e = assertThrows(SolrException.class, () -> factory.init(args)); + assertEquals("Missing required parameter: model", e.getMessage()); + } + + @Test + public void init_promptFile_shouldLoadPromptFromFile() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("promptFile", "prompt.txt"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + factory.inform(collection1); + + assertEquals("prompt.txt", factory.getPromptFile()); + assertNotNull(factory.getPrompt()); + assertTrue(factory.getPrompt().contains("{string_field}")); + } + + @Test + public void init_promptFileMultiField_shouldLoadAndValidateBothPlaceholders() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("inputField", "body_field"); + args.add("outputField", 
"enriched_field"); + args.add("promptFile", "prompt-multi-field.txt"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + factory.inform(collection1); + + assertNotNull(factory.getPrompt()); + assertTrue(factory.getPrompt().contains("{string_field}")); + assertTrue(factory.getPrompt().contains("{body_field}")); + } + + @Test + public void init_promptFileWithMissingPlaceholder_shouldThrowExceptionInInform() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field"); + args.add("promptFile", "prompt-no-placeholder.txt"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + + SolrException e = assertThrows(SolrException.class, () -> factory.inform(collection1)); + assertEquals( + "prompt is missing placeholders for inputField(s): [string_field]", e.getMessage()); + } + + /* Following tests depend on a real solr schema and depend on BeforeClass-AfterClass methods */ + + @Test + public void init_notExistentOutputField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "notExistentOutput"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + factory.init(args); + + SolrException e = assertThrows(SolrException.class, () -> factory.getInstance(req, null, null)); + assertEquals("undefined field: \"notExistentOutput\"", e.getMessage()); + } + + @Test + public void init_notTextualOutputField_shouldThrowExceptionWithDetailedMessage() { + 
// vector is a DenseVectorField — not a textual field + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("outputField", "vector"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + factory.init(args); + + SolrException e = assertThrows(SolrException.class, () -> factory.getInstance(req, null, null)); + assertEquals( + "field type is not supported by Document Enrichment: DenseVectorField", e.getMessage()); + } + + @Test + public void init_notExistentInputField_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "notExistentInput"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Summarize: {notExistentInput}"); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + factory.init(args); + + SolrException e = assertThrows(SolrException.class, () -> factory.getInstance(req, null, null)); + assertEquals("undefined field: \"notExistentInput\"", e.getMessage()); + } + + @Test + public void init_multipleInputFields_oneNotExistent_shouldThrowExceptionWithDetailedMessage() { + NamedList args = new NamedList<>(); + args.add("inputField", "string_field"); + args.add("inputField", "notExistentInput"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Title: {string_field}. 
Body: {notExistentInput}."); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + factory.init(args); + + SolrException e = assertThrows(SolrException.class, () -> factory.getInstance(req, null, null)); + assertEquals("undefined field: \"notExistentInput\"", e.getMessage()); + } + + @Test + public void init_multivaluedStringOutputField_shouldNotThrowException() throws Exception { + UpdateRequestProcessor instance = + createUpdateProcessor("string_field", "enriched_field_multi", collection1, "model-mv"); + assertNotNull(instance); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/model-mv"); + } + + @Test + public void init_multivaluedStringOutputField_buildResponseFormat_shouldProduceArraySchema() throws Exception { + NamedList args = new NamedList<>(); + ManagedChatModelStore.getManagedModelStore(collection1) + .addModel(new SolrChatModel("model-rf", null, null)); + args.add("inputField", "string_field"); + args.add("outputField", "enriched_field_multi"); + args.add("prompt", "Summarize: {string_field}"); + args.add("model", "model-rf"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + factory.init(args); + ModifiableSolrParams params = new ModifiableSolrParams(); + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + assertNotNull(factory.getInstance(req, null, null)); + + // verify the ResponseFormat is constructed correctly for the multivalued field + var schema = collection1.getLatestSchema(); + var schemaField = schema.getField("enriched_field_multi"); + assertTrue(schemaField.multiValued()); + var responseFormat = DocumentEnrichmentUpdateProcessorFactory.buildResponseFormat(schemaField); + assertNotNull(responseFormat); + assertEquals( + 
dev.langchain4j.model.chat.request.ResponseFormatType.JSON, responseFormat.type()); + assertNotNull(responseFormat.jsonSchema()); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/model-rf"); + } + + @Test + public void init_singleValuedStringOutputField_buildResponseFormat_shouldProduceStringSchema() { + var schema = collection1.getLatestSchema(); + var schemaField = schema.getField("enriched_field"); + assertFalse(schemaField.multiValued()); + var responseFormat = DocumentEnrichmentUpdateProcessorFactory.buildResponseFormat(schemaField); + assertNotNull(responseFormat); + assertEquals( + dev.langchain4j.model.chat.request.ResponseFormatType.JSON, responseFormat.type()); + assertNotNull(responseFormat.jsonSchema()); + } + + @Test + public void init_dynamicInputField_shouldNotThrowException() throws Exception{ + UpdateRequestProcessor instance = + createUpdateProcessor("text_s", "enriched_field", collection1, "model2"); + assertNotNull(instance); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/model2"); + } + + @Test + public void init_multipleDynamicInputFields_shouldNotThrowException() throws Exception{ + NamedList args = new NamedList<>(); + ManagedChatModelStore.getManagedModelStore(collection1) + .addModel(new SolrChatModel("model1", null, null)); + args.add("inputField", "text_s"); + args.add("inputField", "body_field"); + args.add("outputField", "enriched_field"); + args.add("prompt", "Title: {text_s}. 
Body: {body_field}."); + args.add("model", "model1"); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + factory.init(args); + + SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {}; + assertNotNull(factory.getInstance(req, null, null)); + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/model1"); + } + + private UpdateRequestProcessor createUpdateProcessor( + String inputFieldName, String outputFieldName, SolrCore core, String modelName) { + NamedList args = new NamedList<>(); + + ManagedChatModelStore.getManagedModelStore(core) + .addModel(new SolrChatModel(modelName, null, null)); + args.add("inputField", inputFieldName); + args.add("outputField", outputFieldName); + args.add("prompt", "Summarize: {" + inputFieldName + "}"); + args.add("model", modelName); + + DocumentEnrichmentUpdateProcessorFactory factory = new DocumentEnrichmentUpdateProcessorFactory(); + ModifiableSolrParams params = new ModifiableSolrParams(); + factory.init(args); + + SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}; + + return factory.getInstance(req, null, null); + } +} diff --git a/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorTest.java b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorTest.java new file mode 100644 index 000000000000..048e073da9f0 --- /dev/null +++ b/solr/modules/language-models/src/test/org/apache/solr/languagemodels/documentenrichment/update/processor/DocumentEnrichmentUpdateProcessorTest.java @@ -0,0 +1,718 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.languagemodels.documentenrichment.update.processor; + +import java.io.IOException; +import java.util.Map; +import org.apache.solr.client.solrj.RemoteSolrException; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.request.SolrQuery; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.languagemodels.TestLanguageModelBase; +import org.apache.solr.languagemodels.documentenrichment.store.rest.ManagedChatModelStore; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class DocumentEnrichmentUpdateProcessorTest extends TestLanguageModelBase { + + @BeforeClass + public static void init() throws Exception { + setupTest("solrconfig-document-enrichment.xml", "schema-language-models.xml", false, false); + } + + @AfterClass + public static void cleanup() throws Exception { + afterTest(); + } + + private String loadedModelId; + + @After + public void afterEachTest() throws Exception { + if (loadedModelId != null) { + restTestHarness.delete(ManagedChatModelStore.REST_END_POINT + "/" + loadedModelId); + loadedModelId = null; + } + } + + private void loadTestChatModel(String 
fileName, String modelId) throws Exception { + loadChatModel(fileName); + loadedModelId = modelId; + } + + @Test + public void processAdd_inputField_shouldEnrichInputField() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), "documentEnrichment"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field=='enriched content'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field=='enriched content'"); + } + + /* + This test looks for the 'dummy-chat-1' model, but such model is not loaded — + the model store is empty, so the update fails. + */ + @Test + public void processAdd_modelNotFound_shouldThrowException() { + RuntimeException thrown = + assertThrows( + "model not found should throw an exception", + RemoteSolrException.class, + () -> + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), + "documentEnrichment")); + assertTrue( + thrown + .getMessage() + .contains( + "The model configured in the Update Request Processor 'dummy-chat-1' can't be found in the store: /schema/chat-model-store")); + } + + @Test + public void processAdd_emptyInputField_shouldLogAndIndexWithNoEnrichedField() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + addWithChain(sdoc("id", "99", "string_field", ""), "documentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + 
"/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", // no enriched field for doc 99 + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field=='enriched content'"); + } + + @Test + public void processAdd_nullInputField_shouldLogAndIndexWithNoEnrichedField() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + assertU(adoc("id", "98")); // no string_field + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field=='enriched content'", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); // no enriched field for doc 98 + } + + @Test + public void processAdd_failingEnrichment_shouldLogAndIndexWithNoEnrichedField() throws Exception { + loadTestChatModel("exception-throwing-chat-model.json", "exception-throwing-chat-model"); + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "failingDocumentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), "failingDocumentEnrichment"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", // no enriched field for doc 99 + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); // no enriched field for doc 98 + } + + @Test + public void processAtomicUpdate_shouldTriggerEnrichmentAndFetchTheStoredContent() + throws Exception { + // Verifies that when using a processor chain configured for partial updates + // (i.e., DistributedUpdateProcessorFactory before 
DocumentEnrichmentUpdateProcessorFactory), + // the system correctly retrieves the stored value of string_field and generates the + // enriched content for the document. + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + assertU(adoc("id", "99", "string_field", "Vegeta is the saiyan prince.")); + assertU(adoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth.")); + assertU(commit()); + + SolrInputDocument atomicDoc = new SolrInputDocument(); + atomicDoc.setField("id", "99"); + atomicDoc.setField("enriched", Map.of("set", true)); + addWithChain(atomicDoc, "documentEnrichmentForPartialUpdates"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field=='enriched content'", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field==" // no enriched field for document 98 + ); + } + + @Test + public void processAtomicUpdate_shouldReplaceExistingEnrichedFieldNotAppend() throws Exception { + // Verifies that when a document already contains an enriched_field and string_field is + // modified via atomic update, the enriched content is recomputed and replaces the previous + // value rather than being appended. 
+ loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), "documentEnrichment"); + assertU(commit()); + + SolrInputDocument atomicDoc = new SolrInputDocument(); + atomicDoc.setField("id", "99"); + atomicDoc.setField("string_field", Map.of("set", "Vegeta is the saiyan prince from the Dragon Ball series.")); + addWithChain(atomicDoc, "documentEnrichmentForPartialUpdates"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field=='enriched content'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field=='enriched content'"); + } + + // --- multi-field tests --- + + @Test + public void processAdd_multipleInputFields_allPresent_shouldEnrichDocument() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince.", "body_field", "He is very proud."), + "documentEnrichmentMultiField"); + addWithChain( + sdoc("id", "98", "string_field", "Kakaroth is a saiyan.", "body_field", "He grew up on Earth."), + "documentEnrichmentMultiField"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field=='enriched content'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field=='enriched content'"); + } + + @Test + public void processAdd_multipleInputFields_firstFieldNull_shouldSkipEnrichment() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + + addWithChain( + 
sdoc("id", "99", "body_field", "He is very proud."), // string_field absent + "documentEnrichmentMultiField"); + addWithChain( + sdoc("id", "98", "body_field", "He is very jealous."), // string_field absent + "documentEnrichmentMultiField"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + @Test + public void processAdd_multipleInputFields_secondFieldEmpty_shouldSkipEnrichment() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince.", "body_field", ""), + "documentEnrichmentMultiField"); + addWithChain( + sdoc("id", "98", "string_field", "Goku is the best saiyan.", "body_field", ""), + "documentEnrichmentMultiField"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + @Test + public void processAdd_multipleInputFields_bothFieldsAbsent_shouldSkipEnrichment() throws Exception { + loadTestChatModel("dummy-chat-model.json", "dummy-chat-1"); + + addWithChain(sdoc("id", "99"), "documentEnrichmentMultiField"); + addWithChain(sdoc("id", "98"), "documentEnrichmentMultiField"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + @Test + public void 
processAdd_multipleInputFields_failingModel_shouldLogAndSkipEnrichment() throws Exception { + loadTestChatModel("exception-throwing-chat-model.json", "exception-throwing-chat-model"); + + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince.", "body_field", "He is very proud."), + "failingDocumentEnrichmentMultiField"); + addWithChain( + sdoc("id", "98", "string_field", "Kakaroth is a saiyan.", "body_field", "He grew up on Earth."), + "failingDocumentEnrichmentMultiField"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + @Test + public void processAdd_multivaluedStringOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-string.json", "dummy-chat-multivalued-1"); + + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), + "documentEnrichmentMultivaluedString"); + addWithChain( + sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), + "documentEnrichmentMultivaluedString"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field_multi/[0]=='tag1'", + "/response/docs/[0]/enriched_field_multi/[1]=='tag2'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field_multi/[0]=='tag1'", + "/response/docs/[1]/enriched_field_multi/[1]=='tag2'"); + } + + @Test + public void processAdd_multivaluedStringOutputField_emptyInput_shouldSkipEnrichment() + throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-string.json", "dummy-chat-multivalued-1"); + + 
addWithChain(sdoc("id", "99", "string_field", ""), "documentEnrichmentMultivaluedString"); + addWithChain(sdoc("id", "98", "string_field", ""), "documentEnrichmentMultivaluedString"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field_multi==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field_multi=="); + } + + // --- typed single-valued output field tests --- + + @Test + public void processAdd_singleLongOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-long.json", "dummy-long"); + + addWithChain(sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleLong"); + addWithChain(sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleLong"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_long"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_long==42", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_long==42"); + } + + @Test + public void processAdd_singleIntOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-int.json", "dummy-int"); + + addWithChain(sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleInt"); + addWithChain(sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleInt"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_int"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_int==7", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_int==7"); + } + + @Test + public void 
processAdd_singleFloatOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-float.json", "dummy-float"); + + addWithChain(sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleFloat"); + addWithChain(sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleFloat"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_float"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_float==1.5", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_float==1.5"); + } + + @Test + public void processAdd_singleDoubleOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-double.json", "dummy-double"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleDouble"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleDouble"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_double"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_double==2.5", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_double==2.5"); + } + + @Test + public void processAdd_singleBooleanOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-boolean.json", "dummy-boolean"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleBoolean"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleBoolean"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_boolean"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + 
"/response/docs/[0]/output_boolean==true", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_boolean==true"); + } + + @Test + public void processAdd_singleDateOutputField_shouldPopulateValue() throws Exception { + loadTestChatModel("dummy-chat-model-single-date.json", "dummy-date"); + + addWithChain(sdoc("id", "99", "string_field", "some content"), "documentEnrichmentSingleDate"); + addWithChain(sdoc("id", "98", "string_field", "other content"), "documentEnrichmentSingleDate"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_date"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_date=='2024-01-15T00:00:00Z'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_date=='2024-01-15T00:00:00Z'"); + } + + // --- typed multivalued output field tests --- + + @Test + public void processAdd_multivaluedLongOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-long.json", "dummy-long-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedLong"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedLong"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_long_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_long_multi/[0]==10", + "/response/docs/[0]/output_long_multi/[1]==20", + "/response/docs/[0]/output_long_multi/[2]==30", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_long_multi/[0]==10", + "/response/docs/[1]/output_long_multi/[1]==20", + "/response/docs/[1]/output_long_multi/[2]==30"); + } + + @Test + public void processAdd_multivaluedIntOutputField_shouldPopulateAllValues() throws Exception { + 
loadTestChatModel("dummy-chat-model-multivalued-int.json", "dummy-int-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedInt"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedInt"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_int_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_int_multi/[0]==1", + "/response/docs/[0]/output_int_multi/[1]==2", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_int_multi/[0]==1", + "/response/docs/[1]/output_int_multi/[1]==2"); + } + + @Test + public void processAdd_multivaluedFloatOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-float.json", "dummy-float-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedFloat"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedFloat"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_float_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_float_multi/[0]==1.5", + "/response/docs/[0]/output_float_multi/[1]==2.5", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_float_multi/[0]==1.5", + "/response/docs/[1]/output_float_multi/[1]==2.5"); + } + + @Test + public void processAdd_multivaluedDoubleOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-double.json", "dummy-double-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedDouble"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedDouble"); + 
assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_double_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_double_multi/[0]==3.14", + "/response/docs/[0]/output_double_multi/[1]==2.71", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_double_multi/[0]==3.14", + "/response/docs/[1]/output_double_multi/[1]==2.71"); + } + + @Test + public void processAdd_multivaluedBooleanOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-boolean.json", "dummy-boolean-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedBoolean"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedBoolean"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_boolean_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/output_boolean_multi/[0]==true", + "/response/docs/[0]/output_boolean_multi/[1]==false", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_boolean_multi/[0]==true", + "/response/docs/[1]/output_boolean_multi/[1]==false"); + } + + @Test + public void processAdd_multivaluedDateOutputField_shouldPopulateAllValues() throws Exception { + loadTestChatModel("dummy-chat-model-multivalued-date.json", "dummy-date-multi"); + + addWithChain( + sdoc("id", "99", "string_field", "some content"), "documentEnrichmentMultivaluedDate"); + addWithChain( + sdoc("id", "98", "string_field", "other content"), "documentEnrichmentMultivaluedDate"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("output_date_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + 
"/response/docs/[0]/output_date_multi/[0]=='2024-01-15T00:00:00Z'", + "/response/docs/[0]/output_date_multi/[1]=='2025-06-30T00:00:00Z'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/output_date_multi/[0]=='2024-01-15T00:00:00Z'", + "/response/docs/[1]/output_date_multi/[1]=='2025-06-30T00:00:00Z'"); + } + + // --- LLM response contract violation tests --- + + @Test + public void processAdd_llmResponseMissingValueKey_shouldLogAndIndexWithNoEnrichedField() + throws Exception { + // Model returns valid JSON but without the required "value" key + loadTestChatModel("dummy-chat-model-missing-value-key.json", "dummy-chat-1"); + + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), "documentEnrichment"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + @Test + public void processAdd_llmResponseMalformedJson_shouldLogAndIndexWithNoEnrichedField() + throws Exception { + // Model returns a plain string that cannot be parsed as JSON + loadTestChatModel("dummy-chat-model-malformed-json.json", "dummy-chat-1"); + + addWithChain(sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), "documentEnrichment"); + addWithChain(sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), "documentEnrichment"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "!/response/docs/[0]/enriched_field==", + "/response/docs/[1]/id=='98'", + "!/response/docs/[1]/enriched_field=="); + } + + 
// --- multivalued output field / scalar response test --- + + @Test + public void processAdd_multivaluedOutputField_scalarLlmResponse_shouldStoreSingleValue() + throws Exception { + // Model returns {"value": "a single string"} for a multivalued output field. + // The scalar falls through the List check and is stored as a single-element value. + loadTestChatModel("dummy-chat-model-multivalued-scalar.json", "dummy-chat-multivalued-1"); + + addWithChain( + sdoc("id", "99", "string_field", "Vegeta is the saiyan prince."), + "documentEnrichmentMultivaluedString"); + addWithChain( + sdoc("id", "98", "string_field", "Kakaroth is a saiyan grown up on planet Earth."), + "documentEnrichmentMultivaluedString"); + assertU(commit()); + + final SolrQuery query = getEnrichmentQuery("enriched_field_multi"); + + assertJQ( + "/query" + query.toQueryString(), + "/response/numFound==2]", + "/response/docs/[0]/id=='99'", + "/response/docs/[0]/enriched_field_multi/[0]=='a single string'", + "/response/docs/[1]/id=='98'", + "/response/docs/[1]/enriched_field_multi/[0]=='a single string'"); + } + + private SolrQuery getEnrichmentQuery(String enrichedFieldName) { + final SolrQuery query = new SolrQuery(); + query.setQuery("*:*"); + query.add("fl", "id,"+enrichedFieldName); + query.add("sort", "id desc"); + return query; + } + + void addWithChain(SolrInputDocument document, String updateChain) + throws SolrServerException, IOException { + UpdateRequest req = new UpdateRequest(); + req.add(document); + req.setParam("update.chain", updateChain); + solrTestRule.getSolrClient("collection1").request(req); + } +} diff --git a/solr/solr-ref-guide/modules/indexing-guide/indexing-nav.adoc b/solr/solr-ref-guide/modules/indexing-guide/indexing-nav.adoc index 9b50849716c3..940225e8d4ef 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/indexing-nav.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/indexing-nav.adoc @@ -58,5 +58,6 @@ ** xref:partial-document-updates.adoc[] ** 
xref:reindexing.adoc[] ** xref:language-detection.adoc[] +** xref:document-enrichment-with-llms.adoc[] ** xref:de-duplication.adoc[] ** xref:content-streams.adoc[] diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/document-enrichment-with-llms.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/document-enrichment-with-llms.adoc new file mode 100644 index 000000000000..0681e99724aa --- /dev/null +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/document-enrichment-with-llms.adoc @@ -0,0 +1,478 @@ += Document Enrichment with LLMs +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +This module brings the power of *Large Language Models* to Solr. + +More specifically, it provides the capability, at indexing time, given a prompt and a set of input fields, of calling an +LLM through https://github.com/langchain4j/langchain4j[LangChain4j] for each document and store the result of the call +in an `outputField`, that can be of multiple types and even multivalued. + +_Without_ this module, the LLM calls must be done _outside_ Solr, before indexing. + +[IMPORTANT] +==== +This module sends your documents off to some hosted service on the internet. 
+There are cost, privacy, performance, and service availability implications on such a strong dependency that should be +diligently examined before employing this module in a serious way. + +==== + +At the moment a subset of LLM providers supported by LangChain4j is supported by Solr. + +*Disclaimer*: Apache Solr is *in no way* affiliated with any of these corporations or services. + +If you want to add support for additional services or improve the support for the existing ones, feel free to +contribute: + +* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to Solr] + +== Module + +This is provided via the `language-models` xref:configuration-guide:solr-modules.adoc[Solr Module] that needs to be +enabled before use. + +== Language Model Configuration + +Language Models is a module and therefore its plugins must be configured in `solrconfig.xml`. + +=== Minimum Requirements + +* Enable the `language-models` module to make the Language Models classes available on Solr's classpath. +See xref:configuration-guide:solr-modules.adoc[Solr Module] for more details. + +* An update processor, similar to the one below, must be declared in `solrconfig.xml`: ++ +[source,xml] +---- + + + string_field + summary + Summarize this content: {string_field} + model-name + + + +---- +[NOTE] +==== +If no component is configured in `solrconfig.xml`, the `ChatModel` store will not be registered and requests to `/schema/chat-model-store` will return an error. +==== + +== Document Enrichment Lifecycle + +=== Models + +* A model in this module is a chat model, that answers with text given a prompt. +* A model in this Solr module is a reference to an external API that runs the Large Language Model responsible for chat +completion.
+ +[IMPORTANT] +==== +The Solr chat model only specifies the parameters to access the APIs; the LLM doesn't run internally in Solr. + +==== + +A model is described by these parameters: + + +`class`:: ++ +[%autowidth,frame=none] +|=== +s|Required |Default: none +|=== ++ +The model implementation. +Accepted values: + +* `dev.langchain4j.model.ollama.OllamaChatModel` +* `dev.langchain4j.model.mistralai.MistralAiChatModel` +* `dev.langchain4j.model.anthropic.AnthropicChatModel` +* `dev.langchain4j.model.openai.OpenAiChatModel` +* `dev.langchain4j.model.googleai.GoogleAiGeminiChatModel` + +`name`:: ++ +[%autowidth,frame=none] +|=== +s|Required |Default: none +|=== ++ +The identifier of your model; it is used by any component that intends to use the model (e.g., `DocumentEnrichmentUpdateProcessorFactory` update processor). + +`params`:: ++ +[%autowidth,frame=none] +|=== +|Optional |Default: none +|=== ++ +Each model class has potentially different params. +Many are shared but for the full set of parameters of the model you are interested in please refer to the official documentation of the LangChain4j version included in Solr: https://docs.langchain4j.dev/category/language-models[Chat Models in LangChain4j]. + +=== Supported Models +Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to support document enrichment with LLMs.
+The models currently supported are: + +[tabs#supported-chat-models] +====== +Ollama:: ++ +==== + +[source,json] +---- +{ + "class": "dev.langchain4j.model.ollama.OllamaChatModel", + "name": "", + "params": { + "baseUrl": "http://localhost:11434", + "modelName": "", + "timeout": 300, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- +==== + +MistralAI:: ++ +==== +[source,json] +---- +{ + "class": "dev.langchain4j.model.mistralai.MistralAiChatModel", + "name": "", + "params": { + "baseUrl": "https://api.mistral.ai/v1", + "apiKey": "", + "modelName": "", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- +==== +OpenAI:: ++ +==== +[source,json] +---- +{ + "class": "dev.langchain4j.model.openai.OpenAiChatModel", + "name": "", + "params": { + "baseUrl": "https://api.openai.com/v1", + "apiKey": "", + "modelName": "", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- +==== + +Anthropic:: ++ +==== +[source,json] +---- +{ + "class": "dev.langchain4j.model.anthropic.AnthropicChatModel", + "name": "", + "params": { + "baseUrl": "https://api.anthropic.com/v1/", + "apiKey": "", + "modelName": "", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- +==== + +Gemini:: ++ +==== +[source,json] +---- +{ + "class": "dev.langchain4j.model.googleai.GoogleAiGeminiChatModel", + "name": "", + "params": { + "baseUrl": "https://generativelanguage.googleapis.com/v1beta/", + "apiKey": "", + "modelName": "", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- +==== +====== + +=== Uploading a Model + +To upload the model in a `/path/myModel.json` file, please run: + +[source,bash] +---- +curl -XPUT 'http://localhost:8983/solr/YOUR_COLLECTION/schema/chat-model-store' --data-binary "@/path/myModel.json" -H 'Content-type:application/json' +---- + +To delete the `currentModel` model: + +[source,bash] 
+---- +curl -XDELETE 'http://localhost:8983/solr/YOUR_COLLECTION/schema/chat-model-store/currentModel' +---- + +To view all models: + +[source,text] +http://localhost:8983/solr/YOUR_COLLECTION/schema/chat-model-store + + +.Example: /path/myModel.json +[source,json] +---- +{ + "class": "dev.langchain4j.model.openai.OpenAiChatModel", + "name": "openai-1", + "params": { + "baseUrl": "https://api.openai.com/v1", + "apiKey": "apiKey-openAI", + "modelName": "gpt-5.4-nano", + "timeout": 60, + "logRequests": true, + "logResponses": true, + "maxRetries": 5 + } +} +---- + +=== How to Trigger Document Enrichment during Indexing +To create new fields starting from existing ones in your documents at indexing time you need to configure an {solr-javadocs}/core/org/apache/solr/update/processor/UpdateRequestProcessorChain.html[Update Request Processor Chain] that includes at least one `DocumentEnrichmentUpdateProcessor` update request processor in one of the following two ways: + +* Update processor with parameter `prompt` ++ +[source,xml] +---- + + + string_field + summary + Summarize this content: {string_field} + model-name + + + +---- + +* Update processor with parameter `promptFile`: in this case, the file `prompt.txt` must be uploaded to Solr similarly to any other configuration file (e.g., `solrconfig.xml`, `synonyms.txt`, etc.) ++ +[source,xml] +---- + + + string_field + summary + prompt.txt + model-name + + + +---- + +Exactly one of the following parameters is required: `prompt` or `promptFile`. + +Another important feature of this module is that one (or more) `inputField` needs to be injected in the prompt. This is +done by some special tokens, that are the `fieldName` surrounded by curly brackets (e.g., `{fieldName}`). These tokens +are _mandatory_ for this module to work properly. Solr will throw an error if the parameters are not properly defined.
+For example, both the prompt and the content of the file prompt.txt must contain the text '{string_field}', which +will be substituted with the content of the `string_field` field for each document. An example of a valid prompt with +multiple input fields is as follows: + +[source,xml] +---- + + + title + body + summary + Summarize with the following information. Title: {title}. Body: {body}. + chat-model + + + +---- + +The LLM response is mapped to the specified `outputField`. Note that this module only supports a subset of Solr's +available field types, which includes: + +* *String/Text*: `StrField`, `TextField` +* *Date*: `DatePointField` +* *Numeric*: `IntPointField`, `LongPointField`, `FloatPointField`, `DoublePointField` +* *Boolean*: `BoolField` + + +These fields _can_ be multivalued. Solr uses structured output from LangChain4j to deal with LLMs' responses. + + +For more details on how to work with update request processors in Apache Solr, please refer to the dedicated page: +xref:configuration-guide:update-request-processors.adoc[Update Request Processor] + +[IMPORTANT] +==== +This update processor sends your document field content off to some hosted service on the internet. +There are serious performance implications that should be diligently examined before employing this component in production. +It will substantially slow down your indexing pipeline so make sure to stress test your solution before going live. + +==== + +=== Index first and enrich your documents on a second pass +LLM calls are usually quite slow, so, depending on your use case it could be a good idea to index your documents first and +enrich them with new LLM-generated fields later on. + +This can be done in Solr defining two update request processor chains: one that includes all the processors you need, +excluding the `DocumentEnrichmentUpdateProcessor` (let's call it 'no-enrichment') and one that includes the +`DocumentEnrichmentUpdateProcessor` (let's call it 'enrichment').
+ +[source,xml] +---- + + + ... + + ... + + ... + + + +---- + +[source,xml] +---- + + + ... + + ... + + ... + + + string_field + summary + Summarize this content: {string_field} + chat-model + + + +---- + +You would index your documents first using the 'no-enrichment' and when finished, incrementally repeat the indexing +targeting the 'enrichment' chain. + +[IMPORTANT] +==== +This implies you need to send the documents you want to index to Solr twice and re-run any other update request +processor you need, in the second chain. This has data traffic implications (you transfer your documents over the +network twice) and processing implications (if you have other update request processors in your chain, those must be +repeated the second time as we are literally replacing the indexed documents one by one). +==== + +If your use case is compatible with xref:indexing-guide:partial-document-updates.adoc[Partial Updates], you can do better: + +You still define two chains, but this time the 'enrichment' one only includes the 'DocumentEnrichmentUpdateProcessor' +(and the xref:configuration-guide:update-request-processors.adoc[Mandatory Processors] ) + +[source,xml] +---- + + + ... + + ... + + ... + + + +---- + +[source,xml] +---- + + + + string_field + summary + Summarize this content: {string_field} + chat-model + + + +---- + +[NOTE] +==== +Since partial updates are resolved by `DistributedUpdateProcessorFactory`, be sure to place +`DocumentEnrichmentUpdateProcessorFactory` afterwards so that it sees normal/complete documents. +==== + +Add to your schema a simple field that will be useful to track the enrichment process and use atomic updates: + +[source,xml] +---- + + +---- + +In the first pass just index your documents using your reliable and fast 'no-enrichment' chain. 
+ +On the second pass, re-index all your documents using atomic updates and targeting the 'enrichment' chain: + +[source,json] +---- +{ + "id":"mydoc", + "enriched": { + "set": true + } +} +---- + +What will happen is that internally Solr fetches the stored content of the docs to update, all the existing fields are +retrieved and a re-indexing happens, targeting the 'enrichment' chain that will add the LLM-generated fields and set the +boolean `enriched` field to `true`. + +Faceting or querying on the boolean `enriched` field can also give you a quick idea on how many documents have been +enriched with the new generated fields.