From cd914b79254f783f25bab020e0b40303f6f6f03f Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 10:56:36 +0200 Subject: [PATCH 1/3] SOLR-18189: Improve tracking of duplicate Solr docs with content hash https://issues.apache.org/jira/browse/SOLR-18189 --- .../ContentHashVersionProcessor.java | 196 +++++++++ .../ContentHashVersionProcessorFactory.java | 144 +++++++ ...ontentHashVersionProcessorFactoryTest.java | 147 +++++++ .../ContentHashVersionProcessorTest.java | 399 ++++++++++++++++++ 4 files changed, 886 insertions(+) create mode 100644 solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java create mode 100644 solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java create mode 100644 solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java create mode 100644 solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java new file mode 100644 index 00000000000..5127a6453f2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -0,0 +1,196 @@ +package org.apache.solr.update.processor; + +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.component.RealTimeGetComponent; +import org.apache.solr.handler.component.RealTimeGetComponent.Resolution; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.TextField; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Base64; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; + +/** + * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, and uses this hash + * value to reject/accept doc updates. + * + * Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. + * This implementation can be used for monitoring or rejecting no-op updates (updates that do not change Solr document). + *

+ * Note: hash is computed using {@link Lookup3Signature}. + *

+ * @see Lookup3Signature + */ +public class ContentHashVersionProcessor extends UpdateRequestProcessor { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final SchemaField hashField; + private final SolrQueryResponse rsp; + private final SolrCore core; + private final Predicate includedFields; // Matcher for included fields in hash + private final Predicate excludedFields; // Matcher for excluded fields from hash + private OldDocProvider oldDocProvider = new DefaultDocProvider(); + private boolean discardSameDocuments; + private int sameCount = 0; + private int differentCount = 0; + private int unknownCount = 0; + + public ContentHashVersionProcessor(Predicate hashIncludedFields, Predicate hashExcludedFields, String hashFieldName, SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + super(next); + this.core = req.getCore(); + this.hashField = new SchemaField(hashFieldName, new TextField()); + this.rsp = rsp; + this.includedFields = hashIncludedFields; + this.excludedFields = hashExcludedFields; + } + + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument newDoc = cmd.getSolrInputDocument(); + String newHash = computeDocHash(newDoc); + newDoc.setField(hashField.getName(), newHash); + int i = 0; + + if (!validateHash(cmd.getIndexedId(), newHash)) { + return; + } + + while (true) { + logOverlyFailedRetries(i, cmd); + try { + super.processAdd(cmd); + return; + } catch (SolrException e) { + if (e.code() != 409) { + throw e; + } + ++i; + } + } + } + + @Override + public void finish() throws IOException { + try { + super.finish(); + } finally { + rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); + rsp.addToLog("numAddsExistingWithIdentical", sameCount); + rsp.addToLog("numAddsExistingUnknown", unknownCount); + } + } + + private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { + if ((i & 255) == 255) { + 
log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); + } + } + + void setOldDocProvider(OldDocProvider oldDocProvider) { + this.oldDocProvider = oldDocProvider; + } + + void setDiscardSameDocuments(boolean discardSameDocuments) { + this.discardSameDocuments = discardSameDocuments; + } + + private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { + assert null != indexedDocId; + + var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); + if (docFoundAndOldUserVersions.found) { + String oldHash = docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too + if (oldHash == null) { + unknownCount++; + return true; + } else if (Objects.equals(newHash, oldHash)) { + sameCount++; + return !discardSameDocuments; + } else { + differentCount++; + return true; + } + } + return true; // Doc not found + } + + private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) throws IOException { + SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); + return null == oldDoc ? DocFoundAndOldUserAndSolrVersions.NOT_FOUND : getUserVersionAndSolrVersionFromDocument(oldDoc); + } + + private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument(SolrInputDocument oldDoc) { + Object o = oldDoc.getFieldValue(hashField.getName()); + if (o != null) { + return new DocFoundAndOldUserAndSolrVersions(o.toString()); + } + return new DocFoundAndOldUserAndSolrVersions(); + } + + public String computeDocHash(SolrInputDocument doc) { + List docIncludedFieldNames = doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher... 
+ .filter(excludedFields.negate()) // ...and exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .toList(); + + final Signature sig = new Lookup3Signature(); + for (String fieldName : docIncludedFieldNames) { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); + } + } else { + sig.add(String.valueOf(o)); + } + } + + // Signature, depending on implementation, may return 8-byte or 16-byte value + byte[] signature = sig.getSignature(); + return Base64.getEncoder().encodeToString(signature); // Makes a base64 hash out of signature value + } + + interface OldDocProvider { + SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException; + } + + private static class DefaultDocProvider implements OldDocProvider { + @Override + public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException { + return RealTimeGetComponent.getInputDocument(core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); + } + } + + private static class DocFoundAndOldUserAndSolrVersions { + private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = new DocFoundAndOldUserAndSolrVersions(); + private final boolean found; + + public String oldHash; + + private DocFoundAndOldUserAndSolrVersions() { + this.found = false; + } + private DocFoundAndOldUserAndSolrVersions(String oldHash) { + this.found = true; + this.oldHash = oldHash; + } + + } +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java new file mode 100644 index 00000000000..a59c98e5882 --- /dev/null +++ 
b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -0,0 +1,144 @@ +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.plugin.SolrCoreAware; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +/** + * Factory for {@link ContentHashVersionProcessor} instances. + */ +public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { + private static final char SEPARATOR = ','; // Separator for included/excluded fields + static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled"; + private List includeFields = Collections.singletonList("*"); // Included fields defaults to 'all' + private List excludeFields = new ArrayList<>(); // No excluded field by default, yet hashFieldName is excluded by default + private String hashFieldName = "content_hash"; // Field name to store computed hash on create/update operations + private boolean discardSameDocuments = true; + + public ContentHashVersionProcessorFactory() { + } + + public void init(NamedList args) { + Object tmp = args.remove("includeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "'includeFields' must be configured as a "); + } + // Include fields support comma separated list of fields (e.g. "field1,field2,field3"). 
+ // Also supports "*" to include all fields + this.includeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) + .stream() + .map(String::trim) + .collect(Collectors.toList()); + } + tmp = args.remove("hashFieldName"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + } + this.hashFieldName = (String) tmp; + } + + tmp = args.remove("excludeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); + } + if ("*".equals(((String) tmp).trim())) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); + } + // Exclude fields support comma separated list of fields (e.g. "excluded_field1,excluded_field2"). + // Also supports "*" to exclude all fields + this.excludeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) + .stream() + .map(String::trim) + .collect(Collectors.toList()); + } + excludeFields.add(hashFieldName); // Hash field name is excluded from hash computation + + tmp = args.remove("hashCompareStrategy"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashCompareStrategy' must be configured as a "); + } + String value = ((String) tmp).toLowerCase(); + if ("discard".equalsIgnoreCase(value)) { + discardSameDocuments = true; + } else if ("log".equalsIgnoreCase(value)) { + discardSameDocuments = false; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Value '" + value + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); + } + } + + super.init(args); + } + + public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + if (!req.getParams().getBool(CONTENT_HASH_ENABLED_PARAM, false)) { + 
return next; + } + + ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + buildFieldMatcher(includeFields), + buildFieldMatcher(excludeFields), + hashFieldName, + req, + rsp, + next); + processor.setDiscardSameDocuments(discardSameDocuments); + return processor; + } + + public void inform(SolrCore core) { + if (core.getLatestSchema().getUniqueKeyField() == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); + } + } + + public String getHashFieldName() { + return hashFieldName; + } + + public List getIncludeFields() { + return includeFields; + } + + public List getExcludeFields() { + return excludeFields; + } + + public boolean discardSameDocuments() { + return discardSameDocuments; + } + + static Predicate buildFieldMatcher(List fieldNames) { + return fieldName -> { + for (String currentFieldName : fieldNames) { + if ("*".equals(currentFieldName)) { + return true; + } + if (fieldName.equals(currentFieldName)) { + return true; + } + if (currentFieldName.length() > 1 + && currentFieldName.endsWith("*") + && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { + return true; + } + } + return false; + }; + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java new file mode 100644 index 00000000000..51af18a075d --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -0,0 +1,147 @@ +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.Test; + +import java.util.List; + +import static 
org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; +import static org.junit.Assert.*; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ContentHashVersionProcessorFactoryTest { + + @Test + public void shouldHaveSensibleDefaultValues() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + assertEquals(List.of("*"), factory.getIncludeFields()); + assertEquals("content_hash", factory.getHashFieldName()); + assertTrue(factory.discardSameDocuments()); + } + + @Test + public void shouldInitWithHashFieldName() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "_hash_field_"); + factory.init(args); + + assertEquals("_hash_field_", factory.getHashFieldName()); + } + + @Test + public void shouldInitWithAllField() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("includeFields", "*"); + factory.init(args); + + assertEquals(1, factory.getIncludeFields().size()); + assertEquals("*", factory.getIncludeFields().getFirst()); + } + + @Test + public void shouldInitWithIncludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("includeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(3, factory.getIncludeFields().size()); + assertEquals(List.of("field1", "field2", "field3"), factory.getIncludeFields()); + } + + @Test + public void shouldInitWithExcludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("excludeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(4, 
factory.getExcludeFields().size()); + assertEquals(List.of("field1", "field2", "field3", "content_hash"), factory.getExcludeFields()); + } + + @Test + public void shouldSelectRejectSameHashStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "discard"); + factory.init(args); + + assertTrue(factory.discardSameDocuments()); + } + + @Test + public void shouldSelectLogStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "log"); + factory.init(args); + + assertFalse(factory.discardSameDocuments()); + } + + @Test + public void shouldSelectDiscardStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "discard"); + factory.init(args); + + assertTrue(factory.discardSameDocuments()); + } + + @Test(expected = SolrException.class) + public void shouldSelectUnsupportedStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "unsupported value"); + factory.init(args); + } + + @Test(expected = SolrException.class) + public void shouldRejectExcludeAllFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("excludeFields", "*"); + factory.init(args); + } + + @Test + public void shouldDisableContentHashByQueryParameter() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); + SolrQueryRequest updateRequest = createUpdateRequest(false); // Request disables processor + + UpdateRequestProcessor instance = 
factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + + assertEquals(instance, next); + } + + @Test + public void shouldEnableContentHashByQueryParameter() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); + SolrQueryRequest updateRequest = createUpdateRequest(true); // Request enables processor + + UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + + assertNotEquals(instance, next); + } + + private static SolrQueryRequest createUpdateRequest(boolean enableContentHashParamValue) { + SolrQueryRequest req = mock(SolrQueryRequest.class); + SolrParams solrParams = mock(SolrParams.class); + when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())).thenReturn(enableContentHashParamValue); + when(req.getParams()).thenReturn(solrParams); + + return req; + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java new file mode 100644 index 00000000000..1f399816ce7 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -0,0 +1,399 @@ +package org.apache.solr.update.processor; + +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.UUIDField; +import org.apache.solr.update.AddUpdateCommand; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +import 
static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.*; + +public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { + + public static final String ID_FIELD = "_id"; + public static final String HASH_FIELD_NAME = "_hash_"; + public static final String FIRST_FIELD = "field1"; + public static final String SECOND_FIELD = "field2"; + public static final String THIRD_FIELD = "docField3"; + public static final String FOURTH_FIELD = "field4"; + private SolrQueryRequest req; + + private ContentHashVersionProcessor getContentHashVersionProcessor( + SolrQueryRequest req, + SolrQueryResponse rsp, + UpdateRequestProcessor next, + List includedFields, + List excludedFields) { + ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + HASH_FIELD_NAME, + req, + rsp, + next); + + // Given (previous doc retrieval configuration) + processor.setOldDocProvider((core, hashField, indexedDocId) -> { + final SolrInputDocument inputDocument = doc( + f(ID_FIELD, indexedDocId.utf8ToString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + return doc( + f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), + f(hashField, processor.computeDocHash(inputDocument)) + ); + }); + return processor; + } + + @Before + public void setUp() throws Exception { + super.setUp(); + + // Given (processor configuration) + req = mock(SolrQueryRequest.class); + when(req.getParams()).thenReturn(mock(SolrParams.class)); + + // Given (schema configuration) + IndexSchema indexSchema = mock(IndexSchema.class); + when(req.getSchema()).thenReturn(indexSchema); + 
when(indexSchema.getUniqueKeyField()).thenReturn(new SchemaField(ID_FIELD, new UUIDField())); + when(indexSchema.indexableUniqueKey(anyString())).then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument( + 0).toString())); + } + + @Test + public void shouldComputeHashForDoc() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of(ID_FIELD)); + + // Given (doc for update) + SolrInputDocument inputDocument1 = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + + // Then + assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument1)); + + // Given (doc for update - with different order) + SolrInputDocument inputDocument2 = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), + f(FIRST_FIELD, "Values will serve as input to compute a hash") + ); + + // Then (hash remain same, since id is excluded from signature fields) + assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument2)); + } + + @Test + public void shouldUseExcludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("*"), + List.of("field*")); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, UUID.randomUUID().toString()), + f(THIRD_FIELD, "constant to have a constant hash"), + f(FOURTH_FIELD, UUID.randomUUID().toString()) + ); + + // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) + assertEquals("bwE8Zjq0aOs=", 
processor.computeDocHash(inputDocument)); // Hash if only ID field was used + } + + @Test + public void shouldUseIncludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of(THIRD_FIELD)); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldUseIncludedFieldsWildcard2() { + // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("field*"), + Collections.emptyList()); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldDedupIncludedFields() { + // Given (processor to include field1 and field2 only) + ContentHashVersionProcessor processorWithDuplicatedFieldName = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + + ContentHashVersionProcessor processorWithWildcard 
= getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of(SECOND_FIELD, FIRST_FIELD, "field1*"), // Also change order of config (test reorder of field names) + Collections.emptyList()); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); + assertEquals("XavrOYGlkXM=", processorWithWildcard.computeDocHash(inputDocument)); + } + + @Test + public void shouldCreateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + processor.setDiscardSameDocuments(false); + processor.setOldDocProvider((core, hashField, indexedDocId) -> null); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + processor.finish(); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + + // Then (asserts on hash comparison results) + verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(0)); + verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc + } + + @Test + public void shouldAddToResponseLog() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + processor.setDiscardSameDocuments(false); + + // Given (command to update existing doc) + AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + + // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) + SolrInputDocument initialDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmdDoesNotChangeValues.solrDoc = doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) + ); + + // Given (command to update existing doc with different content) + AddUpdateCommand cmdChangesDocValues = new AddUpdateCommand(req); + + // Given (doc for update - does *not* match the existing doc, see getContentHashVersionProcessor()) + cmdChangesDocValues.solrDoc = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "This is a doc with values"), + f(SECOND_FIELD, "that differs from stored doc, so it's considered new") + ); + + // When + processor.processAdd(cmdDoesNotChangeValues); + processor.processAdd(cmdChangesDocValues); + processor.finish(); + + // Then (read as 
follows: 2 updates occurred for an existing doc. Among these updates, 1 update tried to replace + // doc with the same content) + verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(2)); + verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(1)); + } + + @Test + public void shouldNotUpdateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(SECOND_FIELD), + Collections.emptyList()); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + verify(response, never()).addToLog(eq("numAddsExisting"), eq(0)); + verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + } + + @Test + public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); + processor.setDiscardSameDocuments(false); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + verify(response, never()).addToLog(eq("numAddsExisting"), eq(1)); + verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + } + + @Test + public void shouldCommitWithDiscardModeEnabled() throws IOException { + // Given + UpdateRequestProcessor nextProcessor = mock(UpdateRequestProcessor.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + nextProcessor, + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); + processor.setDiscardSameDocuments(true); + + // Given (command to update existing doc) + AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + + // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) + SolrInputDocument initialDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmdDoesNotChangeValues.solrDoc = doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) + ); + + // When + processor.processAdd(cmdDoesNotChangeValues); + + // Then + verify(nextProcessor, never()).processAdd(eq(cmdDoesNotChangeValues)); + } + +} From 527765262aa5081bd1793e61f552448959e2709b Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 17:42:07 +0200 Subject: [PATCH 2/3] * lint fixes --- .../ContentHashVersionProcessor.java | 344 ++++++++------- .../ContentHashVersionProcessorFactory.java | 99 +++-- ...ontentHashVersionProcessorFactoryTest.java | 51 ++- .../ContentHashVersionProcessorTest.java | 400 ++++++++++-------- 4 files changed, 519 insertions(+), 375 deletions(-) diff --git 
a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index 5127a6453f2..f4b40221a1b 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -1,5 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.solr.update.processor; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Base64; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -15,182 +40,197 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.lang.invoke.MethodHandles; -import java.util.Base64; -import java.util.Collection; -import java.util.List; -import java.util.Objects; -import java.util.Set; -import java.util.function.Predicate; - /** - * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, and uses this hash - * value to reject/accept doc updates. + * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, + * and uses this hash value to reject/accept doc updates. + * *
    - *
  • When no corresponding doc with same id exists (create), computed hash is added to the document.
  • - *
  • When a previous doc exists (update), a new hash is computed using new version values and compared with old hash.
  • + *
  • When no corresponding doc with same id exists (create), computed hash is added to the + * document. + *
  • When a previous doc exists (update), a new hash is computed using new version values and + * compared with old hash. *
- * Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. - * This implementation can be used for monitoring or rejecting no-op updates (updates that do not change Solr document). - *

- * Note: hash is computed using {@link Lookup3Signature}. - *

+ * + * Depending on {@code discardSameDocuments} value, this processor may reject or accept doc update. This + * implementation can be used for monitoring or rejecting no-op updates (updates that do not change + * Solr document). + * + *

Note: hash is computed using {@link Lookup3Signature}. + * * @see Lookup3Signature */ public class ContentHashVersionProcessor extends UpdateRequestProcessor { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final SchemaField hashField; - private final SolrQueryResponse rsp; - private final SolrCore core; - private final Predicate includedFields; // Matcher for included fields in hash - private final Predicate excludedFields; // Matcher for excluded fields from hash - private OldDocProvider oldDocProvider = new DefaultDocProvider(); - private boolean discardSameDocuments; - private int sameCount = 0; - private int differentCount = 0; - private int unknownCount = 0; - - public ContentHashVersionProcessor(Predicate hashIncludedFields, Predicate hashExcludedFields, String hashFieldName, SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { - super(next); - this.core = req.getCore(); - this.hashField = new SchemaField(hashFieldName, new TextField()); - this.rsp = rsp; - this.includedFields = hashIncludedFields; - this.excludedFields = hashExcludedFields; + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final SchemaField hashField; + private final SolrQueryResponse rsp; + private final SolrCore core; + private final Predicate includedFields; // Matcher for included fields in hash + private final Predicate excludedFields; // Matcher for excluded fields from hash + private OldDocProvider oldDocProvider = new DefaultDocProvider(); + private boolean discardSameDocuments; + private int sameCount = 0; + private int differentCount = 0; + private int unknownCount = 0; + + public ContentHashVersionProcessor( + Predicate hashIncludedFields, + Predicate hashExcludedFields, + String hashFieldName, + SolrQueryRequest req, + SolrQueryResponse rsp, + UpdateRequestProcessor next) { + super(next); + this.core = req.getCore(); + this.hashField = 
new SchemaField(hashFieldName, new TextField()); + this.rsp = rsp; + this.includedFields = hashIncludedFields; + this.excludedFields = hashExcludedFields; + } + + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument newDoc = cmd.getSolrInputDocument(); + String newHash = computeDocHash(newDoc); + newDoc.setField(hashField.getName(), newHash); + int i = 0; + + if (!validateHash(cmd.getIndexedId(), newHash)) { + return; } - public void processAdd(AddUpdateCommand cmd) throws IOException { - SolrInputDocument newDoc = cmd.getSolrInputDocument(); - String newHash = computeDocHash(newDoc); - newDoc.setField(hashField.getName(), newHash); - int i = 0; - - if (!validateHash(cmd.getIndexedId(), newHash)) { - return; - } - - while (true) { - logOverlyFailedRetries(i, cmd); - try { - super.processAdd(cmd); - return; - } catch (SolrException e) { - if (e.code() != 409) { - throw e; - } - ++i; - } + while (true) { + logOverlyFailedRetries(i, cmd); + try { + super.processAdd(cmd); + return; + } catch (SolrException e) { + if (e.code() != 409) { + throw e; } + ++i; + } } - - @Override - public void finish() throws IOException { - try { - super.finish(); - } finally { - rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); - rsp.addToLog("numAddsExistingWithIdentical", sameCount); - rsp.addToLog("numAddsExistingUnknown", unknownCount); - } + } + + @Override + public void finish() throws IOException { + try { + super.finish(); + } finally { + rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); + rsp.addToLog("numAddsExistingWithIdentical", sameCount); + rsp.addToLog("numAddsExistingUnknown", unknownCount); } + } - private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { - if ((i & 255) == 255) { - log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); - } + private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { + if ((i & 255) == 255) { + 
log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); } - - void setOldDocProvider(OldDocProvider oldDocProvider) { - this.oldDocProvider = oldDocProvider; + } + + void setOldDocProvider(OldDocProvider oldDocProvider) { + this.oldDocProvider = oldDocProvider; + } + + void setDiscardSameDocuments(boolean discardSameDocuments) { + this.discardSameDocuments = discardSameDocuments; + } + + private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { + assert null != indexedDocId; + + var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); + if (docFoundAndOldUserVersions.found) { + String oldHash = + docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too + if (oldHash == null) { + unknownCount++; + return true; + } else if (Objects.equals(newHash, oldHash)) { + sameCount++; + return !discardSameDocuments; + } else { + differentCount++; + return true; + } } - - void setDiscardSameDocuments(boolean discardSameDocuments) { - this.discardSameDocuments = discardSameDocuments; + return true; // Doc not found + } + + private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) + throws IOException { + SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); + return null == oldDoc + ? 
DocFoundAndOldUserAndSolrVersions.NOT_FOUND + : getUserVersionAndSolrVersionFromDocument(oldDoc); + } + + private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument( + SolrInputDocument oldDoc) { + Object o = oldDoc.getFieldValue(hashField.getName()); + if (o != null) { + return new DocFoundAndOldUserAndSolrVersions(o.toString()); } - - private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { - assert null != indexedDocId; - - var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); - if (docFoundAndOldUserVersions.found) { - String oldHash = docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too - if (oldHash == null) { - unknownCount++; - return true; - } else if (Objects.equals(newHash, oldHash)) { - sameCount++; - return !discardSameDocuments; - } else { - differentCount++; - return true; - } + return new DocFoundAndOldUserAndSolrVersions(); + } + + public String computeDocHash(SolrInputDocument doc) { + List docIncludedFieldNames = + doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher... + .filter( + excludedFields + .negate()) // ...and exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .toList(); + + final Signature sig = new Lookup3Signature(); + for (String fieldName : docIncludedFieldNames) { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); } - return true; // Doc not found + } else { + sig.add(String.valueOf(o)); + } } - private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) throws IOException { - SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); - return null == oldDoc ? 
DocFoundAndOldUserAndSolrVersions.NOT_FOUND : getUserVersionAndSolrVersionFromDocument(oldDoc); - } + // Signature, depending on implementation, may return 8-byte or 16-byte value + byte[] signature = sig.getSignature(); + return Base64.getEncoder() + .encodeToString(signature); // Makes a base64 hash out of signature value + } - private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument(SolrInputDocument oldDoc) { - Object o = oldDoc.getFieldValue(hashField.getName()); - if (o != null) { - return new DocFoundAndOldUserAndSolrVersions(o.toString()); - } - return new DocFoundAndOldUserAndSolrVersions(); - } + interface OldDocProvider { + SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) + throws IOException; + } - public String computeDocHash(SolrInputDocument doc) { - List docIncludedFieldNames = doc.getFieldNames().stream() - .filter(includedFields) // Keep fields that match 'included fields' matcher... - .filter(excludedFields.negate()) // ...and exclude fields that match 'excluded fields' matcher - .sorted() // Sort to ensure consistent field order across different doc field orders - .toList(); - - final Signature sig = new Lookup3Signature(); - for (String fieldName : docIncludedFieldNames) { - sig.add(fieldName); - Object o = doc.getFieldValue(fieldName); - if (o instanceof Collection) { - for (Object oo : (Collection) o) { - sig.add(String.valueOf(oo)); - } - } else { - sig.add(String.valueOf(o)); - } - } - - // Signature, depending on implementation, may return 8-byte or 16-byte value - byte[] signature = sig.getSignature(); - return Base64.getEncoder().encodeToString(signature); // Makes a base64 hash out of signature value - } - - interface OldDocProvider { - SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException; - } - - private static class DefaultDocProvider implements OldDocProvider { - @Override - public SolrInputDocument getDocument(SolrCore 
core, String hashField, BytesRef indexedDocId) throws IOException { - return RealTimeGetComponent.getInputDocument(core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); - } + private static class DefaultDocProvider implements OldDocProvider { + @Override + public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) + throws IOException { + return RealTimeGetComponent.getInputDocument( + core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); } + } - private static class DocFoundAndOldUserAndSolrVersions { - private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = new DocFoundAndOldUserAndSolrVersions(); - private final boolean found; + private static class DocFoundAndOldUserAndSolrVersions { + private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = + new DocFoundAndOldUserAndSolrVersions(); + private final boolean found; - public String oldHash; + public String oldHash; - private DocFoundAndOldUserAndSolrVersions() { - this.found = false; - } - private DocFoundAndOldUserAndSolrVersions(String oldHash) { - this.found = true; - this.oldHash = oldHash; - } + private DocFoundAndOldUserAndSolrVersions() { + this.found = false; + } + private DocFoundAndOldUserAndSolrVersions(String oldHash) { + this.found = true; + this.oldHash = oldHash; } + } } diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java index a59c98e5882..48fa52db928 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -1,5 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.solr.update.processor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.function.Predicate; +import java.util.stream.Collectors; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.StrUtils; @@ -8,44 +31,40 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.plugin.SolrCoreAware; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -/** - * Factory for {@link ContentHashVersionProcessor} instances. - */ -public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { +/** Factory for {@link ContentHashVersionProcessor} instances. 
*/ +public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { private static final char SEPARATOR = ','; // Separator for included/excluded fields static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled"; - private List includeFields = Collections.singletonList("*"); // Included fields defaults to 'all' - private List excludeFields = new ArrayList<>(); // No excluded field by default, yet hashFieldName is excluded by default - private String hashFieldName = "content_hash"; // Field name to store computed hash on create/update operations + private List includeFields = + Collections.singletonList("*"); // Included fields defaults to 'all' + private List excludeFields = new ArrayList<>(); + // No excluded field by default, yet hashFieldName is excluded by default + private String hashFieldName = + "content_hash"; // Field name to store computed hash on create/update operations private boolean discardSameDocuments = true; - public ContentHashVersionProcessorFactory() { - } + public ContentHashVersionProcessorFactory() {} public void init(NamedList args) { Object tmp = args.remove("includeFields"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "'includeFields' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'includeFields' must be configured as a "); } // Include fields support comma separated list of fields (e.g. "field1,field2,field3"). 
// Also supports "*" to include all fields - this.includeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) - .stream() + this.includeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() .map(String::trim) .collect(Collectors.toList()); } tmp = args.remove("hashFieldName"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); } this.hashFieldName = (String) tmp; } @@ -53,15 +72,18 @@ public void init(NamedList args) { tmp = args.remove("excludeFields"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); } if ("*".equals(((String) tmp).trim())) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); } - // Exclude fields support comma separated list of fields (e.g. "excluded_field1,excluded_field2"). + // Exclude fields support comma separated list of fields (e.g. + // "excluded_field1,excluded_field2"). 
// Also supports "*" to exclude all fields - this.excludeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) - .stream() + this.excludeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() .map(String::trim) .collect(Collectors.toList()); } @@ -70,27 +92,35 @@ public void init(NamedList args) { tmp = args.remove("hashCompareStrategy"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashCompareStrategy' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "'hashCompareStrategy' must be configured as a "); } - String value = ((String) tmp).toLowerCase(); + String value = ((String) tmp).toLowerCase(Locale.ROOT); if ("discard".equalsIgnoreCase(value)) { discardSameDocuments = true; } else if ("log".equalsIgnoreCase(value)) { discardSameDocuments = false; } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Value '" + value + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Value '" + + value + + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); } } super.init(args); } - public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + public UpdateRequestProcessor getInstance( + SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { if (!req.getParams().getBool(CONTENT_HASH_ENABLED_PARAM, false)) { return next; } - ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + ContentHashVersionProcessor processor = + new ContentHashVersionProcessor( buildFieldMatcher(includeFields), buildFieldMatcher(excludeFields), hashFieldName, @@ -103,7 +133,8 @@ public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryRespons public void inform(SolrCore core) { if 
(core.getLatestSchema().getUniqueKeyField() == null) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); } } @@ -120,7 +151,7 @@ public List getExcludeFields() { } public boolean discardSameDocuments() { - return discardSameDocuments; + return discardSameDocuments; } static Predicate buildFieldMatcher(List fieldNames) { @@ -133,8 +164,8 @@ static Predicate buildFieldMatcher(List fieldNames) { return true; } if (currentFieldName.length() > 1 - && currentFieldName.endsWith("*") - && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { + && currentFieldName.endsWith("*") + && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { return true; } } diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java index 51af18a075d..2f2e920ac6d 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -1,22 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.solr.update.processor; +import static org.apache.solr.SolrTestCaseJ4.assumeWorkingMockito; +import static org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.List; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; import org.junit.Test; -import java.util.List; - -import static org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; -import static org.junit.Assert.*; -import static org.mockito.ArgumentMatchers.*; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class ContentHashVersionProcessorFactoryTest { + @BeforeClass + public static void beforeClass() throws Exception { + assumeWorkingMockito(); + } + @Test public void shouldHaveSensibleDefaultValues() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); @@ -120,7 +146,8 @@ public void 
shouldDisableContentHashByQueryParameter() { UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); SolrQueryRequest updateRequest = createUpdateRequest(false); // Request disables processor - UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + UpdateRequestProcessor instance = + factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); assertEquals(instance, next); } @@ -131,7 +158,8 @@ public void shouldEnableContentHashByQueryParameter() { UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); SolrQueryRequest updateRequest = createUpdateRequest(true); // Request enables processor - UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + UpdateRequestProcessor instance = + factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); assertNotEquals(instance, next); } @@ -139,7 +167,8 @@ public void shouldEnableContentHashByQueryParameter() { private static SolrQueryRequest createUpdateRequest(boolean enableContentHashParamValue) { SolrQueryRequest req = mock(SolrQueryRequest.class); SolrParams solrParams = mock(SolrParams.class); - when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())).thenReturn(enableContentHashParamValue); + when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())) + .thenReturn(enableContentHashParamValue); when(req.getParams()).thenReturn(solrParams); return req; diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java index 1f399816ce7..92ea8282f9e 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -1,5 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.solr.update.processor; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.UUID; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.SolrParams; @@ -10,17 +38,9 @@ import org.apache.solr.schema.UUIDField; import org.apache.solr.update.AddUpdateCommand; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.UUID; - -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.Mockito.*; - public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { public static final String ID_FIELD = "_id"; @@ -37,31 +57,37 @@ private ContentHashVersionProcessor getContentHashVersionProcessor( UpdateRequestProcessor 
next, List includedFields, List excludedFields) { - ContentHashVersionProcessor processor = new ContentHashVersionProcessor( - ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), - ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), - HASH_FIELD_NAME, - req, - rsp, - next); + ContentHashVersionProcessor processor = + new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + HASH_FIELD_NAME, + req, + rsp, + next); // Given (previous doc retrieval configuration) - processor.setOldDocProvider((core, hashField, indexedDocId) -> { - final SolrInputDocument inputDocument = doc( - f(ID_FIELD, indexedDocId.utf8ToString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - return doc( - f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), - f(hashField, processor.computeDocHash(inputDocument)) - ); - }); + processor.setOldDocProvider( + (core, hashField, indexedDocId) -> { + final SolrInputDocument inputDocument = + doc( + f(ID_FIELD, indexedDocId.utf8ToString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + return doc( + f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), + f(hashField, processor.computeDocHash(inputDocument))); + }); return processor; } + @BeforeClass + public static void beforeClass() throws Exception { + assumeWorkingMockito(); + } + @Before public void setUp() throws Exception { super.setUp(); @@ -74,36 +100,37 @@ public void setUp() throws Exception { 
IndexSchema indexSchema = mock(IndexSchema.class); when(req.getSchema()).thenReturn(indexSchema); when(indexSchema.getUniqueKeyField()).thenReturn(new SchemaField(ID_FIELD, new UUIDField())); - when(indexSchema.indexableUniqueKey(anyString())).then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument( - 0).toString())); + when(indexSchema.indexableUniqueKey(anyString())) + .then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument(0).toString())); } @Test public void shouldComputeHashForDoc() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("*"), - List.of(ID_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of(ID_FIELD)); // Given (doc for update) - SolrInputDocument inputDocument1 = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument1 = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); // Then assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument1)); // Given (doc for update - with different order) - SolrInputDocument inputDocument2 = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), - f(FIRST_FIELD, "Values will serve as input to compute a hash") - ); + SolrInputDocument inputDocument2 = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), + f(FIRST_FIELD, "Values will serve as input to compute a 
hash")); // Then (hash remain same, since id is excluded from signature fields) assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument2)); @@ -112,42 +139,47 @@ public void shouldComputeHashForDoc() { @Test public void shouldUseExcludedFieldsWildcard() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("*"), - List.of("field*")); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of("field*")); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, UUID.randomUUID().toString()), - f(SECOND_FIELD, UUID.randomUUID().toString()), - f(THIRD_FIELD, "constant to have a constant hash"), - f(FOURTH_FIELD, UUID.randomUUID().toString()) - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, UUID.randomUUID().toString()), + f(THIRD_FIELD, "constant to have a constant hash"), + f(FOURTH_FIELD, UUID.randomUUID().toString())); // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) - assertEquals("bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used + assertEquals( + "bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used } @Test public void shouldUseIncludedFieldsWildcard() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("field*"), - List.of(THIRD_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of(THIRD_FIELD)); // 
Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4")); // Then assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); @@ -156,20 +188,22 @@ public void shouldUseIncludedFieldsWildcard() { @Test public void shouldUseIncludedFieldsWildcard2() { // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("field*"), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of()); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash 
for field4")); // Then assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); @@ -178,26 +212,33 @@ public void shouldUseIncludedFieldsWildcard2() { @Test public void shouldDedupIncludedFields() { // Given (processor to include field1 and field2 only) - ContentHashVersionProcessor processorWithDuplicatedFieldName = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); - - ContentHashVersionProcessor processorWithWildcard = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of(SECOND_FIELD, FIRST_FIELD, "field1*"), // Also change order of config (test reorder of field names) - Collections.emptyList()); + ContentHashVersionProcessor processorWithDuplicatedFieldName = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), + List.of()); + + ContentHashVersionProcessor processorWithWildcard = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of( + SECOND_FIELD, + FIRST_FIELD, + "field1*"), // Also change order of config (test reorder of field names) + List.of()); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a 
constant hash for field4")); // Then assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); @@ -208,12 +249,13 @@ public void shouldDedupIncludedFields() { public void shouldCreateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + List.of()); processor.setDiscardSameDocuments(false); processor.setOldDocProvider((core, hashField, indexedDocId) -> null); @@ -221,11 +263,11 @@ public void shouldCreateSignatureForNewDoc() throws IOException { AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -240,53 +282,57 @@ public void shouldCreateSignatureForNewDoc() throws IOException { // Then (asserts on hash comparison results) verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(0)); - verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc + verify(response, times(1)) + .addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc } @Test public void shouldAddToResponseLog() throws 
IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + List.of()); processor.setDiscardSameDocuments(false); // Given (command to update existing doc) AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - cmdDoesNotChangeValues.solrDoc = doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) - ); + SolrInputDocument initialDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + cmdDoesNotChangeValues.solrDoc = + doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); // Given (command to update existing doc with different content) AddUpdateCommand cmdChangesDocValues = new AddUpdateCommand(req); - // Given (doc for update - does *not* match the existing doc, see 
getContentHashVersionProcessor()) - cmdChangesDocValues.solrDoc = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "This is a doc with values"), - f(SECOND_FIELD, "that differs from stored doc, so it's considered new") - ); + // Given (doc for update - does *not* match the existing doc, see + // getContentHashVersionProcessor()) + cmdChangesDocValues.solrDoc = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "This is a doc with values"), + f(SECOND_FIELD, "that differs from stored doc, so it's considered new")); // When processor.processAdd(cmdDoesNotChangeValues); processor.processAdd(cmdChangesDocValues); processor.finish(); - // Then (read as follows: 2 updates occurred for an existing doc. Among these updates, 1 update tried to replace + // Then (read as follows: 2 updates occurred for an existing doc. Among these updates, 1 update + // tried to replace // doc with the same content) verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(2)); verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(1)); @@ -296,22 +342,19 @@ public void shouldAddToResponseLog() throws IOException { public void shouldNotUpdateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - List.of(SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, response, mock(UpdateRequestProcessor.class), List.of(SECOND_FIELD), List.of()); // Given (command) AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + 
SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -330,23 +373,24 @@ public void shouldNotUpdateSignatureForNewDoc() throws IOException { public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); processor.setDiscardSameDocuments(false); // Given (command) AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -365,29 +409,30 @@ public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { public void shouldCommitWithDiscardModeEnabled() throws IOException { // Given UpdateRequestProcessor nextProcessor = mock(UpdateRequestProcessor.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - nextProcessor, - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); + 
ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + nextProcessor, + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); processor.setDiscardSameDocuments(true); // Given (command to update existing doc) AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - cmdDoesNotChangeValues.solrDoc = doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) - ); + SolrInputDocument initialDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + cmdDoesNotChangeValues.solrDoc = + doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); // When processor.processAdd(cmdDoesNotChangeValues); @@ -395,5 +440,4 @@ public void shouldCommitWithDiscardModeEnabled() throws IOException { // Then verify(nextProcessor, never()).processAdd(eq(cmdDoesNotChangeValues)); } - } From cd3b510a97f2c11acfbd6566600eed5a2ee4d835 Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 18:52:45 +0200 Subject: [PATCH 3/3] * java lint fixes --- .../solr/update/processor/ContentHashVersionProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index f4b40221a1b..f728e0905d4 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -87,6 +87,7 @@ public ContentHashVersionProcessor( this.excludedFields = hashExcludedFields; } + @Override public void processAdd(AddUpdateCommand cmd) throws IOException { SolrInputDocument newDoc = cmd.getSolrInputDocument(); String newHash = computeDocHash(newDoc);