Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.update.processor;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Base64;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.RealTimeGetComponent;
import org.apache.solr.handler.component.RealTimeGetComponent.Resolution;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.UpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values,
* and uses this hash value to reject/accept doc updates.
*
* <ul>
* <li>When no corresponding doc with same id exists (create), computed hash is added to the
* document.
* <li>When a previous doc exists (update), a new hash is computed using new version values and
* compared with old hash.
* </ul>
*
* Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. This
* implementation can be used for monitoring or rejecting no-op updates (updates that do not change
* Solr document).
*
* <p>Note: hash is computed using {@link Lookup3Signature}.
*
* @see Lookup3Signature
*/
public class ContentHashVersionProcessor extends UpdateRequestProcessor {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final SchemaField hashField;
private final SolrQueryResponse rsp;
private final SolrCore core;
private final Predicate<String> includedFields; // Matcher for included fields in hash
private final Predicate<String> excludedFields; // Matcher for excluded fields from hash
private OldDocProvider oldDocProvider = new DefaultDocProvider();
private boolean discardSameDocuments;
private int sameCount = 0;
private int differentCount = 0;
private int unknownCount = 0;

public ContentHashVersionProcessor(
Predicate<String> hashIncludedFields,
Predicate<String> hashExcludedFields,
String hashFieldName,
SolrQueryRequest req,
SolrQueryResponse rsp,
UpdateRequestProcessor next) {
super(next);
this.core = req.getCore();
this.hashField = new SchemaField(hashFieldName, new TextField());
this.rsp = rsp;
this.includedFields = hashIncludedFields;
this.excludedFields = hashExcludedFields;
}

@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
SolrInputDocument newDoc = cmd.getSolrInputDocument();
String newHash = computeDocHash(newDoc);
newDoc.setField(hashField.getName(), newHash);
int i = 0;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be redone as a for-i loop?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See DocBasedVersionConstraintsProcessor. It's worth a comment here that you are emulating similar logic there.


if (!validateHash(cmd.getIndexedId(), newHash)) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's not at all obvious to me that the name "validateHash" does more than hash validation, whatever that is. Yet it seems the return value is whether we should "discard". (FWIW DocBasedVersionConstraintsProcessor uses the word "Dropping" or simply "drop").

return;
}

while (true) {
logOverlyFailedRetries(i, cmd);
try {
super.processAdd(cmd);
return;
} catch (SolrException e) {
if (e.code() != 409) {
throw e;
}
++i;
}
}
}

@Override
public void finish() throws IOException {
try {
super.finish();
} finally {
rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount);
rsp.addToLog("numAddsExistingWithIdentical", sameCount);
rsp.addToLog("numAddsExistingUnknown", unknownCount);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RE "numAdds" -- lets use "contentHash" prefix so as to associate these with this URP, and not confusing it with DocBasedVersionConstraintsProcessor which ought to include similar logs but doesn't yet do so.

IMO adding a new doc shouldn't add any log. I would expect one of these to incorporate the word "drop" or "discard" to reflect an action taken. I understand this URP can be configured to not take that action, albeit that would be in an exploratory / trial situation that ought to be temporary (I imagine).

}
}

private static void logOverlyFailedRetries(int i, UpdateCommand cmd) {
if ((i & 255) == 255) {
log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd);
}
}

void setOldDocProvider(OldDocProvider oldDocProvider) {
this.oldDocProvider = oldDocProvider;
}

void setDiscardSameDocuments(boolean discardSameDocuments) {
this.discardSameDocuments = discardSameDocuments;
}

private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we should align with a similar name in DocBasedVersionConstraintsProcessor, and rename to hashIsAcceptable ?

assert null != indexedDocId;

var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one possible algorithm -- lookup hash from docValues of a doc that we got by first looking up the current doc by ID (a terms index lookup). But this is ~twice as expensive as doing a terms index lookup of the new hash, assuming we find no doc with that hash. If we do find one, then it's similar, as we then need to fetch the ID via docValues to see if it's the same doc. But we're about to drop the doc then, which isn't the common path. Even "need" there is debatable; it'd be extremely unlikely to find a match by content hash to a different doc ID.

Basically -- does the content hash field have indexed=true XOR docValues=true. We don't need to support both (IMO) -- we can choose what's most efficient for this use-case.
I understand you may not have time to implement another algorithm.

Just an FYI: the postingsFormat (of the terms index) is configurable and there are a number of lovely implementations, incuding a bloom filter one, which would make a lot of sense for "lookup" / exact match only use-case. Such a choice is up to the user to configure in the schema if they wish.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wait... docs that have been but not yet visible via the IndexSearcher will not have their content hashes searchable. Albeit in such a situation, the "worst" thing that would happen is we index a doc that could have been dropped/discarded -- a missed optimization opportunity.

A terms lookup on the hash should be much faster than the RTG + double-docValues lookups though.

if (docFoundAndOldUserVersions.found) {
String oldHash =
docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too
if (oldHash == null) {
unknownCount++;
return true;
} else if (Objects.equals(newHash, oldHash)) {
sameCount++;
return !discardSameDocuments;
} else {
differentCount++;
return true;
}
}
return true; // Doc not found
}

private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO we should not support this. It's not unreasonable to demand that someone using this feature enable docValues on the hash field.

throws IOException {
SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId);
return null == oldDoc
? DocFoundAndOldUserAndSolrVersions.NOT_FOUND
: getUserVersionAndSolrVersionFromDocument(oldDoc);
}

private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument(
SolrInputDocument oldDoc) {
Object o = oldDoc.getFieldValue(hashField.getName());
if (o != null) {
return new DocFoundAndOldUserAndSolrVersions(o.toString());
}
return new DocFoundAndOldUserAndSolrVersions();
}

public String computeDocHash(SolrInputDocument doc) {
List<String> docIncludedFieldNames =
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This results in a double-navigation of the fields of a document in order to have the hash be insensitive to the order of fields added. But in practice, a given end-to-end pipeline is going to be deterministic. Hmm. I suppose its no big deal. To improve, you cold instead stream SolrInputField, sorting by name, and get the value from them without another lookup. You could also do the whole thing as a stream pipeline without needlessing producing a List. It'd end in a forEach to process the value.

doc.getFieldNames().stream()
.filter(includedFields) // Keep fields that match 'included fields' matcher...
.filter(
excludedFields
.negate()) // ...and exclude fields that match 'excluded fields' matcher
.sorted() // Sort to ensure consistent field order across different doc field orders
.toList();

final Signature sig = new Lookup3Signature();
for (String fieldName : docIncludedFieldNames) {
sig.add(fieldName);
Object o = doc.getFieldValue(fieldName);
if (o instanceof Collection) {
for (Object oo : (Collection<?>) o) {
sig.add(String.valueOf(oo));
}
} else {
sig.add(String.valueOf(o));
}
}

// Signature, depending on implementation, may return 8-byte or 16-byte value
byte[] signature = sig.getSignature();
return Base64.getEncoder()
.encodeToString(signature); // Makes a base64 hash out of signature value
}

interface OldDocProvider {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

realistically, what impls are we going to have?

SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId)
throws IOException;
}

private static class DefaultDocProvider implements OldDocProvider {
@Override
public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId)
throws IOException {
return RealTimeGetComponent.getInputDocument(
core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL);
}
}

private static class DocFoundAndOldUserAndSolrVersions {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be a record? Or simply an Optional of the oldHash?

private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND =
new DocFoundAndOldUserAndSolrVersions();
private final boolean found;

public String oldHash;

private DocFoundAndOldUserAndSolrVersions() {
this.found = false;
}

private DocFoundAndOldUserAndSolrVersions(String oldHash) {
this.found = true;
this.oldHash = oldHash;
}
}
}
Loading
Loading