From c89bdf13b141af3921518561f204303966a83760 Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Thu, 26 Mar 2026 01:13:46 -0400 Subject: [PATCH 1/6] CoreAdmin UPGRDECOREINDEX API changes to make it ready for collection level upgrade calls --- .../model/UpgradeCoreIndexRequestBody.java | 9 ++ .../handler/admin/UpgradeCoreIndexOp.java | 12 ++- .../handler/admin/api/UpgradeCoreIndex.java | 90 ++++++++++++++++++- 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java index ecc3081014e9..62f9348a9fa6 100644 --- a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java +++ b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java @@ -30,4 +30,13 @@ public class UpgradeCoreIndexRequestBody { "updateChain to be used for reindexing during index upgrade if you don't want to use the one used by /update by default") @JsonProperty public String updateChain; + + @Schema( + description = + "Internal flag set by the collection-level UPGRADECOLLECTIONINDEX command to indicate" + + " this upgrade is being coordinated by SolrCloud. When true, the SolrCloud guard is" + + " bypassed, a local-only update chain is used (no distributed forwarding), and the" + + " core's readOnly flag is temporarily cleared for the duration of the upgrade.") + @JsonProperty + public Boolean cloudMode; } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java b/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java index 8fff5c93d310..08706c37d470 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java @@ -51,17 +51,23 @@ public boolean isExpensive() { public void execute(CoreAdminHandler.CallInfo it) throws Exception { assert it.handler.coreContainer != null; - if (it.handler.coreContainer.isZooKeeperAware()) { + SolrParams params = it.req.getParams(); + final boolean cloudMode = params.getBool("cloudMode", false); + + if (it.handler.coreContainer.isZooKeeperAware() && !cloudMode) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, - "action=UPGRADECOREINDEX is not supported in SolrCloud mode. As an alternative, in order to upgrade index, configure LatestVersionMergePolicyFactory in solrconfig.xml and reindex the data in your collection."); + "action=UPGRADECOREINDEX is not supported in SolrCloud mode. Use the" + + " UPGRADECOLLECTIONINDEX Collections API action instead, or configure" + + " LatestVersionMergePolicyFactory in solrconfig.xml and reindex the data in your" + + " collection."); } - SolrParams params = it.req.getParams(); String cname = params.required().get(CoreAdminParams.CORE); final boolean isAsync = params.get(CommonAdminParams.ASYNC) != null; final var requestBody = new UpgradeCoreIndexRequestBody(); requestBody.updateChain = params.get(UpdateParams.UPDATE_CHAIN); + requestBody.cloudMode = cloudMode; UpgradeCoreIndex upgradeCoreIndexApi = UPGRADE_CORE_INDEX_FACTORY.create( diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java index c91f107e9522..4fd639d76acd 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java @@ -61,6 +61,8 @@ import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.CommitUpdateCommand; import org.apache.solr.update.DocumentBuilder; +import org.apache.solr.update.processor.LogUpdateProcessorFactory; +import org.apache.solr.update.processor.RunUpdateProcessorFactory; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.update.processor.UpdateRequestProcessorChain; import org.apache.solr.util.RefCounted; @@ -139,11 +141,32 @@ private UpgradeCoreIndexResponse performUpgrade( private UpgradeCoreIndexResponse performUpgradeImpl( SolrCore core, UpgradeCoreIndexRequestBody requestBody, UpgradeCoreIndexResponse response) { + final boolean cloudMode = Boolean.TRUE.equals(requestBody.cloudMode); + boolean readOnlyWasCleared = false; + RefCounted iwRef = null; MergePolicy originalMergePolicy = null; int numSegmentsEligibleForUpgrade = 0, numSegmentsUpgraded = 0; String coreName = core.getName(); try { + // In cloud mode, the collection-level readOnly property is set in ZooKeeper by the + // UPGRADECOLLECTIONINDEX coordinator. DistributedZkUpdateProcessor checks this ZK property + // on every request and blocks external writes regardless of core.readOnly. Our stripped-down + // chain bypasses DistributedZkUpdateProcessor entirely, so external writes remain blocked. + // + // The core.readOnly volatile flag is normally NOT set by MODIFYCOLLECTION on running cores + // (it is only synced on core reload). However, it may have been set by other operations + // (e.g., RestoreCmd). If it is set, we need to temporarily clear it so that + // DefaultSolrCoreState.getIndexWriter() does not reject our IndexWriter request. + if (cloudMode && core.readOnly) { + log.info( + "Cloud mode upgrade: temporarily clearing core.readOnly on core [{}] for upgrade." + + " External writes remain blocked by the collection-level readOnly in ZooKeeper.", + coreName); + core.readOnly = false; + readOnlyWasCleared = true; + } + iwRef = core.getSolrCoreState().getIndexWriter(core); IndexWriter iw = iwRef.get(); @@ -173,7 +196,9 @@ private UpgradeCoreIndexResponse performUpgradeImpl( DocValuesIteratorCache dvICache = new DocValuesIteratorCache(searcherRef.get()); UpdateRequestProcessorChain updateProcessorChain = - getUpdateProcessorChain(core, requestBody.updateChain); + cloudMode + ? buildLocalOnlyChain(core) + : getUpdateProcessorChain(core, requestBody.updateChain); for (LeafReaderContext lrc : leafContexts) { if (!shouldUpgradeSegment(lrc)) { @@ -199,12 +224,33 @@ private UpgradeCoreIndexResponse performUpgradeImpl( searcherRef.decref(); } + // Restore original merge policy BEFORE commit so that the commit can trigger + // normal merging which will clean up tombstone segments (old segments whose live docs + // have all been migrated to new segments). + if (originalMergePolicy != null) { + iw.getConfig().setMergePolicy(originalMergePolicy); + originalMergePolicy = null; // prevent double-restore in finally block + } + try { doCommit(core); } catch (IOException e) { throw new CoreAdminAPIBaseException(e); } + // After commit, run expungeDeletes to remove tombstone segments (old-format segments + // whose live documents have all been re-indexed into new-format segments). + try { + doExpungeDeletes(core); + } catch (IOException e) { + log.warn( + "expungeDeletes failed on core [{}] during index upgrade; tombstone segments may" + + " remain until the next merge cycle", + coreName, + e); + // Non-fatal: the upgrade itself succeeded; tombstone cleanup can happen later + } + boolean indexUpgraded = isIndexUpgraded(core); if (!indexUpgraded) { @@ -234,7 +280,7 @@ private UpgradeCoreIndexResponse performUpgradeImpl( throw new CoreAdminAPIBaseException(ioEx); } finally { - // Restore original merge policy + // Restore original merge policy if not already restored if (iwRef != null) { IndexWriter iw = iwRef.get(); if (originalMergePolicy != null) { @@ -242,6 +288,11 @@ private UpgradeCoreIndexResponse performUpgradeImpl( } iwRef.decref(); } + // Restore core readOnly flag if we cleared it + if (readOnlyWasCleared) { + core.readOnly = true; + log.info("Cloud mode upgrade: restored readOnly on core [{}]", coreName); + } } return response; @@ -376,6 +427,24 @@ private boolean isIndexUpgraded(SolrCore core) throws IOException { } } + /** + * Builds a local-only update processor chain for SolrCloud mode upgrades. This chain consists of + * only {@link LogUpdateProcessorFactory} and {@link RunUpdateProcessorFactory}, deliberately + * excluding {@link org.apache.solr.update.processor.DistributedUpdateProcessorFactory}. This + * ensures: + * + * + */ + private UpdateRequestProcessorChain buildLocalOnlyChain(SolrCore core) { + return new UpdateRequestProcessorChain( + List.of(new LogUpdateProcessorFactory(), new RunUpdateProcessorFactory()), core); + } + private void doCommit(SolrCore core) throws IOException { try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, new ModifiableSolrParams())) { CommitUpdateCommand cmd = new CommitUpdateCommand(req, false); // optimize=false @@ -386,6 +455,23 @@ private void doCommit(SolrCore core) throws IOException { } } + /** + * Runs expungeDeletes to clean up tombstone segments — old-format segments whose live documents + * have all been re-indexed into new-format segments. Without this, {@link #isIndexUpgraded} would + * fail because the tombstone segments still report an old {@code minVersion}. + */ + private void doExpungeDeletes(SolrCore core) throws IOException { + try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, new ModifiableSolrParams())) { + CommitUpdateCommand cmd = new CommitUpdateCommand(req, false); + cmd.expungeDeletes = true; + core.getUpdateHandler().commit(cmd); + } catch (IOException ioEx) { + log.warn( + "Error during expungeDeletes on core [{}] during index upgrade", core.getName(), ioEx); + throw ioEx; + } + } + private void processSegment( LeafReaderContext leafReaderContext, UpdateRequestProcessorChain processorChain, From bcc60b874b76d6ccb62011ea62289b62dda5ce25 Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Thu, 26 Mar 2026 01:44:35 -0400 Subject: [PATCH 2/6] New Collections API command - UPGRADECOLLECTIONINDEX --- .../cloud/api/collections/CollApiCmds.java | 4 +- .../UpgradeCollectionIndexCmd.java | 241 ++++++++++++++++++ .../handler/admin/CollectionsHandler.java | 9 + .../solr/common/params/CollectionParams.java | 3 +- 4 files changed, 255 insertions(+), 2 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollApiCmds.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollApiCmds.java index ab8a8c58e4c2..9b29ffcb5687 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CollApiCmds.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CollApiCmds.java @@ -65,6 +65,7 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.REPLACENODE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.RESTORE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.SPLITSHARD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.UPGRADECOLLECTIONINDEX; import static org.apache.solr.common.params.CommonParams.NAME; import io.opentelemetry.api.trace.Span; @@ -189,7 +190,8 @@ private CommandMap(OverseerNodePrioritizer overseerPrioritizer, CollectionComman Map.entry(ADDREPLICA, new AddReplicaCmd(ccc)), Map.entry(MOVEREPLICA, new MoveReplicaCmd(ccc)), Map.entry(REINDEXCOLLECTION, new ReindexCollectionCmd(ccc)), - Map.entry(RENAME, new RenameCmd(ccc))); + Map.entry(RENAME, new RenameCmd(ccc)), + Map.entry(UPGRADECOLLECTIONINDEX, new UpgradeCollectionIndexCmd(ccc))); } CollApiCmds.CollectionApiCommand getActionCommand(CollectionParams.CollectionAction action) { diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java new file mode 100644 index 000000000000..d3fd6f4c9021 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud.api.collections; + +import static org.apache.solr.common.params.CommonParams.NAME; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.apache.solr.cloud.DistributedClusterStateUpdater; +import org.apache.solr.cloud.Overseer; +import org.apache.solr.cloud.api.collections.CollectionHandlingUtils.ShardRequestTracker; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkNodeProps; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.params.CollectionParams; +import org.apache.solr.common.params.CoreAdminParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.component.ShardHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Upgrades a collection's index by rewriting old-format Lucene segments into the current format. + * + *

The operation sets the collection to readOnly, then for each shard: + * + *

    + *
  1. Upgrades the shard leader's index locally (no distributed forwarding). + *
  2. Upgrades each NRT non-leader replica's index locally in parallel. + *
  3. TLOG and PULL replicas converge by replicating the upgraded index from the leader via their + * normal background replication mechanism. + *
  4. Validates that all replicas have no old-format segments remaining. + *
+ * + *

After all shards are processed, readOnly is cleared. + * + * @see org.apache.solr.handler.admin.api.UpgradeCoreIndex + */ +public class UpgradeCollectionIndexCmd implements CollApiCmds.CollectionApiCommand { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final CollectionCommandContext ccc; + + public UpgradeCollectionIndexCmd(CollectionCommandContext ccc) { + this.ccc = ccc; + } + + @Override + public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList results) + throws Exception { + String collectionName = message.getStr(NAME); + if (collectionName == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "Collection name is required for UPGRADECOLLECTIONINDEX"); + } + + ClusterState clusterState = adminCmdContext.getClusterState(); + DocCollection collection = clusterState.getCollectionOrNull(collectionName); + if (collection == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Collection '" + collectionName + "' not found"); + } + + log.info("Starting UPGRADECOLLECTIONINDEX for collection [{}]", collectionName); + + boolean readOnlyWasSet = false; + try { + // 1. Set collection readOnly to block external writes + setCollectionReadOnly(collectionName, true); + readOnlyWasSet = true; + + // 2. Process each shard + Map shardResults = new LinkedHashMap<>(); + for (Slice slice : collection.getSlices()) { + NamedList shardResult = new NamedList<>(); + upgradeShardIndex(adminCmdContext, collectionName, slice, shardResult); + shardResults.put(slice.getName(), shardResult); + } + results.add("shardResults", shardResults); + + log.info("UPGRADECOLLECTIONINDEX completed successfully for collection [{}]", collectionName); + } finally { + // 3. Always clear readOnly, even on failure + if (readOnlyWasSet) { + try { + setCollectionReadOnly(collectionName, false); + } catch (Exception e) { + log.error( + "Failed to clear readOnly on collection [{}] after UPGRADECOLLECTIONINDEX. " + + "Collection remains read-only and must be manually restored via " + + "MODIFYCOLLECTION action.", + collectionName, + e); + results.add( + "warning", + "Failed to clear readOnly on collection. Use MODIFYCOLLECTION to restore writes."); + } + } + } + } + + private void upgradeShardIndex( + AdminCmdContext adminCmdContext, + String collectionName, + Slice slice, + NamedList shardResult) + throws Exception { + + String shardName = slice.getName(); + log.info("Upgrading shard [{}] of collection [{}]", shardName, collectionName); + + // Refresh cluster state to get the current leader + ClusterState clusterState = ccc.getZkStateReader().getClusterState(); + + Replica leader = ccc.getZkStateReader().getLeaderRetry(collectionName, shardName, 30000); + + ShardHandler shardHandler = ccc.newShardHandler(); + ShardRequestTracker tracker = CollectionHandlingUtils.asyncRequestTracker(adminCmdContext, ccc); + + // Step 1: Upgrade the leader (can be NRT or TLOG — both are leader-eligible and have + // an active IndexWriter when serving as leader) + log.info( + "Upgrading leader [{}] (type={}) for shard [{}]", + leader.getCoreName(), + leader.getType(), + shardName); + ModifiableSolrParams leaderParams = buildUpgradeParams(leader.getCoreName()); + tracker.sendShardRequest(leader, leaderParams, shardHandler); + tracker.processResponses( + shardResult, shardHandler, true, "Leader upgrade failed for shard " + shardName); + + // Step 2: Upgrade NRT non-leader replicas in parallel. + // NRT replicas own their index via IndexWriter, so they can upgrade locally. + // If the leader is TLOG, all NRT replicas in the shard are non-leaders and are upgraded here. + List nrtNonLeaders = new ArrayList<>(); + for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.NRT))) { + if (!replica.getName().equals(leader.getName()) + && clusterState.liveNodesContain(replica.getNodeName())) { + nrtNonLeaders.add(replica); + } + } + + if (!nrtNonLeaders.isEmpty()) { + log.info( + "Upgrading {} NRT non-leader replica(s) for shard [{}]", nrtNonLeaders.size(), shardName); + + ShardHandler nrtHandler = ccc.newShardHandler(); + ShardRequestTracker nrtTracker = + CollectionHandlingUtils.asyncRequestTracker(adminCmdContext, ccc); + for (Replica nrtReplica : nrtNonLeaders) { + ModifiableSolrParams nrtParams = buildUpgradeParams(nrtReplica.getCoreName()); + nrtTracker.sendShardRequest(nrtReplica, nrtParams, nrtHandler); + } + nrtTracker.processResponses( + shardResult, nrtHandler, true, "NRT replica upgrade failed for shard " + shardName); + } + + // Step 3: TLOG and PULL non-leader replicas converge via their normal background + // replication from the now-upgraded leader. TLOG non-leaders get their Lucene index from + // the leader via ReplicateFromLeader; PULL replicas do the same. We do not upgrade them + // locally to avoid racing with their background replication thread. + List replicatingReplicas = new ArrayList<>(); + for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.PULL))) { + if (!replica.getName().equals(leader.getName())) { + replicatingReplicas.add(replica.getCoreName() + " (" + replica.getType() + ")"); + } + } + if (!replicatingReplicas.isEmpty()) { + log.info( + "TLOG/PULL replicas for shard [{}] will converge via replication from leader: {}", + shardName, + replicatingReplicas); + shardResult.add("replicatingFromLeader", replicatingReplicas); + } + + log.info("Shard [{}] upgrade complete", shardName); + } + + private ModifiableSolrParams buildUpgradeParams(String coreName) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UPGRADECOREINDEX.toString()); + params.set(CoreAdminParams.CORE, coreName); + params.set("cloudMode", "true"); + return params; + } + + /** + * Sets or clears the readOnly property on a collection via MODIFYCOLLECTION. Setting readOnly to + * true blocks all external writes at the {@link + * org.apache.solr.update.processor.DistributedZkUpdateProcessor} layer. + */ + private void setCollectionReadOnly(String collectionName, boolean readOnly) throws Exception { + String readOnlyValue = readOnly ? "true" : null; // null clears the property + + log.info("Setting readOnly={} on collection [{}]", readOnly, collectionName); + + ZkNodeProps props = + new ZkNodeProps( + Overseer.QUEUE_OPERATION, + CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower(), + ZkStateReader.COLLECTION_PROP, + collectionName, + ZkStateReader.READ_ONLY, + readOnlyValue); + + if (ccc.getDistributedClusterStateUpdater().isDistributedStateUpdate()) { + ccc.getDistributedClusterStateUpdater() + .doSingleStateUpdate( + DistributedClusterStateUpdater.MutatingCommand.CollectionModifyCollection, + props, + ccc.getSolrCloudManager(), + ccc.getZkStateReader()); + } else { + ccc.offerStateUpdate(props); + } + } +} diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 0fa337f63622..70f1d11089a6 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -86,6 +86,7 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.RESTORE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.SPLITSHARD; import static org.apache.solr.common.params.CollectionParams.CollectionAction.SYNCSHARD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.UPGRADECOLLECTIONINDEX; import static org.apache.solr.common.params.CollectionParams.SOURCE_NODE; import static org.apache.solr.common.params.CollectionParams.TARGET_NODE; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; @@ -590,6 +591,14 @@ public enum CollectionOperation implements CollectionOp { return m; }), + UPGRADECOLLECTIONINDEX_OP( + UPGRADECOLLECTIONINDEX, + (req, rsp, h) -> { + Map m = copy(req.getParams().required(), null, NAME); + copy(req.getParams(), m, FOLLOW_ALIASES); + return m; + }), + SYNCSHARD_OP( SYNCSHARD, (req, rsp, h) -> { diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java index 6ae82508df4b..69a7a3de8883 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/CollectionParams.java @@ -143,7 +143,8 @@ enum CollectionAction { COLSTATUS(true, LockLevel.NONE), // this command implements its own locking REINDEXCOLLECTION(true, LockLevel.NONE), - RENAME(true, LockLevel.COLLECTION); + RENAME(true, LockLevel.COLLECTION), + UPGRADECOLLECTIONINDEX(true, LockLevel.COLLECTION); public final boolean isWrite; public final String lowerName; From 305a405b1a27003f1de72581744788eed6c278cf Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Thu, 26 Mar 2026 02:01:22 -0400 Subject: [PATCH 3/6] Add core-level validation endpoint to ensure all relevant segments have been upgraded --- .../model/UpgradeCoreIndexRequestBody.java | 8 ++++ .../api/model/UpgradeCoreIndexResponse.java | 6 +++ .../handler/admin/UpgradeCoreIndexOp.java | 6 ++- .../handler/admin/api/UpgradeCoreIndex.java | 38 ++++++++++++++++++- 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java index 62f9348a9fa6..39f4490e2eb1 100644 --- a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java +++ b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexRequestBody.java @@ -39,4 +39,12 @@ public class UpgradeCoreIndexRequestBody { + " core's readOnly flag is temporarily cleared for the duration of the upgrade.") @JsonProperty public Boolean cloudMode; + + @Schema( + description = + "When true, only checks whether the index contains old-format segments without performing" + + " any upgrade. Returns the number of segments needing upgrade and whether the index" + + " is already at the current Lucene format. No writes are performed.") + @JsonProperty + public Boolean checkOnly; } diff --git a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexResponse.java b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexResponse.java index 09e0ba1e8b22..b099d5d56ef4 100644 --- a/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexResponse.java +++ b/solr/api/src/java/org/apache/solr/client/api/model/UpgradeCoreIndexResponse.java @@ -35,4 +35,10 @@ public class UpgradeCoreIndexResponse extends SolrJerseyResponse { @Schema(description = "Status of the core index upgrade operation.") @JsonProperty public String upgradeStatus; + + @Schema( + description = + "Whether the index is fully at the current Lucene format. Set when checkOnly=true.") + @JsonProperty + public Boolean indexUpgraded; } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java b/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java index 08706c37d470..c34d3441e09f 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/UpgradeCoreIndexOp.java @@ -53,8 +53,11 @@ public void execute(CoreAdminHandler.CallInfo it) throws Exception { assert it.handler.coreContainer != null; SolrParams params = it.req.getParams(); final boolean cloudMode = params.getBool("cloudMode", false); + final boolean checkOnly = params.getBool("checkOnly", false); - if (it.handler.coreContainer.isZooKeeperAware() && !cloudMode) { + // checkOnly is a read-only operation and is allowed in SolrCloud mode. + // Full upgrade requires either cloudMode (coordinator-initiated) or standalone mode. + if (it.handler.coreContainer.isZooKeeperAware() && !cloudMode && !checkOnly) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "action=UPGRADECOREINDEX is not supported in SolrCloud mode. Use the" @@ -68,6 +71,7 @@ public void execute(CoreAdminHandler.CallInfo it) throws Exception { final var requestBody = new UpgradeCoreIndexRequestBody(); requestBody.updateChain = params.get(UpdateParams.UPDATE_CHAIN); requestBody.cloudMode = cloudMode; + requestBody.checkOnly = checkOnly; UpgradeCoreIndex upgradeCoreIndexApi = UPGRADE_CORE_INDEX_FACTORY.create( diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java index 4fd639d76acd..e899e7ee411c 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java @@ -97,7 +97,9 @@ public class UpgradeCoreIndex extends CoreAdminAPIBase { public enum CoreIndexUpgradeStatus { UPGRADE_SUCCESSFUL, ERROR, - NO_UPGRADE_NEEDED + NO_UPGRADE_NEEDED, + /** Returned by {@code checkOnly=true} when old-format segments are present. */ + UPGRADE_NEEDED } private static final int RETRY_COUNT_FOR_SEGMENT_DELETION = 5; @@ -142,8 +144,14 @@ private UpgradeCoreIndexResponse performUpgradeImpl( SolrCore core, UpgradeCoreIndexRequestBody requestBody, UpgradeCoreIndexResponse response) { final boolean cloudMode = Boolean.TRUE.equals(requestBody.cloudMode); + final boolean checkOnly = Boolean.TRUE.equals(requestBody.checkOnly); boolean readOnlyWasCleared = false; + // checkOnly: count old-format segments and return without performing any writes + if (checkOnly) { + return performCheckOnly(core, response); + } + RefCounted iwRef = null; MergePolicy originalMergePolicy = null; int numSegmentsEligibleForUpgrade = 0, numSegmentsUpgraded = 0; @@ -298,6 +306,34 @@ private UpgradeCoreIndexResponse performUpgradeImpl( return response; } + /** + * Counts old-format segments without performing any writes. Returns immediately with the count + * and whether the index is already at the current Lucene format. + */ + private UpgradeCoreIndexResponse performCheckOnly( + SolrCore core, UpgradeCoreIndexResponse response) { + String coreName = core.getName(); + RefCounted searcherRef = core.getSearcher(); + try { + int segmentsNeedingUpgrade = 0; + for (LeafReaderContext lrc : searcherRef.get().getIndexReader().leaves()) { + if (shouldUpgradeSegment(lrc)) { + segmentsNeedingUpgrade++; + } + } + response.core = coreName; + response.numSegmentsEligibleForUpgrade = segmentsNeedingUpgrade; + response.indexUpgraded = (segmentsNeedingUpgrade == 0); + response.upgradeStatus = + response.indexUpgraded + ? CoreIndexUpgradeStatus.NO_UPGRADE_NEEDED.toString() + : CoreIndexUpgradeStatus.UPGRADE_NEEDED.toString(); + return response; + } finally { + searcherRef.decref(); + } + } + private boolean shouldUpgradeSegment(LeafReaderContext lrc) { Version segmentMinVersion = null; From 180b344876897207ba0788b48f9e8876239bf844 Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Fri, 27 Mar 2026 02:29:40 -0400 Subject: [PATCH 4/6] Wait for TLOG/PULL replica convergence before signaling upgrade completion --- .../UpgradeCollectionIndexCmd.java | 172 ++++++++++++++++-- .../handler/admin/CollectionsHandler.java | 3 + 2 files changed, 164 insertions(+), 11 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java index d3fd6f4c9021..07ed07a80ffa 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java @@ -24,6 +24,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import org.apache.solr.cloud.DistributedClusterStateUpdater; import org.apache.solr.cloud.Overseer; import org.apache.solr.cloud.api.collections.CollectionHandlingUtils.ShardRequestTracker; @@ -38,7 +39,9 @@ import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.util.TimeOut; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -50,18 +53,33 @@ *
    *
  1. Upgrades the shard leader's index locally (no distributed forwarding). *
  2. Upgrades each NRT non-leader replica's index locally in parallel. - *
  3. TLOG and PULL replicas converge by replicating the upgraded index from the leader via their - * normal background replication mechanism. - *
  4. Validates that all replicas have no old-format segments remaining. + *
  5. TLOG and PULL replicas converge by replicating committed leader states via their normal + * background replication mechanism. Intermediate commits during upgrade are allowed, so these + * replicas may fetch more than once before reaching the terminal state. + *
  6. Polls every live replica with {@code checkOnly=true} until all report no old-format + * segments remaining. *
* - *

After all shards are processed, readOnly is cleared. + *

After all shards are processed and validated, readOnly is cleared. If validation fails, the + * collection remains read-only so writes do not resume on a partially upgraded collection. * * @see org.apache.solr.handler.admin.api.UpgradeCoreIndex */ public class UpgradeCollectionIndexCmd implements CollApiCmds.CollectionApiCommand { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + /** + * Parameter name for the configurable convergence timeout (in seconds). If not specified, + * defaults to {@link #DEFAULT_REPLICA_CONVERGENCE_TIMEOUT_SECS}. + */ + public static final String REPLICA_CONVERGENCE_TIMEOUT_SECS_PARAM = + "replicaConvergenceTimeoutSecs"; + + /** Default timeout for waiting for TLOG/PULL replicas to converge after the upgrade. */ + static final int DEFAULT_REPLICA_CONVERGENCE_TIMEOUT_SECS = 1800; // 30 minutes + + private static final long REPLICA_CONVERGENCE_POLL_MS = TimeUnit.SECONDS.toMillis(2); + private final CollectionCommandContext ccc; public UpgradeCollectionIndexCmd(CollectionCommandContext ccc) { @@ -87,7 +105,12 @@ public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList log.info("Starting UPGRADECOLLECTIONINDEX for collection [{}]", collectionName); + final int convergenceTimeoutSecs = + message.getInt( + REPLICA_CONVERGENCE_TIMEOUT_SECS_PARAM, DEFAULT_REPLICA_CONVERGENCE_TIMEOUT_SECS); + boolean readOnlyWasSet = false; + boolean upgradeCompleted = false; try { // 1. Set collection readOnly to block external writes setCollectionReadOnly(collectionName, true); @@ -97,15 +120,18 @@ public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList Map shardResults = new LinkedHashMap<>(); for (Slice slice : collection.getSlices()) { NamedList shardResult = new NamedList<>(); - upgradeShardIndex(adminCmdContext, collectionName, slice, shardResult); + upgradeShardIndex( + adminCmdContext, collectionName, slice, shardResult, convergenceTimeoutSecs); shardResults.put(slice.getName(), shardResult); } results.add("shardResults", shardResults); + upgradeCompleted = true; log.info("UPGRADECOLLECTIONINDEX completed successfully for collection [{}]", collectionName); } finally { - // 3. Always clear readOnly, even on failure - if (readOnlyWasSet) { + // 3. Clear readOnly only after all shards are verified upgraded. On failure we keep the + // collection read-only to avoid resuming writes on a partially upgraded collection. + if (readOnlyWasSet && upgradeCompleted) { try { setCollectionReadOnly(collectionName, false); } catch (Exception e) { @@ -119,6 +145,15 @@ public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList "warning", "Failed to clear readOnly on collection. Use MODIFYCOLLECTION to restore writes."); } + } else if (readOnlyWasSet) { + log.warn( + "UPGRADECOLLECTIONINDEX did not complete successfully for collection [{}]; keeping" + + " collection readOnly for operator intervention.", + collectionName); + results.add( + "warning", + "Upgrade did not complete successfully. Collection remains read-only until it is" + + " manually restored via MODIFYCOLLECTION."); } } } @@ -127,7 +162,8 @@ private void upgradeShardIndex( AdminCmdContext adminCmdContext, String collectionName, Slice slice, - NamedList shardResult) + NamedList shardResult, + int convergenceTimeoutSecs) throws Exception { String shardName = slice.getName(); @@ -180,9 +216,11 @@ private void upgradeShardIndex( } // Step 3: TLOG and PULL non-leader replicas converge via their normal background - // replication from the now-upgraded leader. TLOG non-leaders get their Lucene index from - // the leader via ReplicateFromLeader; PULL replicas do the same. We do not upgrade them - // locally to avoid racing with their background replication thread. + // replication from committed leader states. Because auto-commit may persist intermediate + // progress during the leader upgrade, these replicas can begin converging before the leader's + // final explicit commit, so completion must be gated on replica-side validation rather than on + // assuming a single terminal fetch. We do not upgrade them locally to avoid racing with their + // background replication thread. List replicatingReplicas = new ArrayList<>(); for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.PULL))) { if (!replica.getName().equals(leader.getName())) { @@ -197,6 +235,9 @@ private void upgradeShardIndex( shardResult.add("replicatingFromLeader", replicatingReplicas); } + waitForShardReplicaConvergence( + adminCmdContext, collectionName, slice, shardResult, convergenceTimeoutSecs); + log.info("Shard [{}] upgrade complete", shardName); } @@ -208,6 +249,115 @@ private ModifiableSolrParams buildUpgradeParams(String coreName) { return params; } + private ModifiableSolrParams buildCheckOnlyParams(String coreName) { + ModifiableSolrParams params = buildUpgradeParams(coreName); + params.set("checkOnly", "true"); + return params; + } + + private void waitForShardReplicaConvergence( + AdminCmdContext adminCmdContext, + String collectionName, + Slice slice, + NamedList shardResult, + int convergenceTimeoutSecs) + throws Exception { + final String shardName = slice.getName(); + final TimeOut timeout = + new TimeOut( + convergenceTimeoutSecs, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource()); + + while (true) { + ClusterState latestClusterState = ccc.getZkStateReader().getClusterState(); + List liveReplicas = new ArrayList<>(); + Slice latestSlice = latestClusterState.getCollection(collectionName).getSlice(shardName); + for (Replica replica : latestSlice.getReplicas()) { + if (latestClusterState.liveNodesContain(replica.getNodeName())) { + liveReplicas.add(replica); + } + } + + NamedList checkResults = new NamedList<>(); + ShardHandler checkHandler = ccc.newShardHandler(); + ShardRequestTracker checkTracker = + CollectionHandlingUtils.syncRequestTracker(adminCmdContext, ccc); + for (Replica replica : liveReplicas) { + checkTracker.sendShardRequest( + replica, buildCheckOnlyParams(replica.getCoreName()), checkHandler); + } + // abortOnError=false: transient request failures are treated as "not yet converged" + // rather than fatal errors that would permanently leave the collection read-only. + checkTracker.processResponses( + checkResults, + checkHandler, + false, + "Replica validation failed for shard " + shardName + " during UPGRADECOLLECTIONINDEX"); + + List pendingReplicas = getPendingReplicaUpgrades(checkResults); + if (pendingReplicas.isEmpty()) { + shardResult.add( + "validatedReplicas", liveReplicas.stream().map(Replica::getCoreName).toList()); + return; + } + + if (timeout.hasTimedOut()) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Timed out waiting for shard " + + shardName + + " replicas to converge to the upgraded index format after " + + convergenceTimeoutSecs + + "s: " + + pendingReplicas); + } + + log.info( + "Shard [{}] waiting for replica convergence ({} pending, {}s remaining): {}", + shardName, + pendingReplicas.size(), + timeout.timeLeft(TimeUnit.SECONDS), + pendingReplicas); + timeout.sleep(REPLICA_CONVERGENCE_POLL_MS); + } + } + + /** + * Returns the keys of replicas that have not yet converged to the upgraded index format. + * + *

Replicas in the "success" map are pending if their {@code indexUpgraded} field is not {@code + * true}. Replicas in the "failure" map (those whose {@code checkOnly} request failed) are treated + * as pending — a transient failure does not mean the replica has converged. + */ + private List getPendingReplicaUpgrades(NamedList checkResults) { + List pendingReplicas = new ArrayList<>(); + + @SuppressWarnings("unchecked") + SimpleOrderedMap successes = (SimpleOrderedMap) checkResults.get("success"); + if (successes != null) { + for (int i = 0; i < successes.size(); i++) { + Object responseObj = successes.getVal(i); + boolean indexUpgraded = + (responseObj instanceof NamedList response) + && Boolean.TRUE.equals(response.get("indexUpgraded")); + if (!indexUpgraded) { + pendingReplicas.add(successes.getName(i)); + } + } + } + + // Replicas that returned an error are treated as not-yet-converged. A transient error + // (network glitch, restarting node) should not permanently leave the collection read-only. + @SuppressWarnings("unchecked") + SimpleOrderedMap failures = (SimpleOrderedMap) checkResults.get("failure"); + if (failures != null) { + for (int i = 0; i < failures.size(); i++) { + pendingReplicas.add(failures.getName(i)); + } + } + + return pendingReplicas; + } + /** * Sets or clears the readOnly property on a collection via MODIFYCOLLECTION. Setting readOnly to * true blocks all external writes at the {@link diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index 70f1d11089a6..8a30df9f5113 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -143,6 +143,7 @@ import org.apache.solr.cloud.api.collections.CollectionHandlingUtils; import org.apache.solr.cloud.api.collections.DistributedCollectionConfigSetCommandRunner; import org.apache.solr.cloud.api.collections.ReindexCollectionCmd; +import org.apache.solr.cloud.api.collections.UpgradeCollectionIndexCmd; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.DocCollection; @@ -596,6 +597,8 @@ public enum CollectionOperation implements CollectionOp { (req, rsp, h) -> { Map m = copy(req.getParams().required(), null, NAME); copy(req.getParams(), m, FOLLOW_ALIASES); + copy( + req.getParams(), m, UpgradeCollectionIndexCmd.REPLICA_CONVERGENCE_TIMEOUT_SECS_PARAM); return m; }), From ae86418d8c78abbfc7657a071fcc96fc69f3393c Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Sun, 29 Mar 2026 02:33:52 -0400 Subject: [PATCH 5/6] 1) Tighten polling logic for replica convergence. 2) Increase polling interval to 30 seconds --- .../UpgradeCollectionIndexCmd.java | 108 ++++++++++-------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java index 07ed07a80ffa..acc04d01ff4b 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/UpgradeCollectionIndexCmd.java @@ -21,9 +21,11 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.EnumSet; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.solr.cloud.DistributedClusterStateUpdater; import org.apache.solr.cloud.Overseer; @@ -78,7 +80,7 @@ public class UpgradeCollectionIndexCmd implements CollApiCmds.CollectionApiComma /** Default timeout for waiting for TLOG/PULL replicas to converge after the upgrade. */ static final int DEFAULT_REPLICA_CONVERGENCE_TIMEOUT_SECS = 1800; // 30 minutes - private static final long REPLICA_CONVERGENCE_POLL_MS = TimeUnit.SECONDS.toMillis(2); + private static final long REPLICA_CONVERGENCE_POLL_MS = TimeUnit.SECONDS.toMillis(30); private final CollectionCommandContext ccc; @@ -267,36 +269,59 @@ private void waitForShardReplicaConvergence( new TimeOut( convergenceTimeoutSecs, TimeUnit.SECONDS, ccc.getSolrCloudManager().getTimeSource()); + // Monotonically accumulates replica keys that have confirmed indexUpgraded=true. + // A replica that confirms in iteration N is not re-checked in iteration N+1, so a + // momentary probe failure (GC pause, brief network blip) on a subsequent poll cannot + // un-confirm an already-verified replica. + final Set confirmedConverged = new HashSet<>(); + while (true) { ClusterState latestClusterState = ccc.getZkStateReader().getClusterState(); - List liveReplicas = new ArrayList<>(); Slice latestSlice = latestClusterState.getCollection(collectionName).getSlice(shardName); - for (Replica replica : latestSlice.getReplicas()) { - if (latestClusterState.liveNodesContain(replica.getNodeName())) { - liveReplicas.add(replica); + List allReplicas = new ArrayList<>(latestSlice.getReplicas()); + + // Probe only the live replicas that have not yet been confirmed. Non-live replicas + // remain in pendingReplicas until they recover and report indexUpgraded=true. + List toCheck = new ArrayList<>(); + for (Replica replica : allReplicas) { + String key = CollectionHandlingUtils.requestKey(replica); + if (!confirmedConverged.contains(key) + && latestClusterState.liveNodesContain(replica.getNodeName())) { + toCheck.add(replica); } } - NamedList checkResults = new NamedList<>(); - ShardHandler checkHandler = ccc.newShardHandler(); - ShardRequestTracker checkTracker = - CollectionHandlingUtils.syncRequestTracker(adminCmdContext, ccc); - for (Replica replica : liveReplicas) { - checkTracker.sendShardRequest( - replica, buildCheckOnlyParams(replica.getCoreName()), checkHandler); + if (!toCheck.isEmpty()) { + NamedList checkResults = new NamedList<>(); + ShardHandler checkHandler = ccc.newShardHandler(); + ShardRequestTracker checkTracker = + CollectionHandlingUtils.syncRequestTracker(adminCmdContext, ccc); + for (Replica replica : toCheck) { + checkTracker.sendShardRequest( + replica, buildCheckOnlyParams(replica.getCoreName()), checkHandler); + } + // abortOnError=false: transient request failures are treated as "not yet converged" + // rather than fatal errors that would permanently leave the collection read-only. + checkTracker.processResponses( + checkResults, + checkHandler, + false, + "Replica validation failed for shard " + shardName + " during UPGRADECOLLECTIONINDEX"); + + updateConfirmedConverged(checkResults, confirmedConverged); } - // abortOnError=false: transient request failures are treated as "not yet converged" - // rather than fatal errors that would permanently leave the collection read-only. - checkTracker.processResponses( - checkResults, - checkHandler, - false, - "Replica validation failed for shard " + shardName + " during UPGRADECOLLECTIONINDEX"); - - List pendingReplicas = getPendingReplicaUpgrades(checkResults); + + // Pending = ALL replicas of the shard (live or not) not yet confirmed. A down replica + // is still unverified and must eventually confirm before readOnly is cleared. + List pendingReplicas = + allReplicas.stream() + .map(CollectionHandlingUtils::requestKey) + .filter(key -> !confirmedConverged.contains(key)) + .toList(); + if (pendingReplicas.isEmpty()) { shardResult.add( - "validatedReplicas", liveReplicas.stream().map(Replica::getCoreName).toList()); + "validatedReplicas", allReplicas.stream().map(Replica::getCoreName).toList()); return; } @@ -322,40 +347,23 @@ private void waitForShardReplicaConvergence( } /** - * Returns the keys of replicas that have not yet converged to the upgraded index format. - * - *

Replicas in the "success" map are pending if their {@code indexUpgraded} field is not {@code - * true}. Replicas in the "failure" map (those whose {@code checkOnly} request failed) are treated - * as pending — a transient failure does not mean the replica has converged. + * Adds to {@code confirmedConverged} the keys of replicas that responded with {@code + * indexUpgraded=true} in this poll round. */ - private List getPendingReplicaUpgrades(NamedList checkResults) { - List pendingReplicas = new ArrayList<>(); - + private void updateConfirmedConverged( + NamedList checkResults, Set confirmedConverged) { @SuppressWarnings("unchecked") SimpleOrderedMap successes = (SimpleOrderedMap) checkResults.get("success"); - if (successes != null) { - for (int i = 0; i < successes.size(); i++) { - Object responseObj = successes.getVal(i); - boolean indexUpgraded = - (responseObj instanceof NamedList response) - && Boolean.TRUE.equals(response.get("indexUpgraded")); - if (!indexUpgraded) { - pendingReplicas.add(successes.getName(i)); - } - } + if (successes == null) { + return; } - - // Replicas that returned an error are treated as not-yet-converged. A transient error - // (network glitch, restarting node) should not permanently leave the collection read-only. - @SuppressWarnings("unchecked") - SimpleOrderedMap failures = (SimpleOrderedMap) checkResults.get("failure"); - if (failures != null) { - for (int i = 0; i < failures.size(); i++) { - pendingReplicas.add(failures.getName(i)); + for (int i = 0; i < successes.size(); i++) { + Object responseObj = successes.getVal(i); + if ((responseObj instanceof NamedList response) + && Boolean.TRUE.equals(response.get("indexUpgraded"))) { + confirmedConverged.add(successes.getName(i)); } } - - return pendingReplicas; } /** From 944f604ce8f49976f7160a331cb985ebb2ea7e28 Mon Sep 17 00:00:00 2001 From: Rahul Goswami Date: Fri, 3 Apr 2026 02:35:59 -0400 Subject: [PATCH 6/6] setVersion() on AddUpdateCommand for tlog consistency --- .../apache/solr/handler/admin/api/UpgradeCoreIndex.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java index e899e7ee411c..c3ea1844a794 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/UpgradeCoreIndex.java @@ -41,6 +41,8 @@ import org.apache.solr.client.api.model.UpgradeCoreIndexResponse; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; +import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.UpdateParams; import org.apache.solr.common.util.NamedList; @@ -544,6 +546,12 @@ private void processSegment( AddUpdateCommand currDocCmd = new AddUpdateCommand(solrRequest); currDocCmd.solrDoc = solrDoc; + // Preserve the original _version_ on the command so that the tlog entry + // is consistent with the indexed document's _version_ field. + SolrInputField versionField = solrDoc.getField(CommonParams.VERSION_FIELD); + if (versionField != null) { + currDocCmd.setVersion(((Number) versionField.getValue()).longValue()); + } processor.processAdd(currDocCmd); } } finally {