From 3c0574693cfbea882c64db0d6d4786ac019c5225 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Tue, 24 Mar 2026 16:12:18 -0400 Subject: [PATCH 1/5] Get collection state from cache --- .../handler/component/CloudReplicaSource.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java index 836d0951f06..bf8cb48d797 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java +++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java @@ -109,12 +109,14 @@ private void withShardsParam(Builder builder, String shardsParam) { if (sliceOrUrl.indexOf('/') < 0) { // this is a logical shard this.slices[i] = sliceOrUrl; + DocCollection coll = clusterState.getCollectionOrNull(builder.collection, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to resolve replicas: " + builder.collection); + } replicas[i] = - findReplicas( - builder, - shardsParam, - clusterState, - clusterState.getCollection(builder.collection).getSlice(sliceOrUrl)); + findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); } else { // this has urls this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true); @@ -189,7 +191,11 @@ private void addSlices( String collectionName, String shardKeys, boolean multiCollection) { - DocCollection coll = state.getCollection(collectionName); + DocCollection coll = state.getCollectionOrNull(collectionName, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to add slices: " + collectionName); + } Collection slices = coll.getRouter().getSearchSlices(shardKeys, params, coll); ClientUtils.addSlices(target, collectionName, slices, multiCollection); } From b2c817d584f5521ea83c14d4d48808b738ec02f0 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Wed, 25 Mar 2026 11:10:50 -0400 Subject: [PATCH 2/5] Add changelog --- .../unreleased/SOLR-18176-shardhandler-bottleneck.yml | 7 +++++++ .../apache/solr/handler/component/CloudReplicaSource.java | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml new file mode 100644 index 00000000000..52ba15ba0ce --- /dev/null +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -0,0 +1,7 @@ +title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached +type: fixed +authors: + - name: Matthew Biscocho +links: + - name: SOLR-18176 + url: https://issues.apache.org/jira/browse/SOLR-18176 diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java index bf8cb48d797..5315d413f42 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java +++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java @@ -115,8 +115,7 @@ private void withShardsParam(Builder builder, String shardsParam) { SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to resolve replicas: " + builder.collection); } - replicas[i] = - findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); + replicas[i] = findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); } else { // this has urls this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true); @@ -194,7 +193,8 @@ private void addSlices( DocCollection coll = state.getCollectionOrNull(collectionName, true); if (coll == null) { throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, "Could not find collection to add slices: " + collectionName); + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to add slices: " + collectionName); } Collection slices = coll.getRouter().getSearchSlices(shardKeys, params, coll); ClientUtils.addSlices(target, collectionName, slices, multiCollection); From 14483dd117264560d8e95acc2ce666fe0d7b49b1 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Mon, 30 Mar 2026 16:41:22 -0400 Subject: [PATCH 3/5] Add test --- .../SOLR-18176-shardhandler-bottleneck.yml | 2 +- ...ributedQueryComponentOptimizationTest.java | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml index 52ba15ba0ce..167b1b59e24 100644 --- a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -1,5 +1,5 @@ title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached -type: fixed +type: changed authors: - name: Matthew Biscocho links: diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 349d6dda711..186589d192f 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -24,15 +24,19 @@ import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.solr.BaseDistributedSearchTestCase; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.jetty.HttpJettySolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.SolrQuery; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.cloud.SolrZKMetricsListener; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ShardParams; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; +import org.apache.solr.embedded.JettySolrRunner; import org.junit.BeforeClass; import org.junit.Test; @@ -707,6 +711,55 @@ private QueryResponse queryWithAsserts(String... q) throws Exception { return response; } + /** + * When a node resolves collection state for a collection it doesn't host, queries should use + * cached state and not make ZK calls on every query. + */ + @Test + public void testDistributedQueryDoesNotReadFromZk() throws Exception { + final String testCollection = "testCollection"; + + // Create a collection on only 1 node so the other node uses LazyCollectionRef for state + List jettys = cluster.getJettySolrRunners(); + CollectionAdminRequest.createCollection(testCollection, "conf", 1, 1) + .setCreateNodeSet(jettys.get(0).getNodeName()) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + cluster + .getZkStateReader() + .waitForState( + testCollection, + DEFAULT_TIMEOUT, + TimeUnit.SECONDS, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + try { + // Node 1 hosts COLLECTION but not testCollection. + // Send a multi-collection query to trigger LazyCollectionRef get call + JettySolrRunner nodeWithoutOther = jettys.get(1); + try (SolrClient client = + new HttpJettySolrClient.Builder(nodeWithoutOther.getBaseUrl().toString()).build()) { + + String collectionsParameter = COLLECTION + "," + testCollection; + + // Warm up LazyCollectionRef state cache with query + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + + SolrZKMetricsListener metrics = cluster.getZkStateReader().getZkClient().getMetrics(); + long existsBefore = metrics.getExistsChecks(); + + // Query again and assert that exists call is not made + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + assertEquals( + "Query should not cause ZK exists checks as collection state should be cached", + existsBefore, + metrics.getExistsChecks()); + } + } finally { + CollectionAdminRequest.deleteCollection(testCollection) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + } + } + private int getNumRequests( Map> requests) { int beforeNumRequests = 0; From 15c7534f3ee7ac0a5687bc04433beb71c532dd92 Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Wed, 1 Apr 2026 15:57:36 -0400 Subject: [PATCH 4/5] Address PR comments --- .../SOLR-18176-shardhandler-bottleneck.yml | 4 +++- ...ributedQueryComponentOptimizationTest.java | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml index 167b1b59e24..1ce13b101d4 100644 --- a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -1,7 +1,9 @@ -title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached +title: Fix query throughput bottleneck caused by uncached ZooKeeper get calls for queries with explicit 'collection' parameter type: changed authors: - name: Matthew Biscocho links: - name: SOLR-18176 url: https://issues.apache.org/jira/browse/SOLR-18176 + - name: SOLR-15352 + url: https://issues.apache.org/jira/browse/SOLR-15352 diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 186589d192f..65fca131af8 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -717,34 +717,40 @@ private QueryResponse queryWithAsserts(String... q) throws Exception { */ @Test public void testDistributedQueryDoesNotReadFromZk() throws Exception { - final String testCollection = "testCollection"; + final String secondColl = "secondColl"; // Create a collection on only 1 node so the other node uses LazyCollectionRef for state List jettys = cluster.getJettySolrRunners(); - CollectionAdminRequest.createCollection(testCollection, "conf", 1, 1) + CollectionAdminRequest.createCollection(secondColl, "conf", 1, 1) .setCreateNodeSet(jettys.get(0).getNodeName()) .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); cluster .getZkStateReader() .waitForState( - testCollection, + secondColl, DEFAULT_TIMEOUT, TimeUnit.SECONDS, (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); try { - // Node 1 hosts COLLECTION but not testCollection. + // Node 1 hosts COLLECTION but not secondColl. // Send a multi-collection query to trigger LazyCollectionRef get call - JettySolrRunner nodeWithoutOther = jettys.get(1); + JettySolrRunner nodeWithoutSecondColl = jettys.get(1); try (SolrClient client = - new HttpJettySolrClient.Builder(nodeWithoutOther.getBaseUrl().toString()).build()) { + new HttpJettySolrClient.Builder(nodeWithoutSecondColl.getBaseUrl().toString()).build()) { - String collectionsParameter = COLLECTION + "," + testCollection; + String collectionsParameter = COLLECTION + "," + secondColl; // Warm up LazyCollectionRef state cache with query client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); - SolrZKMetricsListener metrics = cluster.getZkStateReader().getZkClient().getMetrics(); + // Get ZK metrics from the coordinator node (the one we're querying) + SolrZKMetricsListener metrics = + nodeWithoutSecondColl + .getCoreContainer() + .getZkController() + .getZkClient() + .getMetrics(); long existsBefore = metrics.getExistsChecks(); // Query again and assert that exists call is not made @@ -755,7 +761,7 @@ public void testDistributedQueryDoesNotReadFromZk() throws Exception { metrics.getExistsChecks()); } } finally { - CollectionAdminRequest.deleteCollection(testCollection) + CollectionAdminRequest.deleteCollection(secondColl) .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); } } From bcd9ad8d5c124794e28e8eb3770d5416ccd701ee Mon Sep 17 00:00:00 2001 From: Matthew Biscocho Date: Thu, 2 Apr 2026 10:39:21 -0400 Subject: [PATCH 5/5] tidy --- .../DistributedQueryComponentOptimizationTest.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 65fca131af8..9387d8680b7 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -746,11 +746,7 @@ public void testDistributedQueryDoesNotReadFromZk() throws Exception { // Get ZK metrics from the coordinator node (the one we're querying) SolrZKMetricsListener metrics = - nodeWithoutSecondColl - .getCoreContainer() - .getZkController() - .getZkClient() - .getMetrics(); + nodeWithoutSecondColl.getCoreContainer().getZkController().getZkClient().getMetrics(); long existsBefore = metrics.getExistsChecks(); // Query again and assert that exists call is not made