apache · mlbiscoc · Mar 24, 2026 · Mar 25, 2026 · Mar 30, 2026 · Apr 1, 2026
diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml
@@ -0,0 +1,9 @@
+title: Fix query throughput bottleneck caused by uncached ZooKeeper get calls for queries with explicit 'collection' parameter
+type: changed
+authors:
+  - name: Matthew Biscocho
+links:
+  - name: SOLR-18176
+    url: https://issues.apache.org/jira/browse/SOLR-18176
+  - name: SOLR-15352
+    url: https://issues.apache.org/jira/browse/SOLR-15352
diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java
@@ -109,12 +109,13 @@ private void withShardsParam(Builder builder, String shardsParam) {
       if (sliceOrUrl.indexOf('/') < 0) {
         // this is a logical shard
         this.slices[i] = sliceOrUrl;
-        replicas[i] =
-            findReplicas(
-                builder,
-                shardsParam,
-                clusterState,
-                clusterState.getCollection(builder.collection).getSlice(sliceOrUrl));
+        DocCollection coll = clusterState.getCollectionOrNull(builder.collection, true);
+        if (coll == null) {
+          throw new SolrException(
+              SolrException.ErrorCode.BAD_REQUEST,
+              "Could not find collection to resolve replicas: " + builder.collection);
+        }
+        replicas[i] = findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl));
       } else {
         // this has urls
         this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true);
@@ -189,7 +190,12 @@ private void addSlices(
       String collectionName,
       String shardKeys,
       boolean multiCollection) {
-    DocCollection coll = state.getCollection(collectionName);
+    DocCollection coll = state.getCollectionOrNull(collectionName, true);
+    if (coll == null) {
+      throw new SolrException(
+          SolrException.ErrorCode.BAD_REQUEST,
+          "Could not find collection to add slices: " + collectionName);
+    }
     Collection<Slice> slices = coll.getRouter().getSearchSlices(shardKeys, params, coll);
     ClientUtils.addSlices(target, collectionName, slices, multiCollection);
   }

diff --git a/...src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/...src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java
@@ -24,15 +24,19 @@
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import org.apache.solr.BaseDistributedSearchTestCase;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.jetty.HttpJettySolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.SolrQuery;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.cloud.SolrZKMetricsListener;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.embedded.JettySolrRunner;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -707,6 +711,57 @@ private QueryResponse queryWithAsserts(String... q) throws Exception {
     return response;
   }
 
+  /**
+   * When a node resolves collection state for a collection it doesn't host, queries should use
+   * cached state and not make ZK calls on every query.
+   */
+  @Test
+  public void testDistributedQueryDoesNotReadFromZk() throws Exception {
+    final String secondColl = "secondColl";
+
+    // Create a collection on only 1 node so the other node uses LazyCollectionRef for state
+    List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
+    CollectionAdminRequest.createCollection(secondColl, "conf", 1, 1)
+        .setCreateNodeSet(jettys.get(0).getNodeName())
+        .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT);
+    cluster
+        .getZkStateReader()
+        .waitForState(
+            secondColl,
+            DEFAULT_TIMEOUT,
+            TimeUnit.SECONDS,
+            (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1));
+
+    try {
+      // Node 1 hosts COLLECTION but not secondColl.
+      // Send a multi-collection query to trigger LazyCollectionRef get call
+      JettySolrRunner nodeWithoutSecondColl = jettys.get(1);
+      try (SolrClient client =
+          new HttpJettySolrClient.Builder(nodeWithoutSecondColl.getBaseUrl().toString()).build()) {
+
+        String collectionsParameter = COLLECTION + "," + secondColl;
+
+        // Warm up LazyCollectionRef state cache with query
+        client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter));
+
+        // Get ZK metrics from the coordinator node (the one we're querying)
+        SolrZKMetricsListener metrics =
+            nodeWithoutSecondColl.getCoreContainer().getZkController().getZkClient().getMetrics();
+        long existsBefore = metrics.getExistsChecks();
+
+        // Query again and assert that exists call is not made
+        client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter));
+        assertEquals(
+            "Query should not cause ZK exists checks as collection state should be cached",
+            existsBefore,
+            metrics.getExistsChecks());
+      }
+    } finally {
+      CollectionAdminRequest.deleteCollection(secondColl)
+          .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT);
+    }
+  }
+
   private int getNumRequests(
       Map<String, List<TrackingShardHandlerFactory.ShardRequestAndParams>> requests) {
     int beforeNumRequests = 0;