diff --git a/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml new file mode 100644 index 00000000000..1ce13b101d4 --- /dev/null +++ b/changelog/unreleased/SOLR-18176-shardhandler-bottleneck.yml @@ -0,0 +1,9 @@ +title: Fix query throughput bottleneck caused by uncached ZooKeeper get calls for queries with explicit 'collection' parameter +type: changed +authors: + - name: Matthew Biscocho +links: + - name: SOLR-18176 + url: https://issues.apache.org/jira/browse/SOLR-18176 + - name: SOLR-15352 + url: https://issues.apache.org/jira/browse/SOLR-15352 diff --git a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java index 836d0951f06..5315d413f42 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java +++ b/solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java @@ -109,12 +109,13 @@ private void withShardsParam(Builder builder, String shardsParam) { if (sliceOrUrl.indexOf('/') < 0) { // this is a logical shard this.slices[i] = sliceOrUrl; - replicas[i] = - findReplicas( - builder, - shardsParam, - clusterState, - clusterState.getCollection(builder.collection).getSlice(sliceOrUrl)); + DocCollection coll = clusterState.getCollectionOrNull(builder.collection, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to resolve replicas: " + builder.collection); + } + replicas[i] = findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl)); } else { // this has urls this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true); @@ -189,7 +190,12 @@ private void addSlices( String collectionName, String shardKeys, boolean multiCollection) { - DocCollection coll = state.getCollection(collectionName); + DocCollection coll = state.getCollectionOrNull(collectionName, true); + if (coll == null) { + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "Could not find collection to add slices: " + collectionName); + } Collection slices = coll.getRouter().getSearchSlices(shardKeys, params, coll); ClientUtils.addSlices(target, collectionName, slices, multiCollection); } diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java index 349d6dda711..9387d8680b7 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java @@ -24,15 +24,19 @@ import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.solr.BaseDistributedSearchTestCase; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.jetty.HttpJettySolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.SolrQuery; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.cloud.SolrCloudTestCase; +import org.apache.solr.common.cloud.SolrZKMetricsListener; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ShardParams; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; +import org.apache.solr.embedded.JettySolrRunner; import org.junit.BeforeClass; import org.junit.Test; @@ -707,6 +711,57 @@ private QueryResponse queryWithAsserts(String... q) throws Exception { return response; } + /** + * When a node resolves collection state for a collection it doesn't host, queries should use + * cached state and not make ZK calls on every query. + */ + @Test + public void testDistributedQueryDoesNotReadFromZk() throws Exception { + final String secondColl = "secondColl"; + + // Create a collection on only 1 node so the other node uses LazyCollectionRef for state + List jettys = cluster.getJettySolrRunners(); + CollectionAdminRequest.createCollection(secondColl, "conf", 1, 1) + .setCreateNodeSet(jettys.get(0).getNodeName()) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + cluster + .getZkStateReader() + .waitForState( + secondColl, + DEFAULT_TIMEOUT, + TimeUnit.SECONDS, + (n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1)); + + try { + // Node 1 hosts COLLECTION but not secondColl. + // Send a multi-collection query to trigger LazyCollectionRef get call + JettySolrRunner nodeWithoutSecondColl = jettys.get(1); + try (SolrClient client = + new HttpJettySolrClient.Builder(nodeWithoutSecondColl.getBaseUrl().toString()).build()) { + + String collectionsParameter = COLLECTION + "," + secondColl; + + // Warm up LazyCollectionRef state cache with query + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + + // Get ZK metrics from the coordinator node (the one we're querying) + SolrZKMetricsListener metrics = + nodeWithoutSecondColl.getCoreContainer().getZkController().getZkClient().getMetrics(); + long existsBefore = metrics.getExistsChecks(); + + // Query again and assert that exists call is not made + client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter)); + assertEquals( + "Query should not cause ZK exists checks as collection state should be cached", + existsBefore, + metrics.getExistsChecks()); + } + } finally { + CollectionAdminRequest.deleteCollection(secondColl) + .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT); + } + } + private int getNumRequests( Map> requests) { int beforeNumRequests = 0;