From f5b9e13dbc693a25487ea1adc0c0a2757d80df96 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Thu, 9 Apr 2026 11:46:08 -0700 Subject: [PATCH 1/5] fix bugs in batched retrieval --- mp_api/client/core/client.py | 52 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/mp_api/client/core/client.py b/mp_api/client/core/client.py index e2127b22..181f6a89 100644 --- a/mp_api/client/core/client.py +++ b/mp_api/client/core/client.py @@ -805,14 +805,15 @@ def _query_resource( except RequestException as ex: raise MPRestError(str(ex)) - def _submit_requests( # noqa + def _submit_requests( self, - url, - criteria, - use_document_model, - chunk_size, - num_chunks=None, - timeout=None, + url: str, + criteria: dict[str, Any], + use_document_model: bool, + chunk_size: int, + num_chunks: int | None = None, + timeout: float | None = None, + max_batch_size: int = 100, ) -> dict: """Handle submitting requests sequentially with pagination. @@ -820,12 +821,13 @@ def _submit_requests( # noqa split them into multiple sequential requests and combine results. Arguments: - criteria: dictionary of criteria to filter down - url: url used to make request - use_document_model: if None, will defer to the self.use_document_model attribute - num_chunks: Maximum number of chunks of data to yield. None will yield all possible. - chunk_size: Number of data entries per chunk. - timeout: Time in seconds to wait until a request timeout error is thrown + url (str): url used to make request + criteria (dict of str): dictionary of criteria to filter down + use_document_model (bool): whether to use the document model + num_chunks (int or None): Maximum number of chunks of data to yield. None will yield all possible. + chunk_size (int): Number of data entries per chunk. + timeout (float): Time in seconds to wait until a request timeout error is thrown + max_batch_size (int) : Maximum size of a batch for when retrieving batches in parallel Returns: Dictionary containing data and metadata @@ -884,14 +886,6 @@ def _submit_requests( # noqa timeout=timeout, ) - # Check if we got 0 results - some parameters are silently ignored by the API - # when passed as comma-separated values, so we need to split them anyway - if total_num_docs == 0 and len(split_values) > 1: - # Treat this the same as a 422 error - split into batches - raise MPRestError( - "Got 0 results for comma-separated parameter, will try splitting" - ) - # If successful, continue with normal pagination data_chunks = [data["data"]] total_data: dict[str, Any] = {"data": []} @@ -904,17 +898,25 @@ def _submit_requests( # noqa except MPRestError as e: # If we get 422 or 414 error, or 0 results for comma-separated params, split into batches - if any(trace in str(e) for trace in ("422", "414", "Got 0 results")): + if any( + trace in str(e) + for trace in ( + "422", + "414", + ) + ): total_data = {"data": []} total_num_docs = 0 data_chunks = [] # Batch the split values to reduce number of requests # Use batches of up to 100 values to balance URL length and request count - batch_size = min(100, max(1, len(split_values) // 10)) + num_batches = min( + max_batch_size, max(1, len(split_values) // max_batch_size) + ) + batch_size = min(len(split_values), max_batch_size) # Setup progress bar for split parameter requests - num_batches = ceil(len(split_values) / batch_size) pbar_message = f"Retrieving {len(split_values)} {split_param} values in {num_batches} batches" pbar = ( tqdm( @@ -998,7 +1000,7 @@ def _submit_requests( # noqa desc=pbar_message, total=num_docs_needed, ) - if not self.mute_progress_bars + if not self.mute_progress_bars and total_num_docs > 0 else None ) From 4c82145950a7f74d104f06e92b0d37b3c03e10c1 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Thu, 9 Apr 2026 11:57:08 -0700 Subject: [PATCH 2/5] recursive guard --- mp_api/client/core/client.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mp_api/client/core/client.py b/mp_api/client/core/client.py index 181f6a89..9ac5fc66 100644 --- a/mp_api/client/core/client.py +++ b/mp_api/client/core/client.py @@ -814,6 +814,7 @@ def _submit_requests( num_chunks: int | None = None, timeout: float | None = None, max_batch_size: int = 100, + norecur: bool = False, ) -> dict: """Handle submitting requests sequentially with pagination. @@ -827,7 +828,9 @@ def _submit_requests( num_chunks (int or None): Maximum number of chunks of data to yield. None will yield all possible. chunk_size (int): Number of data entries per chunk. timeout (float): Time in seconds to wait until a request timeout error is thrown - max_batch_size (int) : Maximum size of a batch for when retrieving batches in parallel + max_batch_size (int) : Maximum size of a batch when retrieving batches in parallel + norecur (bool) : Whether to forbid recursive splitting of a query field + when a direct query fails Returns: Dictionary containing data and metadata @@ -897,8 +900,8 @@ def _submit_requests( # Continue with pagination if needed (handled below) except MPRestError as e: - # If we get 422 or 414 error, or 0 results for comma-separated params, split into batches - if any( + # If we get 422 or 414 error, split into batches + if not norecur and any( trace in str(e) for trace in ( "422", @@ -940,6 +943,7 @@ def _submit_requests( chunk_size=chunk_size, num_chunks=num_chunks, timeout=timeout, + norecur=len(batch) <= max_batch_size, ) data_chunks.append(result["data"]) From c6078142091ae6dba925b5dd9c5f9e643b5c9e45 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Thu, 9 Apr 2026 14:52:20 -0700 Subject: [PATCH 3/5] temporarily reorganize symmetry query around splitting into separate query chunks --- mp_api/client/core/client.py | 20 +++++++++-------- mp_api/client/routes/materials/summary.py | 27 ++++++++++++++++++++--- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/mp_api/client/core/client.py b/mp_api/client/core/client.py index 9ac5fc66..17b62bfa 100644 --- a/mp_api/client/core/client.py +++ b/mp_api/client/core/client.py @@ -659,7 +659,7 @@ def _query_resource( use_document_model: if None, will defer to the self.use_document_model attribute num_chunks: Maximum number of chunks of data to yield. None will yield all possible. chunk_size: Number of data entries per chunk. - timeout : Time in seconds to wait until a request timeout error is thrown + timeout (float or None): Time in seconds to wait until a request timeout error is thrown Returns: A Resource, a dict with two keys, "data" containing a list of documents, and @@ -810,9 +810,9 @@ def _submit_requests( url: str, criteria: dict[str, Any], use_document_model: bool, - chunk_size: int, + chunk_size: int | None, num_chunks: int | None = None, - timeout: float | None = None, + timeout: int | None = None, max_batch_size: int = 100, norecur: bool = False, ) -> dict: @@ -826,8 +826,8 @@ def _submit_requests( criteria (dict of str): dictionary of criteria to filter down use_document_model (bool): whether to use the document model num_chunks (int or None): Maximum number of chunks of data to yield. None will yield all possible. - chunk_size (int): Number of data entries per chunk. - timeout (float): Time in seconds to wait until a request timeout error is thrown + chunk_size (int or None): Number of data entries per chunk. + timeout (int or None): Time in seconds to wait until a request timeout error is thrown max_batch_size (int) : Maximum size of a batch when retrieving batches in parallel norecur (bool) : Whether to forbid recursive splitting of a query field when a direct query fails @@ -985,6 +985,12 @@ def _submit_requests( if "meta" in data: total_data["meta"] = data["meta"] + # otherwise, paginate sequentially + if chunk_size is None or chunk_size < 1: + raise ValueError( + "A positive chunk size must be provided to enable pagination" + ) + # Get max number of response pages max_pages = ( num_chunks if num_chunks is not None else ceil(total_num_docs / chunk_size) @@ -1024,10 +1030,6 @@ def _submit_requests( pbar.close() return new_total_data - # otherwise, paginate sequentially - if chunk_size is None: - raise ValueError("A chunk size must be provided to enable pagination") - # Warning to select specific fields only for many results if criteria.get("_all_fields", False) and (total_num_docs / chunk_size > 10): warnings.warn( diff --git a/mp_api/client/routes/materials/summary.py b/mp_api/client/routes/materials/summary.py index 1769781e..7cb2f2db 100644 --- a/mp_api/client/routes/materials/summary.py +++ b/mp_api/client/routes/materials/summary.py @@ -2,6 +2,7 @@ import warnings from collections import defaultdict +from itertools import chain, product from emmet.core.summary import HasProps, SummaryDoc from emmet.core.symmetry import CrystalSystem @@ -200,8 +201,9 @@ def search( # noqa: D417 mmnd_inv = {v: k for k, v in min_max_name_dict.items() if k != v} # Set user query params from `locals` + _locals = locals() user_settings = { - k: v for k, v in locals().items() if k in min_max_name_dict and v + k: v for k, v in _locals.items() if k in min_max_name_dict and v } # Check to see if user specified _search fields using **kwargs, @@ -328,10 +330,11 @@ def _csrc(x): "spacegroup_number": 230, "spacegroup_symbol": 230, } + batched_symm_query = {} for k, cardinality in symm_cardinality.items(): - if isinstance(symm_vals := locals().get(k), list | tuple | set): + if isinstance(symm_vals := _locals.get(k), list | tuple | set): if len(symm_vals) < cardinality // 2: - query_params.update({k: ",".join(str(v) for v in symm_vals)}) + batched_symm_query[k] = symm_vals else: raise MPRestError( f"Querying `{k}` by a list of values is only " @@ -378,6 +381,24 @@ def _csrc(x): if query_params[entry] is not None } + if batched_symm_query: + ordered_symm_key = sorted(batched_symm_query) + return list( + chain.from_iterable( + self._search( # type: ignore[return-value] + num_chunks=num_chunks, + chunk_size=chunk_size, + all_fields=all_fields, + fields=fields, + **query_params, + **{sk: symm_params[i] for i, sk in enumerate(ordered_symm_key)}, + ) + for symm_params in product( + *[batched_symm_query[k] for k in ordered_symm_key] + ) + ) + ) + return super()._search( # type: ignore[return-value] num_chunks=num_chunks, chunk_size=chunk_size, From d26d52ebe76c0fa3e6d37146ef4a846112896557 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Tue, 14 Apr 2026 11:59:02 -0700 Subject: [PATCH 4/5] cleanup --- mp_api/client/mprester.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mp_api/client/mprester.py b/mp_api/client/mprester.py index ece9d7bc..c2dfefde 100644 --- a/mp_api/client/mprester.py +++ b/mp_api/client/mprester.py @@ -1359,23 +1359,17 @@ def get_download_info( return meta, urls def _check_get_download_info_url_by_task_id(self, prefix, task_ids) -> list[str]: - nomad_exist_task_ids: list[str] = [] prefix = prefix.replace("/raw/query", "/repo/") - for task_id in task_ids: - url = prefix + task_id - if self._check_nomad_exist(url): - nomad_exist_task_ids.append(task_id) - return nomad_exist_task_ids + return [ + task_id for task_id in task_ids if self._check_nomad_exist(prefix + task_id) + ] @staticmethod def _check_nomad_exist(url) -> bool: response = get(url=url) if response.status_code != 200: return False - content = load_json(response.text) - if content["pagination"]["total"] == 0: - return False - return True + return load_json(response.text)["pagination"]["total"] != 0 @staticmethod def _print_help_message(nomad_exist_task_ids, task_ids, file_patterns, calc_types): From 4495ef9d85cdfd59c5d7921a80f76a5b15bab4f5 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Tue, 14 Apr 2026 12:12:35 -0700 Subject: [PATCH 5/5] add regression test --- tests/client/test_core_client.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/client/test_core_client.py b/tests/client/test_core_client.py index 89c83c67..2fffae46 100644 --- a/tests/client/test_core_client.py +++ b/tests/client/test_core_client.py @@ -94,3 +94,21 @@ def test_warnings_exceptions(): with pytest.raises(MPRestError, match="Number of chunks must be greater than zero"): MaterialsRester()._get_all_documents({}, num_chunks=-1) + + +def test_regression_batching(): + # See https://github.com/materialsproject/api/pull/1077 + # This test ensures that queries with batched input + # that return no results do not infinitely recur + + # This test should simply run if there is no regression + + num_idxs = 100 + assert ( + len( + MaterialsRester().search( + material_ids=[f"mp-{idx}" for idx in range(num_idxs)], deprecated=True + ) + ) + <= num_idxs + )