From 5dc01daad2d776163bc9daad93517a422fba9a21 Mon Sep 17 00:00:00 2001 From: andrei Date: Fri, 27 Feb 2026 12:45:03 +0100 Subject: [PATCH] feat: code documentation with examples --- server/workers/orcid/src/orcid_service.py | 39 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index fb0125c72..82517643f 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -229,6 +229,14 @@ def _prepare_dois_for_base_query(self, dois: List[str]) -> Tuple[List[str], Dict For each DOI that contains uppercase letters, this function adds a lowercase version to ensure case-insensitive matching in BASE search. + Example: + - Case 1: DOI = 10.1594/PANGAEA.982329 + - DOIs for BASE query = [10.1594/PANGAEA.982329, 10.1594/pangaea.982329] + - Added lowercase DOI = 10.1594/pangaea.982329 + - Case 2: DOI = 10.1038/s41586-025-0410-x + - DOIs for BASE query = [10.1038/s41586-025-0410-x] + - Added lowercase DOI = none (the DOI contains no uppercase letters) + Parameters: - dois: List of original DOIs from ORCID @@ -265,6 +273,18 @@ def _normalize_base_results_to_original_dois( If BASE returns results with lowercase DOI variants, this function maps them back to the original DOI format from ORCID to ensure proper merging. 
+ Example: + - Original DOIs from ORCID: ["10.1594/PANGAEA.982329"] + - DOIs sent to BASE (after `_prepare_dois_for_base_query`): + ["10.1594/PANGAEA.982329", "10.1594/pangaea.982329"] + - Suppose BASE returns rows with: + base_metadata['doi'] == ["10.1594/PANGAEA.982329", "10.1594/pangaea.982329"] + - And `doi_mapping` contains: + {"10.1594/pangaea.982329": ["10.1594/PANGAEA.982329"]} + - After `_normalize_base_results_to_original_dois`: + base_metadata['doi'] == ["10.1594/PANGAEA.982329", "10.1594/PANGAEA.982329"] + - Both DOI variants are now normalized to the original format from ORCID + Parameters: - base_metadata: DataFrame with results from BASE - doi_mapping: Mapping from lowercase DOI to list of original DOIs @@ -296,10 +316,10 @@ def _match_dois_by_version( original_dois: List[str], ) -> pd.DataFrame: """ - Match BASE results that have versioned DOIs (e.g. .v1, .v2) to original DOIs without version. + Match BASE results that have versioned DOIs (e.g. `.v1`, `.v2`) to original DOIs without version. If BASE returned a DOI with a version suffix but the original ORCID DOI is without version, - this function updates the base_metadata 'doi' column so that those rows match the original + this function updates the `base_metadata['doi']` column so that those rows match the original DOI for merging. Parameters: @@ -308,6 +328,21 @@ def _match_dois_by_version( Returns: - DataFrame with 'doi' updated where versioned variants were matched to original DOIs + + Example: + - Original DOIs from ORCID: ["10.1000/example"] + - BASE returns: + base_metadata['doi'] == ["10.1000/example.v1", "10.1000/example.v2"] + - The function builds the intermediate mapping: + base_unversioned_to_versioned == { + "10.1000/example": ["10.1000/example.v1", "10.1000/example.v2"] + } + - Since "10.1000/example" is in `original_dois`, but not in `dois_received`, + the function considers it lost (`dois_lost`) and finds versioned variants for it. 
+ - After `_match_dois_by_version`: + base_metadata['doi'] == ["10.1000/example", "10.1000/example"] + (both versioned records are now bound to the original DOI without version, + and are further processed as a group of duplicates for "10.1000/example"). """ pattern_doi_version = re.compile(r"\.v(\d)+$")