diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index e138ee274..055badf92 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -345,50 +345,6 @@ def get_unversioned_doi(doi_str): return base_metadata - def _explode_merged_dois(self, base_metadata: pd.DataFrame) -> pd.DataFrame: - """ - Explode merged_dois field to create separate rows for each DOI variant. - - If base_metadata contains a 'merged_dois' field with multiple DOIs separated by '; ', - this function creates separate rows for each DOI, allowing matching with ORCID metadata - by any of those DOIs. - - Parameters: - - base_metadata: DataFrame with BASE metadata, potentially containing 'merged_dois' field - - Returns: - - DataFrame with exploded rows where each row has a single DOI in the 'doi' column - """ - # Process merged_dois: explode to create separate rows for each DOI variant - # This allows us to match BASE records with multiple DOIs to ORCID records by any of those DOIs - if 'merged_dois' in base_metadata.columns: - # Split merged_dois by "; " and create a list of DOIs for each row - # If merged_dois is empty/NaN, create empty list; otherwise split and process each DOI - base_metadata['merged_dois_list'] = base_metadata['merged_dois'].apply( - lambda x: [remove_doi_prefix(doi.strip()) for doi in str(x).split('; ') if doi.strip()] - if pd.notna(x) and str(x).strip() else [] - ) - - # Use explode to create separate rows for each DOI in merged_dois_list - # Rows with empty lists will remain as single rows - base_metadata = base_metadata.explode('merged_dois_list', ignore_index=True) - - # For rows where merged_dois_list is not empty, use it as the DOI - # For rows where merged_dois_list is empty/NaN, use the regular doi column - mask_has_merged_doi = pd.notna(base_metadata['merged_dois_list']) & (base_metadata['merged_dois_list'] != '') - base_metadata.loc[mask_has_merged_doi, 'doi'] = base_metadata.loc[mask_has_merged_doi, 'merged_dois_list'] - - # Process regular doi column for rows without merged_dois - base_metadata.loc[~mask_has_merged_doi, 'doi'] = base_metadata.loc[~mask_has_merged_doi, 'doi'].apply(remove_doi_prefix) - - # Drop temporary column - base_metadata = base_metadata.drop(columns=['merged_dois_list']) - else: - # No merged_dois column, process regular doi column - base_metadata.loc[:, 'doi'] = base_metadata['doi'].apply(remove_doi_prefix) - - return base_metadata - def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd.DataFrame: self.logger.debug(f"Enriching metadata with base for ORCID {params.get('orcid')}") @@ -437,7 +393,6 @@ def enrich_metadata_with_base(self, params: Dict[str, str], metadata: pd.DataFra base_metadata = base_metadata.reindex(columns=required_fields) - #base_metadata = self._explode_merged_dois(base_metadata) base_metadata['merged_dois'] = base_metadata['merged_dois'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x) base_metadata['merged_dois'] = base_metadata['merged_dois'].apply(lambda x: x.split(';') if isinstance(x, str) else []) base_metadata['merged_dois'] = base_metadata['merged_dois'].apply(lambda x: [x.strip() for x in x] if isinstance(x, list) else x)