Skip to content

Commit cdd3ab7

Browse files
authored
Merge pull request #101 from kermitt2/bugfix/refactor-reference-parsing
Refactor IO handling, fix assertions, fix reference parsing (1)
2 parents 87d8f49 + 96663de commit cdd3ab7

3 files changed

Lines changed: 43 additions & 23 deletions

File tree

.github/workflows/ci-build.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ jobs:
2222
python-version: ${{ matrix.python-version }}
2323
cache: 'pip'
2424
- name: Cleanup more disk space
25-
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
25+
run: |
26+
sudo rm -rf /usr/share/dotnet
27+
sudo rm -rf /opt/ghc
28+
sudo rm -rf "/usr/local/share/boost"
29+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
2630
- name: Install dependencies
2731
run: |
2832
python -m pip install --upgrade pip

grobid_client/format/TEI2LossyJSON.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import uuid
1010
from collections import OrderedDict
1111
from concurrent.futures import ProcessPoolExecutor, as_completed
12+
import html
13+
import re
1214
from pathlib import Path
1315
from typing import Dict, Union, BinaryIO, Iterator
1416

@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
4143
If stream=False returns the full document dict (same shape as original function).
4244
"""
4345
# Load with BeautifulSoup but avoid building huge structures when streaming
44-
with open(tei_file, 'r') as f:
45-
content = f.read()
46+
if hasattr(tei_file, 'read'):
47+
# File-like object (BinaryIO/StringIO)
48+
content = tei_file.read()
49+
if isinstance(content, bytes):
50+
content = content.decode('utf-8')
51+
else:
52+
# Path-like object
53+
with open(tei_file, 'r', encoding='utf-8') as f:
54+
content = f.read()
4655
soup = BeautifulSoup(content, 'xml')
4756

4857
if soup.TEI is None:
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
222231
Extract detailed bibliographic information from TEI biblStruct elements.
223232
Implements comprehensive parsing for all standard TEI bibliographic components.
224233
"""
225-
import re
226234

227235
citation_data = OrderedDict()
228236
citation_data['id'] = f"b{index}"
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):
430438

431439
def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict):
432440
"""Extract and process imprint information including publisher, dates, and page ranges."""
433-
import re
434441

435442
# Extract publisher information
436443
publisher_elements = imprint_element.find_all("publisher")
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
557564
Extract person data (author/editor) from TEI persName or author elements.
558565
Handles various name formats and affiliations.
559566
"""
560-
import re
561567

562568
person_data = {}
563569

@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
628634
text = text.decode('utf-8', errors='ignore')
629635

630636
# Normalize whitespace and strip
631-
import re
632637
text = re.sub(r'\s+', ' ', text.strip())
633638

634639
# Remove any potential XML/HTML entities
635-
import html
636640
text = html.unescape(text)
637641

638642
return text
@@ -740,15 +744,15 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
740744
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
741745
if self.validate_refs:
742746
for ref in struct['refs']:
743-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
744-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
747+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
748+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
745749
yield struct
746750
else:
747751
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
748752
if self.validate_refs:
749753
for ref in struct['refs']:
750-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
751-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
754+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
755+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
752756
yield struct
753757

754758
# Update head_paragraph for potential next div

grobid_client/format/TEI2Markdown.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@
1111
- Annex
1212
- References
1313
"""
14-
import os
15-
import uuid
14+
import re
1615
from pathlib import Path
1716
from typing import List, Dict, Union, Optional, BinaryIO
1817
from bs4 import BeautifulSoup, NavigableString, Tag
@@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
4443
try:
4544
# Load with BeautifulSoup
4645
if isinstance(tei_file, (str, Path)):
47-
content = open(tei_file, 'r', encoding='utf-8').read()
46+
with open(tei_file, 'r', encoding='utf-8') as f:
47+
content = f.read()
4848
else:
4949
content = tei_file.read()
50+
if isinstance(content, bytes):
51+
content = content.decode('utf-8')
5052

5153
soup = BeautifulSoup(content, 'xml')
5254

@@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
7779
# Extract publication date
7880
pub_date = self._extract_publication_date(soup)
7981
if pub_date:
80-
markdown_sections.append(f"Publishd on {pub_date}\n\n")
82+
markdown_sections.append(f"Published on {pub_date}\n\n")
8183

8284
# Extract abstract
8385
abstract = self._extract_abstract(soup)
@@ -511,17 +513,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None:
511513
unit = bibl_scope.get("unit", "").lower()
512514
text = bibl_scope.get_text().strip()
513515

514-
if unit == "vol" and text:
516+
if unit in ["vol", "volume"] and text:
515517
bib_data['volume'] = text
516518
elif unit == "issue" and text:
517519
bib_data['issue'] = text
518520
elif unit == "page" and text:
519521
# Handle page ranges
520-
if "from" in bibl_scope.attrs:
521-
bib_data['pages'] = f"{text}-"
522-
elif "to" in bibl_scope.attrs and bib_data.get('pages'):
523-
bib_data['pages'] += text
524-
else:
522+
from_val = bibl_scope.get("from")
523+
to_val = bibl_scope.get("to")
524+
if from_val and to_val:
525+
# Both from and to in same element
526+
bib_data['pages'] = f"{from_val}-{to_val}"
527+
elif from_val:
528+
# Only from specified, may get combined with another element
529+
bib_data['pages'] = f"{from_val}-"
530+
elif to_val and bib_data.get('pages'):
531+
# Only to specified, append to existing from
532+
bib_data['pages'] = bib_data['pages'] + to_val
533+
elif text and not bib_data.get('pages'):
534+
# Plain text, no from/to attributes
525535
bib_data['pages'] = text
526536

527537
def _extract_author_info(self, author: Tag) -> dict:
@@ -629,6 +639,9 @@ def _build_publication_details(self, ref_data: dict) -> str:
629639
"""Build publication details string from extracted data."""
630640
details = []
631641

642+
if ref_data.get('year'):
643+
details.append(f"({ref_data['year']})")
644+
632645
if ref_data.get('volume'):
633646
details.append(ref_data['volume'])
634647

@@ -684,7 +697,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str:
684697
raw_text = bibl_struct.get_text().strip()
685698

686699
# Remove reference number if present
687-
import re
688700
raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)
689701

690702
# Clean up excessive whitespace

0 commit comments

Comments (0)