Skip to content

Commit cdd3ab7

Browse files
authored
Merge pull request #101 from kermitt2/bugfix/refactor-reference-parsing
Refactor IO handling, fix assertions, fix reference parsing (1)
2 parents 87d8f49 + 96663de commit cdd3ab7

3 files changed

Lines changed: 43 additions & 23 deletions

File tree

.github/workflows/ci-build.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ jobs:
2222
python-version: ${{ matrix.python-version }}
2323
cache: 'pip'
2424
- name: Cleanup more disk space
25-
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
25+
run: |
26+
sudo rm -rf /usr/share/dotnet
27+
sudo rm -rf /opt/ghc
28+
sudo rm -rf "/usr/local/share/boost"
29+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
2630
- name: Install dependencies
2731
run: |
2832
python -m pip install --upgrade pip

grobid_client/format/TEI2LossyJSON.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import uuid
1010
from collections import OrderedDict
1111
from concurrent.futures import ProcessPoolExecutor, as_completed
12+
import html
13+
import re
1214
from pathlib import Path
1315
from typing import Dict, Union, BinaryIO, Iterator
1416

@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
4143
If stream=False returns the full document dict (same shape as original function).
4244
"""
4345
# Load with BeautifulSoup but avoid building huge structures when streaming
44-
with open(tei_file, 'r') as f:
45-
content = f.read()
46+
if hasattr(tei_file, 'read'):
47+
# File-like object (BinaryIO/StringIO)
48+
content = tei_file.read()
49+
if isinstance(content, bytes):
50+
content = content.decode('utf-8')
51+
else:
52+
# Path-like object
53+
with open(tei_file, 'r', encoding='utf-8') as f:
54+
content = f.read()
4655
soup = BeautifulSoup(content, 'xml')
4756

4857
if soup.TEI is None:
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
222231
Extract detailed bibliographic information from TEI biblStruct elements.
223232
Implements comprehensive parsing for all standard TEI bibliographic components.
224233
"""
225-
import re
226234

227235
citation_data = OrderedDict()
228236
citation_data['id'] = f"b{index}"
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):
430438

431439
def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict):
432440
"""Extract and process imprint information including publisher, dates, and page ranges."""
433-
import re
434441

435442
# Extract publisher information
436443
publisher_elements = imprint_element.find_all("publisher")
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
557564
Extract person data (author/editor) from TEI persName or author elements.
558565
Handles various name formats and affiliations.
559566
"""
560-
import re
561567

562568
person_data = {}
563569

@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
628634
text = text.decode('utf-8', errors='ignore')
629635

630636
# Normalize whitespace and strip
631-
import re
632637
text = re.sub(r'\s+', ' ', text.strip())
633638

634639
# Remove any potential XML/HTML entities
635-
import html
636640
text = html.unescape(text)
637641

638642
return text
@@ -740,15 +744,15 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
740744
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
741745
if self.validate_refs:
742746
for ref in struct['refs']:
743-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
744-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
747+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
748+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
745749
yield struct
746750
else:
747751
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
748752
if self.validate_refs:
749753
for ref in struct['refs']:
750-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
751-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
754+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
755+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
752756
yield struct
753757

754758
# Update head_paragraph for potential next div

grobid_client/format/TEI2Markdown.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@
1111
- Annex
1212
- References
1313
"""
14-
import os
15-
import uuid
14+
import re
1615
from pathlib import Path
1716
from typing import List, Dict, Union, Optional, BinaryIO
1817
from bs4 import BeautifulSoup, NavigableString, Tag
@@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
4443
try:
4544
# Load with BeautifulSoup
4645
if isinstance(tei_file, (str, Path)):
47-
content = open(tei_file, 'r', encoding='utf-8').read()
46+
with open(tei_file, 'r', encoding='utf-8') as f:
47+
content = f.read()
4848
else:
4949
content = tei_file.read()
50+
if isinstance(content, bytes):
51+
content = content.decode('utf-8')
5052

5153
soup = BeautifulSoup(content, 'xml')
5254

@@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
7779
# Extract publication date
7880
pub_date = self._extract_publication_date(soup)
7981
if pub_date:
80-
markdown_sections.append(f"Publishd on {pub_date}\n\n")
82+
markdown_sections.append(f"Published on {pub_date}\n\n")
8183

8284
# Extract abstract
8385
abstract = self._extract_abstract(soup)
@@ -511,17 +513,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None:
511513
unit = bibl_scope.get("unit", "").lower()
512514
text = bibl_scope.get_text().strip()
513515

514-
if unit == "vol" and text:
516+
if unit in ["vol", "volume"] and text:
515517
bib_data['volume'] = text
516518
elif unit == "issue" and text:
517519
bib_data['issue'] = text
518520
elif unit == "page" and text:
519521
# Handle page ranges
520-
if "from" in bibl_scope.attrs:
521-
bib_data['pages'] = f"{text}-"
522-
elif "to" in bibl_scope.attrs and bib_data.get('pages'):
523-
bib_data['pages'] += text
524-
else:
522+
from_val = bibl_scope.get("from")
523+
to_val = bibl_scope.get("to")
524+
if from_val and to_val:
525+
# Both from and to in same element
526+
bib_data['pages'] = f"{from_val}-{to_val}"
527+
elif from_val:
528+
# Only from specified, may get combined with another element
529+
bib_data['pages'] = f"{from_val}-"
530+
elif to_val and bib_data.get('pages'):
531+
# Only to specified, append to existing from
532+
bib_data['pages'] = bib_data['pages'] + to_val
533+
elif text and not bib_data.get('pages'):
534+
# Plain text, no from/to attributes
525535
bib_data['pages'] = text
526536

527537
def _extract_author_info(self, author: Tag) -> dict:
@@ -629,6 +639,9 @@ def _build_publication_details(self, ref_data: dict) -> str:
629639
"""Build publication details string from extracted data."""
630640
details = []
631641

642+
if ref_data.get('year'):
643+
details.append(f"({ref_data['year']})")
644+
632645
if ref_data.get('volume'):
633646
details.append(ref_data['volume'])
634647

@@ -684,7 +697,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str:
684697
raw_text = bibl_struct.get_text().strip()
685698

686699
# Remove reference number if present
687-
import re
688700
raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)
689701

690702
# Clean up excessive whitespace

0 commit comments

Comments (0)