|
9 | 9 | import uuid |
10 | 10 | from collections import OrderedDict |
11 | 11 | from concurrent.futures import ProcessPoolExecutor, as_completed |
| 12 | +import html |
| 13 | +import re |
12 | 14 | from pathlib import Path |
13 | 15 | from typing import Dict, Union, BinaryIO, Iterator |
14 | 16 |
|
@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False |
41 | 43 | If stream=False returns the full document dict (same shape as original function). |
42 | 44 | """ |
43 | 45 | # Load with BeautifulSoup but avoid building huge structures when streaming |
44 | | - with open(tei_file, 'r') as f: |
45 | | - content = f.read() |
| 46 | + if hasattr(tei_file, 'read'): |
| 47 | + # File-like object (BinaryIO/StringIO) |
| 48 | + content = tei_file.read() |
| 49 | + if isinstance(content, bytes): |
| 50 | + content = content.decode('utf-8') |
| 51 | + else: |
| 52 | + # Path-like object |
| 53 | + with open(tei_file, 'r', encoding='utf-8') as f: |
| 54 | + content = f.read() |
46 | 55 | soup = BeautifulSoup(content, 'xml') |
47 | 56 |
|
48 | 57 | if soup.TEI is None: |
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) -> |
222 | 231 | Extract detailed bibliographic information from TEI biblStruct elements. |
223 | 232 | Implements comprehensive parsing for all standard TEI bibliographic components. |
224 | 233 | """ |
225 | | - import re |
226 | 234 |
|
227 | 235 | citation_data = OrderedDict() |
228 | 236 | citation_data['id'] = f"b{index}" |
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list): |
430 | 438 |
|
431 | 439 | def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict): |
432 | 440 | """Extract and process imprint information including publisher, dates, and page ranges.""" |
433 | | - import re |
434 | 441 |
|
435 | 442 | # Extract publisher information |
436 | 443 | publisher_elements = imprint_element.find_all("publisher") |
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict: |
557 | 564 | Extract person data (author/editor) from TEI persName or author elements. |
558 | 565 | Handles various name formats and affiliations. |
559 | 566 | """ |
560 | | - import re |
561 | 567 |
|
562 | 568 | person_data = {} |
563 | 569 |
|
@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str: |
628 | 634 | text = text.decode('utf-8', errors='ignore') |
629 | 635 |
|
630 | 636 | # Normalize whitespace and strip |
631 | | - import re |
632 | 637 | text = re.sub(r'\s+', ' ', text.strip()) |
633 | 638 |
|
634 | 639 | # Decode XML/HTML character entities into their characters (e.g. "&amp;" -> "&") |
635 | | - import html |
636 | 640 | text = html.unescape(text) |
637 | 641 |
|
638 | 642 | return text |
@@ -740,15 +744,15 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa |
740 | 744 | struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence) |
741 | 745 | if self.validate_refs: |
742 | 746 | for ref in struct['refs']: |
743 | | - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] |
744 | | - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] |
| 747 | + assert ref['offset_start'] < ref['offset_end'], "Wrong offsets" |
| 748 | + assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" |
745 | 749 | yield struct |
746 | 750 | else: |
747 | 751 | struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p) |
748 | 752 | if self.validate_refs: |
749 | 753 | for ref in struct['refs']: |
750 | | - assert "Wrong offsets", ref['offset_start'] < ref['offset_end'] |
751 | | - assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'] |
| 754 | + assert ref['offset_start'] < ref['offset_end'], "Wrong offsets" |
| 755 | + assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets" |
752 | 756 | yield struct |
753 | 757 |
|
754 | 758 | # Update head_paragraph for potential next div |
|
0 commit comments