|
1 | | -"""Generic URL processor - fallback for any URL.""" |
| 1 | +"""Generic URL processor - fallback for any URL with section support.""" |
2 | 2 |
|
3 | 3 | import httpx |
4 | | -from readability import Document |
5 | 4 |
|
6 | 5 | from fourdpocket.processors.base import BaseProcessor, ProcessorResult, ProcessorStatus |
| 6 | +from fourdpocket.processors.medium import _trafilatura_or_readability_sections |
7 | 7 | from fourdpocket.processors.registry import register_processor |
8 | 8 |
|
9 | 9 |
|
10 | 10 | @register_processor |
11 | 11 | class GenericURLProcessor(BaseProcessor): |
12 | | - """Extract content from any URL using readability and metadata parsing.""" |
| 12 | + """Extract content from any URL using trafilatura/readability as sections.""" |
13 | 13 |
|
14 | 14 | url_patterns = [] # matches nothing - used as fallback |
15 | 15 | priority = -1 # lowest priority |
@@ -47,23 +47,15 @@ async def process(self, url: str, **kwargs) -> ProcessorResult: |
47 | 47 | status=ProcessorStatus.partial, |
48 | 48 | ) |
49 | 49 |
|
50 | | - # Extract readable content via readability |
51 | | - try: |
52 | | - doc = Document(raw_html) |
53 | | - readable_title = doc.title() |
54 | | - readable_content = doc.summary() |
55 | | - doc.short_title() |
56 | | - except Exception: |
57 | | - readable_title = None |
58 | | - readable_content = None |
59 | | - |
60 | 50 | # Extract OG metadata |
61 | 51 | og_meta = self._extract_og_metadata(raw_html) |
62 | 52 |
|
| 53 | + # Emit structured sections via trafilatura/readability |
| 54 | + sections = _trafilatura_or_readability_sections(raw_html, url, og_meta) |
| 55 | + |
63 | 56 | # Determine best title |
64 | 57 | title = ( |
65 | 58 | og_meta.get("og_title") |
66 | | - or readable_title |
67 | 59 | or og_meta.get("html_title") |
68 | 60 | or url |
69 | 61 | ) |
@@ -99,11 +91,12 @@ async def process(self, url: str, **kwargs) -> ProcessorResult: |
99 | 91 | return ProcessorResult( |
100 | 92 | title=title, |
101 | 93 | description=description, |
102 | | - content=readable_content, |
103 | | - raw_content=raw_html[:100000], # cap raw HTML at 100KB |
| 94 | + content=None, |
| 95 | + raw_content=raw_html[:100000], |
104 | 96 | media=media, |
105 | 97 | metadata=metadata, |
106 | 98 | source_platform="generic", |
107 | 99 | item_type="url", |
108 | 100 | status=ProcessorStatus.success, |
| 101 | + sections=sections, |
109 | 102 | ) |
0 commit comments