Skip to content

Commit 90f34ed

Browse files
committed
fix(processors): URL pattern bugs, pipeline hardening, quality polish
- Fix Instagram not matching /{username}/p/ and /{username}/reel/ URLs
- Fix Substack not matching substack.com/home/post/ aggregator URLs
- Fix Reddit not matching hyphenated subreddit names (r/some-sub)
- Fix SE not matching hyphenated site names (code-review.stackexchange)
- Simplify Mastodon pattern (drop protocol requirement for consistency)
- YouTube/TikTok: always set item_type="video" even in fallback paths
- LinkedIn: lower limited-extraction threshold from 250→150 chars
- Threads: lower limited threshold from 50→30 chars (short by design)
- GenericURL: emit structured sections via trafilatura/readability
- Meilisearch: index section_kind/role/author in chunk documents
- Pipeline: catch all exceptions in section re-hydration, preserve provenance
- Add 48 new tests: URL matching (42 URLs × 14 platforms), pipeline integration (round-trip provenance, heading paths, boilerplate filtering)

304 tests pass, lint clean, frontend builds clean.
1 parent b03a9ae commit 90f34ed

16 files changed

Lines changed: 421 additions & 35 deletions

src/fourdpocket/processors/generic_url.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
"""Generic URL processor - fallback for any URL."""
1+
"""Generic URL processor - fallback for any URL with section support."""
22

33
import httpx
4-
from readability import Document
54

65
from fourdpocket.processors.base import BaseProcessor, ProcessorResult, ProcessorStatus
6+
from fourdpocket.processors.medium import _trafilatura_or_readability_sections
77
from fourdpocket.processors.registry import register_processor
88

99

1010
@register_processor
1111
class GenericURLProcessor(BaseProcessor):
12-
"""Extract content from any URL using readability and metadata parsing."""
12+
"""Extract content from any URL using trafilatura/readability as sections."""
1313

1414
url_patterns = [] # matches nothing - used as fallback
1515
priority = -1 # lowest priority
@@ -47,23 +47,15 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
4747
status=ProcessorStatus.partial,
4848
)
4949

50-
# Extract readable content via readability
51-
try:
52-
doc = Document(raw_html)
53-
readable_title = doc.title()
54-
readable_content = doc.summary()
55-
doc.short_title()
56-
except Exception:
57-
readable_title = None
58-
readable_content = None
59-
6050
# Extract OG metadata
6151
og_meta = self._extract_og_metadata(raw_html)
6252

53+
# Emit structured sections via trafilatura/readability
54+
sections = _trafilatura_or_readability_sections(raw_html, url, og_meta)
55+
6356
# Determine best title
6457
title = (
6558
og_meta.get("og_title")
66-
or readable_title
6759
or og_meta.get("html_title")
6860
or url
6961
)
@@ -99,11 +91,12 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
9991
return ProcessorResult(
10092
title=title,
10193
description=description,
102-
content=readable_content,
103-
raw_content=raw_html[:100000], # cap raw HTML at 100KB
94+
content=None,
95+
raw_content=raw_html[:100000],
10496
media=media,
10597
metadata=metadata,
10698
source_platform="generic",
10799
item_type="url",
108100
status=ProcessorStatus.success,
101+
sections=sections,
109102
)

src/fourdpocket/processors/instagram.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727

2828
def _extract_shortcode(url: str) -> str | None:
29-
m = re.search(r"instagram\.com/(?:p|reel|reels)/([A-Za-z0-9_-]+)", url)
29+
m = re.search(r"instagram\.com/(?:[^/]+/)?(?:p|reel|reels)/([A-Za-z0-9_-]+)", url)
3030
return m.group(1) if m else None
3131

3232

@@ -38,6 +38,8 @@ class InstagramProcessor(BaseProcessor):
3838
r"instagram\.com/p/[A-Za-z0-9_-]+",
3939
r"instagram\.com/reel/[A-Za-z0-9_-]+",
4040
r"instagram\.com/reels/[A-Za-z0-9_-]+",
41+
r"instagram\.com/[^/]+/p/[A-Za-z0-9_-]+",
42+
r"instagram\.com/[^/]+/reel/[A-Za-z0-9_-]+",
4143
]
4244
priority = 10
4345

src/fourdpocket/processors/linkedin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
"Chrome/124.0.0.0 Safari/537.36"
3939
)
4040

41-
_MIN_BODY_CHARS = 250 # below this we still ship but flag as limited
41+
_MIN_BODY_CHARS = 150 # below this we still ship but flag as limited
4242

4343

4444
@register_processor

src/fourdpocket/processors/mastodon.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class MastodonProcessor(BaseProcessor):
5959
"""Mastodon status + reply tree as sections."""
6060

6161
url_patterns = [
62-
r"https?://[^/]+/@[^/]+/\d+",
62+
r"/@[^/]+/\d+",
6363
]
6464
priority = 8
6565

src/fourdpocket/processors/reddit.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ class RedditProcessor(BaseProcessor):
8484
"""Extract a Reddit submission with threaded comments as sections."""
8585

8686
url_patterns = [
87-
r"reddit\.com/r/\w+/comments/",
88-
r"old\.reddit\.com/r/\w+/comments/",
87+
r"reddit\.com/r/[\w-]+/comments/",
88+
r"old\.reddit\.com/r/[\w-]+/comments/",
8989
r"redd\.it/\w+",
9090
]
9191
priority = 10

src/fourdpocket/processors/stackoverflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class StackOverflowProcessor(BaseProcessor):
6666
r"serverfault\.com/questions/\d+",
6767
r"superuser\.com/questions/\d+",
6868
r"askubuntu\.com/questions/\d+",
69-
r"\w+\.stackexchange\.com/questions/\d+",
69+
r"[\w-]+\.stackexchange\.com/questions/\d+",
7070
]
7171
priority = 10
7272

src/fourdpocket/processors/substack.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,15 @@
2121

2222

2323
def _extract_pub_and_slug(url: str) -> tuple[str | None, str | None]:
24+
# Standard: https://{pub}.substack.com/p/{slug}
2425
m = re.match(r"https?://([^.]+)\.substack\.com/p/([\w\-]+)", url)
25-
if not m:
26-
return None, None
27-
return m.group(1), m.group(2)
26+
if m:
27+
return m.group(1), m.group(2)
28+
# Aggregator: https://substack.com/home/post/p-{id}
29+
m2 = re.match(r"https?://substack\.com/home/post/([\w\-]+)", url)
30+
if m2:
31+
return None, m2.group(1)
32+
return None, None
2833

2934

3035
def _try_substack_api(pub: str, slug: str) -> dict | None:
@@ -99,7 +104,10 @@ def _html_to_sections(html: str, url: str, start_order: int) -> tuple[list[Secti
99104
class SubstackProcessor(BaseProcessor):
100105
"""Substack post → typed sections via API; trafilatura fallback."""
101106

102-
url_patterns = [r"[a-z0-9-]+\.substack\.com/p/"]
107+
url_patterns = [
108+
r"[a-z0-9-]+\.substack\.com/p/",
109+
r"substack\.com/home/post/",
110+
]
103111
priority = 10
104112

105113
async def process(self, url: str, **kwargs) -> ProcessorResult:

src/fourdpocket/processors/threads.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
9696
body_text = og_desc
9797

9898
body_text = body_text.strip()
99-
limited = len(body_text) < 50
99+
limited = len(body_text) < 30
100100

101101
# ─── Sections ───
102102
sections: list[Section] = []

src/fourdpocket/processors/tiktok.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ async def _og_fallback(self, url: str, reason: str) -> ProcessorResult:
8686
media=media,
8787
metadata={"url": url, "fallback": "og_metadata", "limited_extraction": True},
8888
source_platform="tiktok",
89-
item_type="url",
89+
item_type="video",
9090
status=ProcessorStatus.partial,
9191
error=reason,
9292
sections=sections,

src/fourdpocket/processors/youtube.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ def _flush():
264264
media=media,
265265
metadata=metadata,
266266
source_platform="youtube",
267-
item_type="video" if media else "url",
267+
item_type="video",
268268
status=status,
269269
error=error,
270270
sections=sections,

0 commit comments

Comments
 (0)