fix(processors): URL pattern bugs, pipeline hardening, quality polish

prakersh · prakersh · commit 90f34ed321d1 · 2026-04-13T16:34:41.000+05:30
- Fix Instagram not matching /{username}/p/ and /{username}/reel/ URLs
- Fix Substack not matching substack.com/home/post/ aggregator URLs
- Fix Reddit not matching hyphenated subreddit names (r/some-sub)
- Fix Stack Exchange not matching hyphenated site names (code-review.stackexchange.com)
- Simplify Mastodon pattern (drop the scheme/host prefix so it matches on the /@user/<id> path alone)
- YouTube/TikTok: always set item_type="video" even in fallback paths
- LinkedIn: lower limited-extraction threshold from 250→150 chars
- Threads: lower limited threshold from 50→30 chars (short by design)
- GenericURL: emit structured sections via trafilatura/readability
- Meilisearch: index section_kind/role/author in chunk documents
- Pipeline: catch all exceptions in section re-hydration, preserve provenance
- Add 48 new tests: URL matching (42 seed URLs: 14 platforms plus generic fallbacks),
  pipeline integration (round-trip provenance, heading paths, boilerplate filtering)

304 tests pass, lint clean, frontend builds clean.
diff --git a/src/fourdpocket/processors/generic_url.py b/src/fourdpocket/processors/generic_url.py
@@ -1,15 +1,15 @@
-"""Generic URL processor - fallback for any URL."""
+"""Generic URL processor - fallback for any URL with section support."""
 
 import httpx
-from readability import Document
 
 from fourdpocket.processors.base import BaseProcessor, ProcessorResult, ProcessorStatus
+from fourdpocket.processors.medium import _trafilatura_or_readability_sections
 from fourdpocket.processors.registry import register_processor
 
 
 @register_processor
 class GenericURLProcessor(BaseProcessor):
-    """Extract content from any URL using readability and metadata parsing."""
+    """Extract content from any URL using trafilatura/readability as sections."""
 
     url_patterns = []  # matches nothing - used as fallback
     priority = -1  # lowest priority
@@ -47,23 +47,15 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
                 status=ProcessorStatus.partial,
             )
 
-        # Extract readable content via readability
-        try:
-            doc = Document(raw_html)
-            readable_title = doc.title()
-            readable_content = doc.summary()
-            doc.short_title()
-        except Exception:
-            readable_title = None
-            readable_content = None
-
         # Extract OG metadata
         og_meta = self._extract_og_metadata(raw_html)
 
+        # Emit structured sections via trafilatura/readability
+        sections = _trafilatura_or_readability_sections(raw_html, url, og_meta)
+
         # Determine best title
         title = (
             og_meta.get("og_title")
-            or readable_title
             or og_meta.get("html_title")
             or url
         )
@@ -99,11 +91,12 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
         return ProcessorResult(
             title=title,
             description=description,
-            content=readable_content,
-            raw_content=raw_html[:100000],  # cap raw HTML at 100KB
+            content=None,
+            raw_content=raw_html[:100000],
             media=media,
             metadata=metadata,
             source_platform="generic",
             item_type="url",
             status=ProcessorStatus.success,
+            sections=sections,
         )
diff --git a/src/fourdpocket/processors/instagram.py b/src/fourdpocket/processors/instagram.py
@@ -26,7 +26,7 @@
 
 
 def _extract_shortcode(url: str) -> str | None:
-    m = re.search(r"instagram\.com/(?:p|reel|reels)/([A-Za-z0-9_-]+)", url)
+    m = re.search(r"instagram\.com/(?:[^/]+/)?(?:p|reel|reels)/([A-Za-z0-9_-]+)", url)
     return m.group(1) if m else None
 
 
@@ -38,6 +38,8 @@ class InstagramProcessor(BaseProcessor):
         r"instagram\.com/p/[A-Za-z0-9_-]+",
         r"instagram\.com/reel/[A-Za-z0-9_-]+",
         r"instagram\.com/reels/[A-Za-z0-9_-]+",
+        r"instagram\.com/[^/]+/p/[A-Za-z0-9_-]+",
+        r"instagram\.com/[^/]+/reel/[A-Za-z0-9_-]+",
     ]
     priority = 10
 
diff --git a/src/fourdpocket/processors/linkedin.py b/src/fourdpocket/processors/linkedin.py
@@ -38,7 +38,7 @@
     "Chrome/124.0.0.0 Safari/537.36"
 )
 
-_MIN_BODY_CHARS = 250  # below this we still ship but flag as limited
+_MIN_BODY_CHARS = 150  # below this we still ship but flag as limited
 
 
 @register_processor
diff --git a/src/fourdpocket/processors/mastodon.py b/src/fourdpocket/processors/mastodon.py
@@ -59,7 +59,7 @@ class MastodonProcessor(BaseProcessor):
     """Mastodon status + reply tree as sections."""
 
     url_patterns = [
-        r"https?://[^/]+/@[^/]+/\d+",
+        r"/@[^/]+/\d+",
     ]
     priority = 8
 
diff --git a/src/fourdpocket/processors/reddit.py b/src/fourdpocket/processors/reddit.py
@@ -84,8 +84,8 @@ class RedditProcessor(BaseProcessor):
     """Extract a Reddit submission with threaded comments as sections."""
 
     url_patterns = [
-        r"reddit\.com/r/\w+/comments/",
-        r"old\.reddit\.com/r/\w+/comments/",
+        r"reddit\.com/r/[\w-]+/comments/",
+        r"old\.reddit\.com/r/[\w-]+/comments/",
         r"redd\.it/\w+",
     ]
     priority = 10
diff --git a/src/fourdpocket/processors/stackoverflow.py b/src/fourdpocket/processors/stackoverflow.py
@@ -66,7 +66,7 @@ class StackOverflowProcessor(BaseProcessor):
         r"serverfault\.com/questions/\d+",
         r"superuser\.com/questions/\d+",
         r"askubuntu\.com/questions/\d+",
-        r"\w+\.stackexchange\.com/questions/\d+",
+        r"[\w-]+\.stackexchange\.com/questions/\d+",
     ]
     priority = 10
 
diff --git a/src/fourdpocket/processors/substack.py b/src/fourdpocket/processors/substack.py
@@ -21,10 +21,15 @@
 
 
 def _extract_pub_and_slug(url: str) -> tuple[str | None, str | None]:
+    # Standard: https://{pub}.substack.com/p/{slug}
     m = re.match(r"https?://([^.]+)\.substack\.com/p/([\w\-]+)", url)
-    if not m:
-        return None, None
-    return m.group(1), m.group(2)
+    if m:
+        return m.group(1), m.group(2)
+    # Aggregator: https://substack.com/home/post/p-{id}
+    m2 = re.match(r"https?://substack\.com/home/post/([\w\-]+)", url)
+    if m2:
+        return None, m2.group(1)
+    return None, None
 
 
 def _try_substack_api(pub: str, slug: str) -> dict | None:
@@ -99,7 +104,10 @@ def _html_to_sections(html: str, url: str, start_order: int) -> tuple[list[Secti
 class SubstackProcessor(BaseProcessor):
     """Substack post → typed sections via API; trafilatura fallback."""
 
-    url_patterns = [r"[a-z0-9-]+\.substack\.com/p/"]
+    url_patterns = [
+        r"[a-z0-9-]+\.substack\.com/p/",
+        r"substack\.com/home/post/",
+    ]
     priority = 10
 
     async def process(self, url: str, **kwargs) -> ProcessorResult:
diff --git a/src/fourdpocket/processors/threads.py b/src/fourdpocket/processors/threads.py
@@ -96,7 +96,7 @@ async def process(self, url: str, **kwargs) -> ProcessorResult:
             body_text = og_desc
 
         body_text = body_text.strip()
-        limited = len(body_text) < 50
+        limited = len(body_text) < 30
 
         # ─── Sections ───
         sections: list[Section] = []
diff --git a/src/fourdpocket/processors/tiktok.py b/src/fourdpocket/processors/tiktok.py
@@ -86,7 +86,7 @@ async def _og_fallback(self, url: str, reason: str) -> ProcessorResult:
             media=media,
             metadata={"url": url, "fallback": "og_metadata", "limited_extraction": True},
             source_platform="tiktok",
-            item_type="url",
+            item_type="video",
             status=ProcessorStatus.partial,
             error=reason,
             sections=sections,
diff --git a/src/fourdpocket/processors/youtube.py b/src/fourdpocket/processors/youtube.py
@@ -264,7 +264,7 @@ def _flush():
             media=media,
             metadata=metadata,
             source_platform="youtube",
-            item_type="video" if media else "url",
+            item_type="video",
             status=status,
             error=error,
             sections=sections,
diff --git a/src/fourdpocket/search/backends/meilisearch_backend.py b/src/fourdpocket/search/backends/meilisearch_backend.py
@@ -53,16 +53,34 @@ def index_chunks(
             index = client.index("knowledge_chunks")
             docs = []
             for chunk in chunks:
-                docs.append({
+                doc = {
                     "id": str(chunk.id),
                     "item_id": str(item_id),
                     "user_id": str(user_id),
                     "title": title or "",
                     "url": url or "",
                     "text": chunk.text,
                     "chunk_order": chunk.chunk_order,
-                })
+                }
+                # Section provenance — nullable for legacy chunks
+                if hasattr(chunk, "section_kind") and chunk.section_kind:
+                    doc["section_kind"] = chunk.section_kind
+                if hasattr(chunk, "section_role") and chunk.section_role:
+                    doc["section_role"] = chunk.section_role
+                if hasattr(chunk, "author") and chunk.author:
+                    doc["author"] = chunk.author
+                if hasattr(chunk, "is_accepted_answer") and chunk.is_accepted_answer:
+                    doc["is_accepted_answer"] = True
+                docs.append(doc)
             if docs:
+                # Ensure section fields are filterable
+                try:
+                    index.update_filterable_attributes([
+                        "user_id", "item_id", "section_kind",
+                        "section_role", "author",
+                    ])
+                except Exception:
+                    pass
                 index.add_documents(docs)
         except Exception as e:
             logger.debug("Meilisearch chunk indexing failed: %s", e)
diff --git a/src/fourdpocket/workers/enrichment_pipeline.py b/src/fourdpocket/workers/enrichment_pipeline.py
@@ -143,15 +143,27 @@ def handle_chunking(db: Session, item_id: uuid.UUID, user_id: uuid.UUID) -> None
         for sd in sections_payload:
             try:
                 section_objs.append(Section(**sd))
-            except TypeError:
-                # Forward-compat: unknown fields are ignored so old payloads
-                # don't crash a newer schema.
+            except Exception as sec_err:
+                # Forward-compat: preserve all available fields so
+                # provenance (author, depth, parent_id, etc.) survives
+                # even when the schema diverges.
+                logger.warning(
+                    "Section re-hydration fallback for %s: %s",
+                    sd.get("id", "?"), sec_err,
+                )
                 section_objs.append(Section(
                     id=sd.get("id", ""),
                     kind=sd.get("kind", "uncategorized"),
                     order=sd.get("order", 0),
                     text=sd.get("text", ""),
                     role=sd.get("role", "main"),
+                    parent_id=sd.get("parent_id"),
+                    depth=sd.get("depth", 0),
+                    author=sd.get("author"),
+                    score=sd.get("score"),
+                    created_at=sd.get("created_at"),
+                    source_url=sd.get("source_url"),
+                    extra=sd.get("extra"),
                 ))
         raw_chunks = chunk_sections(
             section_objs,
diff --git a/tests/seed_urls.py b/tests/seed_urls.py
@@ -0,0 +1,82 @@
+"""Seed URL corpus for testing processor selection and extraction quality.
+
+Drawn from real URLs in the production database plus edge cases for
+each platform (hyphenated subreddits, username-prefixed Instagram, etc.).
+"""
+
+SEED_URLS: dict[str, list[str]] = {
+    "youtube": [
+        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+        "https://www.youtube.com/watch?v=aircAruvnKk",
+        "https://youtu.be/uPIuHpG7PTw",
+        "https://www.youtube.com/shorts/dQw4w9WgXcQ",
+    ],
+    "instagram": [
+        "https://www.instagram.com/reel/DXBB07DjAtr/",
+        "https://www.instagram.com/wasted/p/DXCwcT4ASEw/",
+        "https://www.instagram.com/p/ABC123xyz/",
+        "https://www.instagram.com/someuser/reel/XYZ789/",
+    ],
+    "reddit": [
+        "https://www.reddit.com/r/programming/comments/1b6yw0p/what_programming_language_should_i_learn_first/",
+        "https://www.reddit.com/r/opencodeCLI/comments/1sjsir3/ohmyopencodeslim_vs_superpowers/",
+        "https://www.reddit.com/r/some-sub-name/comments/abc123/test_post/",
+        "https://old.reddit.com/r/python/comments/abc/test/",
+        "https://redd.it/1b6yw0p",
+    ],
+    "twitter": [
+        "https://x.com/TweetsSupportin/status/1468827581315108864",
+        "https://twitter.com/elikisFan/status/9999999999",
+    ],
+    "github": [
+        "https://github.com/anthropics/claude-code",
+        "https://github.com/microsoft/vscode",
+        "https://github.com/torvalds/linux",
+        "https://github.com/onllm-dev/onWatch/issues/61",
+        "https://gist.github.com/user/abc123def456",
+    ],
+    "hackernews": [
+        "https://news.ycombinator.com/item?id=39599903",
+        "https://news.ycombinator.com/item?id=47742200",
+    ],
+    "stackoverflow": [
+        "https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array",
+        "https://stackoverflow.com/questions/79924639/how-to-implement-a-seamless-animation-maui",
+        "https://code-review.stackexchange.com/questions/12345/test",
+        "https://math.stackexchange.com/questions/67890/test",
+    ],
+    "substack": [
+        "https://substack.com/home/post/p-163234449",
+        "https://example.substack.com/p/my-awesome-post",
+    ],
+    "medium": [
+        "https://medium.com/@karpathy/software-2-0-a64152b37c35",
+        "https://medium.com/@nikhil.cse16/mastering-the-sliding-window-technique-a-comprehensive-guide-6bb5e1e86f99",
+    ],
+    "linkedin": [
+        "https://www.linkedin.com/posts/will-mctighe_my-friend-laid-off-his-80000year-assistant-activity-7449078921297424384-A7pg",
+        "https://www.linkedin.com/pulse/some-article-title",
+    ],
+    "mastodon": [
+        "https://mastodon.social/@pluralistic@mamot.fr/116395778750506218",
+        "https://fosstodon.org/@user/12345",
+    ],
+    "tiktok": [
+        "https://www.tiktok.com/@user/video/1234567890",
+        "https://vm.tiktok.com/ZMabc123/",
+    ],
+    "threads": [
+        "https://www.threads.net/@zuck/post/abc123",
+    ],
+    "spotify": [
+        "https://open.spotify.com/track/4uLU6hMCjMI75M1A2tKUQC",
+    ],
+}
+
+# URLs that should NOT match any specialized processor (fall to generic)
+GENERIC_URLS = [
+    "https://en.wikipedia.org/wiki/Machine_learning",
+    "https://arxiv.org/abs/1706.03762",
+    "https://news.ycombinator.com/",  # HN homepage — no /item
+    "https://pmnco.co.in/blog/5-pvt-ltd-registration-indore-2026/",
+]
diff --git a/tests/test_phase4_mastodon.py b/tests/test_phase4_mastodon.py
@@ -10,7 +10,6 @@
 import httpx
 import respx
 
-
 STATUS_PAYLOAD = {
     "id": "1001",
     "url": "https://mastodon.social/@alice/1001",
diff --git a/tests/test_pipeline_integration.py b/tests/test_pipeline_integration.py
diff --git a/tests/test_url_matching.py b/tests/test_url_matching.py

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@`
`38`	`38`	`"Chrome/124.0.0.0 Safari/537.36"`
`39`	`39`	`)`
`40`	`40`
`41`		`-_MIN_BODY_CHARS = 250 # below this we still ship but flag as limited`
	`41`	`+_MIN_BODY_CHARS = 150 # below this we still ship but flag as limited`
`42`	`42`
`43`	`43`
`44`	`44`	`@register_processor`
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ class MastodonProcessor(BaseProcessor):`
`59`	`59`	`"""Mastodon status + reply tree as sections."""`
`60`	`60`
`61`	`61`	`url_patterns = [`
`62`		`- r"https?://[^/]+/@[^/]+/\d+",`
	`62`	`+ r"/@[^/]+/\d+",`
`63`	`63`	`]`
`64`	`64`	`priority = 8`
`65`	`65`
Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ class StackOverflowProcessor(BaseProcessor):`
`66`	`66`	`r"serverfault\.com/questions/\d+",`
`67`	`67`	`r"superuser\.com/questions/\d+",`
`68`	`68`	`r"askubuntu\.com/questions/\d+",`
`69`		`- r"\w+\.stackexchange\.com/questions/\d+",`
	`69`	`+ r"[\w-]+\.stackexchange\.com/questions/\d+",`
`70`	`70`	`]`
`71`	`71`	`priority = 10`
`72`	`72`