fix: parse_inline drops links at position 0 and marks() loses href (#36)

ASRagab · cellarius · web-flow · commit 327736887ce8 · 2026-03-07T16:07:48.000+01:00
* fix: parse_inline drops links at position 0 and marks() loses href

Two bugs in inline link handling:

1. parse_inline(): Links at the start of a line (position 0) were silently
   dropped. The image-exclusion check `match.start() > 0 and ...` would
   short-circuit to False when the link started at position 0, causing it
   to be skipped entirely. Fixed by checking `match.start() == 0 or
   text[match.start()-1:match.start()+1] != "!["` instead.

2. marks(): Hrefs of links produced by parse_inline() were set to null.
   parse_inline() returns marks with the structure
   `{'type': 'link', 'attrs': {'href': url}}`, but marks() only checked
   `mark.get('href')` at the top level. Added a fallback to
   `mark.get('attrs', {}).get('href')` so that both the attrs-nested
   format from parse_inline() and the top-level format used in the
   README examples are handled.

Added tests covering both bugs plus regression tests for image exclusion.

* chore: verify pre-commit formatting

* fix: keep links at pos 0; preserve link href (no reformat)

---------

Co-authored-by: cellarius <ahmad.ragab.0001@gmail.com>
diff --git a/substack/post.py b/substack/post.py
@@ -39,40 +39,41 @@ def parse_inline(text: str) -> List[Dict]:
     tokens = []
     # Process text character by character to handle nested formatting
     # We'll use regex to find all markdown patterns, then process them in order
-    
+
     # Find all markdown patterns: links, bold, italic
     # Pattern order: links first (to avoid conflicts), then bold, then italic
     link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
     bold_pattern = r'\*\*([^*]+)\*\*'
     italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)'  # Not preceded or followed by *
-    
+
     # Find all matches with their positions
     matches = []
     for match in re.finditer(link_pattern, text):
         # Skip if it's an image link (starts with ![)
-        if match.start() > 0 and text[match.start()-1:match.start()+1] != "![":
+        # But do NOT skip normal links at position 0.
+        if match.start() == 0 or text[match.start()-1:match.start()+1] != "![":
             matches.append((match.start(), match.end(), "link", match.group(1), match.group(2)))
-    
+
     for match in re.finditer(bold_pattern, text):
         # Check if this range is already covered by a link
         if not any(start <= match.start() < end for start, end, _, _, _ in matches):
             matches.append((match.start(), match.end(), "bold", match.group(1), None))
-    
+
     for match in re.finditer(italic_pattern, text):
         # Check if this range is already covered by a link or bold
         if not any(start <= match.start() < end for start, end, _, _, _ in matches):
             matches.append((match.start(), match.end(), "italic", match.group(1), None))
-    
+
     # Sort matches by position
     matches.sort(key=lambda x: x[0])
-    
+
     # Build tokens
     last_pos = 0
     for start, end, match_type, content, url in matches:
         # Add text before this match
         if start > last_pos:
             tokens.append({"content": text[last_pos:start]})
-        
+
         # Add the formatted content
         if match_type == "link":
             tokens.append({
@@ -89,16 +90,16 @@ def parse_inline(text: str) -> List[Dict]:
                 "content": content,
                 "marks": [{"type": "em"}]
             })
-        
+
         last_pos = end
-    
+
     # Add remaining text
     if last_pos < len(text):
         tokens.append({"content": text[last_pos:]})
-    
+
     # Filter out empty tokens
     tokens = [t for t in tokens if t.get("content")]
-    
+
     return tokens
 
 
@@ -351,7 +352,7 @@ def marks(self, marks):
         for mark in marks:
             new_mark = {"type": mark.get("type")}
             if mark.get("type") == "link":
-                href = mark.get("href")
+                href = mark.get("href") or mark.get("attrs", {}).get("href")
                 new_mark.update({"attrs": {"href": href}})
             content_marks.append(new_mark)
         content["marks"] = content_marks
@@ -572,7 +573,7 @@ def from_markdown(self, markdown_content: str, api=None):
                         alt_text = linked_image_match.group(1)
                         image_url = linked_image_match.group(2)
                         link_url = linked_image_match.group(3)
-                        
+
                         # Adjust image URL if it starts with a slash
                         image_url = image_url[1:] if image_url.startswith("/") else image_url
 
diff --git a/tests/substack/test_post.py b/tests/substack/test_post.py
@@ -0,0 +1,82 @@
+"""Tests for Post and parse_inline."""
+
+import json
+
+from substack.post import Post, parse_inline
+
+
+class TestParseInline:
+    """Tests for parse_inline link handling."""
+
+    def test_link_at_start_of_text(self):
+        """Links at position 0 should be parsed correctly."""
+        result = parse_inline("[GPT](https://openai.com/)")
+        assert len(result) == 1
+        assert result[0]["content"] == "GPT"
+        assert result[0]["marks"][0]["attrs"]["href"] == "https://openai.com/"
+
+    def test_multiple_links_on_same_line(self):
+        """All links on the same line should be parsed."""
+        result = parse_inline(
+            "[GPT](https://openai.com/) and [Claude](https://anthropic.com/)"
+        )
+        links = [r for r in result if r.get("marks")]
+        assert len(links) == 2
+        assert links[0]["content"] == "GPT"
+        assert links[0]["marks"][0]["attrs"]["href"] == "https://openai.com/"
+        assert links[1]["content"] == "Claude"
+        assert links[1]["marks"][0]["attrs"]["href"] == "https://anthropic.com/"
+
+    def test_image_not_parsed_as_link(self):
+        """Image syntax ![alt](url) should not be parsed as a link."""
+        result = parse_inline("![alt](https://example.com/img.png)")
+        links = [r for r in result if r.get("marks")]
+        assert len(links) == 0
+
+    def test_link_mid_text(self):
+        """Links in the middle of text should work."""
+        result = parse_inline("Check [this](https://example.com) out")
+        links = [r for r in result if r.get("marks")]
+        assert len(links) == 1
+        assert links[0]["marks"][0]["attrs"]["href"] == "https://example.com"
+
+
+class TestPostMarks:
+    """Tests for Post.marks() link href handling."""
+
+    def test_marks_preserves_href_from_attrs(self):
+        """marks() should read href from attrs when present."""
+        post = Post(title="Test", subtitle="", user_id=1)
+        post.from_markdown("[Example](https://example.com)")
+        body = json.loads(post.get_draft()["draft_body"])
+        # Find the link mark
+        for block in body["content"]:
+            for node in block.get("content", []):
+                for mark in node.get("marks", []):
+                    if mark.get("type") == "link":
+                        assert mark["attrs"]["href"] == "https://example.com"
+                        return
+        raise AssertionError("No link mark found in output")
+
+    def test_marks_preserves_href_from_top_level(self):
+        """marks() should also work when href is at top level (legacy format)."""
+        post = Post(title="Test", subtitle="", user_id=1)
+        post.add(
+            {
+                "type": "paragraph",
+                "content": [
+                    {
+                        "content": "Link",
+                        "marks": [{"type": "link", "href": "https://example.com"}],
+                    }
+                ],
+            }
+        )
+        body = json.loads(post.get_draft()["draft_body"])
+        for block in body["content"]:
+            for node in block.get("content", []):
+                for mark in node.get("marks", []):
+                    if mark.get("type") == "link":
+                        assert mark["attrs"]["href"] == "https://example.com"
+                        return
+        raise AssertionError("No link mark found in output")