Skip to content

Commit 3277368

Browse files
ASRagab and cellarius authored
fix: parse_inline drops links at position 0 and marks() loses href (#36)
* fix: parse_inline drops links at position 0 and marks() loses href

  Two bugs in inline link handling:

  1. parse_inline(): Links at the start of a line (position 0) were silently dropped. The image-exclusion check `match.start() > 0 and ...` would short-circuit to False when the link started at position 0, causing it to be skipped entirely. Fixed by checking `match.start() == 0 or text[match.start() - 1] != '!'` instead.

  2. marks(): All link hrefs were set to null. parse_inline() returns marks with the structure `{'type': 'link', 'attrs': {'href': url}}`, but marks() only checked `mark.get('href')` at the top level. Added fallback to `mark.get('attrs', {}).get('href')` to handle both the attrs-nested format from parse_inline and the top-level format used in the README examples.

  Added tests covering both bugs plus regression tests for image exclusion.

* chore: verify pre-commit formatting

* fix: keep links at pos 0; preserve link href (no reformat)

---------

Co-authored-by: cellarius <ahmad.ragab.0001@gmail.com>
1 parent fe45356 commit 3277368

File tree

2 files changed

+97
-14
lines changed

2 files changed

+97
-14
lines changed

substack/post.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,40 +39,41 @@ def parse_inline(text: str) -> List[Dict]:
3939
tokens = []
4040
# Process text character by character to handle nested formatting
4141
# We'll use regex to find all markdown patterns, then process them in order
42-
42+
4343
# Find all markdown patterns: links, bold, italic
4444
# Pattern order: links first (to avoid conflicts), then bold, then italic
4545
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
4646
bold_pattern = r'\*\*([^*]+)\*\*'
4747
italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)' # Not preceded or followed by *
48-
48+
4949
# Find all matches with their positions
5050
matches = []
5151
for match in re.finditer(link_pattern, text):
5252
# Skip if it's an image link (starts with ![)
53-
if match.start() > 0 and text[match.start()-1:match.start()+1] != "![":
53+
# But do NOT skip normal links at position 0.
54+
if match.start() == 0 or text[match.start()-1:match.start()+1] != "![":
5455
matches.append((match.start(), match.end(), "link", match.group(1), match.group(2)))
55-
56+
5657
for match in re.finditer(bold_pattern, text):
5758
# Check if this range is already covered by a link
5859
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
5960
matches.append((match.start(), match.end(), "bold", match.group(1), None))
60-
61+
6162
for match in re.finditer(italic_pattern, text):
6263
# Check if this range is already covered by a link or bold
6364
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
6465
matches.append((match.start(), match.end(), "italic", match.group(1), None))
65-
66+
6667
# Sort matches by position
6768
matches.sort(key=lambda x: x[0])
68-
69+
6970
# Build tokens
7071
last_pos = 0
7172
for start, end, match_type, content, url in matches:
7273
# Add text before this match
7374
if start > last_pos:
7475
tokens.append({"content": text[last_pos:start]})
75-
76+
7677
# Add the formatted content
7778
if match_type == "link":
7879
tokens.append({
@@ -89,16 +90,16 @@ def parse_inline(text: str) -> List[Dict]:
8990
"content": content,
9091
"marks": [{"type": "em"}]
9192
})
92-
93+
9394
last_pos = end
94-
95+
9596
# Add remaining text
9697
if last_pos < len(text):
9798
tokens.append({"content": text[last_pos:]})
98-
99+
99100
# Filter out empty tokens
100101
tokens = [t for t in tokens if t.get("content")]
101-
102+
102103
return tokens
103104

104105

@@ -351,7 +352,7 @@ def marks(self, marks):
351352
for mark in marks:
352353
new_mark = {"type": mark.get("type")}
353354
if mark.get("type") == "link":
354-
href = mark.get("href")
355+
href = mark.get("href") or mark.get("attrs", {}).get("href")
355356
new_mark.update({"attrs": {"href": href}})
356357
content_marks.append(new_mark)
357358
content["marks"] = content_marks
@@ -572,7 +573,7 @@ def from_markdown(self, markdown_content: str, api=None):
572573
alt_text = linked_image_match.group(1)
573574
image_url = linked_image_match.group(2)
574575
link_url = linked_image_match.group(3)
575-
576+
576577
# Adjust image URL if it starts with a slash
577578
image_url = image_url[1:] if image_url.startswith("/") else image_url
578579

tests/substack/test_post.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
"""Tests for Post and parse_inline."""
2+
3+
import json
4+
5+
from substack.post import Post, parse_inline
6+
7+
8+
class TestParseInline:
    """Exercise how ``parse_inline`` tokenises markdown link syntax."""

    @staticmethod
    def _marked(tokens):
        """Return only the tokens that carry a non-empty ``marks`` entry."""
        return [token for token in tokens if token.get("marks")]

    @staticmethod
    def _href(token):
        """Return the href stored under ``attrs`` on the token's first mark."""
        return token["marks"][0]["attrs"]["href"]

    def test_link_at_start_of_text(self):
        """A link that opens the text (offset 0) must produce a link token."""
        tokens = parse_inline("[GPT](https://openai.com/)")
        assert len(tokens) == 1
        only = tokens[0]
        assert only["content"] == "GPT"
        assert self._href(only) == "https://openai.com/"

    def test_multiple_links_on_same_line(self):
        """Every link on a single line must be turned into its own token."""
        source = "[GPT](https://openai.com/) and [Claude](https://anthropic.com/)"
        marked = self._marked(parse_inline(source))
        assert len(marked) == 2
        first, second = marked
        assert first["content"] == "GPT"
        assert self._href(first) == "https://openai.com/"
        assert second["content"] == "Claude"
        assert self._href(second) == "https://anthropic.com/"

    def test_image_not_parsed_as_link(self):
        """Image syntax ``![alt](url)`` must not yield any marked token."""
        tokens = parse_inline("![alt](https://example.com/img.png)")
        assert len(self._marked(tokens)) == 0

    def test_link_mid_text(self):
        """A link surrounded by plain text must still be recognised."""
        marked = self._marked(parse_inline("Check [this](https://example.com) out"))
        assert len(marked) == 1
        assert self._href(marked[0]) == "https://example.com"
42+
43+
44+
class TestPostMarks:
    """Check that link hrefs survive ``Post.marks()`` into the draft body."""

    @staticmethod
    def _first_link_mark(post):
        """Return the first mark of type ``link`` in the draft body, or None."""
        draft_body = json.loads(post.get_draft()["draft_body"])
        link_marks = (
            mark
            for block in draft_body["content"]
            for node in block.get("content", [])
            for mark in node.get("marks", [])
            if mark.get("type") == "link"
        )
        return next(link_marks, None)

    @classmethod
    def _assert_first_link_href(cls, post, expected_href):
        """Fail unless the first link mark exists and carries *expected_href*."""
        link_mark = cls._first_link_mark(post)
        if link_mark is None:
            raise AssertionError("No link mark found in output")
        assert link_mark["attrs"]["href"] == expected_href

    def test_marks_preserves_href_from_attrs(self):
        """An href nested under ``attrs`` (markdown path) must be kept."""
        post = Post(title="Test", subtitle="", user_id=1)
        post.from_markdown("[Example](https://example.com)")
        self._assert_first_link_href(post, "https://example.com")

    def test_marks_preserves_href_from_top_level(self):
        """An href given directly on the mark (legacy format) must be kept."""
        post = Post(title="Test", subtitle="", user_id=1)
        legacy_link = {"type": "link", "href": "https://example.com"}
        paragraph = {
            "type": "paragraph",
            "content": [{"content": "Link", "marks": [legacy_link]}],
        }
        post.add(paragraph)
        self._assert_first_link_href(post, "https://example.com")

0 commit comments

Comments
 (0)