diff --git a/src/Converter/HtmlToDjot.php b/src/Converter/HtmlToDjot.php index 1d881af..90aa295 100644 --- a/src/Converter/HtmlToDjot.php +++ b/src/Converter/HtmlToDjot.php @@ -970,6 +970,22 @@ protected function processLink(DOMElement $node): string // Skip href, title, and data-djot-ref since they're in the reference syntax $attrs = $this->formatInlineAttributes($node, ['href', 'title', 'data-djot-ref']); + if ($refLabel === '' && !$this->isSafeReferenceLabel($text)) { + if ($title !== '') { + return '[' . $text . '](' . $href . ' ' . $this->quoteLinkTitle($title) . ')' . $attrs; + } + + return '[' . $text . '](' . $href . ')' . $attrs; + } + + if ($refLabel !== '' && !$this->isSafeReferenceLabel($refLabel)) { + if ($title !== '') { + return '[' . $text . '](' . $href . ' ' . $this->quoteLinkTitle($title) . ')' . $attrs; + } + + return '[' . $text . '](' . $href . ')' . $attrs; + } + // Collect reference definition // For collapsed reference (empty label), use the link text as label $defLabel = $refLabel === '' ? $text : $refLabel; @@ -1014,6 +1030,22 @@ protected function processImage(DOMElement $node): string // Skip src, alt, title, and data-djot-ref since they're in the reference syntax $attrs = $this->formatInlineAttributes($node, ['src', 'alt', 'title', 'data-djot-ref']); + if ($refLabel === '' && !$this->isSafeReferenceLabel($alt)) { + if ($title !== '') { + return '![' . $alt . '](' . $src . ' ' . $this->quoteLinkTitle($title) . ')' . $attrs; + } + + return '![' . $alt . '](' . $src . ')' . $attrs; + } + + if ($refLabel !== '' && !$this->isSafeReferenceLabel($refLabel)) { + if ($title !== '') { + return '![' . $alt . '](' . $src . ' ' . $this->quoteLinkTitle($title) . ')' . $attrs; + } + + return '![' . $alt . '](' . $src . ')' . $attrs; + } + // Collect reference definition // For collapsed reference (empty label), use the alt text as label $defLabel = $refLabel === '' ? $alt : $refLabel; @@ -1551,7 +1583,9 @@ protected function serializeTableCellContent(DOMElement $cell): string $content = $hasBlockChildren ? $this->processBlock($cell) : $this->processChildren($cell); $content = trim($content); - return preg_replace('/\s+/', ' ', $content) ?? $content; + $content = preg_replace('/\s+/', ' ', $content) ?? $content; + + return str_replace('|', '\|', $content); } protected function findFirstDirectChildByTagName(DOMElement $node, string $tagName): ?DOMElement @@ -1729,7 +1763,12 @@ protected function processRawInline(DOMElement $node): string protected function processRawHtmlInlineElement(DOMElement $node): string { - $html = $node->ownerDocument?->saveHTML($node); + $clone = $node->cloneNode(true); + if ($clone instanceof DOMElement) { + $this->stripDjotDataAttributes($clone); + } + + $html = $clone instanceof DOMElement ? $clone->ownerDocument?->saveHTML($clone) : null; if (!is_string($html)) { $html = ''; } @@ -1754,6 +1793,32 @@ protected function linkRequiresRawHtmlFallback(DOMElement $node): bool return false; } + protected function isSafeReferenceLabel(string $label): bool + { + return strpbrk($label, '[]\\') === false; + } + + protected function stripDjotDataAttributes(DOMElement $node): void + { + $toRemove = []; + /** @var \DOMAttr $attr */ + foreach ($node->attributes as $attr) { + if (str_starts_with($attr->name, 'data-djot-')) { + $toRemove[] = $attr->name; + } + } + + foreach ($toRemove as $name) { + $node->removeAttribute($name); + } + + foreach ($node->childNodes as $child) { + if ($child instanceof DOMElement) { + $this->stripDjotDataAttributes($child); + } + } + } + /** * Process semantic HTML elements to Djot span syntax * diff --git a/tests/TestCase/Converter/HtmlToDjotTest.php b/tests/TestCase/Converter/HtmlToDjotTest.php index b86a97f..99abfe0 100644 --- a/tests/TestCase/Converter/HtmlToDjotTest.php +++ b/tests/TestCase/Converter/HtmlToDjotTest.php @@ -148,6 +148,23 @@ public function testLinkEscapesBackslashInLabel(): void $this->assertStringContainsString('a \ b', $htmlBack); } + public function testCollapsedReferenceLinkWithUnsafeLabelFallsBackToInlineLink(): void + { + $result = $this->converter->convert('a ] b'); + + $this->assertSame("[a \\] b](https://example.com)\n", $result); + $this->assertStringNotContainsString("\n[a \\] b]:", $result); + } + + public function testReferenceLinkWithUnsafeReferenceLabelFallsBackToInlineLink(): void + { + $result = $this->converter->convert('txt'); + + $this->assertSame("[txt](https://example.com)\n", $result); + $this->assertStringNotContainsString('[txt][', $result); + $this->assertStringNotContainsString("\n[ref]x]:", $result); + } + // ==================== Images ==================== public function testImage(): void @@ -195,6 +212,14 @@ public function testLinkWrappingProblematicImageFallsBackToRawHtml(): void $this->assertStringContainsString('a [ b', $htmlBack); } + public function testRawImageFallbackStripsDjotMetadata(): void + { + $result = $this->converter->convert('a [ b'); + + $this->assertSame("`\"a`{=html}\n", $result); + $this->assertStringNotContainsString('data-djot-ref', $result); + } + // ==================== Code ==================== public function testInlineCode(): void @@ -342,8 +367,8 @@ public function testNestedTableDoesNotLeakInnerRowsIntoOuterTable(): void $result = $this->converter->convert($html); - $this->assertStringContainsString('| outer', $result); - $this->assertSame(1, substr_count($result, '| inner |')); + $this->assertSame("| outer \\| inner \\| |\n", $result); + $this->assertStringNotContainsString("\n| inner |", $result); } public function testDivWithoutClassPreservesAttributes(): void @@ -543,6 +568,27 @@ public function testTableCellWithNestedListFallsBackToSingleLineCellText(): void $this->assertStringContainsString('- Item', $htmlBack); } + public function testTableCellEscapesLiteralPipeCharacters(): void + { + $html = '
A | B
'; + $result = $this->converter->convert($html); + + $this->assertSame("| A \\| B |\n", $result); + $htmlBack = (new DjotConverter())->convert($result); + $this->assertStringContainsString('A | B', $htmlBack); + $this->assertStringNotContainsString('A', $htmlBack); + } + + public function testTableCellEscapesPipeCharactersAfterBlockDegradation(): void + { + $html = '

A | B

C

'; + $result = $this->converter->convert($html); + + $this->assertSame("| A \\| B C |\n", $result); + $htmlBack = (new DjotConverter())->convert($result); + $this->assertStringContainsString('A | B C', $htmlBack); + } + public function testTableWithMultilineCaptionKeepsAllCaptionTextInsideCaption(): void { $html = '

cap one

cap two

x
';