Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4618e3e
Add unit tests for streamInfo, baseconverter and exception handling
NadChern Dec 2, 2025
f2905c5
Added unit tests for html_converter, ipynb_converter, outlook_msg_con…
lpetriuc Dec 3, 2025
6564943
Merge branch 'tp3' of https://github.com/NadChern/markitdown into tp3
lpetriuc Dec 3, 2025
f3fe7b5
Tested exiftool.py, llm_caption.py and markitdowify.py
lpetriuc Dec 3, 2025
dc4c602
- Added unit tests for RssConverter
NadChern Dec 3, 2025
c560ced
Added unit tests for markitdown, transcribe_audio, youtube
NadChern Dec 4, 2025
c91ad54
Created _test_audio_converter.py with Unit Tests for the methods in a…
ffreyli Dec 4, 2025
5e2d4ba
Created _test_epub_converter.py with Unit Tests for the methods in ep…
ffreyli Dec 4, 2025
169bc79
Created _test_docx_converter.py with Unit Tests for the methods in do…
ffreyli Dec 4, 2025
dbe5023
Created test_bing_serp_converter.py with Unit Tests for the methods i…
ffreyli Dec 4, 2025
9cf8a86
Created test_image_converter.py with 100% coverage of _image_converte…
ffreyli Dec 4, 2025
fffdb68
Actually achieved 100% coverage on _image_converter.py from test_imag…
ffreyli Dec 4, 2025
d863367
Updated ipynb_converter test_plain_text_converter tests
lpetriuc Dec 4, 2025
bf718b8
updated id names for pytest parameters on test_accepts in test_image_…
ffreyli Dec 4, 2025
372fd64
created unit tests in test_xlsx_converter.py covering _xlsx_converter…
ffreyli Dec 4, 2025
643e90b
96% coverage on _bing_serp_converter.py with unit testing in file tes…
ffreyli Dec 4, 2025
8d8c052
added test_docx_converter_integration.py, for integration testing of …
ffreyli Dec 4, 2025
628b0d7
deleted import of deprecated python3.11 import that was causing test…
ffreyli Dec 5, 2025
7ce11dc
Refactored _CustomMarkdownify by improving heading conversion with pa…
NadChern Dec 6, 2025
7131230
Added error handling for transcript translation in YouTubeConverter
NadChern Dec 6, 2025
de73c7b
Added warning handling for DOCX processing in test_mammoth_files_open…
NadChern Dec 6, 2025
80db7e6
Fix monkeypatching of shutil.which in test_fallback_to_which
NadChern Dec 6, 2025
296cc5e
Refactor heading conversion tests to use updated method signatures an…
NadChern Dec 6, 2025
6f6bfdc
Added fallback mechanism for YouTube transcript retrieval in tests
NadChern Dec 6, 2025
979306e
Added test HTML file for YouTube video metadata
NadChern Dec 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,33 @@ def __init__(self, **options: Any):
# Explicitly cast options to the expected type if necessary
super().__init__(**options)

def convert_hn(
def convert_hN(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
parent_tags: Any = None,
) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
# Get the parent's conversion result
result = super().convert_hN(n, el, text, parent_tags) # type: ignore

# Check if this is an inline heading based on parent tags
convert_as_inline = parent_tags and any(tag in ['p', 'span', 'a'] for tag in parent_tags) if isinstance(parent_tags, list) else False

if convert_as_inline:
# For inline headings, remove all leading newlines
return result.lstrip('\n')
else:
# For block headings, ensure exactly one leading newline
# Strip existing leading newlines and add exactly one
stripped = result.lstrip('\n')
# Only add leading newline if the original text didn't start with one
if not re.search(r"^\n", text):
return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore

return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
return "\n" + stripped
else:
# Text already had a leading newline, don't add another
return stripped

def convert_a(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,9 @@ def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
md_text = ""
if channel_title:
md_text = f"# {channel_title}\n"
md_text += f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
for item in items:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,16 @@ def convert(
if len(languages) == 1:
print(f"Error fetching transcript: {e}")
else:
# Translate transcript into first kwarg
transcript = (
transcript_list.find_transcript(languages)
.translate(youtube_transcript_languages[0])
.fetch()
)
transcript_text = " ".join([part.text for part in transcript])
# Try to translate transcript into first kwarg
try:
transcript = (
transcript_list.find_transcript(languages)
.translate(youtube_transcript_languages[0])
.fetch()
)
transcript_text = " ".join([part.text for part in transcript])
except Exception as translation_error:
print(f"Error translating transcript: {translation_error}")
if transcript_text:
webpage_text += f"\n### Transcript\n{transcript_text}\n"

Expand Down
80 changes: 80 additions & 0 deletions packages/markitdown/tests/test_audio_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import io
from unittest.mock import patch

import pytest

import markitdown.converters._audio_converter
from markitdown import StreamInfo, DocumentConverter, MissingDependencyException


class TestAudioConverter:
    """Unit tests for ``AudioConverter.accepts`` and ``AudioConverter.convert``.

    Each test receives ``stream_info`` as the mock produced by patching
    ``markitdown._stream_info.StreamInfo``. Only the attributes a test assigns
    explicitly hold real values; anything else is an auto-created mock
    attribute.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Build a fresh ``AudioConverter`` before every test."""
        self.converter = markitdown.converters._audio_converter.AudioConverter()

    @pytest.mark.parametrize(
        "input_value, expected_output",
        [
            pytest.param(".mp3", True, id=".mp3_extension_case"),
            pytest.param(".docx", False, id=".docx_extension_case"),
        ],
    )
    @patch("markitdown._stream_info.StreamInfo")
    def test_accepts_valid_extension(self, stream_info, input_value, expected_output):
        """``accepts`` should follow the file extension when the mimetype is bogus."""
        stream_info.extension = input_value
        # A mimetype that matches nothing, so the extension alone decides.
        stream_info.mimetype = "invalid"

        assert self.converter.accepts(io.BytesIO(), stream_info) == expected_output

    @pytest.mark.parametrize(
        "input_value, expected_output",
        [
            pytest.param("audio/mpeg", True, id="audio/mpeg_mimetype_case"),
            pytest.param("video/mp4", True, id="video/mp4_mimetype_case"),
            pytest.param("audio/x-wav", True, id="audio/x-wav_mimetype_case"),
            pytest.param("x-wav", False, id="x-wav_mimetype_case"),
        ],
    )
    @patch("markitdown._stream_info.StreamInfo")
    def test_accepts_valid_mimetype(self, stream_info, input_value, expected_output):
        """``accepts`` should follow the mimetype when no extension is assigned."""
        # NOTE(review): ``extension`` is never assigned here, so it stays an
        # auto-created mock attribute rather than a string or None - confirm
        # that this is the intended "no extension" stand-in.
        stream_info.mimetype = input_value

        assert self.converter.accepts(io.BytesIO(), stream_info) == expected_output

    @pytest.mark.parametrize(
        "mimetype, extension, expected_output",
        [
            pytest.param("audio/x-wav", ".wav", True, id=".wav_format_case"),
            pytest.param("audio/mpeg", ".mp3", True, id=".mp3_format_case"),
            pytest.param("video/mp4", ".mp4", True, id=".mp4_format_case"),
            pytest.param("test", ".test", True, id="no_format_case"),
        ],
    )
    @patch("markitdown.converters._audio_converter.exiftool_metadata")
    @patch("markitdown.converters._audio_converter.transcribe_audio")
    @patch("markitdown._stream_info.StreamInfo")
    def test_convert(
        self, stream_info, mock_transcribe, mock_exif, mimetype, extension, expected_output
    ):
        """``convert`` should render the mocked metadata fields into the markdown.

        ``expected_output`` states whether each metadata line is expected in the
        result; it is ``True`` for every parametrized case.
        """
        # Patches are injected bottom-up: StreamInfo, transcribe_audio, exiftool.
        mock_exif.return_value = {
            "Title": "Song",
            "Artist": "Alice",
            "SampleRate": 48000,
        }
        mock_transcribe.return_value = "hello world"
        stream_info.mimetype = mimetype
        stream_info.extension = extension

        result = self.converter.convert(io.BytesIO(), stream_info)

        for metadata_line in ("Title: Song", "Artist: Alice", "SampleRate: 48000"):
            assert (metadata_line in result.markdown) == expected_output

    @patch("markitdown.converters._audio_converter.transcribe_audio")
    @patch("markitdown._stream_info.StreamInfo")
    def test_convert_raises_exception(self, stream_info, mock_transcribe):
        """``convert`` should return normally when transcription lacks a dependency.

        Despite the test name, no exception is expected to reach the caller:
        the mocked ``transcribe_audio`` raises ``MissingDependencyException``
        and the test only inspects the returned markdown.
        """
        # ARRANGE
        stream_info.mimetype = "audio/mpeg"
        stream_info.extension = ".mp3"
        file_stream = io.BytesIO(b"dummy audio data")
        mock_transcribe.side_effect = MissingDependencyException("missing dependency")

        # ACT
        result = self.converter.convert(file_stream, stream_info)

        # ASSERT
        assert "Audio Transcript" not in result.markdown
        # NOTE(review): exiftool_metadata is not patched in this test, so the
        # empty result presumably relies on it yielding no metadata - confirm.
        assert result.markdown == ""  # no metadata, no transcript
Loading