From aa9df16dd7fb7389455ddf7905cfdce95f59b351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 20:52:29 +0100 Subject: [PATCH 1/7] Stream tar file entries to disk instead of loading into memory When extracting tar archives to disk, stream file entries in chunks instead of reading them fully into memory. Also replace hand-rolled tar parsing in hex_tarball with hex_erl_tar:extract. --- src/hex_erl_tar.erl | 99 ++++++++++++++++-- src/hex_erl_tar.hrl | 6 +- src/hex_tarball.erl | 205 ++++++++++++++++++------------------- test/hex_tarball_SUITE.erl | 50 ++++++++- 4 files changed, 239 insertions(+), 121 deletions(-) diff --git a/src/hex_erl_tar.erl b/src/hex_erl_tar.erl index 89efade..72eb9fc 100644 --- a/src/hex_erl_tar.erl +++ b/src/hex_erl_tar.erl @@ -4,6 +4,9 @@ %% 3. -doc and -moduledoc attributes removed for OTP 24 compatibility %% 4. safe_link_name/2 fixed to validate symlink targets relative to symlink's %% parent directory instead of in isolation +%% 5. When extracting to disk (cwd option), stream file entries in chunks +%% instead of loading them fully into memory +%% 6. Default chunk_size to 65536 in add_opts instead of 0 with special case %% %% OTP commit: 013041bd68c2547848e88963739edea7f0a1a90f %% @@ -147,12 +150,18 @@ extract1(eof, Reader, _, Acc) -> extract1(#tar_header{name=Name,size=Size}=Header, Reader0, Opts, Acc0) -> case check_extract(Name, Opts) of true -> - case do_read(Reader0, Size) of - {ok, Bin, Reader1} -> - Acc = extract2(Header, Bin, Opts, Acc0), - {ok, Acc, Reader1}; - {error, _} = Err -> - throw(Err) + case Opts#read_opts.output of + memory -> + case do_read(Reader0, Size) of + {ok, Bin, Reader1} -> + Acc = extract2(Header, Bin, Opts, Acc0), + {ok, Acc, Reader1}; + {error, _} = Err -> + throw(Err) + end; + file -> + Reader1 = extract_to_file(Header, Reader0, Opts), + {ok, Acc0, Reader1} end; false -> {ok, Acc0, skip_file(Reader0)} @@ -173,6 +182,79 @@ extract2(Header, Bin, Opts, Acc) -> throw(Err) end. +extract_to_file(#tar_header{name=Name0}=Header, Reader0, Opts) -> + case typeflag(Header#tar_header.typeflag) of + regular -> + Name1 = make_safe_path(Name0, Opts), + case stream_to_file(Name1, Reader0, Opts) of + {ok, Reader1} -> + read_verbose(Opts, "x ~ts~n", [Name0]), + set_extracted_file_info(Name1, Header), + Reader1; + {error, _} = Err -> + throw(Err) + end; + _ -> + Reader1 = skip_file(Reader0), + write_extracted_element(Header, <<>>, Opts), + Reader1 + end. + +stream_to_file(Name, Reader0, Opts) -> + Write = + case Opts#read_opts.keep_old_files of + true -> + case file:read_file_info(Name) of + {ok, _} -> false; + _ -> true + end; + false -> true + end, + case Write of + true -> + ChunkSize = Opts#read_opts.chunk_size, + case open_output_file(Name) of + {ok, Fd} -> + try + stream_to_file_loop(Fd, Reader0, ChunkSize) + after + file:close(Fd) + end; + {error, _} = Err -> + Err + end; + false -> + {ok, skip_file(Reader0)} + end. + +open_output_file(Name) -> + case file:open(Name, [write, raw, binary]) of + {ok, _} = Ok -> + Ok; + {error, enoent} -> + ok = make_dirs(Name, file), + file:open(Name, [write, raw, binary]); + {error, _} = Err -> + Err + end. + +stream_to_file_loop(_Fd, #reg_file_reader{num_bytes=0}=Reader, _ChunkSize) -> + {ok, Reader}; +stream_to_file_loop(_Fd, #sparse_file_reader{num_bytes=0}=Reader, _ChunkSize) -> + {ok, Reader}; +stream_to_file_loop(Fd, Reader, ChunkSize) -> + case do_read(Reader, ChunkSize) of + {ok, Bin, Reader1} -> + case file:write(Fd, Bin) of + ok -> + stream_to_file_loop(Fd, Reader1, ChunkSize); + {error, _} = Err -> + Err + end; + {error, _} = Err -> + Err + end. + %% Checks if the file Name should be extracted. check_extract(_, #read_opts{files=all}) -> true; @@ -1833,9 +1915,6 @@ do_write(#reader{handle=Handle,func=Fun}=Reader0, Data) Err end. -do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=0}=Opts) - when is_function(Fun, 2) -> - do_copy(Reader, Source, Opts#add_opts{chunk_size=65536}); do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=ChunkSize}) when is_function(Fun, 2) -> case file:open(Source, [read, binary]) of @@ -2009,6 +2088,8 @@ extract_opts([cooked|Rest], Opts=#read_opts{open_mode=OpenMode}) -> extract_opts(Rest, Opts#read_opts{open_mode=[cooked|OpenMode]}); extract_opts([verbose|Rest], Opts) -> extract_opts(Rest, Opts#read_opts{verbose=true}); +extract_opts([{chunks,N}|Rest], Opts) -> + extract_opts(Rest, Opts#read_opts{chunk_size=N}); extract_opts([Other|Rest], Opts) -> extract_opts(Rest, read_opts([Other], Opts)); extract_opts([], Opts) -> diff --git a/src/hex_erl_tar.hrl b/src/hex_erl_tar.hrl index 7cb6c63..98b095e 100644 --- a/src/hex_erl_tar.hrl +++ b/src/hex_erl_tar.hrl @@ -25,7 +25,7 @@ %% Options used when adding files to a tar archive. -record(add_opts, { read_info, %% Fun to use for read file/link info. - chunk_size = 0, %% For file reading when sending to sftp. 0=do not chunk + chunk_size = 65536, %% Chunk size for reading files. verbose = false, %% Verbose on/off. atime = undefined, mtime = undefined, @@ -42,7 +42,8 @@ files = all, %% Set of files to extract (or all) output = file :: 'file' | 'memory', open_mode = [], %% Open mode options. - verbose = false :: boolean()}). %% Verbose on/off. + verbose = false :: boolean(), %% Verbose on/off. + chunk_size = 65536}). %% Chunk size for streaming to disk. -type read_opts() :: #read_opts{}. -type add_opt() :: dereference | @@ -59,6 +60,7 @@ -type extract_opt() :: {cwd, string()} | {files, [name_in_archive()]} | + {chunks, pos_integer()} | compressed | cooked | memory | diff --git a/src/hex_tarball.erl b/src/hex_tarball.erl index 1ea06ad..7c8d7fa 100644 --- a/src/hex_tarball.erl +++ b/src/hex_tarball.erl @@ -27,7 +27,7 @@ -include_lib("kernel/include/file.hrl"). -type checksum() :: binary(). --type contents() :: #{filename() => binary()}. +-type contents() :: [{filename(), binary()}]. -type filename() :: string(). -type files() :: [{filename(), filename() | binary()}]. -type metadata() :: map(). @@ -159,6 +159,16 @@ create_docs(Files) -> %% Remember to verify the outer tarball checksum against the registry checksum %% returned from `hex_repo:get_package(Config, Package)'. %% +%% The first argument is the tarball, either as a binary or `{file, Path}' +%% to read from a file on disk. Using `{file, Path}' avoids loading the +%% tarball into memory. +%% +%% The second argument controls the output: +%% +%% - `memory' - unpack contents into memory and return them +%% - `none' - only extract metadata and checksums, skip contents +%% - A path string - extract contents to the given directory +%% %% Examples: %% %% ``` @@ -167,12 +177,16 @@ create_docs(Files) -> %% contents => [{"src/foo.erl",<<"-module(foo).">>}], %% metadata => #{<<"name">> => <<"foo">>, ...}}} %% +%% > hex_tarball:unpack(Tarball, none). +%% {ok,#{outer_checksum => <<...>>, +%% metadata => #{<<"name">> => <<"foo">>, ...}}} +%% %% > hex_tarball:unpack(Tarball, "path/to/unpack"). %% {ok,#{outer_checksum => <<...>>, %% metadata => #{<<"name">> => <<"foo">>, ...}}} %% ''' -spec unpack - (tarball(), memory, hex_core:config()) -> + (tarball() | {file, filename()}, memory, hex_core:config()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -180,7 +194,14 @@ create_docs(Files) -> contents => contents() }} | {error, term()}; - (tarball(), filename(), hex_core:config()) -> + (tarball() | {file, filename()}, none, hex_core:config()) -> + {ok, #{ + outer_checksum => checksum(), + inner_checksum => checksum(), + metadata => metadata() + }} + | {error, term()}; + (tarball() | {file, filename()}, filename(), hex_core:config()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -190,13 +211,13 @@ create_docs(Files) -> unpack({file, Path}, Output, Config) -> case valid_file_size(Path, maps:get(tarball_max_size, Config)) of true -> + OuterChecksum = file_checksum(Path), TmpDir = tmp_path(), ok = file:make_dir(TmpDir), try case hex_erl_tar:extract(Path, [{cwd, TmpDir}]) of ok -> - OuterChecksum = file_checksum(Path), - case read_outer_files_from_dir(TmpDir) of + case read_outer_files(TmpDir) of {ok, Files} -> do_unpack(Files, OuterChecksum, Output); {error, _} = Error -> @@ -219,9 +240,12 @@ unpack(Tarball, Output, Config) -> {error, {tarball, empty}}; {ok, FileList} -> OuterChecksum = crypto:hash(sha256, Tarball), - do_unpack( - validate_outer_file_sizes(maps:from_list(FileList)), OuterChecksum, Output - ); + case validate_outer_file_sizes(maps:from_list(FileList)) of + {ok, Files} -> + do_unpack(Files, OuterChecksum, Output); + {error, _} = Error -> + Error + end; {error, Reason} -> {error, {tarball, Reason}} end; @@ -234,7 +258,7 @@ unpack(Tarball, Output, Config) -> %% %% @see unpack/3 -spec unpack - (tarball(), memory) -> + (tarball() | {file, filename()}, memory) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -242,7 +266,14 @@ unpack(Tarball, Output, Config) -> contents => contents() }} | {error, term()}; - (tarball(), filename()) -> + (tarball() | {file, filename()}, none) -> + {ok, #{ + outer_checksum => checksum(), + inner_checksum => checksum(), + metadata => metadata() + }} + | {error, term()}; + (tarball() | {file, filename()}, filename()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -356,8 +387,6 @@ encode_metadata(Meta) -> iolist_to_binary(Data). %% @private -do_unpack({error, _} = Error, _OuterChecksum, _Output) -> - Error; do_unpack(Files, OuterChecksum, Output) -> State = #{ inner_checksum => undefined, @@ -386,54 +415,47 @@ finish_unpack(#{ _ = maps:get("VERSION", Files), Contents = maps:get("contents.tar.gz", Files), - case {Contents, Output} of - {{path, _}, memory} -> - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; - {{path, ContentsPath}, _} -> - filelib:ensure_dir(filename:join(Output, "*")), - {ok, ContentsBinary} = file:read_file(ContentsPath), - case unpack_tarball(ContentsBinary, Output) of - ok -> - copy_metadata_config(Output, maps:get("metadata.config", Files)), - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; - {error, Reason} -> - {error, {inner_tarball, Reason}} - end; - {ContentsBinary, memory} -> - case unpack_tarball(ContentsBinary, Output) of + Result = #{ + inner_checksum => InnerChecksum, + outer_checksum => OuterChecksum, + metadata => Metadata + }, + + case Output of + none -> + {ok, Result}; + memory -> + case unpack_contents(Contents, memory) of {ok, UnpackedContents} -> - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata, - contents => UnpackedContents - }}; + {ok, Result#{contents => UnpackedContents}}; {error, Reason} -> {error, {inner_tarball, Reason}} end; - {ContentsBinary, _} -> + _ -> filelib:ensure_dir(filename:join(Output, "*")), - case unpack_tarball(ContentsBinary, Output) of + case unpack_contents(Contents, Output) of ok -> + [ + try_updating_mtime(filename:join(Output, P)) + || P <- filelib:wildcard("**", Output) + ], copy_metadata_config(Output, maps:get("metadata.config", Files)), - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; + {ok, Result}; {error, Reason} -> {error, {inner_tarball, Reason}} end end. +%% @private +unpack_contents({path, ContentsPath}, memory) -> + hex_erl_tar:extract(ContentsPath, [memory, compressed]); +unpack_contents({path, ContentsPath}, Output) -> + hex_erl_tar:extract(ContentsPath, [{cwd, Output}, compressed]); +unpack_contents(ContentsBinary, memory) -> + hex_erl_tar:extract({binary, ContentsBinary}, [memory, compressed]); +unpack_contents(ContentsBinary, Output) -> + hex_erl_tar:extract({binary, ContentsBinary}, [{cwd, Output}, compressed]). + %% @private copy_metadata_config(Output, MetadataBinary) -> ok = file:write_file(filename:join(Output, "hex_metadata.config"), MetadataBinary). @@ -732,68 +754,35 @@ stream_file_hash_loop(Fd, HashState) -> end. %% @private -read_outer_files_from_dir(Dir) -> - VersionPath = filename:join(Dir, "VERSION"), - ChecksumPath = filename:join(Dir, "CHECKSUM"), - MetadataPath = filename:join(Dir, "metadata.config"), - ContentsPath = filename:join(Dir, "contents.tar.gz"), - - case - { - filelib:is_regular(VersionPath), - filelib:is_regular(ChecksumPath), - filelib:is_regular(MetadataPath), - filelib:is_regular(ContentsPath) - } - of - {true, true, true, true} -> - case check_outer_file_sizes(VersionPath, ChecksumPath, MetadataPath) of - ok -> - {ok, Version} = file:read_file(VersionPath), - {ok, Checksum} = file:read_file(ChecksumPath), - {ok, MetadataConfig} = file:read_file(MetadataPath), - {ok, #{ - "VERSION" => Version, - "CHECKSUM" => Checksum, - "metadata.config" => MetadataConfig, - "contents.tar.gz" => {path, ContentsPath} - }}; - {error, _} = Error -> - Error - end; - _ -> - Missing = lists:filtermap( - fun({Path, Name}) -> - case filelib:is_regular(Path) of - true -> false; - false -> {true, Name} - end - end, - [ - {VersionPath, "VERSION"}, - {ChecksumPath, "CHECKSUM"}, - {MetadataPath, "metadata.config"}, - {ContentsPath, "contents.tar.gz"} - ] - ), - {error, {tarball, {missing_files, Missing}}} +%% Reads outer tar files from a directory after extraction. +%% Small files (VERSION, CHECKSUM, metadata.config) are read into memory. +%% contents.tar.gz is referenced by path. +read_outer_files(Dir) -> + RequiredFiles = ["VERSION", "CHECKSUM", "metadata.config", "contents.tar.gz"], + case read_outer_files(Dir, RequiredFiles, #{}) of + {ok, Files} -> + validate_outer_file_sizes(Files); + {error, _} = Error -> + Error end. -%% @private -check_outer_file_sizes(VersionPath, ChecksumPath, MetadataPath) -> - case valid_file_size(VersionPath, ?MAX_VERSION_SIZE) of - false -> - {error, {tarball, {file_too_big, "VERSION"}}}; +read_outer_files(_Dir, [], Acc) -> + {ok, Acc}; +read_outer_files(Dir, ["contents.tar.gz" | Rest], Acc) -> + Path = filename:join(Dir, "contents.tar.gz"), + case filelib:is_regular(Path) of true -> - case valid_file_size(ChecksumPath, ?MAX_CHECKSUM_SIZE) of - false -> - {error, {tarball, {file_too_big, "CHECKSUM"}}}; - true -> - case valid_file_size(MetadataPath, ?MAX_METADATA_SIZE) of - false -> {error, {tarball, {file_too_big, "metadata.config"}}}; - true -> ok - end - end + read_outer_files(Dir, Rest, Acc#{"contents.tar.gz" => {path, Path}}); + false -> + {error, {tarball, {missing_files, ["contents.tar.gz"]}}} + end; +read_outer_files(Dir, [Name | Rest], Acc) -> + Path = filename:join(Dir, Name), + case file:read_file(Path) of + {ok, Data} -> + read_outer_files(Dir, Rest, Acc#{Name => Data}); + {error, enoent} -> + {error, {tarball, {missing_files, [Name]}}} end. %% @private @@ -808,7 +797,7 @@ validate_outer_file_sizes(Files) -> false -> case byte_size(maps:get("metadata.config", Files, <<>>)) > ?MAX_METADATA_SIZE of true -> {error, {tarball, {file_too_big, "metadata.config"}}}; - false -> Files + false -> {ok, Files} end end end. diff --git a/test/hex_tarball_SUITE.erl b/test/hex_tarball_SUITE.erl index 07b27bf..bcb4131 100644 --- a/test/hex_tarball_SUITE.erl +++ b/test/hex_tarball_SUITE.erl @@ -22,7 +22,9 @@ all() -> too_big_to_unpack_test, docs_too_big_to_create_test, docs_too_big_to_unpack_test, + none_test, file_unpack_memory_test, + file_unpack_none_test, file_unpack_disk_test, file_unpack_too_big_test, file_unpack_oversized_inner_files_test, @@ -440,6 +442,26 @@ docs_test(Config) -> ok. +none_test(_Config) -> + Metadata = #{ + <<"name">> => <<"foo">>, + <<"version">> => <<"1.0.0">>, + <<"build_tool">> => <<"rebar3">> + }, + Contents = [{"src/foo.erl", <<"-module(foo).">>}], + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Unpack with none output - should return metadata and checksums but no contents + {ok, + #{ + inner_checksum := InnerChecksum, + outer_checksum := OuterChecksum, + metadata := Metadata + } = Result} = hex_tarball:unpack(Tarball, none), + false = maps:is_key(contents, Result), + ok. + file_unpack_memory_test(Config) -> BaseDir = ?config(priv_dir, Config), Metadata = #{ @@ -455,13 +477,37 @@ file_unpack_memory_test(Config) -> TarballPath = filename:join(BaseDir, "test_file_unpack.tar"), ok = file:write_file(TarballPath, Tarball), - %% Unpack from file to memory - should return metadata and checksums but no contents + %% Unpack from file to memory - should return metadata, checksums, and contents + {ok, #{ + inner_checksum := InnerChecksum, + outer_checksum := OuterChecksum, + metadata := Metadata, + contents := Contents + }} = hex_tarball:unpack({file, TarballPath}, memory), + ok. + +file_unpack_none_test(Config) -> + BaseDir = ?config(priv_dir, Config), + Metadata = #{ + <<"name">> => <<"foo">>, + <<"version">> => <<"1.0.0">>, + <<"build_tool">> => <<"rebar3">> + }, + Contents = [{"src/foo.erl", <<"-module(foo).">>}], + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Write tarball to file + TarballPath = filename:join(BaseDir, "test_file_unpack_none.tar"), + ok = file:write_file(TarballPath, Tarball), + + %% Unpack from file with none output - should return metadata and checksums but no contents {ok, #{ inner_checksum := InnerChecksum, outer_checksum := OuterChecksum, metadata := Metadata - } = Result} = hex_tarball:unpack({file, TarballPath}, memory), + } = Result} = hex_tarball:unpack({file, TarballPath}, none), false = maps:is_key(contents, Result), ok. From fe5e0f79ccb7e3bf796ca931ed2ec5ba623a857d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 21:08:03 +0100 Subject: [PATCH 2/7] Add streamed extraction test for various file sizes Backport the streamed_extract test from erlang/otp#10818 to verify that files of various sizes (empty, small, chunk-boundary, and large) are correctly extracted when streaming to disk. --- test/hex_tarball_SUITE.erl | 62 +++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/test/hex_tarball_SUITE.erl b/test/hex_tarball_SUITE.erl index bcb4131..1a80087 100644 --- a/test/hex_tarball_SUITE.erl +++ b/test/hex_tarball_SUITE.erl @@ -29,7 +29,8 @@ all() -> file_unpack_too_big_test, file_unpack_oversized_inner_files_test, oversized_outer_files_test, - too_big_metadata_to_create_test + too_big_metadata_to_create_test, + streamed_extract_test ]. too_big_to_create_test(_Config) -> @@ -601,6 +602,65 @@ too_big_metadata_to_create_test(_Config) -> hex_tarball:create(Metadata, Contents), ok. +%% Test that extracting to disk streams file entries in chunks +%% instead of loading them fully into memory. +streamed_extract_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Metadata = #{<<"name">> => <<"foo">>, <<"version">> => <<"1.0.0">>}, + + %% Create test files of various sizes + EmptyData = <<>>, + SmallData = <<"hello">>, + + %% A file exactly equal to the default chunk size (65536 bytes) + ChunkSize = 65536, + BoundaryData = crypto:strong_rand_bytes(ChunkSize), + + %% A file larger than the default chunk size + LargeSize = 200000, + LargeData = crypto:strong_rand_bytes(LargeSize), + + Contents = [ + {"empty", EmptyData}, + {"small", SmallData}, + {"boundary", BoundaryData}, + {"large", LargeData} + ], + + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Extract from binary to disk and verify contents + UnpackDir1 = filename:join(BaseDir, "streamed_extract_binary"), + {ok, #{inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:unpack(Tarball, UnpackDir1), + {ok, EmptyData} = file:read_file(filename:join(UnpackDir1, "empty")), + {ok, SmallData} = file:read_file(filename:join(UnpackDir1, "small")), + {ok, BoundaryData} = file:read_file(filename:join(UnpackDir1, "boundary")), + {ok, LargeData} = file:read_file(filename:join(UnpackDir1, "large")), + + %% Extract from file to disk and verify contents + TarballPath = filename:join(BaseDir, "streamed_extract.tar"), + ok = file:write_file(TarballPath, Tarball), + UnpackDir2 = filename:join(BaseDir, "streamed_extract_file"), + {ok, #{inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:unpack({file, TarballPath}, UnpackDir2), + {ok, EmptyData} = file:read_file(filename:join(UnpackDir2, "empty")), + {ok, SmallData} = file:read_file(filename:join(UnpackDir2, "small")), + {ok, BoundaryData} = file:read_file(filename:join(UnpackDir2, "boundary")), + {ok, LargeData} = file:read_file(filename:join(UnpackDir2, "large")), + + %% Verify that memory extraction still works (not affected by streaming) + {ok, #{contents := MemContents}} = hex_tarball:unpack(Tarball, memory), + MemMap = maps:from_list(MemContents), + EmptyData = maps:get("empty", MemMap), + SmallData = maps:get("small", MemMap), + BoundaryData = maps:get("boundary", MemMap), + LargeData = maps:get("large", MemMap), + + ok. + %%==================================================================== %% Helpers %%==================================================================== From 3e5521893af94eb4dbda404d8bf9f96c352b4b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 21:29:24 +0100 Subject: [PATCH 3/7] Fix OTP 24 compat: use compressed instead of compressed_one The compressed_one option for file:open is not available on OTP 24, causing file-based extraction of compressed tar entries to silently open files without decompression. Use compressed instead, which has the same behavior for single-member gzip files like contents.tar.gz. --- src/hex_erl_tar.erl | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/hex_erl_tar.erl b/src/hex_erl_tar.erl index 72eb9fc..68d67aa 100644 --- a/src/hex_erl_tar.erl +++ b/src/hex_erl_tar.erl @@ -7,6 +7,7 @@ %% 5. When extracting to disk (cwd option), stream file entries in chunks %% instead of loading them fully into memory %% 6. Default chunk_size to 65536 in add_opts instead of 0 with special case +%% 7. Use compressed instead of compressed_one for file:open for OTP 24 compat %% %% OTP commit: 013041bd68c2547848e88963739edea7f0a1a90f %% @@ -464,13 +465,7 @@ open1({file, Fd}=Handle, read, [raw], Opts) -> end; open1({file, _Fd}=Handle, read, [], _Opts) -> {error, {Handle, {incompatible_option, cooked}}}; -open1(Name, Access, Raw, Opts0) when is_list(Name); is_binary(Name) -> - Opts = case lists:member(compressed, Opts0) andalso Access == read of - true -> - [compressed_one | (Opts0 -- [compressed])]; - false -> - Opts0 - end, +open1(Name, Access, Raw, Opts) when is_list(Name); is_binary(Name) -> case file:open(Name, Raw ++ [binary, Access|Opts]) of {ok, File} -> {ok, #reader{handle=File,access=Access,func=fun file_op/2}}; From 0bc87a93fa574e05c20043ec9d8933516cac18c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 21:31:52 +0100 Subject: [PATCH 4/7] Fix dialyzer warnings for unmatched return values in hex_erl_tar --- src/hex_erl_tar.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hex_erl_tar.erl b/src/hex_erl_tar.erl index 68d67aa..1e1c1c0 100644 --- a/src/hex_erl_tar.erl +++ b/src/hex_erl_tar.erl @@ -190,14 +190,14 @@ extract_to_file(#tar_header{name=Name0}=Header, Reader0, Opts) -> case stream_to_file(Name1, Reader0, Opts) of {ok, Reader1} -> read_verbose(Opts, "x ~ts~n", [Name0]), - set_extracted_file_info(Name1, Header), + _ = set_extracted_file_info(Name1, Header), Reader1; {error, _} = Err -> throw(Err) end; _ -> Reader1 = skip_file(Reader0), - write_extracted_element(Header, <<>>, Opts), + _ = write_extracted_element(Header, <<>>, Opts), Reader1 end. From 16341284bb4a39a1bdb1622db4b9d2a810b5f14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 22:07:19 +0100 Subject: [PATCH 5/7] Fix hex_erl_tar.hrl header comment to document modifications --- src/hex_erl_tar.hrl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hex_erl_tar.hrl b/src/hex_erl_tar.hrl index 98b095e..f093812 100644 --- a/src/hex_erl_tar.hrl +++ b/src/hex_erl_tar.hrl @@ -1,4 +1,7 @@ -%% This file is a copy of erl_tar.hrl from OTP with no modifications. +%% This file is a copy of erl_tar.hrl from OTP with the following modifications: +%% 1. Added chunk_size field to #read_opts{} for streaming extraction to disk +%% 2. Added {chunks, pos_integer()} to extract_opt() type +%% 3. Default chunk_size to 65536 in #add_opts{} instead of 0 %% %% OTP commit: 013041bd68c2547848e88963739edea7f0a1a90f %% From 3b00ef3a7827cf049e4980affaff86ede7c6412f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 23:09:23 +0100 Subject: [PATCH 6/7] Refactor unpack/3 to let Output drive outer extraction strategy and add {file, Path} support to unpack_docs Previously the outer tarball extraction mode was tied to the input format. Now Output drives the strategy: memory extracts to memory, path/none extracts to a temp dir. Also adds {file, Path} input support to unpack_docs/2,3. --- src/hex_tarball.erl | 101 +++++++++++++++++++++++-------------- test/hex_tarball_SUITE.erl | 44 +++++++++++++++- 2 files changed, 107 insertions(+), 38 deletions(-) diff --git a/src/hex_tarball.erl b/src/hex_tarball.erl index 7c8d7fa..88af262 100644 --- a/src/hex_tarball.erl +++ b/src/hex_tarball.erl @@ -208,14 +208,36 @@ create_docs(Files) -> metadata => metadata() }} | {error, term()}. -unpack({file, Path}, Output, Config) -> - case valid_file_size(Path, maps:get(tarball_max_size, Config)) of +unpack(Input, memory, Config) -> + case check_input_size(Input, Config) of true -> - OuterChecksum = file_checksum(Path), + OuterChecksum = outer_checksum(Input), + Source = tar_source(Input), + case hex_erl_tar:extract(Source, [memory]) of + {ok, []} -> + {error, {tarball, empty}}; + {ok, FileList} -> + case validate_outer_file_sizes(maps:from_list(FileList)) of + {ok, Files} -> + do_unpack(Files, OuterChecksum, memory); + {error, _} = Error -> + Error + end; + {error, Reason} -> + {error, {tarball, Reason}} + end; + false -> + {error, {tarball, too_big}} + end; +unpack(Input, Output, Config) -> + case check_input_size(Input, Config) of + true -> + OuterChecksum = outer_checksum(Input), + Source = tar_source(Input), TmpDir = tmp_path(), ok = file:make_dir(TmpDir), try - case hex_erl_tar:extract(Path, [{cwd, TmpDir}]) of + case hex_erl_tar:extract(Source, [{cwd, TmpDir}]) of ok -> case read_outer_files(TmpDir) of {ok, Files} -> @@ -231,26 +253,6 @@ unpack({file, Path}, Output, Config) -> end; false -> {error, {tarball, too_big}} - end; -unpack(Tarball, Output, Config) -> - case valid_size(Tarball, maps:get(tarball_max_size, Config)) of - true -> - case hex_erl_tar:extract({binary, Tarball}, [memory]) of - {ok, []} -> - {error, {tarball, empty}}; - {ok, FileList} -> - OuterChecksum = crypto:hash(sha256, Tarball), - case validate_outer_file_sizes(maps:from_list(FileList)) of - {ok, Files} -> - do_unpack(Files, OuterChecksum, Output); - {error, _} = Error -> - Error - end; - {error, Reason} -> - {error, {tarball, Reason}} - end; - false -> - {error, {tarball, too_big}} end. %% @doc @@ -286,6 +288,10 @@ unpack(Tarball, Output) -> %% @doc %% Unpacks a documentation tarball. %% +%% The first argument is the tarball, either as a binary or `{file, Path}' +%% to read from a file on disk. Using `{file, Path}' avoids loading the +%% tarball into memory. +%% %% Examples: %% %% ``` @@ -296,21 +302,22 @@ unpack(Tarball, Output) -> %% ok %% ''' -spec unpack_docs - (tarball(), memory, hex_core:config()) -> {ok, contents()} | {error, term()}; - (tarball(), filename(), hex_core:config()) -> ok | {error, term()}. -unpack_docs(Tarball, Output, Config) -> - case valid_size(Tarball, maps:get(docs_tarball_max_size, Config)) of + (tarball() | {file, filename()}, memory, hex_core:config()) -> + {ok, contents()} | {error, term()}; + (tarball() | {file, filename()}, filename(), hex_core:config()) -> ok | {error, term()}. +unpack_docs(Input, Output, Config) -> + case check_docs_input_size(Input, Config) of true -> - unpack_tarball(Tarball, Output); + unpack_tarball(tar_source(Input), Output); false -> {error, {tarball, too_big}} end. -spec unpack_docs - (tarball(), memory) -> {ok, contents()} | {error, term()}; - (tarball(), filename()) -> ok | {error, term()}. -unpack_docs(Tarball, Output) -> - unpack_docs(Tarball, Output, hex_core:default_config()). + (tarball() | {file, filename()}, memory) -> {ok, contents()} | {error, term()}; + (tarball() | {file, filename()}, filename()) -> ok | {error, term()}. +unpack_docs(Input, Output) -> + unpack_docs(Input, Output, hex_core:default_config()). %% @doc %% Returns base16-encoded representation of checksum. @@ -375,6 +382,26 @@ inner_checksum(Version, MetadataBinary, ContentsBinary) -> checksum(ContentsBinary) when is_binary(ContentsBinary) -> crypto:hash(sha256, ContentsBinary). +%% @private +tar_source({file, Path}) -> Path; +tar_source(Tarball) -> {binary, Tarball}. + +%% @private +outer_checksum({file, Path}) -> file_checksum(Path); +outer_checksum(Tarball) -> crypto:hash(sha256, Tarball). + +%% @private +check_input_size({file, Path}, Config) -> + valid_file_size(Path, maps:get(tarball_max_size, Config)); +check_input_size(Tarball, Config) -> + valid_size(Tarball, maps:get(tarball_max_size, Config)). + +%% @private +check_docs_input_size({file, Path}, Config) -> + valid_file_size(Path, maps:get(docs_tarball_max_size, Config)); +check_docs_input_size(Tarball, Config) -> + valid_size(Tarball, maps:get(docs_tarball_max_size, Config)). + %% @private encode_metadata(Meta) -> Data = lists:map( @@ -590,11 +617,11 @@ guess_build_tools(Metadata) -> %%==================================================================== %% @private -unpack_tarball(ContentsBinary, memory) -> - hex_erl_tar:extract({binary, ContentsBinary}, [memory, compressed]); -unpack_tarball(ContentsBinary, Output) -> +unpack_tarball(Source, memory) -> + hex_erl_tar:extract(Source, [memory, compressed]); +unpack_tarball(Source, Output) -> filelib:ensure_dir(filename:join(Output, "*")), - case hex_erl_tar:extract({binary, ContentsBinary}, [{cwd, Output}, compressed]) of + case hex_erl_tar:extract(Source, [{cwd, Output}, compressed]) of ok -> [ try_updating_mtime(filename:join(Output, Path)) diff --git a/test/hex_tarball_SUITE.erl b/test/hex_tarball_SUITE.erl index 1a80087..d003c59 100644 --- a/test/hex_tarball_SUITE.erl +++ b/test/hex_tarball_SUITE.erl @@ -30,7 +30,10 @@ all() -> file_unpack_oversized_inner_files_test, oversized_outer_files_test, too_big_metadata_to_create_test, - streamed_extract_test + streamed_extract_test, + file_unpack_docs_memory_test, + file_unpack_docs_disk_test, + file_unpack_docs_too_big_test ]. too_big_to_create_test(_Config) -> @@ -661,6 +664,45 @@ streamed_extract_test(Config) -> ok. +file_unpack_docs_memory_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + {ok, Files} = hex_tarball:unpack_docs({file, TarballPath}, memory), + + ok. + +file_unpack_docs_disk_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs_disk.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + UnpackDir = filename:join(BaseDir, "unpack_file_docs"), + ok = hex_tarball:unpack_docs({file, TarballPath}, UnpackDir), + {ok, <<"Docs">>} = file:read_file(filename:join(UnpackDir, "index.html")), + + ok. + +file_unpack_docs_too_big_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs_big.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + SmallConfig = maps:put(docs_tarball_max_size, 10, hex_core:default_config()), + {error, {tarball, too_big}} = hex_tarball:unpack_docs({file, TarballPath}, memory, SmallConfig), + + ok. + %%==================================================================== %% Helpers %%==================================================================== From 7ecdf1c644e3e4b2742814090d4d946f7b47d8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 8 Mar 2026 23:35:28 +0100 Subject: [PATCH 7/7] Handle all file read errors in read_outer_files, not just enoent --- src/hex_tarball.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hex_tarball.erl b/src/hex_tarball.erl index 88af262..bab4afd 100644 --- a/src/hex_tarball.erl +++ b/src/hex_tarball.erl @@ -808,7 +808,7 @@ read_outer_files(Dir, [Name | Rest], Acc) -> case file:read_file(Path) of {ok, Data} -> read_outer_files(Dir, Rest, Acc#{Name => Data}); - {error, enoent} -> + {error, _} -> {error, {tarball, {missing_files, [Name]}}} end.