diff --git a/src/hex_erl_tar.erl b/src/hex_erl_tar.erl index 89efade..1e1c1c0 100644 --- a/src/hex_erl_tar.erl +++ b/src/hex_erl_tar.erl @@ -4,6 +4,10 @@ %% 3. -doc and -moduledoc attributes removed for OTP 24 compatibility %% 4. safe_link_name/2 fixed to validate symlink targets relative to symlink's %% parent directory instead of in isolation +%% 5. When extracting to disk (cwd option), stream file entries in chunks +%% instead of loading them fully into memory +%% 6. Default chunk_size to 65536 in add_opts instead of 0 with special case +%% 7. Use compressed instead of compressed_one for file:open for OTP 24 compat %% %% OTP commit: 013041bd68c2547848e88963739edea7f0a1a90f %% @@ -147,12 +151,18 @@ extract1(eof, Reader, _, Acc) -> extract1(#tar_header{name=Name,size=Size}=Header, Reader0, Opts, Acc0) -> case check_extract(Name, Opts) of true -> - case do_read(Reader0, Size) of - {ok, Bin, Reader1} -> - Acc = extract2(Header, Bin, Opts, Acc0), - {ok, Acc, Reader1}; - {error, _} = Err -> - throw(Err) + case Opts#read_opts.output of + memory -> + case do_read(Reader0, Size) of + {ok, Bin, Reader1} -> + Acc = extract2(Header, Bin, Opts, Acc0), + {ok, Acc, Reader1}; + {error, _} = Err -> + throw(Err) + end; + file -> + Reader1 = extract_to_file(Header, Reader0, Opts), + {ok, Acc0, Reader1} end; false -> {ok, Acc0, skip_file(Reader0)} @@ -173,6 +183,79 @@ extract2(Header, Bin, Opts, Acc) -> throw(Err) end. +extract_to_file(#tar_header{name=Name0}=Header, Reader0, Opts) -> + case typeflag(Header#tar_header.typeflag) of + regular -> + Name1 = make_safe_path(Name0, Opts), + case stream_to_file(Name1, Reader0, Opts) of + {ok, Reader1} -> + read_verbose(Opts, "x ~ts~n", [Name0]), + _ = set_extracted_file_info(Name1, Header), + Reader1; + {error, _} = Err -> + throw(Err) + end; + _ -> + Reader1 = skip_file(Reader0), + _ = write_extracted_element(Header, <<>>, Opts), + Reader1 + end. + +stream_to_file(Name, Reader0, Opts) -> + Write = + case Opts#read_opts.keep_old_files of + true -> + case file:read_file_info(Name) of + {ok, _} -> false; + _ -> true + end; + false -> true + end, + case Write of + true -> + ChunkSize = Opts#read_opts.chunk_size, + case open_output_file(Name) of + {ok, Fd} -> + try + stream_to_file_loop(Fd, Reader0, ChunkSize) + after + file:close(Fd) + end; + {error, _} = Err -> + Err + end; + false -> + {ok, skip_file(Reader0)} + end. + +open_output_file(Name) -> + case file:open(Name, [write, raw, binary]) of + {ok, _} = Ok -> + Ok; + {error, enoent} -> + ok = make_dirs(Name, file), + file:open(Name, [write, raw, binary]); + {error, _} = Err -> + Err + end. + +stream_to_file_loop(_Fd, #reg_file_reader{num_bytes=0}=Reader, _ChunkSize) -> + {ok, Reader}; +stream_to_file_loop(_Fd, #sparse_file_reader{num_bytes=0}=Reader, _ChunkSize) -> + {ok, Reader}; +stream_to_file_loop(Fd, Reader, ChunkSize) -> + case do_read(Reader, ChunkSize) of + {ok, Bin, Reader1} -> + case file:write(Fd, Bin) of + ok -> + stream_to_file_loop(Fd, Reader1, ChunkSize); + {error, _} = Err -> + Err + end; + {error, _} = Err -> + Err + end. + %% Checks if the file Name should be extracted. check_extract(_, #read_opts{files=all}) -> true; @@ -382,13 +465,7 @@ open1({file, Fd}=Handle, read, [raw], Opts) -> end; open1({file, _Fd}=Handle, read, [], _Opts) -> {error, {Handle, {incompatible_option, cooked}}}; -open1(Name, Access, Raw, Opts0) when is_list(Name); is_binary(Name) -> - Opts = case lists:member(compressed, Opts0) andalso Access == read of - true -> - [compressed_one | (Opts0 -- [compressed])]; - false -> - Opts0 - end, +open1(Name, Access, Raw, Opts) when is_list(Name); is_binary(Name) -> case file:open(Name, Raw ++ [binary, Access|Opts]) of {ok, File} -> {ok, #reader{handle=File,access=Access,func=fun file_op/2}}; @@ -1833,9 +1910,6 @@ do_write(#reader{handle=Handle,func=Fun}=Reader0, Data) Err end. -do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=0}=Opts) - when is_function(Fun, 2) -> - do_copy(Reader, Source, Opts#add_opts{chunk_size=65536}); do_copy(#reader{func=Fun}=Reader, Source, #add_opts{chunk_size=ChunkSize}) when is_function(Fun, 2) -> case file:open(Source, [read, binary]) of @@ -2009,6 +2083,8 @@ extract_opts([cooked|Rest], Opts=#read_opts{open_mode=OpenMode}) -> extract_opts(Rest, Opts#read_opts{open_mode=[cooked|OpenMode]}); extract_opts([verbose|Rest], Opts) -> extract_opts(Rest, Opts#read_opts{verbose=true}); +extract_opts([{chunks,N}|Rest], Opts) -> + extract_opts(Rest, Opts#read_opts{chunk_size=N}); extract_opts([Other|Rest], Opts) -> extract_opts(Rest, read_opts([Other], Opts)); extract_opts([], Opts) -> diff --git a/src/hex_erl_tar.hrl b/src/hex_erl_tar.hrl index 7cb6c63..f093812 100644 --- a/src/hex_erl_tar.hrl +++ b/src/hex_erl_tar.hrl @@ -1,4 +1,7 @@ -%% This file is a copy of erl_tar.hrl from OTP with no modifications. +%% This file is a copy of erl_tar.hrl from OTP with the following modifications: +%% 1. Added chunk_size field to #read_opts{} for streaming extraction to disk +%% 2. Added {chunks, pos_integer()} to extract_opt() type +%% 3. Default chunk_size to 65536 in #add_opts{} instead of 0 %% %% OTP commit: 013041bd68c2547848e88963739edea7f0a1a90f %% @@ -25,7 +28,7 @@ %% Options used when adding files to a tar archive. -record(add_opts, { read_info, %% Fun to use for read file/link info. - chunk_size = 0, %% For file reading when sending to sftp. 0=do not chunk + chunk_size = 65536, %% Chunk size for reading files. verbose = false, %% Verbose on/off. atime = undefined, mtime = undefined, @@ -42,7 +45,8 @@ files = all, %% Set of files to extract (or all) output = file :: 'file' | 'memory', open_mode = [], %% Open mode options. - verbose = false :: boolean()}). %% Verbose on/off. + verbose = false :: boolean(), %% Verbose on/off. + chunk_size = 65536}). %% Chunk size for streaming to disk. -type read_opts() :: #read_opts{}. -type add_opt() :: dereference | @@ -59,6 +63,7 @@ -type extract_opt() :: {cwd, string()} | {files, [name_in_archive()]} | + {chunks, pos_integer()} | compressed | cooked | memory | diff --git a/src/hex_tarball.erl b/src/hex_tarball.erl index 1ea06ad..bab4afd 100644 --- a/src/hex_tarball.erl +++ b/src/hex_tarball.erl @@ -27,7 +27,7 @@ -include_lib("kernel/include/file.hrl"). -type checksum() :: binary(). --type contents() :: #{filename() => binary()}. +-type contents() :: [{filename(), binary()}]. -type filename() :: string(). -type files() :: [{filename(), filename() | binary()}]. -type metadata() :: map(). @@ -159,6 +159,16 @@ create_docs(Files) -> %% Remember to verify the outer tarball checksum against the registry checksum %% returned from `hex_repo:get_package(Config, Package)'. %% +%% The first argument is the tarball, either as a binary or `{file, Path}' +%% to read from a file on disk. Using `{file, Path}' avoids loading the +%% tarball into memory. +%% +%% The second argument controls the output: +%% +%% - `memory' - unpack contents into memory and return them +%% - `none' - only extract metadata and checksums, skip contents +%% - A path string - extract contents to the given directory +%% %% Examples: %% %% ``` @@ -167,12 +177,16 @@ create_docs(Files) -> %% contents => [{"src/foo.erl",<<"-module(foo).">>}], %% metadata => #{<<"name">> => <<"foo">>, ...}}} %% +%% > hex_tarball:unpack(Tarball, none). +%% {ok,#{outer_checksum => <<...>>, +%% metadata => #{<<"name">> => <<"foo">>, ...}}} +%% %% > hex_tarball:unpack(Tarball, "path/to/unpack"). %% {ok,#{outer_checksum => <<...>>, %% metadata => #{<<"name">> => <<"foo">>, ...}}} %% ''' -spec unpack - (tarball(), memory, hex_core:config()) -> + (tarball() | {file, filename()}, memory, hex_core:config()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -180,23 +194,52 @@ create_docs(Files) -> contents => contents() }} | {error, term()}; - (tarball(), filename(), hex_core:config()) -> + (tarball() | {file, filename()}, none, hex_core:config()) -> + {ok, #{ + outer_checksum => checksum(), + inner_checksum => checksum(), + metadata => metadata() + }} + | {error, term()}; + (tarball() | {file, filename()}, filename(), hex_core:config()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), metadata => metadata() }} | {error, term()}. -unpack({file, Path}, Output, Config) -> - case valid_file_size(Path, maps:get(tarball_max_size, Config)) of +unpack(Input, memory, Config) -> + case check_input_size(Input, Config) of true -> + OuterChecksum = outer_checksum(Input), + Source = tar_source(Input), + case hex_erl_tar:extract(Source, [memory]) of + {ok, []} -> + {error, {tarball, empty}}; + {ok, FileList} -> + case validate_outer_file_sizes(maps:from_list(FileList)) of + {ok, Files} -> + do_unpack(Files, OuterChecksum, memory); + {error, _} = Error -> + Error + end; + {error, Reason} -> + {error, {tarball, Reason}} + end; + false -> + {error, {tarball, too_big}} + end; +unpack(Input, Output, Config) -> + case check_input_size(Input, Config) of + true -> + OuterChecksum = outer_checksum(Input), + Source = tar_source(Input), TmpDir = tmp_path(), ok = file:make_dir(TmpDir), try - case hex_erl_tar:extract(Path, [{cwd, TmpDir}]) of + case hex_erl_tar:extract(Source, [{cwd, TmpDir}]) of ok -> - OuterChecksum = file_checksum(Path), - case read_outer_files_from_dir(TmpDir) of + case read_outer_files(TmpDir) of {ok, Files} -> do_unpack(Files, OuterChecksum, Output); {error, _} = Error -> @@ -210,23 +253,6 @@ unpack({file, Path}, Output, Config) -> end; false -> {error, {tarball, too_big}} - end; -unpack(Tarball, Output, Config) -> - case valid_size(Tarball, maps:get(tarball_max_size, Config)) of - true -> - case hex_erl_tar:extract({binary, Tarball}, [memory]) of - {ok, []} -> - {error, {tarball, empty}}; - {ok, FileList} -> - OuterChecksum = crypto:hash(sha256, Tarball), - do_unpack( - validate_outer_file_sizes(maps:from_list(FileList)), OuterChecksum, Output - ); - {error, Reason} -> - {error, {tarball, Reason}} - end; - false -> - {error, {tarball, too_big}} end. %% @doc @@ -234,7 +260,7 @@ unpack(Tarball, Output, Config) -> %% %% @see unpack/3 -spec unpack - (tarball(), memory) -> + (tarball() | {file, filename()}, memory) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -242,7 +268,14 @@ unpack(Tarball, Output, Config) -> contents => contents() }} | {error, term()}; - (tarball(), filename()) -> + (tarball() | {file, filename()}, none) -> + {ok, #{ + outer_checksum => checksum(), + inner_checksum => checksum(), + metadata => metadata() + }} + | {error, term()}; + (tarball() | {file, filename()}, filename()) -> {ok, #{ outer_checksum => checksum(), inner_checksum => checksum(), @@ -255,6 +288,10 @@ unpack(Tarball, Output) -> %% @doc %% Unpacks a documentation tarball. %% +%% The first argument is the tarball, either as a binary or `{file, Path}' +%% to read from a file on disk. Using `{file, Path}' avoids loading the +%% tarball into memory. +%% %% Examples: %% %% ``` @@ -265,21 +302,22 @@ unpack(Tarball, Output) -> %% ok %% ''' -spec unpack_docs - (tarball(), memory, hex_core:config()) -> {ok, contents()} | {error, term()}; - (tarball(), filename(), hex_core:config()) -> ok | {error, term()}. -unpack_docs(Tarball, Output, Config) -> - case valid_size(Tarball, maps:get(docs_tarball_max_size, Config)) of + (tarball() | {file, filename()}, memory, hex_core:config()) -> + {ok, contents()} | {error, term()}; + (tarball() | {file, filename()}, filename(), hex_core:config()) -> ok | {error, term()}. +unpack_docs(Input, Output, Config) -> + case check_docs_input_size(Input, Config) of true -> - unpack_tarball(Tarball, Output); + unpack_tarball(tar_source(Input), Output); false -> {error, {tarball, too_big}} end. -spec unpack_docs - (tarball(), memory) -> {ok, contents()} | {error, term()}; - (tarball(), filename()) -> ok | {error, term()}. -unpack_docs(Tarball, Output) -> - unpack_docs(Tarball, Output, hex_core:default_config()). + (tarball() | {file, filename()}, memory) -> {ok, contents()} | {error, term()}; + (tarball() | {file, filename()}, filename()) -> ok | {error, term()}. +unpack_docs(Input, Output) -> + unpack_docs(Input, Output, hex_core:default_config()). %% @doc %% Returns base16-encoded representation of checksum. @@ -344,6 +382,26 @@ inner_checksum(Version, MetadataBinary, ContentsBinary) -> checksum(ContentsBinary) when is_binary(ContentsBinary) -> crypto:hash(sha256, ContentsBinary). +%% @private +tar_source({file, Path}) -> Path; +tar_source(Tarball) -> {binary, Tarball}. + +%% @private +outer_checksum({file, Path}) -> file_checksum(Path); +outer_checksum(Tarball) -> crypto:hash(sha256, Tarball). + +%% @private +check_input_size({file, Path}, Config) -> + valid_file_size(Path, maps:get(tarball_max_size, Config)); +check_input_size(Tarball, Config) -> + valid_size(Tarball, maps:get(tarball_max_size, Config)). + +%% @private +check_docs_input_size({file, Path}, Config) -> + valid_file_size(Path, maps:get(docs_tarball_max_size, Config)); +check_docs_input_size(Tarball, Config) -> + valid_size(Tarball, maps:get(docs_tarball_max_size, Config)). + %% @private encode_metadata(Meta) -> Data = lists:map( @@ -356,8 +414,6 @@ encode_metadata(Meta) -> iolist_to_binary(Data). %% @private -do_unpack({error, _} = Error, _OuterChecksum, _Output) -> - Error; do_unpack(Files, OuterChecksum, Output) -> State = #{ inner_checksum => undefined, @@ -386,54 +442,47 @@ finish_unpack(#{ _ = maps:get("VERSION", Files), Contents = maps:get("contents.tar.gz", Files), - case {Contents, Output} of - {{path, _}, memory} -> - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; - {{path, ContentsPath}, _} -> - filelib:ensure_dir(filename:join(Output, "*")), - {ok, ContentsBinary} = file:read_file(ContentsPath), - case unpack_tarball(ContentsBinary, Output) of - ok -> - copy_metadata_config(Output, maps:get("metadata.config", Files)), - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; - {error, Reason} -> - {error, {inner_tarball, Reason}} - end; - {ContentsBinary, memory} -> - case unpack_tarball(ContentsBinary, Output) of + Result = #{ + inner_checksum => InnerChecksum, + outer_checksum => OuterChecksum, + metadata => Metadata + }, + + case Output of + none -> + {ok, Result}; + memory -> + case unpack_contents(Contents, memory) of {ok, UnpackedContents} -> - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata, - contents => UnpackedContents - }}; + {ok, Result#{contents => UnpackedContents}}; {error, Reason} -> {error, {inner_tarball, Reason}} end; - {ContentsBinary, _} -> + _ -> filelib:ensure_dir(filename:join(Output, "*")), - case unpack_tarball(ContentsBinary, Output) of + case unpack_contents(Contents, Output) of ok -> + [ + try_updating_mtime(filename:join(Output, P)) + || P <- filelib:wildcard("**", Output) + ], copy_metadata_config(Output, maps:get("metadata.config", Files)), - {ok, #{ - inner_checksum => InnerChecksum, - outer_checksum => OuterChecksum, - metadata => Metadata - }}; + {ok, Result}; {error, Reason} -> {error, {inner_tarball, Reason}} end end. +%% @private +unpack_contents({path, ContentsPath}, memory) -> + hex_erl_tar:extract(ContentsPath, [memory, compressed]); +unpack_contents({path, ContentsPath}, Output) -> + hex_erl_tar:extract(ContentsPath, [{cwd, Output}, compressed]); +unpack_contents(ContentsBinary, memory) -> + hex_erl_tar:extract({binary, ContentsBinary}, [memory, compressed]); +unpack_contents(ContentsBinary, Output) -> + hex_erl_tar:extract({binary, ContentsBinary}, [{cwd, Output}, compressed]). + %% @private copy_metadata_config(Output, MetadataBinary) -> ok = file:write_file(filename:join(Output, "hex_metadata.config"), MetadataBinary). @@ -568,11 +617,11 @@ guess_build_tools(Metadata) -> %%==================================================================== %% @private -unpack_tarball(ContentsBinary, memory) -> - hex_erl_tar:extract({binary, ContentsBinary}, [memory, compressed]); -unpack_tarball(ContentsBinary, Output) -> +unpack_tarball(Source, memory) -> + hex_erl_tar:extract(Source, [memory, compressed]); +unpack_tarball(Source, Output) -> filelib:ensure_dir(filename:join(Output, "*")), - case hex_erl_tar:extract({binary, ContentsBinary}, [{cwd, Output}, compressed]) of + case hex_erl_tar:extract(Source, [{cwd, Output}, compressed]) of ok -> [ try_updating_mtime(filename:join(Output, Path)) @@ -732,68 +781,35 @@ stream_file_hash_loop(Fd, HashState) -> end. %% @private -read_outer_files_from_dir(Dir) -> - VersionPath = filename:join(Dir, "VERSION"), - ChecksumPath = filename:join(Dir, "CHECKSUM"), - MetadataPath = filename:join(Dir, "metadata.config"), - ContentsPath = filename:join(Dir, "contents.tar.gz"), - - case - { - filelib:is_regular(VersionPath), - filelib:is_regular(ChecksumPath), - filelib:is_regular(MetadataPath), - filelib:is_regular(ContentsPath) - } - of - {true, true, true, true} -> - case check_outer_file_sizes(VersionPath, ChecksumPath, MetadataPath) of - ok -> - {ok, Version} = file:read_file(VersionPath), - {ok, Checksum} = file:read_file(ChecksumPath), - {ok, MetadataConfig} = file:read_file(MetadataPath), - {ok, #{ - "VERSION" => Version, - "CHECKSUM" => Checksum, - "metadata.config" => MetadataConfig, - "contents.tar.gz" => {path, ContentsPath} - }}; - {error, _} = Error -> - Error - end; - _ -> - Missing = lists:filtermap( - fun({Path, Name}) -> - case filelib:is_regular(Path) of - true -> false; - false -> {true, Name} - end - end, - [ - {VersionPath, "VERSION"}, - {ChecksumPath, "CHECKSUM"}, - {MetadataPath, "metadata.config"}, - {ContentsPath, "contents.tar.gz"} - ] - ), - {error, {tarball, {missing_files, Missing}}} +%% Reads outer tar files from a directory after extraction. +%% Small files (VERSION, CHECKSUM, metadata.config) are read into memory. +%% contents.tar.gz is referenced by path. +read_outer_files(Dir) -> + RequiredFiles = ["VERSION", "CHECKSUM", "metadata.config", "contents.tar.gz"], + case read_outer_files(Dir, RequiredFiles, #{}) of + {ok, Files} -> + validate_outer_file_sizes(Files); + {error, _} = Error -> + Error end. -%% @private -check_outer_file_sizes(VersionPath, ChecksumPath, MetadataPath) -> - case valid_file_size(VersionPath, ?MAX_VERSION_SIZE) of - false -> - {error, {tarball, {file_too_big, "VERSION"}}}; +read_outer_files(_Dir, [], Acc) -> + {ok, Acc}; +read_outer_files(Dir, ["contents.tar.gz" | Rest], Acc) -> + Path = filename:join(Dir, "contents.tar.gz"), + case filelib:is_regular(Path) of true -> - case valid_file_size(ChecksumPath, ?MAX_CHECKSUM_SIZE) of - false -> - {error, {tarball, {file_too_big, "CHECKSUM"}}}; - true -> - case valid_file_size(MetadataPath, ?MAX_METADATA_SIZE) of - false -> {error, {tarball, {file_too_big, "metadata.config"}}}; - true -> ok - end - end + read_outer_files(Dir, Rest, Acc#{"contents.tar.gz" => {path, Path}}); + false -> + {error, {tarball, {missing_files, ["contents.tar.gz"]}}} + end; +read_outer_files(Dir, [Name | Rest], Acc) -> + Path = filename:join(Dir, Name), + case file:read_file(Path) of + {ok, Data} -> + read_outer_files(Dir, Rest, Acc#{Name => Data}); + {error, _} -> + {error, {tarball, {missing_files, [Name]}}} end. %% @private @@ -808,7 +824,7 @@ validate_outer_file_sizes(Files) -> false -> case byte_size(maps:get("metadata.config", Files, <<>>)) > ?MAX_METADATA_SIZE of true -> {error, {tarball, {file_too_big, "metadata.config"}}}; - false -> Files + false -> {ok, Files} end end end. diff --git a/test/hex_tarball_SUITE.erl b/test/hex_tarball_SUITE.erl index 07b27bf..d003c59 100644 --- a/test/hex_tarball_SUITE.erl +++ b/test/hex_tarball_SUITE.erl @@ -22,12 +22,18 @@ all() -> too_big_to_unpack_test, docs_too_big_to_create_test, docs_too_big_to_unpack_test, + none_test, file_unpack_memory_test, + file_unpack_none_test, file_unpack_disk_test, file_unpack_too_big_test, file_unpack_oversized_inner_files_test, oversized_outer_files_test, - too_big_metadata_to_create_test + too_big_metadata_to_create_test, + streamed_extract_test, + file_unpack_docs_memory_test, + file_unpack_docs_disk_test, + file_unpack_docs_too_big_test ]. too_big_to_create_test(_Config) -> @@ -440,6 +446,26 @@ docs_test(Config) -> ok. +none_test(_Config) -> + Metadata = #{ + <<"name">> => <<"foo">>, + <<"version">> => <<"1.0.0">>, + <<"build_tool">> => <<"rebar3">> + }, + Contents = [{"src/foo.erl", <<"-module(foo).">>}], + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Unpack with none output - should return metadata and checksums but no contents + {ok, + #{ + inner_checksum := InnerChecksum, + outer_checksum := OuterChecksum, + metadata := Metadata + } = Result} = hex_tarball:unpack(Tarball, none), + false = maps:is_key(contents, Result), + ok. + file_unpack_memory_test(Config) -> BaseDir = ?config(priv_dir, Config), Metadata = #{ @@ -455,13 +481,37 @@ file_unpack_memory_test(Config) -> TarballPath = filename:join(BaseDir, "test_file_unpack.tar"), ok = file:write_file(TarballPath, Tarball), - %% Unpack from file to memory - should return metadata and checksums but no contents + %% Unpack from file to memory - should return metadata, checksums, and contents + {ok, #{ + inner_checksum := InnerChecksum, + outer_checksum := OuterChecksum, + metadata := Metadata, + contents := Contents + }} = hex_tarball:unpack({file, TarballPath}, memory), + ok. + +file_unpack_none_test(Config) -> + BaseDir = ?config(priv_dir, Config), + Metadata = #{ + <<"name">> => <<"foo">>, + <<"version">> => <<"1.0.0">>, + <<"build_tool">> => <<"rebar3">> + }, + Contents = [{"src/foo.erl", <<"-module(foo).">>}], + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Write tarball to file + TarballPath = filename:join(BaseDir, "test_file_unpack_none.tar"), + ok = file:write_file(TarballPath, Tarball), + + %% Unpack from file with none output - should return metadata and checksums but no contents {ok, #{ inner_checksum := InnerChecksum, outer_checksum := OuterChecksum, metadata := Metadata - } = Result} = hex_tarball:unpack({file, TarballPath}, memory), + } = Result} = hex_tarball:unpack({file, TarballPath}, none), false = maps:is_key(contents, Result), ok. @@ -555,6 +605,104 @@ too_big_metadata_to_create_test(_Config) -> hex_tarball:create(Metadata, Contents), ok. +%% Test that extracting to disk streams file entries in chunks +%% instead of loading them fully into memory. +streamed_extract_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Metadata = #{<<"name">> => <<"foo">>, <<"version">> => <<"1.0.0">>}, + + %% Create test files of various sizes + EmptyData = <<>>, + SmallData = <<"hello">>, + + %% A file exactly equal to the default chunk size (65536 bytes) + ChunkSize = 65536, + BoundaryData = crypto:strong_rand_bytes(ChunkSize), + + %% A file larger than the default chunk size + LargeSize = 200000, + LargeData = crypto:strong_rand_bytes(LargeSize), + + Contents = [ + {"empty", EmptyData}, + {"small", SmallData}, + {"boundary", BoundaryData}, + {"large", LargeData} + ], + + {ok, #{tarball := Tarball, inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:create(Metadata, Contents), + + %% Extract from binary to disk and verify contents + UnpackDir1 = filename:join(BaseDir, "streamed_extract_binary"), + {ok, #{inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:unpack(Tarball, UnpackDir1), + {ok, EmptyData} = file:read_file(filename:join(UnpackDir1, "empty")), + {ok, SmallData} = file:read_file(filename:join(UnpackDir1, "small")), + {ok, BoundaryData} = file:read_file(filename:join(UnpackDir1, "boundary")), + {ok, LargeData} = file:read_file(filename:join(UnpackDir1, "large")), + + %% Extract from file to disk and verify contents + TarballPath = filename:join(BaseDir, "streamed_extract.tar"), + ok = file:write_file(TarballPath, Tarball), + UnpackDir2 = filename:join(BaseDir, "streamed_extract_file"), + {ok, #{inner_checksum := InnerChecksum, outer_checksum := OuterChecksum}} = + hex_tarball:unpack({file, TarballPath}, UnpackDir2), + {ok, EmptyData} = file:read_file(filename:join(UnpackDir2, "empty")), + {ok, SmallData} = file:read_file(filename:join(UnpackDir2, "small")), + {ok, BoundaryData} = file:read_file(filename:join(UnpackDir2, "boundary")), + {ok, LargeData} = file:read_file(filename:join(UnpackDir2, "large")), + + %% Verify that memory extraction still works (not affected by streaming) + {ok, #{contents := MemContents}} = hex_tarball:unpack(Tarball, memory), + MemMap = maps:from_list(MemContents), + EmptyData = maps:get("empty", MemMap), + SmallData = maps:get("small", MemMap), + BoundaryData = maps:get("boundary", MemMap), + LargeData = maps:get("large", MemMap), + + ok. + +file_unpack_docs_memory_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + {ok, Files} = hex_tarball:unpack_docs({file, TarballPath}, memory), + + ok. + +file_unpack_docs_disk_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs_disk.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + UnpackDir = filename:join(BaseDir, "unpack_file_docs"), + ok = hex_tarball:unpack_docs({file, TarballPath}, UnpackDir), + {ok, <<"Docs">>} = file:read_file(filename:join(UnpackDir, "index.html")), + + ok. + +file_unpack_docs_too_big_test(Config) -> + BaseDir = ?config(priv_dir, Config), + + Files = [{"index.html", <<"Docs">>}], + {ok, Tarball} = hex_tarball:create_docs(Files), + TarballPath = filename:join(BaseDir, "docs_big.tar.gz"), + ok = file:write_file(TarballPath, Tarball), + + SmallConfig = maps:put(docs_tarball_max_size, 10, hex_core:default_config()), + {error, {tarball, too_big}} = hex_tarball:unpack_docs({file, TarballPath}, memory, SmallConfig), + + ok. + %%==================================================================== %% Helpers %%====================================================================