diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 2ff4992..6e0f1f1 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -2,4 +2,5 @@ 235927ce73125df31ec3a0049b067afa1f0a135b 25eefd52d023f97870d8b4a27988f8fc91c3ed73 7d8c46cce63ff1b93836b72cdad74ec796b09ced +e917dfbe72cb0cccafb38fb0694f7fe9dff4c158 diff --git a/README.md b/README.md index fa65886..6f4e1ab 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ # inputdataTools -rimport used for publishing inputdata -relink.py used to remove files from $CESMDATAROOT and replace with links. +Tools used for publishing CESM input data. -Process to publish data: -The first step is to place your datafile(s) in /glade/campaign/cesm/cesmdata/inputdata following the inputdata naming convention. -When you have tested on derecho and are ready to share the new file(s) publically: +## Process to publish data +1. Place your datafile(s) in `/glade/campaign/cesm/cesmdata/inputdata/` (`$CESMDATAROOT`) following the input data naming conventions (see below). +2. When you have tested on derecho and are ready to share the new file(s) publically, run the `rimport` script. This will ask you for a password and 2FA login before it can copy the files from this "input data" directory" to the "publication" or "staging" directory. (This authentication should be possible for any member of the `cseg` group.) +3. Once that's done, `rimport` will replace the original with a link to the copy. +4. Sometime in the next 24 hours, your file should be uploaded to the GDEX server and available for download during CESM runs. -As user cesmdata run the rimport script. This requires a 2FA login, everyone in cseg should have access to the account, you will need to -contact cislhelp and request access if you are new to the group. +Notes: +- Use `rimport --check` if you'd like to see the current status of a file, including whether it's available for download. +- The `relink.py` script was previously used for step 3 above, but that functionality is now built into `rimport`. It's still there if you want to use it by itself. -As owner of the files in /glade/campaign/cesm/cesmdata/inputdata run script relink.py, this will remove the files from /glade/campaign/cesm/cesmdata/inputdata -and replace them with links to the published data location. /glade/campaign/collections/gdex/data/d651077/cesmdata/inputdata/ - -Filenames and metadata: +## Filenames and metadata: There is a good description of metadata that should be included in inputdata files here: https://www.cesm.ucar.edu/models/cam/metadata Filenames should be descriptive and should contain the date the file was created. Other information in the filename is also useful to keep as shown in the list below. Files published in inputdata should never be overwritten. + Replacement files should be different at least by creation date. Files that come from CESM simualtions should normally follow the output naming conventions from https://www.cesm.ucar.edu/models/cesm2/naming-conventions#modelOutputFilenames -Files should be placed under the appropriate directory for the component it's used or applicable for (so under lnd/clm2 for data that applies to the CLM/CTSM land model). Subdirectories under those levels should be used to seperate data by general types as needed for that component. +Files should be placed under the appropriate directory for the component it's used or applicable for (so under `lnd/clm2/` for data that applies to the CLM/CTSM land model). Subdirectories under those levels should be used to seperate data by general types as needed for that component. Some suggestions on things to include in the filename: - Spatial resolution of the gridded data - Year (or years) for which the data was observed or applicable to - Institution or project source of the data -- Creation date in the form of _cMMDDYY.nc +- Creation date in the form of `_cMMDDYY.nc` - CESM casename that was used to create the data (also simulation date for it) (see output file naming conventions above) -- Things needed to distinquish it from other similar files in inputdata (i.e. things like number of vertical levels, land-mask, number of Plant Functional Types, etc.) +- Things needed to distinquish it from other similar input files (e.g., number of vertical levels, land mask, number of Plant Functional Types, etc.) diff --git a/relink.py b/relink.py index e9f2cc5..1863433 100644 --- a/relink.py +++ b/relink.py @@ -222,6 +222,7 @@ def replace_files_with_symlinks( for file_path in find_owned_files_scandir( item_to_process, user_uid, inputdata_root ): + logger.info("'%s':", file_path) replace_one_file_with_symlink( inputdata_root, target_dir, file_path, dry_run=dry_run ) @@ -238,7 +239,6 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run file_path (str): The path of the file to be replaced. dry_run (bool): If True, only show what would be done without making changes. """ - logger.info("'%s':", file_path) # Determine the relative path and the new link's destination relative_path = os.path.relpath(file_path, inputdata_root) @@ -270,7 +270,9 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run os.rename(link_name, link_name + ".tmp") logger.info("%sDeleted original file: %s", INDENT, link_name) except OSError as e: - logger.error("%sError deleting file %s: %s. Skipping.", INDENT, link_name, e) + logger.error( + "%sError deleting file %s: %s. Skipping relink.", INDENT, link_name, e + ) return # Create the symbolic link, handling necessary parent directories @@ -283,7 +285,10 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run except OSError as e: os.rename(link_name + ".tmp", link_name) logger.error( - "%sError creating symlink for %s: %s. Skipping.", INDENT, link_name, e + "%sError creating symlink for %s: %s. Skipping relink.", + INDENT, + link_name, + e, ) diff --git a/rimport b/rimport index 3539cd3..82c52c3 100755 --- a/rimport +++ b/rimport @@ -2,14 +2,14 @@ # TODO: Move all the Python into new file rimport.py for simpler testing. Keep rimport as a # convenience wrapper. """ -Copy files from CESM inputdata directory to a publishing directory. +Copy files from CESM inputdata directory to a publishing directory, then replace the original with a +symlink to the copy. Do `rimport --help` for more information. """ from __future__ import annotations import argparse -import logging import os import pwd import shutil @@ -19,6 +19,8 @@ from typing import Iterable, List from urllib.request import Request, urlopen from urllib.error import HTTPError +from relink import replace_one_file_with_symlink + import shared INDENT = shared.INDENT @@ -34,20 +36,13 @@ logger = shared.logger def build_parser() -> argparse.ArgumentParser: """Build and configure the argument parser for rimport. - Creates an ArgumentParser with the following options: - - Mutually exclusive required group: - --file: Import a single file (relative to inputdata directory) - --list: Import multiple files from a list file - - Optional: - --inputdata: Override the default inputdata directory - Returns: argparse.ArgumentParser: Configured parser ready to parse command-line arguments. """ parser = argparse.ArgumentParser( description=( f"Copy files from CESM inputdata directory ({DEFAULT_INPUTDATA_ROOT}) to a publishing" - " directory." + " directory, then replace the original with a symlink to the copy." ), add_help=False, # Disable automatic help to add custom -help flag ) @@ -144,10 +139,24 @@ def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]: return paths +def check_relink_worked(src: Path, dst: Path) -> None: + """Check whether relink worked + + Args: + src (Path): Source file (should have been converted to symlink) + dst (Path): Destination file (symlink target) + + Raises: + RuntimeError: If src is not a symlink pointing to dst. + """ + if not (src.is_symlink() and src.resolve() == dst): + raise RuntimeError("Error relinking during rimport") + + def stage_data( src: Path, inputdata_root: Path, staging_root: Path, check: bool = False ) -> None: - """Stage a file by mirroring its path under `staging_root`. + """Stage a file by mirroring its path under `staging_root`, then replace with symlink to staged. Destination path is computed by replacing the `inputdata_root` prefix of `src` with `staging_root`, i.e.: @@ -163,6 +172,7 @@ def stage_data( RuntimeError: If `src` is a live symlink pointing outside staging, or if `src` is outside the inputdata root, or if `src` is already under staging directory. RuntimeError: If `src` is a broken symlink. + RuntimeError: If it failed to replace `src` with a symlink to the staged file. FileNotFoundError: If `src` does not exist. Guardrails: @@ -200,20 +210,25 @@ def stage_data( dst = staging_root / rel if dst.exists(): - logger.info("%sFile is already published but NOT linked; do", INDENT) - logger.info("%srelink.py %s", 2 * INDENT, rel) - logger.info("%sto resolve.", INDENT) + logger.info("File is already published but NOT linked; linking now.") + replace_one_file_with_symlink(inputdata_root, staging_root, str(src)) print_can_file_be_downloaded(can_file_be_downloaded(rel, staging_root)) + check_relink_worked(src, dst) return if check: logger.info("%sFile is not already published", INDENT) return + # Copy file to destination dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) logger.info("%s[rimport] staged %s -> %s", INDENT, src, dst) + # Replace original with symlink to destination + replace_one_file_with_symlink(inputdata_root, staging_root, str(src)) + check_relink_worked(src, dst) + def ensure_running_as(target_user: str, argv: list[str]) -> None: """Ensure the script is running as the target user, re-executing via sudo if needed. @@ -354,7 +369,7 @@ def get_files_to_process(file: str, filelist: str, items_to_process: list): def main(argv: List[str] | None = None) -> int: """Main entry point for the rimport tool. - Copies files from the CESM inputdata directory to a staging/publishing directory, + Copies and relinks files from the CESM inputdata directory to a staging/publishing directory, preserving the directory structure. Ensures the script runs as the correct user (STAGE_OWNER) and handles both single files and file lists. @@ -370,7 +385,7 @@ def main(argv: List[str] | None = None) -> int: Exit Codes: 0: All files staged successfully. - 1: One or more files failed to stage (errors printed to stderr). + 1: One or more files failed to stage or relink (errors printed to stderr). 2: Fatal error (missing inputdata directory, missing file list, etc.). """ parser = build_parser() @@ -410,7 +425,11 @@ def main(argv: List[str] | None = None) -> int: errors += 1 logger.error("%srimport: error processing %s: %s", INDENT, p, e) - return 0 if errors == 0 else 1 + if errors: + return 1 + if not args.check: + logger.info("\nNo need to run relink.py") + return 0 if __name__ == "__main__": diff --git a/tests/relink/test_replace_files_with_symlinks.py b/tests/relink/test_replace_files_with_symlinks.py index 6fa2820..73b0244 100644 --- a/tests/relink/test_replace_files_with_symlinks.py +++ b/tests/relink/test_replace_files_with_symlinks.py @@ -12,6 +12,8 @@ from unittest.mock import patch, call import pytest +import shared + # Add parent directory to path to import relink module sys.path.insert( 0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -20,6 +22,15 @@ import relink # noqa: E402 +@pytest.fixture(autouse=True) +def configure_logging_for_tests(): + """Configure logging for all tests in this module.""" + shared.configure_logging(logging.INFO) + yield + # Cleanup + relink.logger.handlers.clear() + + @pytest.fixture(name="mock_replace_one") def fixture_mock_replace_one(): """Fixture that mocks relink.replace_one_file_with_symlink""" @@ -27,7 +38,9 @@ def fixture_mock_replace_one(): yield mock -def test_basic_file_replacement_given_dir(temp_dirs, current_user, mock_replace_one): +def test_basic_file_replacement_given_dir( + temp_dirs, current_user, mock_replace_one, caplog +): """Test basic functionality: given directory, replace owned file with symlink.""" inputdata_root, target_dir = temp_dirs username = current_user @@ -55,8 +68,13 @@ def test_basic_file_replacement_given_dir(temp_dirs, current_user, mock_replace_ dry_run=False, ) + # Verify message with filename was printed + assert f"'{source_file}':" in caplog.text -def test_basic_file_replacement_given_file(temp_dirs, current_user, mock_replace_one): + +def test_basic_file_replacement_given_file( + temp_dirs, current_user, mock_replace_one, caplog +): """Test basic functionality: given owned file, replace with symlink.""" inputdata_root, target_dir = temp_dirs username = current_user @@ -84,8 +102,11 @@ def test_basic_file_replacement_given_file(temp_dirs, current_user, mock_replace dry_run=False, ) + # Verify message with filename was printed + assert f"'{source_file}':" in caplog.text + -def test_dry_run(temp_dirs, current_user, mock_replace_one): +def test_dry_run(temp_dirs, current_user, mock_replace_one, caplog): """Test that dry_run=True is passed correctly.""" inputdata_root, target_dir = temp_dirs username = current_user @@ -117,6 +138,9 @@ def test_dry_run(temp_dirs, current_user, mock_replace_one): dry_run=True, ) + # Verify message with filename was printed + assert f"'{source_file}':" in caplog.text + def test_nested_directory_structure(temp_dirs, current_user, mock_replace_one): """Test with nested directory structures.""" @@ -175,6 +199,9 @@ def test_skip_existing_symlinks(temp_dirs, current_user, caplog, mock_replace_on # Verify replace_one_file_with_symlink() wasn't called mock_replace_one.assert_not_called() + # Verify message with filename was NOT printed + assert f"'{source_link}':" not in caplog.text + def test_missing_target_file(temp_dirs, current_user, caplog, mock_replace_one): """Test behavior when target file doesn't exist.""" @@ -224,7 +251,7 @@ def test_invalid_username(temp_dirs, caplog, mock_replace_one): mock_replace_one.assert_not_called() -def test_multiple_files(temp_dirs, current_user, mock_replace_one): +def test_multiple_files(temp_dirs, current_user, mock_replace_one, caplog): """Test with multiple files in the directory.""" inputdata_root, target_dir = temp_dirs username = current_user @@ -251,6 +278,11 @@ def test_multiple_files(temp_dirs, current_user, mock_replace_one): calls.append(call(inputdata_root, target_dir, source_file, dry_run=False)) mock_replace_one.assert_has_calls(calls, any_order=True) + # Verify message with filename was printed + for i in range(5): + source_file = os.path.join(inputdata_root, f"file_{i}.txt") + assert f"'{source_file}':" in caplog.text + def test_multiple_files_nested(temp_dirs, current_user, mock_replace_one): """Test with multiple files scattered throughout a nested directory tree.""" diff --git a/tests/relink/test_replace_one_file_with_symlink.py b/tests/relink/test_replace_one_file_with_symlink.py index 741f9f8..2a84cd8 100644 --- a/tests/relink/test_replace_one_file_with_symlink.py +++ b/tests/relink/test_replace_one_file_with_symlink.py @@ -118,7 +118,7 @@ def test_absolute_paths(temp_dirs): os.chdir(cwd) -def test_print_found_owned_file(temp_dirs, caplog): +def test_no_print_found_owned_file(temp_dirs, caplog): """Test that message with filename is printed.""" source_dir, target_dir = temp_dirs @@ -135,9 +135,8 @@ def test_print_found_owned_file(temp_dirs, caplog): with caplog.at_level(logging.INFO): relink.replace_one_file_with_symlink(source_dir, target_dir, source_file) - # Check that message was logged - assert f"'{source_file}':" in caplog.text - assert source_file in caplog.text + # Check that message was NOT logged (should happen in replace_files_with_symlinks instead) + assert f"'{source_file}':" not in caplog.text def test_print_deleted_and_created_messages(temp_dirs, caplog): diff --git a/tests/rimport/test_check_relink_worked.py b/tests/rimport/test_check_relink_worked.py new file mode 100644 index 0000000..3b28068 --- /dev/null +++ b/tests/rimport/test_check_relink_worked.py @@ -0,0 +1,64 @@ +""" +Tests for check_relink_worked() function in rimport script. +""" + +import os +import importlib.util +from importlib.machinery import SourceFileLoader + +import pytest + + +# Import rimport module from file without .py extension +rimport_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "rimport", +) +loader = SourceFileLoader("rimport", rimport_path) +spec = importlib.util.spec_from_loader("rimport", loader) +if spec is None: + raise ImportError(f"Could not create spec for rimport from {rimport_path}") +rimport = importlib.util.module_from_spec(spec) +# Don't add to sys.modules to avoid conflict with other test files +loader.exec_module(rimport) + + +def test_ok(tmp_path): + """Check that it doesn't error if src is a symlink pointing to dst""" + # Set up + dst = tmp_path / "dst.nc" + src = tmp_path / "src.nc" + src.symlink_to(dst) + + # Shouldn't error + rimport.check_relink_worked(src, dst) + + +def test_error_not_symlink(tmp_path): + """Check that it does error if src isn't a symlink""" + # Set up + dst = tmp_path / "dst.nc" + src = tmp_path / "src.nc" + + # Should error + with pytest.raises(RuntimeError) as exc_info: + rimport.check_relink_worked(src, dst) + + # Verify error message was printed + assert "Error relinking during rimport" in str(exc_info.value) + + +def test_error_symlink_but_not_to_dst(tmp_path): + """Check that it does error if src is a symlink but not pointing to dst""" + # Set up + dst = tmp_path / "dst.nc" + other = tmp_path / "other.nc" + src = tmp_path / "src.nc" + src.symlink_to(other) + + # Should error + with pytest.raises(RuntimeError) as exc_info: + rimport.check_relink_worked(src, dst) + + # Verify error message was printed + assert "Error relinking during rimport" in str(exc_info.value) diff --git a/tests/rimport/test_cmdline.py b/tests/rimport/test_cmdline.py index d209422..62edd7e 100644 --- a/tests/rimport/test_cmdline.py +++ b/tests/rimport/test_cmdline.py @@ -82,6 +82,10 @@ def test_file_option_stages_single_file( assert staged_file.exists() assert staged_file.read_text() == "test data" + # Verify file was relinked + assert test_file.is_symlink() + assert test_file.resolve() == staged_file + def test_list_option_stages_multiple_files( self, rimport_script, test_env, rimport_env ): @@ -127,6 +131,12 @@ def test_list_option_stages_multiple_files( assert (staging_root / "file1.nc").read_text() == "data1" assert (staging_root / "file2.nc").read_text() == "data2" + # Verify both files were relinked + assert file1.is_symlink() + assert file1.resolve() == (staging_root / "file1.nc") + assert file2.is_symlink() + assert file2.resolve() == (staging_root / "file2.nc") + def test_preserves_directory_structure(self, rimport_script, test_env, rimport_env): """Test that directory structure is preserved in staging.""" inputdata_root = test_env["inputdata_root"] @@ -163,6 +173,10 @@ def test_preserves_directory_structure(self, rimport_script, test_env, rimport_e assert staged_file.exists() assert staged_file.read_text() == "nested data" + # Verify file was relinked + assert nested_file.is_symlink() + assert nested_file.resolve() == staged_file + def test_error_for_nonexistent_file(self, rimport_script, test_env, rimport_env): """Test that error is reported for nonexistent file.""" inputdata_root = test_env["inputdata_root"] @@ -308,6 +322,12 @@ def test_list_with_comments_and_blanks(self, rimport_script, test_env, rimport_e assert (staging_root / "file1.nc").exists() assert (staging_root / "file2.nc").exists() + # Verify both files were relinked + assert file1.is_symlink() + assert file1.resolve() == (staging_root / "file1.nc") + assert file2.is_symlink() + assert file2.resolve() == (staging_root / "file2.nc") + def test_prints_and_exits_for_already_published_linked_file( self, rimport_script, test_env, rimport_env ): @@ -473,5 +493,8 @@ def test_check_doesnt_copy(self, rimport_script, test_env, rimport_env): staged_file = staging_root / "test.nc" assert not staged_file.exists() + # Verify file was not replaced with a symlink + assert not test_file.is_symlink() + # Verify message was printed assert "not already published" in result.stdout diff --git a/tests/rimport/test_main.py b/tests/rimport/test_main.py index b35ae2d..3451199 100644 --- a/tests/rimport/test_main.py +++ b/tests/rimport/test_main.py @@ -41,6 +41,7 @@ def test_single_file_success( mock_get_staging_root, mock_stage_data, tmp_path, + caplog, ): """Test main() logic flow when a single file stages successfully.""" # Setup @@ -59,9 +60,11 @@ def test_single_file_success( # Verify assert result == 0 mock_normalize_paths.assert_called_once_with(inputdata_root, ["test.nc"]) + check = False mock_stage_data.assert_called_once_with( - test_file, inputdata_root, staging_root, False + test_file, inputdata_root, staging_root, check ) + assert "No need to run relink.py" in caplog.text @patch.object(rimport, "stage_data") @patch.object(rimport, "get_staging_root") @@ -104,10 +107,11 @@ def test_file_list_success( inputdata_root, ["file1.nc", "file2.nc"] ) assert mock_stage_data.call_count == 2 + check = False mock_stage_data.assert_has_calls( [ - call(file1, inputdata_root, staging_root, False), - call(file2, inputdata_root, staging_root, False), + call(file1, inputdata_root, staging_root, check), + call(file2, inputdata_root, staging_root, check), ] ) @@ -153,6 +157,10 @@ def stage_data_side_effect(src, *_args, **_kwargs): assert "error processing" in captured.err assert "Test error for file2" in captured.err + # Check that message about not needing to run relink was NOT printed + assert "No need to run relink.py" not in captured.err + assert "No need to run relink.py" not in captured.out + @patch.object(rimport, "ensure_running_as") def test_nonexistent_inputdata_directory( self, _mock_ensure_running_as, tmp_path, capsys @@ -226,6 +234,7 @@ def test_check_mode_calls( mock_get_staging_root, mock_stage_data, tmp_path, + caplog, ): """Test that --check mode skips the user check but does call stage_data.""" inputdata_root = tmp_path / "inputdata" @@ -245,9 +254,12 @@ def test_check_mode_calls( # ensure_running_as should NOT be called in check mode mock_ensure_running_as.assert_not_called() # stage_data should be called with check=True + check = True mock_stage_data.assert_called_once_with( - test_file, inputdata_root, staging_root, True + test_file, inputdata_root, staging_root, check ) + # Message about relink.py should not have been printed + assert "No need to run relink.py" not in caplog.text @patch.object(rimport, "stage_data") @patch.object(rimport, "get_staging_root") @@ -367,3 +379,81 @@ def stage_data_side_effect(src, *_args, **_kwargs): captured = capsys.readouterr() # Should have 2 error messages assert captured.err.count("error processing") == 2 + + @patch.object(rimport, "replace_one_file_with_symlink") + @patch.object(rimport, "get_staging_root") + @patch.object(rimport, "normalize_paths") + @patch.object(rimport, "ensure_running_as") + def test_error_if_file_already_published_but_relink_fails( + self, + _mock_ensure_running_as, + mock_normalize_paths, + mock_get_staging_root, + _mock_replace_one_file_with_symlink, + tmp_path, + capsys, + ): + """ + Test that main() returns error code 1 if attempting to relink an already-published file + fails. + """ + inputdata_root = tmp_path / "inputdata" + inputdata_root.mkdir() + staging_root = tmp_path / "staging" + staging_root.mkdir() + + filename = "test.nc" + src = inputdata_root / filename + src.write_text("some data") + assert src.exists() + dst = staging_root / filename + dst.write_text("some data") + + mock_get_staging_root.return_value = staging_root + test_file = inputdata_root / filename + mock_normalize_paths.return_value = [test_file] + + result = rimport.main(["-inputdata", str(inputdata_root), str(src)]) + + assert result == 1 + captured = capsys.readouterr() + assert "rimport: error processing" in captured.err + assert "Error relinking during rimport" in captured.err + + @patch.object(rimport, "replace_one_file_with_symlink") + @patch.object(rimport, "get_staging_root") + @patch.object(rimport, "normalize_paths") + @patch.object(rimport, "ensure_running_as") + def test_error_if_file_newly_published_but_relink_fails( + self, + _mock_ensure_running_as, + mock_normalize_paths, + mock_get_staging_root, + _mock_replace_one_file_with_symlink, + tmp_path, + capsys, + ): + """ + Test that main() returns error code 1 if attempting to relink a newly-published file + fails. + """ + inputdata_root = tmp_path / "inputdata" + inputdata_root.mkdir() + staging_root = tmp_path / "staging" + staging_root.mkdir() + + filename = "test.nc" + src = inputdata_root / filename + src.write_text("some data") + assert src.exists() + + mock_get_staging_root.return_value = staging_root + test_file = inputdata_root / filename + mock_normalize_paths.return_value = [test_file] + + result = rimport.main(["-inputdata", str(inputdata_root), str(src)]) + + assert result == 1 + captured = capsys.readouterr() + assert "rimport: error processing" in captured.err + assert "Error relinking during rimport" in captured.err diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index 85c3bb1..fe3a957 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -68,6 +68,10 @@ def test_copies_file_to_staging(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "data content" + # Verify original was replaced with symlink + assert src.is_symlink() + assert src.resolve() == dst + def test_check_doesnt_copy(self, inputdata_root, staging_root, caplog): """Test that a file is NOT copied to the staging directory if check is True""" # Create file in inputdata root @@ -80,6 +84,7 @@ def test_check_doesnt_copy(self, inputdata_root, staging_root, caplog): # Verify file was NOT copied to staging dst = staging_root / "file.nc" assert not dst.exists() + assert not src.is_symlink() # Verify message was logged assert "not already published" in caplog.text @@ -99,6 +104,10 @@ def test_preserves_directory_structure(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "nested data" + # Verify original was replaced with symlink + assert src.is_symlink() + assert src.resolve() == dst + def test_prints_live_symlink_already_published_not_downloadable( self, inputdata_root, staging_root, caplog ): @@ -195,11 +204,15 @@ def test_prints_published_but_not_linked( # Verify the right messages were logged or not msg = "File is already published and linked" assert msg not in caplog.text - msg = "File is already published but NOT linked; do" + msg = "File is already published but NOT linked; linking" assert msg in caplog.text msg = "File is available for download" assert msg in caplog.text + # Verify the file got linked + assert inputdata.is_symlink() + assert inputdata.resolve() == staged + def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging( self, tmp_path, inputdata_root, staging_root ): @@ -301,6 +314,10 @@ def test_preserves_file_metadata(self, inputdata_root, staging_root): assert dst_stat.st_mtime == src_stat.st_mtime assert dst_stat.st_mode == src_stat.st_mode + # Verify the file got linked + assert src.is_symlink() + assert src.resolve() == dst + def test_handles_files_with_spaces(self, inputdata_root, staging_root): """Test handling files with spaces in names.""" # Create file with spaces in inputdata root @@ -315,6 +332,10 @@ def test_handles_files_with_spaces(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "data" + # Verify the file got linked + assert src.is_symlink() + assert src.resolve() == dst + def test_handles_files_with_special_characters(self, inputdata_root, staging_root): """Test handling files with special characters.""" # Create file with special chars in inputdata root @@ -328,3 +349,65 @@ def test_handles_files_with_special_characters(self, inputdata_root, staging_roo dst = staging_root / "file-name_123@test.nc" assert dst.exists() assert dst.read_text() == "data" + + # Verify the file got linked + assert src.is_symlink() + assert src.resolve() == dst + + @patch.object(rimport, "replace_one_file_with_symlink") + def test_not_already_published_errors_if_relink_fails( + self, _mock_replace_one_file_with_symlink, inputdata_root, staging_root + ): + """ + Test that it errors if, after publishing, replace_one_file_with_symlink() doesn't make the + symlink. + """ + # Create file in inputdata root + src = inputdata_root / "file.nc" + src.write_text("data content") + + assert not inputdata_root.is_symlink() + assert not src.is_symlink() + dst = staging_root / "file.nc" + assert not dst.exists() + + # Stage the file but expect failure during relink check + with pytest.raises(RuntimeError) as exc_info: + rimport.stage_data(src, inputdata_root, staging_root) + + # Verify file was copied to staging + assert dst.exists() + assert dst.read_text() == "data content" + + # Verify error message was printed + assert "Error relinking during rimport" in str(exc_info.value) + + @patch.object(rimport, "can_file_be_downloaded") + @patch.object(rimport, "replace_one_file_with_symlink") + def test_already_published_errors_if_relink_fails( + self, + _mock_replace_one_file_with_symlink, + mock_can_file_be_downloaded, + inputdata_root, + staging_root, + ): + """ + Test that it errors if, for an already-published file, replace_one_file_with_symlink() + doesn't make the symlink. + """ + # Create a real file in staging AND in inputdata + filename = "real_file.nc" + staged = staging_root / filename + staged.write_text("data") + inputdata = inputdata_root / filename + inputdata.write_text("data") + + # Mock can_file_be_downloaded to return True + mock_can_file_be_downloaded.return_value = True + + # Expect failure during relink check + with pytest.raises(RuntimeError) as exc_info: + rimport.stage_data(inputdata, inputdata_root, staging_root) + + # Verify error message was printed + assert "Error relinking during rimport" in str(exc_info.value)