From 85aee415bb35d9daa039b3cf09aa5ec88699ae64 Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 14:05:56 -0700 Subject: [PATCH 01/13] Extend a couple rimport test string comparisons. --- tests/rimport/test_stage_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index f8f607a..34dbbff 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -82,7 +82,7 @@ def test_raises_error_for_live_symlink( src.symlink_to(real_file) # Should raise RuntimeError for live symlink - with pytest.raises(RuntimeError, match="already published"): + with pytest.raises(RuntimeError, match="File is already published"): rimport.stage_data(src, inputdata_root, staging_root) def test_raises_error_for_broken_symlink( @@ -94,7 +94,7 @@ def test_raises_error_for_broken_symlink( src.symlink_to(tmp_path / "nonexistent.nc") # Should raise RuntimeError for broken symlink - with pytest.raises(RuntimeError, match="broken symlink"): + with pytest.raises(RuntimeError, match="Source is a broken symlink"): rimport.stage_data(src, inputdata_root, staging_root) def test_raises_error_for_nonexistent_file(self, inputdata_root, staging_root): From 566c156343df48bedab4af040bd7d2a0dc03188c Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 14:09:25 -0700 Subject: [PATCH 02/13] rimport stage_data(): Small refactor. --- rimport | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rimport b/rimport index 55e8510..6350730 100755 --- a/rimport +++ b/rimport @@ -156,11 +156,12 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: * Raise if `src` is a *live* symlink ("already published"). * Raise if `src` is a broken symlink or is outside the inputdata root. """ - if src.is_symlink() and src.exists(): + if src.is_symlink(): + if not src.exists(follow_symlinks=True): + raise RuntimeError(f"Source is a broken symlink: {src}") # TODO: This should be a regular message, not an error. raise RuntimeError("File is already published.") - if src.is_symlink() and not src.exists(): - raise RuntimeError(f"Source is a broken symlink: {src}") + if not src.exists(): raise FileNotFoundError(f"source not found: {src}") From b9cedd2f2acc7de7216149d09018546105655680 Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 14:22:21 -0700 Subject: [PATCH 03/13] rimport: Don't say "already published" if target is outside staging dir. --- rimport | 17 +++++++++++++---- tests/rimport/test_stage_data.py | 28 ++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/rimport b/rimport index 6350730..b9119ad 100755 --- a/rimport +++ b/rimport @@ -147,20 +147,29 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: staging_root: Root directory where files will be staged. Raises: - RuntimeError: If `src` is a live symlink (already published), or if `src` - is outside the inputdata root, or if `src` is already under staging directory. + RuntimeError: If `src` is a live symlink (already published, or pointing outside staging), + or if `src` is outside the inputdata root, or if `src` is already under staging + directory. RuntimeError: If `src` is a broken symlink. FileNotFoundError: If `src` does not exist. Guardrails: - * Raise if `src` is a *live* symlink ("already published"). + * Raise if `src` is a *live* symlink to a file in staging root ("already published"). + * Raise if `src` is a *live* symlink to a file outside staging root ("outside staging"). * Raise if `src` is a broken symlink or is outside the inputdata root. """ if src.is_symlink(): if not src.exists(follow_symlinks=True): raise RuntimeError(f"Source is a broken symlink: {src}") + if not src.resolve().is_relative_to(staging_root.resolve()): + raise RuntimeError( + f"Source is a symlink, but target ({src.resolve()}) is outside staging directory " + f"({staging_root})" + ) # TODO: This should be a regular message, not an error. - raise RuntimeError("File is already published.") + raise RuntimeError("File is already published and linked.") + + # TODO: Check whether file is published but not relinked. if not src.exists(): raise FileNotFoundError(f"source not found: {src}") diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index 34dbbff..afac38b 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -71,18 +71,38 @@ def test_preserves_directory_structure(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "nested data" - def test_raises_error_for_live_symlink( - self, tmp_path, inputdata_root, staging_root + def test_raises_error_for_live_symlink_already_published( + self, inputdata_root, staging_root ): - """Test that staging a live symlink raises RuntimeError.""" + """ + Test that staging a live, already-published symlink raises RuntimeError with accurate + message. + """ # Create a real file and a symlink to it + real_file = staging_root / "real_file.nc" + real_file.write_text("data") + src = inputdata_root / "link.nc" + src.symlink_to(real_file) + + # Should raise RuntimeError for live symlink + with pytest.raises(RuntimeError, match="File is already published and linked"): + rimport.stage_data(src, inputdata_root, staging_root) + + def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging( + self, tmp_path, inputdata_root, staging_root + ): + """ + Test that staging a live symlink that points to somewhere other than staging directory + raises RuntimeError with accurate message. + """ + # Create a real file outside the staging directory and a symlink to it real_file = tmp_path / "real_file.nc" real_file.write_text("data") src = inputdata_root / "link.nc" src.symlink_to(real_file) # Should raise RuntimeError for live symlink - with pytest.raises(RuntimeError, match="File is already published"): + with pytest.raises(RuntimeError, match="outside staging directory"): rimport.stage_data(src, inputdata_root, staging_root) def test_raises_error_for_broken_symlink( From 545f21d969cfba18add76e0d09afe817c1a41faf Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 14:34:25 -0700 Subject: [PATCH 04/13] rimport: "already published" msg is no longer an error. --- rimport | 10 ++++------ tests/rimport/test_stage_data.py | 23 ++++++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/rimport b/rimport index b9119ad..43521a3 100755 --- a/rimport +++ b/rimport @@ -147,14 +147,12 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: staging_root: Root directory where files will be staged. Raises: - RuntimeError: If `src` is a live symlink (already published, or pointing outside staging), - or if `src` is outside the inputdata root, or if `src` is already under staging - directory. + RuntimeError: If `src` is a live symlink pointing outside staging, or if `src` is outside + the inputdata root, or if `src` is already under staging directory. RuntimeError: If `src` is a broken symlink. FileNotFoundError: If `src` does not exist. Guardrails: - * Raise if `src` is a *live* symlink to a file in staging root ("already published"). * Raise if `src` is a *live* symlink to a file outside staging root ("outside staging"). * Raise if `src` is a broken symlink or is outside the inputdata root. """ @@ -166,8 +164,8 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: f"Source is a symlink, but target ({src.resolve()}) is outside staging directory " f"({staging_root})" ) - # TODO: This should be a regular message, not an error. - raise RuntimeError("File is already published and linked.") + print("File is already published and linked.") + return # TODO: Check whether file is published but not relinked. diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index afac38b..6644e84 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -6,6 +6,7 @@ import sys import importlib.util from importlib.machinery import SourceFileLoader +from unittest.mock import patch import pytest @@ -71,23 +72,31 @@ def test_preserves_directory_structure(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "nested data" - def test_raises_error_for_live_symlink_already_published( - self, inputdata_root, staging_root + def test_prints_live_symlink_already_published( + self, inputdata_root, staging_root, capsys ): """ - Test that staging a live, already-published symlink raises RuntimeError with accurate - message. + Test that staging a live, already-published symlink prints a message and returns + immediately without copying anything. """ - # Create a real file and a symlink to it + # Create a real file in staging and a symlink to it in inputdata real_file = staging_root / "real_file.nc" real_file.write_text("data") src = inputdata_root / "link.nc" src.symlink_to(real_file) - # Should raise RuntimeError for live symlink - with pytest.raises(RuntimeError, match="File is already published and linked"): + # Mock shutil.copy2 to verify it's never called + with patch("shutil.copy2") as mock_copy: + # Should print message for live symlink and return early rimport.stage_data(src, inputdata_root, staging_root) + # Verify the message was printed + msg = "File is already published and linked" + assert msg in capsys.readouterr().out.strip() + + # Verify that shutil.copy2 was never called (function returned early) + mock_copy.assert_not_called() + def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging( self, tmp_path, inputdata_root, staging_root ): From 03064b13654333da09f6eebf7d7dadb7e05b6c3d Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 15:07:58 -0700 Subject: [PATCH 05/13] rimport: Print file and indent its messages. Makes it easier to understand output when processing lists of files. --- rimport | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rimport b/rimport index 43521a3..03ac292 100755 --- a/rimport +++ b/rimport @@ -21,6 +21,7 @@ import shared DEFAULT_INPUTDATA_ROOT = Path(shared.DEFAULT_INPUTDATA_ROOT) DEFAULT_STAGING_ROOT = Path(shared.DEFAULT_STAGING_ROOT) STAGE_OWNER = "cesmdata" +INDENT = " " def build_parser() -> argparse.ArgumentParser: @@ -164,7 +165,7 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: f"Source is a symlink, but target ({src.resolve()}) is outside staging directory " f"({staging_root})" ) - print("File is already published and linked.") + print(f"{INDENT}File is already published and linked.") return # TODO: Check whether file is published but not relinked. @@ -186,7 +187,7 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: dst = staging_root / rel dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) - print(f"[rimport] staged {src} -> {dst}") + print(f"{INDENT}[rimport] staged {src} -> {dst}") def ensure_running_as(target_user: str, argv: list[str]) -> None: @@ -298,12 +299,13 @@ def main(argv: List[str] | None = None) -> int: # Execute the new action per file errors = 0 for p in paths: + print(f"'{p}':") try: stage_data(p, root, staging_root) except Exception as e: # pylint: disable=broad-exception-caught # General Exception keeps CLI robust for batch runs errors += 1 - print(f"rimport: error processing {p}: {e}", file=sys.stderr) + print(f"{INDENT}rimport: error processing {p}: {e}", file=sys.stderr) return 0 if errors == 0 else 1 From 6151b1b14dd973f8445276eed0ddfb773f7c3b6d Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 15:23:39 -0700 Subject: [PATCH 06/13] rimport bugfix: Don't resolve symlinks before stage_data(). stage_data() has special handling for symlinks, but they were getting resolved (i.e., replaced with their targets) during resolve_paths(), so stage_data() only ever saw their targets. resolve_paths() has been renamed to normalize_paths() and no longer resolves symlinks. Tests have been added to ensure this behavior is preserved. --- rimport | 26 ++-- tests/rimport/test_cmdline.py | 127 ++++++++++++++++++ ...solve_paths.py => test_normalize_paths.py} | 35 +++-- tests/rimport/test_stage_data.py | 9 +- 4 files changed, 169 insertions(+), 28 deletions(-) rename tests/rimport/{test_resolve_paths.py => test_normalize_paths.py} (77%) diff --git a/rimport b/rimport index 03ac292..b06630e 100755 --- a/rimport +++ b/rimport @@ -109,28 +109,26 @@ def read_filelist(list_path: Path) -> List[str]: return lines -def resolve_paths(root: Path, relnames: Iterable[str]) -> List[Path]: - """Convert relative or absolute path names to resolved absolute Paths. +def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]: + """Convert relative or absolute path names to normalized absolute Paths. For each name in relnames: - - If the name is relative, it is resolved relative to `root` - - If the name is already absolute, it is resolved as-is - All paths are resolved to their canonical absolute form. + - If the name is relative, it is assumed to be relative to `root` and made absolute + All paths are then normalized to their absolute form, replacing . and .. as needed. + + Note that symlinks are NOT resolved. Args: - root: Base directory for resolving relative paths. - relnames: Iterable of path names (relative or absolute) to resolve. + root: Base directory under which relative paths are assumed to be. + relnames: Iterable of path names (relative or absolute) to normalize. Returns: - List of resolved absolute Path objects. + List of normalized absolute Path objects. """ paths: List[Path] = [] for name in relnames: - p = ( - (root / name).resolve() - if not Path(name).is_absolute() - else Path(name).resolve() - ) + p = root / name if not Path(name).is_absolute() else Path(name) + p = Path(os.path.normpath(p.absolute())) paths.append(p) return paths @@ -294,7 +292,7 @@ def main(argv: List[str] | None = None) -> int: return 2 # Resolve to full paths (keep accepting absolute names too) - paths = resolve_paths(root, relnames) + paths = normalize_paths(root, relnames) staging_root = get_staging_root() # Execute the new action per file errors = 0 diff --git a/tests/rimport/test_cmdline.py b/tests/rimport/test_cmdline.py index f5e7957..034eed8 100644 --- a/tests/rimport/test_cmdline.py +++ b/tests/rimport/test_cmdline.py @@ -307,3 +307,130 @@ def test_list_with_comments_and_blanks(self, rimport_script, test_env, rimport_e # Verify both files were staged assert (staging_root / "file1.nc").exists() assert (staging_root / "file2.nc").exists() + + def test_prints_and_exits_for_already_published_linked_file( + self, rimport_script, test_env, rimport_env + ): + """ + Test that stage_data returns early with msg if file already published/linked. Note that the + only thing this test does that the stage_data tests don't is to check that main() correctly + passes the unresolved symlink to normalize_paths. + """ + inputdata_root = test_env["inputdata_root"] + staging_root = test_env["staging_root"] + + # Create a real file in staging and a symlink to it in inputdata + real_file = staging_root / "real_file.nc" + real_file.write_text("data") + src = inputdata_root / "link.nc" + src.symlink_to(real_file) + + # Run rimport with -file option + command = [ + sys.executable, + rimport_script, + "-file", + "link.nc", + "-inputdata", + str(inputdata_root), + ] + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=False, + env=rimport_env, + ) + + # Verify success + assert result.returncode == 0, f"Command failed: {result.stderr}" + + # Verify the right message was printed + msg = "File is already published and linked" + assert msg in result.stdout + + # Verify the WRONG message was NOT printed + msg = "is already under staging directory" + assert msg not in result.stdout + + def test_error_broken_symlink(self, rimport_script, test_env, rimport_env): + """ + Test that stage_data errors with msg if file is a link w/ nonexistent target. Note that the + only thing this test does that the stage_data tests don't is to check that main() correctly + passes the unresolved symlink to stage_data. + """ + inputdata_root = test_env["inputdata_root"] + staging_root = test_env["staging_root"] + + # Create a symlink in inputdata pointing to a nonexistent file + real_file = staging_root / "real_file.nc" + src = inputdata_root / "link.nc" + src.symlink_to(real_file) + + # Run rimport + command = [ + sys.executable, + rimport_script, + "-file", + "link.nc", + "-inputdata", + str(inputdata_root), + ] + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=False, + env=rimport_env, + ) + + # Verify failure + assert result.returncode != 0, f"Command unexpectedly passed: {result.stdout}" + + # Verify the right message was printed + msg = "Source is a broken symlink" + assert msg in result.stderr + + def test_error_symlink_pointing_outside_staging( + self, rimport_script, test_env, rimport_env + ): + """ + Test that stage_data errors w/ msg if file is link w/ target outside staging. Note that the + only thing this test does that the stage_data tests don't is to check that main() correctly + passes the unresolved symlink to stage_data. + """ + inputdata_root = test_env["inputdata_root"] + tmp_path = test_env["tmp_path"] + + # Create a real file outside staging and a symlink to it in inputdata + real_file = tmp_path / "real_file.nc" + real_file.write_text("data") + src = inputdata_root / "link.nc" + src.symlink_to(real_file) + + # Run rimport + command = [ + sys.executable, + rimport_script, + "-file", + "link.nc", + "-inputdata", + str(inputdata_root), + ] + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=False, + env=rimport_env, + ) + + # Verify failure + assert result.returncode != 0, f"Command unexpectedly passed: {result.stdout}" + + # Verify the right message was printed + msg = "is outside staging directory" + assert msg in result.stderr diff --git a/tests/rimport/test_resolve_paths.py b/tests/rimport/test_normalize_paths.py similarity index 77% rename from tests/rimport/test_resolve_paths.py rename to tests/rimport/test_normalize_paths.py index 6a8a780..594569a 100644 --- a/tests/rimport/test_resolve_paths.py +++ b/tests/rimport/test_normalize_paths.py @@ -1,5 +1,5 @@ """ -Tests for resolve_paths() function in rimport script. +Tests for normalize_paths() function in rimport script. """ import os @@ -33,11 +33,11 @@ def fixture_root(tmp_path): class TestResolvePaths: - """Test suite for resolve_paths() function.""" + """Test suite for normalize_paths() function.""" def test_single_relative_path(self, root): """Test resolving a single relative path.""" - result = rimport.resolve_paths(root, ["file1.nc"]) + result = rimport.normalize_paths(root, ["file1.nc"]) assert len(result) == 1 assert result[0] == (root / "file1.nc").resolve() @@ -45,7 +45,7 @@ def test_single_relative_path(self, root): def test_multiple_relative_paths(self, root): """Test resolving multiple relative paths.""" relnames = ["file1.nc", "file2.nc", "file3.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 3 assert result[0] == (root / "file1.nc").resolve() @@ -55,7 +55,7 @@ def test_multiple_relative_paths(self, root): def test_nested_relative_paths(self, root): """Test resolving nested relative paths.""" relnames = ["dir1/file1.nc", "dir2/subdir/file2.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 2 assert result[0] == (root / "dir1" / "file1.nc").resolve() @@ -66,7 +66,7 @@ def test_absolute_path(self, tmp_path, root): abs_path = tmp_path / "other" / "file.nc" relnames = [str(abs_path)] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 1 assert result[0] == abs_path.resolve() @@ -76,7 +76,7 @@ def test_mixed_relative_and_absolute(self, tmp_path, root): abs_path = tmp_path / "other" / "file.nc" relnames = ["file1.nc", str(abs_path), "dir/file2.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 3 assert result[0] == (root / "file1.nc").resolve() @@ -85,7 +85,7 @@ def test_mixed_relative_and_absolute(self, tmp_path, root): def test_empty_list(self, root): """Test with empty list of names.""" - result = rimport.resolve_paths(root, []) + result = rimport.normalize_paths(root, []) assert len(result) == 0 assert result == [] @@ -93,7 +93,7 @@ def test_empty_list(self, root): def test_paths_with_spaces(self, root): """Test paths with spaces in names.""" relnames = ["file with spaces.nc", "dir with spaces/file.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 2 assert result[0] == (root / "file with spaces.nc").resolve() @@ -102,7 +102,7 @@ def test_paths_with_spaces(self, root): def test_paths_with_special_characters(self, root): """Test paths with special characters.""" relnames = ["file-name_123.nc", "dir@test/file.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 2 assert result[0] == (root / "file-name_123.nc").resolve() @@ -110,7 +110,7 @@ def test_paths_with_special_characters(self, root): def test_returns_path_objects(self, root): """Test that result contains Path objects.""" - result = rimport.resolve_paths(root, ["file.nc"]) + result = rimport.normalize_paths(root, ["file.nc"]) assert len(result) == 1 assert isinstance(result[0], Path) @@ -118,9 +118,20 @@ def test_returns_path_objects(self, root): def test_resolves_dot_and_dotdot(self, root): """Test that . and .. are resolved.""" relnames = ["./file1.nc", "dir/../file2.nc", "dir/./file3.nc"] - result = rimport.resolve_paths(root, relnames) + result = rimport.normalize_paths(root, relnames) assert len(result) == 3 assert result[0] == (root / "file1.nc").resolve() assert result[1] == (root / "file2.nc").resolve() assert result[2] == (root / "dir" / "file3.nc").resolve() + + def test_abs_symlink_unchanged(self, root): + """Test that an absolute path to a symlink is unchanged""" + # Create a real file in staging and a symlink to it in inputdata + real_file = root / "real_file.nc" + real_file.write_text("data") + symlink = root / "link.nc" + symlink.symlink_to(real_file) + + result = rimport.normalize_paths(root, [symlink]) + assert result[0] == symlink diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index 6644e84..301d99c 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -90,9 +90,14 @@ def test_prints_live_symlink_already_published( # Should print message for live symlink and return early rimport.stage_data(src, inputdata_root, staging_root) - # Verify the message was printed + # Verify the right message was printed + stdout = capsys.readouterr().out.strip() msg = "File is already published and linked" - assert msg in capsys.readouterr().out.strip() + assert msg in stdout + + # Verify the WRONG message was NOT printed + msg = "is already under staging directory" + assert msg not in stdout # Verify that shutil.copy2 was never called (function returned early) mock_copy.assert_not_called() From 53dadd6b144665d681257f3fc7d0188c7d08dd48 Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 15:34:29 -0700 Subject: [PATCH 07/13] rimport stage_data(): Bugfix for older Pythons. --- rimport | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rimport b/rimport index b06630e..74372bb 100755 --- a/rimport +++ b/rimport @@ -156,7 +156,7 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: * Raise if `src` is a broken symlink or is outside the inputdata root. """ if src.is_symlink(): - if not src.exists(follow_symlinks=True): + if not os.path.exists(src.resolve()): raise RuntimeError(f"Source is a broken symlink: {src}") if not src.resolve().is_relative_to(staging_root.resolve()): raise RuntimeError( From 43250d752871fe995d9512c0d9beb72f53563b55 Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 17:14:57 -0700 Subject: [PATCH 08/13] rimport: Add --check option. Resolves ESMCI/inputdataTools#2. --- rimport | 18 ++++++++++--- tests/rimport/test_build_parser.py | 13 ++++++++++ tests/rimport/test_cmdline.py | 41 ++++++++++++++++++++++++++++++ tests/rimport/test_stage_data.py | 16 ++++++++++++ 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/rimport b/rimport index 74372bb..f9eb00c 100755 --- a/rimport +++ b/rimport @@ -74,6 +74,14 @@ def build_parser() -> argparse.ArgumentParser: ), ) + parser.add_argument( + "--check", + "-check", + "-c", + action="store_true", + help="Check whether file(s) is/are already published.", + ) + # Provide -help to mirror legacy behavior (in addition to -h and --help) parser.add_argument( "-h", @@ -133,7 +141,7 @@ def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]: return paths -def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: +def stage_data(src: Path, inputdata_root: Path, staging_root: Path, check: bool = False) -> None: """Stage a file by mirroring its path under `staging_root`. Destination path is computed by replacing the `inputdata_root` prefix of `src` @@ -144,6 +152,7 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: src: Source file path to stage. inputdata_root: Root directory of the inputdata tree. staging_root: Root directory where files will be staged. + check: If True, just check whether the file is already published. Raises: RuntimeError: If `src` is a live symlink pointing outside staging, or if `src` is outside @@ -167,6 +176,9 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None: return # TODO: Check whether file is published but not relinked. + if check: + print(f"{INDENT}File is not already published") + return if not src.exists(): raise FileNotFoundError(f"source not found: {src}") @@ -270,7 +282,7 @@ def main(argv: List[str] | None = None) -> int: # Ensure we are running as the STAGE_OWNER account before touching the tree # Set env var RIMPORT_SKIP_USER_CHECK=1 if you prefer to run `sudox -u STAGE_OWNER rimport …` # explicitly (or for testing). - if os.getenv("RIMPORT_SKIP_USER_CHECK") != "1": + if not args.check and os.getenv("RIMPORT_SKIP_USER_CHECK") != "1": ensure_running_as(STAGE_OWNER, sys.argv) root = Path(args.inputdata).expanduser().resolve() @@ -299,7 +311,7 @@ def main(argv: List[str] | None = None) -> int: for p in paths: print(f"'{p}':") try: - stage_data(p, root, staging_root) + stage_data(p, root, staging_root, args.check) except Exception as e: # pylint: disable=broad-exception-caught # General Exception keeps CLI robust for batch runs errors += 1 diff --git a/tests/rimport/test_build_parser.py b/tests/rimport/test_build_parser.py index ec09c30..6213dbd 100644 --- a/tests/rimport/test_build_parser.py +++ b/tests/rimport/test_build_parser.py @@ -84,6 +84,19 @@ def test_inputdata_default(self): args = parser.parse_args(["-file", "test.txt"]) assert args.inputdata == rimport.DEFAULT_INPUTDATA_ROOT + def test_check_default(self): + """Test that --check has the correct default value.""" + parser = rimport.build_parser() + args = parser.parse_args(["-file", "test.txt"]) + assert args.check is False + + @pytest.mark.parametrize("check_flag", ["-check", "-c", "--check"]) + def test_check_arguments_accepted(self, check_flag): + """Test that all check argument flags are accepted.""" + parser = rimport.build_parser() + args = parser.parse_args(["-file", "test.txt", check_flag]) + assert args.check is True + def test_inputdata_custom(self): """Test that -inputdata can be customized.""" parser = rimport.build_parser() diff --git a/tests/rimport/test_cmdline.py b/tests/rimport/test_cmdline.py index 034eed8..d209422 100644 --- a/tests/rimport/test_cmdline.py +++ b/tests/rimport/test_cmdline.py @@ -434,3 +434,44 @@ def test_error_symlink_pointing_outside_staging( # Verify the right message was printed msg = "is outside staging directory" assert msg in result.stderr + + def test_check_doesnt_copy(self, rimport_script, test_env, rimport_env): + """Test that a file is NOT copied to the staging directory if check is True.""" + inputdata_root = test_env["inputdata_root"] + staging_root = test_env["staging_root"] + + # Create a file in inputdata + test_file = inputdata_root / "test.nc" + test_file.write_text("test data") + + # Make sure --check skips ensure_running_as() + del rimport_env["RIMPORT_SKIP_USER_CHECK"] + + # Run rimport with --check option + command = [ + sys.executable, + rimport_script, + "-file", + "test.nc", + "-inputdata", + str(inputdata_root), + "--check", + ] + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=False, + env=rimport_env, + ) + + # Verify success + assert result.returncode == 0, f"Command failed: {result.stderr}" + + # Verify file was not staged + staged_file = staging_root / "test.nc" + assert not staged_file.exists() + + # Verify message was printed + assert "not already published" in result.stdout diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index 301d99c..886d052 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -57,6 +57,22 @@ def test_copies_file_to_staging(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "data content" + def test_check_doesnt_copy(self, inputdata_root, staging_root, capsys): + """Test that a file is NOT copied to the staging directory if check is True""" + # Create file in inputdata root + src = inputdata_root / "file.nc" + src.write_text("data content") + + # Check the file + rimport.stage_data(src, inputdata_root, staging_root, check=True) + + # Verify file was NOT copied to staging + dst = staging_root / "file.nc" + assert not dst.exists() + + # Verify message was printed + assert "not already published" in capsys.readouterr().out.strip() + def test_preserves_directory_structure(self, inputdata_root, staging_root): """Test that directory structure is preserved in staging.""" # Create nested file in inputdata root From 013177220bd41a54e29b3f590a06978f1e50da46 Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Mon, 2 Feb 2026 17:56:34 -0700 Subject: [PATCH 09/13] rimport: stage_data() now checks whether file can be downloaded. --- rimport | 43 +++++++++++++- tests/rimport/test_can_file_be_downloaded.py | 61 ++++++++++++++++++++ tests/rimport/test_stage_data.py | 42 +++++++++++++- 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 tests/rimport/test_can_file_be_downloaded.py diff --git a/rimport b/rimport index f9eb00c..a741d1c 100755 --- a/rimport +++ b/rimport @@ -15,6 +15,8 @@ import shutil import sys from pathlib import Path from typing import Iterable, List +from urllib.request import Request, urlopen +from urllib.error import HTTPError import shared @@ -22,6 +24,7 @@ DEFAULT_INPUTDATA_ROOT = Path(shared.DEFAULT_INPUTDATA_ROOT) DEFAULT_STAGING_ROOT = Path(shared.DEFAULT_STAGING_ROOT) STAGE_OWNER = "cesmdata" INDENT = " " +INPUTDATA_URL = "https://osdf-data.gdex.ucar.edu/ncar/gdex/d651077/cesmdata/inputdata" def build_parser() -> argparse.ArgumentParser: @@ -141,7 +144,9 @@ def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]: return paths -def stage_data(src: Path, inputdata_root: Path, staging_root: Path, check: bool = False) -> None: +def stage_data( + src: Path, inputdata_root: Path, staging_root: Path, check: bool = False +) -> None: """Stage a file by mirroring its path under `staging_root`. Destination path is computed by replacing the `inputdata_root` prefix of `src` @@ -173,6 +178,10 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path, check: bool f"({staging_root})" ) print(f"{INDENT}File is already published and linked.") + if can_file_be_downloaded(src.resolve(), staging_root): + print(f"{INDENT}File is available for download.") + else: + print(f"{INDENT}File is not (yet) available for download.") return # TODO: Check whether file is published but not relinked. @@ -254,6 +263,38 @@ def get_staging_root() -> Path: return DEFAULT_STAGING_ROOT +def can_file_be_downloaded(file_relpath: Path, staging_root: Path, timeout: float = 10): + """Check whether a file is available for download from the CESM inputdata server. + + Sends a HEAD request to the CESM inputdata URL to verify if the file exists and is + accessible without downloading the entire file. + + Args: + file_relpath: Relative path to the file (relative to staging_root), or an absolute + path that will be made relative to staging_root. + staging_root: Root directory of the staging area, used to compute relative path + if file_relpath is absolute. + timeout: Maximum time in seconds to wait for the server response. Default is 10. + + Returns: + bool: True if the file is accessible (HTTP status 2xx or 3xx), False otherwise + (including 404, network errors, timeouts, etc.). + """ + # Get URL + if file_relpath.is_absolute(): + file_relpath = file_relpath.relative_to(staging_root) + url = os.path.join(INPUTDATA_URL, file_relpath) + + # Check whether URL can be accessed + req = Request(url, method="HEAD") + try: + with urlopen(req, timeout=timeout) as resp: + return 200 <= resp.status < 400 + except HTTPError: + # Server reached, but resource doesn't exist (404, 410, etc.) + return False + + def main(argv: List[str] | None = None) -> int: """Main entry point for the rimport tool. diff --git a/tests/rimport/test_can_file_be_downloaded.py b/tests/rimport/test_can_file_be_downloaded.py new file mode 100644 index 0000000..36fd851 --- /dev/null +++ b/tests/rimport/test_can_file_be_downloaded.py @@ -0,0 +1,61 @@ +""" +Tests for can_file_be_downloaded() function in rimport script. +""" + +import os +import sys +import importlib.util +from importlib.machinery import SourceFileLoader +from pathlib import Path + +from shared import DEFAULT_STAGING_ROOT + +# Import rimport module from file without .py extension +rimport_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "rimport", +) +loader = SourceFileLoader("rimport", rimport_path) +spec = importlib.util.spec_from_loader("rimport", loader) +if spec is None: + raise ImportError(f"Could not create spec for rimport from {rimport_path}") +rimport = importlib.util.module_from_spec(spec) +sys.modules["rimport"] = rimport +loader.exec_module(rimport) + +RELPATH_THAT_DOES_EXIST = os.path.join( + "share", "meshes", "ne3pg3_ESMFmesh_c221214_cdf5.asc" +) + + +class TestCanFileBeDownloaded: + """Test suite for can_file_be_downloaded() function.""" + + def test_existing_file_exists(self): + """Test that the file that should exist does. If not, other tests will definitely fail.""" + file_abspath = Path(os.path.join(DEFAULT_STAGING_ROOT, RELPATH_THAT_DOES_EXIST)) + assert file_abspath.exists() + + def test_true_abspath(self): + """Test that can_file_be_downloaded() is true for an existing file given absolute path""" + file_abspath = Path(os.path.join(DEFAULT_STAGING_ROOT, RELPATH_THAT_DOES_EXIST)) + assert rimport.can_file_be_downloaded( + file_abspath, + DEFAULT_STAGING_ROOT, + ) + + def test_true_relpath(self): + """Test that can_file_be_downloaded() is true for an existing file given relative path""" + file_relpath = Path(RELPATH_THAT_DOES_EXIST) + assert rimport.can_file_be_downloaded( + file_relpath, + DEFAULT_STAGING_ROOT, + ) + + def test_false_nonexistent(self): + """Test that can_file_be_downloaded() is false for a nonexistent file""" + file_relpath = Path("weurueridniduafnea/smfnigsroerij/msdif8ernnr.nc") + assert not rimport.can_file_be_downloaded( + file_relpath, + DEFAULT_STAGING_ROOT, + ) diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index 886d052..ef148c3 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -88,12 +88,12 @@ def test_preserves_directory_structure(self, inputdata_root, staging_root): assert dst.exists() assert dst.read_text() == "nested data" - def test_prints_live_symlink_already_published( + def test_prints_live_symlink_already_published_not_downloadable( self, inputdata_root, staging_root, capsys ): """ Test that staging a live, already-published symlink prints a message and returns - immediately without copying anything. + immediately without copying anything. Should say it's not available for download. """ # Create a real file in staging and a symlink to it in inputdata real_file = staging_root / "real_file.nc" @@ -106,10 +106,12 @@ def test_prints_live_symlink_already_published( # Should print message for live symlink and return early rimport.stage_data(src, inputdata_root, staging_root) - # Verify the right message was printed + # Verify the right messages were printed stdout = capsys.readouterr().out.strip() msg = "File is already published and linked" assert msg in stdout + msg = "File is not (yet) available for download" + assert msg in stdout # Verify the WRONG message was NOT printed msg = "is already under staging directory" @@ -118,6 +120,40 @@ def test_prints_live_symlink_already_published( # Verify that shutil.copy2 was never called (function returned early) mock_copy.assert_not_called() + def test_prints_live_symlink_already_published_is_downloadable( + self, inputdata_root, staging_root, capsys + ): + """ + Like test_prints_live_symlink_already_published_not_downloadable, but mocks + can_file_be_downloaded() to test "is available for download" message. + """ + # Create a real file in staging and a symlink to it in inputdata + real_file = staging_root / "real_file.nc" + real_file.write_text("data") + src = inputdata_root / "link.nc" + src.symlink_to(real_file) + + # Mock shutil.copy2 to verify it's never called + with patch("shutil.copy2") as mock_copy: + # Mock can_file_be_downloaded to return True + with patch("rimport.can_file_be_downloaded", return_value=True): + # Should print message for live symlink and return early + rimport.stage_data(src, inputdata_root, staging_root) + + # Verify that shutil.copy2 was never called (function returned early) + mock_copy.assert_not_called() + + # Verify the right messages were printed + stdout = capsys.readouterr().out.strip() + msg = "File is already published and linked" + assert msg in stdout + msg = "File is available for download" + assert msg in stdout + + # Verify the WRONG message was NOT printed + msg = "is already under staging directory" + assert msg not in stdout + def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging( self, tmp_path, inputdata_root, staging_root ): From 3f2ba82d37bf6011e1edf3cd573792dc288716de Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Wed, 4 Feb 2026 09:59:44 -0700 Subject: [PATCH 10/13] Add .coverage to .gitignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bee8a64..d8a1bb0 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ __pycache__ +.coverage From decf50a61e13d1e053943e7f7f79924a0fdd50de Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Wed, 4 Feb 2026 10:04:27 -0700 Subject: [PATCH 11/13] test_stage_data: Delete an unused arg. --- tests/rimport/test_stage_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index ef148c3..d9e3aa4 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -222,7 +222,7 @@ def test_raises_error_for_file_outside_inputdata_root_with_special_str( rimport.stage_data(src, inputdata_root, staging_root) def test_raises_error_for_already_published_file( - self, tmp_path, inputdata_root, staging_root + self, inputdata_root, staging_root ): """Test that staging an already published file raises RuntimeError.""" # Create a file in staging_root From 2d686fd3dc16d0cd5abae2f1b2638480d332ddca Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Wed, 4 Feb 2026 10:22:05 -0700 Subject: [PATCH 12/13] rimport: Print msg if file copied to staging but not relinked. --- rimport | 38 ++++++++++++++++++++++++-------- tests/rimport/test_stage_data.py | 33 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/rimport b/rimport index a741d1c..9c09239 100755 --- a/rimport +++ b/rimport @@ -178,15 +178,9 @@ def stage_data( f"({staging_root})" ) print(f"{INDENT}File is already published and linked.") - if can_file_be_downloaded(src.resolve(), staging_root): - print(f"{INDENT}File is available for download.") - else: - print(f"{INDENT}File is not (yet) available for download.") - return - - # TODO: Check whether file is published but not relinked. - if check: - print(f"{INDENT}File is not already published") + print_can_file_be_downloaded( + can_file_be_downloaded(src.resolve(), staging_root) + ) return if not src.exists(): @@ -204,6 +198,20 @@ def stage_data( ) from exc dst = staging_root / rel + + if dst.exists(): + print(f"{INDENT}File is already published but NOT linked; do") + print(f"{2*INDENT}relink.py {rel}") + print(f"{INDENT}to resolve.") + print_can_file_be_downloaded( + can_file_be_downloaded(rel, staging_root) + ) + return + + if check: + print(f"{INDENT}File is not already published") + return + dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) print(f"{INDENT}[rimport] staged {src} -> {dst}") @@ -295,6 +303,18 @@ def can_file_be_downloaded(file_relpath: Path, staging_root: Path, timeout: floa return False +def print_can_file_be_downloaded(file_can_be_downloaded: bool): + """Print a message indicating whether a file is available for download. + + Args: + file_can_be_downloaded: Boolean indicating if the file can be downloaded. + """ + if file_can_be_downloaded: + print(f"{INDENT}File is available for download.") + else: + print(f"{INDENT}File is not (yet) available for download.") + + def main(argv: List[str] | None = None) -> int: """Main entry point for the rimport tool. diff --git a/tests/rimport/test_stage_data.py b/tests/rimport/test_stage_data.py index d9e3aa4..d24e769 100644 --- a/tests/rimport/test_stage_data.py +++ b/tests/rimport/test_stage_data.py @@ -154,6 +154,39 @@ def test_prints_live_symlink_already_published_is_downloadable( msg = "is already under staging directory" assert msg not in stdout + def test_prints_published_but_not_linked( + self, inputdata_root, staging_root, capsys + ): + """ + Tests printed message for when a file has been published (copied to staging root) but not + yet linked (inputdata version replaced with symlink to staging version). + """ + # Create a real file in staging AND in inputdata + filename = "real_file.nc" + staged = staging_root / filename + staged.write_text("data") + inputdata = inputdata_root / filename + inputdata.write_text("data") + + # Mock shutil.copy2 to verify it's never called + with patch("shutil.copy2") as mock_copy: + # Mock can_file_be_downloaded to return True + with patch("rimport.can_file_be_downloaded", return_value=True): + # Should print message for live symlink and return early + rimport.stage_data(inputdata, inputdata_root, staging_root) + + # Verify that shutil.copy2 was never called (function returned early) + mock_copy.assert_not_called() + + # Verify the right messages were printed or not + stdout = capsys.readouterr().out.strip() + msg = "File is already published and linked" + assert msg not in stdout + msg = "File is already published but NOT linked; do" + assert msg in stdout + msg = "File is available for download" + assert msg in stdout + def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging( self, tmp_path, inputdata_root, staging_root ): From 92cf1a46eb16dfdfa04d7ed9b9bdf811c20a8d8b Mon Sep 17 00:00:00 2001 From: Sam Rabin Date: Wed, 4 Feb 2026 10:59:02 -0700 Subject: [PATCH 13/13] Skip test_existing_file_exists if not on Glade. --- tests/rimport/test_can_file_be_downloaded.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/rimport/test_can_file_be_downloaded.py b/tests/rimport/test_can_file_be_downloaded.py index 36fd851..30fcfa4 100644 --- a/tests/rimport/test_can_file_be_downloaded.py +++ b/tests/rimport/test_can_file_be_downloaded.py @@ -8,6 +8,8 @@ from importlib.machinery import SourceFileLoader from pathlib import Path +import pytest + from shared import DEFAULT_STAGING_ROOT # Import rimport module from file without .py extension @@ -30,7 +32,8 @@ class TestCanFileBeDownloaded: """Test suite for can_file_be_downloaded() function.""" - + + @pytest.mark.skipif(not os.path.exists("/glade"), reason="This test can only run on Glade") def test_existing_file_exists(self): """Test that the file that should exist does. If not, other tests will definitely fail.""" file_abspath = Path(os.path.join(DEFAULT_STAGING_ROOT, RELPATH_THAT_DOES_EXIST))