Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__pycache__
.coverage
135 changes: 108 additions & 27 deletions rimport
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@ import shutil
import sys
from pathlib import Path
from typing import Iterable, List
from urllib.request import Request, urlopen
from urllib.error import HTTPError

import shared

DEFAULT_INPUTDATA_ROOT = Path(shared.DEFAULT_INPUTDATA_ROOT)
DEFAULT_STAGING_ROOT = Path(shared.DEFAULT_STAGING_ROOT)
STAGE_OWNER = "cesmdata"
INDENT = " "
INPUTDATA_URL = "https://osdf-data.gdex.ucar.edu/ncar/gdex/d651077/cesmdata/inputdata"


def build_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -73,6 +77,14 @@ def build_parser() -> argparse.ArgumentParser:
),
)

parser.add_argument(
"--check",
"-check",
"-c",
action="store_true",
help="Check whether file(s) is/are already published.",
)

# Provide -help to mirror legacy behavior (in addition to -h and --help)
parser.add_argument(
"-h",
Expand Down Expand Up @@ -108,33 +120,33 @@ def read_filelist(list_path: Path) -> List[str]:
return lines


def resolve_paths(root: Path, relnames: Iterable[str]) -> List[Path]:
"""Convert relative or absolute path names to resolved absolute Paths.
def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]:
"""Convert relative or absolute path names to normalized absolute Paths.

For each name in relnames:
- If the name is relative, it is resolved relative to `root`
- If the name is already absolute, it is resolved as-is
All paths are resolved to their canonical absolute form.
- If the name is relative, it is assumed to be relative to `root` and made absolute
All paths are then normalized to their absolute form, replacing . and .. as needed.

Note that symlinks are NOT resolved.

Args:
root: Base directory for resolving relative paths.
relnames: Iterable of path names (relative or absolute) to resolve.
root: Base directory under which relative paths are assumed to be.
relnames: Iterable of path names (relative or absolute) to normalize.

Returns:
List of resolved absolute Path objects.
List of normalized absolute Path objects.
"""
paths: List[Path] = []
for name in relnames:
p = (
(root / name).resolve()
if not Path(name).is_absolute()
else Path(name).resolve()
)
p = root / name if not Path(name).is_absolute() else Path(name)
p = Path(os.path.normpath(p.absolute()))
paths.append(p)
return paths


def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None:
def stage_data(
src: Path, inputdata_root: Path, staging_root: Path, check: bool = False
) -> None:
"""Stage a file by mirroring its path under `staging_root`.

Destination path is computed by replacing the `inputdata_root` prefix of `src`
Expand All @@ -145,22 +157,32 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None:
src: Source file path to stage.
inputdata_root: Root directory of the inputdata tree.
staging_root: Root directory where files will be staged.
check: If True, just check whether the file is already published.

Raises:
RuntimeError: If `src` is a live symlink (already published), or if `src`
is outside the inputdata root, or if `src` is already under staging directory.
RuntimeError: If `src` is a live symlink pointing outside staging, or if `src` is outside
the inputdata root, or if `src` is already under staging directory.
RuntimeError: If `src` is a broken symlink.
FileNotFoundError: If `src` does not exist.

Guardrails:
* Raise if `src` is a *live* symlink ("already published").
* Raise if `src` is a *live* symlink to a file outside staging root ("outside staging").
* Raise if `src` is a broken symlink or is outside the inputdata root.
"""
if src.is_symlink() and src.exists():
# TODO: This should be a regular message, not an error.
raise RuntimeError("File is already published.")
if src.is_symlink() and not src.exists():
raise RuntimeError(f"Source is a broken symlink: {src}")
if src.is_symlink():
if not os.path.exists(src.resolve()):
raise RuntimeError(f"Source is a broken symlink: {src}")
if not src.resolve().is_relative_to(staging_root.resolve()):
raise RuntimeError(
f"Source is a symlink, but target ({src.resolve()}) is outside staging directory "
f"({staging_root})"
)
print(f"{INDENT}File is already published and linked.")
print_can_file_be_downloaded(
can_file_be_downloaded(src.resolve(), staging_root)
)
return

if not src.exists():
raise FileNotFoundError(f"source not found: {src}")

Expand All @@ -176,9 +198,23 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path) -> None:
) from exc

dst = staging_root / rel

if dst.exists():
print(f"{INDENT}File is already published but NOT linked; do")
print(f"{2*INDENT}relink.py {rel}")
print(f"{INDENT}to resolve.")
print_can_file_be_downloaded(
can_file_be_downloaded(rel, staging_root)
)
return

if check:
print(f"{INDENT}File is not already published")
return

dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)
print(f"[rimport] staged {src} -> {dst}")
print(f"{INDENT}[rimport] staged {src} -> {dst}")


def ensure_running_as(target_user: str, argv: list[str]) -> None:
Expand Down Expand Up @@ -235,6 +271,50 @@ def get_staging_root() -> Path:
return DEFAULT_STAGING_ROOT


def can_file_be_downloaded(
    file_relpath: Path, staging_root: Path, timeout: float = 10
) -> bool:
    """Check whether a file is available for download from the CESM inputdata server.

    Sends a HEAD request to the CESM inputdata URL to verify that the file exists and
    is accessible, without downloading the file itself.

    Args:
        file_relpath: Path to the file relative to staging_root, or an absolute path
            located under staging_root (it is then made relative to staging_root).
        staging_root: Root directory of the staging area, used to compute the relative
            path if file_relpath is absolute.
        timeout: Maximum time in seconds to wait for the server response. Default is 10.

    Returns:
        bool: True if the server answers with an HTTP status in [200, 400); False if the
            server reports an error status (404, 410, ...) or cannot be reached at all
            (DNS failure, refused connection, TLS error, timeout, ...).

    Raises:
        ValueError: If file_relpath is absolute but is not located under staging_root
            (neither as given nor after resolving symlinks in staging_root).
    """
    # Imported locally so this function does not depend on an extra file-level import.
    from urllib.parse import quote

    # Get URL
    if file_relpath.is_absolute():
        try:
            file_relpath = file_relpath.relative_to(staging_root)
        except ValueError:
            # Callers may pass an already-resolved file path together with an
            # unresolved staging root; retry against the resolved root before giving up.
            file_relpath = file_relpath.relative_to(Path(staging_root).resolve())
    # URLs always use "/" as separator, whatever the local OS uses, and characters such
    # as spaces must be percent-encoded ("/" is left untouched by quote()).
    url = f"{INPUTDATA_URL.rstrip('/')}/{quote(file_relpath.as_posix())}"

    # Check whether URL can be accessed
    req = Request(url, method="HEAD")
    try:
        with urlopen(req, timeout=timeout) as resp:
            return 200 <= resp.status < 400
    except HTTPError:
        # Server reached, but resource doesn't exist (404, 410, etc.)
        return False
    except OSError:
        # Server not reached: URLError, timeouts and TLS errors all derive from OSError.
        return False


def print_can_file_be_downloaded(file_can_be_downloaded: bool):
    """Report on stdout whether a file can be downloaded.

    Prints one indented line: a positive message when the flag is truthy, otherwise a
    "not (yet) available" message.

    Args:
        file_can_be_downloaded: Whether the file was found to be downloadable.
    """
    # Handle the negative case first so the positive message is the fall-through.
    if not file_can_be_downloaded:
        print(f"{INDENT}File is not (yet) available for download.")
        return
    print(f"{INDENT}File is available for download.")


def main(argv: List[str] | None = None) -> int:
"""Main entry point for the rimport tool.

Expand Down Expand Up @@ -263,7 +343,7 @@ def main(argv: List[str] | None = None) -> int:
# Ensure we are running as the STAGE_OWNER account before touching the tree
# Set env var RIMPORT_SKIP_USER_CHECK=1 if you prefer to run `sudox -u STAGE_OWNER rimport …`
# explicitly (or for testing).
if os.getenv("RIMPORT_SKIP_USER_CHECK") != "1":
if not args.check and os.getenv("RIMPORT_SKIP_USER_CHECK") != "1":
ensure_running_as(STAGE_OWNER, sys.argv)

root = Path(args.inputdata).expanduser().resolve()
Expand All @@ -285,17 +365,18 @@ def main(argv: List[str] | None = None) -> int:
return 2

# Normalize to absolute paths (absolute names are accepted too; symlinks are not resolved)
paths = resolve_paths(root, relnames)
paths = normalize_paths(root, relnames)
staging_root = get_staging_root()
# Execute the new action per file
errors = 0
for p in paths:
print(f"'{p}':")
try:
stage_data(p, root, staging_root)
stage_data(p, root, staging_root, args.check)
except Exception as e: # pylint: disable=broad-exception-caught
# General Exception keeps CLI robust for batch runs
errors += 1
print(f"rimport: error processing {p}: {e}", file=sys.stderr)
print(f"{INDENT}rimport: error processing {p}: {e}", file=sys.stderr)

return 0 if errors == 0 else 1

Expand Down
13 changes: 13 additions & 0 deletions tests/rimport/test_build_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,19 @@ def test_inputdata_default(self):
args = parser.parse_args(["-file", "test.txt"])
assert args.inputdata == rimport.DEFAULT_INPUTDATA_ROOT

def test_check_default(self):
    """Without any check flag on the command line, args.check must be False."""
    parsed = rimport.build_parser().parse_args(["-file", "test.txt"])
    assert parsed.check is False

@pytest.mark.parametrize("check_flag", ["-check", "-c", "--check"])
def test_check_arguments_accepted(self, check_flag):
    """Every spelling of the check flag must be accepted and set args.check to True."""
    command_line = ["-file", "test.txt", check_flag]
    parsed = rimport.build_parser().parse_args(command_line)
    assert parsed.check is True

def test_inputdata_custom(self):
"""Test that -inputdata can be customized."""
parser = rimport.build_parser()
Expand Down
64 changes: 64 additions & 0 deletions tests/rimport/test_can_file_be_downloaded.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Tests for can_file_be_downloaded() function in rimport script.
"""

import os
import sys
import importlib.util
from importlib.machinery import SourceFileLoader
from pathlib import Path

import pytest

from shared import DEFAULT_STAGING_ROOT

# The rimport script has no .py extension, so a plain `import rimport` cannot find it.
# Load it explicitly from its path, three directory levels above this test file.
_this_file = os.path.abspath(__file__)
_repo_root = os.path.dirname(os.path.dirname(os.path.dirname(_this_file)))
rimport_path = os.path.join(_repo_root, "rimport")

loader = SourceFileLoader("rimport", rimport_path)
spec = importlib.util.spec_from_loader("rimport", loader)
if spec is None:
    raise ImportError(f"Could not create spec for rimport from {rimport_path}")

# Register the module under its name before executing it.
rimport = importlib.util.module_from_spec(spec)
sys.modules["rimport"] = rimport
loader.exec_module(rimport)

# Path, relative to the staging root, of the file used by the positive tests below.
RELPATH_THAT_DOES_EXIST = os.path.join(
    "share",
    "meshes",
    "ne3pg3_ESMFmesh_c221214_cdf5.asc",
)


class TestCanFileBeDownloaded:
    """Tests for rimport.can_file_be_downloaded()."""

    @pytest.mark.skipif(not os.path.exists("/glade"), reason="This test can only run on Glade")
    def test_existing_file_exists(self):
        """Sanity check: the reference file exists under the default staging root.

        If this fails, the positive download tests below are expected to fail too.
        """
        reference_file = Path(DEFAULT_STAGING_ROOT) / RELPATH_THAT_DOES_EXIST
        assert reference_file.exists()

    def test_true_abspath(self):
        """An existing file given as an absolute path is reported as downloadable."""
        absolute = Path(DEFAULT_STAGING_ROOT) / RELPATH_THAT_DOES_EXIST
        result = rimport.can_file_be_downloaded(absolute, DEFAULT_STAGING_ROOT)
        assert result

    def test_true_relpath(self):
        """An existing file given as a relative path is reported as downloadable."""
        relative = Path(RELPATH_THAT_DOES_EXIST)
        result = rimport.can_file_be_downloaded(relative, DEFAULT_STAGING_ROOT)
        assert result

    def test_false_nonexistent(self):
        """A path that does not exist on the server is reported as not downloadable."""
        missing = Path("weurueridniduafnea/smfnigsroerij/msdif8ernnr.nc")
        result = rimport.can_file_be_downloaded(missing, DEFAULT_STAGING_ROOT)
        assert not result
Loading