Skip to content

Commit 7e01fdf

Browse files
fix: Catch OSError with exponential backoff (#450)
* fix: Catch OSError with exponential backoff Signed-off-by: oliver könig <okoenig@nvidia.com> * Potential fix for code scanning alert no. 534: Illegal raise Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: oliver könig <okoenig@nvidia.com> --------- Signed-off-by: oliver könig <okoenig@nvidia.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent c87f6cc commit 7e01fdf

2 files changed

Lines changed: 32 additions & 7 deletions

File tree

nemo_run/run/torchx_backend/schedulers/slurm.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,11 @@ def _cancel_existing(self, app_id: str) -> None:
228228
self.tunnel.run(f"scancel {app_id}", hide=False)
229229

230230
def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
231-
job_dirs = _get_job_dirs()
231+
try:
232+
job_dirs = _get_job_dirs()
233+
except OSError as e:
234+
log.warning(f"Could not read job dirs file, returning UNKNOWN status for {app_id}: {e}")
235+
return DescribeAppResponse(app_id=app_id, state=AppState.UNKNOWN)
232236
if app_id in job_dirs:
233237
_, tunnel_cfg, _ = job_dirs[app_id]
234238
self._initialize_tunnel(tunnel_cfg)
@@ -426,7 +430,7 @@ def _save_job_dir(
426430

427431
def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTunnel, str]]:
428432
last_exc: OSError | None = None
429-
for _ in range(retries):
433+
for attempt in range(retries):
430434
try:
431435
with open(SLURM_JOB_DIRS, "rt") as f:
432436
lines = f.readlines()
@@ -435,12 +439,17 @@ def _get_job_dirs(retries: int = 5) -> dict[str, tuple[str, SSHTunnel | LocalTun
435439
return {}
436440
except OSError as e:
437441
last_exc = e
438-
time.sleep(1)
442+
delay = min(2**attempt, 30)
443+
log.warning(
444+
f"OSError reading {SLURM_JOB_DIRS} (attempt {attempt + 1}/{retries}): {e}. "
445+
f"Retrying in {delay}s..."
446+
)
447+
time.sleep(delay)
439448
else:
440449
if last_exc is not None:
441450
raise last_exc
442-
raise OSError(
443-
f"Failed to read SLURM job dirs from {SLURM_JOB_DIRS} after {retries} retries"
451+
raise RuntimeError(
452+
f"Failed to read {SLURM_JOB_DIRS} after {retries} retries, but no OSError was captured."
444453
)
445454

446455
out = {}

test/run/torchx_backend/schedulers/test_slurm.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -340,8 +340,8 @@ def test_get_job_dirs():
340340

341341

342342
def test_get_job_dirs_retries_on_permission_error(tmp_path, mocker):
343-
"""Transient PermissionError should be retried; success on 3rd attempt returns data."""
344-
mocker.patch("time.sleep") # don't actually sleep in tests
343+
"""Transient PermissionError should be retried with exponential backoff; success on 3rd attempt returns data."""
344+
mock_sleep = mocker.patch("time.sleep")
345345
mock_open = mocker.mock_open(read_data="")
346346
mock_open.side_effect = [
347347
PermissionError("[Errno 1] Operation not permitted"),
@@ -353,6 +353,8 @@ def test_get_job_dirs_retries_on_permission_error(tmp_path, mocker):
353353
result = _get_job_dirs(retries=5)
354354
assert result == {}
355355
assert mock_open.call_count == 3
356+
# Exponential backoff: attempt 0 -> sleep(1), attempt 1 -> sleep(2)
357+
assert mock_sleep.call_args_list == [mock.call(1), mock.call(2)]
356358

357359

358360
def test_get_job_dirs_raises_after_exhausting_retries(mocker):
@@ -364,6 +366,20 @@ def test_get_job_dirs_raises_after_exhausting_retries(mocker):
364366
_get_job_dirs(retries=3)
365367

366368

369+
def test_describe_returns_unknown_on_persistent_permission_error(slurm_scheduler, mocker):
370+
"""describe() should return UNKNOWN state when _get_job_dirs() raises OSError, not propagate."""
371+
from torchx.specs import AppState
372+
373+
mocker.patch(
374+
"nemo_run.run.torchx_backend.schedulers.slurm._get_job_dirs",
375+
side_effect=PermissionError("[Errno 1] Operation not permitted"),
376+
)
377+
378+
result = slurm_scheduler.describe("12345")
379+
assert result is not None
380+
assert result.state == AppState.UNKNOWN
381+
382+
367383
def test_schedule_with_dependencies(slurm_scheduler, slurm_executor):
368384
mock_request = mock.MagicMock()
369385
mock_request.cmd = ["sbatch", "--requeue", "--parsable"]

0 commit comments

Comments
 (0)