Skip to content

Commit ac93cca

Browse files
committed
fix(gpu-test): resolve SSH timing issues in Vast.ai test infrastructure
1 parent 1a8bcc5 commit ac93cca

1 file changed

Lines changed: 27 additions & 1 deletion

File tree

gpu_test/conftest.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ def _wait_for_ssh(self) -> None:
181181
self.ssh_host,
182182
self.ssh_port,
183183
)
184-
# Upload and compile the C++ runner
184+
# Wait for sshd to actually accept connections, then compile runner
185+
self._wait_for_sshd()
185186
self._compile_runner()
186187
return
187188

@@ -190,6 +191,25 @@ def _wait_for_ssh(self) -> None:
190191
msg = f"Instance {self.instance_id} did not become SSH-ready within {POLL_TIMEOUT_S}s"
191192
raise TimeoutError(msg)
192193

194+
def _wait_for_sshd(self) -> None:
    """Poll until sshd is actually accepting connections.

    Repeatedly runs a no-op ``true`` command over SSH (using the command
    prefix from ``self._ssh_cmd()``) until one probe exits with status 0.
    A probe that exits non-zero or that exceeds the per-probe subprocess
    timeout is treated as "not ready yet" and retried after
    ``POLL_INTERVAL_S`` seconds.

    Raises:
        TimeoutError: If no probe succeeds within ``POLL_TIMEOUT_S`` seconds.
    """
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        try:
            result = subprocess.run(
                [*self._ssh_cmd(), "true"],
                capture_output=True,
                timeout=15,
                check=False,
            )
        except subprocess.TimeoutExpired:
            # The ssh client may legitimately run longer than the 15s
            # subprocess timeout while it retries the connection
            # (ConnectTimeout x ConnectionAttempts). A killed probe must
            # not abort the wait; it only means sshd is not ready yet.
            logger.debug(
                "sshd probe timed out on %s:%s; retrying",
                self.ssh_host,
                self.ssh_port,
            )
        else:
            if result.returncode == 0:
                logger.info("sshd ready on %s:%s", self.ssh_host, self.ssh_port)
                return
        time.sleep(POLL_INTERVAL_S)
    msg = (
        f"sshd on {self.ssh_host}:{self.ssh_port} did not become ready within {POLL_TIMEOUT_S}s"
    )
    raise TimeoutError(msg)
212+
193213
def _compile_runner(self) -> None:
194214
"""Upload warpforth-runner.cpp and compile it on the remote host."""
195215
self.scp_upload(RUNNER_SRC, f"{REMOTE_TMP}/warpforth-runner.cpp")
@@ -241,6 +261,8 @@ def _ssh_cmd(self) -> list[str]:
241261
"-o",
242262
"ConnectTimeout=10",
243263
"-o",
264+
"ConnectionAttempts=3",
265+
"-o",
244266
"LogLevel=ERROR",
245267
"-p",
246268
str(self.ssh_port),
@@ -271,6 +293,10 @@ def scp_upload(self, local_path: str | Path, remote_path: str) -> None:
271293
"-o",
272294
"UserKnownHostsFile=/dev/null",
273295
"-o",
296+
"ConnectTimeout=10",
297+
"-o",
298+
"ConnectionAttempts=3",
299+
"-o",
274300
"LogLevel=ERROR",
275301
"-P",
276302
str(self.ssh_port),

0 commit comments

Comments
 (0)