@@ -181,7 +181,8 @@ def _wait_for_ssh(self) -> None:
181181 self .ssh_host ,
182182 self .ssh_port ,
183183 )
184- # Upload and compile the C++ runner
184+ # Wait for sshd to actually accept connections, then compile runner
185+ self ._wait_for_sshd ()
185186 self ._compile_runner ()
186187 return
187188
@@ -190,6 +191,25 @@ def _wait_for_ssh(self) -> None:
190191 msg = f"Instance { self .instance_id } did not become SSH-ready within { POLL_TIMEOUT_S } s"
191192 raise TimeoutError (msg )
192193
def _wait_for_sshd(self) -> None:
    """Poll until sshd is actually accepting connections.

    Repeatedly runs a no-op ``true`` command over SSH, using the argument
    list built by ``self._ssh_cmd()``, until one probe exits with status 0.
    Failed probes are retried every ``POLL_INTERVAL_S`` seconds.

    Raises:
        TimeoutError: If no probe succeeds within ``POLL_TIMEOUT_S`` seconds.
    """
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        try:
            result = subprocess.run(
                [*self._ssh_cmd(), "true"],
                capture_output=True,
                timeout=15,
                check=False,
            )
        except subprocess.TimeoutExpired:
            # A probe that hangs past the hard 15 s limit is just another
            # failed attempt: keep polling instead of letting
            # TimeoutExpired escape and abort the wait early.
            logger.debug(
                "sshd probe on %s:%s timed out; retrying",
                self.ssh_host,
                self.ssh_port,
            )
        else:
            if result.returncode == 0:
                logger.info("sshd ready on %s:%s", self.ssh_host, self.ssh_port)
                return
        time.sleep(POLL_INTERVAL_S)
    msg = (
        f"sshd on {self.ssh_host}:{self.ssh_port} did not become ready within {POLL_TIMEOUT_S}s"
    )
    raise TimeoutError(msg)
212+
193213 def _compile_runner (self ) -> None :
194214 """Upload warpforth-runner.cpp and compile it on the remote host."""
195215 self .scp_upload (RUNNER_SRC , f"{ REMOTE_TMP } /warpforth-runner.cpp" )
@@ -241,6 +261,8 @@ def _ssh_cmd(self) -> list[str]:
241261 "-o" ,
242262 "ConnectTimeout=10" ,
243263 "-o" ,
264+ "ConnectionAttempts=3" ,
265+ "-o" ,
244266 "LogLevel=ERROR" ,
245267 "-p" ,
246268 str (self .ssh_port ),
@@ -271,6 +293,10 @@ def scp_upload(self, local_path: str | Path, remote_path: str) -> None:
271293 "-o" ,
272294 "UserKnownHostsFile=/dev/null" ,
273295 "-o" ,
296+ "ConnectTimeout=10" ,
297+ "-o" ,
298+ "ConnectionAttempts=3" ,
299+ "-o" ,
274300 "LogLevel=ERROR" ,
275301 "-P" ,
276302 str (self .ssh_port ),
0 commit comments