From bb4eec27a873aa2eba9f8d90ad37c9673b8368e6 Mon Sep 17 00:00:00 2001 From: Sebastian Heid <8442432+s4heid@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:13:24 +0100 Subject: [PATCH 1/2] Fix race condition causing sshd start failure during provisioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Run first-boot tasks via systemd so sshd never races with host-key regeneration. The old `rc.local` script ran after network.target, but in parallel with other regular system services, like ssh.service. Therefore, ssh.service often started (and restarted) while `/root/firstboot.sh` was deleting keys. cloud-init's set-passwords module made this worse by restarting ssh mid-run. * Replace `rc.local` with a oneshot firstboot.service (delete keys, create new keys, reconfigure sysstat) that runs Before=ssh.service and leaves the `/root/firstboot_done` file as a marker. * Add a cloud-config.service drop-in so cloud-init's config stage waits for firstboot.service. * Update walinuxagent.service to wait for firstboot.service, ensuring the SSH host keys have been regenerated before the agent starts. This guarantees sshd, cloud-init, and WALinuxAgent all start only after the first-boot tasks succeed.
--- .../stages/base_ubuntu_firstboot/apply.sh | 9 ++++---- .../base_ubuntu_firstboot/assets/etc/rc.local | 22 ------------------- .../etc/systemd/system/firstboot.service | 15 +++++++++++++ .../assets/root/firstboot.sh | 6 ----- .../stages/system_azure_init/apply.sh | 6 ++++- .../firstboot-blocker.conf | 3 +++ .../assets/etc/waagent/walinuxagent.service | 8 +++---- 7 files changed, 31 insertions(+), 38 deletions(-) delete mode 100644 stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/rc.local create mode 100644 stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service delete mode 100755 stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh create mode 100644 stemcell_builder/stages/system_azure_init/assets/etc/systemd/system/cloud-config.service.d/firstboot-blocker.conf diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh b/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh index 37e0378211..4d379948ca 100755 --- a/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh +++ b/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh @@ -5,7 +5,8 @@ set -e base_dir=$(readlink -nf $(dirname $0)/../..) source $base_dir/lib/prelude_apply.bash -cp $assets_dir/etc/rc.local $chroot/etc/rc.local -cp $assets_dir/root/firstboot.sh $chroot/root/firstboot.sh -chmod u+x "${chroot}/etc/rc.local" -chmod 0755 $chroot/root/firstboot.sh +install -D -m 0644 \ + $assets_dir/etc/systemd/system/firstboot.service \ + $chroot/etc/systemd/system/firstboot.service + +run_in_chroot $chroot "systemctl enable firstboot.service" diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/rc.local b/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/rc.local deleted file mode 100644 index dd5376a0dc..0000000000 --- a/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/rc.local +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh -e -#execute firstboot.sh only once -if [ ! 
-e /root/firstboot_done ]; then - if [ -e /root/firstboot.sh ]; then - MAX_RETRIES=5 - COUNT=0 - while [ $COUNT -lt $MAX_RETRIES ]; do - /root/firstboot.sh - if [ $? -eq 0 ]; then - touch /root/firstboot_done - break - fi - COUNT=$((COUNT+1)) - sleep 1 - done - if [ $COUNT -eq $MAX_RETRIES ]; then - echo "Max retries reached. Exiting..." - exit 1 - fi - fi -fi -exit 0 \ No newline at end of file diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service b/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service new file mode 100644 index 0000000000..6c5daf1835 --- /dev/null +++ b/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service @@ -0,0 +1,15 @@ +[Unit] +Description=Run first boot tasks +ConditionPathExists=!/root/firstboot_done +Before=ssh.service + +[Service] +Type=oneshot +ExecStartPre=/bin/sh -c '/bin/rm -f /etc/ssh/ssh_host*key*' +ExecStart=/usr/bin/ssh-keygen -A -v +ExecStartPost=/usr/sbin/dpkg-reconfigure -fnoninteractive sysstat +ExecStartPost=/usr/bin/touch /root/firstboot_done +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh b/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh deleted file mode 100755 index ef85602b9c..0000000000 --- a/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -rm -f /etc/ssh/ssh_host*key* - -dpkg-reconfigure -fnoninteractive -pcritical openssh-server -dpkg-reconfigure -fnoninteractive sysstat diff --git a/stemcell_builder/stages/system_azure_init/apply.sh b/stemcell_builder/stages/system_azure_init/apply.sh index ac0f6f9a09..764c7b9a2c 100755 --- a/stemcell_builder/stages/system_azure_init/apply.sh +++ b/stemcell_builder/stages/system_azure_init/apply.sh @@ -46,12 +46,16 @@ cat > $chroot/etc/logrotate.d/waagent < Date: Wed, 4 Mar 2026 
16:13:00 +0000 Subject: [PATCH 2/2] Invoke firstboot script in firstboot.service * Move multiple firstboot commands back into a firstboot script in case we need to extend this in the future. * Use systemd's `ConditionFirstBoot` native option to ensure the script is only run on first boot, which allows us to get rid of the file-based approach. * Order firstboot.service before first-boot-complete.target and pull that target in with Wants=, as systemd recommends for ConditionFirstBoot=yes units, so an aborted first boot re-runs the script instead of leaving the host without SSH host keys. --- .../stages/base_ssh/assets/10-ssh-firstboot-done.conf | 3 ++- stemcell_builder/stages/base_ubuntu_firstboot/apply.sh | 4 ++++ .../assets/etc/systemd/system/firstboot.service | 11 ++++++----- .../stages/base_ubuntu_firstboot/assets/root/firstboot.sh | 7 +++++++ 4 files changed, 19 insertions(+), 6 deletions(-) create mode 100755 stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh diff --git a/stemcell_builder/stages/base_ssh/assets/10-ssh-firstboot-done.conf b/stemcell_builder/stages/base_ssh/assets/10-ssh-firstboot-done.conf index 6101bf28ed..1d776524e8 100644 --- a/stemcell_builder/stages/base_ssh/assets/10-ssh-firstboot-done.conf +++ b/stemcell_builder/stages/base_ssh/assets/10-ssh-firstboot-done.conf @@ -1,2 +1,3 @@ [Unit] -ConditionPathExists=/root/firstboot_done \ No newline at end of file +Wants=firstboot.service +After=firstboot.service \ No newline at end of file diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh b/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh index 4d379948ca..63fff84e40 100755 --- a/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh +++ b/stemcell_builder/stages/base_ubuntu_firstboot/apply.sh @@ -5,6 +5,10 @@ set -e base_dir=$(readlink -nf $(dirname $0)/../..) 
source $base_dir/lib/prelude_apply.bash +install -D -m 0755 \ + $assets_dir/root/firstboot.sh \ + $chroot/root/firstboot.sh + install -D -m 0644 \ $assets_dir/etc/systemd/system/firstboot.service \ $chroot/etc/systemd/system/firstboot.service diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service b/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service index 6c5daf1835..cb2584e817 100644 --- a/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service +++ b/stemcell_builder/stages/base_ubuntu_firstboot/assets/etc/systemd/system/firstboot.service @@ -1,14 +1,15 @@ [Unit] Description=Run first boot tasks -ConditionPathExists=!/root/firstboot_done +ConditionFirstBoot=yes Before=ssh.service +# Keep first-boot-complete.target (after which systemd commits the machine ID) +# waiting on this unit, so an aborted first boot re-runs the first-boot tasks. +Before=first-boot-complete.target +Wants=first-boot-complete.target [Service] Type=oneshot -ExecStartPre=/bin/sh -c '/bin/rm -f /etc/ssh/ssh_host*key*' -ExecStart=/usr/bin/ssh-keygen -A -v -ExecStartPost=/usr/sbin/dpkg-reconfigure -fnoninteractive sysstat -ExecStartPost=/usr/bin/touch /root/firstboot_done +ExecStart=/root/firstboot.sh RemainAfterExit=yes [Install] diff --git a/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh b/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh new file mode 100755 index 0000000000..365789753b --- /dev/null +++ b/stemcell_builder/stages/base_ubuntu_firstboot/assets/root/firstboot.sh @@ -0,0 +1,7 @@ +#!/bin/sh +set -e + +rm -f /etc/ssh/ssh_host*key* +ssh-keygen -A -v + +dpkg-reconfigure -fnoninteractive sysstat