Skip to content

Commit 1728663

Browse files
jmsperu and claude
authored and committed
nasbackup.sh: add timeout, cleanup trap, space check, quiesce, and error handling
- Add BACKUP_TIMEOUT (default 6h) to prevent indefinitely stuck backup jobs; aborts via domjobabort when exceeded
- Add EXIT trap with cleanup() that resumes paused VMs, removes temp dirs, and unmounts NFS on any exit (error, signal, or normal)
- Add check_free_space() pre-flight check (default 1 GB minimum)
- Add -q/--quiesce flag for optional fsfreeze/thaw via qemu-guest-agent
- Use set -eo pipefail for stricter error handling
- Fix mount_operation: proper if/then instead of broken $? after pipe
- Quote all variable expansions to prevent word splitting
- Remove manual umount/rmdir from functions (handled by trap)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 61afb4c commit 1728663

1 file changed

Lines changed: 115 additions & 15 deletions

File tree

scripts/vm/hypervisor/kvm/nasbackup.sh

Lines changed: 115 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,10 @@
1616
## specific language governing permissions and limitations
1717
## under the License.
1818

19-
set -e
19+
set -eo pipefail
2020

2121
# CloudStack B&R NAS Backup and Recovery Tool for KVM
2222

23-
# TODO: do libvirt/logging etc checks
24-
2523
### Declare variables ###
2624

2725
OP=""
@@ -31,8 +29,17 @@ NAS_ADDRESS=""
3129
MOUNT_OPTS=""
3230
BACKUP_DIR=""
3331
DISK_PATHS=""
32+
QUIESCE=""
3433
logFile="/var/log/cloudstack/agent/agent.log"
3534

35+
# Exit codes
36+
EXIT_CLEANUP_FAILED=20
37+
38+
# Backup job timeout in seconds (default: 6 hours)
39+
BACKUP_TIMEOUT=${BACKUP_TIMEOUT:-21600}
40+
# Minimum free space required on backup target in bytes (default: 1 GB)
41+
MIN_FREE_SPACE=${MIN_FREE_SPACE:-1073741824}
42+
3643
log() {
3744
[[ "$verb" -eq 1 ]] && builtin echo "$@"
3845
if [[ "$1" == "-ne" || "$1" == "-e" || "$1" == "-n" ]]; then
@@ -42,6 +49,56 @@ log() {
4249
fi
4350
}
4451

52+
cleanup() {
  #######################################
  # EXIT-trap handler: resumes a paused VM, discards a partial backup after a
  # failed run, and unmounts/removes the temporary mount point.
  # Globals:   VM, BACKUP_DIR, dest, mount_point, EXIT_CLEANUP_FAILED (read)
  # Outputs:   failure messages on stdout
  # Returns:   exits with EXIT_CLEANUP_FAILED if any cleanup step fails;
  #            otherwise returns 0 so the script's own exit status is kept
  #######################################

  # Must stay the first statement: on entry from the EXIT trap, $? still holds
  # the exit status the script is terminating with.
  local exit_code=$?
  local status=0
  local vm_state

  # Resume the VM if it was left paused, so a failed backup job (e.g. storage
  # full or I/O errors on the backup target) does not leave it paused forever.
  if [[ -n "$VM" ]]; then
    vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null || true)
    if [[ "$vm_state" == "paused" ]]; then
      log -ne "Resuming paused VM $VM during backup cleanup"
      if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then
        echo "Failed to resume VM $VM"
        status=1
      fi
    fi
  fi

  # $dest is the backup directory on the NAS, not scratch space. Remove it only
  # when the script is exiting with an error (partial backup); removing it on a
  # successful exit would delete the backup that was just written. An empty
  # BACKUP_DIR makes $dest the share root, which must never be removed.
  if [[ $exit_code -ne 0 && -n "$BACKUP_DIR" && -n "$dest" && -d "$dest" ]]; then
    rm -rf -- "$dest" || { echo "Failed to delete $dest"; status=1; }
  fi

  if [[ -n "$mount_point" && -d "$mount_point" ]]; then
    # Only unmount what is actually mounted. If the mount itself failed, umount
    # would fail too and the real error code would be replaced by
    # EXIT_CLEANUP_FAILED.
    if mountpoint -q "$mount_point"; then
      umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; }
    fi
    # Best effort: the directory is non-empty if the unmount did not happen.
    rmdir "$mount_point" 2>/dev/null || true
  fi

  if [[ $status -ne 0 ]]; then
    echo "Backup cleanup failed"
    exit $EXIT_CLEANUP_FAILED
  fi
}
83+
84+
# Trap ensures cleanup always runs on exit (error, signal, or normal exit)
85+
# This prevents orphan NFS mounts from accumulating after failed backups
86+
trap cleanup EXIT
87+
88+
check_free_space() {
  #######################################
  # Pre-flight check: abort when the backup target has less than
  # MIN_FREE_SPACE bytes available.
  # Globals:   mount_point, MIN_FREE_SPACE (read)
  # Outputs:   error message on stdout when space is insufficient
  # Returns:   0 when enough space is available or the free space cannot be
  #            determined; exits 1 when space is insufficient
  #######################################
  local avail_kb free_bytes

  # -k pins the unit to 1024-byte blocks; with -P alone POSIX df reports
  # 512-byte units. The trailing "||" keeps a failing df from aborting the
  # script under "set -eo pipefail", so the check is skipped instead.
  avail_kb=$(df -Pk "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}') || avail_kb=""

  # Some filesystems report "-" or nothing for available space; skip the check
  # rather than fail on the arithmetic below.
  if [[ ! "$avail_kb" =~ ^[0-9]+$ ]]; then
    log -ne "Warning: unable to determine free space on backup target, skipping check"
    return 0
  fi

  free_bytes=$((avail_kb * 1024))
  if [[ $free_bytes -lt $MIN_FREE_SPACE ]]; then
    echo "Insufficient free space on backup target: $((free_bytes / 1048576)) MB available, $((MIN_FREE_SPACE / 1048576)) MB required"
    exit 1
  fi
  log -ne "Backup target has $((free_bytes / 1073741824)) GB free space"
}
101+
45102
vercomp() {
46103
local IFS=.
47104
local i ver1=($1) ver2=($3)
@@ -88,6 +145,7 @@ sanity_checks() {
88145

89146
backup_running_vm() {
90147
mount_operation
148+
check_free_space
91149
mkdir -p $dest
92150

93151
name="root"
@@ -99,39 +157,69 @@ backup_running_vm() {
99157
done
100158
echo "</disks></domainbackup>" >> $dest/backup.xml
101159

160+
# Quiesce guest filesystem before backup if requested and agent is available
161+
if [[ "$QUIESCE" == "true" ]]; then
162+
if virsh -c qemu:///system qemu-agent-command $VM '{"execute":"guest-ping"}' > /dev/null 2>&1; then
163+
log -ne "Quiescing guest filesystem on $VM"
164+
virsh -c qemu:///system domfsfreeze $VM > /dev/null 2>&1 || log -ne "Warning: fsfreeze failed on $VM, proceeding without quiesce"
165+
else
166+
log -ne "Warning: qemu-guest-agent not available on $VM, skipping quiesce"
167+
fi
168+
fi
169+
102170
# Start push backup
103-
virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml > /dev/null 2>/dev/null
171+
if ! virsh -c qemu:///system backup-begin --domain $VM --backupxml $dest/backup.xml > /dev/null 2>&1; then
172+
echo "Failed to start backup for VM $VM"
173+
# Thaw filesystem if we froze it
174+
[[ "$QUIESCE" == "true" ]] && virsh -c qemu:///system domfsthaw $VM > /dev/null 2>&1 || true
175+
exit 1
176+
fi
177+
178+
# Thaw filesystem immediately after backup-begin (QEMU has its own consistent snapshot)
179+
if [[ "$QUIESCE" == "true" ]]; then
180+
virsh -c qemu:///system domfsthaw $VM > /dev/null 2>&1 || true
181+
log -ne "Thawed guest filesystem on $VM"
182+
fi
104183

105184
# Backup domain information
106185
virsh -c qemu:///system dumpxml $VM > $dest/domain-config.xml 2>/dev/null
107186
virsh -c qemu:///system dominfo $VM > $dest/dominfo.xml 2>/dev/null
108187
virsh -c qemu:///system domiflist $VM > $dest/domiflist.xml 2>/dev/null
109188
virsh -c qemu:///system domblklist $VM > $dest/domblklist.xml 2>/dev/null
110189

190+
# Wait for backup to complete with timeout
191+
local elapsed=0
111192
until virsh -c qemu:///system domjobinfo $VM --completed --keep-completed 2>/dev/null | grep "Completed" > /dev/null; do
193+
if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then
194+
echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM"
195+
virsh -c qemu:///system domjobabort $VM > /dev/null 2>&1 || true
196+
exit 1
197+
fi
112198
sleep 5
199+
elapsed=$((elapsed + 5))
113200
done
114201
rm -f $dest/backup.xml
115202
sync
116203

117204
# Print statistics
118205
virsh -c qemu:///system domjobinfo $VM --completed
119206
du -sb $dest | cut -f1
120-
121-
umount $mount_point
122-
rmdir $mount_point
123207
}
124208

125209
backup_stopped_vm() {
126210
mount_operation
211+
check_free_space
127212
mkdir -p $dest
128213

129214
IFS=","
130215

131216
name="root"
132217
for disk in $DISK_PATHS; do
133218
volUuid="${disk##*/}"
134-
qemu-img convert -O qcow2 $disk $dest/$name.$volUuid.qcow2 | tee -a "$logFile"
219+
if ! qemu-img convert -O qcow2 "$disk" "$dest/$name.$volUuid.qcow2" 2>&1 | tee -a "$logFile"; then
220+
echo "Failed to convert disk $disk"
221+
exit 1
222+
fi
135223
name="datadisk"
136224
done
137225
sync
@@ -142,20 +230,18 @@ backup_stopped_vm() {
142230
delete_backup() {
  #######################################
  # Mounts the backup store and deletes the backup directory $dest.
  # Globals:   BACKUP_DIR (read); dest is read after mount_operation runs
  # Outputs:   verbose rm output on stdout; error message when refusing
  # Returns:   exits 1 when BACKUP_DIR is empty
  #######################################

  # With an empty BACKUP_DIR, $dest is the mount root and "rm -fr" would erase
  # every backup on the store. Refuse before mounting anything.
  if [[ -z "$BACKUP_DIR" ]]; then
    echo "Backup path is empty, refusing to delete the backup store root"
    exit 1
  fi

  mount_operation

  rm -frv "$dest"
  sync
  # cleanup trap handles umount and rmdir
}
150237

151238
mount_operation() {
152239
mount_point=$(mktemp -d -t csbackup.XXXXX)
153240
dest="$mount_point/${BACKUP_DIR}"
154-
if [ ${NAS_TYPE} == "cifs" ]; then
241+
if [[ "${NAS_TYPE}" == "cifs" ]]; then
155242
MOUNT_OPTS="${MOUNT_OPTS},nobrl"
156243
fi
157-
mount -t ${NAS_TYPE} ${NAS_ADDRESS} ${mount_point} $([[ ! -z "${MOUNT_OPTS}" ]] && echo -o ${MOUNT_OPTS}) | tee -a "$logFile"
158-
if [ $? -eq 0 ]; then
244+
if mount -t "${NAS_TYPE}" "${NAS_ADDRESS}" "${mount_point}" $([[ -n "${MOUNT_OPTS}" ]] && echo "-o" "${MOUNT_OPTS}") 2>&1 | tee -a "$logFile"; then
159245
log -ne "Successfully mounted ${NAS_TYPE} store"
160246
else
161247
echo "Failed to mount ${NAS_TYPE} store"
@@ -165,7 +251,17 @@ mount_operation() {
165251

166252
usage() {
  # Print the command-line synopsis and option summary, then exit with status 1.
  # One printf call emits each argument on its own line; empty arguments
  # produce the blank separator lines.
  printf '%s\n' \
    "" \
    "Usage: $0 -o <operation> -v|--vm <domain name> -t <storage type> -s <storage address> -m <mount options> -p <backup path> -d <disks path> [-q]" \
    "" \
    "Options:" \
    " -o, --operation Operation to perform: backup, delete" \
    " -v, --vm VM domain name" \
    " -t, --type NAS type: nfs, cifs" \
    " -s, --storage NAS address (e.g. 192.168.1.1:/share)" \
    " -m, --mount Mount options" \
    " -p, --path Backup directory path on NAS" \
    " -d, --diskpaths Comma-separated disk paths (for stopped VM backup)" \
    " -q, --quiesce Quiesce guest filesystem before backup (requires qemu-guest-agent)" \
    ""
  exit 1
}
@@ -207,6 +303,10 @@ while [[ $# -gt 0 ]]; do
207303
shift
208304
shift
209305
;;
306+
-q|--quiesce)
307+
QUIESCE="true"
308+
shift
309+
;;
210310
-h|--help)
211311
usage
212312
shift

0 commit comments

Comments
 (0)