From ad5aabb28d6646b212e1709fa5a0248f66b0bd53 Mon Sep 17 00:00:00 2001 From: RajeshKumar11 Date: Tue, 24 Feb 2026 09:08:34 +0530 Subject: [PATCH 1/2] src: fix deadlock in NodePlatform::DrainTasks on process shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the unconditional BlockingDrain() in DrainTasks() with a 1ms timed drain loop that periodically flushes foreground tasks while waiting for outstanding kUserBlocking worker tasks to complete. This fixes a deadlock where a kUserBlocking worker task (e.g. Maglev JIT compilation) posts a foreground task and waits for it. If the foreground task is posted after the main thread flushes the foreground queue but before BlockingDrain() enters its indefinite sleep, both threads wait on each other — a mutual deadlock. With the timed drain, the main thread wakes every 1ms and flushes any foreground tasks that were posted in that window. In the common case, tasks complete before the timeout and the wait is woken immediately by NotifyOfOutstandingCompletion(), so there is no performance regression. Adds ConditionVariableBase::TimedWait() backed by uv_cond_timedwait() and TaskQueue::Locked::TimedBlockingDrain() to support this. Fixes: https://github.com/nodejs/node/issues/54918 --- src/node_mutex.h | 15 +++++++++++ src/node_platform.cc | 60 +++++++++++++++++++++++++++++--------------- src/node_platform.h | 6 +++++ 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/src/node_mutex.h b/src/node_mutex.h index 69e64ba27c3106..199df0250001e5 100644 --- a/src/node_mutex.h +++ b/src/node_mutex.h @@ -139,6 +139,9 @@ class ConditionVariableBase { inline void Broadcast(const ScopedLock&); inline void Signal(const ScopedLock&); inline void Wait(const ScopedLock& scoped_lock); + // Returns true if signaled before timeout, false if timed out. + // timeout_ns is in nanoseconds. + inline bool TimedWait(const ScopedLock& scoped_lock, uint64_t timeout_ns); ConditionVariableBase(const ConditionVariableBase&) = delete; ConditionVariableBase& operator=(const ConditionVariableBase&) = delete; @@ -175,6 +178,11 @@ struct LibuvMutexTraits { uv_cond_wait(cond, mutex); } + static inline int cond_timedwait(CondT* cond, MutexT* mutex, + uint64_t timeout) { + return uv_cond_timedwait(cond, mutex, timeout); + } + static inline void mutex_destroy(MutexT* mutex) { uv_mutex_destroy(mutex); } @@ -249,6 +257,13 @@ void ConditionVariableBase::Wait(const ScopedLock& scoped_lock) { Traits::cond_wait(&cond_, &scoped_lock.mutex_.mutex_); } +template +bool ConditionVariableBase::TimedWait(const ScopedLock& scoped_lock, + uint64_t timeout_ns) { + return Traits::cond_timedwait( + &cond_, &scoped_lock.mutex_.mutex_, timeout_ns) == 0; +} + template MutexBase::MutexBase() { CHECK_EQ(0, Traits::mutex_init(&mutex_)); diff --git a/src/node_platform.cc b/src/node_platform.cc index 197102068b74f4..7ce8db9971bac1 100644 --- a/src/node_platform.cc +++ b/src/node_platform.cc @@ -300,6 +300,10 @@ void WorkerThreadsTaskRunner::BlockingDrain() { pending_worker_tasks_.Lock().BlockingDrain(); } +bool WorkerThreadsTaskRunner::TimedBlockingDrain(uint64_t timeout_ns) { + return pending_worker_tasks_.Lock().TimedBlockingDrain(timeout_ns); +} + void WorkerThreadsTaskRunner::Shutdown() { pending_worker_tasks_.Lock().Stop(); delayed_task_scheduler_->Stop(); @@ -580,27 +584,33 @@ void NodePlatform::DrainTasks(Isolate* isolate) { std::shared_ptr per_isolate = ForNodeIsolate(isolate); if (!per_isolate) return; + // Use a timed drain instead of blocking indefinitely on outstanding worker + // tasks. This fixes the deadlock described in: + // https://github.com/nodejs/node/issues/54918 + // + // A kUserBlocking worker task (e.g. Maglev JIT compilation) may post a + // foreground task and wait for it to complete. If that foreground task is + // posted after we flush the foreground queue but before we enter the wait + // inside BlockingDrain(), the main thread sleeps while the worker is also + // waiting for its foreground task — a mutual deadlock. + // + // With a timed drain, the main thread wakes up every kDrainIntervalNs and + // flushes any foreground tasks that were posted in that window, allowing + // the blocked worker task to proceed. In the common case, tasks complete + // before the timeout and NotifyOfOutstandingCompletion() wakes the wait + // immediately, so there is no performance regression. + // + // We still wait only on kUserBlocking tasks (tracked via is_outstanding()) + // because they are documented to require completion before execution + // continues (e.g. wasm async compilation). + static constexpr uint64_t kDrainIntervalNs = 1'000'000; // 1ms in nanoseconds + do { - // FIXME(54918): we should not be blocking on the worker tasks on the - // main thread in one go. Doing so leads to two problems: - // 1. If any of the worker tasks post another foreground task and wait - // for it to complete, and that foreground task is posted right after - // we flush the foreground task queue and before the foreground thread - // goes into sleep, we'll never be able to wake up to execute that - // foreground task and in turn the worker task will never complete, and - // we have a deadlock. - // 2. Worker tasks can be posted from any thread, not necessarily associated - // with the current isolate, and we can be blocking on a worker task that - // is associated with a completely unrelated isolate in the event loop. - // This is suboptimal. - // - // However, not blocking on the worker tasks at all can lead to loss of some - // critical user-blocking worker tasks e.g. wasm async compilation tasks, - // which should block the main thread until they are completed, as the - // documentation suggets. As a compromise, we currently only block on - // user-blocking tasks to reduce the chance of deadlocks while making sure - // that criticl user-blocking tasks are not lost. - worker_thread_task_runner_->BlockingDrain(); + while (!worker_thread_task_runner_->TimedBlockingDrain(kDrainIntervalNs)) { + // Timed out: a worker task may have posted a foreground task and is + // waiting for it. Flush the foreground queue now so it can proceed. + per_isolate->FlushForegroundTasksInternal(); + } } while (per_isolate->FlushForegroundTasksInternal()); } @@ -832,6 +842,16 @@ void TaskQueue::Locked::BlockingDrain() { } } +template +bool TaskQueue::Locked::TimedBlockingDrain(uint64_t timeout_ns) { + while (queue_->outstanding_tasks_ > 0) { + if (!queue_->outstanding_tasks_drained_.TimedWait(lock_, timeout_ns)) { + return false; // timed out, outstanding tasks still pending + } + } + return true; +} + template void TaskQueue::Locked::Stop() { queue_->stopped_ = true; diff --git a/src/node_platform.h b/src/node_platform.h index f47e2a46b66b84..302f278a1c1c33 100644 --- a/src/node_platform.h +++ b/src/node_platform.h @@ -53,6 +53,9 @@ class TaskQueue { std::unique_ptr BlockingPop(); void NotifyOfOutstandingCompletion(); void BlockingDrain(); + // Returns true if all outstanding tasks completed before the timeout, + // false if timed out. timeout_ns is in nanoseconds. + bool TimedBlockingDrain(uint64_t timeout_ns); void Stop(); PriorityQueue PopAll(); @@ -196,6 +199,9 @@ class WorkerThreadsTaskRunner { double delay_in_seconds); void BlockingDrain(); + // Returns true if all outstanding tasks completed before the timeout, + // false if timed out. timeout_ns is in nanoseconds. + bool TimedBlockingDrain(uint64_t timeout_ns); void Shutdown(); int NumberOfWorkerThreads() const; From 01323a3e5c286f030d9611aa028e80bb42d69873 Mon Sep 17 00:00:00 2001 From: RajeshKumar11 Date: Tue, 24 Feb 2026 09:49:36 +0530 Subject: [PATCH 2/2] fixup! src: fix deadlock in NodePlatform::DrainTasks on process shutdown Fix clang-format alignment issues in node_mutex.h: - Split cond_timedwait parameters onto separate lines - Correct continuation alignment in TimedWait implementation --- src/node_mutex.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/node_mutex.h b/src/node_mutex.h index 199df0250001e5..b8831a9805cf66 100644 --- a/src/node_mutex.h +++ b/src/node_mutex.h @@ -178,7 +178,8 @@ struct LibuvMutexTraits { uv_cond_wait(cond, mutex); } - static inline int cond_timedwait(CondT* cond, MutexT* mutex, + static inline int cond_timedwait(CondT* cond, + MutexT* mutex, uint64_t timeout) { return uv_cond_timedwait(cond, mutex, timeout); } @@ -259,9 +260,9 @@ void ConditionVariableBase::Wait(const ScopedLock& scoped_lock) { template bool ConditionVariableBase::TimedWait(const ScopedLock& scoped_lock, - uint64_t timeout_ns) { + uint64_t timeout_ns) { return Traits::cond_timedwait( - &cond_, &scoped_lock.mutex_.mutex_, timeout_ns) == 0; + &cond_, &scoped_lock.mutex_.mutex_, timeout_ns) == 0; } template