diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-05-13-30-00.gh-issue-144513.IjSTd7.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-05-13-30-00.gh-issue-144513.IjSTd7.rst
new file mode 100644
index 00000000000000..f97160172735e1
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-05-13-30-00.gh-issue-144513.IjSTd7.rst
@@ -0,0 +1,2 @@
+Fix potential deadlock when using critical sections during stop-the-world
+pauses in the free-threaded build.
diff --git a/Modules/_testinternalcapi/test_critical_sections.c b/Modules/_testinternalcapi/test_critical_sections.c
index e3b2fe716d48d3..72a1fa2cdc7224 100644
--- a/Modules/_testinternalcapi/test_critical_sections.c
+++ b/Modules/_testinternalcapi/test_critical_sections.c
@@ -4,6 +4,8 @@
 
 #include "parts.h"
 #include "pycore_critical_section.h"
+#include "pycore_pystate.h"
+#include "pycore_pythread.h"
 
 #ifdef MS_WINDOWS
 # include <windows.h>          // Sleep()
@@ -381,6 +383,87 @@ test_critical_section2_reacquisition(PyObject *self, PyObject *Py_UNUSED(args))
 
 #endif  // Py_GIL_DISABLED
 
+#ifdef Py_CAN_START_THREADS
+
+// gh-144513: Test that critical sections don't deadlock with stop-the-world.
+// This test is designed to deadlock (timeout) on builds without the fix.
+struct test_data_stw {
+    PyObject *obj;
+    Py_ssize_t num_threads;
+    Py_ssize_t started;
+    PyEvent ready;
+};
+
+static void
+thread_stw(void *arg)
+{
+    struct test_data_stw *test_data = arg;
+    PyGILState_STATE gil = PyGILState_Ensure();
+
+    if (_Py_atomic_add_ssize(&test_data->started, 1) == test_data->num_threads - 1) {
+        _PyEvent_Notify(&test_data->ready);
+    }
+
+    // All threads: acquire critical section and hold it long enough to
+    // trigger TIME_TO_BE_FAIR_NS (1 ms), which causes direct handoff on unlock.
+    Py_BEGIN_CRITICAL_SECTION(test_data->obj);
+    pysleep(10);  // 10 ms = 10 x TIME_TO_BE_FAIR_NS
+    Py_END_CRITICAL_SECTION();
+
+    PyGILState_Release(gil);
+}
+
+static PyObject *
+test_critical_sections_stw(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    // gh-144513: Test that critical sections don't deadlock during STW.
+    //
+    // The deadlock occurs when lock ownership is handed off (due to fairness
+    // after TIME_TO_BE_FAIR_NS) to a thread that has already suspended for
+    // stop-the-world. The STW requester then cannot acquire the lock.
+    //
+    // With the fix, the STW requester detects world_stopped and skips locking.
+
+    #define STW_NUM_THREADS 2
+    struct test_data_stw test_data = {
+        .obj = PyDict_New(),
+        .num_threads = STW_NUM_THREADS,
+    };
+    if (test_data.obj == NULL) {
+        return NULL;
+    }
+
+    PyThread_handle_t handles[STW_NUM_THREADS];
+    PyThread_ident_t idents[STW_NUM_THREADS];
+    for (Py_ssize_t i = 0; i < STW_NUM_THREADS; i++) {
+        PyThread_start_joinable_thread(&thread_stw, &test_data,
+                                       &idents[i], &handles[i]);
+    }
+
+    // Wait for threads to start, then let them compete for the lock
+    PyEvent_Wait(&test_data.ready);
+    pysleep(5);
+
+    // Request stop-the-world and try to acquire the critical section.
+    // Without the fix, this may deadlock.
+    PyInterpreterState *interp = PyInterpreterState_Get();
+    _PyEval_StopTheWorld(interp);
+
+    Py_BEGIN_CRITICAL_SECTION(test_data.obj);
+    Py_END_CRITICAL_SECTION();
+
+    _PyEval_StartTheWorld(interp);
+
+    for (Py_ssize_t i = 0; i < STW_NUM_THREADS; i++) {
+        PyThread_join_thread(handles[i]);
+    }
+    #undef STW_NUM_THREADS
+    Py_DECREF(test_data.obj);
+    Py_RETURN_NONE;
+}
+
+#endif  // Py_CAN_START_THREADS
+
 static PyMethodDef test_methods[] = {
     {"test_critical_sections", test_critical_sections, METH_NOARGS},
     {"test_critical_sections_nest", test_critical_sections_nest, METH_NOARGS},
@@ -392,6 +475,7 @@ static PyMethodDef test_methods[] = {
 #ifdef Py_CAN_START_THREADS
     {"test_critical_sections_threads", test_critical_sections_threads, METH_NOARGS},
     {"test_critical_sections_gc", test_critical_sections_gc, METH_NOARGS},
+    {"test_critical_sections_stw", test_critical_sections_stw, METH_NOARGS},
 #endif
     {NULL, NULL}  /* sentinel */
 };
diff --git a/Python/critical_section.c b/Python/critical_section.c
index 2c2152f5de4716..98e23eda7cdd77 100644
--- a/Python/critical_section.c
+++ b/Python/critical_section.c
@@ -1,7 +1,8 @@
 #include "Python.h"
 
-#include "pycore_lock.h"
 #include "pycore_critical_section.h"
+#include "pycore_interp.h"
+#include "pycore_lock.h"
 
 #ifdef Py_GIL_DISABLED
 static_assert(_Alignof(PyCriticalSection) >= 4,
@@ -42,6 +43,15 @@ _PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMutex *m)
             }
         }
     }
+    // If the world is stopped, we don't need to acquire the lock because
+    // there are no other threads that could be accessing the object.
+    // Without this check, acquiring a critical section while the world is
+    // stopped could lead to a deadlock.
+    if (tstate->interp->stoptheworld.world_stopped) {
+        c->_cs_mutex = NULL;
+        c->_cs_prev = 0;
+        return;
+    }
     c->_cs_mutex = NULL;
     c->_cs_prev = (uintptr_t)tstate->critical_section;
     tstate->critical_section = (uintptr_t)c;
@@ -56,6 +66,12 @@ _PyCriticalSection2_BeginSlow(PyThreadState *tstate, PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2,
                              int is_m1_locked)
 {
 #ifdef Py_GIL_DISABLED
+    if (tstate->interp->stoptheworld.world_stopped) {
+        c->_cs_base._cs_mutex = NULL;
+        c->_cs_mutex2 = NULL;
+        c->_cs_base._cs_prev = 0;
+        return;
+    }
     c->_cs_base._cs_mutex = NULL;
     c->_cs_mutex2 = NULL;
     c->_cs_base._cs_prev = tstate->critical_section;