Skip to content

Commit db08ab1

Browse files
committed
gh-144513: Skip critical section locking during stop-the-world
When the interpreter is in a stop-the-world pause, critical sections don't need to acquire locks since no other threads can be running. This avoids a potential deadlock where lock fairness hands off ownership to a thread that has already suspended for stop-the-world.
1 parent 7e777c5 commit db08ab1

File tree

3 files changed

+100
-1
lines changed

3 files changed

+100
-1
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix potential deadlock when using critical sections during stop-the-world
2+
pauses in the free-threaded build.

Modules/_testinternalcapi/test_critical_sections.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
#include "parts.h"
66
#include "pycore_critical_section.h"
7+
#include "pycore_pystate.h"
8+
#include "pycore_pythread.h"
79

810
#ifdef MS_WINDOWS
911
# include <windows.h> // Sleep()
@@ -381,6 +383,86 @@ test_critical_section2_reacquisition(PyObject *self, PyObject *Py_UNUSED(args))
381383

382384
#endif // Py_GIL_DISABLED
383385

386+
#ifdef Py_CAN_START_THREADS
387+
388+
// gh-144513: Test that critical sections don't deadlock with stop-the-world.
389+
// This test is designed to deadlock (timeout) on builds without the fix.
390+
struct test_data_stw {
391+
PyObject *obj;
392+
Py_ssize_t num_threads;
393+
Py_ssize_t started;
394+
PyEvent ready;
395+
};
396+
397+
static void
398+
thread_stw(void *arg)
399+
{
400+
struct test_data_stw *test_data = arg;
401+
PyGILState_STATE gil = PyGILState_Ensure();
402+
403+
if (_Py_atomic_add_ssize(&test_data->started, 1) == test_data->num_threads - 1) {
404+
_PyEvent_Notify(&test_data->ready);
405+
}
406+
407+
// All threads: acquire critical section and hold it long enough to
408+
// trigger TIME_TO_BE_FAIR_NS, which causes direct handoff on unlock.
409+
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
410+
pysleep(10);
411+
Py_END_CRITICAL_SECTION();
412+
413+
PyGILState_Release(gil);
414+
}
415+
416+
static PyObject *
417+
test_critical_sections_stw(PyObject *self, PyObject *Py_UNUSED(args))
418+
{
419+
// gh-144513: Test that critical sections don't deadlock during STW.
420+
//
421+
// The deadlock occurs when lock ownership is handed off (due to fairness
422+
// after TIME_TO_BE_FAIR_NS) to a thread that has already suspended for
423+
// stop-the-world. The STW requester then cannot acquire the lock.
424+
//
425+
// With the fix, the STW requester detects world_stopped and skips locking.
426+
427+
const Py_ssize_t NUM_THREADS = 2;
428+
struct test_data_stw test_data = {
429+
.obj = PyDict_New(),
430+
.num_threads = NUM_THREADS,
431+
};
432+
if (test_data.obj == NULL) {
433+
return NULL;
434+
}
435+
436+
PyThread_handle_t handles[NUM_THREADS];
437+
PyThread_ident_t idents[NUM_THREADS];
438+
for (Py_ssize_t i = 0; i < NUM_THREADS; i++) {
439+
PyThread_start_joinable_thread(&thread_stw, &test_data,
440+
&idents[i], &handles[i]);
441+
}
442+
443+
// Wait for threads to start, then let them compete for the lock
444+
PyEvent_Wait(&test_data.ready);
445+
pysleep(5);
446+
447+
// Request stop-the-world and try to acquire the critical section.
448+
// Without the fix, this may deadlock.
449+
PyInterpreterState *interp = PyInterpreterState_Get();
450+
_PyEval_StopTheWorld(interp);
451+
452+
Py_BEGIN_CRITICAL_SECTION(test_data.obj);
453+
Py_END_CRITICAL_SECTION();
454+
455+
_PyEval_StartTheWorld(interp);
456+
457+
for (Py_ssize_t i = 0; i < NUM_THREADS; i++) {
458+
PyThread_join_thread(handles[i]);
459+
}
460+
Py_DECREF(test_data.obj);
461+
Py_RETURN_NONE;
462+
}
463+
464+
#endif // Py_CAN_START_THREADS
465+
384466
static PyMethodDef test_methods[] = {
385467
{"test_critical_sections", test_critical_sections, METH_NOARGS},
386468
{"test_critical_sections_nest", test_critical_sections_nest, METH_NOARGS},
@@ -392,6 +474,7 @@ static PyMethodDef test_methods[] = {
392474
#ifdef Py_CAN_START_THREADS
393475
{"test_critical_sections_threads", test_critical_sections_threads, METH_NOARGS},
394476
{"test_critical_sections_gc", test_critical_sections_gc, METH_NOARGS},
477+
{"test_critical_sections_stw", test_critical_sections_stw, METH_NOARGS},
395478
#endif
396479
{NULL, NULL} /* sentinel */
397480
};

Python/critical_section.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#include "Python.h"
22

3-
#include "pycore_lock.h"
43
#include "pycore_critical_section.h"
4+
#include "pycore_interp.h"
5+
#include "pycore_lock.h"
56

67
#ifdef Py_GIL_DISABLED
78
static_assert(_Alignof(PyCriticalSection) >= 4,
@@ -42,6 +43,13 @@ _PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMute
4243
}
4344
}
4445
}
46+
// If the world is stopped, we don't need to acquire the lock because
47+
// there are no other threads that could be accessing the object.
48+
if (tstate->interp->stoptheworld.world_stopped) {
49+
c->_cs_mutex = NULL;
50+
c->_cs_prev = 0;
51+
return;
52+
}
4553
c->_cs_mutex = NULL;
4654
c->_cs_prev = (uintptr_t)tstate->critical_section;
4755
tstate->critical_section = (uintptr_t)c;
@@ -56,6 +64,12 @@ _PyCriticalSection2_BeginSlow(PyThreadState *tstate, PyCriticalSection2 *c, PyMu
5664
int is_m1_locked)
5765
{
5866
#ifdef Py_GIL_DISABLED
67+
if (tstate->interp->stoptheworld.world_stopped) {
68+
c->_cs_base._cs_mutex = NULL;
69+
c->_cs_mutex2 = NULL;
70+
c->_cs_base._cs_prev = 0;
71+
return;
72+
}
5973
c->_cs_base._cs_mutex = NULL;
6074
c->_cs_mutex2 = NULL;
6175
c->_cs_base._cs_prev = tstate->critical_section;

0 commit comments

Comments
 (0)