Skip to content

Commit d82fe38

Browse files
fix: replace concorekill.bat single-PID overwrite with safe PID registry (#391)
Problem: On Windows, concore.py wrote a single PID to concorekill.bat at import time using 'w' mode. When multiple Python nodes started simultaneously (via makestudy), each overwrote the file. Only the last node's PID survived, making the kill script unable to terminate other nodes. Stale PIDs from crashed studies could also kill unrelated processes. Solution: - Replace single-PID overwrite with append-based PID registry (concorekill_pids.txt) storing one PID per line - Register atexit handler to remove current PID on graceful shutdown - Generate concorekill.bat dynamically with PID validation: each PID is checked via tasklist before taskkill executes - Clean up both files when last node exits - Backward compatible: users still run concorekill.bat Added 9 tests covering registration, cleanup, multi-node scenarios, missing registry handling, and kill script generation. Fixes #391
1 parent b6cfcaa commit d82fe38

2 files changed

Lines changed: 224 additions & 6 deletions

File tree

concore.py

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,98 @@
2121
logging.getLogger('requests').setLevel(logging.WARNING)
2222

2323

24-
# if windows, create script to kill this process
25-
# because batch files don't provide easy way to know pid of last command
26-
# ignored for posix != windows, because "concorepid" is handled by script
27-
# ignored for docker (linux != windows), because handled by docker stop
24+
# ===================================================================
25+
# Windows PID Registry (Issue #391)
26+
# ===================================================================
27+
# Previous implementation wrote a single PID to concorekill.bat at
28+
# import time, causing race conditions when multiple Python nodes
29+
# started simultaneously (each overwrote the file). Only the last
30+
# node's PID survived, and stale PIDs could kill unrelated processes.
31+
#
32+
# New approach:
33+
# - Append each node's PID to concorekill_pids.txt (one per line)
34+
# - Generate concorekill.bat that validates each PID before killing
35+
# - Use atexit to remove the PID on graceful shutdown
36+
# - Backward compatible: users still run concorekill.bat
37+
# ===================================================================
38+
39+
_PID_REGISTRY_FILE = "concorekill_pids.txt"
40+
_KILL_SCRIPT_FILE = "concorekill.bat"
41+
42+
43+
def _register_pid():
44+
"""Append the current process PID to the registry file."""
45+
pid = os.getpid()
46+
try:
47+
with open(_PID_REGISTRY_FILE, "a") as f:
48+
f.write(str(pid) + "\n")
49+
except OSError:
50+
pass # Non-fatal: best-effort registration
51+
52+
53+
def _cleanup_pid():
54+
"""Remove the current process PID from the registry on exit."""
55+
pid = str(os.getpid())
56+
try:
57+
if not os.path.exists(_PID_REGISTRY_FILE):
58+
return
59+
with open(_PID_REGISTRY_FILE, "r") as f:
60+
pids = [line.strip() for line in f if line.strip()]
61+
remaining = [p for p in pids if p != pid]
62+
if remaining:
63+
with open(_PID_REGISTRY_FILE, "w") as f:
64+
for p in remaining:
65+
f.write(p + "\n")
66+
else:
67+
# No PIDs left — clean up both files
68+
try:
69+
os.remove(_PID_REGISTRY_FILE)
70+
except OSError:
71+
pass
72+
try:
73+
os.remove(_KILL_SCRIPT_FILE)
74+
except OSError:
75+
pass
76+
except OSError:
77+
pass # Non-fatal: best-effort cleanup
78+
79+
80+
def _write_kill_script():
81+
"""Generate concorekill.bat that validates PIDs before killing.
82+
83+
The script reads concorekill_pids.txt and for each PID:
84+
1. Checks if the process still exists
85+
2. Verifies it is a Python process
86+
3. Only then issues taskkill
87+
After killing, the registry file is deleted.
88+
"""
89+
try:
90+
script = "@echo off\r\n"
91+
script += "if not exist \"%~dp0" + _PID_REGISTRY_FILE + "\" (\r\n"
92+
script += " echo No PID registry found. Nothing to kill.\r\n"
93+
script += " exit /b 0\r\n"
94+
script += ")\r\n"
95+
script += "for /f \"tokens=*\" %%p in (%~dp0" + _PID_REGISTRY_FILE + ") do (\r\n"
96+
script += " tasklist /FI \"PID eq %%p\" 2>nul | find /i \"python\" >nul\r\n"
97+
script += " if not errorlevel 1 (\r\n"
98+
script += " echo Killing Python process %%p\r\n"
99+
script += " taskkill /F /PID %%p >nul 2>&1\r\n"
100+
script += " ) else (\r\n"
101+
script += " echo Skipping PID %%p - not a Python process or not running\r\n"
102+
script += " )\r\n"
103+
script += ")\r\n"
104+
script += "del /q \"%~dp0" + _PID_REGISTRY_FILE + "\" 2>nul\r\n"
105+
script += "del /q \"%~dp0" + _KILL_SCRIPT_FILE + "\" 2>nul\r\n"
106+
with open(_KILL_SCRIPT_FILE, "w") as f:
107+
f.write(script)
108+
except OSError:
109+
pass # Non-fatal: best-effort script generation
110+
111+
28112
if hasattr(sys, 'getwindowsversion'):
29-
with open("concorekill.bat","w") as fpid:
30-
fpid.write("taskkill /F /PID "+str(os.getpid())+"\n")
113+
_register_pid()
114+
_write_kill_script()
115+
atexit.register(_cleanup_pid)
31116

32117
ZeroMQPort = concore_base.ZeroMQPort
33118
convert_numpy_to_python = concore_base.convert_numpy_to_python

tests/test_concore.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,141 @@
11
import pytest
22
import os
3+
import sys
34
import numpy as np
45

56

7+
# ===================================================================
8+
# PID Registry Tests (Issue #391)
9+
# ===================================================================
10+
11+
class TestPidRegistry:
12+
"""Tests for the Windows PID registry mechanism that replaces the
13+
old single-overwrite concorekill.bat approach."""
14+
15+
@pytest.fixture(autouse=True)
16+
def use_temp_dir(self, temp_dir, monkeypatch):
17+
"""Run each test in an isolated temp directory."""
18+
self.temp_dir = temp_dir
19+
monkeypatch.chdir(temp_dir)
20+
21+
def test_register_pid_creates_registry_file(self):
22+
"""_register_pid should create concorekill_pids.txt with current PID."""
23+
from concore import _register_pid, _PID_REGISTRY_FILE
24+
_register_pid()
25+
assert os.path.exists(_PID_REGISTRY_FILE)
26+
with open(_PID_REGISTRY_FILE) as f:
27+
pids = [line.strip() for line in f if line.strip()]
28+
assert str(os.getpid()) in pids
29+
30+
def test_register_pid_appends_not_overwrites(self):
31+
"""Multiple calls to _register_pid should append, not overwrite."""
32+
from concore import _register_pid, _PID_REGISTRY_FILE
33+
# Simulate two different PIDs by writing manually then registering
34+
with open(_PID_REGISTRY_FILE, "w") as f:
35+
f.write("11111\n")
36+
f.write("22222\n")
37+
_register_pid()
38+
with open(_PID_REGISTRY_FILE) as f:
39+
pids = [line.strip() for line in f if line.strip()]
40+
assert "11111" in pids
41+
assert "22222" in pids
42+
assert str(os.getpid()) in pids
43+
assert len(pids) == 3
44+
45+
def test_cleanup_pid_removes_current_pid(self):
46+
"""_cleanup_pid should remove only the current PID from the registry."""
47+
from concore import _cleanup_pid, _PID_REGISTRY_FILE
48+
current_pid = str(os.getpid())
49+
with open(_PID_REGISTRY_FILE, "w") as f:
50+
f.write("99999\n")
51+
f.write(current_pid + "\n")
52+
f.write("88888\n")
53+
_cleanup_pid()
54+
with open(_PID_REGISTRY_FILE) as f:
55+
pids = [line.strip() for line in f if line.strip()]
56+
assert current_pid not in pids
57+
assert "99999" in pids
58+
assert "88888" in pids
59+
60+
def test_cleanup_pid_deletes_files_when_last_pid(self):
61+
"""When the current PID is the only one left, cleanup should
62+
remove both the registry file and the kill script."""
63+
from concore import _cleanup_pid, _PID_REGISTRY_FILE, _KILL_SCRIPT_FILE
64+
current_pid = str(os.getpid())
65+
with open(_PID_REGISTRY_FILE, "w") as f:
66+
f.write(current_pid + "\n")
67+
# Create a dummy kill script to verify it gets cleaned up
68+
with open(_KILL_SCRIPT_FILE, "w") as f:
69+
f.write("@echo off\n")
70+
_cleanup_pid()
71+
assert not os.path.exists(_PID_REGISTRY_FILE)
72+
assert not os.path.exists(_KILL_SCRIPT_FILE)
73+
74+
def test_cleanup_pid_handles_missing_registry(self):
75+
"""_cleanup_pid should not crash when registry file doesn't exist."""
76+
from concore import _cleanup_pid, _PID_REGISTRY_FILE
77+
assert not os.path.exists(_PID_REGISTRY_FILE)
78+
_cleanup_pid() # Should not raise
79+
80+
def test_write_kill_script_generates_bat_file(self):
81+
"""_write_kill_script should create concorekill.bat with validation logic."""
82+
from concore import _write_kill_script, _KILL_SCRIPT_FILE, _PID_REGISTRY_FILE
83+
_write_kill_script()
84+
assert os.path.exists(_KILL_SCRIPT_FILE)
85+
with open(_KILL_SCRIPT_FILE) as f:
86+
content = f.read()
87+
# Script should reference the PID registry file
88+
assert _PID_REGISTRY_FILE in content
89+
# Script should validate processes before killing
90+
assert "tasklist" in content
91+
assert "taskkill" in content
92+
assert "python" in content.lower()
93+
94+
def test_multi_node_registration(self):
95+
"""Simulate 3 nodes registering PIDs — all should be present."""
96+
from concore import _register_pid, _PID_REGISTRY_FILE
97+
fake_pids = ["1204", "1932", "8120"]
98+
with open(_PID_REGISTRY_FILE, "w") as f:
99+
for pid in fake_pids:
100+
f.write(pid + "\n")
101+
_register_pid() # Current process is the 4th
102+
with open(_PID_REGISTRY_FILE) as f:
103+
pids = [line.strip() for line in f if line.strip()]
104+
for pid in fake_pids:
105+
assert pid in pids
106+
assert str(os.getpid()) in pids
107+
assert len(pids) == 4
108+
109+
def test_cleanup_preserves_other_pids(self):
110+
"""After cleanup, only the current process PID should be removed."""
111+
from concore import _cleanup_pid, _PID_REGISTRY_FILE
112+
current_pid = str(os.getpid())
113+
other_pids = ["1111", "2222", "3333"]
114+
with open(_PID_REGISTRY_FILE, "w") as f:
115+
for pid in other_pids:
116+
f.write(pid + "\n")
117+
f.write(current_pid + "\n")
118+
_cleanup_pid()
119+
with open(_PID_REGISTRY_FILE) as f:
120+
pids = [line.strip() for line in f if line.strip()]
121+
assert len(pids) == 3
122+
assert current_pid not in pids
123+
for pid in other_pids:
124+
assert pid in pids
125+
126+
@pytest.mark.skipif(not hasattr(sys, 'getwindowsversion'),
127+
reason="Windows-only test")
128+
def test_import_registers_pid_on_windows(self):
129+
"""On Windows, importing concore should register the PID."""
130+
from concore import _PID_REGISTRY_FILE
131+
# The import already happened, so just verify the registry exists
132+
# in our temp dir (we can't easily test the import-time side effect
133+
# since concore was already imported — we test the functions directly)
134+
from concore import _register_pid
135+
_register_pid()
136+
assert os.path.exists(_PID_REGISTRY_FILE)
137+
138+
6139
class TestSafeLiteralEval:
7140
def test_reads_dictionary_from_file(self, temp_dir):
8141
test_file = os.path.join(temp_dir, "config.txt")

0 commit comments

Comments
 (0)