From 28245599889504164c4bd7d8a47d8e4d4d41b70b Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 12:56:53 +0100 Subject: [PATCH 01/29] better EGL check handling --- whippersnappy/gl/_headless.py | 38 ++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 3a8214a..54b65cd 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -44,12 +44,20 @@ def _osmesa_is_available(): def egl_device_is_available(): - """Return True if libEGL is loadable AND a DRI render node is accessible. + """Return True if libEGL is loadable AND a DRI render node is present. - Checking for ``/dev/dri/renderD*`` existence and readability guards - against Singularity/Docker containers that have EGL libraries installed - but no device nodes bound in — in those cases EGL context creation would - fail and we should fall back to OSMesa instead. + We check that at least one ``/dev/dri/renderD*`` node exists and that + ``libEGL`` can be loaded. We intentionally do **not** gate on + ``os.access(node, os.R_OK)`` here because that POSIX check does not + honour supplementary group memberships on all kernels (e.g. when the + process inherits group ``render`` via newgrp or a login session) and + does not account for POSIX ACL entries (the ``+`` suffix in ``ls -l`` + output). If the node exists and EGL is installed we optimistically try + EGL and let the context-creation call fail gracefully if the device truly + turns out to be inaccessible. + + We still skip EGL if *no* device node exists at all — that is the + reliable Singularity/Docker signal where no device is bound in. This function is called both here (at import time) and from :func:`~whippersnappy.gl.context.init_offscreen_context` (at context @@ -59,17 +67,16 @@ def egl_device_is_available(): if not render_nodes: logger.debug("EGL: no /dev/dri/renderD* device nodes found — skipping EGL.") return False - if not any(os.access(n, os.R_OK) for n in render_nodes): - logger.debug("EGL: /dev/dri/renderD* exists but not readable — skipping EGL.") - return False for name in ("libEGL.so.1", "libEGL.so"): try: ctypes.CDLL(name) - logger.debug("EGL: libEGL found and render node accessible.") + logger.debug( + "EGL: libEGL found and %d render node(s) present.", len(render_nodes) + ) return True except OSError: continue - logger.debug("EGL: libEGL not found.") + logger.debug("EGL: /dev/dri/renderD* found but libEGL not loadable.") return False @@ -98,15 +105,18 @@ def egl_device_is_available(): "whippersnappy requires an OpenGL context but none could be found.\n" "\n" "No display server detected (DISPLAY / WAYLAND_DISPLAY are unset),\n" - "no accessible GPU render device (/dev/dri/renderD*), and OSMesa\n" - "is not installed.\n" + "no GPU render device found (/dev/dri/renderD* absent or libEGL missing),\n" + "and OSMesa is not installed.\n" "\n" "To fix this, choose one of:\n" " 1. Install OSMesa (recommended for headless/SSH use):\n" " Debian/Ubuntu: sudo apt-get install libosmesa6\n" " RHEL/Fedora: sudo dnf install mesa-libOSMesa\n" - " 2. Use EGL GPU rendering by ensuring /dev/dri/renderD* is accessible\n" - " and libEGL is installed (libegl1 on Debian/Ubuntu).\n" + " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* exists and\n" + " libEGL is installed (libegl1 on Debian/Ubuntu). If the device\n" + " exists but you still see this error, add your user to the\n" + " 'render' group: sudo usermod -aG render $USER\n" + " (then log out and back in).\n" " 3. Set DISPLAY if a local X server is running:\n" " export DISPLAY=:1\n" ) From 027c8b7ccae4183e7e7526b779d977ed1d56c373 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 13:07:32 +0100 Subject: [PATCH 02/29] better error log messages and exits --- whippersnappy/cli/whippersnap.py | 45 ++++++++++++++++--------------- whippersnappy/cli/whippersnap1.py | 7 +++-- whippersnappy/cli/whippersnap4.py | 7 +++-- whippersnappy/gl/context.py | 8 +++--- 4 files changed, 37 insertions(+), 30 deletions(-) diff --git a/whippersnappy/cli/whippersnap.py b/whippersnappy/cli/whippersnap.py index 1a5c074..fa628db 100644 --- a/whippersnappy/cli/whippersnap.py +++ b/whippersnappy/cli/whippersnap.py @@ -720,22 +720,21 @@ def run(): # ------------------------------------------------------------------ if QApplication is None: print( - "ERROR: Interactive mode requires PyQt6. " + "Error: Interactive mode requires PyQt6. " "Install with: pip install 'whippersnappy[gui]'", file=sys.stderr, ) - raise RuntimeError( - "Interactive mode requires PyQt6. " - "Install with: pip install 'whippersnappy[gui]'" - ) + sys.exit(1) try: from ..gui import ConfigWindow # noqa: PLC0415 except ModuleNotFoundError as e: - raise RuntimeError( - "Interactive mode requires PyQt6. " - "Install with: pip install 'whippersnappy[gui]'" - ) from e + print( + f"Error: Interactive mode requires PyQt6 ({e}). " + "Install with: pip install 'whippersnappy[gui]'", + file=sys.stderr, + ) + sys.exit(1) current_fthresh_ = args.fthresh current_fmax_ = args.fmax @@ -756,18 +755,22 @@ def run(): # show_window creates the GLFW window, sets up a QTimer render loop, # then calls app.exec() — returns when either window is closed. - show_window( - mesh=mesh_path, - overlay=overlay, - annot=args.annot, - bg_map=bg_map, - roi=roi, - invert=args.invert, - specular=args.specular, - view=view, - app=app, - config_window=config_window, - ) + try: + show_window( + mesh=mesh_path, + overlay=overlay, + annot=args.annot, + bg_map=bg_map, + roi=roi, + invert=args.invert, + specular=args.specular, + view=view, + app=app, + config_window=config_window, + ) + except (RuntimeError, FileNotFoundError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/whippersnappy/cli/whippersnap1.py b/whippersnappy/cli/whippersnap1.py index 2fe5625..cf96cb9 100644 --- a/whippersnappy/cli/whippersnap1.py +++ b/whippersnappy/cli/whippersnap1.py @@ -43,12 +43,12 @@ import argparse import logging import os +import sys import tempfile import numpy as np if __name__ == "__main__" and __package__ is None: - import sys os.execv(sys.executable, [sys.executable, "-m", "whippersnappy.cli.whippersnap1"] + sys.argv[1:]) from .. import snap1, snap_rotate @@ -309,8 +309,11 @@ def run(): ambient=args.ambient, ) log.info("Snapshot saved to %s (%dx%d)", outpath, img.width, img.height) - except (RuntimeError, FileNotFoundError, ValueError, ImportError) as e: + except ValueError as e: parser.error(str(e)) + except (RuntimeError, FileNotFoundError, ImportError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/whippersnappy/cli/whippersnap4.py b/whippersnappy/cli/whippersnap4.py index a91fa80..ca37b00 100644 --- a/whippersnappy/cli/whippersnap4.py +++ b/whippersnappy/cli/whippersnap4.py @@ -16,12 +16,12 @@ import argparse import logging import os +import sys import tempfile import numpy as np if __name__ == "__main__" and __package__ is None: - import sys os.execv(sys.executable, [sys.executable, "-m", "whippersnappy.cli.whippersnap4"] + sys.argv[1:]) from .. import snap4 @@ -210,8 +210,11 @@ def run(): logger.info( "Snapshot saved to %s (%dx%d)", args.output_path, img.width, img.height ) - except (RuntimeError, FileNotFoundError, ValueError) as e: + except ValueError as e: parser.error(str(e)) + except (RuntimeError, FileNotFoundError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index 60ab95c..71803dd 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -207,7 +207,7 @@ def init_offscreen_context(width, height): logger.warning("EGL failed (%s) — falling back to OSMesa.", exc) # --- Step 3: OSMesa software rendering --- - logger.info("Trying OSMesa software rendering (CPU).") + logger.debug("Trying OSMesa software rendering (CPU).") try: from .osmesa_context import OSMesaContext # noqa: PLC0415 ctx = OSMesaContext(width, height) @@ -217,10 +217,8 @@ def init_offscreen_context(width, height): return None except (ImportError, RuntimeError) as exc: raise RuntimeError( - "Could not create any OpenGL context (tried GLFW, EGL, OSMesa). " - f"Last error: {exc}\n" - "Install OSMesa: sudo apt-get install libosmesa6 (Debian/Ubuntu)\n" - " or sudo dnf install mesa-libOSMesa (RHEL/Fedora)" + "Could not create any OpenGL context (tried GLFW invisible window and OSMesa). " + f"Last error: {exc}" ) from exc From 714cfcf77ed7dfb7ed6de8f397d1bd025811a693 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 13:16:44 +0100 Subject: [PATCH 03/29] improve detection of DISPLAY set but not working --- whippersnappy/gl/_headless.py | 155 ++++++++++++++++++++++------------ whippersnappy/gl/context.py | 27 +++--- 2 files changed, 116 insertions(+), 66 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 54b65cd..b1665a0 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -5,20 +5,26 @@ that PyOpenGL resolves function pointers via the correct backend before ``OpenGL.GL`` is first imported. -Priority chain when no display is detected (Linux only): +Priority chain on Linux when no usable display is detected: -1. **EGL + GPU device** — ``/dev/dri/renderD*`` readable and ``libEGL`` +1. **EGL + GPU device** — ``/dev/dri/renderD*`` present and ``libEGL`` loadable. Sets ``PYOPENGL_PLATFORM=egl`` immediately so that PyOpenGL binds function pointers via EGL when ``OpenGL.GL`` is first imported. 2. **OSMesa** — CPU software renderer. Sets ``PYOPENGL_PLATFORM=osmesa``. 3. **Neither** — raises ``RuntimeError`` with install instructions. -When ``DISPLAY`` is set (e.g. normal desktop or ``ssh -Y``), ``_headless`` -does not intervene: GLFW is tried first in :func:`init_offscreen_context`. -If GLFW fails (e.g. GLX 3.3 unavailable on the forwarded display), EGL is -attempted only when ``PYOPENGL_PLATFORM`` was already set to ``"egl"`` by -this module at import time — i.e. only for the no-display + EGL-device case. -In all other GLFW-failure scenarios, OSMesa is used as the final fallback. +"No usable display" covers both: + +* ``DISPLAY`` / ``WAYLAND_DISPLAY`` are unset entirely. +* ``DISPLAY`` is set but the X server is unreachable or refuses the + connection (e.g. ``ssh -X``/``ssh -Y`` to a machine whose X server + does not support GLX 3.3, or a stale/wrong ``DISPLAY`` value). In + these cases GLFW will fail with a GLX error anyway, so we pre-empt it + by trying EGL/OSMesa instead. + +When ``DISPLAY`` is set **and** the X server is reachable, this module +does not intervene: GLFW is tried first in +:func:`~whippersnappy.gl.context.init_offscreen_context`. No OpenGL, GLFW, or other heavy imports are done here — only stdlib. """ @@ -43,6 +49,45 @@ def _osmesa_is_available(): return False +def _display_is_usable(): + """Return True if the X11 display named by ``DISPLAY`` is actually reachable. + + Tries to open a connection to the X server via ``XOpenDisplay`` (from + ``libX11``) without importing any OpenGL library. Returns ``False`` when: + + * ``DISPLAY`` is unset or empty. + * ``libX11`` cannot be loaded (headless system without X11 client libs). + * ``XOpenDisplay`` returns ``NULL`` (server unreachable, access denied, + or display string invalid). + + A ``True`` result means the X server accepted the connection; GLFW will + very likely be able to create a window (though GLX 3.3 availability is + not guaranteed — that is discovered later by GLFW itself). + """ + display_str = os.environ.get("DISPLAY") + if not display_str: + return False + for lib_name in ("libX11.so.6", "libX11.so"): + try: + libx11 = ctypes.CDLL(lib_name) + break + except OSError: + continue + else: + # libX11 not installed — treat as unusable display. + return False + try: + libx11.XOpenDisplay.restype = ctypes.c_void_p + libx11.XOpenDisplay.argtypes = [ctypes.c_char_p] + dpy = libx11.XOpenDisplay(display_str.encode()) + if dpy: + libx11.XCloseDisplay(dpy) + return True + return False + except Exception: # noqa: BLE001 + return False + + def egl_device_is_available(): """Return True if libEGL is loadable AND a DRI render node is present. @@ -80,49 +125,53 @@ def egl_device_is_available(): return False -if ( - sys.platform == "linux" - and "PYOPENGL_PLATFORM" not in os.environ - and not os.environ.get("DISPLAY") - and not os.environ.get("WAYLAND_DISPLAY") -): - if egl_device_is_available(): - # Set PYOPENGL_PLATFORM=egl NOW, before OpenGL.GL is imported anywhere. - # PyOpenGL selects its platform backend on first import and cannot be - # changed afterwards; deferring to egl_context.py would mean OpenGL.GL - # is already bound to the wrong backend by the time EGL is tried. - os.environ["PYOPENGL_PLATFORM"] = "egl" - logger.debug( - "No display, EGL + GPU device available — PYOPENGL_PLATFORM=egl set." - ) - elif _osmesa_is_available(): - os.environ["PYOPENGL_PLATFORM"] = "osmesa" - logger.debug( - "No display, no EGL device — PYOPENGL_PLATFORM=osmesa set (CPU rendering)." - ) - else: - raise RuntimeError( - "whippersnappy requires an OpenGL context but none could be found.\n" - "\n" - "No display server detected (DISPLAY / WAYLAND_DISPLAY are unset),\n" - "no GPU render device found (/dev/dri/renderD* absent or libEGL missing),\n" - "and OSMesa is not installed.\n" - "\n" - "To fix this, choose one of:\n" - " 1. Install OSMesa (recommended for headless/SSH use):\n" - " Debian/Ubuntu: sudo apt-get install libosmesa6\n" - " RHEL/Fedora: sudo dnf install mesa-libOSMesa\n" - " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* exists and\n" - " libEGL is installed (libegl1 on Debian/Ubuntu). If the device\n" - " exists but you still see this error, add your user to the\n" - " 'render' group: sudo usermod -aG render $USER\n" - " (then log out and back in).\n" - " 3. Set DISPLAY if a local X server is running:\n" - " export DISPLAY=:1\n" - ) -elif sys.platform == "linux": - _display = os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY") - logger.debug( - "Display set (%s) — will try GLFW first.", - _display, +if sys.platform == "linux" and "PYOPENGL_PLATFORM" not in os.environ: + _has_display = ( + bool(os.environ.get("DISPLAY")) or bool(os.environ.get("WAYLAND_DISPLAY")) ) + + if _has_display and _display_is_usable(): + # Reachable X/Wayland server — let GLFW try first; don't intervene. + _display = os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY") + logger.debug("Display set and reachable (%s) — will try GLFW first.", _display) + else: + # No display, or display is set but unreachable (e.g. bad ssh -X forward). + # Must choose a headless backend NOW before OpenGL.GL is imported. + if _has_display: + logger.debug( + "DISPLAY is set (%s) but X server is unreachable — " + "skipping GLFW/GLX and trying headless backends.", + os.environ.get("DISPLAY"), + ) + if egl_device_is_available(): + os.environ["PYOPENGL_PLATFORM"] = "egl" + logger.debug( + "No usable display; EGL + GPU device available — " + "PYOPENGL_PLATFORM=egl set." + ) + elif _osmesa_is_available(): + os.environ["PYOPENGL_PLATFORM"] = "osmesa" + logger.debug( + "No usable display; no EGL device — " + "PYOPENGL_PLATFORM=osmesa set (CPU rendering)." + ) + else: + raise RuntimeError( + "whippersnappy requires an OpenGL context but none could be found.\n" + "\n" + "No usable display detected (DISPLAY/WAYLAND_DISPLAY unset or X server\n" + "unreachable), no GPU render device found (/dev/dri/renderD* absent or\n" + "libEGL missing), and OSMesa is not installed.\n" + "\n" + "To fix this, choose one of:\n" + " 1. Install OSMesa (recommended for headless/SSH use):\n" + " Debian/Ubuntu: sudo apt-get install libosmesa6\n" + " RHEL/Fedora: sudo dnf install mesa-libOSMesa\n" + " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* exists and\n" + " libEGL is installed (libegl1 on Debian/Ubuntu). If the device\n" + " exists but you still see this error, add your user to the\n" + " 'render' group: sudo usermod -aG render $USER\n" + " (then log out and back in).\n" + " 3. If you used ssh -X/-Y, try without X forwarding:\n" + " unset DISPLAY\n" + ) diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index 71803dd..7d3c26d 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -142,14 +142,15 @@ def init_offscreen_context(width, height): Tries up to three paths on Linux; macOS and Windows use GLFW only. - 1. **GLFW invisible window** — standard path when a display is available. - 2. **EGL pbuffer** — headless GPU rendering (Linux only, no display needed). - Only attempted when :mod:`~whippersnappy.gl._headless` already set - ``PYOPENGL_PLATFORM=egl`` at import time (i.e. no display detected AND - ``/dev/dri/renderD*`` is accessible). This guarantees ``OpenGL.GL`` - was bound to the EGL backend before any GL call; attempting EGL after - ``OpenGL.GL`` has already been imported with a different backend would - silently break function resolution. + 1. **GLFW invisible window** — standard path when a usable display is + available. + 2. **EGL pbuffer** — headless GPU rendering (Linux only). Attempted when + :mod:`~whippersnappy.gl._headless` set ``PYOPENGL_PLATFORM=egl`` at + import time. This happens when either no display is present, or when + ``DISPLAY`` is set but the X server was unreachable (e.g. a stale + ``ssh -X`` forward that cannot provide GLX 3.3). Pre-setting + ``PYOPENGL_PLATFORM`` before ``OpenGL.GL`` is first imported ensures + PyOpenGL binds EGL function pointers correctly. 3. **OSMesa** — CPU software renderer (Linux only). Used when neither GLFW nor EGL succeeds, or when ``PYOPENGL_PLATFORM=osmesa`` was set. @@ -189,11 +190,11 @@ def init_offscreen_context(width, height): # --- Step 2: EGL headless GPU rendering --- # Only safe when PYOPENGL_PLATFORM=egl was set by _headless.py before - # OpenGL.GL was imported — meaning the process has no display AND an EGL - # device was found at import time. PyOpenGL binds its platform backend on - # first import and cannot be switched afterwards; importing egl_context.py - # here when PYOPENGL_PLATFORM is already something else (e.g. "osmesa" or - # unset/GLX) would cause silent function-pointer mismatches. + # OpenGL.GL was imported. _headless.py sets this when either no display + # is present at all, or when DISPLAY is set but the X server was + # unreachable (e.g. a stale/unusable ssh -X forward). In both cases + # PyOpenGL is already bound to EGL; attempting EGL when OpenGL.GL was + # imported with GLX would cause silent function-pointer mismatches. if os.environ.get("PYOPENGL_PLATFORM") == "egl": logger.info("GLFW failed — trying EGL headless GPU rendering.") try: From 1d63aff1d0af9248d834dea83c265310a92170ca Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 13:19:34 +0100 Subject: [PATCH 04/29] improve detection of DISPLAY set but not working --- whippersnappy/gl/_headless.py | 83 ++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index b1665a0..1757490 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -16,11 +16,11 @@ "No usable display" covers both: * ``DISPLAY`` / ``WAYLAND_DISPLAY`` are unset entirely. -* ``DISPLAY`` is set but the X server is unreachable or refuses the - connection (e.g. ``ssh -X``/``ssh -Y`` to a machine whose X server - does not support GLX 3.3, or a stale/wrong ``DISPLAY`` value). In - these cases GLFW will fail with a GLX error anyway, so we pre-empt it - by trying EGL/OSMesa instead. +* ``DISPLAY`` is set but the X server is unreachable, refuses the + connection, or does not provide GLX >= 1.3 (e.g. ``ssh -X``/``ssh -Y`` + to a server whose software-rendered GLX cannot support OpenGL 3.3 Core + Profile). In these cases GLFW will fail with a GLX error anyway, so we + pre-empt it by trying EGL/OSMesa instead. When ``DISPLAY`` is set **and** the X server is reachable, this module does not intervene: GLFW is tried first in @@ -50,23 +50,27 @@ def _osmesa_is_available(): def _display_is_usable(): - """Return True if the X11 display named by ``DISPLAY`` is actually reachable. + """Return True if ``DISPLAY`` points to an X server with usable GLX support. - Tries to open a connection to the X server via ``XOpenDisplay`` (from - ``libX11``) without importing any OpenGL library. Returns ``False`` when: + A display is considered usable only when **all** of the following hold: - * ``DISPLAY`` is unset or empty. - * ``libX11`` cannot be loaded (headless system without X11 client libs). - * ``XOpenDisplay`` returns ``NULL`` (server unreachable, access denied, - or display string invalid). + 1. ``DISPLAY`` is set and non-empty. + 2. ``libX11`` is loadable and ``XOpenDisplay`` succeeds (server reachable). + 3. ``libGL`` (or ``libGLX``) is loadable and ``glXQueryVersion`` reports + GLX >= 1.3. GLX 1.3 is the minimum needed for modern context creation; + older or absent GLX means GLFW will fail with a GLX error regardless. - A ``True`` result means the X server accepted the connection; GLFW will - very likely be able to create a window (though GLX 3.3 availability is - not guaranteed — that is discovered later by GLFW itself). + Returns ``False`` in all other cases so that ``_headless.py`` falls through + to EGL or OSMesa instead of letting GLFW attempt and print GLX warnings. + + Note: Wayland displays (``WAYLAND_DISPLAY``) are not probed here because + GLFW handles the Wayland path natively and does not go through GLX. """ display_str = os.environ.get("DISPLAY") if not display_str: return False + + # --- Step 1: open X connection --- for lib_name in ("libX11.so.6", "libX11.so"): try: libx11 = ctypes.CDLL(lib_name) @@ -74,18 +78,55 @@ def _display_is_usable(): except OSError: continue else: - # libX11 not installed — treat as unusable display. - return False + return False # libX11 not installed + try: libx11.XOpenDisplay.restype = ctypes.c_void_p libx11.XOpenDisplay.argtypes = [ctypes.c_char_p] dpy = libx11.XOpenDisplay(display_str.encode()) - if dpy: - libx11.XCloseDisplay(dpy) - return True - return False except Exception: # noqa: BLE001 return False + if not dpy: + return False # X server unreachable / access denied + + # --- Step 2: check GLX version --- + glx_ok = False + for lib_name in ("libGL.so.1", "libGL.so", "libGLX.so.0", "libGLX.so"): + try: + libgl = ctypes.CDLL(lib_name) + break + except OSError: + continue + else: + libgl = None + + if libgl is not None: + try: + major = ctypes.c_int(0) + minor = ctypes.c_int(0) + libgl.glXQueryVersion.restype = ctypes.c_int + libgl.glXQueryVersion.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ] + ok = libgl.glXQueryVersion(dpy, ctypes.byref(major), ctypes.byref(minor)) + if ok and (major.value, minor.value) >= (1, 3): + glx_ok = True + else: + logger.debug( + "GLX version %d.%d on %s is too old or unavailable (need >= 1.3).", + major.value, minor.value, display_str, + ) + except Exception: # noqa: BLE001 + pass + + try: + libx11.XCloseDisplay(dpy) + except Exception: # noqa: BLE001 + pass + + return glx_ok def egl_device_is_available(): From a3185d6fd90661a51b9d9294aaedc5723cefecf9 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 13:21:42 +0100 Subject: [PATCH 05/29] improve detection of DISPLAY set but not working --- whippersnappy/gl/_headless.py | 53 ++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 1757490..1cd6be9 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -17,10 +17,12 @@ * ``DISPLAY`` / ``WAYLAND_DISPLAY`` are unset entirely. * ``DISPLAY`` is set but the X server is unreachable, refuses the - connection, or does not provide GLX >= 1.3 (e.g. ``ssh -X``/``ssh -Y`` - to a server whose software-rendered GLX cannot support OpenGL 3.3 Core - Profile). In these cases GLFW will fail with a GLX error anyway, so we - pre-empt it by trying EGL/OSMesa instead. + connection, or does not advertise the ``GLX_ARB_create_context_profile`` + extension that GLFW requires to create an OpenGL 3.3 Core Profile context + (e.g. ``ssh -X``/``ssh -Y`` to a server with only old software-rendered + GLX). In these cases GLFW fails with + ``GLX_ARB_create_context_profile is unavailable``, so we pre-empt it by + trying EGL/OSMesa instead. When ``DISPLAY`` is set **and** the X server is reachable, this module does not intervene: GLFW is tried first in @@ -56,9 +58,12 @@ def _display_is_usable(): 1. ``DISPLAY`` is set and non-empty. 2. ``libX11`` is loadable and ``XOpenDisplay`` succeeds (server reachable). - 3. ``libGL`` (or ``libGLX``) is loadable and ``glXQueryVersion`` reports - GLX >= 1.3. GLX 1.3 is the minimum needed for modern context creation; - older or absent GLX means GLFW will fail with a GLX error regardless. + 3. ``libGL`` (or ``libGLX``) exposes ``glXQueryExtensionsString`` and the + returned string contains ``GLX_ARB_create_context_profile``. This + extension is what GLFW requires to create an OpenGL 3.3 Core Profile + context. A forwarded ``ssh -X``/``ssh -Y`` display typically provides + old software-rendered GLX that lacks this extension, causing GLFW to + fail with ``GLX_ARB_create_context_profile is unavailable``. Returns ``False`` in all other cases so that ``_headless.py`` falls through to EGL or OSMesa instead of letting GLFW attempt and print GLX warnings. @@ -89,7 +94,9 @@ def _display_is_usable(): if not dpy: return False # X server unreachable / access denied - # --- Step 2: check GLX version --- + # --- Step 2: check for GLX_ARB_create_context_profile extension --- + # This is the extension GLFW needs to request an OpenGL 3.3 Core Profile. + # It is absent on old/software-rendered forwarded displays. glx_ok = False for lib_name in ("libGL.so.1", "libGL.so", "libGLX.so.0", "libGLX.so"): try: @@ -102,22 +109,22 @@ def _display_is_usable(): if libgl is not None: try: - major = ctypes.c_int(0) - minor = ctypes.c_int(0) - libgl.glXQueryVersion.restype = ctypes.c_int - libgl.glXQueryVersion.argtypes = [ - ctypes.c_void_p, - ctypes.POINTER(ctypes.c_int), - ctypes.POINTER(ctypes.c_int), + libgl.glXQueryExtensionsString.restype = ctypes.c_char_p + libgl.glXQueryExtensionsString.argtypes = [ + ctypes.c_void_p, # display + ctypes.c_int, # screen ] - ok = libgl.glXQueryVersion(dpy, ctypes.byref(major), ctypes.byref(minor)) - if ok and (major.value, minor.value) >= (1, 3): - glx_ok = True - else: - logger.debug( - "GLX version %d.%d on %s is too old or unavailable (need >= 1.3).", - major.value, minor.value, display_str, - ) + ext_bytes = libgl.glXQueryExtensionsString(dpy, 0) + if ext_bytes: + exts = ext_bytes.decode("ascii", errors="replace") + if "GLX_ARB_create_context_profile" in exts: + glx_ok = True + else: + logger.debug( + "GLX_ARB_create_context_profile absent on %s " + "(GLFW would fail); treating display as unusable.", + display_str, + ) except Exception: # noqa: BLE001 pass From 88006870b823447dba3d7cb4c11a3608c15be176 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 14:11:03 +0100 Subject: [PATCH 06/29] linux headless detect egl first --- whippersnappy/gl/_headless.py | 172 +++++++--------------------------- whippersnappy/gl/context.py | 32 +++---- 2 files changed, 49 insertions(+), 155 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 1cd6be9..77c8807 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -1,32 +1,23 @@ """Headless OpenGL platform detection. This module MUST be imported before any ``import OpenGL.GL`` statement in the -package. On Linux with no display server it sets ``PYOPENGL_PLATFORM`` so -that PyOpenGL resolves function pointers via the correct backend before -``OpenGL.GL`` is first imported. +package. On Linux it sets ``PYOPENGL_PLATFORM`` so that PyOpenGL resolves +function pointers via the correct backend before ``OpenGL.GL`` is first +imported. -Priority chain on Linux when no usable display is detected: +Priority chain on Linux (applied unconditionally — ``DISPLAY`` is irrelevant +for offscreen rendering): 1. **EGL + GPU device** — ``/dev/dri/renderD*`` present and ``libEGL`` - loadable. Sets ``PYOPENGL_PLATFORM=egl`` immediately so that PyOpenGL - binds function pointers via EGL when ``OpenGL.GL`` is first imported. + loadable. Sets ``PYOPENGL_PLATFORM=egl``. Works with or without a + display server, including headless servers, Docker/Singularity, and + ``ssh`` sessions (with or without ``-X``/``-Y``). 2. **OSMesa** — CPU software renderer. Sets ``PYOPENGL_PLATFORM=osmesa``. 3. **Neither** — raises ``RuntimeError`` with install instructions. -"No usable display" covers both: - -* ``DISPLAY`` / ``WAYLAND_DISPLAY`` are unset entirely. -* ``DISPLAY`` is set but the X server is unreachable, refuses the - connection, or does not advertise the ``GLX_ARB_create_context_profile`` - extension that GLFW requires to create an OpenGL 3.3 Core Profile context - (e.g. ``ssh -X``/``ssh -Y`` to a server with only old software-rendered - GLX). In these cases GLFW fails with - ``GLX_ARB_create_context_profile is unavailable``, so we pre-empt it by - trying EGL/OSMesa instead. - -When ``DISPLAY`` is set **and** the X server is reachable, this module -does not intervene: GLFW is tried first in -:func:`~whippersnappy.gl.context.init_offscreen_context`. +``PYOPENGL_PLATFORM`` is not consulted by GLFW, so setting it here does not +affect the interactive GUI (``whippersnap``), which creates its own visible +GLFW window independently. No OpenGL, GLFW, or other heavy imports are done here — only stdlib. """ @@ -51,91 +42,6 @@ def _osmesa_is_available(): return False -def _display_is_usable(): - """Return True if ``DISPLAY`` points to an X server with usable GLX support. - - A display is considered usable only when **all** of the following hold: - - 1. ``DISPLAY`` is set and non-empty. - 2. ``libX11`` is loadable and ``XOpenDisplay`` succeeds (server reachable). - 3. ``libGL`` (or ``libGLX``) exposes ``glXQueryExtensionsString`` and the - returned string contains ``GLX_ARB_create_context_profile``. This - extension is what GLFW requires to create an OpenGL 3.3 Core Profile - context. A forwarded ``ssh -X``/``ssh -Y`` display typically provides - old software-rendered GLX that lacks this extension, causing GLFW to - fail with ``GLX_ARB_create_context_profile is unavailable``. - - Returns ``False`` in all other cases so that ``_headless.py`` falls through - to EGL or OSMesa instead of letting GLFW attempt and print GLX warnings. - - Note: Wayland displays (``WAYLAND_DISPLAY``) are not probed here because - GLFW handles the Wayland path natively and does not go through GLX. - """ - display_str = os.environ.get("DISPLAY") - if not display_str: - return False - - # --- Step 1: open X connection --- - for lib_name in ("libX11.so.6", "libX11.so"): - try: - libx11 = ctypes.CDLL(lib_name) - break - except OSError: - continue - else: - return False # libX11 not installed - - try: - libx11.XOpenDisplay.restype = ctypes.c_void_p - libx11.XOpenDisplay.argtypes = [ctypes.c_char_p] - dpy = libx11.XOpenDisplay(display_str.encode()) - except Exception: # noqa: BLE001 - return False - if not dpy: - return False # X server unreachable / access denied - - # --- Step 2: check for GLX_ARB_create_context_profile extension --- - # This is the extension GLFW needs to request an OpenGL 3.3 Core Profile. - # It is absent on old/software-rendered forwarded displays. - glx_ok = False - for lib_name in ("libGL.so.1", "libGL.so", "libGLX.so.0", "libGLX.so"): - try: - libgl = ctypes.CDLL(lib_name) - break - except OSError: - continue - else: - libgl = None - - if libgl is not None: - try: - libgl.glXQueryExtensionsString.restype = ctypes.c_char_p - libgl.glXQueryExtensionsString.argtypes = [ - ctypes.c_void_p, # display - ctypes.c_int, # screen - ] - ext_bytes = libgl.glXQueryExtensionsString(dpy, 0) - if ext_bytes: - exts = ext_bytes.decode("ascii", errors="replace") - if "GLX_ARB_create_context_profile" in exts: - glx_ok = True - else: - logger.debug( - "GLX_ARB_create_context_profile absent on %s " - "(GLFW would fail); treating display as unusable.", - display_str, - ) - except Exception: # noqa: BLE001 - pass - - try: - libx11.XCloseDisplay(dpy) - except Exception: # noqa: BLE001 - pass - - return glx_ok - - def egl_device_is_available(): """Return True if libEGL is loadable AND a DRI render node is present. @@ -174,42 +80,30 @@ def egl_device_is_available(): if sys.platform == "linux" and "PYOPENGL_PLATFORM" not in os.environ: - _has_display = ( - bool(os.environ.get("DISPLAY")) or bool(os.environ.get("WAYLAND_DISPLAY")) - ) - - if _has_display and _display_is_usable(): - # Reachable X/Wayland server — let GLFW try first; don't intervene. - _display = os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY") - logger.debug("Display set and reachable (%s) — will try GLFW first.", _display) + if egl_device_is_available(): + # Prefer EGL for all offscreen rendering on Linux — regardless of + # whether DISPLAY is set. GLFW does not use PYOPENGL_PLATFORM, so + # the interactive GUI is unaffected. Setting this here, before any + # import of OpenGL.GL, ensures PyOpenGL binds EGL function pointers. + os.environ["PYOPENGL_PLATFORM"] = "egl" + logger.debug("EGL device available — PYOPENGL_PLATFORM=egl set.") + elif _osmesa_is_available(): + os.environ["PYOPENGL_PLATFORM"] = "osmesa" + logger.debug("No EGL device — PYOPENGL_PLATFORM=osmesa set (CPU rendering).") else: - # No display, or display is set but unreachable (e.g. bad ssh -X forward). - # Must choose a headless backend NOW before OpenGL.GL is imported. - if _has_display: - logger.debug( - "DISPLAY is set (%s) but X server is unreachable — " - "skipping GLFW/GLX and trying headless backends.", - os.environ.get("DISPLAY"), - ) - if egl_device_is_available(): - os.environ["PYOPENGL_PLATFORM"] = "egl" - logger.debug( - "No usable display; EGL + GPU device available — " - "PYOPENGL_PLATFORM=egl set." - ) - elif _osmesa_is_available(): - os.environ["PYOPENGL_PLATFORM"] = "osmesa" - logger.debug( - "No usable display; no EGL device — " - "PYOPENGL_PLATFORM=osmesa set (CPU rendering)." - ) - else: + _has_display = ( + bool(os.environ.get("DISPLAY")) or bool(os.environ.get("WAYLAND_DISPLAY")) + ) + if not _has_display: + # No display and no headless backend — raise immediately with + # instructions. When DISPLAY is set we stay silent and let GLFW + # try; if it fails too, context.py will raise a clearer error. raise RuntimeError( "whippersnappy requires an OpenGL context but none could be found.\n" "\n" - "No usable display detected (DISPLAY/WAYLAND_DISPLAY unset or X server\n" - "unreachable), no GPU render device found (/dev/dri/renderD* absent or\n" - "libEGL missing), and OSMesa is not installed.\n" + "No display server detected (DISPLAY/WAYLAND_DISPLAY unset),\n" + "no GPU render device found (/dev/dri/renderD* absent or libEGL\n" + "missing), and OSMesa is not installed.\n" "\n" "To fix this, choose one of:\n" " 1. Install OSMesa (recommended for headless/SSH use):\n" @@ -220,6 +114,6 @@ def egl_device_is_available(): " exists but you still see this error, add your user to the\n" " 'render' group: sudo usermod -aG render $USER\n" " (then log out and back in).\n" - " 3. If you used ssh -X/-Y, try without X forwarding:\n" - " unset DISPLAY\n" + " 3. Set DISPLAY if a local X server is running:\n" + " export DISPLAY=:0\n" ) diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index 7d3c26d..d037fee 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -142,15 +142,15 @@ def init_offscreen_context(width, height): Tries up to three paths on Linux; macOS and Windows use GLFW only. - 1. **GLFW invisible window** — standard path when a usable display is - available. + 1. **GLFW invisible window** — used when ``PYOPENGL_PLATFORM`` is not + ``"egl"`` (i.e. macOS, Windows, or Linux without EGL). Skipped on + Linux when EGL was selected at import time to avoid spurious GLX + warnings from a forwarded or software-rendered display. 2. **EGL pbuffer** — headless GPU rendering (Linux only). Attempted when :mod:`~whippersnappy.gl._headless` set ``PYOPENGL_PLATFORM=egl`` at - import time. This happens when either no display is present, or when - ``DISPLAY`` is set but the X server was unreachable (e.g. a stale - ``ssh -X`` forward that cannot provide GLX 3.3). Pre-setting - ``PYOPENGL_PLATFORM`` before ``OpenGL.GL`` is first imported ensures - PyOpenGL binds EGL function pointers correctly. + import time, which happens whenever an EGL-capable GPU device is + present (``/dev/dri/renderD*`` + ``libEGL``), regardless of whether + ``DISPLAY`` is set. 3. **OSMesa** — CPU software renderer (Linux only). Used when neither GLFW nor EGL succeeds, or when ``PYOPENGL_PLATFORM=osmesa`` was set. @@ -175,9 +175,12 @@ def init_offscreen_context(width, height): global _offscreen_context # --- Step 1: GLFW invisible window --- - window = init_window(width, height, visible=False) - if window: - return window + # Skip when PYOPENGL_PLATFORM=egl — OpenGL.GL is already bound to EGL, + # so a GLFW/GLX attempt would print GLX warnings and fail anyway. + if os.environ.get("PYOPENGL_PLATFORM") != "egl": + window = init_window(width, height, visible=False) + if window: + return window # Steps 2 & 3 are Linux-only. if sys.platform != "linux": @@ -189,12 +192,9 @@ def init_offscreen_context(width, height): ) # --- Step 2: EGL headless GPU rendering --- - # Only safe when PYOPENGL_PLATFORM=egl was set by _headless.py before - # OpenGL.GL was imported. _headless.py sets this when either no display - # is present at all, or when DISPLAY is set but the X server was - # unreachable (e.g. a stale/unusable ssh -X forward). In both cases - # PyOpenGL is already bound to EGL; attempting EGL when OpenGL.GL was - # imported with GLX would cause silent function-pointer mismatches. + # PYOPENGL_PLATFORM=egl was set by _headless.py at import time whenever + # an EGL-capable device was found (regardless of DISPLAY). PyOpenGL is + # already bound to EGL, so this is safe to call directly. if os.environ.get("PYOPENGL_PLATFORM") == "egl": logger.info("GLFW failed — trying EGL headless GPU rendering.") try: From ab7388aa415371def7fadfdef6fd1660c607331c Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 14:19:21 +0100 Subject: [PATCH 07/29] remove duplicate log output for saving --- whippersnappy/snap.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/whippersnappy/snap.py b/whippersnappy/snap.py index 9ad2659..88a06d9 100644 --- a/whippersnappy/snap.py +++ b/whippersnappy/snap.py @@ -275,7 +275,6 @@ def snap1( draw_caption(image, caption, font, orientation, x=cx, y=cy) if outpath: - logger.info("Saving snapshot to %s", outpath) image.save(outpath) return image finally: @@ -516,7 +515,6 @@ def snap4( # If outpath is specified, save to disk if outpath: - logger.info("Saving snapshot to %s", outpath) image.save(outpath) return image From a0246c1e7a15fbe2987fcfea53bb4607a232db29 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 15:17:00 +0100 Subject: [PATCH 08/29] update README and DOCKER docs --- DOCKER.md | 23 +++++++++++++---------- README.md | 19 +++++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index 3e56d2b..a0cb0ef 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,10 +1,10 @@ # Docker Guide -The Docker image provides a fully headless rendering environment. By default -it uses [OSMesa](https://docs.mesa3d.org/osmesa.html) (Mesa's CPU software -renderer) — no display server, `xvfb`, or GPU required. If a GPU render -device is available, pass `--device /dev/dri/renderD128` to enable EGL GPU -rendering instead; `libegl1` is already included in the image. +The Docker image provides a fully headless rendering environment. It +automatically uses **EGL** (GPU rendering) when a render device is passed in, +or falls back to **OSMesa** (CPU software renderer) otherwise — no display +server or `xvfb` required in either case. Both `libegl1` and `libosmesa6` +are pre-installed in the image. The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by @@ -189,11 +189,12 @@ parent directory to retrieve them on the host. not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Default rendering** uses **OSMesa** (Mesa's CPU software renderer, provided - by the `libosmesa6` system package). No GPU or `/dev/dri/` device needed. -- **GPU rendering via EGL** works out of the box — `libegl1` is included in the - image. Pass the render device into the container and WhipperSnapPy will - automatically prefer EGL over OSMesa when `/dev/dri/renderD*` is accessible: +- **Default rendering** uses **EGL** (GPU) when `/dev/dri/renderD*` is + accessible, or **OSMesa** (CPU software renderer, `libosmesa6`) otherwise. + Both `libegl1` and `libosmesa6` are pre-installed in the image — no extra + setup is needed. +- **GPU rendering via EGL** is selected automatically when you pass the render + device into the container: ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ @@ -203,4 +204,6 @@ parent directory to retrieve them on the host. -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ -sd /subject -o /output/snap4.png ``` +- Without `--device`, WhipperSnapPy falls back to **OSMesa** (CPU) automatically. + No GPU or `/dev/dri/` device needed for CPU rendering. diff --git a/README.md b/README.md index 994da30..1837261 100644 --- a/README.md +++ b/README.md @@ -33,15 +33,18 @@ For interactive 3D in Jupyter notebooks: pip install 'whippersnappy[notebook]' ``` -Off-screen (headless) rendering on **Linux** uses a three-path fallback: -1. **GLFW invisible window** — used when a display is available (`DISPLAY` set). -2. **EGL** (GPU, no display needed) — used when no display is detected and a - GPU render device (`/dev/dri/renderD*`) is accessible with `libEGL` installed - (`libegl1` on Debian/Ubuntu). This is the recommended path for SSH servers - with a GPU — no `DISPLAY`, `xvfb`, or OSMesa required. -3. **OSMesa** (CPU software renderer) — final fallback; requires - `sudo apt-get install libosmesa6` (Debian/Ubuntu) or +Off-screen (headless) rendering on **Linux** uses a three-path priority chain: +1. **EGL** (GPU, preferred) — used whenever a GPU render device + (`/dev/dri/renderD*`) is present and `libEGL` is installed (`libegl1` on + Debian/Ubuntu). Works with or without a display server, including plain + SSH sessions and `ssh -X`/`ssh -Y` forwards where GLX is unavailable. + No `DISPLAY`, `xvfb`, or OSMesa required. +2. **OSMesa** (CPU software renderer) — used when no EGL device is found; + requires `sudo apt-get install libosmesa6` (Debian/Ubuntu) or `sudo dnf install mesa-libOSMesa` (RHEL/Fedora). +3. **GLFW invisible window** — fallback on Linux when neither EGL nor OSMesa + is available and a display is set; also the primary path on macOS and + Windows. On **Windows**, GLFW creates an invisible window; a GPU driver is sufficient. On **macOS**, a real display connection is required (NSGL does not support From 8f9eaa93ff598d62280939cc4971522f330919ba Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 16:24:41 +0100 Subject: [PATCH 09/29] disable mesa warning output if home is not writeable (e.g. inside docker) --- Dockerfile | 5 +++++ whippersnappy/gl/osmesa_context.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/Dockerfile b/Dockerfile index c450d0e..f675414 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,10 @@ FROM python:3.11-slim +# Suppress Mesa's shader-cache warning ("Failed to create //.cache …") that +# appears when running as a non-standard user inside Docker where $HOME is +# unset or points to a non-writable directory. +ENV MESA_SHADER_CACHE_DISABLE=1 + # libosmesa6 — OSMesa CPU software renderer (default headless path, no GPU needed) # libegl1 — EGL dispatch library; enables GPU rendering when /dev/dri/renderD* # is passed via --device (e.g. docker run --device /dev/dri/renderD128) diff --git a/whippersnappy/gl/osmesa_context.py b/whippersnappy/gl/osmesa_context.py index 412d97b..5dd9b1c 100644 --- a/whippersnappy/gl/osmesa_context.py +++ b/whippersnappy/gl/osmesa_context.py @@ -25,6 +25,7 @@ import ctypes import ctypes.util import logging +import os import OpenGL.GL as gl from PIL import Image @@ -97,6 +98,12 @@ def __init__(self, width: int, height: int): self._init_osmesa() def _init_osmesa(self): + # Suppress Mesa's "Failed to create //.cache for shader cache" warning + # that appears when $HOME is unset or non-writable (e.g. inside Docker). + # Only set if the user has not already configured it explicitly. + if "MESA_SHADER_CACHE_DISABLE" not in os.environ: + os.environ["MESA_SHADER_CACHE_DISABLE"] = "1" + lib = _load_libosmesa() self._libosmesa = lib From 84b8c7e21a94ffe531cad23147ca37f2bc7ee591 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 17:16:33 +0100 Subject: [PATCH 10/29] attempt for egl in docker --- DOCKER.md | 15 +++++++-- Dockerfile | 6 ++++ whippersnappy/gl/_headless.py | 60 +++++++++++++++++++++++------------ whippersnappy/gl/shaders.py | 1 + 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index a0cb0ef..effd0f8 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -193,17 +193,28 @@ parent directory to retrieve them on the host. accessible, or **OSMesa** (CPU software renderer, `libosmesa6`) otherwise. Both `libegl1` and `libosmesa6` are pre-installed in the image — no extra setup is needed. -- **GPU rendering via EGL** is selected automatically when you pass the render - device into the container: +- **GPU rendering via EGL** requires passing the render device **and** the + render group into the container. On the host, `systemd-logind` grants the + logged-in user direct access to `/dev/dri/renderD*` via a POSIX ACL + (visible as the `+` in `ls -l`), so no group membership is needed natively. + Inside Docker there is no login session, so only traditional DAC permissions + apply — the process must belong to the `render` group to open the device. + The `--user $(id -u):$(id -g)` flag passes only the primary group; add + `--group-add` for the render group separately: ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ + --group-add render \ + --user $(id -u):$(id -g) \ -v /path/to/subject:/subject \ -v /path/to/output:/output \ whippersnappy \ -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ -sd /subject -o /output/snap4.png ``` + The image pre-creates a `render` group with GID 103 (Debian/Ubuntu default). + If your host uses a different GID, replace `--group-add render` with + `--group-add $(getent group render | cut -d: -f3)`. - Without `--device`, WhipperSnapPy falls back to **OSMesa** (CPU) automatically. No GPU or `/dev/dri/` device needed for CPU rendering. diff --git a/Dockerfile b/Dockerfile index f675414..7121c8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Create a 'render' group (GID 103, matching Debian/Ubuntu default) so that +# GPU EGL rendering works when the host render device is passed in via +# docker run --device /dev/dri/renderD128 --group-add render ... +# If the host render group has a different GID use --group-add instead. +RUN groupadd -g 103 render 2>/dev/null || true + RUN pip install --upgrade pip COPY . /WhipperSnapPy diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 77c8807..9d05055 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -43,17 +43,16 @@ def _osmesa_is_available(): def egl_device_is_available(): - """Return True if libEGL is loadable AND a DRI render node is present. - - We check that at least one ``/dev/dri/renderD*`` node exists and that - ``libEGL`` can be loaded. We intentionally do **not** gate on - ``os.access(node, os.R_OK)`` here because that POSIX check does not - honour supplementary group memberships on all kernels (e.g. when the - process inherits group ``render`` via newgrp or a login session) and - does not account for POSIX ACL entries (the ``+`` suffix in ``ls -l`` - output). If the node exists and EGL is installed we optimistically try - EGL and let the context-creation call fail gracefully if the device truly - turns out to be inaccessible. + """Return True if libEGL is loadable AND a DRI render node can be opened. + + Unlike a simple existence check, this function actually tries to + ``open()`` each ``/dev/dri/renderD*`` node so that a permission error + (e.g. inside Docker when ``--group-add render`` is missing) is caught + here — before ``PYOPENGL_PLATFORM=egl`` is set and ``OpenGL.GL`` is + bound to EGL function pointers. If EGL were allowed to fail *after* + PyOpenGL has already bound to EGL, any OSMesa fallback in the same + process would use EGL function pointers for OSMesa calls, causing + cryptic GL errors (e.g. ``Validation failure``). We still skip EGL if *no* device node exists at all — that is the reliable Singularity/Docker signal where no device is bound in. @@ -66,16 +65,36 @@ def egl_device_is_available(): if not render_nodes: logger.debug("EGL: no /dev/dri/renderD* device nodes found — skipping EGL.") return False + + # Try to open at least one node to verify actual access permission. + # os.access() is unreliable for supplementary groups and POSIX ACLs, + # but open() uses the real kernel permission check. + accessible = [] + for node in render_nodes: + try: + fd = os.open(node, os.O_RDWR | os.O_NONBLOCK) + os.close(fd) + accessible.append(node) + except OSError: + continue + if not accessible: + logger.debug( + "EGL: /dev/dri/renderD* node(s) exist but none could be opened " + "(permission denied?) — skipping EGL to avoid broken fallback." + ) + return False + for name in ("libEGL.so.1", "libEGL.so"): try: ctypes.CDLL(name) logger.debug( - "EGL: libEGL found and %d render node(s) present.", len(render_nodes) + "EGL: libEGL found and %d accessible render node(s) — EGL available.", + len(accessible), ) return True except OSError: continue - logger.debug("EGL: /dev/dri/renderD* found but libEGL not loadable.") + logger.debug("EGL: render node accessible but libEGL not loadable.") return False @@ -102,18 +121,19 @@ def egl_device_is_available(): "whippersnappy requires an OpenGL context but none could be found.\n" "\n" "No display server detected (DISPLAY/WAYLAND_DISPLAY unset),\n" - "no GPU render device found (/dev/dri/renderD* absent or libEGL\n" - "missing), and OSMesa is not installed.\n" + "no accessible GPU render device (no /dev/dri/renderD* node could be\n" + "opened — device absent or permission denied), and OSMesa is not\n" + "installed.\n" "\n" "To fix this, choose one of:\n" " 1. Install OSMesa (recommended for headless/SSH use):\n" " Debian/Ubuntu: sudo apt-get install libosmesa6\n" " RHEL/Fedora: sudo dnf install mesa-libOSMesa\n" - " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* exists and\n" - " libEGL is installed (libegl1 on Debian/Ubuntu). If the device\n" - " exists but you still see this error, add your user to the\n" - " 'render' group: sudo usermod -aG render $USER\n" - " (then log out and back in).\n" + " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* is accessible:\n" + " • Add your user to the render group:\n" + " sudo usermod -aG render $USER (then log out and back in)\n" + " • Inside Docker: add --group-add render (or --group-add )\n" + " and --device /dev/dri/renderD128 to your docker run command.\n" " 3. Set DISPLAY if a local X server is running:\n" " export DISPLAY=:0\n" ) diff --git a/whippersnappy/gl/shaders.py b/whippersnappy/gl/shaders.py index 54c2671..28a799a 100644 --- a/whippersnappy/gl/shaders.py +++ b/whippersnappy/gl/shaders.py @@ -23,6 +23,7 @@ def compile_shader_program(vertex_src, fragment_src): return _gl_shaders.compileProgram( _gl_shaders.compileShader(vertex_src, gl.GL_VERTEX_SHADER), _gl_shaders.compileShader(fragment_src, gl.GL_FRAGMENT_SHADER), + validate=False, ) From 184c88d56ea41ec707e80bcde9f3afc929999783 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 17:34:52 +0100 Subject: [PATCH 11/29] update docker offscreen test chain --- DOCKER.md | 35 +++---- Dockerfile | 6 -- README.md | 17 ++-- whippersnappy/gl/_headless.py | 176 ++++++++++++++++++---------------- whippersnappy/gl/context.py | 33 +++---- 5 files changed, 127 insertions(+), 140 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index effd0f8..3acdc06 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,10 +1,11 @@ # Docker Guide -The Docker image provides a fully headless rendering environment. It -automatically uses **EGL** (GPU rendering) when a render device is passed in, -or falls back to **OSMesa** (CPU software renderer) otherwise — no display -server or `xvfb` required in either case. Both `libegl1` and `libosmesa6` -are pre-installed in the image. +The Docker image provides a fully headless rendering environment using +**EGL** — no display server, `xvfb`, or `--device` flag required. EGL uses +Mesa's llvmpipe CPU renderer by default, and automatically switches to GPU +rendering when a GPU device is passed via `--device`. `libegl1` is +pre-installed; `libosmesa6` is also included as a fallback for systems where +EGL is unavailable. The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by @@ -189,22 +190,14 @@ parent directory to retrieve them on the host. not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Default rendering** uses **EGL** (GPU) when `/dev/dri/renderD*` is - accessible, or **OSMesa** (CPU software renderer, `libosmesa6`) otherwise. - Both `libegl1` and `libosmesa6` are pre-installed in the image — no extra - setup is needed. -- **GPU rendering via EGL** requires passing the render device **and** the - render group into the container. On the host, `systemd-logind` grants the - logged-in user direct access to `/dev/dri/renderD*` via a POSIX ACL - (visible as the `+` in `ls -l`), so no group membership is needed natively. - Inside Docker there is no login session, so only traditional DAC permissions - apply — the process must belong to the `render` group to open the device. - The `--user $(id -u):$(id -g)` flag passes only the primary group; add - `--group-add` for the render group separately: +- **Default rendering** uses **EGL** with Mesa's llvmpipe CPU renderer — no + GPU or `/dev/dri/` device is needed. `libegl1` is pre-installed in the + image; no extra flags required. +- **GPU rendering** is selected automatically by EGL when you pass the render + device into the container (optional — only needed for hardware acceleration): ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ - --group-add render \ --user $(id -u):$(id -g) \ -v /path/to/subject:/subject \ -v /path/to/output:/output \ @@ -212,9 +205,5 @@ parent directory to retrieve them on the host. -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ -sd /subject -o /output/snap4.png ``` - The image pre-creates a `render` group with GID 103 (Debian/Ubuntu default). - If your host uses a different GID, replace `--group-add render` with - `--group-add $(getent group render | cut -d: -f3)`. -- Without `--device`, WhipperSnapPy falls back to **OSMesa** (CPU) automatically. - No GPU or `/dev/dri/` device needed for CPU rendering. + diff --git a/Dockerfile b/Dockerfile index 7121c8a..f675414 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,12 +20,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Create a 'render' group (GID 103, matching Debian/Ubuntu default) so that -# GPU EGL rendering works when the host render device is passed in via -# docker run --device /dev/dri/renderD128 --group-add render ... -# If the host render group has a different GID use --group-add instead. -RUN groupadd -g 103 render 2>/dev/null || true - RUN pip install --upgrade pip COPY . /WhipperSnapPy diff --git a/README.md b/README.md index 1837261..02cab2a 100644 --- a/README.md +++ b/README.md @@ -33,18 +33,15 @@ For interactive 3D in Jupyter notebooks: pip install 'whippersnappy[notebook]' ``` -Off-screen (headless) rendering on **Linux** uses a three-path priority chain: -1. **EGL** (GPU, preferred) — used whenever a GPU render device - (`/dev/dri/renderD*`) is present and `libEGL` is installed (`libegl1` on - Debian/Ubuntu). Works with or without a display server, including plain - SSH sessions and `ssh -X`/`ssh -Y` forwards where GLX is unavailable. - No `DISPLAY`, `xvfb`, or OSMesa required. -2. **OSMesa** (CPU software renderer) — used when no EGL device is found; +Off-screen (headless) rendering on **Linux** uses a two-path priority chain: +1. **EGL** (preferred) — used whenever `libEGL` is installed (`libegl1` on + Debian/Ubuntu). EGL handles both GPU rendering (when a GPU is present) + and CPU software rendering via Mesa's llvmpipe — no `/dev/dri` device or + display server required. Works in Docker without `--device`, in + Singularity, and over plain SSH (with or without `-X`/`-Y`). +2. **OSMesa** (CPU fallback) — used only when `libEGL` is not installed; requires `sudo apt-get install libosmesa6` (Debian/Ubuntu) or `sudo dnf install mesa-libOSMesa` (RHEL/Fedora). -3. **GLFW invisible window** — fallback on Linux when neither EGL nor OSMesa - is available and a display is set; also the primary path on macOS and - Windows. On **Windows**, GLFW creates an invisible window; a GPU driver is sufficient. On **macOS**, a real display connection is required (NSGL does not support diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 9d05055..278dc72 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -1,29 +1,34 @@ """Headless OpenGL platform detection. This module MUST be imported before any ``import OpenGL.GL`` statement in the -package. On Linux it sets ``PYOPENGL_PLATFORM`` so that PyOpenGL resolves -function pointers via the correct backend before ``OpenGL.GL`` is first -imported. - -Priority chain on Linux (applied unconditionally — ``DISPLAY`` is irrelevant -for offscreen rendering): - -1. **EGL + GPU device** — ``/dev/dri/renderD*`` present and ``libEGL`` - loadable. Sets ``PYOPENGL_PLATFORM=egl``. Works with or without a - display server, including headless servers, Docker/Singularity, and - ``ssh`` sessions (with or without ``-X``/``-Y``). -2. **OSMesa** — CPU software renderer. Sets ``PYOPENGL_PLATFORM=osmesa``. +package. On Linux with no display it sets ``PYOPENGL_PLATFORM`` so that +PyOpenGL resolves function pointers via the correct backend before +``OpenGL.GL`` is first imported. + +Priority chain on Linux when no display is detected +(``DISPLAY`` / ``WAYLAND_DISPLAY`` unset): + +1. **EGL** — tried first when ``libEGL`` is installed. A lightweight ctypes + probe (``eglGetDisplay`` + ``eglInitialize``) confirms EGL can actually + initialise before ``PYOPENGL_PLATFORM=egl`` is set. This covers both GPU + rendering (real device) and CPU software rendering (Mesa llvmpipe) — + works in Docker without ``--device``. +2. **OSMesa** — fallback when EGL is not installed or the probe fails (e.g. + no GPU and no llvmpipe). Sets ``PYOPENGL_PLATFORM=osmesa``. 3. **Neither** — raises ``RuntimeError`` with install instructions. +When ``DISPLAY`` is set the module does not intervene; GLFW is tried first +in :func:`~whippersnappy.gl.context.init_offscreen_context`. If GLFW then +fails (e.g. broken ``ssh -X`` forward), the same EGL/OSMesa chain is +attempted there. + ``PYOPENGL_PLATFORM`` is not consulted by GLFW, so setting it here does not -affect the interactive GUI (``whippersnap``), which creates its own visible -GLFW window independently. +affect the interactive GUI (``whippersnap``). No OpenGL, GLFW, or other heavy imports are done here — only stdlib. """ import ctypes -import glob import logging import os import sys @@ -31,9 +36,9 @@ logger = logging.getLogger(__name__) -def _osmesa_is_available(): - """Return True if libOSMesa can be loaded via ctypes.""" - for name in ("libOSMesa.so.8", "libOSMesa.so", "OSMesa"): +def _egl_is_available(): + """Return True if libEGL can be loaded via ctypes.""" + for name in ("libEGL.so.1", "libEGL.so"): try: ctypes.CDLL(name) return True @@ -42,98 +47,99 @@ def _osmesa_is_available(): return False -def egl_device_is_available(): - """Return True if libEGL is loadable AND a DRI render node can be opened. +def _egl_context_works(): + """Probe EGL via ctypes to confirm a context can actually be created. - Unlike a simple existence check, this function actually tries to - ``open()`` each ``/dev/dri/renderD*`` node so that a permission error - (e.g. inside Docker when ``--group-add render`` is missing) is caught - here — before ``PYOPENGL_PLATFORM=egl`` is set and ``OpenGL.GL`` is - bound to EGL function pointers. If EGL were allowed to fail *after* - PyOpenGL has already bound to EGL, any OSMesa fallback in the same - process would use EGL function pointers for OSMesa calls, causing - cryptic GL errors (e.g. ``Validation failure``). + Calls ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` + ``eglInitialize`` only — + no ``OpenGL.GL`` import, no ``PYOPENGL_PLATFORM`` change. Returns + ``True`` only when EGL is loadable **and** a display can be initialised. + This means both the GPU path (real device) and the CPU path (llvmpipe) + are confirmed before we commit to ``PYOPENGL_PLATFORM=egl``. - We still skip EGL if *no* device node exists at all — that is the - reliable Singularity/Docker signal where no device is bound in. - - This function is called both here (at import time) and from - :func:`~whippersnappy.gl.context.init_offscreen_context` (at context - creation time, to decide whether to attempt EGL after GLFW fails). + If this returns ``False``, callers should fall back to OSMesa so that + ``OpenGL.GL`` is imported with the correct backend on its first import — + mixing EGL-bound function pointers with an OSMesa context causes silent + failures. """ - render_nodes = glob.glob("/dev/dri/renderD*") - if not render_nodes: - logger.debug("EGL: no /dev/dri/renderD* device nodes found — skipping EGL.") - return False - - # Try to open at least one node to verify actual access permission. - # os.access() is unreliable for supplementary groups and POSIX ACLs, - # but open() uses the real kernel permission check. - accessible = [] - for node in render_nodes: + for lib_name in ("libEGL.so.1", "libEGL.so"): try: - fd = os.open(node, os.O_RDWR | os.O_NONBLOCK) - os.close(fd) - accessible.append(node) + libegl = ctypes.CDLL(lib_name) + break except OSError: continue - if not accessible: - logger.debug( - "EGL: /dev/dri/renderD* node(s) exist but none could be opened " - "(permission denied?) — skipping EGL to avoid broken fallback." - ) + else: return False - for name in ("libEGL.so.1", "libEGL.so"): + try: + libegl.eglGetDisplay.restype = ctypes.c_void_p + libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] + libegl.eglInitialize.restype = ctypes.c_bool + libegl.eglInitialize.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ] + libegl.eglTerminate.restype = ctypes.c_bool + libegl.eglTerminate.argtypes = [ctypes.c_void_p] + + dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) # EGL_DEFAULT_DISPLAY + if not dpy: + logger.debug("EGL probe: eglGetDisplay returned NULL.") + return False + major, minor = ctypes.c_int(0), ctypes.c_int(0) + ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) + libegl.eglTerminate(dpy) + if ok: + logger.debug("EGL probe: eglInitialize succeeded (EGL %d.%d).", + major.value, minor.value) + return True + logger.debug("EGL probe: eglInitialize failed.") + return False + except Exception: # noqa: BLE001 + return False + + +def _osmesa_is_available(): + """Return True if libOSMesa can be loaded via ctypes.""" + for name in ("libOSMesa.so.8", "libOSMesa.so", "OSMesa"): try: ctypes.CDLL(name) - logger.debug( - "EGL: libEGL found and %d accessible render node(s) — EGL available.", - len(accessible), - ) return True except OSError: continue - logger.debug("EGL: render node accessible but libEGL not loadable.") return False if sys.platform == "linux" and "PYOPENGL_PLATFORM" not in os.environ: - if egl_device_is_available(): - # Prefer EGL for all offscreen rendering on Linux — regardless of - # whether DISPLAY is set. GLFW does not use PYOPENGL_PLATFORM, so - # the interactive GUI is unaffected. Setting this here, before any - # import of OpenGL.GL, ensures PyOpenGL binds EGL function pointers. - os.environ["PYOPENGL_PLATFORM"] = "egl" - logger.debug("EGL device available — PYOPENGL_PLATFORM=egl set.") - elif _osmesa_is_available(): - os.environ["PYOPENGL_PLATFORM"] = "osmesa" - logger.debug("No EGL device — PYOPENGL_PLATFORM=osmesa set (CPU rendering).") - else: - _has_display = ( - bool(os.environ.get("DISPLAY")) or bool(os.environ.get("WAYLAND_DISPLAY")) - ) - if not _has_display: - # No display and no headless backend — raise immediately with - # instructions. When DISPLAY is set we stay silent and let GLFW - # try; if it fails too, context.py will raise a clearer error. + _has_display = ( + bool(os.environ.get("DISPLAY")) or bool(os.environ.get("WAYLAND_DISPLAY")) + ) + if not _has_display: + # No display — choose headless backend before OpenGL.GL is imported. + # Use _egl_context_works() not just _egl_is_available(): libEGL may be + # installed but still fail (no GPU, no llvmpipe, bad driver). We must + # know the outcome before setting PYOPENGL_PLATFORM because OpenGL.GL + # binds its function pointers on first import and cannot be re-bound. + if _egl_context_works(): + os.environ["PYOPENGL_PLATFORM"] = "egl" + logger.debug("No display; EGL probe succeeded — PYOPENGL_PLATFORM=egl set.") + elif _osmesa_is_available(): + os.environ["PYOPENGL_PLATFORM"] = "osmesa" + logger.debug("No display; EGL unavailable — PYOPENGL_PLATFORM=osmesa set.") + else: raise RuntimeError( "whippersnappy requires an OpenGL context but none could be found.\n" "\n" "No display server detected (DISPLAY/WAYLAND_DISPLAY unset),\n" - "no accessible GPU render device (no /dev/dri/renderD* node could be\n" - "opened — device absent or permission denied), and OSMesa is not\n" - "installed.\n" + "EGL initialisation failed, and OSMesa is not installed.\n" "\n" "To fix this, choose one of:\n" - " 1. Install OSMesa (recommended for headless/SSH use):\n" + " 1. Install EGL (recommended, if GPU is installed):\n" + " Debian/Ubuntu: sudo apt-get install libegl1\n" + " RHEL/Fedora: sudo dnf install mesa-libEGL\n" + " 2. Install OSMesa (CPU-only alternative):\n" " Debian/Ubuntu: sudo apt-get install libosmesa6\n" " RHEL/Fedora: sudo dnf install mesa-libOSMesa\n" - " 2. Use EGL GPU rendering — ensure /dev/dri/renderD* is accessible:\n" - " • Add your user to the render group:\n" - " sudo usermod -aG render $USER (then log out and back in)\n" - " • Inside Docker: add --group-add render (or --group-add )\n" - " and --device /dev/dri/renderD128 to your docker run command.\n" " 3. Set DISPLAY if a local X server is running:\n" " export DISPLAY=:0\n" ) diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index d037fee..b2e1d86 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -143,16 +143,17 @@ def init_offscreen_context(width, height): Tries up to three paths on Linux; macOS and Windows use GLFW only. 1. **GLFW invisible window** — used when ``PYOPENGL_PLATFORM`` is not - ``"egl"`` (i.e. macOS, Windows, or Linux without EGL). Skipped on - Linux when EGL was selected at import time to avoid spurious GLX - warnings from a forwarded or software-rendered display. - 2. **EGL pbuffer** — headless GPU rendering (Linux only). Attempted when - :mod:`~whippersnappy.gl._headless` set ``PYOPENGL_PLATFORM=egl`` at - import time, which happens whenever an EGL-capable GPU device is - present (``/dev/dri/renderD*`` + ``libEGL``), regardless of whether - ``DISPLAY`` is set. - 3. **OSMesa** — CPU software renderer (Linux only). Used when neither - GLFW nor EGL succeeds, or when ``PYOPENGL_PLATFORM=osmesa`` was set. + ``"egl"`` (i.e. a display is available and EGL was not pre-selected). + Skipped on Linux when EGL was selected at import time to avoid spurious + GLX warnings. + 2. **EGL** — used when ``PYOPENGL_PLATFORM=egl`` was set by + :mod:`~whippersnappy.gl._headless` at import time (no display detected + and ``libEGL`` is installed). EGL handles both GPU and CPU (llvmpipe) + rendering without needing ``/dev/dri`` access — works in Docker without + ``--device``. + 3. **OSMesa** — CPU software renderer (Linux only). Used when EGL is not + installed (``PYOPENGL_PLATFORM=osmesa``) or when EGL context creation + fails. Parameters ---------- @@ -191,18 +192,18 @@ def init_offscreen_context(width, height): "On Windows ensure a GPU driver or Mesa opengl32.dll is available." ) - # --- Step 2: EGL headless GPU rendering --- - # PYOPENGL_PLATFORM=egl was set by _headless.py at import time whenever - # an EGL-capable device was found (regardless of DISPLAY). PyOpenGL is - # already bound to EGL, so this is safe to call directly. + # --- Step 2: EGL headless rendering --- + # PYOPENGL_PLATFORM=egl was set by _headless.py before OpenGL.GL was + # imported (no display detected + libEGL available). PyOpenGL is already + # bound to EGL; GLFW was intentionally skipped above. if os.environ.get("PYOPENGL_PLATFORM") == "egl": - logger.info("GLFW failed — trying EGL headless GPU rendering.") + logger.debug("Using EGL headless context.") try: from .egl_context import EGLContext # noqa: PLC0415 ctx = EGLContext(width, height) ctx.make_current() _offscreen_context = ctx - logger.info("Using EGL headless context (GPU, no display required).") + logger.info("Using EGL headless context (no display required).") return None except (ImportError, RuntimeError) as exc: logger.warning("EGL failed (%s) — falling back to OSMesa.", exc) From 6c656227df70bf3705dbf195ad9c25402bcb8ec8 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 17:53:34 +0100 Subject: [PATCH 12/29] further modification --- DOCKER.md | 20 +++-- whippersnappy/gl/_headless.py | 130 ++++++++++++++++++++++++-------- whippersnappy/gl/egl_context.py | 41 +++++++--- 3 files changed, 144 insertions(+), 47 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index 3acdc06..c32fb6a 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,11 +1,10 @@ # Docker Guide The Docker image provides a fully headless rendering environment using -**EGL** — no display server, `xvfb`, or `--device` flag required. EGL uses -Mesa's llvmpipe CPU renderer by default, and automatically switches to GPU -rendering when a GPU device is passed via `--device`. `libegl1` is -pre-installed; `libosmesa6` is also included as a fallback for systems where -EGL is unavailable. +**EGL** — no display server or `xvfb` required. Without `--device`, EGL +renders via Mesa's llvmpipe (CPU); with `--device /dev/dri/renderD128` it +uses the GPU. `libosmesa6` is also included as a last-resort fallback for +the rare case where EGL itself cannot initialise. The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by @@ -190,9 +189,14 @@ parent directory to retrieve them on the host. not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Default rendering** uses **EGL** with Mesa's llvmpipe CPU renderer — no - GPU or `/dev/dri/` device is needed. `libegl1` is pre-installed in the - image; no extra flags required. +- **Default rendering** uses **EGL**. Without `--device`, EGL uses Mesa's + software rasterizer (llvmpipe) for CPU rendering — no GPU required. With + `--device /dev/dri/renderD128`, EGL uses the GPU automatically. `libegl1` + is pre-installed in the image; no extra flags required. +- **OSMesa** is included (`libosmesa6`) as a last-resort fallback only for + the rare case where EGL itself fails to initialise (e.g. a stripped-down + base image without Mesa's EGL backend). Under normal circumstances EGL + handles both the GPU and CPU rendering paths. - **GPU rendering** is selected automatically by EGL when you pass the render device into the container (optional — only needed for hardware acceleration): ```bash diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 278dc72..7da350f 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -48,18 +48,19 @@ def _egl_is_available(): def _egl_context_works(): - """Probe EGL via ctypes to confirm a context can actually be created. - - Calls ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` + ``eglInitialize`` only — - no ``OpenGL.GL`` import, no ``PYOPENGL_PLATFORM`` change. Returns - ``True`` only when EGL is loadable **and** a display can be initialised. - This means both the GPU path (real device) and the CPU path (llvmpipe) - are confirmed before we commit to ``PYOPENGL_PLATFORM=egl``. - - If this returns ``False``, callers should fall back to OSMesa so that - ``OpenGL.GL`` is imported with the correct backend on its first import — - mixing EGL-bound function pointers with an OSMesa context causes silent - failures. + """Probe EGL via ctypes to confirm a context can actually be created headlessly. + + Tries display-independent EGL paths in order: + + 1. ``EGL_MESA_platform_surfaceless`` — Mesa-specific, works with no display + server and no GPU (llvmpipe). The reliable headless path on Mesa stacks. + 2. ``EGL_EXT_device_enumeration`` — enumerate GPU devices directly; works + without a display server when a GPU is present. + 3. ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` — last resort; only succeeds when + a display server is reachable (i.e. ``DISPLAY`` is set). + + No ``OpenGL.GL`` import and no ``PYOPENGL_PLATFORM`` change are made. + Returns ``True`` only when EGL can actually initialise a display. """ for lib_name in ("libEGL.so.1", "libEGL.so"): try: @@ -68,13 +69,18 @@ def _egl_context_works(): except OSError: continue else: + logger.debug("EGL probe: libEGL not loadable.") return False try: - libegl.eglGetDisplay.restype = ctypes.c_void_p - libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] - libegl.eglInitialize.restype = ctypes.c_bool - libegl.eglInitialize.argtypes = [ + libegl.eglGetProcAddress.restype = ctypes.c_void_p + libegl.eglGetProcAddress.argtypes = [ctypes.c_char_p] + libegl.eglQueryString.restype = ctypes.c_char_p + libegl.eglQueryString.argtypes = [ctypes.c_void_p, ctypes.c_int] + libegl.eglGetDisplay.restype = ctypes.c_void_p + libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] + libegl.eglInitialize.restype = ctypes.c_bool + libegl.eglInitialize.argtypes = [ ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int), @@ -82,20 +88,84 @@ def _egl_context_works(): libegl.eglTerminate.restype = ctypes.c_bool libegl.eglTerminate.argtypes = [ctypes.c_void_p] - dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) # EGL_DEFAULT_DISPLAY - if not dpy: - logger.debug("EGL probe: eglGetDisplay returned NULL.") - return False - major, minor = ctypes.c_int(0), ctypes.c_int(0) - ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) - libegl.eglTerminate(dpy) - if ok: - logger.debug("EGL probe: eglInitialize succeeded (EGL %d.%d).", - major.value, minor.value) + _EGL_EXTENSIONS = 0x3055 + _EGL_NONE = 0x3038 + _EGL_PLATFORM_DEVICE = 0x313F + + def _try_init(dpy): + if not dpy: + return False + major, minor = ctypes.c_int(0), ctypes.c_int(0) + ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) + libegl.eglTerminate(dpy) + if ok: + logger.debug("EGL probe: eglInitialize OK (EGL %d.%d).", + major.value, minor.value) + return bool(ok) + + client_exts = libegl.eglQueryString(None, _EGL_EXTENSIONS) or b"" + logger.debug("EGL client extensions: %s", client_exts.decode()) + + _GetPlatformDisplayEXT = None + if b"EGL_EXT_platform_base" in client_exts: + addr = libegl.eglGetProcAddress(b"eglGetPlatformDisplayEXT") + if addr: + _GetPlatformDisplayEXT = ctypes.CFUNCTYPE( + ctypes.c_void_p, + ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), + )(addr) + + no_attribs = (ctypes.c_int * 1)(_EGL_NONE) + + # --- Path 1: EGL_MESA_platform_surfaceless --- + # Truly headless: no display server, no GPU needed (llvmpipe). + # Present on Mesa stacks (EGL_MESA_platform_surfaceless extension). + _EGL_PLATFORM_SURFACELESS = 0x31DD + if _GetPlatformDisplayEXT and b"EGL_MESA_platform_surfaceless" in client_exts: + dpy = _GetPlatformDisplayEXT( + _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs + ) + if _try_init(dpy): + logger.debug("EGL probe: surfaceless platform succeeded.") + return True + + # --- Path 2: EGL_EXT_device_enumeration --- + # Enumerate GPU devices directly — works headlessly when GPU present. + if (_GetPlatformDisplayEXT + and b"EGL_EXT_device_enumeration" in client_exts): + addr = libegl.eglGetProcAddress(b"eglQueryDevicesEXT") + if addr: + _QueryDevices = ctypes.CFUNCTYPE( + ctypes.c_bool, + ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), + )(addr) + n = ctypes.c_int(0) + if _QueryDevices(0, None, ctypes.byref(n)) and n.value > 0: + devices = (ctypes.c_void_p * n.value)() + _QueryDevices(n.value, devices, ctypes.byref(n)) + for dev in devices: + dpy = _GetPlatformDisplayEXT( + _EGL_PLATFORM_DEVICE, + ctypes.c_void_p(dev), + no_attribs, + ) + if _try_init(dpy): + logger.debug("EGL probe: device enumeration succeeded.") + return True + + # --- Path 3: EGL_DEFAULT_DISPLAY --- + # Works only when a display server is reachable (DISPLAY set). + # Last resort — will fail headlessly on X11-linked Mesa builds. + dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) + if _try_init(dpy): + logger.debug("EGL probe: EGL_DEFAULT_DISPLAY succeeded.") return True - logger.debug("EGL probe: eglInitialize failed.") + + logger.info("EGL probe: no EGL display could be initialised — will use OSMesa.") return False - except Exception: # noqa: BLE001 + + except Exception as exc: # noqa: BLE001 + logger.debug("EGL probe: unexpected error (%s) — will use OSMesa.", exc) return False @@ -122,10 +192,10 @@ def _osmesa_is_available(): # binds its function pointers on first import and cannot be re-bound. if _egl_context_works(): os.environ["PYOPENGL_PLATFORM"] = "egl" - logger.debug("No display; EGL probe succeeded — PYOPENGL_PLATFORM=egl set.") + logger.info("No display detected; EGL available — using EGL headless rendering.") elif _osmesa_is_available(): os.environ["PYOPENGL_PLATFORM"] = "osmesa" - logger.debug("No display; EGL unavailable — PYOPENGL_PLATFORM=osmesa set.") + logger.info("No display detected; EGL unavailable — using OSMesa CPU rendering.") else: raise RuntimeError( "whippersnappy requires an OpenGL context but none could be found.\n" diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 46b0d6e..94b97ee 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -175,28 +175,51 @@ def _init_egl(self): client_exts = libegl.eglQueryString(None, _EGL_EXTENSIONS) or b"" logger.debug("EGL client extensions: %s", client_exts.decode()) - has_device_enum = b"EGL_EXT_device_enumeration" in client_exts has_platform_base = b"EGL_EXT_platform_base" in client_exts + has_device_enum = b"EGL_EXT_device_enumeration" in client_exts + has_surfaceless = b"EGL_MESA_platform_surfaceless" in client_exts + eglGetPlatformDisplayEXT = None + if has_platform_base: + eglGetPlatformDisplayEXT = self._get_ext_fn( + "eglGetPlatformDisplayEXT", + ctypes.c_void_p, + [ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int)], + ) + + _EGL_NONE = 0x3038 + no_attribs = (ctypes.c_int * 1)(_EGL_NONE) display = None - if has_device_enum and has_platform_base: + + # --- Path 1: EGL_MESA_platform_surfaceless --- + # Truly headless: no display server, no GPU needed (llvmpipe). + # The reliable path inside Docker without --device on Mesa stacks. + _EGL_PLATFORM_SURFACELESS = 0x31DD + if eglGetPlatformDisplayEXT and has_surfaceless and display is None: + candidate = eglGetPlatformDisplayEXT( + _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs + ) + if candidate: + logger.debug("EGL: trying surfaceless platform display.") + display = candidate + + # --- Path 2: EGL_EXT_device_enumeration --- + # Enumerate GPU devices directly — works headlessly when GPU present. + if has_device_enum and eglGetPlatformDisplayEXT and display is None: eglQueryDevicesEXT = self._get_ext_fn( "eglQueryDevicesEXT", ctypes.c_bool, [ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int)], ) - eglGetPlatformDisplayEXT = self._get_ext_fn( - "eglGetPlatformDisplayEXT", - ctypes.c_void_p, - [ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int)], - ) display = self._open_device_display( eglQueryDevicesEXT, eglGetPlatformDisplayEXT ) + # --- Path 3: EGL_DEFAULT_DISPLAY --- + # Works only when a display server (X11/Wayland) is reachable. if display is None: - logger.debug("Falling back to eglGetDisplay(EGL_DEFAULT_DISPLAY)") - libegl.eglGetDisplay.restype = ctypes.c_void_p + logger.debug("EGL: trying EGL_DEFAULT_DISPLAY.") + libegl.eglGetDisplay.restype = ctypes.c_void_p libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] display = libegl.eglGetDisplay(ctypes.c_void_p(0)) From ee0c992aff209d5a74eb4c54499a984198ad0744 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:19:48 +0100 Subject: [PATCH 13/29] output EGL device and update docs --- DOCKER.md | 55 ++++++++++++++++++++++++--------- README.md | 25 +++++++++------ whippersnappy/gl/egl_context.py | 26 +++++++++++++++- 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index c32fb6a..c0f31fe 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,10 +1,12 @@ # Docker Guide The Docker image provides a fully headless rendering environment using -**EGL** — no display server or `xvfb` required. Without `--device`, EGL -renders via Mesa's llvmpipe (CPU); with `--device /dev/dri/renderD128` it -uses the GPU. `libosmesa6` is also included as a last-resort fallback for -the rare case where EGL itself cannot initialise. +**EGL** — no display server or `xvfb` required. By default EGL renders via +Mesa's llvmpipe (CPU software rendering), which requires no GPU. GPU +rendering is enabled automatically when a GPU is passed into the container +via `--gpus all` (NVIDIA) or `--device /dev/dri/renderD128` (AMD/Intel). +`libosmesa6` is also included as a last-resort fallback if EGL cannot +initialise. The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by @@ -189,16 +191,31 @@ parent directory to retrieve them on the host. not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Default rendering** uses **EGL**. Without `--device`, EGL uses Mesa's - software rasterizer (llvmpipe) for CPU rendering — no GPU required. With - `--device /dev/dri/renderD128`, EGL uses the GPU automatically. `libegl1` - is pre-installed in the image; no extra flags required. -- **OSMesa** is included (`libosmesa6`) as a last-resort fallback only for - the rare case where EGL itself fails to initialise (e.g. a stripped-down - base image without Mesa's EGL backend). Under normal circumstances EGL - handles both the GPU and CPU rendering paths. -- **GPU rendering** is selected automatically by EGL when you pass the render - device into the container (optional — only needed for hardware acceleration): +- **Default rendering** uses **EGL with CPU software rendering** (Mesa + llvmpipe) — no GPU or display server required. The log will show: + ``` + EGL context active — CPU software rendering (llvmpipe (...), ...) + ``` +- **GPU rendering** is optional and selected automatically by EGL when a GPU + is accessible. The log will show: + ``` + EGL context active — GPU rendering (...) + ``` + To enable GPU rendering pass the GPU into the container: + + *NVIDIA (requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)):* + ```bash + docker run --rm --init \ + --gpus all \ + --user $(id -u):$(id -g) \ + -v /path/to/subject:/subject \ + -v /path/to/output:/output \ + whippersnappy \ + -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ + -sd /subject -o /output/snap4.png + ``` + + *AMD / Intel (pass the DRI render device directly):* ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ @@ -210,4 +227,14 @@ parent directory to retrieve them on the host. -sd /subject -o /output/snap4.png ``` +- **Singularity/Apptainer:** CPU rendering works without any flags. For GPU + rendering pass `--nv` (NVIDIA) or `--rocm` (AMD): + ```bash + singularity exec --nv whippersnappy.sif \ + whippersnap4 -lh lh.thickness -rh rh.thickness -sd fsaverage -o snap4.png + ``` +- **OSMesa** (`libosmesa6`) is included as a last-resort CPU fallback for the + rare case where EGL itself fails to initialise. Under normal circumstances + EGL handles both GPU and CPU rendering and OSMesa is not used. + diff --git a/README.md b/README.md index 02cab2a..71883c3 100644 --- a/README.md +++ b/README.md @@ -33,15 +33,22 @@ For interactive 3D in Jupyter notebooks: pip install 'whippersnappy[notebook]' ``` -Off-screen (headless) rendering on **Linux** uses a two-path priority chain: -1. **EGL** (preferred) — used whenever `libEGL` is installed (`libegl1` on - Debian/Ubuntu). EGL handles both GPU rendering (when a GPU is present) - and CPU software rendering via Mesa's llvmpipe — no `/dev/dri` device or - display server required. Works in Docker without `--device`, in - Singularity, and over plain SSH (with or without `-X`/`-Y`). -2. **OSMesa** (CPU fallback) — used only when `libEGL` is not installed; - requires `sudo apt-get install libosmesa6` (Debian/Ubuntu) or - `sudo dnf install mesa-libOSMesa` (RHEL/Fedora). +Off-screen (headless) rendering on **Linux** uses EGL: +- **CPU software rendering** (default, no GPU needed) — EGL uses Mesa's + llvmpipe automatically when no GPU is available. Works in Docker without + any extra flags, in Singularity without `--nv`, and over plain SSH. +- **GPU rendering** — enabled automatically when a GPU is accessible. In + Docker pass `--gpus all` (NVIDIA) or `--device /dev/dri/renderD128` + (AMD/Intel). In Singularity pass `--nv` (NVIDIA) or `--rocm` (AMD). + +The log always reports which is active: +``` +EGL context active — CPU software rendering (llvmpipe (...), ...) +EGL context active — GPU rendering (...) +``` + +OSMesa (`libosmesa6`) is a last-resort CPU fallback used only when EGL fails +to initialise entirely (e.g. `libegl1` not installed). On **Windows**, GLFW creates an invisible window; a GPU driver is sufficient. On **macOS**, a real display connection is required (NSGL does not support diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 94b97ee..a2250b5 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -190,6 +190,7 @@ def _init_egl(self): _EGL_NONE = 0x3038 no_attribs = (ctypes.c_int * 1)(_EGL_NONE) display = None + self._display_path = "unknown" # track for GPU/CPU log message # --- Path 1: EGL_MESA_platform_surfaceless --- # Truly headless: no display server, no GPU needed (llvmpipe). @@ -202,6 +203,7 @@ def _init_egl(self): if candidate: logger.debug("EGL: trying surfaceless platform display.") display = candidate + self._display_path = "surfaceless" # --- Path 2: EGL_EXT_device_enumeration --- # Enumerate GPU devices directly — works headlessly when GPU present. @@ -214,6 +216,8 @@ def _init_egl(self): display = self._open_device_display( eglQueryDevicesEXT, eglGetPlatformDisplayEXT ) + if display is not None: + self._display_path = "device" # --- Path 3: EGL_DEFAULT_DISPLAY --- # Works only when a display server (X11/Wayland) is reachable. @@ -222,6 +226,8 @@ def _init_egl(self): libegl.eglGetDisplay.restype = ctypes.c_void_p libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] display = libegl.eglGetDisplay(ctypes.c_void_p(0)) + if display: + self._display_path = "default" if not display: raise RuntimeError( @@ -275,7 +281,8 @@ def _init_egl(self): "eglCreateContext for OpenGL 3.3 Core failed. " "Try: MESA_GL_VERSION_OVERRIDE=3.3 MESA_GLSL_VERSION_OVERRIDE=330" ) - logger.info("EGL context created (%dx%d)", self.width, self.height) + logger.debug("EGL context created (%dx%d) via %s display.", + self.width, self.height, self._display_path) def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): @@ -314,6 +321,23 @@ def make_current(self): # via at least one GL call; glGetError() is the cheapest trigger. gl.glGetError() + # Report GPU vs CPU rendering based on the GL renderer string. + renderer = (gl.glGetString(gl.GL_RENDERER) or b"").decode("utf-8", errors="replace") + vendor = (gl.glGetString(gl.GL_VENDOR) or b"").decode("utf-8", errors="replace") + _sw = ("llvmpipe", "softpipe", "swrast", "software") + is_cpu = any(s in renderer.lower() for s in _sw) + if is_cpu: + logger.info( + "EGL context active — CPU software rendering (%s, %s). " + "Pass --gpus all (Docker) or --nv (Singularity) to use GPU.", + renderer, vendor, + ) + else: + logger.info( + "EGL context active — GPU rendering (%s, %s).", + renderer, vendor, + ) + # Build FBO so rendering is directed off-screen self.fbo = gl.glGenFramebuffers(1) gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, self.fbo) From 3ac638e7df2b51c0abd5fbfc4234af073a9c749f Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:27:59 +0100 Subject: [PATCH 14/29] allow --gpus all --- DOCKER.md | 7 ++++++- Dockerfile | 15 ++++++++++----- whippersnappy/gl/egl_context.py | 6 +++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index c0f31fe..ba27791 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -203,7 +203,8 @@ parent directory to retrieve them on the host. ``` To enable GPU rendering pass the GPU into the container: - *NVIDIA (requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)):* + *NVIDIA (requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/) + installed on the **host**):* ```bash docker run --rm --init \ --gpus all \ @@ -214,6 +215,10 @@ parent directory to retrieve them on the host. -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ -sd /subject -o /output/snap4.png ``` + The NVIDIA Container Runtime injects the GPU EGL ICD (`10_nvidia.json`) + into the container at runtime. If the log still shows CPU rendering after + passing `--gpus all`, the NVIDIA Container Toolkit is likely not installed + or configured on the host (`nvidia-ctk --version` to check). *AMD / Intel (pass the DRI render device directly):* ```bash diff --git a/Dockerfile b/Dockerfile index f675414..db6c42c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,14 +5,13 @@ FROM python:3.11-slim # unset or points to a non-writable directory. ENV MESA_SHADER_CACHE_DISABLE=1 -# libosmesa6 — OSMesa CPU software renderer (default headless path, no GPU needed) -# libegl1 — EGL dispatch library; enables GPU rendering when /dev/dri/renderD* -# is passed via --device (e.g. docker run --device /dev/dri/renderD128) -# libgl1 — base OpenGL dispatch library required by PyOpenGL +# libegl1 — GLVND EGL dispatch library (routes to GPU or Mesa llvmpipe) +# libosmesa6 — OSMesa CPU fallback for environments where EGL cannot initialise +# libgl1 — base OpenGL dispatch library required by PyOpenGL # libglib2.0-0, libfontconfig1, libdbus-1-3 — runtime deps for Pillow / font rendering RUN apt-get update && apt-get install -y --no-install-recommends \ - libosmesa6 \ libegl1 \ + libosmesa6 \ libgl1 \ libglib2.0-0 \ libfontconfig1 \ @@ -20,6 +19,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Create the GLVND EGL vendor directory so the NVIDIA Container Runtime can +# inject its EGL ICD (10_nvidia.json) when --gpus all is passed. +# Without this directory the NVIDIA EGL ICD is silently not registered and +# EGL falls back to Mesa llvmpipe even with a GPU available. +RUN mkdir -p /usr/share/glvnd/egl_vendor.d + RUN pip install --upgrade pip COPY . /WhipperSnapPy diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index a2250b5..46796a9 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -329,7 +329,11 @@ def make_current(self): if is_cpu: logger.info( "EGL context active — CPU software rendering (%s, %s). " - "Pass --gpus all (Docker) or --nv (Singularity) to use GPU.", + "For GPU rendering: Docker: pass --gpus all (NVIDIA) or " + "--device /dev/dri/renderD128 (AMD/Intel); " + "Singularity: pass --nv (NVIDIA) or --rocm (AMD). " + "NVIDIA: also requires the NVIDIA Container Toolkit on the host " + "(check with: nvidia-ctk --version).", renderer, vendor, ) else: From 19db6116dd054e020fc78ced2d69aa8ef6692c36 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:33:33 +0100 Subject: [PATCH 15/29] fix --gpus all path --- whippersnappy/gl/_headless.py | 45 ++++++++++---------- whippersnappy/gl/egl_context.py | 75 ++++++++++++++++++++++++--------- 2 files changed, 79 insertions(+), 41 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 7da350f..82785a5 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -52,12 +52,14 @@ def _egl_context_works(): Tries display-independent EGL paths in order: - 1. ``EGL_MESA_platform_surfaceless`` — Mesa-specific, works with no display - server and no GPU (llvmpipe). The reliable headless path on Mesa stacks. - 2. ``EGL_EXT_device_enumeration`` — enumerate GPU devices directly; works - without a display server when a GPU is present. - 3. ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` — last resort; only succeeds when - a display server is reachable (i.e. ``DISPLAY`` is set). + 1. ``EGL_EXT_device_enumeration`` — enumerate GPU devices directly; works + headlessly without a display server. With ``--gpus all`` (NVIDIA) or + ``--device`` (AMD/Intel) the GPU device appears here and is preferred. + 2. ``EGL_MESA_platform_surfaceless`` — Mesa CPU software rendering + (llvmpipe); no GPU or display server needed. Used when no GPU device + is found (e.g. Docker without ``--gpus``/``--device``). + 3. ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` — last resort; only succeeds + when a display server (X11/Wayland) is reachable. No ``OpenGL.GL`` import and no ``PYOPENGL_PLATFORM`` change are made. Returns ``True`` only when EGL can actually initialise a display. @@ -117,20 +119,9 @@ def _try_init(dpy): no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - # --- Path 1: EGL_MESA_platform_surfaceless --- - # Truly headless: no display server, no GPU needed (llvmpipe). - # Present on Mesa stacks (EGL_MESA_platform_surfaceless extension). - _EGL_PLATFORM_SURFACELESS = 0x31DD - if _GetPlatformDisplayEXT and b"EGL_MESA_platform_surfaceless" in client_exts: - dpy = _GetPlatformDisplayEXT( - _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs - ) - if _try_init(dpy): - logger.debug("EGL probe: surfaceless platform succeeded.") - return True - - # --- Path 2: EGL_EXT_device_enumeration --- - # Enumerate GPU devices directly — works headlessly when GPU present. + # --- Path 1: EGL_EXT_device_enumeration --- + # GPU devices — preferred. With --gpus all (NVIDIA) or --device + # (AMD/Intel) the GPU appears here before surfaceless/llvmpipe. if (_GetPlatformDisplayEXT and b"EGL_EXT_device_enumeration" in client_exts): addr = libegl.eglGetProcAddress(b"eglQueryDevicesEXT") @@ -150,12 +141,22 @@ def _try_init(dpy): no_attribs, ) if _try_init(dpy): - logger.debug("EGL probe: device enumeration succeeded.") + logger.debug("EGL probe: device enumeration succeeded (GPU).") return True + # --- Path 2: EGL_MESA_platform_surfaceless --- + # CPU software rendering (llvmpipe) — no GPU needed. + _EGL_PLATFORM_SURFACELESS = 0x31DD + if _GetPlatformDisplayEXT and b"EGL_MESA_platform_surfaceless" in client_exts: + dpy = _GetPlatformDisplayEXT( + _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs + ) + if _try_init(dpy): + logger.debug("EGL probe: surfaceless platform succeeded (CPU/llvmpipe).") + return True + # --- Path 3: EGL_DEFAULT_DISPLAY --- # Works only when a display server is reachable (DISPLAY set). - # Last resort — will fail headlessly on X11-linked Mesa builds. dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) if _try_init(dpy): logger.debug("EGL probe: EGL_DEFAULT_DISPLAY succeeded.") diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 46796a9..14d2c21 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -192,21 +192,10 @@ def _init_egl(self): display = None self._display_path = "unknown" # track for GPU/CPU log message - # --- Path 1: EGL_MESA_platform_surfaceless --- - # Truly headless: no display server, no GPU needed (llvmpipe). - # The reliable path inside Docker without --device on Mesa stacks. - _EGL_PLATFORM_SURFACELESS = 0x31DD - if eglGetPlatformDisplayEXT and has_surfaceless and display is None: - candidate = eglGetPlatformDisplayEXT( - _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs - ) - if candidate: - logger.debug("EGL: trying surfaceless platform display.") - display = candidate - self._display_path = "surfaceless" - - # --- Path 2: EGL_EXT_device_enumeration --- - # Enumerate GPU devices directly — works headlessly when GPU present. + # --- Path 1: EGL_EXT_device_enumeration --- + # Enumerate GPU devices directly — preferred when a GPU is present. + # Works headlessly without a display server. With --gpus all (NVIDIA) + # or --device (AMD/Intel) the GPU device appears here. if has_device_enum and eglGetPlatformDisplayEXT and display is None: eglQueryDevicesEXT = self._get_ext_fn( "eglQueryDevicesEXT", @@ -219,6 +208,19 @@ def _init_egl(self): if display is not None: self._display_path = "device" + # --- Path 2: EGL_MESA_platform_surfaceless --- + # CPU software rendering (llvmpipe) — no GPU needed. + # Used when no GPU device was found (e.g. Docker without --gpus/--device). + _EGL_PLATFORM_SURFACELESS = 0x31DD + if eglGetPlatformDisplayEXT and has_surfaceless and display is None: + candidate = eglGetPlatformDisplayEXT( + _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs + ) + if candidate: + logger.debug("EGL: trying surfaceless platform display (CPU/llvmpipe).") + display = candidate + self._display_path = "surfaceless" + # --- Path 3: EGL_DEFAULT_DISPLAY --- # Works only when a display server (X11/Wayland) is reachable. if display is None: @@ -286,21 +288,56 @@ def _init_egl(self): def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): - """Enumerate EGL devices and return first usable display pointer.""" + """Enumerate EGL devices and return the first usable GPU display pointer. + + Prefers hardware GPU devices over software (llvmpipe) devices by + checking ``EGL_DRM_DEVICE_FILE_EXT`` — hardware devices have a DRM + path, software devices do not. Falls back to any working device if + no hardware device is found. + """ n = ctypes.c_int(0) if not eglQueryDevicesEXT(0, None, ctypes.byref(n)) or n.value == 0: - logger.warning("eglQueryDevicesEXT: no devices.") + logger.debug("EGL: eglQueryDevicesEXT found no devices.") return None logger.debug("EGL: %d device(s) found", n.value) devices = (ctypes.c_void_p * n.value)() eglQueryDevicesEXT(n.value, devices, ctypes.byref(n)) no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - for i, dev in enumerate(devices): + + # Try to load eglQueryDeviceStringEXT to identify hardware vs software + _EGL_DRM_DEVICE_FILE_EXT = 0x3233 + try: + addr = self._libegl.eglGetProcAddress(b"eglQueryDeviceStringEXT") + if addr: + _QueryDeviceString = ctypes.CFUNCTYPE( + ctypes.c_char_p, ctypes.c_void_p, ctypes.c_int + )(addr) + else: + _QueryDeviceString = None + except Exception: # noqa: BLE001 + _QueryDeviceString = None + + # Two passes: first try hardware GPU devices, then fall back to any device + hw_devices = [] + sw_devices = [] + for dev in devices: + is_hw = False + if _QueryDeviceString: + drm_path = _QueryDeviceString(ctypes.c_void_p(dev), _EGL_DRM_DEVICE_FILE_EXT) + is_hw = bool(drm_path) + logger.debug("EGL device drm_path=%s hw=%s", drm_path, is_hw) + if is_hw: + hw_devices.append(dev) + else: + sw_devices.append(dev) + + for i, dev in enumerate(hw_devices + sw_devices): dpy = eglGetPlatformDisplayEXT( _EGL_PLATFORM_DEVICE_EXT, ctypes.c_void_p(dev), no_attribs ) if dpy: - logger.debug("EGL: using device %d", i) + kind = "hardware GPU" if i < len(hw_devices) else "software" + logger.debug("EGL: using %s device %d", kind, i) return dpy return None From 66cbf2fffec58392bae51ce8041ea29d663181cb Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:41:45 +0100 Subject: [PATCH 16/29] add logging output to debug container GPU issue --- whippersnappy/gl/_headless.py | 3 +++ whippersnappy/gl/egl_context.py | 41 ++++++++++++++++----------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 82785a5..07d90ac 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -132,6 +132,7 @@ def _try_init(dpy): )(addr) n = ctypes.c_int(0) if _QueryDevices(0, None, ctypes.byref(n)) and n.value > 0: + logger.info("EGL probe: %d EGL device(s) found via enumeration.", n.value) devices = (ctypes.c_void_p * n.value)() _QueryDevices(n.value, devices, ctypes.byref(n)) for dev in devices: @@ -143,6 +144,8 @@ def _try_init(dpy): if _try_init(dpy): logger.debug("EGL probe: device enumeration succeeded (GPU).") return True + else: + logger.info("EGL probe: device enumeration found 0 devices.") # --- Path 2: EGL_MESA_platform_surfaceless --- # CPU software rendering (llvmpipe) — no GPU needed. diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 14d2c21..548705b 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -290,46 +290,45 @@ def _init_egl(self): def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): """Enumerate EGL devices and return the first usable GPU display pointer. - Prefers hardware GPU devices over software (llvmpipe) devices by - checking ``EGL_DRM_DEVICE_FILE_EXT`` — hardware devices have a DRM - path, software devices do not. Falls back to any working device if - no hardware device is found. + Prefers hardware GPU devices over software (llvmpipe/Mesa) devices. + Hardware devices are identified by the absence of ``EGL_MESA_device_software`` + in their extension string — this correctly handles both AMD/Intel (which + have a DRM path) and NVIDIA (which do not expose a DRM path but are still + real GPU devices). """ n = ctypes.c_int(0) if not eglQueryDevicesEXT(0, None, ctypes.byref(n)) or n.value == 0: logger.debug("EGL: eglQueryDevicesEXT found no devices.") return None - logger.debug("EGL: %d device(s) found", n.value) + logger.info("EGL: %d device(s) found via enumeration.", n.value) devices = (ctypes.c_void_p * n.value)() eglQueryDevicesEXT(n.value, devices, ctypes.byref(n)) no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - # Try to load eglQueryDeviceStringEXT to identify hardware vs software - _EGL_DRM_DEVICE_FILE_EXT = 0x3233 + _EGL_DRM_DEVICE_FILE_EXT = 0x3233 # for logging only + _EGL_EXTENSIONS_STR = 0x3055 try: - addr = self._libegl.eglGetProcAddress(b"eglQueryDeviceStringEXT") - if addr: - _QueryDeviceString = ctypes.CFUNCTYPE( - ctypes.c_char_p, ctypes.c_void_p, ctypes.c_int - )(addr) - else: - _QueryDeviceString = None + addr2 = self._libegl.eglGetProcAddress(b"eglQueryDeviceStringEXT") + _QueryDeviceString = ctypes.CFUNCTYPE( + ctypes.c_char_p, ctypes.c_void_p, ctypes.c_int + )(addr2) if addr2 else None except Exception: # noqa: BLE001 _QueryDeviceString = None - # Two passes: first try hardware GPU devices, then fall back to any device hw_devices = [] sw_devices = [] for dev in devices: - is_hw = False + is_sw = False if _QueryDeviceString: drm_path = _QueryDeviceString(ctypes.c_void_p(dev), _EGL_DRM_DEVICE_FILE_EXT) - is_hw = bool(drm_path) - logger.debug("EGL device drm_path=%s hw=%s", drm_path, is_hw) - if is_hw: - hw_devices.append(dev) - else: + dev_exts = _QueryDeviceString(ctypes.c_void_p(dev), _EGL_EXTENSIONS_STR) or b"" + is_sw = b"EGL_MESA_device_software" in dev_exts + logger.info("EGL device: drm_path=%s sw=%s exts=%s", + drm_path, is_sw, dev_exts.decode() if dev_exts else "") + if is_sw: sw_devices.append(dev) + else: + hw_devices.append(dev) for i, dev in enumerate(hw_devices + sw_devices): dpy = eglGetPlatformDisplayEXT( From 27d51193696fb97baafd9ddc8a789b87b0cde687 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:53:03 +0100 Subject: [PATCH 17/29] revert nvidia stuff --- DOCKER.md | 42 ++++++++++++++------------------- Dockerfile | 5 ---- README.md | 10 ++++---- whippersnappy/gl/egl_context.py | 8 +++---- 4 files changed, 27 insertions(+), 38 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index ba27791..cc09afe 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,10 +1,12 @@ # Docker Guide The Docker image provides a fully headless rendering environment using -**EGL** — no display server or `xvfb` required. By default EGL renders via -Mesa's llvmpipe (CPU software rendering), which requires no GPU. GPU -rendering is enabled automatically when a GPU is passed into the container -via `--gpus all` (NVIDIA) or `--device /dev/dri/renderD128` (AMD/Intel). +**EGL** with Mesa's llvmpipe CPU software renderer — no GPU, display server, +or `xvfb` required. AMD/Intel GPU rendering is available via +`--device /dev/dri/renderD128`. NVIDIA GPU OpenGL rendering in Docker +requires an `nvidia/opengl`-based image and is not supported in this image; +CPU rendering is the intended headless path and is fast enough for all +snapshot and video tasks. `libosmesa6` is also included as a last-resort fallback if EGL cannot initialise. @@ -196,31 +198,23 @@ parent directory to retrieve them on the host. ``` EGL context active — CPU software rendering (llvmpipe (...), ...) ``` -- **GPU rendering** is optional and selected automatically by EGL when a GPU - is accessible. The log will show: + This is sufficient for all snapshot and video rendering tasks. + +- **GPU rendering** requires OpenGL access to the GPU, not just CUDA. + The log will show: ``` EGL context active — GPU rendering (...) ``` - To enable GPU rendering pass the GPU into the container: - *NVIDIA (requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/) - installed on the **host**):* - ```bash - docker run --rm --init \ - --gpus all \ - --user $(id -u):$(id -g) \ - -v /path/to/subject:/subject \ - -v /path/to/output:/output \ - whippersnappy \ - -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ - -sd /subject -o /output/snap4.png - ``` - The NVIDIA Container Runtime injects the GPU EGL ICD (`10_nvidia.json`) - into the container at runtime. If the log still shows CPU rendering after - passing `--gpus all`, the NVIDIA Container Toolkit is likely not installed - or configured on the host (`nvidia-ctk --version` to check). + *NVIDIA:* `--gpus all` only provides CUDA compute access, **not** + OpenGL/EGL rendering. For NVIDIA GPU OpenGL in Docker you need an + image based on `nvidia/opengl` or `nvidia/cuda:*-opengl`. The standard + `whippersnappy` image does not include the NVIDIA OpenGL drivers and + therefore always uses CPU rendering regardless of `--gpus all`. + GPU rendering with NVIDIA in Docker is not officially supported in this + image — CPU rendering via llvmpipe is the intended headless path. - *AMD / Intel (pass the DRI render device directly):* + *AMD / Intel* (DRI render device — works out of the box): ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ diff --git a/Dockerfile b/Dockerfile index db6c42c..1a4610a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,11 +19,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Create the GLVND EGL vendor directory so the NVIDIA Container Runtime can -# inject its EGL ICD (10_nvidia.json) when --gpus all is passed. -# Without this directory the NVIDIA EGL ICD is silently not registered and -# EGL falls back to Mesa llvmpipe even with a GPU available. -RUN mkdir -p /usr/share/glvnd/egl_vendor.d RUN pip install --upgrade pip diff --git a/README.md b/README.md index 71883c3..f9cbe80 100644 --- a/README.md +++ b/README.md @@ -37,9 +37,11 @@ Off-screen (headless) rendering on **Linux** uses EGL: - **CPU software rendering** (default, no GPU needed) — EGL uses Mesa's llvmpipe automatically when no GPU is available. Works in Docker without any extra flags, in Singularity without `--nv`, and over plain SSH. -- **GPU rendering** — enabled automatically when a GPU is accessible. In - Docker pass `--gpus all` (NVIDIA) or `--device /dev/dri/renderD128` - (AMD/Intel). In Singularity pass `--nv` (NVIDIA) or `--rocm` (AMD). +- **GPU rendering** — for AMD/Intel in Docker pass + `--device /dev/dri/renderD128`; in Singularity pass `--nv` (NVIDIA) or + `--rocm` (AMD). Note: NVIDIA `--gpus all` in Docker only provides CUDA + compute access, not EGL/OpenGL rendering; NVIDIA GPU OpenGL in Docker + requires an `nvidia/opengl`-based image. The log always reports which is active: ``` @@ -48,7 +50,7 @@ EGL context active — GPU rendering (...) ``` OSMesa (`libosmesa6`) is a last-resort CPU fallback used only when EGL fails -to initialise entirely (e.g. `libegl1` not installed). +to initialise entirely. On **Windows**, GLFW creates an invisible window; a GPU driver is sufficient. On **macOS**, a real display connection is required (NSGL does not support diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 548705b..37ffdba 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -365,11 +365,9 @@ def make_current(self): if is_cpu: logger.info( "EGL context active — CPU software rendering (%s, %s). " - "For GPU rendering: Docker: pass --gpus all (NVIDIA) or " - "--device /dev/dri/renderD128 (AMD/Intel); " - "Singularity: pass --nv (NVIDIA) or --rocm (AMD). " - "NVIDIA: also requires the NVIDIA Container Toolkit on the host " - "(check with: nvidia-ctk --version).", + "For AMD/Intel GPU rendering in Docker pass " + "--device /dev/dri/renderD128; " + "for Singularity pass --nv (NVIDIA) or --rocm (AMD).", renderer, vendor, ) else: From 7b837e43187fb4f236ffb84b59ec7cee586e0246 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 18:57:23 +0100 Subject: [PATCH 18/29] more fixing --- DOCKER.md | 6 ++- whippersnappy/gl/context.py | 28 ++++++------ whippersnappy/gl/egl_context.py | 76 ++++++++++++++++++--------------- 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index cc09afe..60bb9e5 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -214,10 +214,12 @@ parent directory to retrieve them on the host. GPU rendering with NVIDIA in Docker is not officially supported in this image — CPU rendering via llvmpipe is the intended headless path. - *AMD / Intel* (DRI render device — works out of the box): + *AMD / Intel* (DRI render device). The device is owned by the host's + `render` group; pass the group ID into the container so EGL can open it: ```bash docker run --rm --init \ --device /dev/dri/renderD128 \ + --group-add $(getent group render | cut -d: -f3) \ --user $(id -u):$(id -g) \ -v /path/to/subject:/subject \ -v /path/to/output:/output \ @@ -225,6 +227,8 @@ parent directory to retrieve them on the host. -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ -sd /subject -o /output/snap4.png ``` + If the GPU device cannot be opened, EGL automatically falls back to + CPU software rendering (llvmpipe) — no crash, no OSMesa required. - **Singularity/Apptainer:** CPU rendering works without any flags. For GPU rendering pass `--nv` (NVIDIA) or `--rocm` (AMD): diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index b2e1d86..cf265c2 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -194,22 +194,24 @@ def init_offscreen_context(width, height): # --- Step 2: EGL headless rendering --- # PYOPENGL_PLATFORM=egl was set by _headless.py before OpenGL.GL was - # imported (no display detected + libEGL available). PyOpenGL is already - # bound to EGL; GLFW was intentionally skipped above. + # imported (no display detected + EGL probe succeeded). PyOpenGL is + # already bound to EGL; GLFW was intentionally skipped above. + # EGLContext._init_egl tries GPU device → surfaceless (llvmpipe) → + # default display in order, so it handles CPU fallback internally. + # We must NOT fall back to OSMesa here: PYOPENGL_PLATFORM is already + # "egl" and OpenGL.GL function pointers are bound to EGL — using an + # OSMesa context with EGL pointers causes silent GL failures. if os.environ.get("PYOPENGL_PLATFORM") == "egl": - logger.debug("Using EGL headless context.") - try: - from .egl_context import EGLContext # noqa: PLC0415 - ctx = EGLContext(width, height) - ctx.make_current() - _offscreen_context = ctx - logger.info("Using EGL headless context (no display required).") - return None - except (ImportError, RuntimeError) as exc: - logger.warning("EGL failed (%s) — falling back to OSMesa.", exc) + from .egl_context import EGLContext # noqa: PLC0415 + ctx = EGLContext(width, height) + ctx.make_current() + _offscreen_context = ctx + logger.info("Using EGL headless context (no display required).") + return None # --- Step 3: OSMesa software rendering --- - logger.debug("Trying OSMesa software rendering (CPU).") + # Only reached when PYOPENGL_PLATFORM=osmesa was set at import time + # (i.e. EGL probe failed entirely — libEGL not installed or unusable). try: from .osmesa_context import OSMesaContext # noqa: PLC0415 ctx = OSMesaContext(width, height) diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 37ffdba..cbd0e36 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -189,47 +189,59 @@ def _init_egl(self): _EGL_NONE = 0x3038 no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - display = None - self._display_path = "unknown" # track for GPU/CPU log message - # --- Path 1: EGL_EXT_device_enumeration --- - # Enumerate GPU devices directly — preferred when a GPU is present. - # Works headlessly without a display server. With --gpus all (NVIDIA) - # or --device (AMD/Intel) the GPU device appears here. - if has_device_enum and eglGetPlatformDisplayEXT and display is None: + # Build an ordered list of (display_handle, path_name) candidates. + # We try each in order, calling eglInitialize on each; the first that + # succeeds becomes the active display. This means a GPU device that + # returns a display handle but fails eglInitialize (e.g. permission + # denied on /dev/dri) is skipped and we fall through to surfaceless + # (llvmpipe CPU) — all within EGL, avoiding the broken EGL→OSMesa + # mixed-platform problem. + candidates = [] # list of (dpy, path_name) + + # --- Candidate 1: EGL_EXT_device_enumeration (GPU preferred) --- + if has_device_enum and eglGetPlatformDisplayEXT: eglQueryDevicesEXT = self._get_ext_fn( "eglQueryDevicesEXT", ctypes.c_bool, [ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int)], ) - display = self._open_device_display( + gpu_dpy = self._open_device_display( eglQueryDevicesEXT, eglGetPlatformDisplayEXT ) - if display is not None: - self._display_path = "device" + if gpu_dpy is not None: + candidates.append((gpu_dpy, "device")) - # --- Path 2: EGL_MESA_platform_surfaceless --- - # CPU software rendering (llvmpipe) — no GPU needed. - # Used when no GPU device was found (e.g. Docker without --gpus/--device). + # --- Candidate 2: EGL_MESA_platform_surfaceless (CPU/llvmpipe) --- _EGL_PLATFORM_SURFACELESS = 0x31DD - if eglGetPlatformDisplayEXT and has_surfaceless and display is None: - candidate = eglGetPlatformDisplayEXT( + if eglGetPlatformDisplayEXT and has_surfaceless: + sl_dpy = eglGetPlatformDisplayEXT( _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs ) - if candidate: - logger.debug("EGL: trying surfaceless platform display (CPU/llvmpipe).") - display = candidate - self._display_path = "surfaceless" - - # --- Path 3: EGL_DEFAULT_DISPLAY --- - # Works only when a display server (X11/Wayland) is reachable. - if display is None: - logger.debug("EGL: trying EGL_DEFAULT_DISPLAY.") - libegl.eglGetDisplay.restype = ctypes.c_void_p - libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] - display = libegl.eglGetDisplay(ctypes.c_void_p(0)) - if display: - self._display_path = "default" + if sl_dpy: + candidates.append((sl_dpy, "surfaceless")) + + # --- Candidate 3: EGL_DEFAULT_DISPLAY (needs X11/Wayland) --- + libegl.eglGetDisplay.restype = ctypes.c_void_p + libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] + def_dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) + if def_dpy: + candidates.append((def_dpy, "default")) + + # Try each candidate until one succeeds eglInitialize + display = None + self._display_path = "unknown" + for dpy, path_name in candidates: + major, minor = ctypes.c_int(0), ctypes.c_int(0) + if libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)): + display = dpy + self._display_path = path_name + logger.debug("EGL: initialised via %s display (EGL %d.%d).", + path_name, major.value, minor.value) + break + else: + logger.debug("EGL: %s display failed eglInitialize — trying next.", + path_name) if not display: raise RuntimeError( @@ -238,12 +250,6 @@ def _init_egl(self): ) self._display = display - major, minor = ctypes.c_int(0), ctypes.c_int(0) - if not libegl.eglInitialize( - self._display, ctypes.byref(major), ctypes.byref(minor) - ): - raise RuntimeError("eglInitialize failed.") - logger.debug("EGL %d.%d", major.value, minor.value) if not libegl.eglBindAPI(_EGL_OPENGL_API): raise RuntimeError("eglBindAPI(OpenGL) failed.") From 34b1467a97f4ee82c82c34e802ac2863f64962af Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 19:04:20 +0100 Subject: [PATCH 19/29] more fixing --- whippersnappy/gl/_headless.py | 4 +- whippersnappy/gl/egl_context.py | 71 +++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 07d90ac..520f7d7 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -132,7 +132,7 @@ def _try_init(dpy): )(addr) n = ctypes.c_int(0) if _QueryDevices(0, None, ctypes.byref(n)) and n.value > 0: - logger.info("EGL probe: %d EGL device(s) found via enumeration.", n.value) + logger.debug("EGL probe: %d EGL device(s) found.", n.value) devices = (ctypes.c_void_p * n.value)() _QueryDevices(n.value, devices, ctypes.byref(n)) for dev in devices: @@ -145,7 +145,7 @@ def _try_init(dpy): logger.debug("EGL probe: device enumeration succeeded (GPU).") return True else: - logger.info("EGL probe: device enumeration found 0 devices.") + logger.debug("EGL probe: device enumeration found 0 devices.") # --- Path 2: EGL_MESA_platform_surfaceless --- # CPU software rendering (llvmpipe) — no GPU needed. diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index cbd0e36..b10ba52 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -190,14 +190,13 @@ def _init_egl(self): _EGL_NONE = 0x3038 no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - # Build an ordered list of (display_handle, path_name) candidates. + # Build an ordered list of (display_handle, path_label) candidates. # We try each in order, calling eglInitialize on each; the first that # succeeds becomes the active display. This means a GPU device that - # returns a display handle but fails eglInitialize (e.g. permission - # denied on /dev/dri) is skipped and we fall through to surfaceless - # (llvmpipe CPU) — all within EGL, avoiding the broken EGL→OSMesa - # mixed-platform problem. - candidates = [] # list of (dpy, path_name) + # fails eglInitialize (e.g. DRI2 screen not available inside Docker) + # is skipped and we fall through to surfaceless (llvmpipe CPU) — + # all within EGL, avoiding the broken EGL→OSMesa mixed-platform issue. + candidates = [] # list of (dpy, label) # --- Candidate 1: EGL_EXT_device_enumeration (GPU preferred) --- if has_device_enum and eglGetPlatformDisplayEXT: @@ -206,11 +205,10 @@ def _init_egl(self): ctypes.c_bool, [ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int)], ) - gpu_dpy = self._open_device_display( - eglQueryDevicesEXT, eglGetPlatformDisplayEXT - ) - if gpu_dpy is not None: - candidates.append((gpu_dpy, "device")) + for dpy, is_hw in (self._open_device_display( + eglQueryDevicesEXT, eglGetPlatformDisplayEXT) or []): + label = "GPU device" if is_hw else "software device" + candidates.append((dpy, label)) # --- Candidate 2: EGL_MESA_platform_surfaceless (CPU/llvmpipe) --- _EGL_PLATFORM_SURFACELESS = 0x31DD @@ -226,22 +224,28 @@ def _init_egl(self): libegl.eglGetDisplay.argtypes = [ctypes.c_void_p] def_dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) if def_dpy: - candidates.append((def_dpy, "default")) + candidates.append((def_dpy, "default display")) # Try each candidate until one succeeds eglInitialize display = None self._display_path = "unknown" - for dpy, path_name in candidates: + for dpy, label in candidates: major, minor = ctypes.c_int(0), ctypes.c_int(0) if libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)): display = dpy - self._display_path = path_name - logger.debug("EGL: initialised via %s display (EGL %d.%d).", - path_name, major.value, minor.value) + self._display_path = label + logger.debug("EGL: initialised via %s (EGL %d.%d).", + label, major.value, minor.value) break else: - logger.debug("EGL: %s display failed eglInitialize — trying next.", - path_name) + if "GPU" in label: + logger.info( + "EGL: GPU device found but could not be initialised " + "(DRI2/kernel driver not accessible inside container) " + "— falling back to CPU software rendering." + ) + else: + logger.debug("EGL: %s failed eglInitialize — trying next.", label) if not display: raise RuntimeError( @@ -294,13 +298,15 @@ def _init_egl(self): def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): - """Enumerate EGL devices and return the first usable GPU display pointer. + """Enumerate EGL devices and return display candidates ordered GPU-first. + + Returns a list of ``(display_handle, is_hw)`` tuples — hardware GPU + devices first, software devices last. The caller (``_init_egl``) tries + each by calling ``eglInitialize``; the first that succeeds is used. - Prefers hardware GPU devices over software (llvmpipe/Mesa) devices. - Hardware devices are identified by the absence of ``EGL_MESA_device_software`` - in their extension string — this correctly handles both AMD/Intel (which - have a DRM path) and NVIDIA (which do not expose a DRM path but are still - real GPU devices). + Hardware vs software is determined by ``EGL_MESA_device_software`` in + the device extension string — this correctly handles NVIDIA (no DRM + path, but not a software device) and AMD/Intel (has a DRM path). """ n = ctypes.c_int(0) if not eglQueryDevicesEXT(0, None, ctypes.byref(n)) or n.value == 0: @@ -329,22 +335,25 @@ def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): drm_path = _QueryDeviceString(ctypes.c_void_p(dev), _EGL_DRM_DEVICE_FILE_EXT) dev_exts = _QueryDeviceString(ctypes.c_void_p(dev), _EGL_EXTENSIONS_STR) or b"" is_sw = b"EGL_MESA_device_software" in dev_exts - logger.info("EGL device: drm_path=%s sw=%s exts=%s", - drm_path, is_sw, dev_exts.decode() if dev_exts else "") + logger.debug("EGL device: drm_path=%s sw=%s exts=%s", + drm_path, is_sw, dev_exts.decode() if dev_exts else "") if is_sw: sw_devices.append(dev) else: hw_devices.append(dev) - for i, dev in enumerate(hw_devices + sw_devices): + if hw_devices: + logger.debug("EGL: %d hardware device(s), %d software device(s).", + len(hw_devices), len(sw_devices)) + + results = [] + for dev in hw_devices + sw_devices: dpy = eglGetPlatformDisplayEXT( _EGL_PLATFORM_DEVICE_EXT, ctypes.c_void_p(dev), no_attribs ) if dpy: - kind = "hardware GPU" if i < len(hw_devices) else "software" - logger.debug("EGL: using %s device %d", kind, i) - return dpy - return None + results.append((dpy, dev in hw_devices)) + return results # list of (display, is_hw) def make_current(self): From 45d145bc90f493f7236f74ab6e3cb8c48cf49d09 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 19:24:56 +0100 Subject: [PATCH 20/29] update log messages and documentation --- DOCKER.md | 87 ++++++++++++++++----------------- README.md | 24 ++++----- whippersnappy/gl/_headless.py | 28 +++++------ whippersnappy/gl/egl_context.py | 14 +++--- 4 files changed, 68 insertions(+), 85 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index 60bb9e5..9af449c 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,15 +1,14 @@ -# Docker Guide +# Docker / Singularity Guide The Docker image provides a fully headless rendering environment using **EGL** with Mesa's llvmpipe CPU software renderer — no GPU, display server, -or `xvfb` required. AMD/Intel GPU rendering is available via -`--device /dev/dri/renderD128`. NVIDIA GPU OpenGL rendering in Docker -requires an `nvidia/opengl`-based image and is not supported in this image; -CPU rendering is the intended headless path and is fast enough for all -snapshot and video tasks. +or `xvfb` required. This is fast enough for all snapshot and video tasks. `libosmesa6` is also included as a last-resort fallback if EGL cannot initialise. +For **Singularity/Apptainer**, passing `--nv` enables GPU rendering via EGL +automatically — no other configuration needed. + The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by overriding the entry point. @@ -185,6 +184,37 @@ parent directory to retrieve them on the host. --- +## Singularity / Apptainer + +The same image can be used with Singularity or Apptainer. + +**CPU rendering** (default — no GPU needed): +```bash +singularity exec \ + -B /path/to/subject:/subject \ + -B /path/to/output:/output \ + whippersnappy.sif \ + whippersnap4 \ + -lh /subject/surf/lh.thickness \ + -rh /subject/surf/rh.thickness \ + -sd /subject -o /output/snap4.png +``` + +**GPU rendering** — pass `--nv` (NVIDIA) or `--rocm` (AMD); EGL selects +the GPU automatically: +```bash +singularity exec --nv \ + -B /path/to/subject:/subject \ + -B /path/to/output:/output \ + whippersnappy.sif \ + whippersnap4 \ + -lh /subject/surf/lh.thickness \ + -rh /subject/surf/rh.thickness \ + -sd /subject -o /output/snap4.png +``` + +--- + ## Notes - The `--init` flag is recommended so that signals (e.g. `Ctrl-C`) are handled @@ -193,51 +223,18 @@ parent directory to retrieve them on the host. not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Default rendering** uses **EGL with CPU software rendering** (Mesa - llvmpipe) — no GPU or display server required. The log will show: +- **Docker rendering** uses **EGL with CPU software rendering** (Mesa llvmpipe) + — no GPU or display server required. The log will show: ``` EGL context active — CPU software rendering (llvmpipe (...), ...) ``` - This is sufficient for all snapshot and video rendering tasks. - -- **GPU rendering** requires OpenGL access to the GPU, not just CUDA. - The log will show: +- **Singularity GPU rendering** with `--nv` uses EGL with the NVIDIA GPU + driver injected by Singularity. The log will show: ``` EGL context active — GPU rendering (...) ``` - - *NVIDIA:* `--gpus all` only provides CUDA compute access, **not** - OpenGL/EGL rendering. For NVIDIA GPU OpenGL in Docker you need an - image based on `nvidia/opengl` or `nvidia/cuda:*-opengl`. The standard - `whippersnappy` image does not include the NVIDIA OpenGL drivers and - therefore always uses CPU rendering regardless of `--gpus all`. - GPU rendering with NVIDIA in Docker is not officially supported in this - image — CPU rendering via llvmpipe is the intended headless path. - - *AMD / Intel* (DRI render device). The device is owned by the host's - `render` group; pass the group ID into the container so EGL can open it: - ```bash - docker run --rm --init \ - --device /dev/dri/renderD128 \ - --group-add $(getent group render | cut -d: -f3) \ - --user $(id -u):$(id -g) \ - -v /path/to/subject:/subject \ - -v /path/to/output:/output \ - whippersnappy \ - -lh /subject/surf/lh.thickness -rh /subject/surf/rh.thickness \ - -sd /subject -o /output/snap4.png - ``` - If the GPU device cannot be opened, EGL automatically falls back to - CPU software rendering (llvmpipe) — no crash, no OSMesa required. - -- **Singularity/Apptainer:** CPU rendering works without any flags. For GPU - rendering pass `--nv` (NVIDIA) or `--rocm` (AMD): - ```bash - singularity exec --nv whippersnappy.sif \ - whippersnap4 -lh lh.thickness -rh rh.thickness -sd fsaverage -o snap4.png - ``` - **OSMesa** (`libosmesa6`) is included as a last-resort CPU fallback for the - rare case where EGL itself fails to initialise. Under normal circumstances - EGL handles both GPU and CPU rendering and OSMesa is not used. + rare case where EGL itself fails to initialise. + diff --git a/README.md b/README.md index f9cbe80..3306ce2 100644 --- a/README.md +++ b/README.md @@ -33,29 +33,23 @@ For interactive 3D in Jupyter notebooks: pip install 'whippersnappy[notebook]' ``` -Off-screen (headless) rendering on **Linux** uses EGL: -- **CPU software rendering** (default, no GPU needed) — EGL uses Mesa's - llvmpipe automatically when no GPU is available. Works in Docker without - any extra flags, in Singularity without `--nv`, and over plain SSH. -- **GPU rendering** — for AMD/Intel in Docker pass - `--device /dev/dri/renderD128`; in Singularity pass `--nv` (NVIDIA) or - `--rocm` (AMD). Note: NVIDIA `--gpus all` in Docker only provides CUDA - compute access, not EGL/OpenGL rendering; NVIDIA GPU OpenGL in Docker - requires an `nvidia/opengl`-based image. - -The log always reports which is active: +Off-screen (headless) rendering on **Linux** uses **EGL** with Mesa's llvmpipe +CPU software renderer — no GPU or display server required. The log reports: ``` EGL context active — CPU software rendering (llvmpipe (...), ...) +``` +When a GPU is accessible (native install or Singularity with ``--nv``), +EGL selects it automatically: +``` EGL context active — GPU rendering (...) ``` - -OSMesa (`libosmesa6`) is a last-resort CPU fallback used only when EGL fails -to initialise entirely. +OSMesa (`libosmesa6`) is a last-resort CPU fallback used only when EGL +itself cannot initialise (e.g. `libegl1` not installed). On **Windows**, GLFW creates an invisible window; a GPU driver is sufficient. On **macOS**, a real display connection is required (NSGL does not support headless rendering). -See the Docker guide for headless Linux usage. +See the Docker/Singularity guide for container usage. ## Command-Line Usage diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 520f7d7..a6171d1 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -8,19 +8,17 @@ Priority chain on Linux when no display is detected (``DISPLAY`` / ``WAYLAND_DISPLAY`` unset): -1. **EGL** — tried first when ``libEGL`` is installed. A lightweight ctypes - probe (``eglGetDisplay`` + ``eglInitialize``) confirms EGL can actually - initialise before ``PYOPENGL_PLATFORM=egl`` is set. This covers both GPU - rendering (real device) and CPU software rendering (Mesa llvmpipe) — - works in Docker without ``--device``. -2. **OSMesa** — fallback when EGL is not installed or the probe fails (e.g. - no GPU and no llvmpipe). Sets ``PYOPENGL_PLATFORM=osmesa``. +1. **EGL** — tried first. A lightweight ctypes probe confirms EGL can + actually initialise a display before ``PYOPENGL_PLATFORM=egl`` is set. + When a GPU is accessible (native install) EGL uses it; otherwise EGL falls + back to Mesa's llvmpipe CPU software renderer. Works in Docker and + Singularity without any special flags. +2. **OSMesa** — fallback when EGL cannot initialise at all (e.g. ``libegl1`` + not installed). Sets ``PYOPENGL_PLATFORM=osmesa``. 3. **Neither** — raises ``RuntimeError`` with install instructions. When ``DISPLAY`` is set the module does not intervene; GLFW is tried first -in :func:`~whippersnappy.gl.context.init_offscreen_context`. If GLFW then -fails (e.g. broken ``ssh -X`` forward), the same EGL/OSMesa chain is -attempted there. +in :func:`~whippersnappy.gl.context.init_offscreen_context`. ``PYOPENGL_PLATFORM`` is not consulted by GLFW, so setting it here does not affect the interactive GUI (``whippersnap``). @@ -52,12 +50,9 @@ def _egl_context_works(): Tries display-independent EGL paths in order: - 1. ``EGL_EXT_device_enumeration`` — enumerate GPU devices directly; works - headlessly without a display server. With ``--gpus all`` (NVIDIA) or - ``--device`` (AMD/Intel) the GPU device appears here and is preferred. + 1. ``EGL_EXT_device_enumeration`` — enumerate GPU/software devices. 2. ``EGL_MESA_platform_surfaceless`` — Mesa CPU software rendering - (llvmpipe); no GPU or display server needed. Used when no GPU device - is found (e.g. Docker without ``--gpus``/``--device``). + (llvmpipe); no GPU or display server needed. 3. ``eglGetDisplay(EGL_DEFAULT_DISPLAY)`` — last resort; only succeeds when a display server (X11/Wayland) is reachable. @@ -120,8 +115,7 @@ def _try_init(dpy): no_attribs = (ctypes.c_int * 1)(_EGL_NONE) # --- Path 1: EGL_EXT_device_enumeration --- - # GPU devices — preferred. With --gpus all (NVIDIA) or --device - # (AMD/Intel) the GPU appears here before surfaceless/llvmpipe. + # Try GPU and software devices; prefer GPU when available natively. if (_GetPlatformDisplayEXT and b"EGL_EXT_device_enumeration" in client_exts): addr = libegl.eglGetProcAddress(b"eglQueryDevicesEXT") diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index b10ba52..c6267e3 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -198,7 +198,9 @@ def _init_egl(self): # all within EGL, avoiding the broken EGL→OSMesa mixed-platform issue. candidates = [] # list of (dpy, label) - # --- Candidate 1: EGL_EXT_device_enumeration (GPU preferred) --- + # --- Candidate 1: EGL_EXT_device_enumeration --- + # Hardware GPU devices tried first, software devices last. + # GPU is selected automatically when accessible natively. if has_device_enum and eglGetPlatformDisplayEXT: eglQueryDevicesEXT = self._get_ext_fn( "eglQueryDevicesEXT", @@ -239,9 +241,8 @@ def _init_egl(self): break else: if "GPU" in label: - logger.info( - "EGL: GPU device found but could not be initialised " - "(DRI2/kernel driver not accessible inside container) " + logger.debug( + "EGL: GPU device found but eglInitialize failed " "— falling back to CPU software rendering." ) else: @@ -379,10 +380,7 @@ def make_current(self): is_cpu = any(s in renderer.lower() for s in _sw) if is_cpu: logger.info( - "EGL context active — CPU software rendering (%s, %s). " - "For AMD/Intel GPU rendering in Docker pass " - "--device /dev/dri/renderD128; " - "for Singularity pass --nv (NVIDIA) or --rocm (AMD).", + "EGL context active — CPU software rendering (%s, %s).", renderer, vendor, ) else: From d5a083dfd7d6a4675d8af0876c8199bd2c36ec7c Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 19:33:40 +0100 Subject: [PATCH 21/29] try to remove EGL warnings in singularity --- whippersnappy/gl/egl_context.py | 38 ++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index c6267e3..21b93b8 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -22,6 +22,7 @@ ctx.destroy() """ +import contextlib import ctypes import logging import os @@ -67,6 +68,26 @@ _EGL_PLATFORM_DEVICE_EXT = 0x313F +@contextlib.contextmanager +def _silence_stderr(): + """Suppress C-level stderr for the duration of the block. + + Mesa writes DRI/EGL warnings (e.g. "failed to open /dev/dri/renderD128: + Permission denied") directly to file descriptor 2, bypassing Python's + logging system. We redirect fd 2 to ``/dev/null`` for calls that are + expected to fail (e.g. probing GPU devices without access) so users don't + see spurious warnings when the fallback path works fine. + """ + devnull_fd = os.open(os.devnull, os.O_WRONLY) + saved = os.dup(2) + try: + os.dup2(devnull_fd, 2) + yield + finally: + os.dup2(saved, 2) + os.close(saved) + os.close(devnull_fd) + class EGLContext: """A headless OpenGL 3.3 Core context backed by an EGL pbuffer + FBO. @@ -228,12 +249,18 @@ def _init_egl(self): if def_dpy: candidates.append((def_dpy, "default display")) - # Try each candidate until one succeeds eglInitialize + # Try each candidate until one succeeds eglInitialize. + # Suppress C-level stderr during attempts that may produce Mesa DRI + # warnings ("failed to open /dev/dri/...", "failed to create dri2 + # screen") — these are expected when a GPU device is found but not + # accessible (e.g. Singularity without --nv). display = None self._display_path = "unknown" for dpy, label in candidates: major, minor = ctypes.c_int(0), ctypes.c_int(0) - if libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)): + with _silence_stderr(): + ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) + if ok: display = dpy self._display_path = label logger.debug("EGL: initialised via %s (EGL %d.%d).", @@ -349,9 +376,10 @@ def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): results = [] for dev in hw_devices + sw_devices: - dpy = eglGetPlatformDisplayEXT( - _EGL_PLATFORM_DEVICE_EXT, ctypes.c_void_p(dev), no_attribs - ) + with _silence_stderr(): + dpy = eglGetPlatformDisplayEXT( + _EGL_PLATFORM_DEVICE_EXT, ctypes.c_void_p(dev), no_attribs + ) if dpy: results.append((dpy, dev in hw_devices)) return results # list of (display, is_hw) From effabdc05aadfbd98b782ac26c076a5e63a16181 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 19:39:30 +0100 Subject: [PATCH 22/29] try to remove EGL warnings in singularity --- whippersnappy/gl/_headless.py | 67 +++++++++++++++++++++++---------- whippersnappy/gl/egl_context.py | 7 +++- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index a6171d1..9983431 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -89,16 +89,6 @@ def _egl_context_works(): _EGL_NONE = 0x3038 _EGL_PLATFORM_DEVICE = 0x313F - def _try_init(dpy): - if not dpy: - return False - major, minor = ctypes.c_int(0), ctypes.c_int(0) - ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) - libegl.eglTerminate(dpy) - if ok: - logger.debug("EGL probe: eglInitialize OK (EGL %d.%d).", - major.value, minor.value) - return bool(ok) client_exts = libegl.eglQueryString(None, _EGL_EXTENSIONS) or b"" logger.debug("EGL client extensions: %s", client_exts.decode()) @@ -115,6 +105,26 @@ def _try_init(dpy): no_attribs = (ctypes.c_int * 1)(_EGL_NONE) # --- Path 1: EGL_EXT_device_enumeration --- + def _try_init_silent(dpy): + """Call eglInitialize with C-level stderr suppressed.""" + if not dpy: + return False + devnull = os.open(os.devnull, os.O_WRONLY) + saved = os.dup(2) + try: + os.dup2(devnull, 2) + major, minor = ctypes.c_int(0), ctypes.c_int(0) + ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) + libegl.eglTerminate(dpy) + finally: + os.dup2(saved, 2) + os.close(saved) + os.close(devnull) + if ok: + logger.debug("EGL probe: eglInitialize OK (EGL %d.%d).", + major.value, minor.value) + return bool(ok) + # Try GPU and software devices; prefer GPU when available natively. if (_GetPlatformDisplayEXT and b"EGL_EXT_device_enumeration" in client_exts): @@ -125,18 +135,35 @@ def _try_init(dpy): ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), )(addr) n = ctypes.c_int(0) - if _QueryDevices(0, None, ctypes.byref(n)) and n.value > 0: + devnull = os.open(os.devnull, os.O_WRONLY) + saved = os.dup(2) + try: + os.dup2(devnull, 2) + found = _QueryDevices(0, None, ctypes.byref(n)) + finally: + os.dup2(saved, 2) + os.close(saved) + os.close(devnull) + if found and n.value > 0: logger.debug("EGL probe: %d EGL device(s) found.", n.value) devices = (ctypes.c_void_p * n.value)() _QueryDevices(n.value, devices, ctypes.byref(n)) for dev in devices: - dpy = _GetPlatformDisplayEXT( - _EGL_PLATFORM_DEVICE, - ctypes.c_void_p(dev), - no_attribs, - ) - if _try_init(dpy): - logger.debug("EGL probe: device enumeration succeeded (GPU).") + devnull = os.open(os.devnull, os.O_WRONLY) + saved = os.dup(2) + try: + os.dup2(devnull, 2) + dpy = _GetPlatformDisplayEXT( + _EGL_PLATFORM_DEVICE, + ctypes.c_void_p(dev), + no_attribs, + ) + finally: + os.dup2(saved, 2) + os.close(saved) + os.close(devnull) + if _try_init_silent(dpy): + logger.debug("EGL probe: device enumeration succeeded.") return True else: logger.debug("EGL probe: device enumeration found 0 devices.") @@ -148,14 +175,14 @@ def _try_init(dpy): dpy = _GetPlatformDisplayEXT( _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs ) - if _try_init(dpy): + if _try_init_silent(dpy): logger.debug("EGL probe: surfaceless platform succeeded (CPU/llvmpipe).") return True # --- Path 3: EGL_DEFAULT_DISPLAY --- # Works only when a display server is reachable (DISPLAY set). dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) - if _try_init(dpy): + if _try_init_silent(dpy): logger.debug("EGL probe: EGL_DEFAULT_DISPLAY succeeded.") return True diff --git a/whippersnappy/gl/egl_context.py b/whippersnappy/gl/egl_context.py index 21b93b8..2cfb0c0 100644 --- a/whippersnappy/gl/egl_context.py +++ b/whippersnappy/gl/egl_context.py @@ -337,12 +337,15 @@ def _open_device_display(self, eglQueryDevicesEXT, eglGetPlatformDisplayEXT): path, but not a software device) and AMD/Intel (has a DRM path). """ n = ctypes.c_int(0) - if not eglQueryDevicesEXT(0, None, ctypes.byref(n)) or n.value == 0: + with _silence_stderr(): + found = eglQueryDevicesEXT(0, None, ctypes.byref(n)) + if not found or n.value == 0: logger.debug("EGL: eglQueryDevicesEXT found no devices.") return None logger.info("EGL: %d device(s) found via enumeration.", n.value) devices = (ctypes.c_void_p * n.value)() - eglQueryDevicesEXT(n.value, devices, ctypes.byref(n)) + with _silence_stderr(): + eglQueryDevicesEXT(n.value, devices, ctypes.byref(n)) no_attribs = (ctypes.c_int * 1)(_EGL_NONE) _EGL_DRM_DEVICE_FILE_EXT = 0x3233 # for logging only From cb5203e137f16d26de6db83beab9fbc195469aa4 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 19:49:16 +0100 Subject: [PATCH 23/29] fix typo --- whippersnappy/gl/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index cf265c2..f710242 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -143,7 +143,7 @@ def init_offscreen_context(width, height): Tries up to three paths on Linux; macOS and Windows use GLFW only. 1. **GLFW invisible window** — used when ``PYOPENGL_PLATFORM`` is not - ``"egl"`` (i.e. a display is available and EGL was not pre-selected). + ``"egl"`` (i.e. a display is available and EGL was not preselected). Skipped on Linux when EGL was selected at import time to avoid spurious GLX warnings. 2. **EGL** — used when ``PYOPENGL_PLATFORM=egl`` was set by From 9c764dc6ef87c928d36c1f57c67fe3ac377f6083 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Fri, 6 Mar 2026 20:38:40 +0100 Subject: [PATCH 24/29] update docstrings and guard mesa --- whippersnappy/gl/_headless.py | 6 +-- whippersnappy/gl/context.py | 80 +++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 9983431..832b283 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -211,10 +211,8 @@ def _osmesa_is_available(): ) if not _has_display: # No display — choose headless backend before OpenGL.GL is imported. - # Use _egl_context_works() not just _egl_is_available(): libEGL may be - # installed but still fail (no GPU, no llvmpipe, bad driver). We must - # know the outcome before setting PYOPENGL_PLATFORM because OpenGL.GL - # binds its function pointers on first import and cannot be re-bound. + # OpenGL.GL binds its function pointers on first import and cannot be + # re-bound, so PYOPENGL_PLATFORM must be set correctly here. if _egl_context_works(): os.environ["PYOPENGL_PLATFORM"] = "egl" logger.info("No display detected; EGL available — using EGL headless rendering.") diff --git a/whippersnappy/gl/context.py b/whippersnappy/gl/context.py index f710242..47039e1 100644 --- a/whippersnappy/gl/context.py +++ b/whippersnappy/gl/context.py @@ -6,18 +6,18 @@ Context creation tries up to three paths (Linux; macOS/Windows use GLFW only): 1. **GLFW invisible window** — standard path when a display is available. -2. **EGL pbuffer** — headless GPU rendering (Linux, no display needed). - Only used when :mod:`~whippersnappy.gl._headless` set - ``PYOPENGL_PLATFORM=egl`` at import time (no display + accessible - ``/dev/dri/renderD*``). PyOpenGL selects its platform backend on the - first ``import OpenGL.GL`` and cannot be changed afterwards — so EGL is - only safe when it was selected before any ``OpenGL.GL`` import. +2. **EGL pbuffer** — headless rendering (Linux, no display needed). + Used when :mod:`~whippersnappy.gl._headless` set + ``PYOPENGL_PLATFORM=egl`` at import time. ``EGLContext`` handles GPU + and CPU (llvmpipe) fallback internally. 3. **OSMesa** — CPU software renderer (Linux only). - Used when neither GLFW nor EGL succeeds. + Used only when ``PYOPENGL_PLATFORM=osmesa`` was set at import time + (i.e. EGL probe failed — ``libEGL`` not installed). -The :mod:`whippersnappy.gl._headless` guard runs before ``OpenGL.GL`` is -imported and sets ``PYOPENGL_PLATFORM`` to ``"egl"`` or ``"osmesa"`` -as appropriate. +Each backend is selected before ``OpenGL.GL`` is first imported. +PyOpenGL binds its function pointers on first import and cannot be re-bound, +so mixing backends causes silent GL failures. The guard in each branch +ensures only the preselected backend is used. """ # ruff: noqa: I001 — import order is intentional: _headless must precede OpenGL.GL @@ -144,16 +144,18 @@ def init_offscreen_context(width, height): 1. **GLFW invisible window** — used when ``PYOPENGL_PLATFORM`` is not ``"egl"`` (i.e. a display is available and EGL was not preselected). - Skipped on Linux when EGL was selected at import time to avoid spurious - GLX warnings. + Skipped on Linux when EGL was selected at import time. 2. **EGL** — used when ``PYOPENGL_PLATFORM=egl`` was set by - :mod:`~whippersnappy.gl._headless` at import time (no display detected - and ``libEGL`` is installed). EGL handles both GPU and CPU (llvmpipe) - rendering without needing ``/dev/dri`` access — works in Docker without - ``--device``. - 3. **OSMesa** — CPU software renderer (Linux only). Used when EGL is not - installed (``PYOPENGL_PLATFORM=osmesa``) or when EGL context creation - fails. + :mod:`~whippersnappy.gl._headless` at import time. ``EGLContext`` + tries GPU device → surfaceless (llvmpipe) → default display in order, + so it handles CPU fallback internally within EGL. + 3. **OSMesa** — used only when ``PYOPENGL_PLATFORM=osmesa`` was set at + import time (EGL probe failed entirely — ``libEGL`` not installed). + + Each backend is only used when it was preselected before ``OpenGL.GL`` + was imported. PyOpenGL binds its function pointers on first import and + cannot be re-bound — mixing backends (e.g. GLX-bound pointers with an + OSMesa context) causes silent GL failures. Parameters ---------- @@ -212,18 +214,34 @@ def init_offscreen_context(width, height): # --- Step 3: OSMesa software rendering --- # Only reached when PYOPENGL_PLATFORM=osmesa was set at import time # (i.e. EGL probe failed entirely — libEGL not installed or unusable). - try: - from .osmesa_context import OSMesaContext # noqa: PLC0415 - ctx = OSMesaContext(width, height) - ctx.make_current() - _offscreen_context = ctx - logger.info("Using OSMesa headless context (CPU, no display or GPU required).") - return None - except (ImportError, RuntimeError) as exc: - raise RuntimeError( - "Could not create any OpenGL context (tried GLFW invisible window and OSMesa). " - f"Last error: {exc}" - ) from exc + # Guard is required: if OpenGL.GL was bound to GLX (a display was set but + # GLFW failed) and we created an OSMesa context here, GL function pointers + # would be GLX-bound while the context is OSMesa — causing silent failures. + if os.environ.get("PYOPENGL_PLATFORM") == "osmesa": + try: + from .osmesa_context import OSMesaContext # noqa: PLC0415 + ctx = OSMesaContext(width, height) + ctx.make_current() + _offscreen_context = ctx + logger.info("Using OSMesa headless context (CPU, no display or GPU required).") + return None + except (ImportError, RuntimeError) as exc: + raise RuntimeError( + "Could not create any OpenGL context (tried GLFW and OSMesa). " + f"Last error: {exc}" + ) from exc + + raise RuntimeError( + "Could not create a GLFW OpenGL context and no headless backend was " + "preselected. This can happen when DISPLAY is set but the display is " + "not usable (e.g. a broken ssh -X forward) and no EGL or OSMesa " + "library was found at import time. To fix this, install a headless " + "rendering backend:\n" + " - EGL (recommended): sudo apt-get install libegl1\n" + " - OSMesa (fallback): sudo apt-get install libosmesa6\n" + "With either library installed, WhipperSnapPy will select the headless " + "backend automatically on the next run." + ) def terminate_context(window): From 8ad4781e7b6bd997e4c2c3c431048f1db55388d7 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Sun, 8 Mar 2026 00:51:25 +0100 Subject: [PATCH 25/29] simplify docker (no osmesa etc needed) --- Dockerfile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1a4610a..83dddc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,20 +6,15 @@ FROM python:3.11-slim ENV MESA_SHADER_CACHE_DISABLE=1 # libegl1 — GLVND EGL dispatch library (routes to GPU or Mesa llvmpipe) -# libosmesa6 — OSMesa CPU fallback for environments where EGL cannot initialise # libgl1 — base OpenGL dispatch library required by PyOpenGL -# libglib2.0-0, libfontconfig1, libdbus-1-3 — runtime deps for Pillow / font rendering +# libfontconfig1 — runtime deps for Pillow / font rendering RUN apt-get update && apt-get install -y --no-install-recommends \ libegl1 \ - libosmesa6 \ libgl1 \ - libglib2.0-0 \ - libfontconfig1 \ - libdbus-1-3 && \ + libfontconfig1 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* - RUN pip install --upgrade pip COPY . /WhipperSnapPy From b8d370ae2d7ca8c2680e6a475396213eef5b9851 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Sun, 8 Mar 2026 00:56:22 +0100 Subject: [PATCH 26/29] update Doc --- DOCKER.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index 9af449c..9f55949 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -3,8 +3,6 @@ The Docker image provides a fully headless rendering environment using **EGL** with Mesa's llvmpipe CPU software renderer — no GPU, display server, or `xvfb` required. This is fast enough for all snapshot and video tasks. -`libosmesa6` is also included as a last-resort fallback if EGL cannot -initialise. For **Singularity/Apptainer**, passing `--nv` enables GPU rendering via EGL automatically — no other configuration needed. @@ -233,8 +231,7 @@ singularity exec --nv \ ``` EGL context active — GPU rendering (...) ``` -- **OSMesa** (`libosmesa6`) is included as a last-resort CPU fallback for the - rare case where EGL itself fails to initialise. + From 91a47e8ece12c20f25a6345b3e20ce8a62dceba3 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Sun, 8 Mar 2026 01:57:47 +0100 Subject: [PATCH 27/29] update dockerfile for GPUs and documentation --- DOCKER.md | 41 +++++++++++++++++++++++++++++++++++------ Dockerfile | 9 +++++++++ README.md | 8 ++++---- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/DOCKER.md b/DOCKER.md index 9f55949..bf43ad4 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -1,11 +1,15 @@ # Docker / Singularity Guide The Docker image provides a fully headless rendering environment using -**EGL** with Mesa's llvmpipe CPU software renderer — no GPU, display server, -or `xvfb` required. This is fast enough for all snapshot and video tasks. +**EGL** — no display server or `xvfb` required. -For **Singularity/Apptainer**, passing `--nv` enables GPU rendering via EGL -automatically — no other configuration needed. +- **CPU rendering (default):** EGL falls back to Mesa's llvmpipe software + renderer automatically. No GPU or special flags needed. +- **GPU rendering (NVIDIA):** + - For **Docker**, pass `--gpus all` and EGL selects the + GPU via the NVIDIA Container Toolkit. + - For **Singularity/Apptainer**, pass `--nv` (NVIDIA) enables GPU + rendering via EGL automatically. The default entry point is `whippersnap4` (four-view batch rendering). `whippersnap1` (single-view snapshot and rotation video) can be invoked by @@ -43,6 +47,26 @@ docker run --rm --init \ -o /output/snap4.png ``` +### With NVIDIA GPU (faster rendering) + +Pass `--gpus all` to let EGL use the GPU via the NVIDIA Container Toolkit: + +```bash +docker run --rm --init \ + --gpus all \ + -v /path/to/subject:/subject \ + -v /path/to/output:/output \ + --user $(id -u):$(id -g) \ + whippersnappy \ + -lh /subject/surf/lh.thickness \ + -rh /subject/surf/rh.thickness \ + -sd /subject \ + -o /output/snap4.png +``` + +> **Note:** Requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) +> installed on the host (`nvidia-ctk --version` to verify). + ### With an annotation file instead of an overlay ```bash @@ -221,11 +245,16 @@ singularity exec --nv \ not root. - The interactive GUI (`whippersnap`) is **not** available in the Docker image — it requires a display server and PyQt6, which are not installed. -- **Docker rendering** uses **EGL with CPU software rendering** (Mesa llvmpipe) - — no GPU or display server required. The log will show: +- **Docker CPU rendering** (default — no GPU needed): EGL uses Mesa's llvmpipe + software renderer. The log will show: ``` EGL context active — CPU software rendering (llvmpipe (...), ...) ``` +- **Docker GPU rendering** (`--gpus all`, NVIDIA only): EGL uses the NVIDIA GPU + driver injected by the NVIDIA Container Toolkit. The log will show: + ``` + EGL context active — GPU rendering (...) + ``` - **Singularity GPU rendering** with `--nv` uses EGL with the NVIDIA GPU driver injected by Singularity. The log will show: ``` diff --git a/Dockerfile b/Dockerfile index 83dddc3..c64ee22 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,10 @@ FROM python:3.11-slim # unset or points to a non-writable directory. ENV MESA_SHADER_CACHE_DISABLE=1 +# In order to find Nividia GPUs (--gpus all) +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=all + # libegl1 — GLVND EGL dispatch library (routes to GPU or Mesa llvmpipe) # libgl1 — base OpenGL dispatch library required by PyOpenGL # libfontconfig1 — runtime deps for Pillow / font rendering @@ -15,6 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Register the NVIDIA EGL ICD so libEGL finds the GPU driver +RUN mkdir -p /usr/share/glvnd/egl_vendor.d && \ + echo '{"file_format_version":"1.0.0","ICD":{"library_path":"libEGL_nvidia.so.0"}}' \ + > /usr/share/glvnd/egl_vendor.d/10_nvidia.json + RUN pip install --upgrade pip COPY . /WhipperSnapPy diff --git a/README.md b/README.md index 3306ce2..1eb321e 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ CPU software renderer — no GPU or display server required. The log reports: ``` EGL context active — CPU software rendering (llvmpipe (...), ...) ``` -When a GPU is accessible (native install or Singularity with ``--nv``), -EGL selects it automatically: +When a GPU is accessible (native install, Docker with `--gpus all`, or +Singularity with `--nv`), EGL selects it automatically: ``` EGL context active — GPU rendering (...) ``` @@ -220,8 +220,8 @@ See `tutorials/whippersnappy_tutorial.ipynb` for complete notebook examples. ## Docker -The Docker image provides a fully headless rendering environment using -OSMesa (CPU software renderer) — no display server, `xvfb`, or GPU required. +The Docker image provides a fully headless rendering environment using EGL — +CPU software rendering by default, GPU rendering with `--gpus all` (NVIDIA). See DOCKER.md for details. ## API Documentation From 3405175ef965a5716bdfc8cadaea4952cc050f9e Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Sun, 8 Mar 2026 09:19:51 +0100 Subject: [PATCH 28/29] fix typo update logger messages --- Dockerfile | 2 +- whippersnappy/gl/_headless.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c64ee22..72988ff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM python:3.11-slim # unset or points to a non-writable directory. ENV MESA_SHADER_CACHE_DISABLE=1 -# In order to find Nividia GPUs (--gpus all) +# In order to find NVIDIA GPUs (--gpus all) ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_DRIVER_CAPABILITIES=all diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index 832b283..c4aafe6 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -186,11 +186,11 @@ def _try_init_silent(dpy): logger.debug("EGL probe: EGL_DEFAULT_DISPLAY succeeded.") return True - logger.info("EGL probe: no EGL display could be initialised — will use OSMesa.") + logger.info("EGL probe: no EGL display could be initialised.") return False except Exception as exc: # noqa: BLE001 - logger.debug("EGL probe: unexpected error (%s) — will use OSMesa.", exc) + logger.debug("EGL probe: unexpected error (%s).", exc) return False @@ -227,7 +227,7 @@ def _osmesa_is_available(): "EGL initialisation failed, and OSMesa is not installed.\n" "\n" "To fix this, choose one of:\n" - " 1. Install EGL (recommended, if GPU is installed):\n" + " 1. Install EGL (recommended, for GPU or CPU rendering):\n" " Debian/Ubuntu: sudo apt-get install libegl1\n" " RHEL/Fedora: sudo dnf install mesa-libEGL\n" " 2. Install OSMesa (CPU-only alternative):\n" From 5677465acb680faf74d66ad07291d9af2edbaa84 Mon Sep 17 00:00:00 2001 From: Martin Reuter Date: Sun, 8 Mar 2026 09:36:01 +0100 Subject: [PATCH 29/29] improve egl context detection --- whippersnappy/gl/_headless.py | 122 ++++++++++++++++------------------ 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/whippersnappy/gl/_headless.py b/whippersnappy/gl/_headless.py index c4aafe6..b3f2b49 100644 --- a/whippersnappy/gl/_headless.py +++ b/whippersnappy/gl/_headless.py @@ -69,6 +69,21 @@ def _egl_context_works(): logger.debug("EGL probe: libEGL not loadable.") return False + from contextlib import contextmanager + + @contextmanager + def _suppress_stderr(): + """Suppress C-level stderr (e.g. Mesa/EGL driver warnings).""" + devnull = os.open(os.devnull, os.O_WRONLY) + saved = os.dup(2) + try: + os.dup2(devnull, 2) + yield + finally: + os.dup2(saved, 2) + os.close(saved) + os.close(devnull) + try: libegl.eglGetProcAddress.restype = ctypes.c_void_p libegl.eglGetProcAddress.argtypes = [ctypes.c_char_p] @@ -85,104 +100,85 @@ def _egl_context_works(): libegl.eglTerminate.restype = ctypes.c_bool libegl.eglTerminate.argtypes = [ctypes.c_void_p] - _EGL_EXTENSIONS = 0x3055 - _EGL_NONE = 0x3038 - _EGL_PLATFORM_DEVICE = 0x313F - + _EGL_EXTENSIONS = 0x3055 + _EGL_NONE = 0x3038 + _EGL_PLATFORM_DEVICE = 0x313F + _EGL_PLATFORM_SURFACELESS = 0x31DD client_exts = libegl.eglQueryString(None, _EGL_EXTENSIONS) or b"" logger.debug("EGL client extensions: %s", client_exts.decode()) - _GetPlatformDisplayEXT = None - if b"EGL_EXT_platform_base" in client_exts: - addr = libegl.eglGetProcAddress(b"eglGetPlatformDisplayEXT") - if addr: - _GetPlatformDisplayEXT = ctypes.CFUNCTYPE( - ctypes.c_void_p, - ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), - )(addr) - no_attribs = (ctypes.c_int * 1)(_EGL_NONE) - # --- Path 1: EGL_EXT_device_enumeration --- - def _try_init_silent(dpy): - """Call eglInitialize with C-level stderr suppressed.""" + def _get_proc(signature, name): + """Resolve an EGL extension function; return None if unavailable.""" + addr = libegl.eglGetProcAddress(name) + return signature(addr) if addr else None + + def _try_init(dpy): + """Try eglInitialize on dpy with stderr suppressed; terminate on success.""" if not dpy: return False - devnull = os.open(os.devnull, os.O_WRONLY) - saved = os.dup(2) - try: - os.dup2(devnull, 2) - major, minor = ctypes.c_int(0), ctypes.c_int(0) + major, minor = ctypes.c_int(0), ctypes.c_int(0) + with _suppress_stderr(): ok = libegl.eglInitialize(dpy, ctypes.byref(major), ctypes.byref(minor)) libegl.eglTerminate(dpy) - finally: - os.dup2(saved, 2) - os.close(saved) - os.close(devnull) if ok: logger.debug("EGL probe: eglInitialize OK (EGL %d.%d).", major.value, minor.value) return bool(ok) + # Resolve eglGetPlatformDisplayEXT once; used by paths 1 and 2. + _GetPlatformDisplayEXT = None + if b"EGL_EXT_platform_base" in client_exts: + _GetPlatformDisplayEXT = _get_proc( + ctypes.CFUNCTYPE(ctypes.c_void_p, + ctypes.c_int, ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int)), + b"eglGetPlatformDisplayEXT", + ) + + # --- Path 1: EGL_EXT_device_enumeration --- # Try GPU and software devices; prefer GPU when available natively. - if (_GetPlatformDisplayEXT - and b"EGL_EXT_device_enumeration" in client_exts): - addr = libegl.eglGetProcAddress(b"eglQueryDevicesEXT") - if addr: - _QueryDevices = ctypes.CFUNCTYPE( - ctypes.c_bool, - ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), - )(addr) + if _GetPlatformDisplayEXT and b"EGL_EXT_device_enumeration" in client_exts: + _QueryDevices = _get_proc( + ctypes.CFUNCTYPE(ctypes.c_bool, + ctypes.c_int, ctypes.c_void_p, + ctypes.POINTER(ctypes.c_int)), + b"eglQueryDevicesEXT", + ) + if _QueryDevices: n = ctypes.c_int(0) - devnull = os.open(os.devnull, os.O_WRONLY) - saved = os.dup(2) - try: - os.dup2(devnull, 2) + with _suppress_stderr(): found = _QueryDevices(0, None, ctypes.byref(n)) - finally: - os.dup2(saved, 2) - os.close(saved) - os.close(devnull) + logger.debug("EGL probe: %d EGL device(s) found.", n.value if found else 0) if found and n.value > 0: - logger.debug("EGL probe: %d EGL device(s) found.", n.value) devices = (ctypes.c_void_p * n.value)() - _QueryDevices(n.value, devices, ctypes.byref(n)) + with _suppress_stderr(): + _QueryDevices(n.value, devices, ctypes.byref(n)) for dev in devices: - devnull = os.open(os.devnull, os.O_WRONLY) - saved = os.dup(2) - try: - os.dup2(devnull, 2) - dpy = _GetPlatformDisplayEXT( - _EGL_PLATFORM_DEVICE, - ctypes.c_void_p(dev), - no_attribs, - ) - finally: - os.dup2(saved, 2) - os.close(saved) - os.close(devnull) - if _try_init_silent(dpy): + dpy = _GetPlatformDisplayEXT( + _EGL_PLATFORM_DEVICE, + ctypes.c_void_p(dev), + no_attribs, + ) + if _try_init(dpy): logger.debug("EGL probe: device enumeration succeeded.") return True - else: - logger.debug("EGL probe: device enumeration found 0 devices.") # --- Path 2: EGL_MESA_platform_surfaceless --- # CPU software rendering (llvmpipe) — no GPU needed. - _EGL_PLATFORM_SURFACELESS = 0x31DD if _GetPlatformDisplayEXT and b"EGL_MESA_platform_surfaceless" in client_exts: dpy = _GetPlatformDisplayEXT( _EGL_PLATFORM_SURFACELESS, ctypes.c_void_p(0), no_attribs ) - if _try_init_silent(dpy): + if _try_init(dpy): logger.debug("EGL probe: surfaceless platform succeeded (CPU/llvmpipe).") return True # --- Path 3: EGL_DEFAULT_DISPLAY --- # Works only when a display server is reachable (DISPLAY set). - dpy = libegl.eglGetDisplay(ctypes.c_void_p(0)) - if _try_init_silent(dpy): + if _try_init(libegl.eglGetDisplay(ctypes.c_void_p(0))): logger.debug("EGL probe: EGL_DEFAULT_DISPLAY succeeded.") return True