Fix Windows build errors and thread management for dynamic backend loading (-DGGML_BACKEND_DL=ON) (#22)

cynodesmus · coderabbitai[bot] · ServeurpersoCom · web-flow · commit fc712096fadd · 2026-03-13T08:36:36.000+01:00
* Fix Windows MSVC build for GGML DL mode and update to Registry API

* Update src/backend.h

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* Apply n_threads configuration to tokenizer backend

* Fix thread params in fallback path for tokenizer

* Standardize CPU backend initialization with thread params

* Remove outdated comment and finalize CPU initialization

* Finalize CPU initialization logic with thread param propagation

* Update CMakeLists.txt

---------

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Co-authored-by: Pascal <admin@serveurperso.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -41,10 +41,16 @@ macro(link_ggml_backends target)
         target_compile_options(${target} PRIVATE -Wall -Wextra -Wshadow -Wconversion
                               -Wno-unused-parameter -Wno-unused-function -Wno-sign-conversion)
     endif()
-    target_link_libraries(${target} PRIVATE ggml ggml-base ggml-cpu)
-    foreach(backend blas cuda metal vulkan)
+    target_link_libraries(${target} PRIVATE ggml)
+    if(TARGET ggml-base)
+        target_link_libraries(${target} PRIVATE ggml-base)
+    endif()
+    foreach(backend cpu blas cuda metal vulkan)
         if(TARGET ggml-${backend})
-            target_link_libraries(${target} PRIVATE ggml-${backend})
+            get_target_property(CURRENT_BACKEND_TYPE ggml-${backend} TYPE)
+            if (NOT CURRENT_BACKEND_TYPE STREQUAL "MODULE_LIBRARY")
+                 target_link_libraries(${target} PRIVATE ggml-${backend})
+            endif()
             string(TOUPPER ${backend} BACKEND_UPPER)
             target_compile_definitions(${target} PRIVATE ACESTEP_HAVE_${BACKEND_UPPER})
             if(backend STREQUAL "cuda")
diff --git a/src/backend.h b/src/backend.h
@@ -6,7 +6,6 @@
 // qwen3.h, qwen3-lm.h, cond.h, dit.h, vae.h.
 
 #include "ggml-backend.h"
-#include "ggml-cpu.h"
 #ifdef ACESTEP_HAVE_CUDA
 // Query compute capability without pulling in cuda_runtime.h.
 // cudaDeviceGetAttribute takes an int enum value; we pass the raw constants.
@@ -45,23 +44,34 @@ static BackendPair backend_init(const char * label) {
         fprintf(stderr, "[Load] FATAL: no backend available\n");
         exit(1);
     }
+    bool best_is_cpu = (strcmp(ggml_backend_name(bp.backend), "CPU") == 0);
     int n_threads = (int) std::thread::hardware_concurrency() / 2;
     if (n_threads < 1) {
         n_threads = 1;
     }
-    // [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances
-    // where only one gets the thread count)
-    bool best_is_cpu = (strcmp(ggml_backend_name(bp.backend), "CPU") == 0);
+    // Initialize CPU backend with explicit thread count
+    char params[64];
+    snprintf(params, sizeof(params), "n_threads=%d", n_threads);
+    auto init_cpu_backend = [&]() -> ggml_backend_t {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev) {
+            if (ggml_backend_t cpu = ggml_backend_dev_init(cpu_dev, params)) {
+                return cpu;
+            }
+        }
+        return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, params);
+    };
+
     if (best_is_cpu) {
+        ggml_backend_free(bp.backend);
+        bp.backend = init_cpu_backend();
         bp.cpu_backend = bp.backend;
-        ggml_backend_cpu_set_n_threads(bp.backend, n_threads);
     } else {
-        bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
-        if (!bp.cpu_backend) {
-            fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n");
-            exit(1);
-        }
-        ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads);
+        bp.cpu_backend = init_cpu_backend();
+    }
+    if (!bp.cpu_backend) {
+        fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n");
+        exit(1);
     }
     fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n", label, ggml_backend_name(bp.backend), n_threads);
 
diff --git a/tools/ace-understand.cpp b/tools/ace-understand.cpp
@@ -28,6 +28,7 @@
 #include <cstring>
 #include <random>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <vector>
 
@@ -390,7 +391,24 @@ int main(int argc, char ** argv) {
         // Tokenizer weights live in the DiT GGUF (prefix "tokenizer.")
         Timer          t_tok;
         TokGGML        tok    = {};
-        ggml_backend_t be_tok = ggml_backend_cpu_init();
+        ggml_backend_dev_t dev_cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        ggml_backend_t be_tok = NULL;
+        int n_threads = (int) std::thread::hardware_concurrency() / 2;
+        if (n_threads < 1) {
+            n_threads = 1;
+        }
+        char params[64];
+        snprintf(params, sizeof(params), "n_threads=%d", n_threads);
+        if (dev_cpu) {
+            be_tok = ggml_backend_dev_init(dev_cpu, params);
+        }
+        if (!be_tok) {
+            be_tok = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, params);
+        }
+        if (!be_tok) {
+            fprintf(stderr, "[Tok] FATAL: failed to init CPU backend\n");
+            return 1;
+        }
         if (!tok_ggml_load(&tok, dit_gguf, be_tok, be_tok)) {
             fprintf(stderr, "[Tok] FATAL: load failed\n");
             ggml_backend_free(be_tok);