Fix lora loading when using multiple clip backends

stduhpf · stduhpf · commit 4fb8901df0af · 2026-03-19T15:27:34.000+01:00
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
@@ -35,6 +35,7 @@ struct ConditionerParams {
 };
 
 struct Conditioner {
+    int model_count                                                                        = 1;
     virtual SDCondition get_learned_condition(ggml_context* work_ctx,
                                               int n_threads,
                                               const ConditionerParams& conditioner_params) = 0;
@@ -53,6 +54,11 @@ struct Conditioner {
                                                    const std::string& prompt) {
         GGML_ABORT("Not implemented yet!");
     }
+    virtual bool is_cond_stage_model_name_at_index(const std::string& name, int index) {  // default filter: accepts every name for every index
+        return true;
+    }
+    virtual ggml_backend_t get_params_backend_at_index(int index) = 0;   // params backend of the text encoder in slot `index`
+    virtual ggml_backend_t get_runtime_backend_at_index(int index) = 0;  // runtime backend of the text encoder in slot `index`
 };
 
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
@@ -95,8 +101,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
             text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
+            model_count                   = 2;
             ggml_backend_t clip_g_backend = clip_backend;
-            if (backends.size() >= 2){
+            if (backends.size() >= 2) {
                 clip_g_backend = backends[1];
                 if (backends.size() > 2) {
                     LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
@@ -665,6 +672,42 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                             conditioner_params.adm_in_channels,
                                             conditioner_params.zero_out_masked);
     }
+
+    // Tells whether tensor `name` belongs to the text encoder in slot `index`.
+    // NOTE(review): confirm both substrings match the prefixes text_model/text_model2 were built with.
+    bool is_cond_stage_model_name_at_index(const std::string& name, int index) override {
+        if (!sd_version_is_sdxl(version)) {
+            return true;  // not SDXL: every name is accepted, whatever the index
+        }
+        switch (index) {
+            case 0: return contains(name, "cond_stage_model.model.transformer");
+            case 1: return contains(name, "cond_stage_model.model.1");
+            default: return false;
+        }
+    }
+
+    // Returns the params backend of the text encoder in slot `index`.
+    // On SDXL, index 1 selects text_model2; every other case selects text_model.
+    // Yields nullptr when the selected encoder pointer is unset.
+    ggml_backend_t get_params_backend_at_index(int index) override {
+        if (sd_version_is_sdxl(version) && index == 1) {
+            // No fallback to text_model here: an unset text_model2 reports nullptr.
+            return text_model2 ? text_model2->get_params_backend() : nullptr;
+        }
+        return text_model ? text_model->get_params_backend() : nullptr;
+    }
+
+    // Returns the runtime backend of the text encoder in slot `index`.
+    // On SDXL, index 1 selects text_model2; every other case selects text_model.
+    // Yields nullptr when the selected encoder pointer is unset.
+    ggml_backend_t get_runtime_backend_at_index(int index) override {
+        if (sd_version_is_sdxl(version) && index == 1) {
+            // No fallback to text_model here: an unset text_model2 reports nullptr.
+            return text_model2 ? text_model2->get_runtime_backend() : nullptr;
+        }
+        return text_model ? text_model->get_runtime_backend() : nullptr;
+    }
+
 };
 
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
@@ -740,12 +783,14 @@ struct SD3CLIPEmbedder : public Conditioner {
         bool use_clip_g = false;
         bool use_t5     = false;
 
+        model_count = 3;
+
         ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
         if (backends.size() == 1) {
             clip_l_backend = clip_g_backend = t5_backend = backends[0];
         } else if (backends.size() == 2) {
             clip_l_backend = clip_g_backend = backends[0];
-            t5_backend = backends[1];
+            t5_backend                      = backends[1];
         } else if (backends.size() >= 3) {
             clip_l_backend = backends[0];
             clip_g_backend = backends[1];
@@ -1175,6 +1220,42 @@ struct SD3CLIPEmbedder : public Conditioner {
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
     }
+
+    // Tells whether tensor `name` belongs to the text encoder in slot `index`,
+    // using a substring match on the encoder's name prefix.
+    bool is_cond_stage_model_name_at_index(const std::string& name, int index) override {
+        // Slots: 0 = clip_l, 1 = clip_g, 2 = t5xxl; any other index matches nothing.
+        switch (index) {
+            case 0: return contains(name, "text_encoders.clip_l");
+            case 1: return contains(name, "text_encoders.clip_g");
+            case 2: return contains(name, "text_encoders.t5xxl");
+            default: return false;
+        }
+    }
+
+    // Returns the params backend of the encoder in slot `index`
+    // (0 = clip_l, 1 = clip_g, 2 = t5). Yields nullptr for any other index
+    // or when that encoder pointer is unset.
+    ggml_backend_t get_params_backend_at_index(int index) override {
+        switch (index) {
+            case 0: return clip_l ? clip_l->get_params_backend() : nullptr;
+            case 1: return clip_g ? clip_g->get_params_backend() : nullptr;
+            case 2: return t5 ? t5->get_params_backend() : nullptr;
+            default: return nullptr;
+        }
+    }
+
+    // Returns the runtime backend of the encoder in slot `index`
+    // (0 = clip_l, 1 = clip_g, 2 = t5). Yields nullptr for any other index
+    // or when that encoder pointer is unset.
+    ggml_backend_t get_runtime_backend_at_index(int index) override {
+        switch (index) {
+            case 0: return clip_l ? clip_l->get_runtime_backend() : nullptr;
+            case 1: return clip_g ? clip_g->get_runtime_backend() : nullptr;
+            case 2: return t5 ? t5->get_runtime_backend() : nullptr;
+            default: return nullptr;
+        }
+    }
 };
 
 struct FluxCLIPEmbedder : public Conditioner {
@@ -1190,19 +1271,19 @@ struct FluxCLIPEmbedder : public Conditioner {
         bool use_clip_l = false;
         bool use_t5     = false;
 
+        model_count = 2;
 
         ggml_backend_t clip_l_backend, t5_backend;
         if (backends.size() == 1) {
             clip_l_backend = t5_backend = backends[0];
         } else if (backends.size() >= 2) {
             clip_l_backend = backends[0];
-            t5_backend = backends[1];
+            t5_backend     = backends[1];
             if (backends.size() > 2) {
                 LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
             }
         }
 
-
         for (auto pair : tensor_storage_map) {
             if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                 use_clip_l = true;
@@ -1468,6 +1549,36 @@ struct FluxCLIPEmbedder : public Conditioner {
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
     }
+
+    // Tells whether tensor `name` belongs to the text encoder in slot `index`
+    // (0 = clip_l, 1 = t5xxl); any other index matches nothing.
+    bool is_cond_stage_model_name_at_index(const std::string& name, int index) override {
+        switch (index) {
+            case 0: return contains(name, "text_encoders.clip_l");
+            case 1: return contains(name, "text_encoders.t5xxl");
+            default: return false;
+        }
+    }
+
+    // Returns the params backend of the encoder in slot `index` (0 = clip_l, 1 = t5).
+    // Yields nullptr for any other index or when that encoder pointer is unset.
+    ggml_backend_t get_params_backend_at_index(int index) override {
+        switch (index) {
+            case 0: return clip_l ? clip_l->get_params_backend() : nullptr;
+            case 1: return t5 ? t5->get_params_backend() : nullptr;
+            default: return nullptr;
+        }
+    }
+
+    // Returns the runtime backend of the encoder in slot `index` (0 = clip_l, 1 = t5).
+    // Yields nullptr for any other index or when that encoder pointer is unset.
+    ggml_backend_t get_runtime_backend_at_index(int index) override {
+        switch (index) {
+            case 0: return clip_l ? clip_l->get_runtime_backend() : nullptr;
+            case 1: return t5 ? t5->get_runtime_backend() : nullptr;
+            default: return nullptr;
+        }
+    }
 };
 
 struct T5CLIPEmbedder : public Conditioner {
@@ -1691,6 +1802,20 @@ struct T5CLIPEmbedder : public Conditioner {
                                             conditioner_params.clip_skip,
                                             conditioner_params.zero_out_masked);
     }
+
+    // Returns the params backend reported by `t5`, or nullptr when `t5` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_params_backend_at_index(int /*index*/) override {
+        return t5 ? t5->get_params_backend() : nullptr;
+    }
+
+    // Returns the runtime backend reported by `t5`, or nullptr when `t5` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_runtime_backend_at_index(int /*index*/) override {
+        return t5 ? t5->get_runtime_backend() : nullptr;
+    }
 };
 
 struct AnimaConditioner : public Conditioner {
@@ -1703,11 +1828,11 @@ struct AnimaConditioner : public Conditioner {
                      const String2TensorStorage& tensor_storage_map = {}) {
         qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
         llm            = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
-                                               backend,
-                                               offload_params_to_cpu,
-                                               tensor_storage_map,
-                                               "text_encoders.llm",
-                                               false);
+                                                          backend,
+                                                          offload_params_to_cpu,
+                                                          tensor_storage_map,
+                                                          "text_encoders.llm",
+                                                          false);
     }
 
     void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
@@ -1827,6 +1952,20 @@ struct AnimaConditioner : public Conditioner {
 
         return {hidden_states, t5_weight_tensor, t5_ids_tensor};
     }
+
+    // Returns the params backend reported by `llm`, or nullptr when `llm` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_params_backend_at_index(int /*index*/) override {
+        return llm ? llm->get_params_backend() : nullptr;
+    }
+
+    // Returns the runtime backend reported by `llm`, or nullptr when `llm` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_runtime_backend_at_index(int /*index*/) override {
+        return llm ? llm->get_runtime_backend() : nullptr;
+    }
 };
 
 struct LLMEmbedder : public Conditioner {
@@ -2201,6 +2340,20 @@ struct LLMEmbedder : public Conditioner {
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
         return {hidden_states, nullptr, nullptr, extra_hidden_states_vec};
     }
+
+    // Returns the params backend reported by `llm`, or nullptr when `llm` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_params_backend_at_index(int /*index*/) override {
+        return llm ? llm->get_params_backend() : nullptr;
+    }
+
+    // Returns the runtime backend reported by `llm`, or nullptr when `llm` is unset.
+    // The index argument is not consulted: the same backend is returned for
+    // every slot.
+    ggml_backend_t get_runtime_backend_at_index(int /*index*/) override {
+        return llm ? llm->get_runtime_backend() : nullptr;
+    }
 };
 
 #endif
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -1637,14 +1637,6 @@ class StableDiffusionGGML {
         for (auto& kv : lora_state_diff) {
             bool applied = false;
             int64_t t0 = ggml_time_ms();
-            // TODO: Fix that
-            bool are_clip_backends_similar = true;
-            for (auto backend: clip_backends){
-                are_clip_backends_similar = are_clip_backends_similar && (clip_backends[0]==backend || ggml_backend_is_cpu(backend));
-            }
-            if(!are_clip_backends_similar){
-                LOG_WARN("Text encoders are running on different backends. This may cause issues when immediately applying LoRAs.");
-            }
             auto lora_tensor_filter_diff = [&](const std::string& tensor_name) {
                 if (is_diffusion_model_name(tensor_name)) {
                     return true;
@@ -1660,19 +1652,22 @@ class StableDiffusionGGML {
                 applied = true;
             }
 
-            auto lora_tensor_filter_cond = [&](const std::string& tensor_name) {
-                if (is_cond_stage_model_name(tensor_name)) {
-                    return true;
+            for (int i = 0; i < cond_stage_model->model_count; i++) {
+                auto lora_tensor_filter_cond = [&](const std::string& tensor_name) {
+                    if (is_cond_stage_model_name(tensor_name)) {
+                        return cond_stage_model->is_cond_stage_model_name_at_index(tensor_name, i);
+                    }
+                    return false;
+                };
+                // TODO: split by model
+                LOG_INFO("applying lora to text encoder (%d)", i);
+                auto backend = cond_stage_model->get_params_backend_at_index(i);
+                lora         = load_lora_model_from_file(kv.first, kv.second, backend, lora_tensor_filter_cond);
+                if (lora && !lora->lora_tensors.empty()) {
+                    lora->apply(tensors, version, n_threads);
+                    lora->free_params_buffer();
+                    applied = true;
                 }
-                return false;
-            };
-            // TODO: split by model
-            LOG_INFO("applying lora to text encoders");
-            lora = load_lora_model_from_file(kv.first, kv.second, clip_backends[0], lora_tensor_filter_cond);
-            if (lora && !lora->lora_tensors.empty()) {
-                lora->apply(tensors, version, n_threads);
-                lora->free_params_buffer();
-                applied = true;
             }
 
             auto lora_tensor_filter_first = [&](const std::string& tensor_name) {
@@ -1734,22 +1729,27 @@ class StableDiffusionGGML {
                 }
             }
             cond_stage_lora_models  = lora_models;
-            auto lora_tensor_filter = [&](const std::string& tensor_name) {
-                if (is_cond_stage_model_name(tensor_name)) {
-                    return true;
-                }
-                return false;
-            };
-            for (auto& kv : lora_state_diff) {
-                const std::string& lora_id = kv.first;
-                float multiplier           = kv.second;
-                //TODO: split by model
-                auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backends[0], lora_tensor_filter);
-                if (lora && !lora->lora_tensors.empty()) {
-                    lora->preprocess_lora_tensors(tensors);
-                    cond_stage_lora_models.push_back(lora);
+
+            
+            for(int i=0;i<cond_stage_model->model_count;i++){
+                auto lora_tensor_filter_cond = [&](const std::string& tensor_name) {
+                    if (is_cond_stage_model_name(tensor_name)) {
+                        return cond_stage_model->is_cond_stage_model_name_at_index(tensor_name, i);
+                    }
+                    return false;
+                };
+                for (auto& kv : lora_state_diff) {
+                    const std::string& lora_id = kv.first;
+                    float multiplier           = kv.second;
+                    auto backend = cond_stage_model->get_runtime_backend_at_index(i);
+                    auto lora = load_lora_model_from_file(kv.first, kv.second, backend, lora_tensor_filter_cond);
+                    if (lora && !lora->lora_tensors.empty()) {
+                        lora->preprocess_lora_tensors(tensors);
+                        cond_stage_lora_models.push_back(lora);
+                    }
                 }
             }
+
             auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(cond_stage_lora_models);
             cond_stage_model->set_weight_adapter(multi_lora_adapter);
         }