12 changes: 9 additions & 3 deletions common.hpp
@@ -3,6 +3,10 @@

#include "ggml_extend.hpp"

#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif

class DownSampleBlock : public GGMLBlock {
protected:
int channels;
@@ -248,9 +252,6 @@ class FeedForward : public GGMLBlock {
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
@@ -264,6 +265,11 @@

auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
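// On the Vulkan backend, force the final Linear (net.2) to f32; together with
// the 1/128 scale this prevents the NaN issues noted above.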
#ifdef SD_USE_VULKAN
if (ggml_backend_is_vk(ctx->backend)) {
net_2->set_force_prec_f32(true);
}
#endif

x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
76 changes: 64 additions & 12 deletions conditioner.hpp
@@ -2,8 +2,11 @@
#define __CONDITIONER_HPP__

#include "clip.hpp"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "llm.hpp"
#include "t5.hpp"
#include "util.h"

struct SDCondition {
struct ggml_tensor* c_crossattn = nullptr; // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<uint8_t> token_embed_custom;
std::map<std::string, std::pair<int, int>> embedding_pos_map;

FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
FrozenCLIPEmbedderWithCustomWords(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
tokenizer.add_special_token(name);
}
bool force_clip_f32 = !embedding_map.empty();

ggml_backend_t clip_backend = backends[0];

if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
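// SDXL has two text encoders; give CLIP-G its own backend when a second one is supplied.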
ggml_backend_t clip_g_backend = clip_backend;
if (backends.size() >= 2) {
clip_g_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}

@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
std::shared_ptr<CLIPTextModelRunner> clip_g;
std::shared_ptr<T5Runner> t5;

SD3CLIPEmbedder(ggml_backend_t backend,
SD3CLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
bool use_clip_l = false;
bool use_clip_g = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
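// Fan the backends out across the three text encoders: one backend is shared by
// all of them, two puts both CLIP models on the first with T5 on the second, and
// three or more gives each encoder its own.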
if (backends.size() == 1) {
clip_l_backend = clip_g_backend = t5_backend = backends[0];
} else if (backends.size() == 2) {
clip_l_backend = clip_g_backend = backends[0];
t5_backend = backends[1];
} else if (backends.size() >= 3) {
clip_l_backend = backends[0];
clip_g_backend = backends[1];
t5_backend = backends[2];
if (backends.size() > 3) {
LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -723,13 +756,16 @@
return;
}
if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
}
if (use_clip_g) {
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
}
}

@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
std::shared_ptr<T5Runner> t5;
size_t chunk_len = 256;

FluxCLIPEmbedder(ggml_backend_t backend,
FluxCLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}) {
bool use_clip_l = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, t5_backend;
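// One backend: CLIP-L and T5 share it. Two or more: CLIP-L takes the first, T5 the second.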
if (backends.size() == 1) {
clip_l_backend = t5_backend = backends[0];
} else if (backends.size() >= 2) {
clip_l_backend = backends[0];
t5_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -1142,12 +1192,14 @@
}

if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(clip_l_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
} else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
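For context, a minimal caller-side sketch of the new multi-backend constructors (a hypothetical wiring; `gpu_backend` and `tensor_storage_map` are assumed to be created elsewhere, and `ggml_backend_cpu_init()` is ggml's stock CPU backend):

// Put both CLIP encoders on the GPU backend and run T5-XXL on the CPU.
std::vector<ggml_backend_t> backends;
backends.push_back(gpu_backend);             // assumed: e.g. a Vulkan or CUDA backend
backends.push_back(ggml_backend_cpu_init()); // with two backends, T5 gets the second
SD3CLIPEmbedder embedder(backends, /*offload_params_to_cpu=*/false, tensor_storage_map);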
38 changes: 35 additions & 3 deletions examples/cli/main.cpp
@@ -46,6 +46,7 @@ struct SDCliParams {
bool color = false;

bool normal_exit = false;
bool skip_usage = false;

ArgOptions get_options() {
ArgOptions options;
@@ -143,7 +144,27 @@

auto on_help_arg = [&](int argc, const char** argv, int index) {
normal_exit = true;
return -1;
return VALID_BREAK_OPT;
};

auto on_rpc_arg = [&](int argc, const char** argv, int index) {
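// Consume the next argv entry as the RPC endpoint; -1 reports a missing argument.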
if (++index >= argc) {
return -1;
}
const char* rpc_device = argv[index];
add_rpc_device(rpc_device);
return 1;
};

auto on_list_devices_arg = [&](int argc, const char** argv, int index) {
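// Size-then-fill: query the required buffer size, then write the device list into it.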
size_t buff_size = backend_list_size();
char* buff = (char*)malloc(buff_size);
list_backends_to_buffer(buff, buff_size);
printf("List of available GGML devices:\nName\tDescription\n-------------------\n%s\n", buff);
free(buff);
normal_exit = true;
skip_usage = true;
return VALID_BREAK_OPT;
};

options.manual_options = {
@@ -159,6 +180,14 @@
"--help",
"show this help message and exit",
on_help_arg},
{"",
"--rpc",
"add a rpc device",
on_rpc_arg},
{"",
"--list-devices",
"list available ggml compute devices",
on_list_devices_arg},
};
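// Example invocations (hypothetical binary name; host:port follows ggml's RPC convention):
//   sd --list-devices
//   sd --rpc 192.168.1.10:50052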

return options;
@@ -213,7 +242,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
std::vector<ArgOptions> options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()};

if (!parse_options(argc, argv, options_vec)) {
print_usage(argc, argv, options_vec);
if (!cli_params.skip_usage) {
print_usage(argc, argv, options_vec);
}
exit(cli_params.normal_exit ? 0 : 1);
}

@@ -783,7 +814,8 @@ int main(int argc, const char* argv[]) {
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads,
gen_params.upscale_tile_size);
gen_params.upscale_tile_size,
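// backend device for the upscaler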
ctx_params.upscaler_backend_device.c_str());

if (upscaler_ctx == nullptr) {
LOG_ERROR("new_upscaler_ctx failed");