12 changes: 9 additions & 3 deletions common.hpp
@@ -3,6 +3,10 @@

#include "ggml_extend.hpp"

#ifdef SD_USE_VULKAN
#include "ggml-vulkan.h"
#endif

class DownSampleBlock : public GGMLBlock {
protected:
int channels;
@@ -248,9 +252,6 @@ class FeedForward : public GGMLBlock {
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
@@ -264,6 +265,11 @@

auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
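// On the Vulkan backend, force the final Linear (net.2) to f32; together with
// the 1/128 scale this prevents the NaN issues noted above.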
#ifdef SD_USE_VULKAN
if (ggml_backend_is_vk(ctx->backend)) {
net_2->set_force_prec_f32(true);
}
#endif

x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
76 changes: 64 additions & 12 deletions conditioner.hpp
@@ -2,8 +2,11 @@
#define __CONDITIONER_HPP__

#include "clip.hpp"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "llm.hpp"
#include "t5.hpp"
#include "util.h"

struct SDCondition {
struct ggml_tensor* c_crossattn = nullptr; // aka context
@@ -62,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<uint8_t> token_embed_custom;
std::map<std::string, std::pair<int, int>> embedding_pos_map;

FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
FrozenCLIPEmbedderWithCustomWords(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::map<std::string, std::string>& orig_embedding_map,
@@ -76,13 +79,27 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
tokenizer.add_special_token(name);
}
bool force_clip_f32 = !embedding_map.empty();

ggml_backend_t clip_backend = backends[0];

if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
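// SDXL has two text encoders; give CLIP-G its own backend when a second one is supplied.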
ggml_backend_t clip_g_backend = clip_backend;
if (backends.size() >= 2) {
clip_g_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend));
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
text_model = std::make_shared<CLIPTextModelRunner>(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
text_model2 = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}

@@ -702,13 +719,29 @@ struct SD3CLIPEmbedder : public Conditioner {
std::shared_ptr<CLIPTextModelRunner> clip_g;
std::shared_ptr<T5Runner> t5;

SD3CLIPEmbedder(ggml_backend_t backend,
SD3CLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
bool use_clip_l = false;
bool use_clip_g = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, clip_g_backend, t5_backend;
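// Fan the backends out across the three text encoders: one backend is shared by
// all of them, two puts both CLIP models on the first with T5 on the second, and
// three or more gives each encoder its own.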
if (backends.size() == 1) {
clip_l_backend = clip_g_backend = t5_backend = backends[0];
} else if (backends.size() == 2) {
clip_l_backend = clip_g_backend = backends[0];
t5_backend = backends[1];
} else if (backends.size() >= 3) {
clip_l_backend = backends[0];
clip_g_backend = backends[1];
t5_backend = backends[2];
if (backends.size() > 3) {
LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -723,13 +756,16 @@
return;
}
if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
}
if (use_clip_g) {
clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend));
clip_g = std::make_shared<CLIPTextModelRunner>(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
}
}

@@ -1123,11 +1159,25 @@ struct FluxCLIPEmbedder : public Conditioner {
std::shared_ptr<T5Runner> t5;
size_t chunk_len = 256;

FluxCLIPEmbedder(ggml_backend_t backend,
FluxCLIPEmbedder(std::vector<ggml_backend_t> backends,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}) {
bool use_clip_l = false;
bool use_t5 = false;

ggml_backend_t clip_l_backend, t5_backend;
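// One backend: CLIP-L and T5 share it. Two or more: CLIP-L takes the first, T5 the second.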
if (backends.size() == 1) {
clip_l_backend = t5_backend = backends[0];
} else if (backends.size() >= 2) {
clip_l_backend = backends[0];
t5_backend = backends[1];
if (backends.size() > 2) {
LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. Ignoring the rest.");
}
}

for (auto pair : tensor_storage_map) {
if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
use_clip_l = true;
@@ -1142,12 +1192,14 @@
}

if (use_clip_l) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend));
clip_l = std::make_shared<CLIPTextModelRunner>(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
} else {
LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
}
if (use_t5) {
t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(clip_l_backend));
t5 = std::make_shared<T5Runner>(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
} else {
LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
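For context, a minimal caller-side sketch of the new multi-backend constructors (a hypothetical wiring; `gpu_backend` and `tensor_storage_map` are assumed to be created elsewhere, and `ggml_backend_cpu_init()` is ggml's stock CPU backend):

// Put both CLIP encoders on the GPU backend and run T5-XXL on the CPU.
std::vector<ggml_backend_t> backends;
backends.push_back(gpu_backend);             // assumed: e.g. a Vulkan or CUDA backend
backends.push_back(ggml_backend_cpu_init()); // with two backends, T5 gets the second
SD3CLIPEmbedder embedder(backends, /*offload_params_to_cpu=*/false, tensor_storage_map);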
38 changes: 35 additions & 3 deletions examples/cli/main.cpp
@@ -46,6 +46,7 @@ struct SDCliParams {
bool color = false;

bool normal_exit = false;
bool skip_usage = false;

ArgOptions get_options() {
ArgOptions options;
@@ -143,7 +144,27 @@

auto on_help_arg = [&](int argc, const char** argv, int index) {
normal_exit = true;
return -1;
return VALID_BREAK_OPT;
};

auto on_rpc_arg = [&](int argc, const char** argv, int index) {
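// Consume the next argv entry as the RPC endpoint; -1 reports a missing argument.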
if (++index >= argc) {
return -1;
}
const char* rpc_device = argv[index];
add_rpc_device(rpc_device);
return 1;
};

auto on_list_devices_arg = [&](int argc, const char** argv, int index) {
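// Size-then-fill: query the required buffer size, then write the device list into it.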
size_t buff_size = backend_list_size();
char* buff = (char*)malloc(buff_size);
list_backends_to_buffer(buff, buff_size);
printf("List of available GGML devices:\nName\tDescription\n-------------------\n%s\n", buff);
free(buff);
normal_exit = true;
skip_usage = true;
return VALID_BREAK_OPT;
};

options.manual_options = {
@@ -159,6 +180,14 @@
"--help",
"show this help message and exit",
on_help_arg},
{"",
"--rpc",
"add a rpc device",
on_rpc_arg},
{"",
"--list-devices",
"list available ggml compute devices",
on_list_devices_arg},
};
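// Example invocations (hypothetical binary name; host:port follows ggml's RPC convention):
//   sd --list-devices
//   sd --rpc 192.168.1.10:50052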

return options;
@@ -213,7 +242,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
std::vector<ArgOptions> options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()};

if (!parse_options(argc, argv, options_vec)) {
print_usage(argc, argv, options_vec);
if (!cli_params.skip_usage) {
print_usage(argc, argv, options_vec);
}
exit(cli_params.normal_exit ? 0 : 1);
}

@@ -783,7 +814,8 @@ int main(int argc, const char* argv[]) {
ctx_params.offload_params_to_cpu,
ctx_params.diffusion_conv_direct,
ctx_params.n_threads,
gen_params.upscale_tile_size);
gen_params.upscale_tile_size,
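// backend device for the upscaler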
ctx_params.upscaler_backend_device.c_str());

if (upscaler_ctx == nullptr) {
LOG_ERROR("new_upscaler_ctx failed");