diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md index 3174b18f8..7aa8fbede 100644 --- a/docs/distilled_sd.md +++ b/docs/distilled_sd.md @@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True) ```bash python convert_diffusers_to_original_stable_diffusion.py \ --model_path ./segmindtiny-sd \ - --checkpoint_path ./segmind_tiny-sd.ckpt --half + --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors ``` -The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. +The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. -##### Another available .ckpt file: - - * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt - -To use this file, you must first adjust its non-contiguous tensors: - -```python -import torch -ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu')) -for key, value in ckpt['state_dict'].items(): - if isinstance(value, torch.Tensor): - ckpt['state_dict'][key] = value.contiguous() -torch.save(ckpt, "tinySDdistilled_fixed.ckpt") -``` - - -### SDXS-512 +### SDXS-512-DreamShaper Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part. +##### Some ready-to-run SDXS-512 model files are available online, such as: -##### 1. 
Download the diffusers model from Hugging Face using Python: - -```python -from diffusers import StableDiffusionPipeline -pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper") -pipe.save_pretrained(save_directory="sdxs") -``` -##### 2. Create a safetensors file - -```bash -python convert_diffusers_to_original_stable_diffusion.py \ - --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors -``` - -##### 3. Run the model as follows: +* https://huggingface.co/akleine/sdxs-512 +* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF +##### Run the model as follows: ```bash ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \ --cfg-scale 1 --steps 1 ``` +Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. + +### SDXS-512-0.9 + +Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself. +##### Download a ready-to-run file from here: + +* https://huggingface.co/akleine/sdxs-09 -Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. +For the use of this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again absolutely necessary. 
diff --git a/src/common_block.hpp b/src/common_block.hpp index 2cef389af..112a4d7a1 100644 --- a/src/common_block.hpp +++ b/src/common_block.hpp @@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock { int64_t context_dim; int64_t n_head; int64_t d_head; + bool xtra_dim = false; public: CrossAttention(int64_t query_dim, @@ -288,7 +289,11 @@ class CrossAttention : public GGMLBlock { query_dim(query_dim), context_dim(context_dim) { int64_t inner_dim = d_head * n_head; - + if (context_dim == 320 && d_head == 320) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + xtra_dim = true; + context_dim = 1024; + } blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false)); blocks["to_k"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); blocks["to_v"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); @@ -313,10 +318,16 @@ class CrossAttention : public GGMLBlock { int64_t n_context = context->ne[1]; int64_t inner_dim = d_head * n_head; - auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + if (xtra_dim) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + context->ne[0] = 1024; // patch dim + } auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] - + if (xtra_dim) { + context->ne[0] = 320; // reset dim to orig + } x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] diff --git a/src/model.cpp b/src/model.cpp index d23b97fac..f443f96bf 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1042,6 +1042,7 @@ SDVersion ModelLoader::get_sd_version() { bool has_middle_block_1 = false; bool has_output_block_311 = false; bool has_output_block_71 = false; + bool has_attn_1024 = false; for (auto& [name, 
tensor_storage] : tensor_storage_map) { if (!(is_xl)) { @@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos || tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) { has_output_block_71 = true; + if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) { + if (tensor_storage.ne[0] == 1024) + has_attn_1024 = true; + } } if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || tensor_storage.name == "cond_stage_model.model.token_embedding.weight" || @@ -1184,7 +1189,7 @@ SDVersion ModelLoader::get_sd_version() { } if (!has_middle_block_1) { if (!has_output_block_71) { - return VERSION_SDXS; + return VERSION_SDXS_512_DS; } return VERSION_SD1_TINY_UNET; } @@ -1194,7 +1199,7 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_SD2_INPAINT; } if (!has_middle_block_1) { - return VERSION_SD2_TINY_UNET; + return has_attn_1024 ? 
VERSION_SDXS_09 : VERSION_SD2_TINY_UNET; } return VERSION_SD2; } diff --git a/src/model.h b/src/model.h index 3af35eb7e..c3f43f0a7 100644 --- a/src/model.h +++ b/src/model.h @@ -28,7 +28,8 @@ enum SDVersion { VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SD2_TINY_UNET, - VERSION_SDXS, + VERSION_SDXS_512_DS, + VERSION_SDXS_09, VERSION_SDXL, VERSION_SDXL_INPAINT, VERSION_SDXL_PIX2PIX, @@ -54,14 +55,14 @@ enum SDVersion { }; static inline bool sd_version_is_sd1(SDVersion version) { - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) { return true; } return false; } static inline bool sd_version_is_sd2(SDVersion version) { - if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) { + if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) { return true; } return false; diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index d5d5e052c..618c7f6e9 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) { for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { name = "tae." 
+ name; } else { name = prefix + name; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index bbf2f979d..5a022ddef 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -33,7 +33,8 @@ const char* model_version_to_str[] = { "SD 2.x", "SD 2.x Inpaint", "SD 2.x Tiny UNet", - "SDXS", + "SDXS (512-DS)", + "SDXS (09)", "SDXL", "SDXL Inpaint", "SDXL Instruct-Pix2Pix", @@ -789,7 +790,7 @@ class StableDiffusionGGML { } bool tae_preview_only = sd_ctx_params->tae_preview_only; - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { tae_preview_only = false; use_tae = true; } @@ -811,8 +812,8 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -860,10 +861,10 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -872,10 +873,10 @@ class StableDiffusionGGML { 1, true); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, @@ -904,29 +905,29 @@ class StableDiffusionGGML { "", enable_vision); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - 
tensor_storage_map, - "model.diffusion_model", - version, - sd_ctx_params->qwen_image_zero_cond_t); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version, + sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model"); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { diff --git a/src/unet.hpp b/src/unet.hpp index f7aa3f05d..6a333e66f 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -217,11 +217,11 @@ class UnetModelBlock : public GGMLBlock { } else if (sd_version_is_unet_edit(version)) { in_channels = 8; } - if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) { + if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { num_res_blocks = 1; channel_mult = {1, 2, 4}; tiny_unet = true; - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS) { attention_resolutions = {4, 2}; // here just like SDXL } } @@ -264,6 +264,10 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection); } else { + if (version == VERSION_SDXS_09 && n_head == 5) { + n_head = 1; 
// to carry a special case of sdxs_09 into CrossAttentionLayer, + d_head = 320; // works as long as the product remains equal (5*64 == 1*320) + } return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection); } };