diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md index 3174b18f8..7aa8fbede 100644 --- a/docs/distilled_sd.md +++ b/docs/distilled_sd.md @@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True) ```bash python convert_diffusers_to_original_stable_diffusion.py \ --model_path ./segmindtiny-sd \ - --checkpoint_path ./segmind_tiny-sd.ckpt --half + --checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors ``` -The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. +The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above. -##### Another available .ckpt file: - - * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt - -To use this file, you must first adjust its non-contiguous tensors: - -```python -import torch -ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu')) -for key, value in ckpt['state_dict'].items(): - if isinstance(value, torch.Tensor): - ckpt['state_dict'][key] = value.contiguous() -torch.save(ckpt, "tinySDdistilled_fixed.ckpt") -``` - - -### SDXS-512 +### SDXS-512-DreamShaper Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part. +##### Some ready-to-run SDXS-512 model files are available online, such as: -##### 1. 
Download the diffusers model from Hugging Face using Python: - -```python -from diffusers import StableDiffusionPipeline -pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper") -pipe.save_pretrained(save_directory="sdxs") -``` -##### 2. Create a safetensors file - -```bash -python convert_diffusers_to_original_stable_diffusion.py \ - --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors -``` - -##### 3. Run the model as follows: +* https://huggingface.co/akleine/sdxs-512 +* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF +##### Run the model as follows: ```bash ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \ --cfg-scale 1 --steps 1 ``` +Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. + +### SDXS-512-0.9 + +Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself. +##### Download a ready-to-run file from here: + +* https://huggingface.co/akleine/sdxs-09 -Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here. +For the use of this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again absolutely necessary. 
diff --git a/src/common_block.hpp b/src/common_block.hpp index 2cef389af..112a4d7a1 100644 --- a/src/common_block.hpp +++ b/src/common_block.hpp @@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock { int64_t context_dim; int64_t n_head; int64_t d_head; + bool xtra_dim = false; public: CrossAttention(int64_t query_dim, @@ -288,7 +289,11 @@ class CrossAttention : public GGMLBlock { query_dim(query_dim), context_dim(context_dim) { int64_t inner_dim = d_head * n_head; - + if (context_dim == 320 && d_head == 320) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + xtra_dim = true; + context_dim = 1024; + } blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false)); blocks["to_k"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); blocks["to_v"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); @@ -313,10 +318,16 @@ class CrossAttention : public GGMLBlock { int64_t n_context = context->ne[1]; int64_t inner_dim = d_head * n_head; - auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + if (xtra_dim) { + // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09"); + context->ne[0] = 1024; // patch dim + } auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] - + if (xtra_dim) { + context->ne[0] = 320; // reset dim to orig + } x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] diff --git a/src/model.cpp b/src/model.cpp index d23b97fac..f443f96bf 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -1042,6 +1042,7 @@ SDVersion ModelLoader::get_sd_version() { bool has_middle_block_1 = false; bool has_output_block_311 = false; bool has_output_block_71 = false; + bool has_attn_1024 = false; for (auto& [name, 
tensor_storage] : tensor_storage_map) { if (!(is_xl)) { @@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos || tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) { has_output_block_71 = true; + if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) { + if (tensor_storage.ne[0] == 1024) + has_attn_1024 = true; + } } if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || tensor_storage.name == "cond_stage_model.model.token_embedding.weight" || @@ -1184,7 +1189,7 @@ SDVersion ModelLoader::get_sd_version() { } if (!has_middle_block_1) { if (!has_output_block_71) { - return VERSION_SDXS; + return VERSION_SDXS_512_DS; } return VERSION_SD1_TINY_UNET; } @@ -1194,7 +1199,7 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_SD2_INPAINT; } if (!has_middle_block_1) { - return VERSION_SD2_TINY_UNET; + return has_attn_1024 ? 
VERSION_SDXS_09 : VERSION_SD2_TINY_UNET; } return VERSION_SD2; } diff --git a/src/model.h b/src/model.h index 3af35eb7e..c3f43f0a7 100644 --- a/src/model.h +++ b/src/model.h @@ -28,7 +28,8 @@ enum SDVersion { VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SD2_TINY_UNET, - VERSION_SDXS, + VERSION_SDXS_512_DS, + VERSION_SDXS_09, VERSION_SDXL, VERSION_SDXL_INPAINT, VERSION_SDXL_PIX2PIX, @@ -54,14 +55,14 @@ enum SDVersion { }; static inline bool sd_version_is_sd1(SDVersion version) { - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) { return true; } return false; } static inline bool sd_version_is_sd2(SDVersion version) { - if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) { + if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) { return true; } return false; diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index d5d5e052c..618c7f6e9 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) { for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { name = "tae." 
+ name; } else { name = prefix + name; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index bbf2f979d..5a022ddef 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -33,7 +33,8 @@ const char* model_version_to_str[] = { "SD 2.x", "SD 2.x Inpaint", "SD 2.x Tiny UNet", - "SDXS", + "SDXS (512-DS)", + "SDXS (09)", "SDXL", "SDXL Inpaint", "SDXL Instruct-Pix2Pix", @@ -789,7 +790,7 @@ class StableDiffusionGGML { } bool tae_preview_only = sd_ctx_params->tae_preview_only; - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { tae_preview_only = false; use_tae = true; } @@ -811,8 +812,8 @@ class StableDiffusionGGML { offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -860,10 +861,10 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -872,10 +873,10 @@ class StableDiffusionGGML { 1, true); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, @@ -904,29 +905,29 @@ class StableDiffusionGGML { "", enable_vision); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - 
tensor_storage_map, - "model.diffusion_model", - version, - sd_ctx_params->qwen_image_zero_cond_t); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version, + sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model"); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { diff --git a/src/unet.hpp b/src/unet.hpp index f7aa3f05d..6a333e66f 100644 --- a/src/unet.hpp +++ b/src/unet.hpp @@ -217,11 +217,11 @@ class UnetModelBlock : public GGMLBlock { } else if (sd_version_is_unet_edit(version)) { in_channels = 8; } - if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) { + if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) { num_res_blocks = 1; channel_mult = {1, 2, 4}; tiny_unet = true; - if (version == VERSION_SDXS) { + if (version == VERSION_SDXS_512_DS) { attention_resolutions = {4, 2}; // here just like SDXL } } @@ -264,6 +264,10 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection); } else { + if (version == VERSION_SDXS_09 && n_head == 5) { + n_head = 1; 
// to carry a special case of sdxs_09 into CrossAttentionLayer, + d_head = 320; // works as long as the product remains equal (5*64 == 1*320) + } return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection); } };