mindspore-lab · Dong1017 · Aug 15, 2025 · Aug 15, 2025 · Aug 18, 2025 · Aug 18, 2025
@@ -0,0 +1,26 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLQwenImage
+
+The model can be loaded with the following code snippet.
+
+```python
+from mindone.diffusers import AutoencoderKLQwenImage
+
+vae = AutoencoderKLQwenImage.from_pretrained("Qwen/Qwen-Image", subfolder="vae")
+```
+
+::: mindone.diffusers.AutoencoderKLQwenImage
+
+::: mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+::: mindone.diffusers.models.autoencoders.vae.DecoderOutput
@@ -0,0 +1,24 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# QwenImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+import mindspore
+from mindone.diffusers import QwenImageTransformer2DModel
+
+transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", mindspore_dtype=mindspore.bfloat16)
+```
+
+::: mindone.diffusers.QwenImageTransformer2DModel
+
+::: mindone.diffusers.models.modeling_outputs.Transformer2DModelOutput
@@ -0,0 +1,42 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# QwenImage
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
+
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
+
+!!! Tip
+
+    [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
+
+::: mindone.diffusers.QwenImagePipeline
+
+::: mindone.diffusers.pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
+
+::: mindone.diffusers.QwenImageImg2ImgPipeline
+
+::: mindone.diffusers.QwenImageInpaintPipeline
@@ -29,6 +29,7 @@
         "AutoencoderKLLTXVideo",
         "AutoencoderKLMagvit",
         "AutoencoderKLMochi",
+        "AutoencoderKLQwenImage",
         "AutoencoderKLTemporalDecoder",
         "AutoencoderKLWan",
         "AutoencoderOobleck",
@@ -68,6 +69,7 @@
         "OmniGenTransformer2DModel",
         "PixArtTransformer2DModel",
         "PriorTransformer",
+        "QwenImageTransformer2DModel",
         "SanaControlNetModel",
         "SanaTransformer2DModel",
         "SD3ControlNetModel",
@@ -222,6 +224,11 @@
         "PixArtAlphaPipeline",
         "PixArtSigmaPAGPipeline",
         "PixArtSigmaPipeline",
+        "QwenImageImg2ImgPipeline",
+        "QwenImageInpaintPipeline",
+        "QwenImagePipeline",
+        "QwenImageEditPipeline",
+        "QwenImageEditInpaintPipeline",
         "ReduxImageEncoder",
         "SanaControlNetPipeline",
         "SanaPAGPipeline",
@@ -375,6 +382,7 @@
         AutoencoderKLLTXVideo,
         AutoencoderKLMagvit,
         AutoencoderKLMochi,
+        AutoencoderKLQwenImage,
         AutoencoderKLTemporalDecoder,
         AutoencoderKLWan,
         AutoencoderOobleck,
@@ -414,6 +422,7 @@
         OmniGenTransformer2DModel,
         PixArtTransformer2DModel,
         PriorTransformer,
+        QwenImageTransformer2DModel,
         SanaControlNetModel,
         SanaTransformer2DModel,
         SD3ControlNetModel,
@@ -567,6 +576,11 @@
         PixArtAlphaPipeline,
         PixArtSigmaPAGPipeline,
         PixArtSigmaPipeline,
+        QwenImageEditPipeline,
+        QwenImageEditInpaintPipeline,
+        QwenImageImg2ImgPipeline,
+        QwenImageInpaintPipeline,
+        QwenImagePipeline,
         ReduxImageEncoder,
         SanaControlNetPipeline,
         SanaPAGPipeline,

@@ -72,11 +72,12 @@ def text_encoder_attn_modules(text_encoder):
         "CogView4LoraLoaderMixin",
         "Mochi1LoraLoaderMixin",
         "HunyuanVideoLoraLoaderMixin",
+        "QwenImageLoraLoaderMixin",
         "SanaLoraLoaderMixin",
         "Lumina2LoraLoaderMixin",
         "WanLoraLoaderMixin",
         "HiDreamImageLoraLoaderMixin",
-        "SkyReelsV2LoraLoaderMixin",
+        "SkyReelsV2LoraLoaderMixin",     
     ],
     "peft": ["PeftAdapterMixin"],
     "single_file": ["FromSingleFileMixin"],
@@ -99,6 +100,7 @@ def text_encoder_attn_modules(text_encoder):
         LTXVideoLoraLoaderMixin,
         Lumina2LoraLoaderMixin,
         Mochi1LoraLoaderMixin,
+        QwenImageLoraLoaderMixin,
         SanaLoraLoaderMixin,
         SD3LoraLoaderMixin,
         SkyReelsV2LoraLoaderMixin,
@@ -116,4 +118,4 @@ def text_encoder_attn_modules(text_encoder):
 else:
     import sys
 
-    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/mindone/diffusers/loaders/lora_conversion_utils.py b/mindone/diffusers/loaders/lora_conversion_utils.py
@@ -1920,3 +1920,38 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
     converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
     converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
     return converted_state_dict
+
+def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
+    converted_state_dict = {}
+    all_keys = list(state_dict.keys())
+    down_key = ".lora_down.weight"
+    up_key = ".lora_up.weight"
+
+    def get_alpha_scales(down_weight, alpha_key):
+        rank = down_weight.shape[0]
+        alpha = state_dict.pop(alpha_key).item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up
+
+    for k in all_keys:
+        if k.endswith(down_key):
+            diffusers_down_key = k.replace(down_key, ".lora_A.weight")
+            diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
+            alpha_key = k.replace(down_key, ".alpha")
+
+            down_weight = state_dict.pop(k)
+            up_weight = state_dict.pop(k.replace(down_key, up_key))
+            scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+            converted_state_dict[diffusers_down_key] = down_weight * scale_down
+            converted_state_dict[diffusers_up_key] = up_weight * scale_up
+
+    if len(state_dict) > 0:
+        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+
+    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
+    return converted_state_dict