Skip to content
Open
Show file tree
Hide file tree
Changes from 66 commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
d3dec44
2025/08/15
Dong1017 Aug 15, 2025
6006960
2025/8/15 17:18 revised
Dong1017 Aug 15, 2025
15bc8ae
2025/8/18 10:22 revised
Dong1017 Aug 18, 2025
103db50
2025/8/18 17:00 revised
Dong1017 Aug 18, 2025
77779b5
2025/8/18 19:08 revised
Dong1017 Aug 18, 2025
0cab22b
2025/8/18 19:13 revised
Dong1017 Aug 18, 2025
2b7b4c9
2025/8/19 9:02 revised
Dong1017 Aug 19, 2025
d7eaa37
2025/8/19 9:04 revised
Dong1017 Aug 19, 2025
dddd8f2
2025/8/19 9:12 revised
Dong1017 Aug 19, 2025
3117bdc
2025/8/19 10:27 revised
Dong1017 Aug 19, 2025
e19c2e3
2025/8/20 9:22 revised
Dong1017 Aug 20, 2025
0fb127a
2025/8/20 9:247 revised
Dong1017 Aug 20, 2025
5d317bc
2025/8/20 9:48 revised
Dong1017 Aug 20, 2025
e8043d8
2025/8/20 9:52 revised
Dong1017 Aug 20, 2025
b78ef0a
2025/8/20 10:15 revised
Dong1017 Aug 20, 2025
656acce
2025/8/20 10:50 revised
Dong1017 Aug 20, 2025
9a33d83
2025/8/20 11:11 revised
Dong1017 Aug 20, 2025
c2f972c
2025/8/20 11:27 revised
Dong1017 Aug 20, 2025
9e2cccf
2025/8/20 11:47 revised
Dong1017 Aug 20, 2025
9b5be21
2025/8/20 14:25 revised
Dong1017 Aug 20, 2025
1906919
2025/8/20 14:26 revised
Dong1017 Aug 20, 2025
c3055ba
2025/8/21 15:20 revised
Dong1017 Aug 21, 2025
e025800
2025/8/21 15:24 revised
Dong1017 Aug 21, 2025
436ebf3
2025/8/21 17:08 revised
Dong1017 Aug 21, 2025
dafec1a
2025/8/21 17:57 revised
Dong1017 Aug 21, 2025
e573be1
2025/8/21 19:13 revised
Dong1017 Aug 21, 2025
d549ab2
2025/8/22 11:32 revised
Dong1017 Aug 22, 2025
09ac0bd
2025/8/22 17:40 revised
Dong1017 Aug 22, 2025
fb5877b
2025/8/25 10:40 revised
Dong1017 Aug 25, 2025
358b20b
2025/8/26 10:30 revised
Dong1017 Aug 26, 2025
721543e
2025/8/26 17:10 revised
Dong1017 Aug 26, 2025
fc02927
2025/8/26 17:20 revised
Dong1017 Aug 26, 2025
3b16c50
2025/8/27 14:08 revised
Dong1017 Aug 27, 2025
151ed25
2025/8/27 17:05 revised
Dong1017 Aug 27, 2025
46cd675
2025/8/27 17:09 revised
Dong1017 Aug 27, 2025
35b35fc
2025/8/27 17:23 revised
Dong1017 Aug 27, 2025
19c938e
2025/8/29 15:42 revised
Dong1017 Aug 29, 2025
44504e5
2025/9/1 09:18 revised
Dong1017 Sep 1, 2025
d5cfad2
2025/9/1 09:40 revised
Dong1017 Sep 1, 2025
aa08b1a
2025/9/2 14:06, img2img infer
Dong1017 Sep 2, 2025
92d1a23
2025/9/3 8:50, inpaint infer
Dong1017 Sep 3, 2025
c1998e4
2025/9/3 14:07, img2img test
Dong1017 Sep 3, 2025
ae1f7b2
2025/9/3 14:18, img2img test
Dong1017 Sep 3, 2025
be1b18b
2025/9/3 16:30, inpaint test
Dong1017 Sep 3, 2025
d6cf7e1
2025/9/4 14:21, edit bugs
Dong1017 Sep 4, 2025
dfc5e23
2025/9/4 15:58, edit ut
Dong1017 Sep 4, 2025
39168ee
2025/9/5 15:07, edit-inpaint pipe
Dong1017 Sep 5, 2025
fdfb3a3
2025/9/5 17:40, fix some bugs
Dong1017 Sep 5, 2025
97b3d8e
modified qwenimage 2025/9/15
Dong1017 Sep 15, 2025
440e226
2025/9/15 seamless_m4t submit
Dong1017 Sep 15, 2025
2e060cb
2025/9/17 seamless_m4t ut
Dong1017 Sep 17, 2025
a2a52b2
25/9/17 seamless_m4t clean
Dong1017 Sep 17, 2025
74c91f3
2025/9/17 qwenimage clean
Dong1017 Sep 17, 2025
9ec46f4
fix: remove unwanted files
Dong1017 Sep 17, 2025
69c6f6a
fix: remove unwanted files
Dong1017 Sep 17, 2025
ec98ccb
fix: keep file consistent
Dong1017 Sep 17, 2025
c2c61d3
fix: keep file consistent
Dong1017 Sep 17, 2025
ff03c7a
fix: keep file consistent
Dong1017 Sep 17, 2025
8faa7b3
revised according to gemini
Dong1017 Sep 17, 2025
f77b668
revised according to gemini
Dong1017 Sep 17, 2025
ab08ce5
fix conflicting according to gemini
Dong1017 Sep 17, 2025
a19fbe5
fix conflicting according to gemini
Dong1017 Sep 17, 2025
ab566b3
Merge branch 'master' into qwenimage
Dong1017 Sep 17, 2025
9ce5c35
required lines but conflicting
Dong1017 Sep 17, 2025
854ac0c
Merge branch 'qwenimage' of github.com:Dong1017/mindone into qwenimage
Dong1017 Sep 17, 2025
f770d50
required lines but conflicting
Dong1017 Sep 17, 2025
7d5da80
fix: md, according to Cui-yshoho
Dong1017 Sep 18, 2025
c70d315
fix a bug of qwen2_5_vl, some revisions suggested from Cui-yshoho and…
Dong1017 Sep 26, 2025
735e6af
Resolved the conflict regarding qwen2_5_vl masked_scatter-bf16-bug
Dong1017 Sep 26, 2025
237183e
Add UTs of transformer, supplement MDs, delete unused code comments
Dong1017 Sep 28, 2025
a498371
update md to notice the use of transformers==4.52.1
Dong1017 Sep 29, 2025
e73acef
fix ci problem
Dong1017 Sep 29, 2025
07b10a7
Merge branch 'master' into qwenimage
Dong1017 Sep 29, 2025
dbb8ac2
fix ci problem
Dong1017 Sep 29, 2025
d88f7e4
Merge branch 'qwenimage' of https://github.com/Dong1017/mindone into …
Dong1017 Sep 29, 2025
427961a
fix ci problem
Dong1017 Sep 29, 2025
54af7f1
fix ci problem
Dong1017 Sep 29, 2025
4eb6673
Merge branch 'master' into qwenimage
Dong1017 Sep 29, 2025
0524331
CHECK: pre-commit run --all-files
Dong1017 Sep 30, 2025
3102629
fix ci problem - strange format?
Dong1017 Sep 30, 2025
faca606
Trigger CI
Dong1017 Sep 30, 2025
c2508b9
fix ci problem - modeling_reformer
Dong1017 Sep 30, 2025
ad179cd
Merge branch 'master' into qwenimage
Dong1017 Sep 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions docs/diffusers/api/models/autoencoderkl_qwenimage.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# AutoencoderKLQwenImage

The model can be loaded with the following code snippet.

```python
from mindone.diffusers import AutoencoderKLQwenImage

vae = AutoencoderKLQwenImage.from_pretrained("Qwen/Qwen-Image", subfolder="vae")
```

::: mindone.diffusers.AutoencoderKLQwenImage

::: mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKLOutput

::: mindone.diffusers.models.autoencoders.vae.DecoderOutput
24 changes: 24 additions & 0 deletions docs/diffusers/api/models/qwenimage_transformer2d.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# QwenImageTransformer2DModel

The model can be loaded with the following code snippet.

```python
import mindspore
from mindone.diffusers import QwenImageTransformer2DModel

transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", mindspore_dtype=mindspore.bfloat16)
```

::: mindone.diffusers.QwenImageTransformer2DModel

::: mindone.diffusers.models.modeling_outputs.Transformer2DModelOutput
42 changes: 42 additions & 0 deletions docs/diffusers/api/pipelines/qwenimage.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->

# QwenImage

<div class="flex flex-wrap space-x-1">
<img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
</div>

Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.

Qwen-Image comes in the following variants:

| model type | model id |
|:----------:|:--------:|
| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |

!!! Tip

[Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.

!!! Tip

::: mindone.diffusers.QwenImagePipeline

::: mindone.diffusers.pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput

::: mindone.diffusers.QwenImageImg2ImgPipeline

::: mindone.diffusers.QwenImageInpaintPipeline
14 changes: 14 additions & 0 deletions mindone/diffusers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"AutoencoderKLLTXVideo",
"AutoencoderKLMagvit",
"AutoencoderKLMochi",
"AutoencoderKLQwenImage",
"AutoencoderKLTemporalDecoder",
"AutoencoderKLWan",
"AutoencoderOobleck",
Expand Down Expand Up @@ -68,6 +69,7 @@
"OmniGenTransformer2DModel",
"PixArtTransformer2DModel",
"PriorTransformer",
"QwenImageTransformer2DModel",
"SanaControlNetModel",
"SanaTransformer2DModel",
"SD3ControlNetModel",
Expand Down Expand Up @@ -222,6 +224,11 @@
"PixArtAlphaPipeline",
"PixArtSigmaPAGPipeline",
"PixArtSigmaPipeline",
"QwenImageImg2ImgPipeline",
"QwenImageInpaintPipeline",
"QwenImagePipeline",
"QwenImageEditPipeline",
"QwenImageEditInpaintPipeline",
"ReduxImageEncoder",
"SanaControlNetPipeline",
"SanaPAGPipeline",
Expand Down Expand Up @@ -375,6 +382,7 @@
AutoencoderKLLTXVideo,
AutoencoderKLMagvit,
AutoencoderKLMochi,
AutoencoderKLQwenImage,
AutoencoderKLTemporalDecoder,
AutoencoderKLWan,
AutoencoderOobleck,
Expand Down Expand Up @@ -414,6 +422,7 @@
OmniGenTransformer2DModel,
PixArtTransformer2DModel,
PriorTransformer,
QwenImageTransformer2DModel,
SanaControlNetModel,
SanaTransformer2DModel,
SD3ControlNetModel,
Expand Down Expand Up @@ -567,6 +576,11 @@
PixArtAlphaPipeline,
PixArtSigmaPAGPipeline,
PixArtSigmaPipeline,
QwenImageEditPipeline,
QwenImageEditInpaintPipeline,
QwenImageImg2ImgPipeline,
QwenImageInpaintPipeline,
QwenImagePipeline,
ReduxImageEncoder,
SanaControlNetPipeline,
SanaPAGPipeline,
Expand Down
6 changes: 4 additions & 2 deletions mindone/diffusers/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,12 @@ def text_encoder_attn_modules(text_encoder):
"CogView4LoraLoaderMixin",
"Mochi1LoraLoaderMixin",
"HunyuanVideoLoraLoaderMixin",
"QwenImageLoraLoaderMixin",
"SanaLoraLoaderMixin",
"Lumina2LoraLoaderMixin",
"WanLoraLoaderMixin",
"HiDreamImageLoraLoaderMixin",
"SkyReelsV2LoraLoaderMixin",
"SkyReelsV2LoraLoaderMixin",
],
"peft": ["PeftAdapterMixin"],
"single_file": ["FromSingleFileMixin"],
Expand All @@ -99,6 +100,7 @@ def text_encoder_attn_modules(text_encoder):
LTXVideoLoraLoaderMixin,
Lumina2LoraLoaderMixin,
Mochi1LoraLoaderMixin,
QwenImageLoraLoaderMixin,
SanaLoraLoaderMixin,
SD3LoraLoaderMixin,
SkyReelsV2LoraLoaderMixin,
Expand All @@ -116,4 +118,4 @@ def text_encoder_attn_modules(text_encoder):
else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
35 changes: 35 additions & 0 deletions mindone/diffusers/loaders/lora_conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1920,3 +1920,38 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
converted_state_dict = {k.removeprefix(f"{non_diffusers_prefix}."): v for k, v in state_dict.items()}
converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
return converted_state_dict

def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
    """Convert a Qwen LoRA state dict from ``lora_down``/``lora_up`` naming to diffusers naming.

    Every ``<module>.lora_down.weight`` / ``<module>.lora_up.weight`` pair is renamed to
    ``transformer.<module>.lora_A.weight`` / ``transformer.<module>.lora_B.weight``, and the
    ``alpha / rank`` factor taken from ``<module>.alpha`` is multiplied into the two weights.

    Args:
        state_dict: Mapping from parameter name to weight. Weights must expose ``.shape``
            and support multiplication by a float; alpha entries must expose ``.item()``.
            The mapping is consumed in place: every handled entry is popped from it.

    Returns:
        A new dict holding the rescaled weights under ``transformer.``-prefixed diffusers keys.

    Raises:
        KeyError: If a ``lora_down`` weight has no matching ``lora_up`` weight.
        ValueError: If ``state_dict`` still contains entries after all pairs were converted.
    """
    converted_state_dict = {}
    down_key = ".lora_down.weight"
    up_key = ".lora_up.weight"

    def get_alpha_scales(down_weight, alpha_key):
        """Return ``(scale_down, scale_up)`` whose product equals ``alpha / rank``."""
        rank = down_weight.shape[0]
        alpha = state_dict.pop(alpha_key, None)
        if alpha is None:
            # No alpha stored for this module: treat it as alpha == rank, i.e. no rescaling,
            # instead of failing with a bare KeyError.
            return 1.0, 1.0
        # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
        scale = alpha.item() / rank
        scale_down = scale
        scale_up = 1.0
        # Move factors of two from the up scale to the down scale so the overall factor is
        # shared between both matrices; the product scale_down * scale_up stays `scale`.
        while scale_down * 2 < scale_up:
            scale_down *= 2
            scale_up /= 2
        return scale_down, scale_up

    # Iterate over a snapshot of the keys because entries are popped while looping.
    for k in list(state_dict.keys()):
        if not k.endswith(down_key):
            continue

        original_up_key = k.replace(down_key, up_key)
        diffusers_down_key = k.replace(down_key, ".lora_A.weight")
        diffusers_up_key = original_up_key.replace(up_key, ".lora_B.weight")
        alpha_key = k.replace(down_key, ".alpha")

        down_weight = state_dict.pop(k)
        up_weight = state_dict.pop(original_up_key)
        scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
        converted_state_dict[diffusers_down_key] = down_weight * scale_down
        converted_state_dict[diffusers_up_key] = up_weight * scale_up

    # Anything left over is a key this converter does not understand.
    if len(state_dict) > 0:
        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")

    converted_state_dict = {f"transformer.{k}": v for k, v in converted_state_dict.items()}
    return converted_state_dict
Loading