From ea2f65d7d4da15cab1738f67b0a11aaffab1fe80 Mon Sep 17 00:00:00 2001
From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com>
Date: Sat, 11 Oct 2025 00:21:02 +0100
Subject: [PATCH 1/3] Update pipeline_qwenimage_edit_plus.py

---
 .../qwenimage/pipeline_qwenimage_edit_plus.py | 60 +++++++++++++++----
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
index ec203edf166c..b406e09bb211 100644
--- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
+++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
@@ -252,9 +252,20 @@ def _get_qwen_prompt_embeds(
         drop_idx = self.prompt_template_encode_start_idx
         txt = [template.format(base_img_prompt + e) for e in prompt]
 
+        if image is None:
+            images_for_processor = None
+        else:
+            # If `image` is a single image (not a list), the processor will broadcast it.
+            # If `image` is a list of conditioning images, we must repeat that list
+            # for each prompt so the processor has one entry per text example.
+            if isinstance(image, list):
+                images_for_processor = [image] * len(txt)
+            else:
+                images_for_processor = image
+
         model_inputs = self.processor(
             text=txt,
-            images=image,
+            images=images_for_processor,
             padding=True,
             return_tensors="pt",
         ).to(device)
@@ -627,7 +638,12 @@ def __call__(
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
             When returning a tuple, the first element is a list with the generated images.
         """
-        image_size = image[-1].size if isinstance(image, list) else image.size
+        # Use the first image's size as the deterministic base for output dims
+        ref_img = image[0] if isinstance(image, list) else image
+        if isinstance(ref_img, (tuple, list)):
+            ref_img = ref_img[0]
+        image_size = ref_img.size
+
         calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1])
         height = height or calculated_height
         width = width or calculated_width
@@ -673,6 +689,7 @@ def __call__(
         vae_image_sizes = []
         vae_images = []
         for img in image:
+            img = img[0] if isinstance(img, (tuple, list)) else img
             image_width, image_height = img.size
             condition_width, condition_height = calculate_dimensions(
                 CONDITION_IMAGE_SIZE, image_width / image_height
@@ -681,7 +698,10 @@ def __call__(
             condition_image_sizes.append((condition_width, condition_height))
             vae_image_sizes.append((vae_width, vae_height))
             condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
-            vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))
+            preproc = self.image_processor.preprocess(img, vae_height, vae_width)
+            if isinstance(preproc, (tuple, list)):
+                preproc = preproc[0]
+            vae_images.append(preproc.unsqueeze(0))
 
         has_neg_prompt = negative_prompt is not None or (
             negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
@@ -719,6 +739,25 @@ def __call__(
 
         # 4. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels // 4
+        if vae_images is not None:
+            for idx, v in enumerate(vae_images):
+                if isinstance(v, (tuple, list)):
+                    v = v[0]
+
+                if not torch.is_tensor(v):
+                    v = torch.as_tensor(v)
+
+                if v.ndim == 5 and v.shape[1] == 1 and v.shape[2] in (1, 3):
+                    v = v.permute(0, 2, 1, 3, 4).contiguous()
+
+                elif v.ndim == 4 and v.shape[1] in (1, 3):
+                    v = v.unsqueeze(2)
+
+                elif v.ndim == 3 and v.shape[0] in (1, 3):
+                    v = v.unsqueeze(0).unsqueeze(2)
+
+                vae_images[idx] = v
+
         latents, image_latents = self.prepare_latents(
             vae_images,
             batch_size * num_images_per_prompt,
@@ -730,15 +769,12 @@ def __call__(
             generator,
             latents,
         )
-        img_shapes = [
-            [
-                (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
-                *[
-                    (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
-                    for vae_width, vae_height in vae_image_sizes
-                ],
-            ]
-        ] * batch_size
+        base_shape = (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)
+        per_image_shapes = [
+            (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
+            for vae_width, vae_height in vae_image_sizes
+        ]
+        img_shapes = [[base_shape, *per_image_shapes] for _ in range(batch_size)]
 
         # 5. Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
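Note (annotation between patches, not part of the diff): the loop added before
`prepare_latents` coerces every entry of `vae_images` to the 5-D
(batch, channels, frame, height, width) layout that the original `.unsqueeze(2)`
call produced; the permute branch covers the (B, 1, C, H, W) tensors coming from
the new `preproc.unsqueeze(0)` path. A self-contained sketch of the same rule,
with a hypothetical helper name and toy shapes:

    import torch

    def to_bc1hw(v: torch.Tensor) -> torch.Tensor:
        # Normalize (C,H,W), (B,C,H,W), or (B,1,C,H,W) to (B,C,1,H,W).
        if v.ndim == 5 and v.shape[1] == 1 and v.shape[2] in (1, 3):
            return v.permute(0, 2, 1, 3, 4).contiguous()  # (B,1,C,H,W) -> (B,C,1,H,W)
        if v.ndim == 4 and v.shape[1] in (1, 3):
            return v.unsqueeze(2)                          # (B,C,H,W)   -> (B,C,1,H,W)
        if v.ndim == 3 and v.shape[0] in (1, 3):
            return v.unsqueeze(0).unsqueeze(2)             # (C,H,W)     -> (B,C,1,H,W)
        return v

    assert to_bc1hw(torch.rand(3, 64, 64)).shape == (1, 3, 1, 64, 64)
    assert to_bc1hw(torch.rand(1, 3, 64, 64)).shape == (1, 3, 1, 64, 64)
    assert to_bc1hw(torch.rand(1, 1, 3, 64, 64)).shape == (1, 3, 1, 64, 64)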
From cb2bfe6dbf952f0ff7b9b86f149d485b4b41127d Mon Sep 17 00:00:00 2001
From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com>
Date: Mon, 13 Oct 2025 11:31:03 +0100
Subject: [PATCH 2/3] Update pipeline_qwenimage_edit_plus.py

---
 .../qwenimage/pipeline_qwenimage_edit_plus.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
index b406e09bb211..152468f57d81 100644
--- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
+++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
@@ -769,12 +769,15 @@ def __call__(
             generator,
             latents,
         )
-        base_shape = (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)
-        per_image_shapes = [
-            (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
-            for vae_width, vae_height in vae_image_sizes
-        ]
-        img_shapes = [[base_shape, *per_image_shapes] for _ in range(batch_size)]
+        img_shapes = [
+            [
+                (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
+                *[
+                    (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
+                    for vae_width, vae_height in vae_image_sizes
+                ],
+            ]
+        ] * batch_size
 
         # 5. Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
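Note (annotation, not part of the diff): PATCH 2 restores the original
single-expression construction of `img_shapes` that PATCH 1 had split into
named intermediates. The two forms produce equal values; the only difference is
that `* batch_size` repeats one shared inner list while the comprehension built
a fresh copy per batch entry. A toy check, all names and sizes hypothetical:

    vae_scale_factor = 8
    height, width, batch_size = 1024, 1024, 2
    vae_image_sizes = [(1024, 768), (512, 512)]  # (width, height) per conditioning image

    base = (1, height // vae_scale_factor // 2, width // vae_scale_factor // 2)
    per_image = [
        (1, h // vae_scale_factor // 2, w // vae_scale_factor // 2)
        for w, h in vae_image_sizes
    ]

    refactored = [[base, *per_image] for _ in range(batch_size)]  # PATCH 1 form
    restored = [[base, *per_image]] * batch_size                  # PATCH 2 form

    assert refactored == restored  # equal values; inner lists are never mutated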
From 01f264f57dcccf1decc42f1afe62eb7de565c5fd Mon Sep 17 00:00:00 2001
From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com>
Date: Mon, 13 Oct 2025 11:33:51 +0100
Subject: [PATCH 3/3] Update pipeline_qwenimage_edit_plus.py

---
 .../pipelines/qwenimage/pipeline_qwenimage_edit_plus.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
index 152468f57d81..65d664978fa5 100644
--- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
+++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
@@ -255,9 +255,6 @@ def _get_qwen_prompt_embeds(
         if image is None:
             images_for_processor = None
         else:
-            # If `image` is a single image (not a list), the processor will broadcast it.
-            # If `image` is a list of conditioning images, we must repeat that list
-            # for each prompt so the processor has one entry per text example.
             if isinstance(image, list):
                 images_for_processor = [image] * len(txt)
             else:
                 images_for_processor = image
@@ -638,7 +635,7 @@ def __call__(
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
             When returning a tuple, the first element is a list with the generated images.
         """
-        # Use the first image's size as the deterministic base for output dims
+
         ref_img = image[0] if isinstance(image, list) else image
         if isinstance(ref_img, (tuple, list)):
            ref_img = ref_img[0]
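Note (annotation, not part of the diff): taken together, the series lets the
pipeline accept a plain list of PIL conditioning images for a single edit
instruction, and tolerate nested list/tuple entries along the way. A minimal
usage sketch against `QwenImageEditPlusPipeline`; the checkpoint name is
assumed to be `Qwen/Qwen-Image-Edit-2509` and the image URLs are placeholders:

    import torch
    from diffusers import QwenImageEditPlusPipeline
    from diffusers.utils import load_image

    # Load the Edit Plus pipeline (checkpoint name assumed; adjust to your setup).
    pipe = QwenImageEditPlusPipeline.from_pretrained(
        "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16
    ).to("cuda")

    # Two conditioning images for one edit instruction (placeholder URLs).
    images = [
        load_image("https://example.com/subject.png"),
        load_image("https://example.com/background.png"),
    ]

    result = pipe(image=images, prompt="Place the subject in the background scene.")
    result.images[0].save("edited.png")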