Commit efb34ea

Qwen2.5_VL Example Script Update (#598)
Signed-off-by: Mohit Soni <mohisoni@qti.qualcomm.com>
1 parent cb7da87 commit efb34ea

File tree: 4 files changed, +58 −53 lines

QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

Lines changed: 34 additions & 7 deletions
@@ -752,8 +752,8 @@ def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
             seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
-        lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.num_hidden_layers)]
-        for i in range(self.model.config.num_hidden_layers):
+        lang_inputs["past_key_values"] = [[] for _ in range(self.model.config.text_config.num_hidden_layers)]
+        for i in range(self.model.config.text_config.num_hidden_layers):
             for kv in ["key", "value"]:
                 lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
 
@@ -779,10 +779,10 @@ def get_specializations(
         **compiler_options,
     ):
         if height is None or width is None:
-            height = 1365
-            width = 2048
+            height = constants.QWEN2_5_VL_HEIGHT
+            width = constants.QWEN2_5_VL_WIDTH
             logger.warning(
-                "Setting height and width to be 1365 and 2048 respectively, as it was neither passed nor found in vision_config"
+                f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
             )
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
@@ -882,7 +882,7 @@ def smart_resize(
 
     def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         # Define dynamic axes
-        num_layers = self.config.num_hidden_layers
+        num_layers = self.config.text_config.num_hidden_layers
 
         vision_dynamic_axes = {
             "pixel_values": {0: "grid_height", 1: "grid_width"},
@@ -900,6 +900,7 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
             lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
 
         dynamic_axes = {}
+
         if kv_offload:
             dynamic_axes["vision"] = vision_dynamic_axes
             dynamic_axes["lang"] = lang_dynamic_axes
@@ -911,7 +912,7 @@ def get_onnx_dynamic_axes(self, kv_offload: bool = False):
     def get_output_names(self, kv_offload: bool = False):
         vision_output_names = ["vision_embeds"]
         lang_output_names = ["logits"]
-        for i in range(self.model.config.num_hidden_layers):
+        for i in range(self.model.config.text_config.num_hidden_layers):
             for kv in ["key", "value"]:
                 lang_output_names.append(f"past_{kv}.{i}_RetainedState")
 
@@ -927,6 +928,32 @@ def get_output_names(self, kv_offload: bool = False):
             return lang_output_names
         return output_names
 
+    def prepare_inputs_for_generation(self, inputs, prefill_seq_len=128, batch_size=1):
+        input_ids_length = inputs["input_ids"].shape[1]
+
+        inputs["position_ids"] = torch.arange(input_ids_length).view(1, 1, input_ids_length).expand(-1, batch_size, -1)
+
+        pos_ids, rope_deltas = self.model.get_rope_index(
+            inputs["input_ids"],
+            None if "image_grid_thw" not in inputs else inputs["image_grid_thw"],
+            video_grid_thw=None,
+            second_per_grid_ts=None,
+            attention_mask=inputs["attention_mask"],
+        )
+
+        inputs["position_ids"] = torch.cat((inputs["position_ids"], pos_ids), dim=0)
+
+        num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
+        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+
+        inputs["position_ids"] = F.pad(
+            inputs["position_ids"], pad=(0, padded_len - input_ids_length), mode="constant", value=-1
+        )
+
+        inputs.pop("image_grid_thw", None)
+
+        return inputs
+
     def get_inputs_info(self):
         return [
             IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
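Note: the position-id padding performed by the new prepare_inputs_for_generation can be reproduced in isolation. The sketch below is illustrative only (the helper name pad_position_ids is not part of this commit); it shows the ceil-divide used to round the sequence length up to a multiple of prefill_seq_len and the right-padding of position_ids with -1.

import torch
import torch.nn.functional as F


def pad_position_ids(position_ids: torch.Tensor, seq_len: int, prefill_seq_len: int = 128) -> torch.Tensor:
    # Ceil-divide without floats: number of prefill chunks needed to cover seq_len.
    num_chunks = -(seq_len // -prefill_seq_len)
    padded_len = num_chunks * prefill_seq_len
    # Right-pad the last dimension with -1 so the padded positions are ignored downstream.
    return F.pad(position_ids, pad=(0, padded_len - seq_len), mode="constant", value=-1)


# Example: a 300-token prompt is padded up to 384 positions (3 chunks of 128).
pos = torch.arange(300).view(1, 1, 300)
print(pad_position_ids(pos, seq_len=300).shape)  # torch.Size([1, 1, 384])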

QEfficient/utils/constants.py

Lines changed: 4 additions & 0 deletions
@@ -125,6 +125,10 @@ def get_models_dir():
 # Wav2Vec2 Constant
 WAV2VEC2_MAX_SEQ_LEN = 480000  # 30 seconds of audio at 16 kHz sampling rate (16,000 samples/sec × 30 sec)
 
+# Qwen2_5_vl Constants
+QWEN2_5_VL_HEIGHT = 354
+QWEN2_5_VL_WIDTH = 536
+
 
 class Constants:
     # Export Constants.
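These defaults back the fallback added to get_specializations above. A minimal standalone sketch (the function resolve_image_size is hypothetical, shown only to illustrate the behavior):

QWEN2_5_VL_HEIGHT = 354
QWEN2_5_VL_WIDTH = 536


def resolve_image_size(height=None, width=None):
    # Mirrors the fallback in get_specializations: use the Qwen2.5-VL defaults
    # when neither dimension is passed nor found in vision_config.
    if height is None or width is None:
        height = QWEN2_5_VL_HEIGHT
        width = QWEN2_5_VL_WIDTH
    return height, width


print(resolve_image_size())           # (354, 536)
print(resolve_image_size(720, 1280))  # (720, 1280)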

examples/qwen2_5_vl_example.py

Lines changed: 5 additions & 46 deletions
@@ -6,8 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import requests
-import torch
-import torch.nn.functional as F
 import transformers
 from PIL import Image
 from qwen_vl_utils import process_vision_info
@@ -18,8 +16,7 @@
 ## For AWQ model update pytorch version to 2.8.*
 model_id = "Qwen/Qwen2.5-VL-32B-Instruct"
 config = AutoConfig.from_pretrained(model_id)
-
-## Use complete model without changing num_hidden_layers as it will not work for TF version 4.55.0 for Qwen2.5VL model
+config.text_config.num_hidden_layers = 2
 
 qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
     model_id, attn_implementation="eager", kv_offload=True, config=config
@@ -28,13 +25,13 @@
 processor = AutoProcessor.from_pretrained(model_id)
 
 ### use skip_vision=Ture, if want to run only text, ow false ###
-skip_vision = False
+skip_vision = True
 
 if skip_vision:
     ## Only Text ##
 
     ## Set Batch_Size ##
-    batch_size = 2
+    batch_size = 1
     qeff_model.compile(
         batch_size=batch_size,
         prefill_seq_len=128,
@@ -68,25 +65,7 @@
         return_tensors="pt",
     )
 
-    pos_ids, rope_deltas = qeff_model.model.get_rope_index(
-        inputs["input_ids"],
-        image_grid_thw=None,
-        video_grid_thw=None,
-        second_per_grid_ts=None,
-        attention_mask=inputs["attention_mask"],
-    )
-
-    input_ids_length = inputs["input_ids"].shape[1]
-
-    inputs["position_ids"] = torch.cat([pos_ids, pos_ids[0].unsqueeze(0)], dim=0)
-
-    prefill_seq_len = 128
-    num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
-    padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
-
-    inputs["position_ids"] = F.pad(
-        inputs["position_ids"], pad=(0, padded_len - input_ids_length), mode="constant", value=-1
-    )
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
 
     streamer = TextStreamer(tokenizer)
     output = qeff_model.generate(inputs=inputs, generation_len=100)
@@ -148,29 +127,9 @@
         padding=True,
         return_tensors="pt",
     )
-    input_ids_length = inputs["input_ids"].shape[1]
-
-    inputs["position_ids"] = torch.arange(input_ids_length).view(1, 1, input_ids_length).expand(-1, batch_size, -1)
-
-    pos_ids, rope_deltas = qeff_model.model.model.get_rope_index(
-        inputs["input_ids"],
-        inputs["image_grid_thw"],
-        video_grid_thw=None,
-        second_per_grid_ts=None,
-        attention_mask=inputs["attention_mask"],
-    )
 
-    inputs["position_ids"] = torch.cat((inputs["position_ids"], pos_ids), dim=0)
-
-    prefill_seq_len = 128
-    num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
-    padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
-
-    inputs["position_ids"] = F.pad(
-        inputs["position_ids"], pad=(0, padded_len - input_ids_length), mode="constant", value=-1
-    )
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
 
-    inputs.pop("image_grid_thw")
     streamer = TextStreamer(tokenizer)
     output = qeff_model.generate(inputs=inputs, generation_len=100)
     print(output.generated_ids)

tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 15 additions & 0 deletions
@@ -134,6 +134,17 @@
         "Can you describe the image in detail.",
         1,
     ),
+    (
+        "Qwen/Qwen2.5-VL-3B-Instruct",
+        True,
+        1,
+        128,
+        4096,
+        1540,
+        "https://picsum.photos/id/237/536/354",
+        "Can you describe the image in detail.",
+        1,
+    ),
     # (
     #     "meta-llama/Llama-3.2-11B-Vision-Instruct",
     #     True,
@@ -320,6 +331,10 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
         qnn_config=qnn_config,
     )
     inputs = processor(images=image, text=prompt, return_tensors="pt")
+    if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl":
+        inputs = qeff_model.model.prepare_inputs_for_generation(
+            inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size
+        )
     if "pixel_values" in inputs:
         inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
     print("QPC Outputs (QAIC):")
