diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 0822fc59286..5621b615182 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -49,8 +49,9 @@ jobs: model_name: Qwen3-8B - runner: a2-1 model_name: Qwen2.5-VL-7B-Instruct - - runner: a2-1 - model_name: Qwen2-Audio-7B-Instruct + # TODO: This model has a bug that needs to be fixed and re-added + # - runner: a2-1 + # model_name: Qwen2-Audio-7B-Instruct - runner: a2-2 model_name: Qwen3-30B-A3B - runner: a2-2 @@ -61,6 +62,12 @@ jobs: model_name: Qwen3-Next-80B-A3B-Instruct - runner: a2-1 model_name: Qwen3-8B-W8A8 + - runner: a2-1 + model_name: Qwen3-VL-8B-Instruct + - runner: a2-1 + model_name: Qwen2.5-Omni-7B + - runner: a2-1 + model_name: Meta-Llama-3.1-8B-Instruct fail-fast: false # test will be triggered when tag 'accuracy-test' & 'ready-for-test' if: >- diff --git a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml index 848a4911441..c23be35b281 100644 --- a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml +++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml @@ -1,5 +1,4 @@ model_name: "deepseek-ai/DeepSeek-V2-Lite" -runner: "linux-aarch64-a2-2" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" diff --git a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml new file mode 100644 index 00000000000..4590116c9a1 --- /dev/null +++ b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,11 @@ +model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct" +hardware: "Atlas A2 Series" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.82 + - name: "exact_match,flexible-extract" + value: 0.84 + +num_fewshot: 5 diff --git a/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml b/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml new file mode 100644 index 00000000000..dec228dd9f1 --- /dev/null +++ 
b/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen2.5-Omni-7B" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.52 +max_model_len: 8192 +gpu_memory_utilization: 0.7 diff --git a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml index 3543e0c2c6b..8548989954c 100644 --- a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml +++ b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen2.5-VL-7B-Instruct" -runner: "linux-aarch64-a2-1" hardware: "Atlas A2 Series" model: "vllm-vlm" tasks: @@ -7,4 +6,4 @@ tasks: metrics: - name: "acc,none" value: 0.51 -max_model_len: 8192 \ No newline at end of file +max_model_len: 8192 diff --git a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml index 6b0425233b1..b97f6dae4d2 100644 --- a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml +++ b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen3-30B-A3B" -runner: "linux-aarch64-a2-2" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" @@ -17,4 +16,4 @@ gpu_memory_utilization: 0.6 enable_expert_parallel: True tensor_parallel_size: 2 apply_chat_template: False -fewshot_as_multiturn: False \ No newline at end of file +fewshot_as_multiturn: False diff --git a/tests/e2e/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/models/configs/Qwen3-8B-Base.yaml index 21243615c2b..730264463a4 100644 --- a/tests/e2e/models/configs/Qwen3-8B-Base.yaml +++ b/tests/e2e/models/configs/Qwen3-8B-Base.yaml @@ -1,5 +1,4 @@ model_name: "Qwen/Qwen3-8B-Base" -runner: "linux-aarch64-a2-1" hardware: "Atlas A2 Series" tasks: - name: "gsm8k" diff --git a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml new file mode 100644 index 00000000000..8803a120cec --- /dev/null +++ 
b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-VL-8B-Instruct" +hardware: "Atlas A2 Series" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.55 +max_model_len: 8192 +batch_size: 32 +gpu_memory_utilization: 0.7 diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt index 3bdcfd8a04d..5a839071252 100644 --- a/tests/e2e/models/configs/accuracy.txt +++ b/tests/e2e/models/configs/accuracy.txt @@ -6,3 +6,6 @@ Qwen2-7B.yaml Qwen2-VL-7B-Instruct.yaml Qwen2-Audio-7B-Instruct.yaml Qwen3-VL-30B-A3B-Instruct.yaml +Qwen3-VL-8B-Instruct.yaml +Qwen2.5-Omni-7B.yaml +Meta-Llama-3.1-8B-Instruct.yaml