Add accuracy tests for additional models

MrZ20 · MrZ20 · commit a6f8422a99e7 · 2025-10-30T15:18:31.000+08:00
Signed-off-by: MrZ20 <2609716663@qq.com>
diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
@@ -53,12 +53,23 @@ jobs:
             model_name: Qwen2-Audio-7B-Instruct
           - runner: a2-2
             model_name: Qwen3-30B-A3B
-          - runner: a2-2
-            model_name: Qwen3-VL-30B-A3B-Instruct
+          # This model currently has a bug; re-add this entry once it is fixed.
+          # - runner: a2-2
+          #   model_name: Qwen3-VL-30B-A3B-Instruct
           - runner: a2-2
             model_name: DeepSeek-V2-Lite
           - runner: a2-4
             model_name: Qwen3-Next-80B-A3B-Instruct
+          - runner: a2-1
+            model_name: Qwen3-VL-8B-Instruct
+          - runner: a2-1
+            model_name: Qwen2.5-Omni-7B
+          - runner: a2-1
+            model_name: Meta-Llama-3.1-8B-Instruct
+          - runner: a2-2
+            model_name: ERNIE-4.5-21B-A3B-PT
+          - runner: a2-1
+            model_name: Mistral-7B-Instruct-v0.1
       fail-fast: false
     # test will be triggered when tag 'accuracy-test' & 'ready-for-test'
     if:  >-
diff --git a/tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml b/tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml
@@ -0,0 +1,14 @@
+model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,flexible-extract"
+    value: 0.72
+- name: "ceval-valid"
+  metrics:
+  - name: "acc,none"
+    value: 0.85
+num_fewshot: 5
+tensor_parallel_size: 2
+batch_size: 16
+gpu_memory_utilization: 0.6
diff --git a/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml b/tests/e2e/models/configs/Meta-Llama-3.1-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+model_name: "LLM-Research/Meta-Llama-3.1-8B-Instruct"
+hardware: "Atlas A2 Series"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.82
+  - name: "exact_match,flexible-extract"
+    value: 0.84
+
+num_fewshot: 5
diff --git a/tests/e2e/models/configs/Mistral-7B-Instruct-v0.1.yaml b/tests/e2e/models/configs/Mistral-7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+model_name: "AI-ModelScope/Mistral-7B-Instruct-v0.1"
+runner: "linux-aarch64-a2-1"
+hardware: "Atlas A2 Series"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.35
+  - name: "exact_match,flexible-extract"
+    value: 0.38
+trust_remote_code: True
diff --git a/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml b/tests/e2e/models/configs/Qwen2.5-Omni-7B.yaml
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen2.5-Omni-7B"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.52
+max_model_len: 8192
+gpu_memory_utilization: 0.7
diff --git a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+model_name: "Qwen/Qwen3-VL-8B-Instruct"
+hardware: "Atlas A2 Series"
+model: "vllm-vlm"
+tasks:
+- name: "mmmu_val"
+  metrics:
+  - name: "acc,none"
+    value: 0.55
+max_model_len: 8192
+batch_size: 32
+gpu_memory_utilization: 0.7
diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt
@@ -6,3 +6,8 @@ Qwen2-7B.yaml
 Qwen2-VL-7B-Instruct.yaml
 Qwen2-Audio-7B-Instruct.yaml
 Qwen3-VL-30B-A3B-Instruct.yaml
+Qwen3-VL-8B-Instruct.yaml
+Qwen2.5-Omni-7B.yaml
+Meta-Llama-3.1-8B-Instruct.yaml
+ERNIE-4.5-21B-A3B-PT.yaml
+Mistral-7B-Instruct-v0.1.yaml