PaddlePaddle
diff --git a/‎examples/splitwise/start_v2.sh‎
Lines changed: 1 addition & 2 deletions b/‎examples/splitwise/start_v2.sh‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎fastdeploy/engine/common_engine.py‎
Lines changed: 42 additions & 23 deletions b/‎fastdeploy/engine/common_engine.py‎
Lines changed: 42 additions & 23 deletions
diff --git a/‎fastdeploy/router/launch.py‎
Lines changed: 53 additions & 0 deletions b/‎fastdeploy/router/launch.py‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎fastdeploy/router/launch_router.py‎
Lines changed: 0 additions & 95 deletions b/‎fastdeploy/router/launch_router.py‎
Lines changed: 0 additions & 95 deletions
@@ -21,8 +21,7 @@ rm -rf ${FD_LOG_DIR}
 mkdir -p ${FD_LOG_DIR}
 
 router_port=9000
-nohup python -m fastdeploy.router.launch_router \
-    --pd-disaggregation \
+nohup python -m fastdeploy.router.launch \
     --port ${router_port} \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1
 
@@ -23,7 +23,7 @@
 import traceback
 import weakref
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import paddle
@@ -142,14 +142,16 @@ def __init__(self, cfg, start_queue=True):
     def start(self):
         self.running = True
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            self.insert_task_to_worker_thread = threading.Thread(target=self._scheduler_task_to_worker_v1, daemon=True)
+            self.insert_task_to_worker_thread = threading.Thread(
+                target=self._schedule_request_to_worker_v1, daemon=True
+            )
         else:
-            self.insert_task_to_worker_thread = threading.Thread(target=self._insert_task_to_worker, daemon=True)
+            self.insert_task_to_worker_thread = threading.Thread(target=self._schedule_request_to_worker, daemon=True)
         self.insert_task_to_worker_thread.start()
         self.token_processor.tasks_queue = self.engine_worker_queue
         self.token_processor.run()
         if self.cfg.scheduler_config.splitwise_role != "mixed":
-            self.split_mode_get_tasks()
+            self._process_splitwise_task()
 
     def create_data_processor(self):
         self.input_processor = InputPreprocessor(
@@ -310,7 +312,7 @@ def start_worker_queue_service(self, start_queue):
             ),
         )
 
-    def insert_tasks(self, tasks, current_id=-1, allocated=False):
+    def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current_id=-1, allocated=False):
         """
         Insert tasks to engine.
         """
@@ -572,7 +574,7 @@ def update_mm_requests_chunk_size(self, requests):
                 patch_st += chunk_patch_num
             request.set("prefill_chunk_info", chunks_info)
 
-    def _insert_task_to_worker(self):
+    def _schedule_request_to_worker(self):
         """
         Insert task to engine thread, monitor scheduler request queue.
         if the engine has resource, insert task to engine
@@ -618,7 +620,7 @@ def _insert_task_to_worker(self):
                     time.sleep(0.001)
                     continue
                 if self.cfg.splitwise_version == "v2" and self.cfg.scheduler_config.splitwise_role == "decode":
-                    # the task in decode instance will processed in split_mode_get_tasks thread
+                    # the task in decode instance will be processed in the _process_splitwise_task thread
                     continue
 
                 llm_logger.debug(f"get tasks from scheduler: {tasks}")
@@ -637,7 +639,7 @@ def _insert_task_to_worker(self):
                 err_msg = f"Error happend while insert task to engine: {e}, {traceback.format_exc()!s}."
                 self.llm_logger.error(err_msg)
 
-    def _scheduler_task_to_worker_v1(self):
+    def _schedule_request_to_worker_v1(self):
         """
         Insert tasks to worker with scheduler v1 (ENABLE_V1_KVCACHE_SCHEDULER=1).
         """
@@ -921,7 +923,7 @@ def _zmq_send_generated_tokens(self):
             except Exception as e:
                 llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")
 
-    def split_mode_get_tasks(self):
+    def _process_splitwise_task(self):
         """
         Processing tasks from engine worker queue in splitwise deployment.
         For v0 version, prefill instance gets tasks from engine worker queue.
@@ -932,10 +934,25 @@ def split_mode_get_tasks(self):
 
         def receiver_loop():
             waiting_resource_requests = []
+            waiting_ready_tasks = []
+
+            def _decode_process_prefilled_task_v0_scheduler(input_tasks):
+                """Insert prefilled tasks whose request the decode scheduler already holds; return the rest.
+
+                The rest wait for the api_server and scheduler in decode to receive the client request.
+                """
+                ready_tasks, waiting_tasks = [], []
+                for task in input_tasks:  # a scheduler without has_request treats every task as ready
+                    known = not hasattr(self.scheduler, "has_request") or self.scheduler.has_request(task.request_id)
+                    (ready_tasks if known else waiting_tasks).append(task)
+                if ready_tasks:  # called on every poll iteration: skip insert/put_results for an empty batch
+                    self.insert_tasks(ready_tasks, allocated=True)
+                    if self.cfg.splitwise_version in ("v0", "v2"):
+                        self.scheduler.put_results(ready_tasks)
+                return waiting_tasks
 
             while self.running:
                 try:
-
                     processed_indices = []
                     for idx, task in enumerate(waiting_resource_requests):
                         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
@@ -958,19 +975,24 @@ def receiver_loop():
                     for idx in sorted(processed_indices, reverse=True):
                         waiting_resource_requests.pop(idx)
 
-                    if not self.engine_worker_queue.disaggregate_queue_empty():
+                    waiting_ready_tasks = _decode_process_prefilled_task_v0_scheduler(waiting_ready_tasks)
+
+                    if self.engine_worker_queue.disaggregate_queue_empty():
+                        time.sleep(0.001)
+                    else:
                         items = self.engine_worker_queue.get_disaggregated_tasks()
                         for item in items:
                             role = item[0]
                             tasks = item[1]
 
-                            if role == "prefill":  # prefill instance gets tasks from engine worker queue
+                            # prefill instance gets tasks from engine worker queue
+                            if role == "prefill":
                                 for task in tasks:
                                     task.max_tokens = task.min_tokens = 2
                                 self.insert_tasks(tasks)
-
-                            elif role == "decode":  # decode instance gets tasks from engine worker queue
-                                if hasattr(tasks[0], "finished"):
+                            # decode instance gets tasks from engine worker queue
+                            elif role == "decode":
+                                if isinstance(tasks[0], RequestOutput):
                                     self.llm_logger.debug(f"receive prefilled tasks, {tasks}")
                                     if not isinstance(tasks, list):
                                         tasks = [tasks]
@@ -1009,11 +1031,9 @@ def receiver_loop():
                                             self.resource_manager.insert_task_for_decoding(task)
 
                                     else:
-                                        self.insert_tasks(tasks, allocated=True)
-                                        if self.cfg.splitwise_version in ("v0", "v2"):
-                                            self.scheduler.put_results(tasks)
-                                else:
-                                    self.llm_logger.debug(f"receive tasks to allocate resource, {tasks}")
+                                        waiting_ready_tasks.extend(_decode_process_prefilled_task_v0_scheduler(tasks))
+                                elif isinstance(tasks[0], Request):
+                                    self.llm_logger.debug(f"receive tasks to preallocate resource, {tasks}")
                                     if len(waiting_resource_requests):
                                         self.llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
                                         waiting_resource_requests.extend(tasks)
@@ -1044,9 +1064,8 @@ def receiver_loop():
                                                 self.llm_logger.info(
                                                     f"Added {len(new_waiting)} tasks to waiting queue"
                                                 )
-
-                    else:
-                        time.sleep(0.001)
+                                else:
+                                    raise ValueError(f"Unsupported task type: {type(tasks[0])}")
 
                 except Exception as e:
                     self.llm_logger.error(f"Error in main loop: {e}")
 
@@ -0,0 +1,53 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import argparse
+
+from fastdeploy.router.router import start_router
+from fastdeploy.utils import router_logger as logger
+
+
+def main() -> None:
+    """Parse the router CLI options and start the router, logging and re-raising startup errors."""
+    parser = argparse.ArgumentParser(description="Router for splitwise deployment testing")
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="0.0.0.0",
+        help="Host address to bind the router server.",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=9000,
+        help="Port number to bind the router server",
+    )
+    parser.add_argument(
+        "--request-timeout-secs",
+        type=int,
+        default=1800,
+        help="Request timeout in seconds",
+    )
+    args = parser.parse_args()
+    try:
+        start_router(args)
+    except Exception as e:
+        logger.error(f"Error starting router: {e}")
+        raise  # bare raise re-raises the active exception unchanged
+
+
+if __name__ == "__main__":
+    main()