10 changes: 5 additions & 5 deletions benchmarks/backend_request_func.py
@@ -94,10 +94,11 @@ async def async_request_eb_openai_chat_completions(
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True,
}
},
"max_tokens": request_func_input.output_len,
}
if request_func_input.response_format:
payload["response_format"] =request_func_input.response_format
payload["response_format"] = request_func_input.response_format

# Hyperparameters are passed in via the yaml file
payload.update(request_func_input.hyper_parameters)
@@ -132,13 +133,13 @@ async def async_request_eb_openai_chat_completions(

chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
#print("####chunk:", chunk, type(chunk))
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)

if request_id == "None" and "id" in data:
request_id = data["id"]

if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
@@ -164,7 +165,6 @@
elif usage := data.get("usage", {}):
output.output_tokens = usage.get("completion_tokens", 0)
output.prompt_tokens = usage.get("prompt_tokens", 0)


most_recent_timestamp = timestamp

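For reference, a minimal sketch of the streaming chat-completions payload these hunks build. The `stream_options` and `response_format` handling mirror the diff; the model name, message, and `response_format` value are placeholder assumptions, not values taken from this PR:

```python
# Sketch of the payload assembled in async_request_eb_openai_chat_completions.
payload = {
    "model": "placeholder-model",  # assumption, not from this PR
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
    "stream_options": {
        "include_usage": True,
        "continuous_usage_stats": True,
    },
    "max_tokens": 128,
}

response_format = {"type": "json_object"}  # assumed example value
if response_format:
    payload["response_format"] = response_format
```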
6 changes: 3 additions & 3 deletions benchmarks/benchmark_dataset.py
@@ -46,7 +46,7 @@ class SampleRequest:
prompt_len: int
expected_output_len: int
response_format: Optional[dict] = None


class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
@@ -299,7 +299,7 @@ def sample(
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
response_format = entry.get("response_format")
new_output_len = int(entry.get("max_tokens", 12288))
new_output_len = int(entry.get("max_tokens", output_len if output_len else 12288))

if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None)
@@ -311,7 +311,7 @@
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
response_format=response_format
response_format=response_format,
)
)
cnt += 1
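The `max_tokens` change here gives a three-level precedence: the per-entry `max_tokens`, then the dataset-level `output_len`, then the 12288 default. A small self-contained illustration (the entry values and the helper name are made up for this note):

```python
def resolve_output_len(entry: dict, output_len=None) -> int:
    # Per-entry max_tokens wins; otherwise fall back to the dataset-level
    # output_len; otherwise use the 12288 default from the diff.
    return int(entry.get("max_tokens", output_len if output_len else 12288))

print(resolve_output_len({"max_tokens": 256}, output_len=1024))  # 256
print(resolve_output_len({}, output_len=1024))                   # 1024
print(resolve_output_len({}))                                    # 12288
```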
8 changes: 4 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -352,7 +352,7 @@ async def benchmark(
ignore_eos=ignore_eos,
debug=debug,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)

print("test_input:", test_input)
@@ -384,7 +384,7 @@
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
@@ -444,7 +444,7 @@ async def limited_request_func(request_func_input, pbar):
debug=debug,
ignore_eos=ignore_eos,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -460,7 +460,7 @@ async def limited_request_func(request_func_input, pbar):
api_url=base_url + "/stop_profile",
output_len=test_output_len,
logprobs=logprobs,
response_format=response_format
response_format=response_format,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
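These hunks only add trailing commas, but they show `response_format` being threaded through every `RequestFuncInput` (warm-up, profile start/stop, and the measured requests). Two OpenAI-style values a caller might supply, shown as assumed examples; the benchmark forwards whatever the dataset entry or CLI provides, unchanged:

```python
# Assumed example values for response_format (not taken from this PR).
response_format_json = {"type": "json_object"}
response_format_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "answer",
        "schema": {"type": "object", "properties": {"text": {"type": "string"}}},
    },
}
```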
2 changes: 1 addition & 1 deletion benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml
@@ -3,4 +3,4 @@ max_num_seqs: 128
gpu_memory_utilization: 0.85
tensor_parallel_size: 1
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_mm: True
enable_mm: True
2 changes: 1 addition & 1 deletion benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml
@@ -5,4 +5,4 @@ metadata:
max_tokens: 32768
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0
presence_penalty: 0
4 changes: 2 additions & 2 deletions docs/features/multi-node_deployment.md
@@ -26,7 +26,7 @@ We recommend using mpirun for one-command startup without manually starting each
4. Ensure all nodes can resolve each other's hostnames

* Online inference startup example:

```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@ We recommend using mpirun for one-command startup without manually starting each
```

* Offline startup example:

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
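The offline example is truncated in this diff view. For orientation only, a minimal offline-generation sketch built from the two imports shown above; the constructor arguments and method names below are assumptions for illustration, not values confirmed by this PR (see the full example in docs/features/multi-node_deployment.md):

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

# Assumed argument and method names, shown only to indicate the shape of
# the offline workflow the doc describes.
llm = LLM(model="baidu/ERNIE-4.5-300B-A47B-Paddle", tensor_parallel_size=8)
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)
outputs = llm.generate(["Hello, how are you?"], sampling_params)
print(outputs)
```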
4 changes: 2 additions & 2 deletions docs/zh/features/multi-node_deployment.md
@@ -26,7 +26,7 @@
4. 确保所有节点能够解析彼此的主机名

* 在线推理启动示例:

```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -40,7 +40,7 @@
```

* 离线启动示例:

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
71 changes: 71 additions & 0 deletions examples/splitwise/start_mixed.sh
@@ -0,0 +1,71 @@
#!/bin/bash
set -e

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

router_port=9000
nohup python -m fastdeploy.router.launch \
--port ${router_port} \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_server_0"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 8100 \
--metrics-port 8101 \
--engine-worker-queue-port 8102 \
--cache-queue-port 8103 \
--max-model-len 32768 \
--router "0.0.0.0:${router_port}" \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

wait_for_health 8100

# start modelserver 1
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_server_1"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 8200 \
--metrics-port 8201 \
--engine-worker-queue-port 8202 \
--cache-queue-port 8203 \
--max-model-len 32768 \
--router "0.0.0.0:${router_port}" \
2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health 8200
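Once both model servers have registered, a quick way to exercise the mixed deployment is to send a request through the router port. A hedged Python sketch; it assumes the router proxies the OpenAI-compatible /v1/chat/completions route on port 9000, which is not confirmed by this PR:

```python
import requests

# Assumption: the router forwards OpenAI-compatible requests to a healthy
# model server; the path and payload shape are not taken from this script.
resp = requests.post(
    "http://0.0.0.0:9000/v1/chat/completions",
    json={
        "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32,
    },
    timeout=60,
)
print(resp.status_code, resp.json())
```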
66 changes: 66 additions & 0 deletions examples/splitwise/start_v0_tp1.sh
@@ -0,0 +1,66 @@
#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode on a single node and uses the local scheduler
# v1 supports prefill and decode across multiple nodes and uses the splitwise scheduler
# v2 supports prefill and decode across multiple nodes and uses the router with the local scheduler

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
aistudio download --model ${MODEL_NAME}

unset http_proxy && unset https_proxy
rm -rf log_*

# start prefill
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=0
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
Collaborator:
This switch should already be deprecated; also, DEBUG logging is enabled here.

Collaborator (Author):
DEBUG is enabled here to make debugging easier and can be removed later. ENABLE_V1_KVCACHE_SCHEDULER is still effective; adaptation to v1 will come later.


nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 8100 \
--metrics-port 8101 \
--engine-worker-queue-port 8102 \
--cache-queue-port 8103 \
--max-model-len 32768 \
--splitwise-role "prefill" \
2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

export CUDA_VISIBLE_DEVICES=1
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 9000 \
--metrics-port 9001 \
--engine-worker-queue-port 9002 \
--cache-queue-port 9003 \
--max-model-len 32768 \
--splitwise-role "decode" \
--innode-prefill-ports 8102 \
2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
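After both roles are healthy, the decode server (port 9000) is the one that serves user traffic in this v0 layout. A minimal streaming check in Python; the endpoint path and payload shape are assumptions mirroring the OpenAI-compatible API used by the benchmark code above, not values taken from this script:

```python
import json
import requests

# Assumption: the decode api_server exposes /v1/chat/completions with
# OpenAI-style SSE streaming; the chunk parsing mirrors backend_request_func.py.
with requests.post(
    "http://0.0.0.0:9000/v1/chat/completions",
    json={
        "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "max_tokens": 32,
    },
    stream=True,
    timeout=60,
) as resp:
    for line in resp.iter_lines():
        if line:
            chunk = line.decode("utf-8").removeprefix("data: ")
            if chunk != "[DONE]":
                print(json.loads(chunk))
```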
96 changes: 96 additions & 0 deletions examples/splitwise/start_v1_tp1.sh
@@ -0,0 +1,96 @@
#!/bin/bash
set -e

# Test splitwise deployment
# v0 requires prefill and decode on a single node and uses the local scheduler
# v1 supports prefill and decode across multiple nodes and uses the splitwise scheduler
# v2 supports prefill and decode across multiple nodes and uses the router with the local scheduler

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
exit 1
fi

unset http_proxy && unset https_proxy
rm -rf log_*

# start redis
if ! redis-cli ping &>/dev/null; then
echo "Redis is not running. Starting redis-server..."
redis-server --daemonize yes
sleep 1
else
echo "Redis is already running."
fi
sleep 1

# start prefill
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 8100 \
--metrics-port 8101 \
--engine-worker-queue-port 8102 \
--cache-queue-port 8103 \
--max-model-len 32768 \
--splitwise-role "prefill" \
--cache-transfer-protocol "rdma,ipc" \
--rdma-comm-ports 8104 \
--pd-comm-port 8105 \
--scheduler-name "splitwise" \
--scheduler-host "127.0.0.1" \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 8100

# start decode
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port 9000 \
--metrics-port 9001 \
--engine-worker-queue-port 9002 \
--cache-queue-port 9003 \
--max-model-len 32768 \
--splitwise-role "decode" \
--cache-transfer-protocol "rdma,ipc" \
--rdma-comm-ports 9004 \
--pd-comm-port 9005 \
--scheduler-name "splitwise" \
--scheduler-host "127.0.0.1" \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
2>&1 >${FD_LOG_DIR}/nohup &
wait_for_health 9000
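The v1 path depends on a reachable Redis instance for the splitwise scheduler; the script starts one with `redis-server --daemonize yes` when `redis-cli ping` fails. An equivalent pre-flight check from Python using the redis-py client (the script itself only uses the CLI, so this is an optional alternative):

```python
import redis

# Mirrors the script's `redis-cli ping` pre-flight check; host and port match
# the --scheduler-host / --scheduler-port values passed to both servers.
client = redis.Redis(host="127.0.0.1", port=6379)
try:
    client.ping()
    print("Redis is reachable; the splitwise scheduler can register.")
except redis.exceptions.ConnectionError:
    print("Redis is not running; start it with `redis-server --daemonize yes`.")
```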