From 566ed798cccd70a4ff5df1c6f6a4514f490b81b0 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Fri, 7 Nov 2025 09:27:49 +0000 Subject: [PATCH 1/6] remove splitwise deployment on single node and refine the code --- docs/features/disaggregated.md | 50 +-- docs/zh/features/disaggregated.md | 45 +-- examples/splitwise/start_mixed.sh | 19 +- examples/splitwise/start_v0_tp1.sh | 74 +++- .../{start_v2_tp2.sh => start_v0_tp2.sh} | 71 ++-- examples/splitwise/start_v1_tp1.sh | 72 ++-- examples/splitwise/start_v1_tp2.sh | 74 ++-- examples/splitwise/start_v2_tp1.sh | 93 ----- fastdeploy/cache_manager/cache_messager.py | 5 +- .../transfer_factory/ipc_cache_transfer.py | 1 - fastdeploy/config.py | 26 +- fastdeploy/demo/offline_disaggregated_demo.py | 64 ---- fastdeploy/engine/args_utils.py | 42 +- fastdeploy/engine/async_llm.py | 2 - fastdeploy/engine/common_engine.py | 359 +++++++++--------- fastdeploy/engine/engine.py | 2 - fastdeploy/engine/request.py | 4 + .../inter_communicator/engine_worker_queue.py | 17 - fastdeploy/output/token_processor.py | 9 +- fastdeploy/splitwise/splitwise_connector.py | 70 ---- fastdeploy/worker/worker_process.py | 2 - ...b_pd.py => test_ernie_03b_pd_router_v0.py} | 188 ++++----- ... test_ernie_03b_pd_splitwise_scheduler.py} | 159 ++++---- tests/e2e/utils/serving_utils.py | 60 ++- 24 files changed, 625 insertions(+), 883 deletions(-) rename examples/splitwise/{start_v2_tp2.sh => start_v0_tp2.sh} (59%) delete mode 100644 examples/splitwise/start_v2_tp1.sh delete mode 100644 fastdeploy/demo/offline_disaggregated_demo.py rename tests/e2e/{test_ernie_03b_pd.py => test_ernie_03b_pd_router_v0.py} (71%) rename tests/e2e/{test_ernie_03b_pd_multi_node.py => test_ernie_03b_pd_splitwise_scheduler.py} (83%) diff --git a/docs/features/disaggregated.md b/docs/features/disaggregated.md index 58ecaeb245d..e240d33a283 100644 --- a/docs/features/disaggregated.md +++ b/docs/features/disaggregated.md @@ -29,48 +29,6 @@ In multi-instance scenarios, each incoming request needs to be assigned to diffe ## Usage Instructions -### Single-machine Disaggregated Deployment - -#### Online Inference Service -Use the following commands for service deployment: - -**Prefill Instance** - -```bash -export FD_LOG_DIR="log_prefill" -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m fastdeploy.entrypoints.openai.api_server \ - --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8180 --metrics-port 8181 \ - --engine-worker-queue-port 8182 \ - --cache-queue-port 8183 \ - --tensor-parallel-size 4 \ - --quantization wint4 \ - --splitwise-role "prefill" -``` - -**Decode Instance** - -```bash -export FD_LOG_DIR="log_decode" -export CUDA_VISIBLE_DEVICES=4,5,6,7 -# Note: innode-prefill-ports should specify the engine-worker-queue-port of the Prefill service -python -m fastdeploy.entrypoints.openai.api_server \ - --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8184 --metrics-port 8185 \ - --engine-worker-queue-port 8186 \ - --cache-queue-port 8187 \ - --tensor-parallel-size 4 \ - --quantization wint4 \ - --innode-prefill-ports 8182 \ - --splitwise-role "decode" -``` - -Note: When requesting single-machine PD disaggregated service, **users should request the Decode service's port**. - -#### Offline Inference Service -Refer to the example code `offline_disaggregated_demo.py` in the `fastdeploy/demo` directory for offline inference service deployment. 
- ### Multi-machine Disaggregated Deployment #### Prerequisite: Redis @@ -118,12 +76,14 @@ For multi-machine deployment, confirm that the NIC supports RDMA and that all no ```bash export FD_LOG_DIR="log_prefill" export CUDA_VISIBLE_DEVICES=0,1,2,3 +export ENABLE_V1_KVCACHE_SCHEDULER=0 echo "set RDMA NICS" export $(bash scripts/get_rdma_nics.sh gpu) echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}" python -m fastdeploy.entrypoints.openai.api_server \ --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8180 --metrics-port 8181 \ + --port 8180 \ + --metrics-port 8181 \ --engine-worker-queue-port 8182 \ --cache-queue-port 8183 \ --tensor-parallel-size 4 \ @@ -143,12 +103,14 @@ python -m fastdeploy.entrypoints.openai.api_server \ ```bash export FD_LOG_DIR="log_decode" export CUDA_VISIBLE_DEVICES=4,5,6,7 +export ENABLE_V1_KVCACHE_SCHEDULER=0 echo "set RDMA NICS" export $(bash scripts/get_rdma_nics.sh gpu) echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}" python -m fastdeploy.entrypoints.openai.api_server \ --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8184 --metrics-port 8185 \ + --port 8184 \ + --metrics-port 8185 \ --engine-worker-queue-port 8186 \ --cache-queue-port 8187 \ --tensor-parallel-size 4 \ diff --git a/docs/zh/features/disaggregated.md b/docs/zh/features/disaggregated.md index 909925ea697..093fdd24c6a 100644 --- a/docs/zh/features/disaggregated.md +++ b/docs/zh/features/disaggregated.md @@ -29,49 +29,6 @@ ## 使用说明 -### 单机分离式部署 - -#### 在线推理服务 -使用如下命令进行服务部署 - -**prefill 实例** - -```bash -export FD_LOG_DIR="log_prefill" -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python -m fastdeploy.entrypoints.openai.api_server \ - --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8180 --metrics-port 8181 \ - --engine-worker-queue-port 8182 \ - --cache-queue-port 8183 \ - --tensor-parallel-size 4 \ - --quantization wint4 \ - --splitwise-role "prefill" -``` - -**decode 实例** - -```bash -export FD_LOG_DIR="log_decode" -export CUDA_VISIBLE_DEVICES=4,5,6,7 -# 注意innode-prefill-ports指定为Prefill服务的engine-worker-queue-port -python -m fastdeploy.entrypoints.openai.api_server \ - --model ERNIE-4.5-300B-A47B-BF16 \ - --port 8184 --metrics-port 8185 \ - --engine-worker-queue-port 8186 \ - --cache-queue-port 8187 \ - --tensor-parallel-size 4 \ - --quantization wint4 \ - --innode-prefill-ports 8182 \ - --splitwise-role "decode" -``` - -注意在请求单机PD分离服务时,**用户需请求Decode服务的端口**。 - -#### 离线推理服务 - -参考`fastdeploy/demo` 目录下 `offline_disaggregated_demo.py` 示例代码,进行离线推理服务部署 - ### 多机分离式部署 #### 前置依赖 Redis @@ -120,6 +77,7 @@ sudo systemctl start redis export FD_LOG_DIR="log_prefill" export CUDA_VISIBLE_DEVICES=0,1,2,3 +export ENABLE_V1_KVCACHE_SCHEDULER=0 echo "set RDMA NICS" export $(bash scripts/get_rdma_nics.sh gpu) echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}" @@ -146,6 +104,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ ```bash export FD_LOG_DIR="log_decode" export CUDA_VISIBLE_DEVICES=4,5,6,7 +export ENABLE_V1_KVCACHE_SCHEDULER=0 echo "set RDMA NICS" export $(bash scripts/get_rdma_nics.sh gpu) echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}" diff --git a/examples/splitwise/start_mixed.sh b/examples/splitwise/start_mixed.sh index bf3e78ab058..750c2a45e55 100644 --- a/examples/splitwise/start_mixed.sh +++ b/examples/splitwise/start_mixed.sh @@ -1,6 +1,8 @@ #!/bin/bash set -e +# Test mixed server + router + wait_for_health() { local server_port=$1 while true; do @@ -16,7 +18,6 @@ wait_for_health() { # prepare environment MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" export FD_DEBUG=1 export 
ENABLE_V1_KVCACHE_SCHEDULER=0 @@ -51,7 +52,7 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \ 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 -wait_for_health 8100 +# wait_for_health 8100 # start modelserver 1 export CUDA_VISIBLE_DEVICES=1 @@ -69,3 +70,17 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \ 2>&1 >${FD_LOG_DIR}/nohup & wait_for_health 8200 + + +# send request +sleep 10 # make sure server is registered to router +port=9000 +curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 20, + "stream": true +}' diff --git a/examples/splitwise/start_v0_tp1.sh b/examples/splitwise/start_v0_tp1.sh index 30dbb5a906d..c4b94a9b1fa 100644 --- a/examples/splitwise/start_v0_tp1.sh +++ b/examples/splitwise/start_v0_tp1.sh @@ -2,9 +2,9 @@ set -e # Test splitwise deployment -# v0 requires prefill and decode in one node and it uses local scheduler -# v1 supports prefill and decode in multi node and it uses splitwise scheduler -# v2 supports prefill and decode in multi node and it uses router and local scheduler +# There are two methods for splitwise deployment: +# v0: using splitwise_scheduler or dp_scheduler +# v1: using local_scheduler + router wait_for_health() { local server_port=$1 @@ -19,21 +19,40 @@ wait_for_health() { done } +# prepare environment MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" -aistudio download --model ${MODEL_NAME} + +export FD_DEBUG=1 +export ENABLE_V1_KVCACHE_SCHEDULER=1 +export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 + +SCRIPT_PATH=$(readlink -f "$0") +SCRIPT_DIR=$(dirname "$SCRIPT_PATH") +export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu) +echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}" +if [ -z "${KVCACHE_RDMA_NICS}" ]; then + echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh" + exit 1 +fi unset http_proxy && unset https_proxy rm -rf log_* +# start redis +if ! redis-cli ping &>/dev/null; then + echo "Redis is not running. Starting redis-server..." + redis-server --daemonize yes + sleep 1 +else + echo "Redis is already running." 
+fi +sleep 1 + # start prefill +export CUDA_VISIBLE_DEVICES=0 export FD_LOG_DIR="log_prefill" mkdir -p ${FD_LOG_DIR} -export CUDA_VISIBLE_DEVICES=0 -export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=0 - nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port 8100 \ @@ -41,18 +60,23 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \ --engine-worker-queue-port 8102 \ --cache-queue-port 8103 \ --max-model-len 32768 \ + --num-gpu-blocks-override 1000 \ --splitwise-role "prefill" \ + --cache-transfer-protocol "rdma" \ + --rdma-comm-ports 8104 \ + --pd-comm-port 8105 \ + --scheduler-name "splitwise" \ + --scheduler-host "127.0.0.1" \ + --scheduler-port 6379 \ + --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 8100 +# wait_for_health 8100 # start decode +export CUDA_VISIBLE_DEVICES=1 export FD_LOG_DIR="log_decode" mkdir -p ${FD_LOG_DIR} -export CUDA_VISIBLE_DEVICES=1 -export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=0 - nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port 9000 \ @@ -61,6 +85,26 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \ --cache-queue-port 9003 \ --max-model-len 32768 \ --splitwise-role "decode" \ - --innode-prefill-ports 8102 \ + --cache-transfer-protocol "rdma" \ + --rdma-comm-ports 9004 \ + --pd-comm-port 9005 \ + --scheduler-name "splitwise" \ + --scheduler-host "127.0.0.1" \ + --scheduler-port 6379 \ + --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & wait_for_health 9000 + + +# send request +sleep 10 # make sure server is registered to router +port=9000 +curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 20, + "stream": true +}' diff --git a/examples/splitwise/start_v2_tp2.sh b/examples/splitwise/start_v0_tp2.sh similarity index 59% rename from examples/splitwise/start_v2_tp2.sh rename to examples/splitwise/start_v0_tp2.sh index 5563b2f4c98..cb2015ec4ac 100644 --- a/examples/splitwise/start_v2_tp2.sh +++ b/examples/splitwise/start_v0_tp2.sh @@ -2,9 +2,9 @@ set -e # Test splitwise deployment -# v0 requires prefill and decode in one node and it uses local scheduler -# v1 supports prefill and decode in multi node and it uses splitwise scheduler -# v2 supports prefill and decode in multi node and it uses router and local scheduler +# There are two methods for splitwise deployment: +# v0: using splitwise_scheduler or dp_scheduler +# v1: using local_scheduler + router wait_for_health() { local server_port=$1 @@ -21,7 +21,6 @@ wait_for_health() { # prepare environment MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" export FD_DEBUG=1 export ENABLE_V1_KVCACHE_SCHEDULER=0 @@ -39,16 +38,14 @@ fi unset http_proxy && unset https_proxy rm -rf log_* -# start router -export FD_LOG_DIR="log_router" -mkdir -p ${FD_LOG_DIR} - -echo "start router" -router_port=9000 -nohup python -m fastdeploy.router.launch \ - --port ${router_port} \ - --splitwise \ - 2>&1 >${FD_LOG_DIR}/nohup & +# start redis +if ! redis-cli ping &>/dev/null; then + echo "Redis is not running. Starting redis-server..." + redis-server --daemonize yes + sleep 1 +else + echo "Redis is already running." 
+fi sleep 1 # start prefill @@ -56,41 +53,59 @@ export CUDA_VISIBLE_DEVICES=0,1 export FD_LOG_DIR="log_prefill" mkdir -p ${FD_LOG_DIR} -echo "start prefill" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port 8100 \ --metrics-port 8101 \ --engine-worker-queue-port 8102 \ --cache-queue-port 8103 \ - --tensor-parallel-size 2 \ --max-model-len 32768 \ + --tensor-parallel-size 2 \ --splitwise-role "prefill" \ + --cache-transfer-protocol "rdma,ipc" \ --pd-comm-port 8104 \ --rdma-comm-ports 8105,8106 \ - --router "0.0.0.0:${router_port}" \ + --scheduler-name "splitwise" \ + --scheduler-host "127.0.0.1" \ + --scheduler-port 6379 \ + --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & - -wait_for_health 8100 +# wait_for_health 8100 # start decode export CUDA_VISIBLE_DEVICES=2,3 export FD_LOG_DIR="log_decode" mkdir -p ${FD_LOG_DIR} -echo "start decode" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8200 \ - --metrics-port 8201 \ - --engine-worker-queue-port 8202 \ - --cache-queue-port 8203 \ + --port 9000 \ + --metrics-port 9001 \ + --engine-worker-queue-port 9002 \ + --cache-queue-port 9003 \ --max-model-len 32768 \ --tensor-parallel-size 2 \ --splitwise-role "decode" \ - --pd-comm-port 8204 \ - --rdma-comm-ports 8205,8206 \ - --router "0.0.0.0:${router_port}" \ + --cache-transfer-protocol "rdma,ipc" \ + --pd-comm-port 9004 \ + --rdma-comm-ports 9005,9006 \ + --scheduler-name "splitwise" \ + --scheduler-host "127.0.0.1" \ + --scheduler-port 6379 \ + --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & +wait_for_health 9000 + -wait_for_health 8200 +# send request +sleep 10 # make sure server is registered to router +port=9000 +curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 20, + "stream": true +}' diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh index 12377404c1d..523ce15b010 100644 --- a/examples/splitwise/start_v1_tp1.sh +++ b/examples/splitwise/start_v1_tp1.sh @@ -2,9 +2,9 @@ set -e # Test splitwise deployment -# v0 requires prefill and decode in one node and it uses local scheduler -# v1 supports prefill and decode in multi node and it uses splitwise scheduler -# v2 supports prefill and decode in multi node and it uses router and local scheduler +# There are two methods for splitwise deployment: +# v0: using splitwise_scheduler or dp_scheduler +# v1: using local_scheduler + router wait_for_health() { local server_port=$1 @@ -21,10 +21,9 @@ wait_for_health() { # prepare environment MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=0 +export ENABLE_V1_KVCACHE_SCHEDULER=1 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 SCRIPT_PATH=$(readlink -f "$0") @@ -39,14 +38,15 @@ fi unset http_proxy && unset https_proxy rm -rf log_* -# start redis -if ! redis-cli ping &>/dev/null; then - echo "Redis is not running. Starting redis-server..." - redis-server --daemonize yes - sleep 1 -else - echo "Redis is already running." 
-fi +# start router +export FD_LOG_DIR="log_router" +mkdir -p ${FD_LOG_DIR} + +router_port=9000 +nohup python -m fastdeploy.router.launch \ + --port ${router_port} \ + --splitwise \ + 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 # start prefill @@ -62,15 +62,14 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \ --cache-queue-port 8103 \ --max-model-len 32768 \ --splitwise-role "prefill" \ - --cache-transfer-protocol "rdma,ipc" \ + --cache-transfer-protocol "rdma" \ --rdma-comm-ports 8104 \ --pd-comm-port 8105 \ - --scheduler-name "splitwise" \ - --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ - --scheduler-ttl 9000 \ + --num-gpu-blocks-override 2000 \ + --router "0.0.0.0:${router_port}" \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 8100 + +# wait_for_health 8100 # start decode export CUDA_VISIBLE_DEVICES=1 @@ -79,18 +78,29 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 9000 \ - --metrics-port 9001 \ - --engine-worker-queue-port 9002 \ - --cache-queue-port 9003 \ + --port 8200 \ + --metrics-port 8201 \ + --engine-worker-queue-port 8202 \ + --cache-queue-port 8203 \ --max-model-len 32768 \ --splitwise-role "decode" \ - --cache-transfer-protocol "rdma,ipc" \ - --rdma-comm-ports 9004 \ - --pd-comm-port 9005 \ - --scheduler-name "splitwise" \ - --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ - --scheduler-ttl 9000 \ + --cache-transfer-protocol "rdma" \ + --rdma-comm-ports 8204 \ + --pd-comm-port 8205 \ + --router "0.0.0.0:${router_port}" \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 9000 + +wait_for_health 8200 + +# send request +sleep 10 # make sure server is registered to router +port=9000 +curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 20, + "stream": true +}' diff --git a/examples/splitwise/start_v1_tp2.sh b/examples/splitwise/start_v1_tp2.sh index cf0b728064a..c58a8a9cead 100644 --- a/examples/splitwise/start_v1_tp2.sh +++ b/examples/splitwise/start_v1_tp2.sh @@ -2,9 +2,9 @@ set -e # Test splitwise deployment -# v0 requires prefill and decode in one node and it uses local scheduler -# v1 supports prefill and decode in multi node and it uses splitwise scheduler -# v2 supports prefill and decode in multi node and it uses router and local scheduler +# There are two methods for splitwise deployment: +# v0: using splitwise_scheduler or dp_scheduler +# v1: using local_scheduler + router wait_for_health() { local server_port=$1 @@ -21,7 +21,6 @@ wait_for_health() { # prepare environment MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" export FD_DEBUG=1 export ENABLE_V1_KVCACHE_SCHEDULER=0 @@ -39,14 +38,16 @@ fi unset http_proxy && unset https_proxy rm -rf log_* -# start redis -if ! redis-cli ping &>/dev/null; then - echo "Redis is not running. Starting redis-server..." - redis-server --daemonize yes - sleep 1 -else - echo "Redis is already running." 
-fi +# start router +export FD_LOG_DIR="log_router" +mkdir -p ${FD_LOG_DIR} + +echo "start router" +router_port=9000 +nohup python -m fastdeploy.router.launch \ + --port ${router_port} \ + --splitwise \ + 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 # start prefill @@ -54,45 +55,56 @@ export CUDA_VISIBLE_DEVICES=0,1 export FD_LOG_DIR="log_prefill" mkdir -p ${FD_LOG_DIR} +echo "start prefill" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port 8100 \ --metrics-port 8101 \ --engine-worker-queue-port 8102 \ --cache-queue-port 8103 \ - --max-model-len 32768 \ --tensor-parallel-size 2 \ + --max-model-len 32768 \ --splitwise-role "prefill" \ - --cache-transfer-protocol "rdma,ipc" \ --pd-comm-port 8104 \ --rdma-comm-ports 8105,8106 \ - --scheduler-name "splitwise" \ - --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ - --scheduler-ttl 9000 \ + --router "0.0.0.0:${router_port}" \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 8100 + +# wait_for_health 8100 # start decode export CUDA_VISIBLE_DEVICES=2,3 export FD_LOG_DIR="log_decode" mkdir -p ${FD_LOG_DIR} +echo "start decode" nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 9000 \ - --metrics-port 9001 \ - --engine-worker-queue-port 9002 \ - --cache-queue-port 9003 \ + --port 8200 \ + --metrics-port 8201 \ + --engine-worker-queue-port 8202 \ + --cache-queue-port 8203 \ --max-model-len 32768 \ --tensor-parallel-size 2 \ --splitwise-role "decode" \ - --cache-transfer-protocol "rdma,ipc" \ - --pd-comm-port 9004 \ - --rdma-comm-ports 9005,9006 \ - --scheduler-name "splitwise" \ - --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ - --scheduler-ttl 9000 \ + --pd-comm-port 8204 \ + --rdma-comm-ports 8205,8206 \ + --router "0.0.0.0:${router_port}" \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 9000 + +wait_for_health 8200 + + + +# send request +sleep 10 # make sure server is registered to router +port=9000 +curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 20, + "stream": true +}' diff --git a/examples/splitwise/start_v2_tp1.sh b/examples/splitwise/start_v2_tp1.sh deleted file mode 100644 index 78a0358f957..00000000000 --- a/examples/splitwise/start_v2_tp1.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -set -e - -# Test splitwise deployment -# v0 requires prefill and decode in one node and it uses local scheduler -# v1 supports prefill and decode in multi node and it uses splitwise scheduler -# v2 supports prefill and decode in multi node and it uses router and local scheduler - -wait_for_health() { - local server_port=$1 - while true; do - status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000") - if [ "$status_code" -eq 200 ]; then - break - else - echo "Service not ready. Retrying in 2s..." 
- sleep 2 - fi - done -} - -# prepare environment -MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" -# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle" - -export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=0 -export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 - -SCRIPT_PATH=$(readlink -f "$0") -SCRIPT_DIR=$(dirname "$SCRIPT_PATH") -export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu) -echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}" -if [ -z "${KVCACHE_RDMA_NICS}" ]; then - echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh" - exit 1 -fi - -unset http_proxy && unset https_proxy -rm -rf log_* - -# start router -export FD_LOG_DIR="log_router" -mkdir -p ${FD_LOG_DIR} - -router_port=9000 -nohup python -m fastdeploy.router.launch \ - --port ${router_port} \ - --splitwise \ - 2>&1 >${FD_LOG_DIR}/nohup & -sleep 1 - -# start prefill -export CUDA_VISIBLE_DEVICES=0 -export FD_LOG_DIR="log_prefill" -mkdir -p ${FD_LOG_DIR} - -nohup python -m fastdeploy.entrypoints.openai.api_server \ - --model ${MODEL_NAME} \ - --port 8100 \ - --metrics-port 8101 \ - --engine-worker-queue-port 8102 \ - --cache-queue-port 8103 \ - --max-model-len 32768 \ - --splitwise-role "prefill" \ - --cache-transfer-protocol "ipc,rdma" \ - --rdma-comm-ports 8104 \ - --pd-comm-port 8105 \ - --router "0.0.0.0:${router_port}" \ - 2>&1 >${FD_LOG_DIR}/nohup & - -wait_for_health 8100 - -# start decode -export CUDA_VISIBLE_DEVICES=1 -export FD_LOG_DIR="log_decode" -mkdir -p ${FD_LOG_DIR} - -nohup python -m fastdeploy.entrypoints.openai.api_server \ - --model ${MODEL_NAME} \ - --port 8200 \ - --metrics-port 8201 \ - --engine-worker-queue-port 8202 \ - --cache-queue-port 8203 \ - --max-model-len 32768 \ - --splitwise-role "decode" \ - --cache-transfer-protocol "ipc,rdma" \ - --rdma-comm-ports 8204 \ - --pd-comm-port 8205 \ - --router "0.0.0.0:${router_port}" \ - 2>&1 >${FD_LOG_DIR}/nohup & - -wait_for_health 8200 diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 7717afa3a6b..187b41f0e81 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -622,8 +622,11 @@ def prefill_layerwise_send_cache_thread(self): target_id = int(task["rdma_ports"][self.rank]) if "error" in task["status"]: continue + logger.debug("rdma, start connect decode") status = self.messager[current_transfer_protocol].connect(target_ip, target_id) - if not status: + if status: + logger.info(f"connect to {target_ip}:{target_id} success") + else: logger.error(f"connect to {target_ip}:{target_id} failed") task["status"] = "connection error" continue diff --git a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py index 61a4fa10b06..e87c77f277e 100644 --- a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py @@ -51,7 +51,6 @@ def __init__(self, rank_id_, remote_gpu_id_, layer_num, local_gpu_id_): self.remote_key_tensor_ptr_list.append(get_data_ptr_ipc(tmp, key_unique_name)) self.remote_value_tensor_ptr_list.append(get_data_ptr_ipc(tmp, value_unique_name)) self.write_stream = paddle.device.Stream(f"gpu:{self.local_gpu_id}") - self.finish_event = paddle.device.Event() class IPCCommManager: diff --git a/fastdeploy/config.py b/fastdeploy/config.py index f7b3fe0b9e6..5ec3df934ac 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -20,7 +20,7 @@ import os from dataclasses import field from 
enum import Enum -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, Literal, Optional, Union import paddle import paddle.distributed as dist @@ -1453,7 +1453,6 @@ def __init__( use_warmup: bool = False, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - innode_prefill_ports: Optional[List[int]] = None, max_num_partial_prefills: int = 1, max_long_partial_prefills: int = 1, long_prefill_token_threshold: int = 0, @@ -1517,13 +1516,10 @@ def __init__( self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.use_warmup = use_warmup - self.innode_prefill_ports = innode_prefill_ports self.max_num_partial_prefills = max_num_partial_prefills self.max_long_partial_prefills = max_long_partial_prefills self.long_prefill_token_threshold = long_prefill_token_threshold - self._str_to_list("innode_prefill_ports", int) - if envs.FD_FOR_TORCH_MODEL_FORMAT: self.model_config.model_format = "torch" @@ -1773,23 +1769,15 @@ def init_cache_info(self): """ initialize cache info """ - # TODO: group the splitiwse params, remove code of v0 - # v0 requires prefill and decode in one node and it uses local scheduler - # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler - # v2 supports prefill and decode in multi node and it uses router and local scheduler + # TODO: group the splitiwse params + # There are two methods for splitwise deployment: + # 1. v0 splitwise_scheduler or dp_scheduler + # 2. v1 local_scheduler + router self.splitwise_version = None - if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None): + if self.scheduler_config.name in ("splitwise", "dp"): self.splitwise_version = "v0" - elif self.scheduler_config.name in ("splitwise", "dp"): - self.splitwise_version = "v1" elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: - self.splitwise_version = "v2" - else: - raise ValueError( - f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, " - f"router_config: {self.router_config}" - ) - logger.info(f"splitwise_version: {self.splitwise_version}") + self.splitwise_version = "v1" if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)): engine_worker_queue_port = self.parallel_config.engine_worker_queue_port diff --git a/fastdeploy/demo/offline_disaggregated_demo.py b/fastdeploy/demo/offline_disaggregated_demo.py deleted file mode 100644 index 26e34794168..00000000000 --- a/fastdeploy/demo/offline_disaggregated_demo.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" - -import multiprocessing -import os -import time - -from fastdeploy.entrypoints.llm import LLM - -model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle" - - -def start_decode(model_name_or_path): - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - os.environ["FD_LOG_DIR"] = "log_decode" - llm_decode = LLM( - model=model_name_or_path, - tensor_parallel_size=1, - splitwise_role="decode", - engine_worker_queue_port=6678, - innode_prefill_ports=[6677], - cache_queue_port=55668, - ) - return llm_decode - - -def start_prefill(model_name_or_path): - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - os.environ["FD_LOG_DIR"] = "log_prefill" - LLM( - model=model_name_or_path, - tensor_parallel_size=1, - splitwise_role="prefill", - engine_worker_queue_port=6677, - cache_queue_port=55667, - ) - - -def main(): - prefill = multiprocessing.Process(target=start_prefill, args=(model_name_or_path,)).start() - time.sleep(10) - llm_decode = start_decode(model_name_or_path) - - output = llm_decode.generate(prompts=["who are you?", "what can you do?"], use_tqdm=True) - print(output) - - prefill.join() - - -if __name__ == "__main__": - main() diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 82c4af9d77b..b5b10ca6ca3 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -296,11 +296,6 @@ class EngineArgs: Port for splitwise communication. """ - innode_prefill_ports: Optional[List[int]] = None - """ - Ports for innode dispatch request. - """ - rdma_comm_ports: Optional[List[int]] = None """ Ports for rdma communication. @@ -500,8 +495,33 @@ def __post_init__(self): if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER: raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1") - if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma": - envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 + if self.splitwise_role != "mixed": + if self.scheduler_name == "local" and self.router is None: + raise ValueError( + f"When using {self.splitwise_role} role and the {self.scheduler_name} " + f"scheduler, please provide --router argument." + ) + + if "rdma" in self.cache_transfer_protocol: + if self.rdma_comm_ports is None: + raise ValueError( + "Please set --rdma_comm_ports argument when using " "rdma cache transfer protocol." + ) + if len(self.rdma_comm_ports) != self.tensor_parallel_size: + raise ValueError("The number of rdma comm ports must be equal to tensor parallel size.") + + if envs.ENABLE_V1_KVCACHE_SCHEDULER == 1: + if "ipc" in self.cache_transfer_protocol: + # FIXME: support ipc cache transfer protocol + raise NotImplementedError( + "only support rdma cache transfer protocol " "when using ENABLE_V1_KVCACHE_SCHEDULER." + ) + # FIXME: fix this bug + if self.splitwise_role == "prefill" and self.num_gpu_blocks_override is None: + raise NotImplementedError( + "please set num_gpu_blocks_override for prefill " "instance using ENABLE_V1_KVCACHE_SCHEDULER." + ) + if not current_platform.is_cuda() and not current_platform.is_xpu(): envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 if self.guided_decoding_backend != "off": @@ -931,13 +951,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'mixed'. 
(prefill, decode, mixed)", ) - splitwise_group.add_argument( - "--innode-prefill-ports", - type=lambda s: s.split(",") if s else None, - default=EngineArgs.innode_prefill_ports, - help="port for innode prefill, only used in single machine splitwise deployment", - ) - splitwise_group.add_argument( "--cache-transfer-protocol", type=str, @@ -1233,7 +1246,6 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig: limit_mm_per_prompt=self.limit_mm_per_prompt, mm_processor_kwargs=self.mm_processor_kwargs, tool_parser=self.tool_call_parser, - innode_prefill_ports=self.innode_prefill_ports, max_num_partial_prefills=self.max_num_partial_prefills, max_long_partial_prefills=self.max_long_partial_prefills, long_prefill_token_threshold=self.long_prefill_token_threshold, diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py index 240e1620d06..a2306c66534 100644 --- a/fastdeploy/engine/async_llm.py +++ b/fastdeploy/engine/async_llm.py @@ -899,8 +899,6 @@ def check_health(self, time_interval_threashold=30): def launch_components(self): if self.cfg.scheduler_config.splitwise_role != "mixed": - # 单机逻辑 - self.engine_service.engine_worker_queue.available_prefill_instances.put(1) self.engine_service.split_mode_get_tasks() if self.cfg.scheduler_config.name == "splitwise": self.splitwise_receive_thread = threading.Thread( diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index b88a4d0f054..583005a5637 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -160,8 +160,8 @@ def start(self): self.insert_task_to_worker_thread.start() self.token_processor.tasks_queue = self.engine_worker_queue self.token_processor.run() - if self.cfg.scheduler_config.splitwise_role != "mixed": - self._process_splitwise_task() + if self.cfg.scheduler_config.splitwise_role == "decode": + self._decode_process_splitwise_requests() self._register_to_router() @@ -329,54 +329,13 @@ def start_worker_queue_service(self, start_queue): local_data_parallel_id=self.cfg.parallel_config.local_data_parallel_id, ) - def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current_id=-1, allocated=False): + def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current_id=-1): """ Insert tasks to engine. 
""" for task in tasks: start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER) - # TODO 返回至 scheduler - if allocated: - current_tasks = [] - for task in tasks: - cur_task_idx = self.resource_manager.req_dict[task.request_id] - del self.resource_manager.req_dict[task.request_id] - cur_task = self.resource_manager.tasks_list[cur_task_idx] - if envs.FD_ENABLE_INTERNAL_ADAPTER: - if not task.outputs.token_ids: # first token is eos in Prefill, just recycle resource and continue - self.resource_manager.stop_flags[cur_task_idx] = True - self.resource_manager.tasks_list[cur_task_idx] = None - self.resource_manager._recycle_block_tables(cur_task) - if task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[task.request_id] - self.llm_logger.warning(f"{task.request_id} need not decode after first token") - continue - cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] - cur_task.num_cached_tokens = task.num_cached_tokens - if ( - self.cfg.speculative_config.method in ["mtp"] - and self.cfg.scheduler_config.splitwise_role == "decode" - ): - cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) - if task.error_code != 200: - self.resource_manager.stop_flags[cur_task_idx] = True - self.resource_manager.tasks_list[cur_task_idx] = None - self.resource_manager._recycle_block_tables(cur_task) - if task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[task.request_id] - self.scheduler.put_results([task]) - self.llm_logger.warning( - f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource." - ) - continue - self.token_processor.tokens_counter[task.request_id] = 1 - current_tasks.append(cur_task) - if current_tasks: - self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz)) - self.llm_logger.debug(f"put task to engine worker queue, task:{current_tasks}") - return True - self.resource_manager.check_and_free_block_tables() if not isinstance(tasks, list): @@ -445,8 +404,53 @@ def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current else: self.update_mm_requests_chunk_size(tasks) self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) - if is_prefill and self.cfg.scheduler_config.name != "splitwise": - self.engine_worker_queue.available_prefill_instances.put(1) + return True + + def _insert_prefilled_requests(self, request_outputs: List[RequestOutput]): + """ + insert prefilled requests into engine worker queue. 

+        Args:
+            request_outputs: a list of RequestOutput objects sent by the prefill instance
+        """
+        to_infer_reqs = []
+        for req_out in request_outputs:
+            solt_idx = self.resource_manager.req_dict[req_out.request_id]
+            del self.resource_manager.req_dict[req_out.request_id]
+            cur_req = self.resource_manager.tasks_list[solt_idx]
+
+            if envs.FD_ENABLE_INTERNAL_ADAPTER:
+                if not req_out.outputs.token_ids:  # first token is eos in Prefill, just recycle resource and continue
+                    self.resource_manager.stop_flags[solt_idx] = True
+                    self.resource_manager.tasks_list[solt_idx] = None
+                    self.resource_manager._recycle_block_tables(cur_req)
+                    if req_out.request_id in self.token_processor.tokens_counter:
+                        del self.token_processor.tokens_counter[req_out.request_id]
+                    self.llm_logger.warning(f"{req_out.request_id} need not decode after first token")
+                    continue
+
+            cur_req.prompt_token_ids[0] = req_out.outputs.token_ids[0]
+            cur_req.num_cached_tokens = req_out.num_cached_tokens
+            if self.cfg.speculative_config.method in ["mtp"] and self.cfg.scheduler_config.splitwise_role == "decode":
+                cur_req.draft_token_ids = copy.deepcopy(req_out.outputs.draft_token_ids)
+
+            if req_out.error_code != 200:
+                self.resource_manager.stop_flags[solt_idx] = True
+                self.resource_manager.tasks_list[solt_idx] = None
+                self.resource_manager._recycle_block_tables(cur_req)
+                if req_out.request_id in self.token_processor.tokens_counter:
+                    del self.token_processor.tokens_counter[req_out.request_id]
+                self.scheduler.put_results([req_out])
+                self.llm_logger.warning(
+                    f"{req_out.request_id} prefill failed with msg:{req_out.error_msg}, recycle resource."
+                )
+                continue
+
+            self.token_processor.tokens_counter[req_out.request_id] = 1
+            to_infer_reqs.append(cur_req)
+
+        if to_infer_reqs:
+            self.engine_worker_queue.put_tasks((to_infer_reqs, self.resource_manager.real_bsz))
+            self.llm_logger.debug(f"put requests to engine worker queue, task:{to_infer_reqs}")
         return True
 
     def task_is_finished(self, index):
@@ -636,8 +640,9 @@ def _schedule_request_to_worker(self):
             if len(tasks) == 0:
                 time.sleep(0.001)
                 continue
-            if self.cfg.splitwise_version == "v2" and self.cfg.scheduler_config.splitwise_role == "decode":
-                # the task in decode instance will processed in _process_splitwise_task thread
+            if self.cfg.scheduler_config.splitwise_role == "decode":
+                # Decode inserts requests sent by prefill into the engine,
+                # so tasks sent by the client are ignored here
                 continue
 
             llm_logger.debug(f"get tasks from scheduler: {tasks}")
@@ -684,7 +689,14 @@ def _fetch_request():
                     max_num_batched_tokens=max_num_batched_tokens,
                     batch=num_prefill_batch,
                 )
-                self.llm_logger.debug(f"get tasks from scheduler: {tasks}")
+
+                if self.cfg.scheduler_config.splitwise_role == "decode":
+                    # Decode inserts requests sent by prefill into the engine,
+                    # so tasks sent by the client are ignored here
+                    is_fetching = False
+                    return
+
+                self.llm_logger.debug(f"get tasks from {type(self.scheduler)}: {tasks}")
                 if self.cfg.scheduler_config.splitwise_role != "mixed":
                     need_delete_tasks = []
                     if envs.FD_OFFLINE_PERF_TEST_FOR_PD:
@@ -705,6 +717,7 @@ def _fetch_request():
                         for task in tasks:
                             # assure can allocate block ids in P
                             while not self.resource_manager.preallocate_resource_in_p(task):
+                                self.llm_logger.info("wait for preallocate_resource_in_p")
                                 time.sleep(0.005)
                             self.llm_logger.info(f"ask D resource for req_id: {task.request_id}")
                             self.split_connector.send_splitwise_tasks([task], task.idx)
@@ -864,7 +877,7 @@ def _insert_zmq_task_to_scheduler(self):
                         request.llm_engine_recv_req_timestamp = time.time() 
start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER) main_process_metrics.requests_number.inc() - self.llm_logger.debug(f"Receive request: {request}") + self.llm_logger.debug(f"Receive request from api server: {request}") except Exception as e: self.llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}") err_msg = str(e) @@ -997,156 +1010,126 @@ def _zmq_send_generated_tokens(self): except Exception as e: llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}") - def _process_splitwise_task(self): + def _decode_process_splitwise_requests(self): """ - Processing tasks from engine worker queue in splitwise deployment. - For v0 version, prefill instance gets tasks from engine worker queue. - For v1 and v2 version, decode instance gets raw tasks from engine worker queue to preallocate resources, - and decode instance gets prefilled tasks from engine worker queue to generate tokens. - TODO: unifiy the communication between decode and prefill instances. + Decode processes requests from engine worker queue, which are sent by prefill. + TODO: merge this function to the schedule function in resource manager """ + allocate_resource_requests: list[Request] = [] + prefilled_request_ouputs: list[RequestOutput] = [] - def receiver_loop(): - waiting_resource_requests = [] - waiting_ready_tasks = [] + def _fetch_requests(): + if self.engine_worker_queue.disaggregate_queue_empty(): + return - # Waiting for the api_server and scheduler in decode to - # receive the request sent by the client - def _decode_process_prefilled_task_v0_scheduler(input_tasks): - ready_tasks = [] - waiting_tasks = [] - for task in input_tasks: - if not hasattr(self.scheduler, "has_request") or self.scheduler.has_request(task.request_id): - ready_tasks.append(task) + items = self.engine_worker_queue.get_disaggregated_tasks() + for item in items: + tasks = item[1] + if isinstance(tasks[0], Request): + self.llm_logger.debug(f"receive tasks to preallocate resource, {tasks}") + allocate_resource_requests.extend(tasks) + elif isinstance(tasks[0], RequestOutput): + self.llm_logger.debug(f"receive prefilled tasks, {tasks}") + if not isinstance(tasks, list): + tasks = [tasks] + for task in tasks: + task.finished = False + prefilled_request_ouputs.extend(tasks) + + def _process_allocate_resource_requests(): + processed_indices = [] + for idx, task in enumerate(allocate_resource_requests): + is_success = False + + if envs.ENABLE_V1_KVCACHE_SCHEDULER: + if self.resource_manager.preallocate_resource_in_d(task): + self.llm_logger.info(f"Resource available, processing task {task.request_id}") + self.split_connector.send_cache_infos([task], -1) + processed_indices.append(idx) + is_success = True + else: + if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): + self.llm_logger.info(f"Resource available, processing task {task.request_id}") + self.insert_tasks([task]) + processed_indices.append(idx) + is_success = True + + if not is_success: + if not self.enable_decode_cache_task: + task.error_msg = "Not enough resources" + self.split_connector.send_cache_infos([task], -1) + processed_indices.append(idx) else: - waiting_tasks.append(task) - self.insert_tasks(ready_tasks, allocated=True) - if self.cfg.splitwise_version in ("v0", "v2"): - self.scheduler.put_results(ready_tasks) - return waiting_tasks + self.llm_logger.debug(f"Still waiting for resources {task.request_id}") + break - while self.running: - try: - processed_indices = [] - for idx, task in 
enumerate(waiting_resource_requests): - if envs.ENABLE_V1_KVCACHE_SCHEDULER: - if self.resource_manager.preallocate_resource_in_d(task): - self.llm_logger.info(f"Resource available, processing task {task.request_id}") - self.split_connector.send_cache_infos([task], -1) - processed_indices.append(idx) - else: - self.llm_logger.debug(f"Still waiting for resources {task.request_id}") - break - else: - if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): - self.insert_tasks([task]) - self.llm_logger.info(f"Resource available, processing task {task.request_id}") - processed_indices.append(idx) - else: - self.llm_logger.debug(f"Still waiting for resources {task.request_id}") - break + for idx in sorted(processed_indices, reverse=True): + allocate_resource_requests.pop(idx) - for idx in sorted(processed_indices, reverse=True): - waiting_resource_requests.pop(idx) + def _process_prefilled_requests(): + nonlocal prefilled_request_ouputs + ready_request_outputs = [] + waiting_request_outputs = [] + # Waiting for the api_server and scheduler in decode to + # receive the request sent by the client + for task in prefilled_request_ouputs: + if not hasattr(self.scheduler, "has_request") or self.scheduler.has_request(task.request_id): + ready_request_outputs.append(task) + else: + waiting_request_outputs.append(task) - waiting_ready_tasks = _decode_process_prefilled_task_v0_scheduler(waiting_ready_tasks) + prefilled_request_ouputs = waiting_request_outputs + if self.cfg.splitwise_version == "v1": + # decode return first token to client + self.scheduler.put_results(ready_request_outputs) - if self.engine_worker_queue.disaggregate_queue_empty(): - time.sleep(0.001) - else: - items = self.engine_worker_queue.get_disaggregated_tasks() - for item in items: - role = item[0] - tasks = item[1] - - # prefill instance gets tasks from engine worker queue - if role == "prefill": - for task in tasks: - task.max_tokens = task.min_tokens = 2 - self.insert_tasks(tasks) - # decode instance gets tasks from engine worker queue - elif role == "decode": - if isinstance(tasks[0], RequestOutput): - self.llm_logger.debug(f"receive prefilled tasks, {tasks}") - if not isinstance(tasks, list): - tasks = [tasks] - for task in tasks: - task.finished = False - if envs.ENABLE_V1_KVCACHE_SCHEDULER: - for task in tasks: - if envs.FD_ENABLE_INTERNAL_ADAPTER: - if ( - not task.outputs.token_ids - ): # first token is eos in Prefill, just recycle resource and continue - cur_task = self.resource_manager.requests[task.request_id] - self.resource_manager.stop_flags[cur_task.idx] = True - self.resource_manager.tasks_list[cur_task.idx] = None - self.resource_manager._free_blocks(cur_task) - if cur_task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[task.request_id] - self.llm_logger.warning( - f"{task.request_id} need not decode after first token" - ) - del self.resource_manager.requests[task.request_id] - del self.resource_manager.req_dict[task.request_id] - continue - if task.error_code != 200: - cur_task = self.resource_manager.requests[task.request_id] - self.resource_manager.stop_flags[cur_task.idx] = True - self.resource_manager.tasks_list[cur_task.idx] = None - self.resource_manager._free_blocks(cur_task) - if cur_task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[task.request_id] - self.scheduler.put_results([task]) - self.llm_logger.warning( - f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource." 
- ) - continue - self.token_processor.tokens_counter[task.request_id] = 1 - self.resource_manager.insert_task_for_decoding(task) - - else: - waiting_ready_tasks.extend(_decode_process_prefilled_task_v0_scheduler(tasks)) - elif isinstance(tasks[0], Request): - self.llm_logger.debug(f"receive tasks to preallocate resource, {tasks}") - if len(waiting_resource_requests): - self.llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}") - waiting_resource_requests.extend(tasks) - else: - new_waiting = [] - for task in tasks: - can_allocate_resource = False - if envs.ENABLE_V1_KVCACHE_SCHEDULER: - if self.resource_manager.preallocate_resource_in_d(task): - self.split_connector.send_cache_infos([task], -1) - can_allocate_resource = True - else: - if self.resource_manager.is_resource_sufficient( - task.prompt_token_ids_len - ): - self.insert_tasks([task]) - can_allocate_resource = True - if can_allocate_resource is False: - if not self.enable_decode_cache_task: - task.error_msg = "Not enough resources" - new_waiting.append(task) - - if new_waiting: - if not self.enable_decode_cache_task: - self.split_connector.send_cache_infos(new_waiting, -1) - else: - waiting_resource_requests.extend(new_waiting) - self.llm_logger.info( - f"Added {len(new_waiting)} tasks to waiting queue" - ) - else: - raise ValueError(f"Unsupported task type: {type(tasks[0])}") + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: + self._insert_prefilled_requests(ready_request_outputs) + else: + for task in ready_request_outputs: + if envs.FD_ENABLE_INTERNAL_ADAPTER: + if ( + not task.outputs.token_ids + ): # first token is eos in Prefill, just recycle resource and continue + cur_req = self.resource_manager.requests[task.request_id] + self.resource_manager.stop_flags[cur_req.idx] = True + self.resource_manager.tasks_list[cur_req.idx] = None + self.resource_manager._free_blocks(cur_req) + if cur_req.request_id in self.token_processor.tokens_counter: + del self.token_processor.tokens_counter[task.request_id] + self.llm_logger.warning(f"{task.request_id} need not decode after first token") + del self.resource_manager.requests[task.request_id] + del self.resource_manager.req_dict[task.request_id] + continue + if task.error_code != 200: + cur_req = self.resource_manager.requests[task.request_id] + self.resource_manager.stop_flags[cur_req.idx] = True + self.resource_manager.tasks_list[cur_req.idx] = None + self.resource_manager._free_blocks(cur_req) + if cur_req.request_id in self.token_processor.tokens_counter: + del self.token_processor.tokens_counter[task.request_id] + self.scheduler.put_results([task]) + self.llm_logger.warning( + f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource." 
+ ) + continue + self.token_processor.tokens_counter[task.request_id] = 1 + self.resource_manager.insert_task_for_decoding(task) + def decode_loop(): + while self.running: + try: + _fetch_requests() + _process_allocate_resource_requests() + _process_prefilled_requests() + time.sleep(0.001) except Exception as e: - self.llm_logger.error(f"Error in main loop: {e}") - time.sleep(0.1) + self.llm_logger.error( + f"Error in main loop of decode_process_splitwise_requests: " f"{e}, {traceback.format_exc()}" + ) + time.sleep(0.01) - threading.Thread(target=receiver_loop, daemon=True).start() + threading.Thread(target=decode_loop, daemon=True).start() def start_cache_service(self, device_ids, ipc_signal_suffix): return self.resource_manager.cache_manager.launch_cache_manager( diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index c7d40c557e5..dcf3f8a596a 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -690,8 +690,6 @@ def check_health(self, time_interval_threashold=30): def launch_components(self): if self.cfg.scheduler_config.splitwise_role != "mixed": - # 单机逻辑 - self.engine.engine_worker_queue.available_prefill_instances.put(1) self.splitwise_receive_thread = threading.Thread( target=self.engine.split_connector.start_receiver, args=() ) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 776611560ab..935d83d2b6e 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -100,6 +100,8 @@ def __init__( prefill_start_index: int = 0, prefill_end_index: int = 0, num_computed_tokens: int = 0, + inference_start_time: float = 0, + llm_engine_recv_req_timestamp: float = 0, ) -> None: self.request_id = request_id self.prompt = prompt @@ -217,6 +219,8 @@ def from_dict(cls, d: dict): video_end=d.get("video_end", 0), audio_end=d.get("audio_end", 0), dp_rank=d.get("dp_rank", None), + inference_start_time=d.get("inference_start_time"), + llm_engine_recv_req_timestamp=d.get("llm_engine_recv_req_timestamp"), ) @property diff --git a/fastdeploy/inter_communicator/engine_worker_queue.py b/fastdeploy/inter_communicator/engine_worker_queue.py index ffa9155bab7..f91638bb58b 100644 --- a/fastdeploy/inter_communicator/engine_worker_queue.py +++ b/fastdeploy/inter_communicator/engine_worker_queue.py @@ -287,12 +287,6 @@ class QueueManager(BaseManager): callable=lambda idx: self.disaggregate_requests[idx], ) - self.available_prefill_instances = Queue() - QueueManager.register( - "get_available_prefill_instances", - callable=lambda: self.available_prefill_instances, - ) - QueueManager.register( "get_finish_request_barrier", callable=lambda idx: self.finish_request_barrier[idx], @@ -351,7 +345,6 @@ class QueueManager(BaseManager): QueueManager.register("get_client_read_info_flag") QueueManager.register("get_lock_info") QueueManager.register("get_disaggregate_requests") - QueueManager.register("get_available_prefill_instances") QueueManager.register("get_finish_request_barrier") QueueManager.register("get_finish_add_cache_task_barrier") QueueManager.register("get_connect_task_barrier") @@ -390,7 +383,6 @@ class QueueManager(BaseManager): # p/d 分离获取 self.disaggregate_requests = self.manager.get_disaggregate_requests(self.local_data_parallel_id) - self.available_prefill_instances = self.manager.get_available_prefill_instances() self.finish_request_barrier = self.manager.get_finish_request_barrier(self.local_data_parallel_id) self.finish_add_cache_task_barrier = self.manager.get_finish_add_cache_task_barrier( 
self.local_data_parallel_id @@ -652,15 +644,6 @@ def get_connect_rdma_task_response(self): self.connect_task_response_lock.release() return task_response - def get_prefill_instances(self): - """ - check if the prefill queue is empty - """ - if self.available_prefill_instances.qsize() == 0: - return 0 - else: - return self.available_prefill_instances.get() - def put_cache_info(self, cache_info) -> None: """ Args: diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index b2e89e276e1..1d933461f61 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -498,6 +498,9 @@ def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False self.split_connector.send_first_token(task.disaggregate_info, [result]) break else: + # TODO: Refine checking sending cache and do not keep waiting + if time.time() - start_time > 30: + llm_logger.warning(f"wait for sending cache, {task_id}") time.sleep(0.002) else: if envs.ENABLE_V1_KVCACHE_SCHEDULER: @@ -753,10 +756,8 @@ def _process_batch_output(self): self._recycle_resources(task_id, i, task, result, is_prefill) break - if not (is_prefill and self.cfg.splitwise_version == "v0"): - # NOTE: prefill instance in v0 version does not return result to scheduler - llm_logger.debug(f"get response from infer: {result}") - batch_result.append(result) + llm_logger.debug(f"get response from infer: {result}") + batch_result.append(result) self.postprocess(batch_result, mtype) diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index e5ad4ad8adc..8daab42ddf1 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -175,72 +175,6 @@ def _close_connection(self, addr): self.push_sockets[addr].close() del self.push_sockets[addr] - def has_splitwise_tasks(self): - """ - PD mode: check prefill empty - """ - if self.cfg.innode_prefill_ports is None: - return True - else: - for port in self.cfg.innode_prefill_ports: - if port not in self.connect_innode_instances: - self.create_connection(port) - if self.connect_innode_instances[port].available_prefill_instances.qsize() > 0: - return False - return True - - def dispatch_innode_splitwise_tasks(self, tasks, current_id): - """ - Dispatch splitwise tasks . - - Parameters: - tasks (list): List of tasks. 
- """ - tasks_status = "mixed" - is_changable = envs.FD_PD_CHANGEABLE == "1" - while True: - for port in self.cfg.innode_prefill_ports: - current_port = -1 - if port not in self.connect_innode_instances: - self.create_connection(port) - if self.connect_innode_instances[port].get_prefill_instances() == 1: - for task in tasks: - task.disaggregate_info = { - "role": "prefill", - "transfer_protocol": "ipc", - "cache_info": { - "ipc": { - "ip": "0.0.0.0", - "port": self.cfg.parallel_config.engine_worker_queue_port[self.idx], - "current_id": current_id, - }, - }, - } - self.connect_innode_instances[port].put_disaggregated_tasks(("prefill", tasks)) - current_port = port - - if current_port != -1: - tasks_status = "decode" - break - if current_port != -1 or is_changable: - break - else: - time.sleep(0.005) - - if tasks_status == "decode": - for task in tasks: - task.disaggregate_info = { - "role": tasks_status, - "transfer_protocol": "ipc", - "cache_info": { - "ipc": { - "ip": "0.0.0.0", - "port": current_port, - "current_id": current_id, - }, - }, - } - def send_splitwise_tasks(self, tasks: List[Request], current_id): """ Send splitwise tasks to all connected addresses. @@ -249,10 +183,6 @@ def send_splitwise_tasks(self, tasks: List[Request], current_id): tasks (list): List of tasks. current_id (int): Current ID. """ - - if self.cfg.innode_prefill_ports is not None: - self.dispatch_innode_splitwise_tasks(tasks, current_id) - return addr = None decode_diagg = None for task in tasks: diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index df6604f8d99..14a45c437b7 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -927,8 +927,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Load strategy: {load_config.load_strategy}") - if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma": - envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 if not current_platform.is_cuda() and not current_platform.is_xpu(): logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.") envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 diff --git a/tests/e2e/test_ernie_03b_pd.py b/tests/e2e/test_ernie_03b_pd_router_v0.py similarity index 71% rename from tests/e2e/test_ernie_03b_pd.py rename to tests/e2e/test_ernie_03b_pd_router_v0.py index 7d31a574a9e..3d4e967da8e 100644 --- a/tests/e2e/test_ernie_03b_pd.py +++ b/tests/e2e/test_ernie_03b_pd_router_v0.py @@ -12,23 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+# Test splitwise deployment which uses local_scheduler + router, +# and ENABLE_V1_KVCACHE_SCHEDULER is 0 + import json import os import shutil import signal -import socket import subprocess import sys import time import pytest import requests +from utils.serving_utils import ( + FD_API_PORT, + FD_CACHE_QUEUE_PORT, + FD_ENGINE_QUEUE_PORT, + FD_METRICS_PORT, + clean_ports, + get_registered_number, +) # Read ports from environment variables; use default values if not set -FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) -FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) -FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) -FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) +FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) +FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [ @@ -36,95 +44,15 @@ FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT, + FD_CONNECTOR_PORT, FD_API_PORT + 1, FD_ENGINE_QUEUE_PORT + 1, FD_METRICS_PORT + 1, FD_CACHE_QUEUE_PORT + 1, + FD_CONNECTOR_PORT + 1, + FD_ROUTER_PORT, ] - -def is_port_open(host: str, port: int, timeout=1.0): - """ - Check if a TCP port is open on the given host. - Returns True if connection succeeds, False otherwise. - """ - try: - with socket.create_connection((host, port), timeout): - return True - except Exception: - return False - - -def kill_process_on_port(port: int): - """ - Kill processes that are listening on the given port. - Uses multiple methods to ensure thorough cleanup. - """ - current_pid = os.getpid() - parent_pid = os.getppid() - - # Method 1: Use lsof to find processes - try: - output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() - for pid in output.splitlines(): - pid = int(pid) - if pid in (current_pid, parent_pid): - print(f"Skip killing current process (pid={pid}) on port {port}") - continue - try: - # First try SIGTERM for graceful shutdown - os.kill(pid, signal.SIGTERM) - time.sleep(1) - # Then SIGKILL if still running - os.kill(pid, signal.SIGKILL) - print(f"Killed process on port {port}, pid={pid}") - except ProcessLookupError: - pass # Process already terminated - except subprocess.CalledProcessError: - pass - - # Method 2: Use netstat and fuser as backup - try: - # Find processes using netstat and awk - cmd = f"netstat -tulpn 2>/dev/null | grep :{port} | awk '{{print $7}}' | cut -d'/' -f1" - output = subprocess.check_output(cmd, shell=True).decode().strip() - for pid in output.splitlines(): - if pid and pid.isdigit(): - pid = int(pid) - if pid in (current_pid, parent_pid): - continue - try: - os.kill(pid, signal.SIGKILL) - print(f"Killed process (netstat) on port {port}, pid={pid}") - except ProcessLookupError: - pass - except (subprocess.CalledProcessError, FileNotFoundError): - pass - - # Method 3: Use fuser if available - try: - subprocess.run(f"fuser -k {port}/tcp", shell=True, timeout=5) - except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError): - pass - - -def clean_ports(): - """ - Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
- """ - print(f"Cleaning ports: {PORTS_TO_CLEAN}") - for port in PORTS_TO_CLEAN: - kill_process_on_port(port) - - # Double check and retry if ports are still in use - time.sleep(2) - for port in PORTS_TO_CLEAN: - if is_port_open("127.0.0.1", port, timeout=0.1): - print(f"Port {port} still in use, retrying cleanup...") - kill_process_on_port(port) - time.sleep(1) - - @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -135,9 +63,11 @@ def setup_and_run_server(): - Tears down server after all tests finish """ print("Pre-test port cleanup...") - clean_ports() + clean_ports(PORTS_TO_CLEAN) print("log dir clean ") + if os.path.exists("log_router") and os.path.isdir("log_router"): + shutil.rmtree("log_router") if os.path.exists("log_prefill") and os.path.isdir("log_prefill"): shutil.rmtree("log_prefill") if os.path.exists("log_decode") and os.path.isdir("log_decode"): @@ -150,13 +80,36 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") + # router + print("start router...") + env_router = os.environ.copy() + env_router["FD_LOG_DIR"] = "log_router" + router_log_path = "router.log" + + router_cmd = [ + sys.executable, + "-m", + "fastdeploy.router.launch", + "--port", + str(FD_ROUTER_PORT), + "--splitwise", + ] + + with open(router_log_path, "w") as logfile: + process_router = subprocess.Popen( + router_cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, # Enables killing full group via os.killpg + env=env_router, + ) + # prefill实例 print("start prefill...") env_prefill = os.environ.copy() env_prefill["CUDA_VISIBLE_DEVICES"] = "0" env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" env_prefill["FD_LOG_DIR"] = "log_prefill" - env_prefill["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT) prefill_log_path = "server.log" prefill_cmd = [ sys.executable, @@ -182,6 +135,12 @@ def setup_and_run_server(): "wint8", "--splitwise-role", "prefill", + "--cache-transfer-protocol", + "ipc", + "--pd-comm-port", + str(FD_CONNECTOR_PORT), + "--router", + f"0.0.0.0:{FD_ROUTER_PORT}", ] # Start subprocess in new process group @@ -193,14 +152,13 @@ def setup_and_run_server(): start_new_session=True, # Enables killing full group via os.killpg env=env_prefill, ) - time.sleep(3) + time.sleep(1) # decode实例 print("start decode...") env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" - env_decode["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1) + env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" env_decode["FD_LOG_DIR"] = "log_decode" decode_log_path = "decode_server.log" decode_cmd = [ @@ -227,8 +185,12 @@ def setup_and_run_server(): "wint8", "--splitwise-role", "decode", - "--innode-prefill-ports", - str(FD_ENGINE_QUEUE_PORT), + "--cache-transfer-protocol", + "ipc", + "--pd-comm-port", + str(FD_CONNECTOR_PORT + 1), + "--router", + f"0.0.0.0:{FD_ROUTER_PORT}", ] # Start subprocess in new process group @@ -242,13 +204,12 @@ def setup_and_run_server(): ) # Wait up to 300 seconds for API server to be ready - for _ in range(300): - if is_port_open("127.0.0.1", FD_API_PORT): - if is_port_open("127.0.0.1", FD_API_PORT + 1): - print(f"Prefill server is up on port {FD_API_PORT}") - print(f"Decode server is up on port {FD_API_PORT + 1}") - break - time.sleep(1) + for _ in range(60): + registered_numbers = get_registered_number(f"0.0.0.0:{FD_ROUTER_PORT}") + if registered_numbers["prefill"] >= 1 and registered_numbers["decode"] >= 1: + print("Prefill and decode 
servers are both online") + break + time.sleep(5) else: print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") try: @@ -263,9 +224,10 @@ def setup_and_run_server(): print("\n===== Post-test server cleanup... =====") try: + os.killpg(process_router.pid, signal.SIGTERM) os.killpg(process_prefill.pid, signal.SIGTERM) os.killpg(process_decode.pid, signal.SIGTERM) - clean_ports() + clean_ports(PORTS_TO_CLEAN) print(f"Prefill server (pid={process_prefill.pid}) terminated") print(f"Decode server (pid={process_decode.pid}) terminated") except Exception as e: @@ -277,7 +239,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions", f"http://0.0.0.0:{FD_API_PORT + 1}/v1/chat/completions" + return f"http://0.0.0.0:{FD_ROUTER_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -364,15 +326,12 @@ def test_chat_usage_stream(api_url): "stream_options": {"include_usage": True, "continuous_usage_stats": True}, "metadata": {"min_tokens": 10}, } - _, d_url = api_url # Only the decode server receives the request - response = send_request(url=d_url, payload=payload) + response = send_request(url=api_url, payload=payload) chunks = get_stream_chunks(response) result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]]) print("Decode Response:", result) assert result != "", "结果为空" - # for idx, chunk in enumerate(chunks): - # print(f"\nchunk[{idx}]:\n{json.dumps(chunk, indent=2, ensure_ascii=False)}") usage = chunks[-1]["usage"] total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" @@ -395,9 +354,8 @@ def test_chat_usage_non_stream(api_url): "stream": False, "metadata": {"min_tokens": 10}, } - _, d_url = api_url - response = send_request(url=d_url, payload=payload).json() + response = send_request(url=api_url, payload=payload).json() usage = response["usage"] result = response["choices"][0]["message"]["content"] assert result != "", "结果为空" @@ -420,10 +378,9 @@ def test_non_chat_usage_stream(api_url): "stream_options": {"include_usage": True, "continuous_usage_stats": True}, "metadata": {"min_tokens": 10}, } - _, d_url = api_url - d_url = d_url.replace("chat/completions", "completions") + api_url = api_url.replace("chat/completions", "completions") - response = send_request(url=d_url, payload=payload) + response = send_request(url=api_url, payload=payload) chunks = get_stream_chunks(response) result = "".join([x["choices"][0]["text"] for x in chunks[:-1]]) print("Decode Response:", result) @@ -447,10 +404,9 @@ def test_non_chat_usage_non_stream(api_url): "stream": False, "metadata": {"min_tokens": 10}, } - _, d_url = api_url - d_url = d_url.replace("chat/completions", "completions") + api_url = api_url.replace("chat/completions", "completions") - response = send_request(url=d_url, payload=payload).json() + response = send_request(url=api_url, payload=payload).json() usage = response["usage"] result = response["choices"][0]["text"] print("Decode Response:", result) diff --git a/tests/e2e/test_ernie_03b_pd_multi_node.py b/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py similarity index 83% rename from tests/e2e/test_ernie_03b_pd_multi_node.py rename to tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py index b1cc1fd2ac4..2797b80fc08 100644 --- a/tests/e2e/test_ernie_03b_pd_multi_node.py +++ b/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py @@ -12,6 +12,9 @@ # See the License 
for the specific language governing permissions and # limitations under the License. +# Test splitwise deployment which uses splitwise_scheduler, +# and ENABLE_V1_KVCACHE_SCHEDULER is 0 + import json import os import shutil @@ -29,8 +32,6 @@ FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) -FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) -FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [ @@ -38,13 +39,10 @@ FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT, - FD_CONNECTOR_PORT, FD_API_PORT + 1, FD_ENGINE_QUEUE_PORT + 1, FD_METRICS_PORT + 1, FD_CACHE_QUEUE_PORT + 1, - FD_CONNECTOR_PORT + 1, - FD_ROUTER_PORT, ] @@ -60,51 +58,6 @@ def is_port_open(host: str, port: int, timeout=1.0): return False -def check_service_health(base_url: str, timeout: int = 3) -> bool: - """ - Check the health status of a service. - - Args: - base_url (str): The base URL of the service, e.g. "http://127.0.0.1:8080" - timeout (int): Request timeout in seconds. - - Returns: - bool: True if the service is healthy, False otherwise. - """ - if not base_url.startswith("http"): - base_url = f"http://{base_url}" - url = f"{base_url.rstrip('/')}/health" - try: - resp = requests.get(url, timeout=timeout) - if resp.status_code == 200: - return True - else: - return False - except Exception: - return False - - -def get_registered_number(router_url) -> list: - """ - Get the number of registered models in the router. - - Args: - router_url (str): The base URL of the router, e.g. "http://localhost:8080". - - Returns: - int: The number of registered models. - """ - if not router_url.startswith("http"): - router_url = f"http://{router_url}" - - try: - response = requests.get(f"{router_url}/registered_number", timeout=60) - registered_numbers = response.json() - return registered_numbers - except Exception: - return {"mixed": 0, "prefill": 0, "decode": 0} - - def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. 
@@ -174,6 +127,25 @@ def clean_ports(): kill_process_on_port(port) time.sleep(1) +# Read ports from environment variables; use default values if not set +FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) +FD_REDIS_PORT = int(os.getenv("FD_REDIS_PORT", 8533)) + +# List of ports to clean before and after tests +PORTS_TO_CLEAN = [ + FD_API_PORT, + FD_ENGINE_QUEUE_PORT, + FD_METRICS_PORT, + FD_CACHE_QUEUE_PORT, + FD_CONNECTOR_PORT, + FD_API_PORT + 1, + FD_ENGINE_QUEUE_PORT + 1, + FD_METRICS_PORT + 1, + FD_CACHE_QUEUE_PORT + 1, + FD_CONNECTOR_PORT + 1, + FD_REDIS_PORT, +] + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): @@ -185,11 +157,11 @@ def setup_and_run_server(): - Tears down server after all tests finish """ print("Pre-test port cleanup...") - clean_ports() + clean_ports(PORTS_TO_CLEAN) print("log dir clean ") - if os.path.exists("log_router") and os.path.isdir("log_router"): - shutil.rmtree("log_router") + if os.path.exists("log_redis") and os.path.isdir("log_redis"): + shutil.rmtree("log_redis") if os.path.exists("log_prefill") and os.path.isdir("log_prefill"): shutil.rmtree("log_prefill") if os.path.exists("log_decode") and os.path.isdir("log_decode"): @@ -202,28 +174,26 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") - # router - print("start router...") - env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" - router_log_path = "router.log" + # redis-server + print("start redis...") + env_copy = os.environ.copy() + log_path = "router.log" - router_cmd = [ - sys.executable, - "-m", - "fastdeploy.router.launch", + cmd = [ + "redis-server", "--port", - str(FD_ROUTER_PORT), - "--splitwise", + str(FD_REDIS_PORT), + "--daemonize", + "yes", ] - with open(router_log_path, "w") as logfile: - process_router = subprocess.Popen( - router_cmd, + with open(log_path, "w") as logfile: + process_redis = subprocess.Popen( + cmd, stdout=logfile, stderr=subprocess.STDOUT, start_new_session=True, # Enables killing full group via os.killpg - env=env_router, + env=env_copy, ) # prefill实例 @@ -232,7 +202,6 @@ def setup_and_run_server(): env_prefill["CUDA_VISIBLE_DEVICES"] = "0" env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" env_prefill["FD_LOG_DIR"] = "log_prefill" - env_prefill["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT) prefill_log_path = "server.log" prefill_cmd = [ sys.executable, @@ -262,8 +231,12 @@ def setup_and_run_server(): "ipc", "--pd-comm-port", str(FD_CONNECTOR_PORT), - "--router", - f"0.0.0.0:{FD_ROUTER_PORT}", + "--scheduler-name", + "splitwise", + "--scheduler-host", + "127.0.0.1", + "--scheduler-port", + str(FD_REDIS_PORT), ] # Start subprocess in new process group @@ -282,7 +255,6 @@ def setup_and_run_server(): env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0" - env_decode["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1) env_decode["FD_LOG_DIR"] = "log_decode" decode_log_path = "decode_server.log" decode_cmd = [ @@ -313,8 +285,12 @@ def setup_and_run_server(): "ipc", "--pd-comm-port", str(FD_CONNECTOR_PORT + 1), - "--router", - f"0.0.0.0:{FD_ROUTER_PORT}", + "--scheduler-name", + "splitwise", + "--scheduler-host", + "127.0.0.1", + "--scheduler-port", + str(FD_REDIS_PORT), ] # Start subprocess in new process group @@ -329,9 +305,9 @@ def setup_and_run_server(): # Wait up to 300 seconds for API server to be ready for _ in range(60): - registered_numbers = 
get_registered_number(f"0.0.0.0:{FD_ROUTER_PORT}") - if registered_numbers["prefill"] >= 1 and registered_numbers["decode"] >= 1: - print("Prefill and decode servers are both online") + if is_port_open("127.0.0.1", FD_API_PORT) and is_port_open("127.0.0.1", FD_API_PORT + 1): + print(f"Prefill server is up on port {FD_API_PORT}") + print(f"Decode server is up on port {FD_API_PORT + 1}") break time.sleep(5) else: @@ -339,7 +315,7 @@ def setup_and_run_server(): try: os.killpg(process_prefill.pid, signal.SIGTERM) os.killpg(process_decode.pid, signal.SIGTERM) - clean_ports() + clean_ports(PORTS_TO_CLEAN) except Exception as e: print(f"Failed to kill process group: {e}") raise RuntimeError(f"API server did not start on port {FD_API_PORT}") @@ -348,10 +324,10 @@ def setup_and_run_server(): print("\n===== Post-test server cleanup... =====") try: - os.killpg(process_router.pid, signal.SIGTERM) + os.killpg(process_redis.pid, signal.SIGTERM) os.killpg(process_prefill.pid, signal.SIGTERM) os.killpg(process_decode.pid, signal.SIGTERM) - clean_ports() + clean_ports(PORTS_TO_CLEAN) print(f"Prefill server (pid={process_prefill.pid}) terminated") print(f"Decode server (pid={process_decode.pid}) terminated") except Exception as e: @@ -363,7 +339,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return f"http://0.0.0.0:{FD_ROUTER_PORT}/v1/chat/completions" + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions", f"http://0.0.0.0:{FD_API_PORT+1}/v1/chat/completions" @pytest.fixture(scope="session") @@ -450,8 +426,8 @@ def test_chat_usage_stream(api_url): "stream_options": {"include_usage": True, "continuous_usage_stats": True}, "metadata": {"min_tokens": 10}, } - - response = send_request(url=api_url, payload=payload) + p_url, d_url = api_url + response = send_request(url=p_url, payload=payload) chunks = get_stream_chunks(response) result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]]) print("Decode Response:", result) @@ -479,7 +455,8 @@ def test_chat_usage_non_stream(api_url): "metadata": {"min_tokens": 10}, } - response = send_request(url=api_url, payload=payload).json() + p_url, d_url = api_url + response = send_request(url=p_url, payload=payload).json() usage = response["usage"] result = response["choices"][0]["message"]["content"] assert result != "", "结果为空" @@ -502,12 +479,13 @@ def test_non_chat_usage_stream(api_url): "stream_options": {"include_usage": True, "continuous_usage_stats": True}, "metadata": {"min_tokens": 10}, } - api_url = api_url.replace("chat/completions", "completions") + p_url, d_url = api_url + p_url = p_url.replace("chat/completions", "completions") - response = send_request(url=api_url, payload=payload) + response = send_request(url=p_url, payload=payload) chunks = get_stream_chunks(response) result = "".join([x["choices"][0]["text"] for x in chunks[:-1]]) - print("Decode Response:", result) + # print("Decode Response:", result) assert result != "", "结果为空" usage = chunks[-1]["usage"] total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] @@ -528,12 +506,13 @@ def test_non_chat_usage_non_stream(api_url): "stream": False, "metadata": {"min_tokens": 10}, } - api_url = api_url.replace("chat/completions", "completions") + p_url, d_url = api_url + p_url = p_url.replace("chat/completions", "completions") - response = send_request(url=api_url, payload=payload).json() + response = send_request(url=p_url, payload=payload).json() usage = response["usage"] result = response["choices"][0]["text"] - print("Decode 
Response:", result) + # print("Decode Response:", result) assert result != "", "结果为空" total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" diff --git a/tests/e2e/utils/serving_utils.py b/tests/e2e/utils/serving_utils.py index e6dcaf8b31c..7267758529a 100644 --- a/tests/e2e/utils/serving_utils.py +++ b/tests/e2e/utils/serving_utils.py @@ -4,6 +4,8 @@ import subprocess import time +import requests + # Read ports from environment variables; use default values if not set FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) @@ -79,18 +81,66 @@ def kill_process_on_port(port: int): pass -def clean_ports(): +def clean_ports(ports=None): """ - Kill all processes occupying the ports listed in PORTS_TO_CLEAN. + Kill all processes occupying the ports """ - print(f"Cleaning ports: {PORTS_TO_CLEAN}") - for port in PORTS_TO_CLEAN: + if ports is None: + ports = PORTS_TO_CLEAN + + print(f"Cleaning ports: {ports}") + for port in ports: kill_process_on_port(port) # Double check and retry if ports are still in use time.sleep(2) - for port in PORTS_TO_CLEAN: + for port in ports: if is_port_open("127.0.0.1", port, timeout=0.1): print(f"Port {port} still in use, retrying cleanup...") kill_process_on_port(port) time.sleep(1) + + +def check_service_health(base_url: str, timeout: int = 3) -> bool: + """ + Check the health status of a service. + + Args: + base_url (str): The base URL of the service, e.g. "http://127.0.0.1:8080" + timeout (int): Request timeout in seconds. + + Returns: + bool: True if the service is healthy, False otherwise. + """ + if not base_url.startswith("http"): + base_url = f"http://{base_url}" + url = f"{base_url.rstrip('/')}/health" + try: + resp = requests.get(url, timeout=timeout) + if resp.status_code == 200: + return True + else: + return False + except Exception: + return False + + +def get_registered_number(router_url) -> list: + """ + Get the number of registered models in the router. + + Args: + router_url (str): The base URL of the router, e.g. "http://localhost:8080". + + Returns: + int: The number of registered models. 
+ """ + if not router_url.startswith("http"): + router_url = f"http://{router_url}" + + try: + response = requests.get(f"{router_url}/registered_number", timeout=60) + registered_numbers = response.json() + return registered_numbers + except Exception: + return {"mixed": 0, "prefill": 0, "decode": 0} From 469f06800f9bfeb5080e72b9efcd3c455c2fd303 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 10 Nov 2025 03:19:52 +0000 Subject: [PATCH 2/6] up --- fastdeploy/engine/request.py | 9 ++++++--- tests/e2e/utils/serving_utils.py | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 935d83d2b6e..70d82d2e32c 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -75,6 +75,8 @@ def __init__( pooling_params: Optional[PoolingParams] = None, preprocess_start_time: Optional[float] = None, preprocess_end_time: Optional[float] = None, + inference_start_time: float = 0, + llm_engine_recv_req_timestamp: float = 0, multimodal_inputs: Optional[dict] = None, multimodal_data: Optional[dict] = None, disable_chat_template: bool = False, @@ -100,8 +102,6 @@ def __init__( prefill_start_index: int = 0, prefill_end_index: int = 0, num_computed_tokens: int = 0, - inference_start_time: float = 0, - llm_engine_recv_req_timestamp: float = 0, ) -> None: self.request_id = request_id self.prompt = prompt @@ -120,6 +120,10 @@ def __init__( self.arrival_time = arrival_time self.preprocess_start_time = preprocess_start_time self.preprocess_end_time = preprocess_end_time + self.inference_start_time = inference_start_time + self.llm_engine_recv_req_timestamp = ( + llm_engine_recv_req_timestamp if llm_engine_recv_req_timestamp else time.time() + ) self.disable_chat_template = disable_chat_template self.disaggregate_info = disaggregate_info @@ -168,7 +172,6 @@ def __init__( self.extend_block_tables = [] # dp self.dp_rank = dp_rank - self.llm_engine_recv_req_timestamp = time.time() @classmethod def from_dict(cls, d: dict): diff --git a/tests/e2e/utils/serving_utils.py b/tests/e2e/utils/serving_utils.py index 7267758529a..ad2538e4962 100644 --- a/tests/e2e/utils/serving_utils.py +++ b/tests/e2e/utils/serving_utils.py @@ -125,15 +125,15 @@ def check_service_health(base_url: str, timeout: int = 3) -> bool: return False -def get_registered_number(router_url) -> list: +def get_registered_number(router_url) -> dict: """ - Get the number of registered models in the router. + Get the registered model counts by type from the router. Args: router_url (str): The base URL of the router, e.g. "http://localhost:8080". Returns: - int: The number of registered models. + dict: A dictionary containing registered model counts with keys "mixed", "prefill", and "decode". 
""" if not router_url.startswith("http"): router_url = f"http://{router_url}" From 280a0d8c18ca0352a49746bd3ab6a62526c850f6 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 10 Nov 2025 03:59:26 +0000 Subject: [PATCH 3/6] up --- fastdeploy/cache_manager/cache_messager.py | 4 +- .../test_ernie_03b_pd_splitwise_scheduler.py | 103 +----------------- 2 files changed, 8 insertions(+), 99 deletions(-) diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 187b41f0e81..2ff8098a8d1 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -622,7 +622,9 @@ def prefill_layerwise_send_cache_thread(self): target_id = int(task["rdma_ports"][self.rank]) if "error" in task["status"]: continue - logger.debug("rdma, start connect decode") + + # TODO: use is connected to check if the connection is still alive + logger.debug(f"rdma, start connect decode, {target_ip}:{target_id}") status = self.messager[current_transfer_protocol].connect(target_ip, target_id) if status: logger.info(f"connect to {target_ip}:{target_id} success") diff --git a/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py b/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py index 2797b80fc08..b8af9c011cc 100644 --- a/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py +++ b/tests/e2e/test_ernie_03b_pd_splitwise_scheduler.py @@ -19,113 +19,20 @@ import os import shutil import signal -import socket import subprocess import sys import time import pytest import requests - -# Read ports from environment variables; use default values if not set -FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) -FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) -FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) -FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) - -# List of ports to clean before and after tests -PORTS_TO_CLEAN = [ +from utils.serving_utils import ( FD_API_PORT, + FD_CACHE_QUEUE_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, - FD_CACHE_QUEUE_PORT, - FD_API_PORT + 1, - FD_ENGINE_QUEUE_PORT + 1, - FD_METRICS_PORT + 1, - FD_CACHE_QUEUE_PORT + 1, -] - - -def is_port_open(host: str, port: int, timeout=1.0): - """ - Check if a TCP port is open on the given host. - Returns True if connection succeeds, False otherwise. - """ - try: - with socket.create_connection((host, port), timeout): - return True - except Exception: - return False - - -def kill_process_on_port(port: int): - """ - Kill processes that are listening on the given port. - Uses multiple methods to ensure thorough cleanup. 
- """ - current_pid = os.getpid() - parent_pid = os.getppid() - - # Method 1: Use lsof to find processes - try: - output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() - for pid in output.splitlines(): - pid = int(pid) - if pid in (current_pid, parent_pid): - print(f"Skip killing current process (pid={pid}) on port {port}") - continue - try: - # First try SIGTERM for graceful shutdown - os.kill(pid, signal.SIGTERM) - time.sleep(1) - # Then SIGKILL if still running - os.kill(pid, signal.SIGKILL) - print(f"Killed process on port {port}, pid={pid}") - except ProcessLookupError: - pass # Process already terminated - except subprocess.CalledProcessError: - pass - - # Method 2: Use netstat and fuser as backup - try: - # Find processes using netstat and awk - cmd = f"netstat -tulpn 2>/dev/null | grep :{port} | awk '{{print $7}}' | cut -d'/' -f1" - output = subprocess.check_output(cmd, shell=True).decode().strip() - for pid in output.splitlines(): - if pid and pid.isdigit(): - pid = int(pid) - if pid in (current_pid, parent_pid): - continue - try: - os.kill(pid, signal.SIGKILL) - print(f"Killed process (netstat) on port {port}, pid={pid}") - except ProcessLookupError: - pass - except (subprocess.CalledProcessError, FileNotFoundError): - pass - - # Method 3: Use fuser if available - try: - subprocess.run(f"fuser -k {port}/tcp", shell=True, timeout=5) - except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError): - pass - - -def clean_ports(): - """ - Kill all processes occupying the ports listed in PORTS_TO_CLEAN. - """ - print(f"Cleaning ports: {PORTS_TO_CLEAN}") - for port in PORTS_TO_CLEAN: - kill_process_on_port(port) - - # Double check and retry if ports are still in use - time.sleep(2) - for port in PORTS_TO_CLEAN: - if is_port_open("127.0.0.1", port, timeout=0.1): - print(f"Port {port} still in use, retrying cleanup...") - kill_process_on_port(port) - time.sleep(1) + clean_ports, + is_port_open, +) # Read ports from environment variables; use default values if not set FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) From 162eccefd97a41a5959baafdcea7d4ed318ec949 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 10 Nov 2025 06:28:43 +0000 Subject: [PATCH 4/6] up --- fastdeploy/cache_manager/cache_messager.py | 2 +- tests/e2e/test_ernie_03b_pd_router_v0.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 2ff8098a8d1..dc3d64099a8 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -757,7 +757,7 @@ def _handle_connect_task(self): self.engine_worker_queue.connect_task_response_barrier.wait() self.engine_worker_queue.put_connect_rdma_task_response(response) except Exception as e: - logger.error(f"handle_connect_task has exception: {e}") + logger.error(f"handle_connect_task has exception: {e}, {traceback.format_exc()}") def main(): diff --git a/tests/e2e/test_ernie_03b_pd_router_v0.py b/tests/e2e/test_ernie_03b_pd_router_v0.py index 3d4e967da8e..c8da6adbb67 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v0.py +++ b/tests/e2e/test_ernie_03b_pd_router_v0.py @@ -53,6 +53,7 @@ FD_ROUTER_PORT, ] + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ From 02a391f05eb7cc71c06c777775ec93c643066b29 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 10 Nov 2025 06:30:10 +0000 Subject: [PATCH 5/6] add 
test --- tests/e2e/test_ernie_03b_pd_router_v1.py | 428 +++++++++++++++++++++++ tests/e2e/utils/get_rdma_nics.sh | 225 ++++++++++++ 2 files changed, 653 insertions(+) create mode 100644 tests/e2e/test_ernie_03b_pd_router_v1.py create mode 100644 tests/e2e/utils/get_rdma_nics.sh diff --git a/tests/e2e/test_ernie_03b_pd_router_v1.py b/tests/e2e/test_ernie_03b_pd_router_v1.py new file mode 100644 index 00000000000..927a9bae962 --- /dev/null +++ b/tests/e2e/test_ernie_03b_pd_router_v1.py @@ -0,0 +1,428 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test splitwise deployment which uses local_scheduler + router, +# and ENABLE_V1_KVCACHE_SCHEDULER is 1 + +import json +import os +import shutil +import signal +import subprocess +import sys +import time + +import pytest +import requests +from utils.serving_utils import ( + FD_API_PORT, + FD_CACHE_QUEUE_PORT, + FD_ENGINE_QUEUE_PORT, + FD_METRICS_PORT, + clean_ports, + get_registered_number, +) + +# Read ports from environment variables; use default values if not set +FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) +FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533)) +FD_RDMA_PORT = int(os.getenv("FD_RDMA_PORT", 8623)) + +# List of ports to clean before and after tests +PORTS_TO_CLEAN = [ + FD_API_PORT, + FD_ENGINE_QUEUE_PORT, + FD_METRICS_PORT, + FD_CACHE_QUEUE_PORT, + FD_CONNECTOR_PORT, + FD_RDMA_PORT, + FD_API_PORT + 1, + FD_ENGINE_QUEUE_PORT + 1, + FD_METRICS_PORT + 1, + FD_CACHE_QUEUE_PORT + 1, + FD_CONNECTOR_PORT + 1, + FD_RDMA_PORT + 1, + FD_ROUTER_PORT, +] + + +@pytest.fixture(scope="session", autouse=True) +def setup_and_run_server(): + """ + Pytest fixture that runs once per test session: + - Cleans ports before tests + - Starts the API server as a subprocess + - Waits for server port to open (up to 30 seconds) + - Tears down server after all tests finish + """ + print("Pre-test port cleanup...") + clean_ports(PORTS_TO_CLEAN) + + print("log dir clean ") + if os.path.exists("log_router") and os.path.isdir("log_router"): + shutil.rmtree("log_router") + if os.path.exists("log_prefill") and os.path.isdir("log_prefill"): + shutil.rmtree("log_prefill") + if os.path.exists("log_decode") and os.path.isdir("log_decode"): + shutil.rmtree("log_decode") + + base_path = os.getenv("MODEL_PATH") + if base_path: + model_path = os.path.join(base_path, "ERNIE-4.5-0.3B-Paddle") + else: + model_path = "baidu/ERNIE-4.5-0.3B-Paddle" + print(f"model_path: {model_path}") + + # get rdma nics + current_dir = os.path.dirname(os.path.abspath(__file__)) + shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh") + output = subprocess.check_output(["bash", shell_path, "gpu"], text=True) + _, rdma_nics = output.split("=") + print(f"shell_path: {shell_path}, rdma_nics: {rdma_nics}") + + # router + print("start router...") + env_router = os.environ.copy() + env_router["FD_LOG_DIR"] = "log_router" + router_log_path = "router.log" + + router_cmd = [ + sys.executable, + "-m", + 
"fastdeploy.router.launch", + "--port", + str(FD_ROUTER_PORT), + "--splitwise", + ] + + with open(router_log_path, "w") as logfile: + process_router = subprocess.Popen( + router_cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, # Enables killing full group via os.killpg + env=env_router, + ) + + # prefill实例 + print("start prefill...") + env_prefill = os.environ.copy() + env_prefill["CUDA_VISIBLE_DEVICES"] = "0" + env_prefill["FD_LOG_DIR"] = "log_prefill" + env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics + env_prefill["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1" + # env_prefill["KVCACHE_DEBUG"] = "1" + # env_prefill["KV_CACHE_DEBUG_FILE"] = f"{current_dir}/rdma_log_debug" + # env_prefill["KV_CACHE_ERROR_FILE"] = f"{current_dir}/rdma_log_error" + + prefill_log_path = "server.log" + prefill_cmd = [ + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), + "--max-model-len", + "8192", + "--num-gpu-blocks-override", + "2000", + "--splitwise-role", + "prefill", + "--cache-transfer-protocol", + "rdma", + "--rdma-comm-ports", + str(FD_RDMA_PORT), + "--pd-comm-port", + str(FD_CONNECTOR_PORT), + "--router", + f"0.0.0.0:{FD_ROUTER_PORT}", + ] + + # Start subprocess in new process group + with open(prefill_log_path, "w") as logfile: + process_prefill = subprocess.Popen( + prefill_cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, # Enables killing full group via os.killpg + env=env_prefill, + ) + time.sleep(1) + + # decode实例 + print("start decode...") + env_decode = os.environ.copy() + env_decode["CUDA_VISIBLE_DEVICES"] = "1" + env_decode["FD_LOG_DIR"] = "log_decode" + env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics + env_prefill["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1" + decode_log_path = "decode_server.log" + decode_cmd = [ + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT + 1), + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT + 1), + "--metrics-port", + str(FD_METRICS_PORT + 1), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT + 1), + "--max-model-len", + "8192", + "--splitwise-role", + "decode", + "--cache-transfer-protocol", + "rdma", + "--rdma-comm-ports", + str(FD_RDMA_PORT + 1), + "--pd-comm-port", + str(FD_CONNECTOR_PORT + 1), + "--router", + f"0.0.0.0:{FD_ROUTER_PORT}", + ] + + # Start subprocess in new process group + with open(decode_log_path, "w") as logfile: + process_decode = subprocess.Popen( + decode_cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, # Enables killing full group via os.killpg + env=env_decode, + ) + + # Wait up to 300 seconds for API server to be ready + for _ in range(60): + registered_numbers = get_registered_number(f"0.0.0.0:{FD_ROUTER_PORT}") + if registered_numbers["prefill"] >= 1 and registered_numbers["decode"] >= 1: + print("Prefill and decode servers are both online") + break + time.sleep(5) + else: + print("[TIMEOUT] API server failed to start in 5 minutes. 
Cleaning up...") + try: + os.killpg(process_prefill.pid, signal.SIGTERM) + os.killpg(process_decode.pid, signal.SIGTERM) + clean_ports(PORTS_TO_CLEAN) + except Exception as e: + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") + + yield # Run tests + + print("\n===== Post-test server cleanup... =====") + try: + os.killpg(process_router.pid, signal.SIGTERM) + os.killpg(process_prefill.pid, signal.SIGTERM) + os.killpg(process_decode.pid, signal.SIGTERM) + clean_ports(PORTS_TO_CLEAN) + print(f"Prefill server (pid={process_prefill.pid}) terminated") + print(f"Decode server (pid={process_decode.pid}) terminated") + except Exception as e: + print(f"Failed to terminate API server: {e}") + + +@pytest.fixture(scope="session") +def api_url(request): + """ + Returns the API endpoint URL for chat completions. + """ + return f"http://0.0.0.0:{FD_ROUTER_PORT}/v1/chat/completions" + + +@pytest.fixture(scope="session") +def metrics_url(request): + """ + Returns the metrics endpoint URL. + """ + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" + + +@pytest.fixture +def headers(): + """ + Returns common HTTP request headers. + """ + return {"Content-Type": "application/json"} + + +def test_metrics_config(metrics_url): + timeout = 600 + url = metrics_url.replace("metrics", "config-info") + res = requests.get(url, timeout=timeout) + assert res.status_code == 200 + + +def send_request(url, payload, timeout=600): + """ + 发送请求到指定的URL,并返回响应结果。 + """ + headers = { + "Content-Type": "application/json", + } + + try: + res = requests.post(url, headers=headers, json=payload, timeout=timeout) + print("🟢 接收响应中...\n") + return res + except requests.exceptions.Timeout: + print(f"❌ 请求超时(超过 {timeout} 秒)") + return None + except requests.exceptions.RequestException as e: + print(f"❌ 请求失败:{e}") + return None + + +def get_stream_chunks(response): + """解析流式返回,生成chunk List[dict]""" + chunks = [] + + if response.status_code == 200: + for line in response.iter_lines(decode_unicode=True): + if line: + if line.startswith("data: "): + line = line[len("data: ") :] + + if line.strip() == "[DONE]": + break + + try: + chunk = json.loads(line) + chunks.append(chunk) + except Exception as e: + print(f"解析失败: {e}, 行内容: {line}") + else: + print(f"请求失败,状态码: {response.status_code}") + print("返回内容:", response.text) + + return chunks + + +def test_chat_usage_stream(api_url): + """测试流式chat usage""" + payload = { + "model": "default", + "temperature": 0, + "top_p": 0, + "seed": 33, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "牛顿的三大运动定律是什么?"}, + ], + "max_tokens": 50, + "stream": True, + "stream_options": {"include_usage": True, "continuous_usage_stats": True}, + "metadata": {"min_tokens": 10}, + } + + response = send_request(url=api_url, payload=payload) + chunks = get_stream_chunks(response) + result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]]) + print("Decode Response:", result) + assert result != "", "结果为空" + usage = chunks[-1]["usage"] + total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] + assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" + assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" + assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" + + +def test_chat_usage_non_stream(api_url): + """测试非流式chat usage""" + payload = { + 
"model": "default", + "temperature": 0, + "top_p": 0, + "seed": 33, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "牛顿的三大运动定律是什么?"}, + ], + "max_tokens": 50, + "stream": False, + "metadata": {"min_tokens": 10}, + } + + response = send_request(url=api_url, payload=payload).json() + usage = response["usage"] + result = response["choices"][0]["message"]["content"] + assert result != "", "结果为空" + total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] + assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" + assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" + assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" + + +def test_non_chat_usage_stream(api_url): + """测试流式非chat usage""" + payload = { + "model": "default", + "temperature": 0, + "top_p": 0, + "seed": 33, + "prompt": "牛顿的三大运动定律是什么?", + "max_tokens": 50, + "stream": True, + "stream_options": {"include_usage": True, "continuous_usage_stats": True}, + "metadata": {"min_tokens": 10}, + } + api_url = api_url.replace("chat/completions", "completions") + + response = send_request(url=api_url, payload=payload) + chunks = get_stream_chunks(response) + result = "".join([x["choices"][0]["text"] for x in chunks[:-1]]) + print("Decode Response:", result) + assert result != "", "结果为空" + usage = chunks[-1]["usage"] + total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] + assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" + assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" + assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" + + +def test_non_chat_usage_non_stream(api_url): + """测试非流式非chat usage""" + payload = { + "model": "default", + "temperature": 0, + "top_p": 0, + "seed": 33, + "prompt": "牛顿的三大运动定律是什么?", + "max_tokens": 50, + "stream": False, + "metadata": {"min_tokens": 10}, + } + api_url = api_url.replace("chat/completions", "completions") + + response = send_request(url=api_url, payload=payload).json() + usage = response["usage"] + result = response["choices"][0]["text"] + print("Decode Response:", result) + assert result != "", "结果为空" + total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] + assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" + assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" + assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" diff --git a/tests/e2e/utils/get_rdma_nics.sh b/tests/e2e/utils/get_rdma_nics.sh new file mode 100644 index 00000000000..4fc07a98c9a --- /dev/null +++ b/tests/e2e/utils/get_rdma_nics.sh @@ -0,0 +1,225 @@ +#!/bin/bash +Cur_Dir=$(cd `dirname $0`; pwd) +NICNAME_TYPE=xgbe # 默认检测类型 +type=$1 + +if [ "$ENABLE_EP_DP" == "1" ]; then + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt" +else + gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt" +fi + +function __NEW_GPU_ROOTPORT_FILE__() { + touch ${gpu_root_port_filename} 2>/dev/null + echo "" > ${gpu_root_port_filename} 2>/dev/null + for gpu_bus in $(lspci 2>/dev/null | grep -iE "Communication controller: | controller: NVIDIA" | awk '{print $1}') + do + readlink "/sys/bus/pci/devices/0000:${gpu_bus}" 2>/dev/null | awk -F [/] '{print $6}' 
>> ${gpu_root_port_filename} + done +} + +function __RM_GPU_ROOTPORT_FILE__() { + rm -rf ${gpu_root_port_filename} 2>/dev/null +} + +function __JUDGE_NIC_TYPE__() { + XGBE_NUM=$(ip a 2>/dev/null | grep -c ": ${NICNAME_TYPE}") + gpu_first=true + xpu_first=true + cpu_first=true + + for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) + do + [ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue + + PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') + [ -z "$PCI_ADDRESS" ] && continue + NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') + + NIC_TYPE="CPU_NIC" + grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && NIC_TYPE="GPU_NIC" + + if [[ "$type" == "gpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then + if $gpu_first; then + printf "KVCACHE_RDMA_NICS=%s" "$ibdev" + gpu_first=false + else + printf ",%s" "$ibdev" + fi + fi + fi + + if [[ "$type" == "xpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then + if $xpu_first; then + printf "KVCACHE_RDMA_NICS=%s,%s" "$ibdev" "$ibdev" + xpu_first=false + else + printf ",%s,%s" "$ibdev" "$ibdev" + fi + fi + fi + + if [[ "$type" == "cpu" ]]; then + for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) + do + [ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue + + PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') + [ -z "$PCI_ADDRESS" ] && continue + + NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') + grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && continue + + if ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ + ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet"; then + printf "KV_CACHE_SOCKET_IFNAME=%s\n" "${NICNAME_TYPE}${xgbe_no}" + return 0 + fi + done + echo "ERROR: No active CPU NIC with IP found!" >&2 + return 1 + fi + + if [[ "$type" == "cpu_ib" && "$NIC_TYPE" == "CPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ + ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet "; then + if $cpu_ib_first; then + printf "KVCACHE_RDMA_NICS=%s" "$ibdev" + cpu_ib_first=false + else + printf ",%s" "$ibdev" + fi + fi + fi + + done + + case "$type" in + gpu) ! $gpu_first && printf "\n" ;; + xpu) ! $xpu_first && printf "\n" ;; + cpu) ! $cpu_first && printf "\n" ;; + cpu_ib) ! 
$cpu_ib_first && printf "\n" ;;
+    esac
+}
+
+function get_vxpu_nics() {
+    local topo_output=$(xpu-smi topo -m)
+    local xpu_info=$(echo "$topo_output" | grep -E '^XPU[0-9]+')
+
+    local nic_mapping=()
+    while IFS= read -r line; do
+        if [[ $line =~ NIC([0-9]+):\ +(mlx[0-9_]+) ]]; then
+            local nic_idx=${BASH_REMATCH[1]}
+            local nic_name=${BASH_REMATCH[2]}
+            nic_mapping[$nic_idx]=$nic_name
+        fi
+    done < <(echo "$topo_output" | grep -E '^\s*NIC[0-9]+:')
+
+    local nic_count=${#nic_mapping[@]}
+
+    declare -A priority_map=([PIX]=2 [NODE]=1 [SYS]=0)
+    local optimal_nics=()
+
+    while IFS= read -r line; do
+        local fields=($line)
+        local nic_start_index=5
+        local max_nics=$(( ${#fields[@]} - nic_start_index ))
+        local actual_nic_count=$(( max_nics < nic_count ? max_nics : nic_count ))
+
+        local best_priority=-1
+        local best_nic=""
+
+        for ((nic_idx=0; nic_idx < actual_nic_count; nic_idx++ )); do
+            local topo_value=${fields[$((nic_start_index + nic_idx))]}
+            local current_priority=${priority_map[$topo_value]:-0}
+            if (( current_priority > best_priority )); then
+                best_priority=$current_priority
+                best_nic="${nic_mapping[$nic_idx]}"
+            fi
+        done
+
+        if [[ -n "$best_nic" ]]; then
+            optimal_nics+=("$best_nic")
+        fi
+    done <<< "$xpu_info"
+
+    local IFS=,
+    export KVCACHE_RDMA_NICS="${optimal_nics[*]}"
+    echo "KVCACHE_RDMA_NICS=${optimal_nics[*]}"
+}
+
+function get_vcpu_nics() {
+    ip -o addr show | awk '$3 == "inet" && $4 ~ /^10\./ {print "KV_CACHE_SOCKET_IFNAME="$2; exit}'
+}
+
+function __main__() {
+    if [[ "$type" == "vxpu" ]]; then
+        get_vxpu_nics
+        return 0
+    fi
+    if [[ "$type" == "vcpu" ]]; then
+        get_vcpu_nics
+        return 0
+    fi
+
+    # 处理 bond 情况
+    if [[ "$type" == "cpu" ]]; then
+        for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do
+            bond_if=$(basename "$bond")
+            if ip link show "$bond_if" | grep -q "state UP" && \
+               ip a show "$bond_if" | grep -q "inet "; then
+                printf "KV_CACHE_SOCKET_IFNAME=%s\n" "$bond_if"
+                return 0
+            fi
+        done
+    fi
+
+    if [[ "$type" == "cpu_ib" ]]; then
+        first=true
+        for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do
+            bond_if=$(basename "$bond")
+            __NEW_GPU_ROOTPORT_FILE__
+
+            ibdev=$(ibdev2netdev 2>/dev/null | grep -w "$bond_if" | awk '{print $1}')
+            if [ -n "$ibdev" ] && ip link show "$bond_if" | grep -q "state UP" && \
+               ip a show "$bond_if" | grep -q "inet "; then
+                if $first; then
+                    printf "KVCACHE_RDMA_NICS=%s" "$ibdev"
+                    first=false
+                else
+                    printf ",%s" "$ibdev"
+                fi
+            fi
+
+            bondib=$(show_gids 2>/dev/null | grep -w "$bond_if" | awk '{print $1}' | grep "mlx.*bond" | head -1)
+            if [ -n "$bondib" ] && ip link show "$bond_if" | grep -q "state UP" && \
+               ip a show "$bond_if" | grep -q "inet " && $first; then
+                printf "KVCACHE_RDMA_NICS=%s" "$bondib"
+                first=false
+            fi
+
+            __RM_GPU_ROOTPORT_FILE__
+        done
+
+        ! $first && printf "\n"
+        [ ! 
$first ] && return 0 + fi + + local nic_types=("eth" "ib" "xgbe") + for nt in "${nic_types[@]}"; do + if ip a | grep -iq "$nt"; then + __NEW_GPU_ROOTPORT_FILE__ + NICNAME_TYPE=$nt + __JUDGE_NIC_TYPE__ + __RM_GPU_ROOTPORT_FILE__ + fi + done +} + +__main__ From b93a7fe765d4a1e6995c7c47d74e9ab7a2f35d30 Mon Sep 17 00:00:00 2001 From: juncaipeng <13006307475@163.com> Date: Mon, 10 Nov 2025 09:19:45 +0000 Subject: [PATCH 6/6] up --- examples/splitwise/start_mixed.sh | 35 +- examples/splitwise/start_v0_tp1.sh | 45 +-- examples/splitwise/start_v1_tp1.sh | 42 +-- tests/e2e/test_ernie_03b_pd_router_v1.py | 428 ----------------------- tests/e2e/utils/get_rdma_nics.sh | 225 ------------ 5 files changed, 65 insertions(+), 710 deletions(-) delete mode 100644 tests/e2e/test_ernie_03b_pd_router_v1.py delete mode 100644 tests/e2e/utils/get_rdma_nics.sh diff --git a/examples/splitwise/start_mixed.sh b/examples/splitwise/start_mixed.sh index 750c2a45e55..c36027ac26a 100644 --- a/examples/splitwise/start_mixed.sh +++ b/examples/splitwise/start_mixed.sh @@ -26,13 +26,16 @@ export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 unset http_proxy && unset https_proxy rm -rf log_* +S1_PORT=52400 +S2_PORT=52500 +ROUTER_PORT=52600 + # start router export FD_LOG_DIR="log_router" mkdir -p ${FD_LOG_DIR} -router_port=9000 nohup python -m fastdeploy.router.launch \ - --port ${router_port} \ + --port ${ROUTER_PORT} \ 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 @@ -43,16 +46,16 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8100 \ - --metrics-port 8101 \ - --engine-worker-queue-port 8102 \ - --cache-queue-port 8103 \ + --port ${S1_PORT} \ + --metrics-port $((S1_PORT + 1)) \ + --engine-worker-queue-port $((S1_PORT + 2)) \ + --cache-queue-port $((S1_PORT + 3)) \ --max-model-len 32768 \ - --router "0.0.0.0:${router_port}" \ + --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 -# wait_for_health 8100 +wait_for_health ${S1_PORT} # start modelserver 1 export CUDA_VISIBLE_DEVICES=1 @@ -61,21 +64,19 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8200 \ - --metrics-port 8201 \ - --engine-worker-queue-port 8202 \ - --cache-queue-port 8203 \ + --port ${S2_PORT} \ + --metrics-port $((S2_PORT + 1)) \ + --engine-worker-queue-port $((S2_PORT + 2)) \ + --cache-queue-port $((S2_PORT + 3)) \ --max-model-len 32768 \ - --router "0.0.0.0:${router_port}" \ + --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 8200 - +wait_for_health ${S2_PORT} # send request sleep 10 # make sure server is registered to router -port=9000 -curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \ -H "Content-Type: application/json" \ -d '{ "messages": [ diff --git a/examples/splitwise/start_v0_tp1.sh b/examples/splitwise/start_v0_tp1.sh index c4b94a9b1fa..42f585a5a71 100644 --- a/examples/splitwise/start_v0_tp1.sh +++ b/examples/splitwise/start_v0_tp1.sh @@ -38,10 +38,14 @@ fi unset http_proxy && unset https_proxy rm -rf log_* +P_PORT=52400 +D_PORT=52500 +REDIS_PORT=56388 + # start redis -if ! redis-cli ping &>/dev/null; then +if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then echo "Redis is not running. Starting redis-server..." - redis-server --daemonize yes + redis-server --daemonize yes --port ${REDIS_PORT} sleep 1 else echo "Redis is already running." 
@@ -55,22 +59,23 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8100 \ - --metrics-port 8101 \ - --engine-worker-queue-port 8102 \ - --cache-queue-port 8103 \ + --port ${P_PORT} \ + --metrics-port $((P_PORT + 1)) \ + --engine-worker-queue-port $((P_PORT + 2)) \ + --cache-queue-port $((P_PORT + 3)) \ --max-model-len 32768 \ --num-gpu-blocks-override 1000 \ --splitwise-role "prefill" \ --cache-transfer-protocol "rdma" \ - --rdma-comm-ports 8104 \ - --pd-comm-port 8105 \ + --rdma-comm-ports $((P_PORT + 4)) \ + --pd-comm-port $((P_PORT + 5)) \ --scheduler-name "splitwise" \ --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ + --scheduler-port ${REDIS_PORT} \ --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & -# wait_for_health 8100 + +wait_for_health ${P_PORT} # start decode export CUDA_VISIBLE_DEVICES=1 @@ -79,27 +84,27 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 9000 \ - --metrics-port 9001 \ - --engine-worker-queue-port 9002 \ - --cache-queue-port 9003 \ + --port ${D_PORT} \ + --metrics-port $((D_PORT + 1)) \ + --engine-worker-queue-port $((D_PORT + 2)) \ + --cache-queue-port $((D_PORT + 3)) \ --max-model-len 32768 \ --splitwise-role "decode" \ --cache-transfer-protocol "rdma" \ - --rdma-comm-ports 9004 \ - --pd-comm-port 9005 \ + --rdma-comm-ports $((D_PORT + 4)) \ + --pd-comm-port $((D_PORT + 5)) \ --scheduler-name "splitwise" \ --scheduler-host "127.0.0.1" \ - --scheduler-port 6379 \ + --scheduler-port ${REDIS_PORT} \ --scheduler-ttl 9000 \ 2>&1 >${FD_LOG_DIR}/nohup & -wait_for_health 9000 + +wait_for_health ${D_PORT} # send request sleep 10 # make sure server is registered to router -port=9000 -curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \ +curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \ -H "Content-Type: application/json" \ -d '{ "messages": [ diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh index 523ce15b010..31eca8ab77f 100644 --- a/examples/splitwise/start_v1_tp1.sh +++ b/examples/splitwise/start_v1_tp1.sh @@ -38,13 +38,16 @@ fi unset http_proxy && unset https_proxy rm -rf log_* +P_PORT=52400 +D_PORT=52500 +ROUTER_PORT=52600 + # start router export FD_LOG_DIR="log_router" mkdir -p ${FD_LOG_DIR} -router_port=9000 nohup python -m fastdeploy.router.launch \ - --port ${router_port} \ + --port ${ROUTER_PORT} \ --splitwise \ 2>&1 >${FD_LOG_DIR}/nohup & sleep 1 @@ -56,20 +59,20 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8100 \ - --metrics-port 8101 \ - --engine-worker-queue-port 8102 \ - --cache-queue-port 8103 \ + --port "${P_PORT}" \ + --metrics-port "$((P_PORT + 1))" \ + --engine-worker-queue-port "$((P_PORT + 2))" \ + --cache-queue-port "$((P_PORT + 3))" \ --max-model-len 32768 \ --splitwise-role "prefill" \ --cache-transfer-protocol "rdma" \ - --rdma-comm-ports 8104 \ - --pd-comm-port 8105 \ + --rdma-comm-ports "$((P_PORT + 4))" \ + --pd-comm-port "$((P_PORT + 5))" \ --num-gpu-blocks-override 2000 \ - --router "0.0.0.0:${router_port}" \ + --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & -# wait_for_health 8100 +wait_for_health ${P_PORT} # start decode export CUDA_VISIBLE_DEVICES=1 @@ -78,24 +81,23 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ - --port 8200 \ - --metrics-port 8201 \ - --engine-worker-queue-port 8202 \ - 
--cache-queue-port 8203 \
+    --port "${D_PORT}" \
+    --metrics-port "$((D_PORT + 1))" \
+    --engine-worker-queue-port "$((D_PORT + 2))" \
+    --cache-queue-port "$((D_PORT + 3))" \
     --max-model-len 32768 \
     --splitwise-role "decode" \
     --cache-transfer-protocol "rdma" \
-    --rdma-comm-ports 8204 \
-    --pd-comm-port 8205 \
-    --router "0.0.0.0:${router_port}" \
+    --rdma-comm-ports "$((D_PORT + 4))" \
+    --pd-comm-port "$((D_PORT + 5))" \
+    --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &
 
-wait_for_health 8200
+wait_for_health ${D_PORT}
 
 # send request
 sleep 10 # make sure server is registered to router
-port=9000
-curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \
+curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
     "messages": [
diff --git a/tests/e2e/test_ernie_03b_pd_router_v1.py b/tests/e2e/test_ernie_03b_pd_router_v1.py
deleted file mode 100644
index 927a9bae962..00000000000
--- a/tests/e2e/test_ernie_03b_pd_router_v1.py
+++ /dev/null
@@ -1,428 +0,0 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Test splitwise deployment which uses local_scheduler + router,
-# and ENABLE_V1_KVCACHE_SCHEDULER is 1
-
-import json
-import os
-import shutil
-import signal
-import subprocess
-import sys
-import time
-
-import pytest
-import requests
-from utils.serving_utils import (
-    FD_API_PORT,
-    FD_CACHE_QUEUE_PORT,
-    FD_ENGINE_QUEUE_PORT,
-    FD_METRICS_PORT,
-    clean_ports,
-    get_registered_number,
-)
-
-# Read ports from environment variables; use default values if not set
-FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433))
-FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533))
-FD_RDMA_PORT = int(os.getenv("FD_RDMA_PORT", 8623))
-
-# List of ports to clean before and after tests
-PORTS_TO_CLEAN = [
-    FD_API_PORT,
-    FD_ENGINE_QUEUE_PORT,
-    FD_METRICS_PORT,
-    FD_CACHE_QUEUE_PORT,
-    FD_CONNECTOR_PORT,
-    FD_RDMA_PORT,
-    FD_API_PORT + 1,
-    FD_ENGINE_QUEUE_PORT + 1,
-    FD_METRICS_PORT + 1,
-    FD_CACHE_QUEUE_PORT + 1,
-    FD_CONNECTOR_PORT + 1,
-    FD_RDMA_PORT + 1,
-    FD_ROUTER_PORT,
-]
-
-
-@pytest.fixture(scope="session", autouse=True)
-def setup_and_run_server():
-    """
-    Pytest fixture that runs once per test session:
-    - Cleans ports before tests
-    - Starts the API server as a subprocess
-    - Waits for server port to open (up to 30 seconds)
-    - Tears down server after all tests finish
-    """
-    print("Pre-test port cleanup...")
-    clean_ports(PORTS_TO_CLEAN)
-
-    print("log dir clean ")
-    if os.path.exists("log_router") and os.path.isdir("log_router"):
-        shutil.rmtree("log_router")
-    if os.path.exists("log_prefill") and os.path.isdir("log_prefill"):
-        shutil.rmtree("log_prefill")
-    if os.path.exists("log_decode") and os.path.isdir("log_decode"):
-        shutil.rmtree("log_decode")
-
-    base_path = os.getenv("MODEL_PATH")
-    if base_path:
-        model_path = os.path.join(base_path, "ERNIE-4.5-0.3B-Paddle")
-    else:
-        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
-    
print(f"model_path: {model_path}") - - # get rdma nics - current_dir = os.path.dirname(os.path.abspath(__file__)) - shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh") - output = subprocess.check_output(["bash", shell_path, "gpu"], text=True) - _, rdma_nics = output.split("=") - print(f"shell_path: {shell_path}, rdma_nics: {rdma_nics}") - - # router - print("start router...") - env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" - router_log_path = "router.log" - - router_cmd = [ - sys.executable, - "-m", - "fastdeploy.router.launch", - "--port", - str(FD_ROUTER_PORT), - "--splitwise", - ] - - with open(router_log_path, "w") as logfile: - process_router = subprocess.Popen( - router_cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_router, - ) - - # prefill实例 - print("start prefill...") - env_prefill = os.environ.copy() - env_prefill["CUDA_VISIBLE_DEVICES"] = "0" - env_prefill["FD_LOG_DIR"] = "log_prefill" - env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics - env_prefill["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1" - # env_prefill["KVCACHE_DEBUG"] = "1" - # env_prefill["KV_CACHE_DEBUG_FILE"] = f"{current_dir}/rdma_log_debug" - # env_prefill["KV_CACHE_ERROR_FILE"] = f"{current_dir}/rdma_log_error" - - prefill_log_path = "server.log" - prefill_cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT), - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", - str(FD_METRICS_PORT), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT), - "--max-model-len", - "8192", - "--num-gpu-blocks-override", - "2000", - "--splitwise-role", - "prefill", - "--cache-transfer-protocol", - "rdma", - "--rdma-comm-ports", - str(FD_RDMA_PORT), - "--pd-comm-port", - str(FD_CONNECTOR_PORT), - "--router", - f"0.0.0.0:{FD_ROUTER_PORT}", - ] - - # Start subprocess in new process group - with open(prefill_log_path, "w") as logfile: - process_prefill = subprocess.Popen( - prefill_cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_prefill, - ) - time.sleep(1) - - # decode实例 - print("start decode...") - env_decode = os.environ.copy() - env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_decode["FD_LOG_DIR"] = "log_decode" - env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics - env_prefill["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1" - decode_log_path = "decode_server.log" - decode_cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT + 1), - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT + 1), - "--metrics-port", - str(FD_METRICS_PORT + 1), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT + 1), - "--max-model-len", - "8192", - "--splitwise-role", - "decode", - "--cache-transfer-protocol", - "rdma", - "--rdma-comm-ports", - str(FD_RDMA_PORT + 1), - "--pd-comm-port", - str(FD_CONNECTOR_PORT + 1), - "--router", - f"0.0.0.0:{FD_ROUTER_PORT}", - ] - - # Start subprocess in new process group - with open(decode_log_path, "w") as logfile: - process_decode = subprocess.Popen( - decode_cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - env=env_decode, - ) - - # Wait up to 300 seconds for API server to be ready - for _ in range(60): - registered_numbers = get_registered_number(f"0.0.0.0:{FD_ROUTER_PORT}") - if 
registered_numbers["prefill"] >= 1 and registered_numbers["decode"] >= 1: - print("Prefill and decode servers are both online") - break - time.sleep(5) - else: - print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") - try: - os.killpg(process_prefill.pid, signal.SIGTERM) - os.killpg(process_decode.pid, signal.SIGTERM) - clean_ports(PORTS_TO_CLEAN) - except Exception as e: - print(f"Failed to kill process group: {e}") - raise RuntimeError(f"API server did not start on port {FD_API_PORT}") - - yield # Run tests - - print("\n===== Post-test server cleanup... =====") - try: - os.killpg(process_router.pid, signal.SIGTERM) - os.killpg(process_prefill.pid, signal.SIGTERM) - os.killpg(process_decode.pid, signal.SIGTERM) - clean_ports(PORTS_TO_CLEAN) - print(f"Prefill server (pid={process_prefill.pid}) terminated") - print(f"Decode server (pid={process_decode.pid}) terminated") - except Exception as e: - print(f"Failed to terminate API server: {e}") - - -@pytest.fixture(scope="session") -def api_url(request): - """ - Returns the API endpoint URL for chat completions. - """ - return f"http://0.0.0.0:{FD_ROUTER_PORT}/v1/chat/completions" - - -@pytest.fixture(scope="session") -def metrics_url(request): - """ - Returns the metrics endpoint URL. - """ - return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" - - -@pytest.fixture -def headers(): - """ - Returns common HTTP request headers. - """ - return {"Content-Type": "application/json"} - - -def test_metrics_config(metrics_url): - timeout = 600 - url = metrics_url.replace("metrics", "config-info") - res = requests.get(url, timeout=timeout) - assert res.status_code == 200 - - -def send_request(url, payload, timeout=600): - """ - 发送请求到指定的URL,并返回响应结果。 - """ - headers = { - "Content-Type": "application/json", - } - - try: - res = requests.post(url, headers=headers, json=payload, timeout=timeout) - print("🟢 接收响应中...\n") - return res - except requests.exceptions.Timeout: - print(f"❌ 请求超时(超过 {timeout} 秒)") - return None - except requests.exceptions.RequestException as e: - print(f"❌ 请求失败:{e}") - return None - - -def get_stream_chunks(response): - """解析流式返回,生成chunk List[dict]""" - chunks = [] - - if response.status_code == 200: - for line in response.iter_lines(decode_unicode=True): - if line: - if line.startswith("data: "): - line = line[len("data: ") :] - - if line.strip() == "[DONE]": - break - - try: - chunk = json.loads(line) - chunks.append(chunk) - except Exception as e: - print(f"解析失败: {e}, 行内容: {line}") - else: - print(f"请求失败,状态码: {response.status_code}") - print("返回内容:", response.text) - - return chunks - - -def test_chat_usage_stream(api_url): - """测试流式chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "牛顿的三大运动定律是什么?"}, - ], - "max_tokens": 50, - "stream": True, - "stream_options": {"include_usage": True, "continuous_usage_stats": True}, - "metadata": {"min_tokens": 10}, - } - - response = send_request(url=api_url, payload=payload) - chunks = get_stream_chunks(response) - result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]]) - print("Decode Response:", result) - assert result != "", "结果为空" - usage = chunks[-1]["usage"] - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], 
"completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_chat_usage_non_stream(api_url): - """测试非流式chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "牛顿的三大运动定律是什么?"}, - ], - "max_tokens": 50, - "stream": False, - "metadata": {"min_tokens": 10}, - } - - response = send_request(url=api_url, payload=payload).json() - usage = response["usage"] - result = response["choices"][0]["message"]["content"] - assert result != "", "结果为空" - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_non_chat_usage_stream(api_url): - """测试流式非chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "prompt": "牛顿的三大运动定律是什么?", - "max_tokens": 50, - "stream": True, - "stream_options": {"include_usage": True, "continuous_usage_stats": True}, - "metadata": {"min_tokens": 10}, - } - api_url = api_url.replace("chat/completions", "completions") - - response = send_request(url=api_url, payload=payload) - chunks = get_stream_chunks(response) - result = "".join([x["choices"][0]["text"] for x in chunks[:-1]]) - print("Decode Response:", result) - assert result != "", "结果为空" - usage = chunks[-1]["usage"] - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" - - -def test_non_chat_usage_non_stream(api_url): - """测试非流式非chat usage""" - payload = { - "model": "default", - "temperature": 0, - "top_p": 0, - "seed": 33, - "prompt": "牛顿的三大运动定律是什么?", - "max_tokens": 50, - "stream": False, - "metadata": {"min_tokens": 10}, - } - api_url = api_url.replace("chat/completions", "completions") - - response = send_request(url=api_url, payload=payload).json() - usage = response["usage"] - result = response["choices"][0]["text"] - print("Decode Response:", result) - assert result != "", "结果为空" - total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] - assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" - assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" diff --git a/tests/e2e/utils/get_rdma_nics.sh b/tests/e2e/utils/get_rdma_nics.sh deleted file mode 100644 index 4fc07a98c9a..00000000000 --- a/tests/e2e/utils/get_rdma_nics.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash -Cur_Dir=$(cd `dirname $0`; pwd) -NICNAME_TYPE=xgbe # 默认检测类型 -type=$1 - -if [ "$ENABLE_EP_DP" == "1" ]; then - gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt" -else - gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt" -fi - -function __NEW_GPU_ROOTPORT_FILE__() { - touch ${gpu_root_port_filename} 2>/dev/null - echo "" > ${gpu_root_port_filename} 
2>/dev/null - for gpu_bus in $(lspci 2>/dev/null | grep -iE "Communication controller: | controller: NVIDIA" | awk '{print $1}') - do - readlink "/sys/bus/pci/devices/0000:${gpu_bus}" 2>/dev/null | awk -F [/] '{print $6}' >> ${gpu_root_port_filename} - done -} - -function __RM_GPU_ROOTPORT_FILE__() { - rm -rf ${gpu_root_port_filename} 2>/dev/null -} - -function __JUDGE_NIC_TYPE__() { - XGBE_NUM=$(ip a 2>/dev/null | grep -c ": ${NICNAME_TYPE}") - gpu_first=true - xpu_first=true - cpu_first=true - - for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) - do - [ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue - - PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') - [ -z "$PCI_ADDRESS" ] && continue - NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') - - NIC_TYPE="CPU_NIC" - grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && NIC_TYPE="GPU_NIC" - - if [[ "$type" == "gpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then - ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') - if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then - if $gpu_first; then - printf "KVCACHE_RDMA_NICS=%s" "$ibdev" - gpu_first=false - else - printf ",%s" "$ibdev" - fi - fi - fi - - if [[ "$type" == "xpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then - ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') - if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then - if $xpu_first; then - printf "KVCACHE_RDMA_NICS=%s,%s" "$ibdev" "$ibdev" - xpu_first=false - else - printf ",%s,%s" "$ibdev" "$ibdev" - fi - fi - fi - - if [[ "$type" == "cpu" ]]; then - for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) - do - [ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue - - PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') - [ -z "$PCI_ADDRESS" ] && continue - - NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') - grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && continue - - if ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ - ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet"; then - printf "KV_CACHE_SOCKET_IFNAME=%s\n" "${NICNAME_TYPE}${xgbe_no}" - return 0 - fi - done - echo "ERROR: No active CPU NIC with IP found!" >&2 - return 1 - fi - - if [[ "$type" == "cpu_ib" && "$NIC_TYPE" == "CPU_NIC" ]]; then - ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') - if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ - ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet "; then - if $cpu_ib_first; then - printf "KVCACHE_RDMA_NICS=%s" "$ibdev" - cpu_ib_first=false - else - printf ",%s" "$ibdev" - fi - fi - fi - - done - - case "$type" in - gpu) ! $gpu_first && printf "\n" ;; - xpu) ! $xpu_first && printf "\n" ;; - cpu) ! $cpu_first && printf "\n" ;; - cpu_ib) ! 
$cpu_ib_first && printf "\n" ;; - esac -} - -function get_vxpu_nics() { - local topo_output=$(xpu-smi topo -m) - local xpu_info=$(echo "$topo_output" | grep -E '^XPU[0-9]+') - - local nic_mapping=() - while IFS= read -r line; do - if [[ $line =~ NIC([0-9]+):\ +(mlx[0-9_]+) ]]; then - local nic_idx=${BASH_REMATCH[1]} - local nic_name=${BASH_REMATCH[2]} - nic_mapping[$nic_idx]=$nic_name - fi - done < <(echo "$topo_output" | grep -E '^\s*NIC[0-9]+:') - - local nic_count=${#nic_mapping[@]} - - declare -A priority_map=([PIX]=2 [NODE]=1 [SYS]=0) - local optimal_nics=() - - while IFS= read -r line; do - local fields=($line) - local nic_start_index=5 - local max_nics=$(( ${#fields[@]} - nic_start_index )) - local actual_nic_count=$(( max_nics < nic_count ? max_nics : nic_count )) - - local best_priority=-1 - local best_nic="" - - for ((nic_idx=0; nic_idx best_priority )); then - best_priority=$current_priority - best_nic="${nic_mapping[$nic_idx]}" - fi - done - - if [[ -n "$best_nic" ]]; then - optimal_nics+=("$best_nic") - fi - done <<< "$xpu_info" - - local IFS=, - export KVCACHE_RDMA_NICS="${optimal_nics[*]}" - echo "KVCACHE_RDMA_NICS=${optimal_nics[*]}" -} - -function get_vcpu_nics() { - ip -o addr show | awk '$3 == "inet" && $4 ~ /^10\./ {print "KV_CACHE_SOCKET_IFNAME="$2; exit}' -} - -function __main__() { - if [[ "$type" == "vxpu" ]]; then - get_vxpu_nics - return 0 - fi - if [[ "$type" == "vcpu" ]]; then - get_vcpu_nics - return 0 - fi - - # 处理 bond 情况 - if [[ "$type" == "cpu" ]]; then - for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do - bond_if=$(basename "$bond") - if ip link show "$bond_if" | grep -q "state UP" && \ - ip a show "$bond_if" | grep -q "inet "; then - printf "KV_CACHE_SOCKET_IFNAME=%s\n" "$bond_if" - return 0 - fi - done - fi - - if [[ "$type" == "cpu_ib" ]]; then - first=true - for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do - bond_if=$(basename "$bond") - __NEW_GPU_ROOTPORT_FILE__ - - ibdev=$(ibdev2netdev 2>/dev/null | grep -w "$bond_if" | awk '{print $1}') - if [ -n "$ibdev" ] && ip link show "$bond_if" | grep -q "state UP" && \ - ip a show "$bond_if" | grep -q "inet "; then - if $first; then - printf "KVCACHE_RDMA_NICS=%s" "$ibdev" - first=false - else - printf ",%s" "$ibdev" - fi - fi - - bondib=$(show_gids 2>/dev/null | grep -w "$bond_if" | awk '{print $1}' | grep "mlx.*bond" | head -1) - if [ -n "$bondib" ] && ip link show "$bond_if" | grep -q "state UP" && \ - ip a show "$bond_if" | grep -q "inet " && $first; then - printf "KVCACHE_RDMA_NICS=%s" "$bondib" - first=false - fi - - __RM_GPU_ROOTPORT_FILE__ - done - - ! $first && printf "\n" - [ ! $first ] && return 0 - fi - - local nic_types=("eth" "ib" "xgbe") - for nt in "${nic_types[@]}"; do - if ip a | grep -iq "$nt"; then - __NEW_GPU_ROOTPORT_FILE__ - NICNAME_TYPE=$nt - __JUDGE_NIC_TYPE__ - __RM_GPU_ROOTPORT_FILE__ - fi - done -} - -__main__