6 changes: 6 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
@@ -986,12 +986,15 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
                                   const paddle::Tensor& max_think_lens,
                                   const paddle::Tensor& step_idx,
                                   const paddle::Tensor& limit_think_status,
+                                  const paddle::Tensor& stop_flags,
+                                  const paddle::Tensor& eos_token_ids,
                                   const int64_t think_end_id);
 
 void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
                                   const paddle::Tensor& max_think_lens,
                                   const paddle::Tensor& step_idx,
                                   const paddle::Tensor& limit_think_status,
+                                  const paddle::Tensor& stop_flags,
                                   const int64_t think_end_id,
                                   const int64_t line_break_id);
 
@@ -1002,6 +1005,8 @@ void SpeculateLimitThinkingContentLengthV1(
     const paddle::Tensor& limit_think_status,
     const paddle::Tensor& accept_num,
     const paddle::Tensor& seq_lens_decoder,
+    const paddle::Tensor& stop_flags,
+    const paddle::Tensor& eos_token_ids,
     const int64_t think_end_id);
 
 void SpeculateLimitThinkingContentLengthV2(
@@ -1011,6 +1016,7 @@ void SpeculateLimitThinkingContentLengthV2(
     const paddle::Tensor& limit_think_status,
     const paddle::Tensor& accept_num,
     const paddle::Tensor& seq_lens_decoder,
+    const paddle::Tensor& stop_flags,
     const int64_t think_end_id,
     const int64_t line_break_id);
118 changes: 73 additions & 45 deletions custom_ops/gpu_ops/limit_thinking_content_length_v1.cu
@@ -19,69 +19,97 @@ __global__ void limit_thinking_content_length_kernel_v1(
     int64_t *next_tokens,
     const int *max_think_lens,
     const int64_t *step_idx,
+    const int64_t *eos_token_ids,
     int *limit_think_status,
+    bool *stop_flags,
     const int64_t think_end_id,
-    const int bs) {
+    const int bs,
+    const int eos_token_id_len) {
   int bid = threadIdx.x;
   if (bid >= bs) return;
 
   // If thinking is not enabled for this sequence, return immediately;
   // the default value -1 means the thinking length is not limited.
   const int max_think_len = max_think_lens[bid];
   if (max_think_len < 0) return;
   int current_limit_think_status = limit_think_status[bid];
   // If we are in the response phase and the stop flag has already been
   // triggered, return immediately; there is nothing left to do.
-  if (current_limit_think_status == 2) {
+  if (current_limit_think_status == 2 && stop_flags[bid]) {
     return;
   }
 
   int64_t next_token = next_tokens[bid];
   const int64_t step = step_idx[bid];
 
   // ======================= thinking-phase control =======================
   // Phase 1: still thinking (status == 0); check whether to force an end.
   if (current_limit_think_status < 1) {
     // With thinking-length control enabled, check whether the budget is spent.
     if (step >= max_think_len) {
       // Force-replace the current token with the end-of-thinking token.
       next_token = think_end_id;
       // Advance the status to 1, meaning "ending thinking now".
       current_limit_think_status = 1;
+    } else {
+      // Check whether an EOS token was generated.
+      for (int i = 0; i < eos_token_id_len; i++) {
+        if (eos_token_ids[i] == next_token) {
+          // Force-replace the current token with the end-of-thinking token.
+          next_token = think_end_id;
+          // Advance the status to 1, meaning "ending thinking now".
+          current_limit_think_status = 1;
+          if (stop_flags[bid]) {
+            stop_flags[bid] = false;
+          }
+          break;
+        }
+      }
     }
   }
   // ======================= end-of-thinking handling =======================
   // Phase 2: check whether the end-of-thinking condition is met (status < 2).
   // This covers two scenarios:
   // 1. status == 0: the model generated think_end_id on its own
   // 2. status == 1: the previous phase force-injected think_end_id
   if (current_limit_think_status < 2) {
     if (next_token == think_end_id) {
       // Thinking is confirmed over; advance the status to 2 (response phase).
       current_limit_think_status = 2;
     }
   }
   // Write back the updated token.
   next_tokens[bid] = next_token;
   // Update the global status.
   limit_think_status[bid] = current_limit_think_status;
 }
 
 void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
                                   const paddle::Tensor &max_think_lens,
                                   const paddle::Tensor &step_idx,
                                   const paddle::Tensor &limit_think_status,
+                                  const paddle::Tensor &stop_flags,
+                                  const paddle::Tensor &eos_token_ids,
                                   const int64_t think_end_id) {
   const int batch_size = next_tokens.shape()[0];
+  const int eos_token_id_len = eos_token_ids.shape()[0];
   limit_thinking_content_length_kernel_v1<<<1, 1024>>>(
       const_cast<int64_t *>(next_tokens.data<int64_t>()),
       max_think_lens.data<int>(),
       step_idx.data<int64_t>(),
+      eos_token_ids.data<int64_t>(),
       const_cast<int *>(limit_think_status.data<int>()),
+      const_cast<bool *>(stop_flags.data<bool>()),
       think_end_id,
-      batch_size);
+      batch_size,
+      eos_token_id_len);
 }
 
 PD_BUILD_STATIC_OP(limit_thinking_content_length_v1)
-    .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status",
+             "stop_flags",
+             "eos_token_ids"})
     .Attrs({"think_end_id: int64_t"})
     .Outputs({"next_tokens_out"})
     .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
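To make the v1 logic easier to follow, here is a minimal host-side C++ sketch of the same per-sequence state machine (status 0 = thinking, 1 = forcing the end of thinking, 2 = responding). This is an illustrative reconstruction, not part of the PR: the Seq struct, the step_v1 helper, and the token ids in main are made up for the example.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical per-sequence view of the kernel's state (names are illustrative).
struct Seq {
    int64_t next_token;   // token sampled at this step
    int64_t step;         // current decoding step
    int max_think_len;    // thinking budget; -1 disables the limit
    int status;           // limit_think_status for this sequence
    bool stop;            // stop flag (set when an EOS was sampled)
};

void step_v1(Seq &s, const std::vector<int64_t> &eos_ids, int64_t think_end_id) {
    if (s.max_think_len < 0) return;      // limit disabled for this sequence
    if (s.status == 2 && s.stop) return;  // responding and already stopped
    if (s.status < 1) {
        if (s.step >= s.max_think_len) {
            // Budget exhausted: force the end-of-thinking token.
            s.next_token = think_end_id;
            s.status = 1;
        } else {
            // An EOS sampled while still thinking is rewritten to the
            // end-of-thinking token, and the stop flag is cancelled so
            // decoding continues into the response phase.
            for (int64_t eos : eos_ids) {
                if (eos == s.next_token) {
                    s.next_token = think_end_id;
                    s.status = 1;
                    s.stop = false;
                    break;
                }
            }
        }
    }
    if (s.status < 2 && s.next_token == think_end_id) {
        s.status = 2;  // thinking finished; enter the response phase
    }
}

int main() {
    // An EOS (id 2, assumed) sampled at step 5 of a 100-token thinking budget:
    // it is rewritten to the end-of-thinking token (id 7, assumed) and the
    // stop flag is cleared.
    Seq s{/*next_token=*/2, /*step=*/5, /*max_think_len=*/100, /*status=*/0, /*stop=*/true};
    step_v1(s, /*eos_ids=*/{2}, /*think_end_id=*/7);
    std::printf("token=%lld status=%d stop=%d\n",
                (long long)s.next_token, s.status, (int)s.stop);
    return 0;
}

Running this prints token=7 status=2 stop=0: the premature EOS is swallowed, the sequence is marked as responding, and generation is allowed to continue.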
133 changes: 70 additions & 63 deletions custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
@@ -24,87 +24,94 @@ __global__ void limit_thinking_content_length_kernel_v2(
     const int *max_think_lens,
     const int64_t *step_idx,
     int *limit_think_status,
+    const bool *stop_flags,
     const int64_t think_end_id,
     const int64_t line_break_id,
     const int bs) {
   int bid = threadIdx.x;
   if (bid >= bs) return;
   // If thinking is not enabled for this sequence, return immediately;
   // the default value -1 means the thinking length is not limited.
   const int max_think_len = max_think_lens[bid];
   if (max_think_len < 0) return;
   int current_limit_think_status = limit_think_status[bid];
   // If we are in the response phase and the stop flag has already been
   // triggered, return immediately; there is nothing left to do.
-  if (current_limit_think_status == 3) {
+  if (current_limit_think_status == 3 && stop_flags[bid]) {
     return;
   }
 
   int64_t next_token = next_tokens[bid];
   const int64_t step = step_idx[bid];
 
   // ======================= thinking-phase control =======================
   // Phase 1: still thinking (status == 0); check whether to force an end.
   // Phase 2: replacing (status == 1); check whether the replacement is done.
   if (current_limit_think_status <= 1) {
     // With thinking-length control enabled, check whether the budget is spent.
     if (step == max_think_len) {
       // Force-replace the current token with the end-of-thinking sequence.
       next_token = line_break_id;
       current_limit_think_status = 1;
     } else if (step == max_think_len + 1) {
       next_token = think_end_id;
       current_limit_think_status = 1;
     } else if (step == max_think_len + 2) {
       next_token = line_break_id;
       current_limit_think_status = 1;
     } else if (step == max_think_len + 3) {
       next_token = line_break_id;
       // Advance the status to 2, meaning "the forced ending is complete".
       current_limit_think_status = 2;
     }
+    if (current_limit_think_status == 2) {
+      // Thinking is confirmed over; advance the status to 3 (response phase).
+      current_limit_think_status = 3;
+    }
   }
   // ======================= end-of-thinking handling =======================
   // Phase 3: check whether the end-of-thinking condition is met
   // (status == 0 || status == 2). This covers two scenarios:
   // 1. status == 0: the model may have generated </think> on its own
   // 2. status == 2: the previous phase force-injected \n</think>\n\n
   if (current_limit_think_status == 0) {
     if (next_token == think_end_id) {
       // Thinking is confirmed over; advance the status to 3 (response phase).
       current_limit_think_status = 3;
     }
   }
   if (current_limit_think_status == 2) {
     // Thinking is confirmed over; advance the status to 3 (response phase).
     current_limit_think_status = 3;
   }
   // Write back the updated token.
   next_tokens[bid] = next_token;
   // Update the global status.
   limit_think_status[bid] = current_limit_think_status;
 }
 
 void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
                                   const paddle::Tensor &max_think_lens,
                                   const paddle::Tensor &step_idx,
                                   const paddle::Tensor &limit_think_status,
+                                  const paddle::Tensor &stop_flags,
                                   const int64_t think_end_id,
                                   const int64_t line_break_id) {
   const int batch_size = next_tokens.shape()[0];
   limit_thinking_content_length_kernel_v2<<<1, 1024>>>(
       const_cast<int64_t *>(next_tokens.data<int64_t>()),
       max_think_lens.data<int>(),
       step_idx.data<int64_t>(),
       const_cast<int *>(limit_think_status.data<int>()),
+      stop_flags.data<bool>(),
       think_end_id,
       line_break_id,
       batch_size);
 }
 
 PD_BUILD_STATIC_OP(limit_thinking_content_length_v2)
-    .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status",
+             "stop_flags"})
     .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
     .Outputs({"next_tokens_out"})
    .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
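In the v2 variant the forced ending is a fixed four-token tail rather than a single token: once step reaches max_think_len, the kernel overwrites the next four sampled tokens with line_break_id, think_end_id, line_break_id, line_break_id, i.e. the text "\n</think>\n\n". The condensed C++ sketch below mirrors that schedule for one sequence; it is illustrative only (the force_v2 helper and the token ids 7, 10, and 42 in main are assumptions, not part of the PR).

#include <cstdint>
#include <cstdio>

// status: 0 = thinking, 1 = mid-injection, 2 = injection finished, 3 = responding
int64_t force_v2(int64_t sampled, int64_t step, int max_think_len, int &status,
                 int64_t think_end_id, int64_t line_break_id) {
    if (max_think_len < 0) return sampled;  // limit disabled
    if (status <= 1) {
        // Four consecutive steps overwrite the sampled token with "\n</think>\n\n".
        if (step == max_think_len)          { sampled = line_break_id; status = 1; }
        else if (step == max_think_len + 1) { sampled = think_end_id;  status = 1; }
        else if (step == max_think_len + 2) { sampled = line_break_id; status = 1; }
        else if (step == max_think_len + 3) { sampled = line_break_id; status = 2; }
    }
    if (status == 0 && sampled == think_end_id) status = 3;  // model closed thinking itself
    if (status == 2) status = 3;                             // forced tail complete
    return sampled;
}

int main() {
    int status = 0;
    // With max_think_len = 4, steps 4..7 emit the forced tail regardless of
    // what was sampled (id 42): line_break = 10, think_end = 7 (both assumed).
    for (int64_t step = 4; step <= 7; ++step) {
        int64_t tok = force_v2(/*sampled=*/42, step, /*max_think_len=*/4, status,
                               /*think_end_id=*/7, /*line_break_id=*/10);
        std::printf("step=%lld token=%lld status=%d\n",
                    (long long)step, (long long)tok, status);
    }
    return 0;
}

The loop prints tokens 10, 7, 10, 10 with the status reaching 3 on the last injected token, which is when the sequence enters the response phase and the stop-flag early-return in the kernel becomes relevant.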