18 changes: 12 additions & 6 deletions apisix/plugins/ai-drivers/openai-base.lua
@@ -131,10 +131,12 @@ local function read_response(conf, ctx, res, response_filter)
core.log.info("got token usage from ai service: ",
core.json.delay_encode(data.usage))
ctx.llm_raw_usage = data.usage
local pt = data.usage.prompt_tokens or data.usage.input_tokens or 0
> **Contributor:** These changes appear to be unrelated to this PR; please split them into separate PRs.
local ct = data.usage.completion_tokens or data.usage.output_tokens or 0
ctx.ai_token_usage = {
prompt_tokens = data.usage.prompt_tokens or 0,
completion_tokens = data.usage.completion_tokens or 0,
total_tokens = data.usage.total_tokens or 0,
prompt_tokens = pt,
completion_tokens = ct,
total_tokens = data.usage.total_tokens or (pt + ct),
> **Copilot AI** (Mar 2, 2026), on lines +134 to +139: This PR introduces support for `input_tokens`/`output_tokens` (and a derived `total_tokens`) in the OpenAI driver, but that behavior change isn't mentioned in the PR description. Please either update the PR description/scope to include it or move this change into a separate PR; also consider adding/adjusting tests that cover responses using `input_tokens`/`output_tokens` to prevent regressions.
}
ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens
ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens
@@ -188,9 +190,13 @@ local function read_response(conf, ctx, res, response_filter)
ctx.ai_token_usage = {}
if type(res_body.usage) == "table" then
ctx.llm_raw_usage = res_body.usage
ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens or 0
ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens or 0
ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens or 0
ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens
or res_body.usage.input_tokens or 0
ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens
or res_body.usage.output_tokens or 0
ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens
or (ctx.ai_token_usage.prompt_tokens
+ ctx.ai_token_usage.completion_tokens)
end
ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0
ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens or 0
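
Both hunks apply the same fallback rule: prefer the OpenAI-style `prompt_tokens`/`completion_tokens` fields, fall back to `input_tokens`/`output_tokens`, and derive `total_tokens` as the sum when the provider omits it. A minimal Python sketch of that normalization (the plugin itself is Lua; this is illustrative only — the explicit `None` checks mirror Lua's `or`, which falls through only on `nil`, so an explicit `0` in the response is preserved):

```python
def _first(*values):
    # Return the first non-None value, like Lua's `or` chain over nil.
    for v in values:
        if v is not None:
            return v
    return None

def normalize_usage(usage: dict) -> dict:
    # Prefer OpenAI-style field names, fall back to input_tokens/output_tokens.
    pt = _first(usage.get("prompt_tokens"), usage.get("input_tokens"), 0)
    ct = _first(usage.get("completion_tokens"), usage.get("output_tokens"), 0)
    return {
        "prompt_tokens": pt,
        "completion_tokens": ct,
        # Derive the total only when the provider did not report one.
        "total_tokens": _first(usage.get("total_tokens"), pt + ct),
    }

# OpenAI-style payload: total derived from the sum.
print(normalize_usage({"prompt_tokens": 10, "completion_tokens": 5}))
# input/output-token style payload with no total field.
print(normalize_usage({"input_tokens": 7, "output_tokens": 3}))
```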
24 changes: 21 additions & 3 deletions apisix/plugins/ai-rate-limiting.lua
@@ -65,6 +65,10 @@ local schema = {
default = "total_tokens",
description = "The strategy to limit the tokens"
},
-- Use the OpenRouter/OpenAI-compatible standard header names so IDE plugins (Cursor/Continue) can recognize them directly
> **Contributor:** Please use English comments throughout.
-- true: X-RateLimit-Limit-Tokens / X-RateLimit-Remaining-Tokens / X-RateLimit-Reset-Tokens
-- false: X-AI-RateLimit-Limit-{instance} (original behavior)
standard_headers = {type = "boolean", default = false},
> **Copilot AI** (Mar 2, 2026), on lines +68 to +71: The new inline comments for `standard_headers` are in Chinese, while the surrounding file and English docs use English. To keep the codebase consistent and accessible to all contributors, please translate these comments (or move the explanation into the schema `description` field in English).
>
> Suggested change:
>
> ```lua
> standard_headers = {
>     type = "boolean",
>     default = false,
>     description = "Use OpenRouter/OpenAI-compatible standard rate limit header names (true: X-RateLimit-Limit-Tokens / X-RateLimit-Remaining-Tokens / X-RateLimit-Reset-Tokens; false: keep original behavior with X-AI-RateLimit-Limit-{instance})"
> },
> ```
instances = {
type = "array",
items = instance_limit_schema,
@@ -177,9 +181,23 @@ local function transform_limit_conf(plugin_conf, instance_conf, instance_name)
limit_conf._meta = plugin_conf._meta
limit_conf.count = limit
limit_conf.time_window = time_window
limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name
limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name
limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name

-- standard_headers=true emits the OpenRouter/OpenAI-compatible header names
-- IDE plugins (Cursor/Continue) can recognize them and back off automatically
if plugin_conf.standard_headers then
local strategy = plugin_conf.limit_strategy or "total_tokens"
local suffix = strategy == "total_tokens" and "Tokens"
or strategy == "prompt_tokens" and "PromptTokens"
or "CompletionTokens"
limit_conf.limit_header = "X-RateLimit-Limit-" .. suffix
limit_conf.remaining_header = "X-RateLimit-Remaining-" .. suffix
limit_conf.reset_header = "X-RateLimit-Reset-" .. suffix
else
limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name
limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name
limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name
end
> **Copilot AI** (Mar 2, 2026), on lines +185 to +199: The newly added `standard_headers` option does not take effect when `conf.rules` is configured. `transform_limit_conf()` returns early for rules, and limit-count always sets `rule.header_prefix` (index if unset), which forces `X-{prefix}-RateLimit-*` header names and ignores `conf.limit_header`/`remaining_header`/`reset_header`. Consider either (1) rejecting `standard_headers=true` when `rules` is set (schema/custom validation) and documenting the limitation, or (2) reworking the rules path so standard header names can be emitted.

return limit_conf
end
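
The branch added above reduces to a small mapping from `limit_strategy` to a header suffix, plus the legacy per-instance fallback. A Python sketch of the selection (illustrative only; the plugin is Lua, and defaulting unknown strategies to `CompletionTokens` mirrors the `and`/`or` chain in the diff):

```python
def header_names(standard_headers: bool, limit_strategy: str, instance: str) -> dict:
    """Pick rate-limit response header names the way transform_limit_conf() does."""
    if standard_headers:
        # total_tokens -> Tokens, prompt_tokens -> PromptTokens,
        # anything else -> CompletionTokens.
        suffix = {
            "total_tokens": "Tokens",
            "prompt_tokens": "PromptTokens",
        }.get(limit_strategy, "CompletionTokens")
        return {
            "limit": f"X-RateLimit-Limit-{suffix}",
            "remaining": f"X-RateLimit-Remaining-{suffix}",
            "reset": f"X-RateLimit-Reset-{suffix}",
        }
    # Legacy behavior: per-instance header names.
    return {
        "limit": f"X-AI-RateLimit-Limit-{instance}",
        "remaining": f"X-AI-RateLimit-Remaining-{instance}",
        "reset": f"X-AI-RateLimit-Reset-{instance}",
    }

print(header_names(True, "total_tokens", "my-llm"))
print(header_names(False, "total_tokens", "my-llm"))
```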

86 changes: 86 additions & 0 deletions docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md
@@ -0,0 +1,86 @@
# ai-rate-limiting — `standard_headers` Parameter

## Overview

The `standard_headers` option makes `ai-rate-limiting` emit rate-limit response
headers that follow the [OpenRouter / OpenAI convention][openrouter-headers],
so IDE extensions such as **Cursor** and **Continue** can detect quota exhaustion
and apply automatic back-off without any custom configuration.

[openrouter-headers]: https://openrouter.ai/docs/api-reference/limits

## New Parameter

| Parameter | Type | Default | Description |
|---|---|---|---|
| `standard_headers` | boolean | `false` | When `true`, emit OpenAI/OpenRouter-compatible rate-limit headers instead of the legacy `X-AI-RateLimit-*` headers. |

> **Copilot AI** (Mar 2, 2026), on lines +1 to +17: This file documents `standard_headers`, but the main plugin documentation (docs/en/latest/plugins/ai-rate-limiting.md) is still the canonical reference and currently has no mention of `standard_headers` (and its header names) while it explicitly documents `X-AI-RateLimit-*`. Please update ai-rate-limiting.md directly (and consider removing this "patch" file or clearly integrating/linking it) so users can discover the option in the standard docs.
The header suffix is derived from `limit_strategy`:

| `limit_strategy` | Header suffix |
|---|---|
| `total_tokens` (default) | `Tokens` |
| `prompt_tokens` | `PromptTokens` |
| `completion_tokens` | `CompletionTokens` |

## Configuration Example

```yaml
routes:
- id: 1
uri: /v1/chat/completions
plugins:
ai-proxy-multi:
instances:
- name: my-llm
provider: openai
weight: 1
auth:
header:
Authorization: "Bearer ${{OPENAI_API_KEY}}"
options:
model: gpt-4o-mini
ai-rate-limiting:
instances:
- name: my-llm
limit: 100000
time_window: 60
limit_strategy: total_tokens
standard_headers: true # <-- enable standard headers
rejected_code: 429
```

## Response Headers

### Normal request (quota available)

```
HTTP/1.1 200 OK
X-RateLimit-Limit-Tokens: 100000
X-RateLimit-Remaining-Tokens: 99985
X-RateLimit-Reset-Tokens: 42
```

### Rate-limited request (quota exhausted)

```
HTTP/1.1 429 Too Many Requests
X-RateLimit-Limit-Tokens: 100000
X-RateLimit-Remaining-Tokens: 0
X-RateLimit-Reset-Tokens: 18
```

### With `limit_strategy: prompt_tokens`

```
HTTP/1.1 200 OK
X-RateLimit-Limit-PromptTokens: 50000
X-RateLimit-Remaining-PromptTokens: 49990
X-RateLimit-Reset-PromptTokens: 55
```
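
With standard header names, a client can implement the back-off described in the overview without any plugin-specific parsing. A hypothetical client-side sketch in Python (the header names come from this doc; the back-off policy itself is an assumption, not part of the plugin):

```python
def backoff_seconds(headers: dict) -> float:
    """Return how long a client should pause before retrying: the reset
    interval when the token quota is exhausted, zero otherwise."""
    remaining = int(headers.get("X-RateLimit-Remaining-Tokens", "1"))
    reset = float(headers.get("X-RateLimit-Reset-Tokens", "0"))
    return reset if remaining <= 0 else 0.0

# Quota exhausted (as in the 429 example above): wait out the reset window.
print(backoff_seconds({"X-RateLimit-Remaining-Tokens": "0",
                       "X-RateLimit-Reset-Tokens": "18"}))  # 18.0
```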

## Backward Compatibility

Setting `standard_headers: false` (or omitting it) preserves the original
`X-AI-RateLimit-Limit-{instance_name}` header format, so existing integrations
are unaffected.