From 7a0b8798c704bf4d05be8e89649289a9e47977cc Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:10:32 +0800 Subject: [PATCH 1/4] fix(ai-proxy): support Anthropic token field names in openai-base driver When using openai-compatible provider with Anthropic-format endpoints (e.g. DeepSeek's /anthropic/v1/messages), the response returns input_tokens/output_tokens instead of prompt_tokens/completion_tokens. This patch adds fallback support for both field names in both streaming and non-streaming paths, so token usage statistics work correctly regardless of which format the upstream LLM returns. Fixes token stats being 0 when proxying to Anthropic-compatible endpoints. --- apisix/plugins/ai-drivers/openai-base.lua | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 4f279bbc3eab..6fc7e8bcb36d 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -132,8 +132,10 @@ local function read_response(conf, ctx, res, response_filter) core.json.delay_encode(data.usage)) ctx.llm_raw_usage = data.usage ctx.ai_token_usage = { - prompt_tokens = data.usage.prompt_tokens or 0, - completion_tokens = data.usage.completion_tokens or 0, + prompt_tokens = data.usage.prompt_tokens + or data.usage.input_tokens or 0, + completion_tokens = data.usage.completion_tokens + or data.usage.output_tokens or 0, total_tokens = data.usage.total_tokens or 0, } ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens @@ -188,8 +190,10 @@ local function read_response(conf, ctx, res, response_filter) ctx.ai_token_usage = {} if type(res_body.usage) == "table" then ctx.llm_raw_usage = res_body.usage - ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens or 0 - ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens or 0 + ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens + or 
res_body.usage.input_tokens or 0 + ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens + or res_body.usage.output_tokens or 0 ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens or 0 end ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0 From 03608d902f03ff15ca465b7594bea97ab5eeb667 Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:30:32 +0800 Subject: [PATCH 2/4] fix(ai-proxy): compute total_tokens fallback for Anthropic format in streaming path --- apisix/plugins/ai-drivers/openai-base.lua | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 6fc7e8bcb36d..5e23c1c995d6 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -131,12 +131,12 @@ local function read_response(conf, ctx, res, response_filter) core.log.info("got token usage from ai service: ", core.json.delay_encode(data.usage)) ctx.llm_raw_usage = data.usage + local pt = data.usage.prompt_tokens or data.usage.input_tokens or 0 + local ct = data.usage.completion_tokens or data.usage.output_tokens or 0 ctx.ai_token_usage = { - prompt_tokens = data.usage.prompt_tokens - or data.usage.input_tokens or 0, - completion_tokens = data.usage.completion_tokens - or data.usage.output_tokens or 0, - total_tokens = data.usage.total_tokens or 0, + prompt_tokens = pt, + completion_tokens = ct, + total_tokens = data.usage.total_tokens or (pt + ct), } ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens From d88f36035a6856f1710cc70260aa4313d6ef3a96 Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:37:49 +0800 Subject: [PATCH 3/4] fix(ai-proxy): also compute total_tokens fallback in non-streaming path --- apisix/plugins/ai-drivers/openai-base.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 5e23c1c995d6..959c14b85c23 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -194,7 +194,9 @@ local function read_response(conf, ctx, res, response_filter) or res_body.usage.input_tokens or 0 ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens or res_body.usage.output_tokens or 0 - ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens or 0 + ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens + or (ctx.ai_token_usage.prompt_tokens + + ctx.ai_token_usage.completion_tokens) end ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0 ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens or 0 From 526eee299c557460ddb135b387a96267b48fc432 Mon Sep 17 00:00:00 2001 From: iakuf Date: Sat, 28 Feb 2026 18:02:16 +0800 Subject: [PATCH 4/4] feat(ai-rate-limiting): add standard_headers option for OpenAI/OpenRouter-compatible rate-limit headers --- apisix/plugins/ai-rate-limiting.lua | 24 +- ...ai-rate-limiting-standard-headers-patch.md | 86 ++++ t/plugin/ai-rate-limiting-standard-headers.t | 393 ++++++++++++++++++ 3 files changed, 500 insertions(+), 3 deletions(-) create mode 100644 docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md create mode 100644 t/plugin/ai-rate-limiting-standard-headers.t diff --git a/apisix/plugins/ai-rate-limiting.lua b/apisix/plugins/ai-rate-limiting.lua index 8c7eea51aee9..e10e5258b494 100644 --- a/apisix/plugins/ai-rate-limiting.lua +++ b/apisix/plugins/ai-rate-limiting.lua @@ -65,6 +65,10 @@ local schema = { default = "total_tokens", description = "The strategy to limit the tokens" }, + -- Use the OpenRouter/OpenAI-compatible standard header names; IDE plugins (Cursor/Continue) can recognize them directly + -- true: X-RateLimit-Limit-Tokens / X-RateLimit-Remaining-Tokens / X-RateLimit-Reset-Tokens + -- false: X-AI-RateLimit-Limit-{instance} (legacy behavior) + standard_headers = {type = 
"boolean", default = false}, instances = { type = "array", items = instance_limit_schema, @@ -177,9 +181,23 @@ local function transform_limit_conf(plugin_conf, instance_conf, instance_name) limit_conf._meta = plugin_conf._meta limit_conf.count = limit limit_conf.time_window = time_window - limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name - limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name - limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name + + -- standard_headers=true emits OpenRouter/OpenAI-compatible header names + -- so IDE plugins (Cursor/Continue) can recognize them and back off automatically + if plugin_conf.standard_headers then + local strategy = plugin_conf.limit_strategy or "total_tokens" + local suffix = strategy == "total_tokens" and "Tokens" + or strategy == "prompt_tokens" and "PromptTokens" + or "CompletionTokens" + limit_conf.limit_header = "X-RateLimit-Limit-" .. suffix + limit_conf.remaining_header = "X-RateLimit-Remaining-" .. suffix + limit_conf.reset_header = "X-RateLimit-Reset-" .. suffix + else + limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name + limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name + limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name + end + return limit_conf end diff --git a/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md b/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md new file mode 100644 index 000000000000..f42789b52436 --- /dev/null +++ b/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md @@ -0,0 +1,86 @@ +# ai-rate-limiting — `standard_headers` Parameter + +## Overview + +The `standard_headers` option makes `ai-rate-limiting` emit rate-limit response +headers that follow the [OpenRouter / OpenAI convention][openrouter-headers], +so IDE extensions such as **Cursor** and **Continue** can detect quota exhaustion +and apply automatic back-off without any custom configuration. 
+ +[openrouter-headers]: https://openrouter.ai/docs/api-reference/limits + +## New Parameter + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `standard_headers` | boolean | `false` | When `true`, emit OpenAI/OpenRouter-compatible rate-limit headers instead of the legacy `X-AI-RateLimit-*` headers. | + +The header suffix is derived from `limit_strategy`: + +| `limit_strategy` | Header suffix | +|---|---| +| `total_tokens` (default) | `Tokens` | +| `prompt_tokens` | `PromptTokens` | +| `completion_tokens` | `CompletionTokens` | + +## Configuration Example + +```yaml +routes: + - id: 1 + uri: /v1/chat/completions + plugins: + ai-proxy-multi: + instances: + - name: my-llm + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer ${{OPENAI_API_KEY}}" + options: + model: gpt-4o-mini + ai-rate-limiting: + instances: + - name: my-llm + limit: 100000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true # <-- enable standard headers + rejected_code: 429 +``` + +## Response Headers + +### Normal request (quota available) + +``` +HTTP/1.1 200 OK +X-RateLimit-Limit-Tokens: 100000 +X-RateLimit-Remaining-Tokens: 99985 +X-RateLimit-Reset-Tokens: 42 +``` + +### Rate-limited request (quota exhausted) + +``` +HTTP/1.1 429 Too Many Requests +X-RateLimit-Limit-Tokens: 100000 +X-RateLimit-Remaining-Tokens: 0 +X-RateLimit-Reset-Tokens: 18 +``` + +### With `limit_strategy: prompt_tokens` + +``` +HTTP/1.1 200 OK +X-RateLimit-Limit-PromptTokens: 50000 +X-RateLimit-Remaining-PromptTokens: 49990 +X-RateLimit-Reset-PromptTokens: 55 +``` + +## Backward Compatibility + +Setting `standard_headers: false` (or omitting it) preserves the original +`X-AI-RateLimit-Limit-{instance_name}` header format, so existing integrations +are unaffected. 
diff --git a/t/plugin/ai-rate-limiting-standard-headers.t b/t/plugin/ai-rate-limiting-standard-headers.t new file mode 100644 index 000000000000..8611e6b14e45 --- /dev/null +++ b/t/plugin/ai-rate-limiting-standard-headers.t @@ -0,0 +1,393 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $extra_yaml_config = <<_EOC_; +plugins: + - ai-proxy-multi + - ai-rate-limiting + - prometheus +_EOC_ + $block->set_value("extra_yaml_config", $extra_yaml_config); + + # Default mock LLM backend on port 6799 + if (!defined $block->http_config) { + my $http_config = <<_EOC_; + server { + server_name mock-llm; + listen 6799; + + default_type 'application/json'; + + location /v1/chat/completions { + content_by_lua_block { + ngx.status = 200 + ngx.say([[{ + "id": "chatcmpl-test", + "object": "chat.completion", + "choices": [{"index":0,"message":{"role":"assistant","content":"hi"},"finish_reason":"stop"}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + }]]) + } + } + } +_EOC_ + $block->set_value("http_config", $http_config); + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: schema check — standard_headers field is accepted +--- apisix_yaml +routes: + - id: 1 + uri: /t + plugins: + ai-rate-limiting: + instances: + - name: mock-instance + limit: 1000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 + upstream: + nodes: + "127.0.0.1:6799": 1 + type: roundrobin +#END +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-rate-limiting") + local ok, err = plugin.check_schema({ + instances = { + { name = "mock-instance", limit = 1000, time_window = 60 } + }, + limit_strategy = "total_tokens", + standard_headers = true, + rejected_code = 429, + }) + if not ok then + ngx.say("schema error: ", err) + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 2: schema check — standard_headers defaults to false +--- apisix_yaml +routes: + - id: 1 + uri: /t + plugins: + ai-rate-limiting: + 
instances: + - name: mock-instance + limit: 1000 + time_window: 60 + upstream: + nodes: + "127.0.0.1:6799": 1 + type: roundrobin +#END +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-rate-limiting") + local conf = { + instances = { + { name = "mock-instance", limit = 1000, time_window = 60 } + }, + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say("schema error: ", err) + return + end + -- default should be false + if conf.standard_headers == false then + ngx.say("default is false") + else + ngx.say("unexpected default: ", tostring(conf.standard_headers)) + end + } + } +--- response_body +default is false + + + +=== TEST 3: standard_headers=true returns X-RateLimit-Limit-Tokens header +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-Tokens: \d+ +X-RateLimit-Remaining-Tokens: \d+ +X-RateLimit-Reset-Tokens: \d+ + + + +=== TEST 4: standard_headers=true, 429 response has Remaining-Tokens: 0 +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: 
+ instances: + - name: mock-instance + limit: 1 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 +#END +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local httpc = http.new() + + -- First request: should succeed and consume the 1-token budget + local res1, err = httpc:request_uri("http://127.0.0.1:" .. ngx.var.server_port .. "/anything", { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["apikey"] = "test-key-123", + }, + body = [[{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]}]], + }) + if not res1 then + ngx.say("req1 error: ", err) + return + end + + -- Second request: should be rate-limited (429) + local res2, err = httpc:request_uri("http://127.0.0.1:" .. ngx.var.server_port .. "/anything", { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["apikey"] = "test-key-123", + }, + body = [[{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi again"}]}]], + }) + if not res2 then + ngx.say("req2 error: ", err) + return + end + + ngx.say("status: ", res2.status) + local remaining = res2.headers["X-RateLimit-Remaining-Tokens"] + ngx.say("remaining: ", remaining or "nil") + } + } +--- response_body +status: 429 +remaining: 0 + + + +=== TEST 5: limit_strategy=prompt_tokens uses PromptTokens suffix +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: prompt_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- 
more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-PromptTokens: \d+ +X-RateLimit-Remaining-PromptTokens: \d+ +X-RateLimit-Reset-PromptTokens: \d+ + + + +=== TEST 6: limit_strategy=completion_tokens uses CompletionTokens suffix +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: completion_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-CompletionTokens: \d+ +X-RateLimit-Remaining-CompletionTokens: \d+ +X-RateLimit-Reset-CompletionTokens: \d+ + + + +=== TEST 7: standard_headers=false (default) outputs legacy X-AI-RateLimit headers +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: false + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-AI-RateLimit-Limit-mock-instance: 
\d+ +X-AI-RateLimit-Remaining-mock-instance: \d+ +X-AI-RateLimit-Reset-mock-instance: \d+