From 7a0b8798c704bf4d05be8e89649289a9e47977cc Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:10:32 +0800 Subject: [PATCH 1/4] fix(ai-proxy): support Anthropic token field names in openai-base driver When using openai-compatible provider with Anthropic-format endpoints (e.g. DeepSeek's /anthropic/v1/messages), the response returns input_tokens/output_tokens instead of prompt_tokens/completion_tokens. This patch adds fallback support for both field names in both streaming and non-streaming paths, so token usage statistics work correctly regardless of which format the upstream LLM returns. Fixes token stats being 0 when proxying to Anthropic-compatible endpoints. --- apisix/plugins/ai-drivers/openai-base.lua | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 4f279bbc3eab..6fc7e8bcb36d 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -132,8 +132,10 @@ local function read_response(conf, ctx, res, response_filter) core.json.delay_encode(data.usage)) ctx.llm_raw_usage = data.usage ctx.ai_token_usage = { - prompt_tokens = data.usage.prompt_tokens or 0, - completion_tokens = data.usage.completion_tokens or 0, + prompt_tokens = data.usage.prompt_tokens + or data.usage.input_tokens or 0, + completion_tokens = data.usage.completion_tokens + or data.usage.output_tokens or 0, total_tokens = data.usage.total_tokens or 0, } ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens @@ -188,8 +190,10 @@ local function read_response(conf, ctx, res, response_filter) ctx.ai_token_usage = {} if type(res_body.usage) == "table" then ctx.llm_raw_usage = res_body.usage - ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens or 0 - ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens or 0 + ctx.ai_token_usage.prompt_tokens = res_body.usage.prompt_tokens + or 
res_body.usage.input_tokens or 0 + ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens + or res_body.usage.output_tokens or 0 ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens or 0 end ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0 From 03608d902f03ff15ca465b7594bea97ab5eeb667 Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:30:32 +0800 Subject: [PATCH 2/4] fix(ai-proxy): compute total_tokens fallback for Anthropic format in streaming path --- apisix/plugins/ai-drivers/openai-base.lua | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 6fc7e8bcb36d..5e23c1c995d6 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -131,12 +131,12 @@ local function read_response(conf, ctx, res, response_filter) core.log.info("got token usage from ai service: ", core.json.delay_encode(data.usage)) ctx.llm_raw_usage = data.usage + local pt = data.usage.prompt_tokens or data.usage.input_tokens or 0 + local ct = data.usage.completion_tokens or data.usage.output_tokens or 0 ctx.ai_token_usage = { - prompt_tokens = data.usage.prompt_tokens - or data.usage.input_tokens or 0, - completion_tokens = data.usage.completion_tokens - or data.usage.output_tokens or 0, - total_tokens = data.usage.total_tokens or 0, + prompt_tokens = pt, + completion_tokens = ct, + total_tokens = data.usage.total_tokens or (pt + ct), } ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens From d88f36035a6856f1710cc70260aa4313d6ef3a96 Mon Sep 17 00:00:00 2001 From: iakuf Date: Thu, 26 Feb 2026 22:37:49 +0800 Subject: [PATCH 3/4] fix(ai-proxy): also compute total_tokens fallback in non-streaming path --- apisix/plugins/ai-drivers/openai-base.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/apisix/plugins/ai-drivers/openai-base.lua b/apisix/plugins/ai-drivers/openai-base.lua index 5e23c1c995d6..959c14b85c23 100644 --- a/apisix/plugins/ai-drivers/openai-base.lua +++ b/apisix/plugins/ai-drivers/openai-base.lua @@ -194,7 +194,9 @@ local function read_response(conf, ctx, res, response_filter) or res_body.usage.input_tokens or 0 ctx.ai_token_usage.completion_tokens = res_body.usage.completion_tokens or res_body.usage.output_tokens or 0 - ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens or 0 + ctx.ai_token_usage.total_tokens = res_body.usage.total_tokens + or (ctx.ai_token_usage.prompt_tokens + + ctx.ai_token_usage.completion_tokens) end ctx.var.llm_prompt_tokens = ctx.ai_token_usage.prompt_tokens or 0 ctx.var.llm_completion_tokens = ctx.ai_token_usage.completion_tokens or 0 From 526eee299c557460ddb135b387a96267b48fc432 Mon Sep 17 00:00:00 2001 From: iakuf Date: Sat, 28 Feb 2026 18:02:16 +0800 Subject: [PATCH 4/4] feat(ai-rate-limiting): add standard_headers option for OpenAI/OpenRouter-compatible rate-limit headers --- apisix/plugins/ai-rate-limiting.lua | 24 +- ...ai-rate-limiting-standard-headers-patch.md | 86 ++++ t/plugin/ai-rate-limiting-standard-headers.t | 393 ++++++++++++++++++ 3 files changed, 500 insertions(+), 3 deletions(-) create mode 100644 docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md create mode 100644 t/plugin/ai-rate-limiting-standard-headers.t diff --git a/apisix/plugins/ai-rate-limiting.lua b/apisix/plugins/ai-rate-limiting.lua index 8c7eea51aee9..e10e5258b494 100644 --- a/apisix/plugins/ai-rate-limiting.lua +++ b/apisix/plugins/ai-rate-limiting.lua @@ -65,6 +65,10 @@ local schema = { default = "total_tokens", description = "The strategy to limit the tokens" }, + -- Use the OpenRouter/OpenAI-compatible standard header names; IDE plugins (Cursor/Continue) can recognize them directly + -- true: X-RateLimit-Limit-Tokens / X-RateLimit-Remaining-Tokens / X-RateLimit-Reset-Tokens + -- false: X-AI-RateLimit-Limit-{instance} (legacy behavior) + standard_headers = {type = 
"boolean", default = false}, instances = { type = "array", items = instance_limit_schema, @@ -177,9 +181,23 @@ local function transform_limit_conf(plugin_conf, instance_conf, instance_name) limit_conf._meta = plugin_conf._meta limit_conf.count = limit limit_conf.time_window = time_window - limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name - limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name - limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name + + -- standard_headers=true emits OpenRouter/OpenAI-compatible header names + -- so IDE plugins (Cursor/Continue) can recognize them and back off automatically + if plugin_conf.standard_headers then + local strategy = plugin_conf.limit_strategy or "total_tokens" + local suffix = strategy == "total_tokens" and "Tokens" + or strategy == "prompt_tokens" and "PromptTokens" + or "CompletionTokens" + limit_conf.limit_header = "X-RateLimit-Limit-" .. suffix + limit_conf.remaining_header = "X-RateLimit-Remaining-" .. suffix + limit_conf.reset_header = "X-RateLimit-Reset-" .. suffix + else + limit_conf.limit_header = "X-AI-RateLimit-Limit-" .. name + limit_conf.remaining_header = "X-AI-RateLimit-Remaining-" .. name + limit_conf.reset_header = "X-AI-RateLimit-Reset-" .. name + end + return limit_conf end diff --git a/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md b/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md new file mode 100644 index 000000000000..f42789b52436 --- /dev/null +++ b/docs/en/latest/plugins/ai-rate-limiting-standard-headers-patch.md @@ -0,0 +1,86 @@ +# ai-rate-limiting — `standard_headers` Parameter + +## Overview + +The `standard_headers` option makes `ai-rate-limiting` emit rate-limit response +headers that follow the [OpenRouter / OpenAI convention][openrouter-headers], +so IDE extensions such as **Cursor** and **Continue** can detect quota exhaustion +and apply automatic back-off without any custom configuration. 
+ +[openrouter-headers]: https://openrouter.ai/docs/api-reference/limits + +## New Parameter + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `standard_headers` | boolean | `false` | When `true`, emit OpenAI/OpenRouter-compatible rate-limit headers instead of the legacy `X-AI-RateLimit-*` headers. | + +The header suffix is derived from `limit_strategy`: + +| `limit_strategy` | Header suffix | +|---|---| +| `total_tokens` (default) | `Tokens` | +| `prompt_tokens` | `PromptTokens` | +| `completion_tokens` | `CompletionTokens` | + +## Configuration Example + +```yaml +routes: + - id: 1 + uri: /v1/chat/completions + plugins: + ai-proxy-multi: + instances: + - name: my-llm + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer ${{OPENAI_API_KEY}}" + options: + model: gpt-4o-mini + ai-rate-limiting: + instances: + - name: my-llm + limit: 100000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true # <-- enable standard headers + rejected_code: 429 +``` + +## Response Headers + +### Normal request (quota available) + +``` +HTTP/1.1 200 OK +X-RateLimit-Limit-Tokens: 100000 +X-RateLimit-Remaining-Tokens: 99985 +X-RateLimit-Reset-Tokens: 42 +``` + +### Rate-limited request (quota exhausted) + +``` +HTTP/1.1 429 Too Many Requests +X-RateLimit-Limit-Tokens: 100000 +X-RateLimit-Remaining-Tokens: 0 +X-RateLimit-Reset-Tokens: 18 +``` + +### With `limit_strategy: prompt_tokens` + +``` +HTTP/1.1 200 OK +X-RateLimit-Limit-PromptTokens: 50000 +X-RateLimit-Remaining-PromptTokens: 49990 +X-RateLimit-Reset-PromptTokens: 55 +``` + +## Backward Compatibility + +Setting `standard_headers: false` (or omitting it) preserves the original +`X-AI-RateLimit-Limit-{instance_name}` header format, so existing integrations +are unaffected. 
diff --git a/t/plugin/ai-rate-limiting-standard-headers.t b/t/plugin/ai-rate-limiting-standard-headers.t new file mode 100644 index 000000000000..8611e6b14e45 --- /dev/null +++ b/t/plugin/ai-rate-limiting-standard-headers.t @@ -0,0 +1,393 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $extra_yaml_config = <<_EOC_; +plugins: + - ai-proxy-multi + - ai-rate-limiting + - prometheus +_EOC_ + $block->set_value("extra_yaml_config", $extra_yaml_config); + + # Default mock LLM backend on port 6799 + if (!defined $block->http_config) { + my $http_config = <<_EOC_; + server { + server_name mock-llm; + listen 6799; + + default_type 'application/json'; + + location /v1/chat/completions { + content_by_lua_block { + ngx.status = 200 + ngx.say([[{ + "id": "chatcmpl-test", + "object": "chat.completion", + "choices": [{"index":0,"message":{"role":"assistant","content":"hi"},"finish_reason":"stop"}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15} + }]]) + } + } + } +_EOC_ + $block->set_value("http_config", $http_config); + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: schema check — standard_headers field is accepted +--- apisix_yaml +routes: + - id: 1 + uri: /t + plugins: + ai-rate-limiting: + instances: + - name: mock-instance + limit: 1000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 + upstream: + nodes: + "127.0.0.1:6799": 1 + type: roundrobin +#END +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-rate-limiting") + local ok, err = plugin.check_schema({ + instances = { + { name = "mock-instance", limit = 1000, time_window = 60 } + }, + limit_strategy = "total_tokens", + standard_headers = true, + rejected_code = 429, + }) + if not ok then + ngx.say("schema error: ", err) + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 2: schema check — standard_headers defaults to false +--- apisix_yaml +routes: + - id: 1 + uri: /t + plugins: + ai-rate-limiting: + 
instances: + - name: mock-instance + limit: 1000 + time_window: 60 + upstream: + nodes: + "127.0.0.1:6799": 1 + type: roundrobin +#END +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-rate-limiting") + local conf = { + instances = { + { name = "mock-instance", limit = 1000, time_window = 60 } + }, + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say("schema error: ", err) + return + end + -- default should be false + if conf.standard_headers == false then + ngx.say("default is false") + else + ngx.say("unexpected default: ", tostring(conf.standard_headers)) + end + } + } +--- response_body +default is false + + + +=== TEST 3: standard_headers=true returns X-RateLimit-Limit-Tokens header +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-Tokens: \d+ +X-RateLimit-Remaining-Tokens: \d+ +X-RateLimit-Reset-Tokens: \d+ + + + +=== TEST 4: standard_headers=true, 429 response has Remaining-Tokens: 0 +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: 
+ instances: + - name: mock-instance + limit: 1 + time_window: 60 + limit_strategy: total_tokens + standard_headers: true + rejected_code: 429 +#END +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local httpc = http.new() + + -- First request: should succeed and consume the 1-token budget + local res1, err = httpc:request_uri("http://127.0.0.1:" .. ngx.var.server_port .. "/anything", { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["apikey"] = "test-key-123", + }, + body = [[{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]}]], + }) + if not res1 then + ngx.say("req1 error: ", err) + return + end + + -- Second request: should be rate-limited (429) + local res2, err = httpc:request_uri("http://127.0.0.1:" .. ngx.var.server_port .. "/anything", { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["apikey"] = "test-key-123", + }, + body = [[{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi again"}]}]], + }) + if not res2 then + ngx.say("req2 error: ", err) + return + end + + ngx.say("status: ", res2.status) + local remaining = res2.headers["X-RateLimit-Remaining-Tokens"] + ngx.say("remaining: ", remaining or "nil") + } + } +--- response_body +status: 429 +remaining: 0 + + + +=== TEST 5: limit_strategy=prompt_tokens uses PromptTokens suffix +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: prompt_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- 
more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-PromptTokens: \d+ +X-RateLimit-Remaining-PromptTokens: \d+ +X-RateLimit-Reset-PromptTokens: \d+ + + + +=== TEST 6: limit_strategy=completion_tokens uses CompletionTokens suffix +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: completion_tokens + standard_headers: true + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-RateLimit-Limit-CompletionTokens: \d+ +X-RateLimit-Remaining-CompletionTokens: \d+ +X-RateLimit-Reset-CompletionTokens: \d+ + + + +=== TEST 7: standard_headers=false (default) outputs legacy X-AI-RateLimit headers +--- apisix_yaml +routes: + - id: 1 + uri: /anything + plugins: + ai-proxy-multi: + instances: + - name: mock-instance + provider: openai + weight: 1 + auth: + header: + Authorization: "Bearer test-key" + options: + model: gpt-4o-mini + override: + endpoint: "http://localhost:6799/v1/chat/completions" + ssl_verify: false + ai-rate-limiting: + instances: + - name: mock-instance + limit: 10000 + time_window: 60 + limit_strategy: total_tokens + standard_headers: false + rejected_code: 429 +#END +--- request +POST /anything +{"model":"gpt-4o-mini","messages":[{"role":"user","content":"hi"}]} +--- more_headers +Content-Type: application/json +apikey: test-key-123 +--- error_code: 200 +--- response_headers_like +X-AI-RateLimit-Limit-mock-instance: 
\d+ +X-AI-RateLimit-Remaining-mock-instance: \d+ +X-AI-RateLimit-Reset-mock-instance: \d+