From f9af27a4f6a1fa9c7dd81c922ba086d04eaf7fb9 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 27 Oct 2025 21:56:51 -0700
Subject: [PATCH 1/5] add optional concurrent max queue rate limit:

- set 'max_concur_queue_to_limit_scale' to determine max size of concurrent
  crawls that can be queued
- if set, queueing new crawls above the limit will be rejected with a 429
- default to disabled

tests: add max queue limit to concurrent crawl tests
---
 backend/btrixcloud/crawlconfigs.py   | 21 +++++++++++++++++++
 .../test_concurrent_crawl_limit.py   |  5 +++++
 chart/templates/configmap.yaml       |  2 ++
 chart/test/test-nightly-addons.yaml  |  3 +++
 chart/values.yaml                    |  6 ++++++
 5 files changed, 37 insertions(+)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index e1c9d60b0f..34a671d20e 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -148,6 +148,10 @@ def __init__(
             minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
         )

+        self.max_concur_queue_to_limit_scale = int(
+            os.environ.get("MAX_CONCUR_QUEUE_TO_LIMIT_SCALE", 0)
+        )
+
         self.router = APIRouter(
             prefix="/crawlconfigs",
             tags=["crawlconfigs"],
@@ -1179,6 +1183,8 @@ async def run_now_internal(
         if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
             raise HTTPException(status_code=404, detail="proxy_not_found")

+        await self.too_many_waiting_crawls_check(org)
+
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1220,6 +1226,21 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

+    async def too_many_waiting_crawls_check(self, org: Organization):
+        """if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
+        return 429 if at limit"""
+        max_concur = org.quotas.maxConcurrentCrawls
+        if not max_concur:
+            return False
+
+        num_waiting = await self.crawls.count_documents(
+            {"oid": org.id, "state": "waiting_org_limit"}
+        )
+        if num_waiting <= max_concur * self.max_concur_queue_to_limit_scale:
+            return False
+
+        raise HTTPException(status_code=429, detail="slow_down_too_many_crawls_queued")
+
     async def set_config_current_crawl_info(
         self, cid: UUID, crawl_id: str, crawl_start: datetime, user: User
     ):
diff --git a/backend/test_nightly/test_concurrent_crawl_limit.py b/backend/test_nightly/test_concurrent_crawl_limit.py
index 141ec97948..a48ab7bdaf 100644
--- a/backend/test_nightly/test_concurrent_crawl_limit.py
+++ b/backend/test_nightly/test_concurrent_crawl_limit.py
@@ -1,5 +1,6 @@
 import requests
 import time
+import pytest

 from .conftest import API_PREFIX
 from .utils import get_crawl_status
@@ -49,6 +50,9 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
         == "waiting_org_limit"
     )

+    # additional waiting crawls not allowed due to limit
+    with pytest.raises(requests.exceptions.HTTPError):
+        run_crawl(org_with_quotas, admin_auth_headers)

 def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
     r = requests.post(
@@ -101,6 +105,7 @@ def run_crawl(org_id, headers):
         headers=headers,
         json=crawl_data,
     )
+    r.raise_for_status()
     data = r.json()

     return data["run_now_job"]
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index f87c5643f6..7faa05c8fd 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -40,6 +40,8 @@ data:

   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

+  MAX_CONCUR_QUEUE_TO_LIMIT_SCALE: "{{ .Values.max_concur_queue_to_limit_scale | default 0 }}"
+
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

   RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
diff --git a/chart/test/test-nightly-addons.yaml b/chart/test/test-nightly-addons.yaml
index 02de7ff1f6..7ea9fcd640 100644
--- a/chart/test/test-nightly-addons.yaml
+++ b/chart/test/test-nightly-addons.yaml
@@ -10,6 +10,9 @@ cleanup_job_cron_schedule: "* * * * *"
 # Clean up files > 1 minute old in testing
 cleanup_files_after_minutes: 1

+# max concurrent crawls
+max_concur_queue_to_limit_scale: 1
+
 # enable to allow access to minio directly
 minio_local_access_port: 30090
diff --git a/chart/values.yaml b/chart/values.yaml
index 0eb3a7cf68..2abf2c72dc 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -33,6 +33,12 @@ crawler_extract_full_text: to-warc
 # if set, each workflow can have a lower limit, but not higher
 max_pages_per_crawl: 50000

+
+# max concurrent queue to size scale
+# if 0, no limit to concurrent crawl queue
+# if >0, scale to get max queue from concurrent crawl limit
+# max_concur_queue_to_limit_scale: 0
+
 # default template for generate wacz files
 # supports following interpolated vars:
 # @ts - current timestamp

From 0a450005a40a3da2b7843940188df4cd2d27a2cf Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 28 Oct 2025 12:56:13 -0700
Subject: [PATCH 2/5] adjust test

---
 backend/btrixcloud/crawlconfigs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 34a671d20e..74f6c9ece2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -1236,7 +1236,7 @@ async def too_many_waiting_crawls_check(self, org: Organization):
         num_waiting = await self.crawls.count_documents(
             {"oid": org.id, "state": "waiting_org_limit"}
         )
-        if num_waiting <= max_concur * self.max_concur_queue_to_limit_scale:
+        if num_waiting < max_concur * self.max_concur_queue_to_limit_scale:
             return False

         raise HTTPException(status_code=429, detail="slow_down_too_many_crawls_queued")
From fc9cbb7e4ce2d18376197cd723426d677c7d2803 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 28 Oct 2025 14:06:05 -0700
Subject: [PATCH 3/5] fix tests?

---
 .../test_concurrent_crawl_limit.py | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/backend/test_nightly/test_concurrent_crawl_limit.py b/backend/test_nightly/test_concurrent_crawl_limit.py
index a48ab7bdaf..109922b774 100644
--- a/backend/test_nightly/test_concurrent_crawl_limit.py
+++ b/backend/test_nightly/test_concurrent_crawl_limit.py
@@ -1,6 +1,5 @@
 import requests
 import time
-import pytest

 from .conftest import API_PREFIX
 from .utils import get_crawl_status
@@ -21,11 +20,11 @@ def test_set_concurrent_crawl_limit(org_with_quotas, admin_auth_headers):

 def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
     global crawl_id_a
-    crawl_id_a = run_crawl(org_with_quotas, admin_auth_headers)
+    crawl_id_a, _ = run_crawl(org_with_quotas, admin_auth_headers)
     time.sleep(1)

     global crawl_id_b
-    crawl_id_b = run_crawl(org_with_quotas, admin_auth_headers)
+    crawl_id_b, _ = run_crawl(org_with_quotas, admin_auth_headers)

     while get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in (
         "starting",
@@ -50,9 +49,18 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
         == "waiting_org_limit"
     )

-    # additional waiting crawls not allowed due to limit
-    with pytest.raises(requests.exceptions.HTTPError):
-        run_crawl(org_with_quotas, admin_auth_headers)
+    crawl_id, res = run_crawl(org_with_quotas, admin_auth_headers)
+    assert not crawl_id
+    assert res["errorDetail"] == "slow_down_too_many_crawls_queued"
+
+    config_id = res["id"]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{org_with_quotas}/crawlconfigs/{config_id}/run",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 429
+

 def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
     r = requests.post(
@@ -107,5 +115,4 @@ def run_crawl(org_id, headers):
     )
     r.raise_for_status()
     data = r.json()
-
-    return data["run_now_job"]
+    return data["run_now_job"], data
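The reworked test drives the limit through both client-visible paths: creating a new workflow with runNow while the queue is full still succeeds at the HTTP level but comes back with an empty `run_now_job` and `slow_down_too_many_crawls_queued` in `errorDetail`, whereas POSTing to an existing workflow's `/run` endpoint is refused outright with HTTP 429. A client that wants to start the crawl regardless can back off and retry. The sketch below is illustrative only: the endpoint path and status code come from the patches, while the base URL, credentials, helper name, and retry policy are placeholders.

```python
import time

import requests

API_PREFIX = "https://btrix.example.org/api"        # placeholder base URL
AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder credentials


def run_workflow_with_backoff(org_id: str, config_id: str, max_attempts: int = 5):
    """Start an existing workflow, backing off while the org's crawl queue is full."""
    for attempt in range(max_attempts):
        r = requests.post(
            f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{config_id}/run",
            headers=AUTH_HEADERS,
        )
        if r.status_code == 429:
            # Rejected because the org already has maxConcurrentCrawls * scale
            # crawls sitting in the "waiting_org_limit" state.
            time.sleep(2**attempt)  # exponential backoff; the policy is an assumption
            continue
        r.raise_for_status()
        return r.json()  # response body shape is not spelled out in this series
    raise RuntimeError("crawl queue stayed full; giving up")
```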
From 7f653fe8bfd4f13005580bb14e4d786ec76a1b7f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 29 Oct 2025 19:31:20 -0700
Subject: [PATCH 4/5] fix from review, rename param name, cleanup

---
 backend/btrixcloud/crawlconfigs.py | 14 ++++++--------
 chart/templates/configmap.yaml     |  2 +-
 chart/values.yaml                  |  3 +--
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 74f6c9ece2..a1d79b8db1 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -148,9 +148,7 @@ def __init__(
             minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
         )

-        self.max_concur_queue_to_limit_scale = int(
-            os.environ.get("MAX_CONCUR_QUEUE_TO_LIMIT_SCALE", 0)
-        )
+        self.crawl_queue_limit_scale = int(os.environ.get("CRAWL_QUEUE_LIMIT_SCALE", 0))

         self.router = APIRouter(
             prefix="/crawlconfigs",
@@ -1183,7 +1181,7 @@ async def run_now_internal(
         if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
             raise HTTPException(status_code=404, detail="proxy_not_found")

-        await self.too_many_waiting_crawls_check(org)
+        await self.check_if_too_many_waiting_crawls(org)

         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1226,18 +1224,18 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

-    async def too_many_waiting_crawls_check(self, org: Organization):
+    async def check_if_too_many_waiting_crawls(self, org: Organization):
         """if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
         return 429 if at limit"""
         max_concur = org.quotas.maxConcurrentCrawls
         if not max_concur:
-            return False
+            return

         num_waiting = await self.crawls.count_documents(
             {"oid": org.id, "state": "waiting_org_limit"}
         )
-        if num_waiting < max_concur * self.max_concur_queue_to_limit_scale:
-            return False
+        if num_waiting < max_concur * self.crawl_queue_limit_scale:
+            return

         raise HTTPException(status_code=429, detail="slow_down_too_many_crawls_queued")

diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 7faa05c8fd..3ec5c7f0bd 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -40,7 +40,7 @@ data:

   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

-  MAX_CONCUR_QUEUE_TO_LIMIT_SCALE: "{{ .Values.max_concur_queue_to_limit_scale | default 0 }}"
+  CRAWL_QUEUE_LIMIT_SCALE: "{{ .Values.crawl_queue_limit_scale | default 0 }}"

   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

diff --git a/chart/values.yaml b/chart/values.yaml
index 2abf2c72dc..355b5cef3c 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -33,11 +33,10 @@ crawler_extract_full_text: to-warc
 # if set, each workflow can have a lower limit, but not higher
 max_pages_per_crawl: 50000

-
 # max concurrent queue to size scale
 # if 0, no limit to concurrent crawl queue
 # if >0, scale to get max queue from concurrent crawl limit
-# max_concur_queue_to_limit_scale: 0
+# crawl_queue_limit_scale: 0

 # default template for generate wacz files
 # supports following interpolated vars:
 # @ts - current timestamp

From 99f6a29f68a08d95ed9ca1b44662ddf04dd2baa0 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 29 Oct 2025 19:33:47 -0700
Subject: [PATCH 5/5] Update chart/values.yaml

Co-authored-by: Tessa Walsh
---
 chart/values.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/chart/values.yaml b/chart/values.yaml
index 355b5cef3c..60380ce0fa 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -33,7 +33,8 @@ crawler_extract_full_text: to-warc
 # if set, each workflow can have a lower limit, but not higher
 max_pages_per_crawl: 50000

-# max concurrent queue to size scale
+# set limit for how many crawls can be queued at once,
+# calculated as scale multiplier applied to concurrent crawl limit
 # if 0, no limit to concurrent crawl queue
 # if >0, scale to get max queue from concurrent crawl limit
 # crawl_queue_limit_scale: 0
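With the full series applied, the operator-facing knob is `crawl_queue_limit_scale` in chart/values.yaml, surfaced to the backend through the `CRAWL_QUEUE_LIMIT_SCALE` configmap entry; per the values.yaml comment it defaults to 0 (no queue limit), and a positive value caps the number of queued crawls at that multiple of the org's concurrent crawl limit. The sketch below summarizes the wiring and the resulting arithmetic; the helper name and example numbers are illustrative, not part of the backend API.

```python
import os

# Wiring after PATCH 4/5:
#   chart/values.yaml:  crawl_queue_limit_scale
#   -> configmap.yaml:  CRAWL_QUEUE_LIMIT_SCALE (rendered with `default 0`)
#   -> crawlconfigs.py: self.crawl_queue_limit_scale, read once at startup
crawl_queue_limit_scale = int(os.environ.get("CRAWL_QUEUE_LIMIT_SCALE", 0))


def waiting_queue_cap(max_concurrent_crawls: int) -> int:
    """Sketch: crawls allowed in "waiting_org_limit" before further run requests get 429.

    check_if_too_many_waiting_crawls() rejects a run once the count of waiting
    crawls is no longer strictly below this product (the `<` comparison that
    PATCH 2/5 introduced), and skips the check entirely when the org has no
    maxConcurrentCrawls quota.
    """
    return max_concurrent_crawls * crawl_queue_limit_scale


# Example: an org limited to 2 concurrent crawls, with crawl_queue_limit_scale=3,
# can have up to 6 crawls waiting behind the 2 that run; once 6 are waiting, the
# next run request is answered with HTTP 429 (slow_down_too_many_crawls_queued).
```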