19 changes: 19 additions & 0 deletions backend/btrixcloud/crawlconfigs.py
@@ -148,6 +148,8 @@ def __init__(
minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
)

self.crawl_queue_limit_scale = int(os.environ.get("CRAWL_QUEUE_LIMIT_SCALE", "0"))

self.router = APIRouter(
prefix="/crawlconfigs",
tags=["crawlconfigs"],
@@ -1179,6 +1181,8 @@ async def run_now_internal(
if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")

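# reject the run request early with 429 if too many crawls are already queued for this org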
await self.check_if_too_many_waiting_crawls(org)

profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1220,6 +1224,21 @@ async def run_now_internal(
print(traceback.format_exc())
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

async def check_if_too_many_waiting_crawls(self, org: Organization):
"""If a max concurrent crawl limit is set, cap the number of queued crawls at
the concurrent limit times the queue limit scale; raise 429 if the cap is reached."""
if not self.crawl_queue_limit_scale:
return

max_concur = org.quotas.maxConcurrentCrawls
if not max_concur:
return

num_waiting = await self.crawls.count_documents(
{"oid": org.id, "state": "waiting_org_limit"}
)
if num_waiting < max_concur * self.crawl_queue_limit_scale:
return

raise HTTPException(status_code=429, detail="slow_down_too_many_crawls_queued")
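# Illustrative example (hypothetical values): with org.quotas.maxConcurrentCrawls = 2
# and CRAWL_QUEUE_LIMIT_SCALE = 3, up to 2 * 3 = 6 crawls may sit in the
# "waiting_org_limit" state; the next run-now request is rejected with HTTP 429.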

async def set_config_current_crawl_info(
self, cid: UUID, crawl_id: str, crawl_start: datetime, user: User
):
20 changes: 16 additions & 4 deletions backend/test_nightly/test_concurrent_crawl_limit.py
@@ -20,11 +20,11 @@ def test_set_concurrent_crawl_limit(org_with_quotas, admin_auth_headers):

def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
global crawl_id_a
crawl_id_a = run_crawl(org_with_quotas, admin_auth_headers)
crawl_id_a, _ = run_crawl(org_with_quotas, admin_auth_headers)
time.sleep(1)

global crawl_id_b
crawl_id_b = run_crawl(org_with_quotas, admin_auth_headers)
crawl_id_b, _ = run_crawl(org_with_quotas, admin_auth_headers)

while get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in (
"starting",
@@ -49,6 +49,18 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
== "waiting_org_limit"
)

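# with the queue already at its limit, creating another workflow with runNow
# should not start a crawl and should report the queue error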
crawl_id, res = run_crawl(org_with_quotas, admin_auth_headers)
assert not crawl_id
assert res["errorDetail"] == "slow_down_too_many_crawls_queued"

config_id = res["id"]

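# running the just-created workflow directly should likewise be rejected with 429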
r = requests.post(
f"{API_PREFIX}/orgs/{org_with_quotas}/crawlconfigs/{config_id}/run",
headers=admin_auth_headers,
)
assert r.status_code == 429


def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
r = requests.post(
@@ -101,6 +113,6 @@ def run_crawl(org_id, headers):
headers=headers,
json=crawl_data,
)
r.raise_for_status()
data = r.json()

return data["run_now_job"]
return data["run_now_job"], data
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
@@ -40,6 +40,8 @@ data:

MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

CRAWL_QUEUE_LIMIT_SCALE: "{{ .Values.crawl_queue_limit_scale | default 0 }}"

IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
3 changes: 3 additions & 0 deletions chart/test/test-nightly-addons.yaml
@@ -10,6 +10,9 @@ cleanup_job_cron_schedule: "* * * * *"
# Clean up files > 1 minute old in testing
cleanup_files_after_minutes: 1

# scale for max queued crawls relative to the concurrent crawl limit
crawl_queue_limit_scale: 1

# enable to allow access to minio directly
minio_local_access_port: 30090

6 changes: 6 additions & 0 deletions chart/values.yaml
@@ -33,6 +33,12 @@ crawler_extract_full_text: to-warc
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000

# limit on how many crawls can be queued at once, calculated as a
# scale multiplier applied to the org's concurrent crawl limit
# if 0, the crawl queue is not limited
# if >0, max queued crawls = concurrent crawl limit * this scale
# crawl_queue_limit_scale: 0
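# example (illustrative values): with crawl_queue_limit_scale: 2 and an org
# concurrent crawl quota of 3, up to 3 * 2 = 6 crawls may wait in the queue;
# further attempts to run a crawl return HTTP 429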

# default template for generated wacz files
# supports following interpolated vars:
# @ts - current timestamp