Skip to content

Commit 4314e2a

Browse files
authored
profile constraints improvements (#2988)
profile constraints improvements:
- profile crawlerChannel default: use the crawlerChannel from the profile when launching a browser for that profile.
- profile proxyId constraint (backend part of #2982): ensure a workflow's proxyId is set to the profile's proxyId when creating or updating the workflow; also ensure the correct proxyId from the profile is used when starting crawls; disallow changing the proxy altogether if a profile is set and not changed.
1 parent 2725686 commit 4314e2a

File tree

3 files changed

+56
-60
lines changed

3 files changed

+56
-60
lines changed

backend/btrixcloud/crawlconfigs.py

Lines changed: 31 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -252,22 +252,25 @@ async def add_crawl_config(
252252
if self.is_single_page(config_in.config):
253253
config_in.browserWindows = 1
254254

255+
proxy_id = config_in.proxyId
256+
255257
profileid = None
256258
if isinstance(config_in.profileid, UUID):
257259
profileid = config_in.profileid
258260

259-
# ensure profile is valid, if provided
261+
# ensure profile is valid, get proxy_id from profile
260262
if profileid:
261-
await self.profiles.get_profile(profileid, org)
263+
profile = await self.profiles.get_profile(profileid, org)
264+
proxy_id = profile.proxyId
262265
else:
263266
if config_in.config and config_in.config.failOnContentCheck:
264267
raise HTTPException(
265268
status_code=400, detail="fail_on_content_check_requires_profile"
266269
)
267270

268-
# ensure proxyId is valid and available for org
269-
if config_in.proxyId:
270-
if not self.can_org_use_proxy(org, config_in.proxyId):
271+
# ensure proxy_id is valid and available for org
272+
if proxy_id:
273+
if not self.can_org_use_proxy(org, proxy_id):
271274
raise HTTPException(status_code=404, detail="proxy_not_found")
272275

273276
if config_in.config.exclude:
@@ -336,7 +339,7 @@ async def add_crawl_config(
336339
profileid=profileid,
337340
crawlerChannel=config_in.crawlerChannel,
338341
crawlFilenameTemplate=config_in.crawlFilenameTemplate,
339-
proxyId=config_in.proxyId,
342+
proxyId=proxy_id,
340343
firstSeed=first_seed,
341344
seedCount=seed_count,
342345
shareable=config_in.shareable,
@@ -620,6 +623,8 @@ async def update_crawl_config(
620623
last_rev = ConfigRevision(**orig_dict)
621624
last_rev = await self.config_revs.insert_one(last_rev.to_dict())
622625

626+
proxy_id = update.proxyId
627+
623628
# set update query
624629
query = update.dict(exclude_unset=True)
625630
query["modifiedBy"] = user.id
@@ -631,8 +636,15 @@ async def update_crawl_config(
631636
query["profileid"] = None
632637
# else, ensure its a valid profile
633638
elif update.profileid:
634-
await self.profiles.get_profile(cast(UUID, update.profileid), org)
639+
profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
635640
query["profileid"] = update.profileid
641+
proxy_id = profile.proxyId
642+
# don't change the proxy if profile is set, as it should match the profile proxy
643+
elif orig_crawl_config.profileid:
644+
proxy_id = None
645+
646+
if proxy_id is not None:
647+
query["proxyId"] = proxy_id
636648

637649
if update.config is not None:
638650
query["config"] = update.config.dict()
@@ -1200,20 +1212,21 @@ async def run_now_internal(
12001212
if await self.get_running_crawl(crawlconfig.id):
12011213
raise HTTPException(status_code=400, detail="crawl_already_running")
12021214

1203-
if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
1204-
raise HTTPException(status_code=404, detail="proxy_not_found")
1205-
12061215
await self.check_if_too_many_waiting_crawls(org)
12071216

1208-
profile_filename, profile_proxy_id = (
1209-
await self.profiles.get_profile_filename_and_proxy(
1210-
crawlconfig.profileid, org
1217+
if crawlconfig.profileid:
1218+
profile_filename, crawlconfig.proxyId, _ = (
1219+
await self.profiles.get_profile_filename_proxy_channel(
1220+
crawlconfig.profileid, org
1221+
)
12111222
)
1212-
)
1213-
if crawlconfig.profileid and not profile_filename:
1214-
raise HTTPException(status_code=400, detail="invalid_profile_id")
1223+
if not profile_filename:
1224+
raise HTTPException(status_code=400, detail="invalid_profile_id")
1225+
else:
1226+
profile_filename = ""
12151227

1216-
save_profile_id = self.get_save_profile_id(profile_proxy_id, crawlconfig)
1228+
if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
1229+
raise HTTPException(status_code=404, detail="proxy_not_found")
12171230

12181231
storage_filename = (
12191232
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1244,7 +1257,7 @@ async def run_now_internal(
12441257
warc_prefix=self.get_warc_prefix(org, crawlconfig),
12451258
storage_filename=storage_filename,
12461259
profile_filename=profile_filename or "",
1247-
profileid=save_profile_id,
1260+
profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
12481261
is_single_page=self.is_single_page(crawlconfig.config),
12491262
seed_file_url=seed_file_url,
12501263
)
@@ -1256,25 +1269,6 @@ async def run_now_internal(
12561269
print(traceback.format_exc())
12571270
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
12581271

1259-
def get_save_profile_id(
1260-
self, profile_proxy_id: str, crawlconfig: CrawlConfig
1261-
) -> str:
1262-
"""return profile id if profile should be auto-saved, or empty str if not"""
1263-
# if no profile, nothing to save
1264-
if not crawlconfig.profileid:
1265-
return ""
1266-
1267-
# if no proxies, allow saving
1268-
if not crawlconfig.proxyId and not profile_proxy_id:
1269-
return str(crawlconfig.profileid)
1270-
1271-
# if proxy ids match, allow saving
1272-
if crawlconfig.proxyId == profile_proxy_id:
1273-
return str(crawlconfig.profileid)
1274-
1275-
# otherwise, don't save
1276-
return ""
1277-
12781272
async def check_if_too_many_waiting_crawls(self, org: Organization):
12791273
"""if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
12801274
return 429 if at limit"""

backend/btrixcloud/operator/cronjobs.py

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -125,18 +125,17 @@ async def make_new_crawljob(
125125
)
126126
print("Scheduled Crawl Created: " + crawl_id)
127127

128-
profile_filename, profile_proxy_id = (
129-
await self.crawl_config_ops.profiles.get_profile_filename_and_proxy(
130-
crawlconfig.profileid, org
128+
if crawlconfig.profileid:
129+
profile_filename, crawlconfig.proxyId, _ = (
130+
await self.crawl_config_ops.profiles.get_profile_filename_proxy_channel(
131+
crawlconfig.profileid, org
132+
)
131133
)
132-
)
133-
if crawlconfig.profileid and not profile_filename:
134-
print(f"error: missing profile {crawlconfig.profileid}")
135-
return self.get_finished_response(metadata)
136-
137-
save_profile_id = self.crawl_config_ops.get_save_profile_id(
138-
profile_proxy_id, crawlconfig
139-
)
134+
if not profile_filename:
135+
print(f"error: missing profile {crawlconfig.profileid}")
136+
return self.get_finished_response(metadata)
137+
else:
138+
profile_filename = ""
140139

141140
crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
142141
cid=str(cid),
@@ -153,7 +152,7 @@ async def make_new_crawljob(
153152
warc_prefix=warc_prefix,
154153
storage_filename=self.crawl_config_ops.default_filename_template,
155154
profile_filename=profile_filename or "",
156-
profileid=save_profile_id,
155+
profileid=str(crawlconfig.profileid) if crawlconfig else "",
157156
proxy_id=crawlconfig.proxyId or "",
158157
is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config),
159158
)

backend/btrixcloud/profiles.py

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -112,24 +112,27 @@ async def create_new_browser(
112112
prev_profile_path = ""
113113
prev_profile_id = ""
114114
prev_proxy_id = ""
115+
prev_channel = ""
115116
if profile_launch.profileId:
116-
prev_profile_path, prev_proxy_id = (
117-
await self.get_profile_filename_and_proxy(profile_launch.profileId, org)
117+
prev_profile_path, prev_proxy_id, prev_channel = (
118+
await self.get_profile_filename_proxy_channel(
119+
profile_launch.profileId, org
120+
)
118121
)
119122

120123
if not prev_profile_path:
121124
raise HTTPException(status_code=400, detail="invalid_base_profile")
122125

123126
prev_profile_id = str(profile_launch.profileId)
124127

125-
crawler_image = self.crawlconfigs.get_channel_crawler_image(
126-
profile_launch.crawlerChannel
127-
)
128+
crawler_channel = profile_launch.crawlerChannel or prev_channel
129+
130+
crawler_image = self.crawlconfigs.get_channel_crawler_image(crawler_channel)
128131
if not crawler_image:
129132
raise HTTPException(status_code=404, detail="crawler_not_found")
130133

131134
image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
132-
profile_launch.crawlerChannel
135+
crawler_channel
133136
)
134137

135138
# use either specified proxyId or if none, use proxyId from existing profile
@@ -515,23 +518,23 @@ async def get_profile(self, profileid: UUID, org: Organization) -> Profile:
515518
profile.inUse = await self.crawlconfigs.is_profile_in_use(profileid, org)
516519
return profile
517520

518-
async def get_profile_filename_and_proxy(
521+
async def get_profile_filename_proxy_channel(
519522
self, profileid: Optional[UUID], org: Organization
520-
) -> tuple[str, str]:
523+
) -> tuple[str, str, str]:
521524
"""return profile path filename (relative path) for given profile id and org"""
522525
if not profileid:
523-
return "", ""
526+
return "", "", ""
524527

525528
try:
526529
profile = await self.get_profile(profileid, org)
527530
storage_path = profile.resource.filename if profile.resource else ""
528531
storage_path = storage_path.lstrip(f"{org.id}/")
529-
return storage_path, profile.proxyId or ""
532+
return storage_path, profile.proxyId or "", profile.crawlerChannel or ""
530533
# pylint: disable=bare-except
531534
except:
532535
pass
533536

534-
return "", ""
537+
return "", "", ""
535538

536539
async def get_profile_name(self, profileid: UUID, org: Organization) -> str:
537540
"""return profile for given profile id and org"""

0 commit comments

Comments
 (0)