@@ -252,22 +252,25 @@ async def add_crawl_config(
         if self.is_single_page(config_in.config):
             config_in.browserWindows = 1
 
+        proxy_id = config_in.proxyId
+
         profileid = None
         if isinstance(config_in.profileid, UUID):
            profileid = config_in.profileid
 
-        # ensure profile is valid, if provided
+        # ensure profile is valid, get proxy_id from profile
         if profileid:
-            await self.profiles.get_profile(profileid, org)
+            profile = await self.profiles.get_profile(profileid, org)
+            proxy_id = profile.proxyId
         else:
             if config_in.config and config_in.config.failOnContentCheck:
                 raise HTTPException(
                     status_code=400, detail="fail_on_content_check_requires_profile"
                 )
 
-        # ensure proxyId is valid and available for org
-        if config_in.proxyId:
-            if not self.can_org_use_proxy(org, config_in.proxyId):
+        # ensure proxy_id is valid and available for org
+        if proxy_id:
+            if not self.can_org_use_proxy(org, proxy_id):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
         if config_in.config.exclude:
@@ -336,7 +339,7 @@ async def add_crawl_config(
             profileid=profileid,
             crawlerChannel=config_in.crawlerChannel,
             crawlFilenameTemplate=config_in.crawlFilenameTemplate,
-            proxyId=config_in.proxyId,
+            proxyId=proxy_id,
             firstSeed=first_seed,
             seedCount=seed_count,
             shareable=config_in.shareable,
@@ -620,6 +623,8 @@ async def update_crawl_config(
         last_rev = ConfigRevision(**orig_dict)
         last_rev = await self.config_revs.insert_one(last_rev.to_dict())
 
+        proxy_id = update.proxyId
+
         # set update query
         query = update.dict(exclude_unset=True)
         query["modifiedBy"] = user.id
@@ -631,8 +636,15 @@ async def update_crawl_config(
             query["profileid"] = None
         # else, ensure its a valid profile
         elif update.profileid:
-            await self.profiles.get_profile(cast(UUID, update.profileid), org)
+            profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
             query["profileid"] = update.profileid
+            proxy_id = profile.proxyId
+        # don't change the proxy if profile is set, as it should match the profile proxy
+        elif orig_crawl_config.profileid:
+            proxy_id = None
+
+        if proxy_id is not None:
+            query["proxyId"] = proxy_id
 
         if update.config is not None:
             query["config"] = update.config.dict()
@@ -1200,20 +1212,21 @@ async def run_now_internal(
         if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
-        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
-            raise HTTPException(status_code=404, detail="proxy_not_found")
-
         await self.check_if_too_many_waiting_crawls(org)
 
-        profile_filename, profile_proxy_id = (
-            await self.profiles.get_profile_filename_and_proxy(
-                crawlconfig.profileid, org
+        if crawlconfig.profileid:
+            profile_filename, crawlconfig.proxyId, _ = (
+                await self.profiles.get_profile_filename_proxy_channel(
+                    crawlconfig.profileid, org
+                )
             )
-        )
-        if crawlconfig.profileid and not profile_filename:
-            raise HTTPException(status_code=400, detail="invalid_profile_id")
+            if not profile_filename:
+                raise HTTPException(status_code=400, detail="invalid_profile_id")
+        else:
+            profile_filename = ""
 
-        save_profile_id = self.get_save_profile_id(profile_proxy_id, crawlconfig)
+        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
+            raise HTTPException(status_code=404, detail="proxy_not_found")
 
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1244,7 +1257,7 @@ async def run_now_internal(
             warc_prefix=self.get_warc_prefix(org, crawlconfig),
             storage_filename=storage_filename,
             profile_filename=profile_filename or "",
-            profileid=save_profile_id,
+            profileid=str(crawlconfig.profileid) if crawlconfig.profileid else "",
             is_single_page=self.is_single_page(crawlconfig.config),
             seed_file_url=seed_file_url,
         )
@@ -1256,25 +1269,6 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
-    def get_save_profile_id(
-        self, profile_proxy_id: str, crawlconfig: CrawlConfig
-    ) -> str:
-        """return profile id if profile should be auto-saved, or empty str if not"""
-        # if no profile, nothing to save
-        if not crawlconfig.profileid:
-            return ""
-
-        # if no proxies, allow saving
-        if not crawlconfig.proxyId and not profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # if proxy ids match, allow saving
-        if crawlconfig.proxyId == profile_proxy_id:
-            return str(crawlconfig.profileid)
-
-        # otherwise, don't save
-        return ""
-
     async def check_if_too_many_waiting_crawls(self, org: Organization):
         """if max concurrent crawls are set, limit number of queued crawls to X concurrent limit
         return 429 if at limit"""
0 commit comments