From a40ecb4cfea54aaf4955c6f28596aa9659f76120 Mon Sep 17 00:00:00 2001 From: llavezzo Date: Mon, 31 Jul 2023 20:39:37 +0200 Subject: [PATCH 1/4] handle edge case of overloaded console --- utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 28184ac3..5280493f 100755 --- a/utils.py +++ b/utils.py @@ -482,7 +482,21 @@ def check(self): alarm = "Timeout in checking the sanity of components %d > %d , while checking on %s" % ( now - check_start, self.check_timeout, self.checks.checking) sendLog('componentInfo', alarm, level='critical') - return False + + # handle the edge case in which e.g. the console is reachable, but extremely slow + # at responding, AND it is considered 'soft', i.e. not necessary + try: + self.status = self.checks.status + for comp in self.status.keys(): + # if any necessary component is down, fail out + if not self.status[comp] and comp not in soft: + return False + # handle weird failures + except: + return False + + return True + print("componentInfo, ping", now, check_start, now - check_start) time.sleep(ping) From 38aa5b04211969663eaad8a3cc5b094a8b3ddf99 Mon Sep 17 00:00:00 2001 From: llavezzo Date: Tue, 1 Aug 2023 21:02:29 +0200 Subject: [PATCH 2/4] componentInfo soft check is now ignore --- Unified/actor.py | 2 +- Unified/addHoc.py | 4 ++-- Unified/checkor.py | 7 ++++--- Unified/closor.py | 6 +++--- Unified/completor.py | 5 +++-- Unified/equalizor.py | 2 +- Unified/htmlor.py | 2 +- Unified/injector.py | 6 ++++-- Unified/invalidator.py | 2 +- Unified/mappor.py | 2 +- Unified/recoveror.py | 4 ++-- Unified/rejector.py | 2 +- utils.py | 34 ++++++++++++---------------------- 13 files changed, 36 insertions(+), 42 deletions(-) diff --git a/Unified/actor.py b/Unified/actor.py index b0b71d64..ea1f5803 100755 --- a/Unified/actor.py +++ b/Unified/actor.py @@ -359,7 +359,7 @@ def actor(url,options=None): if mlock(): return if userLock('actor'): return - up = componentInfo(soft=['mcm']) + up = componentInfo(ignore=['mcm']) if not up.check(): return # CI = campaignInfo() diff --git a/Unified/addHoc.py b/Unified/addHoc.py index 8f3c5140..a44bf28c 100755 --- a/Unified/addHoc.py +++ b/Unified/addHoc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +!/usr/bin/env python from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, siteInfo, sendLog, getWorkflowById, agentInfo, unifiedConfiguration, monitor_eos_dir, base_eos_dir, batchInfo, reportInfo from assignSession import * @@ -10,7 +10,7 @@ import random from JIRAClient import JIRAClient -up = componentInfo(soft=['mcm','wtc','jira']) +up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): sys.exit(0) JC = JIRAClient() if up.status.get('jira',False) else None diff --git a/Unified/checkor.py b/Unified/checkor.py index 5a6fb533..43e82275 100755 --- a/Unified/checkor.py +++ b/Unified/checkor.py @@ -84,10 +84,11 @@ def checkor(url, spec=None, options=None): UC = unifiedConfiguration() use_mcm = True - up = componentInfo(soft=['mcm', 'wtc']) + up = componentInfo(ignore=['mcm', 'wtc']) if not up.check(): return - use_mcm = up.status['mcm'] + up_mcm = componentInfo(ignore=['wtc']) + use_mcm = up_mcm.status['mcm'] now_s = time.mktime(time.gmtime()) @@ -192,7 +193,7 @@ def time_point(label="", sub_lap=False, percent=None, is_end=False): user = c.author.name prepid = jira.fields.summary.split()[0] keyword = \ - c.body[(c.body.find(force_complete_jira_string) + len(force_complete_jira_string)):].split()[0] + c.body[(c.body.find(force_complete_jira_string) + len(force_complete_jira_string)):].split()[0] if keyword and user in actors: print(user, "is force-completing", keyword, "from JIRA") bypasses.append(keyword) diff --git a/Unified/closor.py b/Unified/closor.py index 9cdaa74b..d8acf845 100755 --- a/Unified/closor.py +++ b/Unified/closor.py @@ -124,7 +124,7 @@ def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock() and not options.manual: return - up = componentInfo(soft=['mcm','wtc']) + up = componentInfo(ignore=['mcm','wtc']) if not up.check(): return @@ -511,7 +511,7 @@ def close(self): wfi.sendLog('closor',"Delayed announcement of %s due to unresolved Parentage dependencies" % wfi.request['RequestName']) results.append('No ParentageResolved') - if all([result in ['None',None,True] for result in results]): + if all([result in ['None',None,True,'No ParentageResolved'] for result in results]): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) @@ -527,7 +527,7 @@ def close(self): results.append( res ) print(results) - if all([result in ['None',None,True, 'No ParentageResolved'] for result in results]): + if all([result in ['None',None,True,'No ParentageResolved'] for result in results]): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace('announce','announced') diff --git a/Unified/completor.py b/Unified/completor.py index dcf30541..e6913241 100755 --- a/Unified/completor.py +++ b/Unified/completor.py @@ -18,9 +18,10 @@ def completor(url, specific): use_mcm = True - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): return - use_mcm = up.status['mcm'] + up_mcm = componentInfo(ignore=['wtc','jira']) + use_mcm = up_mcm.status['mcm'] if use_mcm: mcm = McMClient(dev=False) diff --git a/Unified/equalizor.py b/Unified/equalizor.py index 085d4c8b..35c559ed 100755 --- a/Unified/equalizor.py +++ b/Unified/equalizor.py @@ -16,7 +16,7 @@ def equalizor(url , specific = None, options=None): - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) if not specific: if not up.check(): return # Only check component when running cron job with everything diff --git a/Unified/htmlor.py b/Unified/htmlor.py index 47b23b61..c75656ab 100755 --- a/Unified/htmlor.py +++ b/Unified/htmlor.py @@ -14,7 +14,7 @@ def htmlor( caller = ""): mlock = moduleLock(silent=True) if mlock(): return - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): return #for backup in ['statuses.json','siteInfo.json','equalizor.json']: diff --git a/Unified/injector.py b/Unified/injector.py index e99ec2ff..31cb1722 100755 --- a/Unified/injector.py +++ b/Unified/injector.py @@ -16,9 +16,11 @@ def injector(url, options, specific): if mlock() and not options.manual: return use_mcm = True - up = componentInfo(soft=['mcm','wtc','jira'] ) + up = componentInfo(ignore=['mcm','wtc','jira'] ) if not up.check(): return - use_mcm = up.status['mcm'] + + up_mcm = componentInfo(ignore=['wtc','jira'] ) + use_mcm = up_mcm.status['mcm'] UC = unifiedConfiguration() diff --git a/Unified/invalidator.py b/Unified/invalidator.py index 9f0db73d..6cc9b580 100755 --- a/Unified/invalidator.py +++ b/Unified/invalidator.py @@ -9,7 +9,7 @@ def invalidator(url, invalid_status='INVALID'): use_mcm = True - up = componentInfo(soft=['wtc','jira']) + up = componentInfo(ignore=['wtc','jira']) if not up.check(): return mcm = McMClient(dev=False) diff --git a/Unified/mappor.py b/Unified/mappor.py index c1157448..70532a3d 100644 --- a/Unified/mappor.py +++ b/Unified/mappor.py @@ -16,7 +16,7 @@ def mappor(url , options=None): - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) diff --git a/Unified/recoveror.py b/Unified/recoveror.py index 9714ec81..4e8d96ce 100755 --- a/Unified/recoveror.py +++ b/Unified/recoveror.py @@ -181,7 +181,7 @@ def singleRecovery(url, task , initial, actions, do=False): def new_recoveror(url, specific, options=None): if userLock('recoveror'): return - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): return CI = campaignInfo() @@ -256,7 +256,7 @@ def new_recoveror(url, specific, options=None): def recoveror(url,specific,options=None): if userLock('recoveror'): return - up = componentInfo(soft=['mcm','wtc','jira']) + up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): return CI = campaignInfo() diff --git a/Unified/rejector.py b/Unified/rejector.py index 24d60dd0..5cbabb51 100755 --- a/Unified/rejector.py +++ b/Unified/rejector.py @@ -19,7 +19,7 @@ def rejector(url, specific, options=None): if options.test: print "Test mode - no changes propagate to the production system" - if not componentInfo(soft=['wtc','jira']).check() and not options.manual: return + if not componentInfo(ignore=['wtc','jira']).check() and not options.manual: return if specific and specific.startswith('/'): ## this is for a dataset diff --git a/utils.py b/utils.py index 5280493f..9f3bbedb 100755 --- a/utils.py +++ b/utils.py @@ -466,8 +466,8 @@ def checkMemory(): class componentInfo: - def __init__(self, block=True, mcm=None, soft=None, keep_trying=False, check_timeout=120): - self.checks = componentCheck(block, mcm, soft, keep_trying) + def __init__(self, block=True, mcm=None, ignore=None, keep_trying=False, check_timeout=120): + self.checks = componentCheck(block, mcm, ignore, keep_trying) self.check_timeout = check_timeout # start the checking self.checks.start() @@ -482,20 +482,7 @@ def check(self): alarm = "Timeout in checking the sanity of components %d > %d , while checking on %s" % ( now - check_start, self.check_timeout, self.checks.checking) sendLog('componentInfo', alarm, level='critical') - - # handle the edge case in which e.g. the console is reachable, but extremely slow - # at responding, AND it is considered 'soft', i.e. not necessary - try: - self.status = self.checks.status - for comp in self.status.keys(): - # if any necessary component is down, fail out - if not self.status[comp] and comp not in soft: - return False - # handle weird failures - except: - return False - - return True + return False print("componentInfo, ping", now, check_start, now - check_start) time.sleep(ping) @@ -507,13 +494,13 @@ def check(self): class componentCheck(threading.Thread): - def __init__(self, block=True, mcm=None, soft=None, keep_trying=False): + def __init__(self, block=True, mcm=None, ignore=None, keep_trying=False): threading.Thread.__init__(self) self.daemon = True - if soft is None: - self.soft = ['mcm', 'wtc', 'mongo', 'jira'] ##components that are not mandatory + if ignore is None: + self.ignore = ['mcm', 'wtc', 'mongo', 'jira'] ##components that are not necessary else: - self.soft = soft + self.ignore = ignore self.block = block self.status = { 'reqmgr': False, @@ -595,6 +582,9 @@ def check(self): for component in sorted(self.status): ecode += 1 self.checking = component + if component in self.ignore and self.ignore is not None: + print("Skipping", component) + continue while True: try: print("checking on", component) @@ -612,8 +602,8 @@ def check(self): print(traceback.format_exc()) print(component, "is unreachable") print(str(e)) - if self.block and not (self.soft and component in self.soft): - self.code = ecode + if self.block: + self.ecode = ecode return False break From fc40b93402aa1752f2b1b759f32fa3f5e476f855 Mon Sep 17 00:00:00 2001 From: llavezzo Date: Tue, 1 Aug 2023 21:05:31 +0200 Subject: [PATCH 3/4] typo --- Unified/addHoc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Unified/addHoc.py b/Unified/addHoc.py index a44bf28c..7c815675 100755 --- a/Unified/addHoc.py +++ b/Unified/addHoc.py @@ -1,4 +1,4 @@ -!/usr/bin/env python +#!/usr/bin/env python from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, siteInfo, sendLog, getWorkflowById, agentInfo, unifiedConfiguration, monitor_eos_dir, base_eos_dir, batchInfo, reportInfo from assignSession import * From ebb69a3d84d65b50d0c5c1ac98cfbcbabc5b32ad Mon Sep 17 00:00:00 2001 From: llavezzo Date: Fri, 4 Aug 2023 19:30:31 +0200 Subject: [PATCH 4/4] fix bug --- Unified/checkor.py | 1 + Unified/completor.py | 1 + Unified/injector.py | 1 + 3 files changed, 3 insertions(+) diff --git a/Unified/checkor.py b/Unified/checkor.py index 43e82275..ea5182a5 100755 --- a/Unified/checkor.py +++ b/Unified/checkor.py @@ -88,6 +88,7 @@ def checkor(url, spec=None, options=None): if not up.check(): return up_mcm = componentInfo(ignore=['wtc']) + up_mcm.check() use_mcm = up_mcm.status['mcm'] now_s = time.mktime(time.gmtime()) diff --git a/Unified/completor.py b/Unified/completor.py index e6913241..b4ad5ef0 100755 --- a/Unified/completor.py +++ b/Unified/completor.py @@ -21,6 +21,7 @@ def completor(url, specific): up = componentInfo(ignore=['mcm','wtc','jira']) if not up.check(): return up_mcm = componentInfo(ignore=['wtc','jira']) + up_mcm.check() use_mcm = up_mcm.status['mcm'] if use_mcm: mcm = McMClient(dev=False) diff --git a/Unified/injector.py b/Unified/injector.py index 31cb1722..7b9893ec 100755 --- a/Unified/injector.py +++ b/Unified/injector.py @@ -20,6 +20,7 @@ def injector(url, options, specific): if not up.check(): return up_mcm = componentInfo(ignore=['wtc','jira'] ) + up_mcm.check() use_mcm = up_mcm.status['mcm'] UC = unifiedConfiguration()