-
Notifications
You must be signed in to change notification settings - Fork 62
Fix timeout error after 4h and improve log resumption #173 #178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -20,6 +20,9 @@ | |||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def wait(): | ||||||||||||||
| connection_max_time = 1800 # time in seconds | ||||||||||||||
| connection_max_time_bool = False | ||||||||||||||
| last_line_number = 0 | ||||||||||||||
| try: | ||||||||||||||
| name = environ.get("RD_CONFIG_NAME") | ||||||||||||||
| namespace = environ.get("RD_CONFIG_NAMESPACE") | ||||||||||||||
|
|
@@ -31,18 +34,22 @@ def wait(): | |||||||||||||
| retries_count = 0 | ||||||||||||||
| completed = False | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| while True: | ||||||||||||||
| common.connect() | ||||||||||||||
|
|
||||||||||||||
| connection_start_time = time.time() | ||||||||||||||
|
|
||||||||||||||
| #validate retries | ||||||||||||||
| if retries_count != 0: | ||||||||||||||
| if retries_count != 0 and not connection_max_time_bool: | ||||||||||||||
| log.warning("An error occurred - retries: {0}".format(retries_count)) | ||||||||||||||
| retries_count = retries_count + 1 | ||||||||||||||
|
|
||||||||||||||
| if not connection_max_time_bool: | ||||||||||||||
| retries_count = retries_count + 1 | ||||||||||||||
|
|
||||||||||||||
| if retries_count > retries: | ||||||||||||||
| log.error("Number of retries exceeded") | ||||||||||||||
| completed = True | ||||||||||||||
| if retries_count > retries: | ||||||||||||||
| log.error("Number of retries exceeded") | ||||||||||||||
| completed = True | ||||||||||||||
|
|
||||||||||||||
| if show_log and not completed: | ||||||||||||||
| log.debug("Searching for pod associated with job") | ||||||||||||||
|
|
@@ -69,47 +76,62 @@ def wait(): | |||||||||||||
| if ex.status == 200: | ||||||||||||||
| break | ||||||||||||||
| else: | ||||||||||||||
| log.info("waiting for log") | ||||||||||||||
| if not connection_max_time_bool: | ||||||||||||||
| log.info("waiting for log") | ||||||||||||||
| time.sleep(15) | ||||||||||||||
| if timeout and time.time() - start_time > timeout: # pragma: no cover | ||||||||||||||
| raise TimeoutError | ||||||||||||||
|
|
||||||||||||||
| if not connection_max_time_bool: | ||||||||||||||
| log.info("Fetching logs from pod: {0}".format(pod_name)) | ||||||||||||||
|
|
||||||||||||||
| log.info("Fetching logs from pod: {0}".format(pod_name)) | ||||||||||||||
|
|
||||||||||||||
| if retries_count == 1: | ||||||||||||||
| if retries_count == 1 and not connection_max_time_bool: | ||||||||||||||
| log.info("========================== job log start ==========================") | ||||||||||||||
|
|
||||||||||||||
| connection_max_time_bool = False | ||||||||||||||
| current_line_number = 0 | ||||||||||||||
| w = watch.Watch() | ||||||||||||||
| for line in w.stream(core_v1.read_namespaced_pod_log, | ||||||||||||||
| name=pod_name, | ||||||||||||||
| namespace=namespace): | ||||||||||||||
| log.info(line.encode('ascii', 'ignore')) | ||||||||||||||
|
|
||||||||||||||
| #check status job | ||||||||||||||
| batch_v1 = client.BatchV1Api() | ||||||||||||||
|
|
||||||||||||||
| api_response = batch_v1.read_namespaced_job( | ||||||||||||||
| name, | ||||||||||||||
| namespace, | ||||||||||||||
| pretty="True" | ||||||||||||||
| ) | ||||||||||||||
| log.debug(api_response) | ||||||||||||||
|
|
||||||||||||||
| if api_response.status.conditions: | ||||||||||||||
| for condition in api_response.status.conditions: | ||||||||||||||
| if condition.type == "Failed": | ||||||||||||||
| completed = True | ||||||||||||||
|
|
||||||||||||||
| if api_response.status.completion_time: | ||||||||||||||
| completed = True | ||||||||||||||
|
|
||||||||||||||
| if completed: | ||||||||||||||
| if show_log: | ||||||||||||||
| log.info("=========================== job log end ===========================") | ||||||||||||||
| break | ||||||||||||||
|
|
||||||||||||||
| log.info("Waiting for job completion") | ||||||||||||||
| time.sleep(sleep) | ||||||||||||||
| if current_line_number > last_line_number or last_line_number == 0: | ||||||||||||||
|
||||||||||||||
| if current_line_number > last_line_number or last_line_number == 0: | |
| if current_line_number >= last_line_number: |
Copilot
AI
Aug 26, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The last_line_number is being set to current_line_number inside the if block, but current_line_number is incremented after this assignment. This will cause the next reconnection to skip one line. Move this assignment after the current_line_number increment or use current_line_number + 1.
| last_line_number = current_line_number | |
| current_line_number += 1 | |
| current_line_number += 1 | |
| last_line_number = current_line_number |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The magic number 1800 should be defined as a named constant (e.g., CONNECTION_TIMEOUT_SECONDS = 1800) to improve code maintainability and make the 30-minute timeout more explicit.