Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 58 additions & 36 deletions contents/job-wait.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@


def wait():
connection_max_time = 1800 # time in seconds
Copy link

Copilot AI Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

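The magic number 1800 should be defined as a module-level named constant (e.g., CONNECTION_TIMEOUT_SECONDS = 1800) so that the 30-minute connection limit is explicit and easy to change. Note that the suggested change below only works once that constant has been defined; on its own it would raise a NameError.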

Suggested change
connection_max_time = 1800 # time in seconds
connection_max_time = CONNECTION_TIMEOUT_SECONDS

Copilot uses AI. Check for mistakes.
connection_max_time_bool = False
last_line_number = 0
try:
name = environ.get("RD_CONFIG_NAME")
namespace = environ.get("RD_CONFIG_NAMESPACE")
Expand All @@ -31,18 +34,22 @@ def wait():
retries_count = 0
completed = False



while True:
common.connect()

connection_start_time = time.time()

#validate retries
if retries_count != 0:
if retries_count != 0 and not connection_max_time_bool:
log.warning("An error occurred - retries: {0}".format(retries_count))
retries_count = retries_count + 1

if not connection_max_time_bool:
retries_count = retries_count + 1

if retries_count > retries:
log.error("Number of retries exceeded")
completed = True
if retries_count > retries:
log.error("Number of retries exceeded")
completed = True

if show_log and not completed:
log.debug("Searching for pod associated with job")
Expand All @@ -69,47 +76,62 @@ def wait():
if ex.status == 200:
break
else:
log.info("waiting for log")
if not connection_max_time_bool:
log.info("waiting for log")
time.sleep(15)
if timeout and time.time() - start_time > timeout: # pragma: no cover
raise TimeoutError

if not connection_max_time_bool:
log.info("Fetching logs from pod: {0}".format(pod_name))

log.info("Fetching logs from pod: {0}".format(pod_name))

if retries_count == 1:
if retries_count == 1 and not connection_max_time_bool:
log.info("========================== job log start ==========================")

connection_max_time_bool = False
current_line_number = 0
w = watch.Watch()
for line in w.stream(core_v1.read_namespaced_pod_log,
name=pod_name,
namespace=namespace):
log.info(line.encode('ascii', 'ignore'))

#check status job
batch_v1 = client.BatchV1Api()

api_response = batch_v1.read_namespaced_job(
name,
namespace,
pretty="True"
)
log.debug(api_response)

if api_response.status.conditions:
for condition in api_response.status.conditions:
if condition.type == "Failed":
completed = True

if api_response.status.completion_time:
completed = True

if completed:
if show_log:
log.info("=========================== job log end ===========================")
break

log.info("Waiting for job completion")
time.sleep(sleep)
if current_line_number > last_line_number or last_line_number == 0:
Copy link

Copilot AI Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

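When resuming (last_line_number > 0), this condition skips every line whose index is less than or equal to last_line_number. That is only correct if last_line_number holds the index of the last line already printed. If it is meant to hold the index of the next line to print, the condition should be 'current_line_number >= last_line_number' to avoid missing a line during log resumption.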

Suggested change
if current_line_number > last_line_number or last_line_number == 0:
if current_line_number >= last_line_number:

Copilot uses AI. Check for mistakes.
log.info(line.encode('ascii', 'ignore'))
last_line_number = current_line_number

current_line_number += 1
Comment on lines +100 to +102
Copy link

Copilot AI Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

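last_line_number is set to current_line_number inside the if block, before current_line_number is incremented, so it records the index of the last line printed rather than the index of the next line to print. With the existing 'current_line_number > last_line_number' check this resumes correctly, but if the check is changed to 'current_line_number >= last_line_number' the last printed line would be printed again after a reconnection. In that case, move this assignment after the current_line_number increment or use current_line_number + 1.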

Suggested change
last_line_number = current_line_number
current_line_number += 1
current_line_number += 1
last_line_number = current_line_number

Copilot uses AI. Check for mistakes.

connection_elapsed_time = time.time() - connection_start_time
if connection_elapsed_time >= connection_max_time:
connection_max_time_bool = True
break

if not connection_max_time_bool:
#check status job
batch_v1 = client.BatchV1Api()

api_response = batch_v1.read_namespaced_job(
name,
namespace,
pretty="True"
)
log.debug(api_response)

if api_response.status.conditions:
for condition in api_response.status.conditions:
if condition.type == "Failed":
completed = True

if api_response.status.completion_time:
completed = True

if completed:
if show_log:
log.info("=========================== job log end ===========================")
break

log.info("Waiting for job completion")
time.sleep(sleep)


if api_response.status.succeeded:
Expand Down