-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Fix bug in CHASM activity schedule-to-close timer task validation #8720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,9 +17,6 @@ message ScheduleToStartTimeoutTask { | |
| } | ||
|
|
||
| message ScheduleToCloseTimeoutTask { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe you can remove this altogether now. In place of the interface arg you can use
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keep it because we will have validation later when we support activity resets. |
||
| // The current attempt number for this activity execution. Since task validation/exec happen outside of a lock, we | ||
| // need to guard against any concurrent operations where the originally intended task may be outdated. | ||
| int32 attempt = 1; | ||
| } | ||
|
|
||
| message StartToCloseTimeoutTask { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -760,6 +760,67 @@ func (s *standaloneActivityTestSuite) TestCompletedActivity_CannotTerminate() { | |
| require.Error(t, err) | ||
| } | ||
|
|
||
| func (s *standaloneActivityTestSuite) TestScheduleToCloseTimeout_WithRetry() { | ||
| t := s.T() | ||
| ctx, cancel := context.WithTimeout(t.Context(), 30*time.Second) | ||
| defer cancel() | ||
| activityID := testcore.RandomizeStr(t.Name()) | ||
| taskQueue := testcore.RandomizeStr(t.Name()) | ||
|
|
||
| // Start an activity | ||
| startResp, err := s.FrontendClient().StartActivityExecution(ctx, &workflowservice.StartActivityExecutionRequest{ | ||
| Namespace: s.Namespace().String(), | ||
| ActivityId: activityID, | ||
| ActivityType: &commonpb.ActivityType{ | ||
| Name: "test-activity-type", | ||
| }, | ||
| Options: &activitypb.ActivityOptions{ | ||
| TaskQueue: &taskqueuepb.TaskQueue{ | ||
| Name: taskQueue, | ||
| }, | ||
| // It's not possible to guarantee (e.g. via NextRetryDelay or RetryPolicy) that a retry | ||
| // will start with a delay <1s because of the use of TimerProcessorMaxTimeShift in the | ||
| // timer queue. Therefore we allow 1s for the ActivityDispatchTask to be executed, and | ||
| // time out the activity 1s into Attempt 2. | ||
| ScheduleToCloseTimeout: durationpb.New(2 * time.Second), | ||
| }, | ||
| }) | ||
| require.NoError(t, err) | ||
|
|
||
| // Fail attempt 1, causing the attempt counter to increment. | ||
| pollTaskResp, err := s.pollActivityTaskQueue(ctx, taskQueue) | ||
| require.NoError(t, err) | ||
| _, err = s.FrontendClient().RespondActivityTaskFailed(ctx, &workflowservice.RespondActivityTaskFailedRequest{ | ||
| Namespace: s.Namespace().String(), | ||
| TaskToken: pollTaskResp.TaskToken, | ||
| Failure: &failurepb.Failure{ | ||
| Message: "Retryable failure", | ||
| FailureInfo: &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: &failurepb.ApplicationFailureInfo{ | ||
| NonRetryable: false, | ||
| NextRetryDelay: durationpb.New(1 * time.Second), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Depending on timing, this may prevent the schedule to close timeout from firing because we would know there's not enough time for the next attempt and avoid scheduling it.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should have a test for this behavior if we don't yet. |
||
| }}, | ||
| }, | ||
| }) | ||
| require.NoError(t, err) | ||
| _, err = s.pollActivityTaskQueue(ctx, taskQueue) | ||
| require.NoError(t, err) | ||
|
|
||
| // Wait for schedule-to-close timeout. | ||
| pollResp, err := s.FrontendClient().PollActivityExecution(ctx, &workflowservice.PollActivityExecutionRequest{ | ||
| Namespace: s.Namespace().String(), | ||
| ActivityId: activityID, | ||
| RunId: startResp.RunId, | ||
| IncludeInfo: true, | ||
| IncludeOutcome: true, | ||
| WaitPolicy: &workflowservice.PollActivityExecutionRequest_WaitCompletion{ | ||
| WaitCompletion: &workflowservice.PollActivityExecutionRequest_CompletionWaitOptions{}, | ||
| }, | ||
| }) | ||
| require.NoError(t, err) | ||
| require.Equal(t, enumspb.ACTIVITY_EXECUTION_STATUS_TIMED_OUT, pollResp.GetInfo().GetStatus()) | ||
| require.Equal(t, enumspb.TIMEOUT_TYPE_SCHEDULE_TO_CLOSE, pollResp.GetFailure().GetTimeoutFailureInfo().GetTimeoutType()) | ||
| } | ||
|
|
||
| // TestStartToCloseTimeout tests that a start-to-close timeout is recorded after the activity is | ||
| // started. It also verifies that PollActivityExecution can be used to poll for a TimedOut state | ||
| // change caused by execution of a timer task. | ||
|
|
@@ -865,11 +926,6 @@ func (s *standaloneActivityTestSuite) TestStartToCloseTimeout() { | |
| "expected StartToCloseTimeout but is %s", pollResp.GetFailure().GetTimeoutFailureInfo().GetTimeoutType()) | ||
| } | ||
|
|
||
| func (s *standaloneActivityTestSuite) TestScheduleToCloseTimeout() { | ||
| // TODO implement when we have PollActivityExecution. Make sure we check the attempt vs. outcome failure population. | ||
| s.T().Skip("Temporarily disabled") | ||
| } | ||
|
|
||
| func (s *standaloneActivityTestSuite) TestPollActivityExecution_NoWait() { | ||
| t := s.T() | ||
| ctx, cancel := context.WithTimeout(t.Context(), 10*time.Second) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the bug fix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe you're right since there could be multiple retry attempts within a scheduleToClose tasks. Good catch.