-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Standalone activity heartbeating #8730
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: standalone-activity
Are you sure you want to change the base?
Changes from all commits
bbf3c34
e3e5777
b003aa4
aa5155a
16192e2
3796405
d0836a6
3ba3273
e968ad0
988cc7f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ import ( | |
| "go.temporal.io/api/workflowservice/v1" | ||
| "go.temporal.io/server/api/historyservice/v1" | ||
| "go.temporal.io/server/api/matchingservice/v1" | ||
| tokenspb "go.temporal.io/server/api/token/v1" | ||
| "go.temporal.io/server/chasm" | ||
| "go.temporal.io/server/chasm/lib/activity/gen/activitypb/v1" | ||
| "go.temporal.io/server/common" | ||
|
|
@@ -54,6 +55,12 @@ type Activity struct { | |
| Store chasm.Field[ActivityStore] | ||
| } | ||
|
|
||
| // WithToken wraps a request with its deserialized task token. | ||
| type WithToken[R any] struct { | ||
| Token *tokenspb.Task | ||
| Request R | ||
| } | ||
|
|
||
| func (a *Activity) LifecycleState(_ chasm.Context) chasm.LifecycleState { | ||
| switch a.Status { | ||
| case activitypb.ACTIVITY_EXECUTION_STATUS_COMPLETED: | ||
|
|
@@ -134,25 +141,26 @@ func (a *Activity) createAddActivityTaskRequest(ctx chasm.Context, namespaceID s | |
| func (a *Activity) HandleStarted(ctx chasm.MutableContext, request *historyservice.RecordActivityTaskStartedRequest) ( | ||
| *historyservice.RecordActivityTaskStartedResponse, error, | ||
| ) { | ||
| if err := TransitionStarted.Apply(a, ctx, nil); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| attempt := a.LastAttempt.Get(ctx) | ||
| attempt.StartedTime = timestamppb.New(ctx.Now(a)) | ||
| attempt.LastWorkerIdentity = request.GetPollRequest().GetIdentity() | ||
|
|
||
| if versionDirective := request.GetVersionDirective().GetDeploymentVersion(); versionDirective != nil { | ||
| attempt.LastDeploymentVersion = &deploymentpb.WorkerDeploymentVersion{ | ||
| BuildId: versionDirective.GetBuildId(), | ||
| DeploymentName: versionDirective.GetDeploymentName(), | ||
| } | ||
| } | ||
|
|
||
| if err := TransitionStarted.Apply(a, ctx, nil); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| response := &historyservice.RecordActivityTaskStartedResponse{} | ||
| err := a.StoreOrSelf(ctx).PopulateRecordStartedResponse(ctx, ctx.ExecutionKey(), response) | ||
| return response, err | ||
| } | ||
|
|
||
| // PopulateRecordStartedResponse populates the response for HandleStarted. | ||
| func (a *Activity) PopulateRecordStartedResponse(ctx chasm.Context, key chasm.ExecutionKey, response *historyservice.RecordActivityTaskStartedResponse) error { | ||
| lastHeartbeat, _ := a.LastHeartbeat.TryGet(ctx) | ||
| if lastHeartbeat != nil { | ||
|
|
@@ -183,15 +191,22 @@ func (a *Activity) PopulateRecordStartedResponse(ctx chasm.Context, key chasm.Ex | |
| return nil | ||
| } | ||
|
|
||
| // RecordCompleted applies the provided function to record activity completion. | ||
| func (a *Activity) RecordCompleted(ctx chasm.MutableContext, applyFn func(ctx chasm.MutableContext) error) error { | ||
| return applyFn(ctx) | ||
| } | ||
|
|
||
| // HandleCompleted updates the activity on activity completion. | ||
| func (a *Activity) HandleCompleted(ctx chasm.MutableContext, request *historyservice.RespondActivityTaskCompletedRequest) ( | ||
| *historyservice.RespondActivityTaskCompletedResponse, error, | ||
| ) { | ||
| if err := TransitionCompleted.Apply(a, ctx, request); err != nil { | ||
| func (a *Activity) HandleCompleted( | ||
| ctx chasm.MutableContext, | ||
| input WithToken[*historyservice.RespondActivityTaskCompletedRequest], | ||
| ) (*historyservice.RespondActivityTaskCompletedResponse, error) { | ||
| // TODO(dan): add test coverage for this validation | ||
| if err := ValidateActivityTaskToken(ctx, a, input.Token); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| if err := TransitionCompleted.Apply(a, ctx, input.Request); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
|
|
@@ -200,10 +215,16 @@ func (a *Activity) HandleCompleted(ctx chasm.MutableContext, request *historyser | |
|
|
||
| // HandleFailed updates the activity on activity failure. if the activity is retryable, it will be rescheduled | ||
| // for retry instead. | ||
| func (a *Activity) HandleFailed(ctx chasm.MutableContext, req *historyservice.RespondActivityTaskFailedRequest) ( | ||
| *historyservice.RespondActivityTaskFailedResponse, error, | ||
| ) { | ||
| failure := req.GetFailedRequest().GetFailure() | ||
| func (a *Activity) HandleFailed( | ||
| ctx chasm.MutableContext, | ||
| input WithToken[*historyservice.RespondActivityTaskFailedRequest], | ||
| ) (*historyservice.RespondActivityTaskFailedResponse, error) { | ||
| // TODO(dan): add test coverage for this validation | ||
| if err := ValidateActivityTaskToken(ctx, a, input.Token); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| failure := input.Request.GetFailedRequest().GetFailure() | ||
|
|
||
| shouldRetry, retryInterval, err := a.shouldRetryOnFailure(ctx, failure) | ||
| if err != nil { | ||
|
|
@@ -222,18 +243,24 @@ func (a *Activity) HandleFailed(ctx chasm.MutableContext, req *historyservice.Re | |
| } | ||
|
|
||
| // No more retries, transition to failed state | ||
| if err := TransitionFailed.Apply(a, ctx, req); err != nil { | ||
| if err := TransitionFailed.Apply(a, ctx, input.Request); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| return &historyservice.RespondActivityTaskFailedResponse{}, nil | ||
| } | ||
|
|
||
| // HandleCanceled updates the activity on activity canceled. | ||
| func (a *Activity) HandleCanceled(ctx chasm.MutableContext, request *historyservice.RespondActivityTaskCanceledRequest) ( | ||
| *historyservice.RespondActivityTaskCanceledResponse, error, | ||
| ) { | ||
| if err := TransitionCanceled.Apply(a, ctx, request.GetCancelRequest().GetDetails()); err != nil { | ||
| func (a *Activity) HandleCanceled( | ||
| ctx chasm.MutableContext, | ||
| input WithToken[*historyservice.RespondActivityTaskCanceledRequest], | ||
| ) (*historyservice.RespondActivityTaskCanceledResponse, error) { | ||
| // TODO(dan): add test coverage for this validation | ||
| if err := ValidateActivityTaskToken(ctx, a, input.Token); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| if err := TransitionCanceled.Apply(a, ctx, input.Request.GetCancelRequest().GetDetails()); err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
|
|
@@ -250,9 +277,9 @@ func (a *Activity) handleTerminated(ctx chasm.MutableContext, req *activitypb.Te | |
| return &activitypb.TerminateActivityExecutionResponse{}, nil | ||
| } | ||
|
|
||
| // getLastHeartbeat retrieves the last heartbeat state, initializing it if not present. The heartbeat is lazily created | ||
| // getOrCreateLastHeartbeat retrieves the last heartbeat state, initializing it if not present. The heartbeat is lazily created | ||
| // to avoid unnecessary writes when heartbeats are not used. | ||
| func (a *Activity) getLastHeartbeat(ctx chasm.MutableContext) *activitypb.ActivityHeartbeatState { | ||
| func (a *Activity) getOrCreateLastHeartbeat(ctx chasm.MutableContext) *activitypb.ActivityHeartbeatState { | ||
| heartbeat, ok := a.LastHeartbeat.TryGet(ctx) | ||
| if !ok { | ||
| heartbeat = &activitypb.ActivityHeartbeatState{} | ||
|
|
@@ -417,12 +444,43 @@ func createStartToCloseTimeoutFailure() *failurepb.Failure { | |
| } | ||
| } | ||
|
|
||
| func (a *Activity) RecordHeartbeat(ctx chasm.MutableContext, details *commonpb.Payloads) (chasm.NoValue, error) { | ||
| func createHeartbeatTimeoutFailure() *failurepb.Failure { | ||
| return &failurepb.Failure{ | ||
| Message: fmt.Sprintf(common.FailureReasonActivityTimeout, enumspb.TIMEOUT_TYPE_HEARTBEAT.String()), | ||
| FailureInfo: &failurepb.Failure_TimeoutFailureInfo{ | ||
| TimeoutFailureInfo: &failurepb.TimeoutFailureInfo{ | ||
| TimeoutType: enumspb.TIMEOUT_TYPE_HEARTBEAT, | ||
| }, | ||
| }, | ||
| } | ||
| } | ||
|
|
||
| // RecordHeartbeat records a heartbeat for the activity. | ||
| func (a *Activity) RecordHeartbeat( | ||
| ctx chasm.MutableContext, | ||
| input WithToken[*historyservice.RecordActivityTaskHeartbeatRequest], | ||
| ) (*historyservice.RecordActivityTaskHeartbeatResponse, error) { | ||
| err := ValidateActivityTaskToken(ctx, a, input.Token) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| a.LastHeartbeat = chasm.NewDataField(ctx, &activitypb.ActivityHeartbeatState{ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bergundy is there any performance pentalty if we create a new field on every hearbeat?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, no penalty. |
||
| RecordedTime: timestamppb.New(ctx.Now(a)), | ||
| Details: details, | ||
| Details: input.Request.GetHeartbeatRequest().GetDetails(), | ||
| }) | ||
| return nil, nil | ||
| ctx.AddTask( | ||
| a, | ||
| chasm.TaskAttributes{ | ||
| ScheduledTime: ctx.Now(a).Add(a.GetHeartbeatTimeout().AsDuration()), | ||
| }, | ||
| &activitypb.HeartbeatTimeoutTask{ | ||
| Attempt: a.LastAttempt.Get(ctx).GetCount(), | ||
| }, | ||
| ) | ||
| return &historyservice.RecordActivityTaskHeartbeatResponse{ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just before returning here you want to generate a new heartbeat task if the heartbeat timeout is set.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See reply above; I currently still think the proposed design is preferable.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is done now -- switched back to your design. |
||
| CancelRequested: a.Status == activitypb.ACTIVITY_EXECUTION_STATUS_CANCEL_REQUESTED, | ||
| // TODO(dan): ActivityPaused, ActivityReset | ||
| }, nil | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| func (a *Activity) buildActivityExecutionInfo(ctx chasm.Context) (*activity.ActivityExecutionInfo, error) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ import ( | |
| "go.temporal.io/server/chasm" | ||
| "go.temporal.io/server/chasm/lib/activity/gen/activitypb/v1" | ||
| "go.temporal.io/server/common/resource" | ||
| "go.temporal.io/server/common/util" | ||
| "go.uber.org/fx" | ||
| ) | ||
|
|
||
|
|
@@ -146,3 +147,72 @@ func (e *startToCloseTimeoutTaskExecutor) Execute( | |
| // Reached maximum attempts, timeout the activity | ||
| return TransitionTimedOut.Apply(activity, ctx, enumspb.TIMEOUT_TYPE_START_TO_CLOSE) | ||
| } | ||
|
|
||
| // HeartbeatTimeoutTask is a pure task that enforces heartbeat timeouts. | ||
| type heartbeatTimeoutTaskExecutor struct{} | ||
|
|
||
| func newHeartbeatTimeoutTaskExecutor() *heartbeatTimeoutTaskExecutor { | ||
| return &heartbeatTimeoutTaskExecutor{} | ||
| } | ||
|
|
||
| // Validate validates a HeartbeatTimeoutTask. | ||
| func (e *heartbeatTimeoutTaskExecutor) Validate( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You will want to validate that the schedule time in the attributes is still relevant here not in the execute function. It should deterministic function of the last heartbeat time and the attempt start time (
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See reply below |
||
| ctx chasm.Context, | ||
| activity *Activity, | ||
| taskAttrs chasm.TaskAttributes, | ||
| task *activitypb.HeartbeatTimeoutTask, | ||
| ) (bool, error) { | ||
| // Let T = user-configured heartbeat timeout and let hb_i be the time of the ith user-submitted | ||
| // heartbeat request. (hb_0 = 0 since we always start a timer task when an attempt starts). | ||
|
|
||
| // There are two concurrent sequences of events: | ||
| // 1. A worker is sending heartbeats at times hb_i. | ||
| // 2. This task is being executed at (shortly after) times hb_i + T. | ||
|
|
||
| // On the i-th execution of this function, we look back into the past and determine whether the | ||
| // last heartbeat was received after hb_i. If so, we reject this timeout task. Otherwise, the | ||
| // Execute function runs and we fail the attempt. | ||
| if activity.Status != activitypb.ACTIVITY_EXECUTION_STATUS_STARTED && | ||
| activity.Status != activitypb.ACTIVITY_EXECUTION_STATUS_CANCEL_REQUESTED { | ||
| return false, nil | ||
| } | ||
| // Task attempt must still match current attempt. | ||
| attempt := activity.LastAttempt.Get(ctx) | ||
| if attempt.GetCount() != task.Attempt { | ||
| return false, nil | ||
| } | ||
|
|
||
| // Must not have been a heartbeat since this task was created | ||
| hbTimeout := activity.GetHeartbeatTimeout().AsDuration() // T | ||
| attemptStartTime := attempt.GetStartedTime().AsTime() | ||
| lastHb, _ := activity.LastHeartbeat.TryGet(ctx) // could be nil, or from a previous attempt | ||
| // No hbs in attempt so far is equivalent to hb having been sent at attempt start time. | ||
| lastHbTime := util.MaxTime(lastHb.GetRecordedTime().AsTime(), attemptStartTime) | ||
| thisTaskHbTime := taskAttrs.ScheduledTime.Add(-hbTimeout) // hb_i | ||
| if lastHbTime.After(thisTaskHbTime) { | ||
| // another heartbeat has invalidated this task's heartbeat | ||
| return false, nil | ||
| } | ||
| return true, nil | ||
| } | ||
|
|
||
| // Execute executes a HeartbeatTimeoutTask. It fails the attempt due to heartbeat timeout, leading | ||
| // to retry or activity failure. | ||
| func (e *heartbeatTimeoutTaskExecutor) Execute( | ||
| ctx chasm.MutableContext, | ||
| activity *Activity, | ||
| _ chasm.TaskAttributes, | ||
| _ *activitypb.HeartbeatTimeoutTask, | ||
| ) error { | ||
| shouldRetry, retryInterval, err := activity.shouldRetry(ctx, 0) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if shouldRetry { | ||
| return TransitionRescheduled.Apply(activity, ctx, rescheduleEvent{ | ||
| retryInterval: retryInterval, | ||
| failure: createHeartbeatTimeoutFailure(), | ||
| }) | ||
| } | ||
| return TransitionTimedOut.Apply(activity, ctx, enumspb.TIMEOUT_TYPE_HEARTBEAT) | ||
| } | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The movement here was required to fix a bug that I encountered when implementing the other proposed design. We needed
attempt.StartedTimeto be available inTransitionStartedso that we could use it in scheduling the first heartbeat task, as well as the base for the start-to-close timeout. Before this PR we were using an ad-hocctx.Now()that was close to but not equal toStartedTime.Reverting it doesn't cause a test failure with the hreartbeating design in this PR, but it's better to set the field state before calling the transition function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverting this would now cause a test failure since the PR is back to design A.