Skip to content

Commit 4dfed95

Browse files
committed
[core] Ensure TRG timeouts are obeyed + add polling timeout
1 parent 24a8e72 commit 4dfed95

File tree

2 files changed

+58
-25
lines changed

2 files changed

+58
-25
lines changed

core/config.go

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,8 @@ func setDefaults() error {
107107
viper.SetDefault("ddSchedulerUseSystemProxy", false)
108108
viper.SetDefault("trgServiceEndpoint", "//127.0.0.1:50060")
109109
viper.SetDefault("trgPollingInterval", "3s")
110+
viper.SetDefault("trgPollingTimeout", "3s")
111+
viper.SetDefault("trgReconciliationTimeout", "5s")
110112
viper.SetDefault("odcEndpoint", "//127.0.0.1:50053")
111113
viper.SetDefault("odcPollingInterval", "3s")
112114
viper.SetDefault("odcUseSystemProxy", false)
@@ -174,7 +176,9 @@ func setFlags() error {
174176
pflag.Duration("ddSchedulerStatusTimeout", viper.GetDuration("ddSchedulerStatusTimeout"), "Timeout for status calls in ddshed plugin")
175177
pflag.Bool("ddSchedulerUseSystemProxy", viper.GetBool("ddSchedulerUseSystemProxy"), "When true the https_proxy, http_proxy and no_proxy environment variables are obeyed")
176178
pflag.String("trgServiceEndpoint", viper.GetString("trgServiceEndpoint"), "Endpoint of the TRG gRPC service (`host:port`)")
177-
pflag.String("trgPollingInterval", viper.GetString("trgPollingInterval"), "How often to query the TRG gRPC service for run status (default: 3s)")
179+
pflag.Duration("trgPollingInterval", viper.GetDuration("trgPollingInterval"), "How often to query the TRG gRPC service for run status (default: 3s)")
180+
pflag.Duration("trgPollingTimeout", viper.GetDuration("trgPollingTimeout"), "Timeout for the query to the TRG gRPC service for run status (default: 3s)")
181+
pflag.Duration("trgReconciliationTimeout", viper.GetDuration("trgReconciliationTimeout"), "Timeout for reconciliation requests to the TRG gRPC service (default: 5s)")
178182
pflag.String("odcEndpoint", viper.GetString("odcEndpoint"), "Endpoint of the ODC gRPC service (`host:port`)")
179183
pflag.String("odcPollingInterval", viper.GetString("odcPollingInterval"), "How often to query the ODC gRPC service for partition status (default: 3s)")
180184
pflag.Bool("odcUseSystemProxy", viper.GetBool("odcUseSystemProxy"), "When true the https_proxy, http_proxy and no_proxy environment variables are obeyed")

core/integration/trg/plugin.go

Lines changed: 53 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -52,10 +52,14 @@ import (
5252
)
5353

5454
const (
55-
TRG_DIAL_TIMEOUT = 2 * time.Second
56-
TRG_POLLING_INTERVAL = 3 * time.Second
57-
TRG_RECONCILIATION_TIMEOUT = 5 * time.Second
58-
TOPIC = topic.IntegratedService + topic.Separator + "trg"
55+
TRG_DIAL_TIMEOUT = 2 * time.Second
56+
TRG_PFR_TIMEOUT = 5 * time.Second
57+
TRG_LOAD_TIMEOUT = 5 * time.Second
58+
TRG_START_TIMEOUT = 5 * time.Second
59+
TRG_STOP_TIMEOUT = 5 * time.Second
60+
TRG_UNLOAD_TIMEOUT = 5 * time.Second
61+
TRG_CLEANUP_TIMEOUT = 5 * time.Second
62+
TOPIC = topic.IntegratedService + topic.Separator + "trg"
5963
)
6064

6165
type Plugin struct {
@@ -119,7 +123,10 @@ func (p *Plugin) GetConnectionState() string {
119123
}
120124

121125
func (p *Plugin) queryRunList() {
122-
runReply, err := p.trgClient.RunList(context.Background(), &trgpb.Empty{}, grpc.EmptyCallOption{})
126+
ctx, cancel := context.WithTimeout(context.Background(), viper.GetDuration("trgPollingTimeout"))
127+
defer cancel()
128+
129+
runReply, err := p.trgClient.RunList(ctx, &trgpb.Empty{}, grpc.EmptyCallOption{})
123130
if err != nil {
124131
err = fmt.Errorf("error querying TRG service at %s: %w", viper.GetString("trgServiceEndpoint"), err)
125132
log.WithError(err).
@@ -260,12 +267,7 @@ func (p *Plugin) Init(instanceId string) error {
260267
var ctx context.Context
261268
ctx, p.cachedStatusCancelFunc = context.WithCancel(context.Background())
262269

263-
trgPollingIntervalStr := viper.GetString("trgPollingInterval")
264-
trgPollingInterval, err := time.ParseDuration(trgPollingIntervalStr)
265-
if err != nil {
266-
trgPollingInterval = TRG_POLLING_INTERVAL
267-
log.Debugf("TRG plugin cannot acquire polling interval, defaulting to %s", TRG_POLLING_INTERVAL.String())
268-
}
270+
trgPollingInterval := viper.GetDuration("trgPollingInterval")
269271

270272
// polling
271273
go func() {
@@ -321,8 +323,10 @@ func (p *Plugin) reconcile() {
321323
}
322324
}*/
323325

324-
ctx, _ := context.WithTimeout(context.Background(), TRG_RECONCILIATION_TIMEOUT)
326+
ctx, cancel := context.WithTimeout(context.Background(), viper.GetDuration("trgReconciliationTimeout"))
325327
_, err := p.trgClient.RunStop(ctx, &in, grpc.EmptyCallOption{})
328+
cancel()
329+
326330
// TODO: Response's RC should also be checked here
327331
if err != nil {
328332
err = fmt.Errorf("TRG reconciliation failure: %w", err)
@@ -344,8 +348,9 @@ func (p *Plugin) reconcile() {
344348
}
345349
}
346350
if trgRun.State == CTP_LOADED && trgRun.Cardinality == CTP_GLOBAL {
347-
ctx, _ := context.WithTimeout(context.Background(), TRG_RECONCILIATION_TIMEOUT)
351+
ctx, cancel := context.WithTimeout(context.Background(), viper.GetDuration("trgReconciliationTimeout"))
348352
_, err := p.trgClient.RunUnload(ctx, &in, grpc.EmptyCallOption{})
353+
cancel()
349354
if err != nil {
350355
err = fmt.Errorf("TRG reconciliation failure: %w", err)
351356
log.WithError(err).
@@ -457,6 +462,10 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
457462
return
458463
}
459464

465+
timeout := callable.AcquireTimeout(TRG_PFR_TIMEOUT, varStack, "PrepareForRun", envId)
466+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
467+
defer cancel()
468+
460469
payload := map[string]interface{}{
461470
"trgRequest": &in,
462471
}
@@ -473,7 +482,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
473482
})
474483

475484
var response *trgpb.RunReply
476-
response, err = p.trgClient.PrepareForRun(context.Background(), &in, grpc.EmptyCallOption{})
485+
response, err = p.trgClient.PrepareForRun(ctx, &in, grpc.EmptyCallOption{})
477486
if err != nil {
478487
log.WithError(err).
479488
WithField("level", infologger.IL_Support).
@@ -666,6 +675,10 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
666675
return
667676
}
668677

678+
timeout := callable.AcquireTimeout(TRG_LOAD_TIMEOUT, varStack, "RunLoad", envId)
679+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
680+
defer cancel()
681+
669682
payload := map[string]interface{}{
670683
"trgRequest": &in,
671684
}
@@ -682,7 +695,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
682695
})
683696

684697
var response *trgpb.RunReply
685-
response, err = p.trgClient.RunLoad(context.Background(), &in, grpc.EmptyCallOption{})
698+
response, err = p.trgClient.RunLoad(ctx, &in, grpc.EmptyCallOption{})
686699
if err != nil {
687700
log.WithError(err).
688701
WithField("level", infologger.IL_Support).
@@ -853,6 +866,10 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
853866
return
854867
}
855868

869+
timeout := callable.AcquireTimeout(TRG_START_TIMEOUT, varStack, "RunStart", envId)
870+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
871+
defer cancel()
872+
856873
payload := map[string]interface{}{
857874
"trgRequest": &in,
858875
}
@@ -869,7 +886,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
869886
})
870887

871888
var response *trgpb.RunReply
872-
response, err = p.trgClient.RunStart(context.Background(), &in, grpc.EmptyCallOption{})
889+
response, err = p.trgClient.RunStart(ctx, &in, grpc.EmptyCallOption{})
873890
if err != nil {
874891
log.WithError(err).
875892
WithField("level", infologger.IL_Support).
@@ -956,7 +973,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
956973

957974
return
958975
}
959-
runStopFunc := func(runNumber64 int64) (out string) {
976+
runStopFunc := func(ctx context.Context, runNumber64 int64) (out string) {
960977
trgDetectorsParam, ok := varStack["trg_detectors"]
961978
if !ok {
962979
// "" it is a global run
@@ -1033,7 +1050,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
10331050
})
10341051

10351052
var response *trgpb.RunReply
1036-
response, err = p.trgClient.RunStop(context.Background(), &in, grpc.EmptyCallOption{})
1053+
response, err = p.trgClient.RunStop(ctx, &in, grpc.EmptyCallOption{})
10371054
if err != nil {
10381055
log.WithError(err).
10391056
WithField("level", infologger.IL_Support).
@@ -1124,7 +1141,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
11241141

11251142
return
11261143
}
1127-
runUnloadFunc := func(runNumber64 int64) (out string) {
1144+
runUnloadFunc := func(ctx context.Context, runNumber64 int64) (out string) {
11281145

11291146
trgDetectorsParam, ok := varStack["trg_detectors"]
11301147
if !ok {
@@ -1201,7 +1218,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
12011218
})
12021219

12031220
var response *trgpb.RunReply
1204-
response, err = p.trgClient.RunUnload(context.Background(), &in, grpc.EmptyCallOption{})
1221+
response, err = p.trgClient.RunUnload(ctx, &in, grpc.EmptyCallOption{})
12051222
if err != nil {
12061223
log.WithError(err).
12071224
WithField("level", infologger.IL_Support).
@@ -1294,7 +1311,11 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
12941311
Error("cannot acquire run number for TRG Run Stop")
12951312
}
12961313

1297-
return runStopFunc(runNumber64)
1314+
timeout := callable.AcquireTimeout(TRG_STOP_TIMEOUT, varStack, "RunStop", envId)
1315+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
1316+
defer cancel()
1317+
1318+
return runStopFunc(ctx, runNumber64)
12981319
}
12991320
stack["RunUnload"] = func() (out string) {
13001321
log.WithField("partition", envId).
@@ -1311,7 +1332,11 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
13111332
Error("cannot acquire run number for TRG Run Unload")
13121333
}
13131334

1314-
return runUnloadFunc(runNumber64)
1335+
timeout := callable.AcquireTimeout(TRG_UNLOAD_TIMEOUT, varStack, "RunUnload", envId)
1336+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
1337+
defer cancel()
1338+
1339+
return runUnloadFunc(ctx, runNumber64)
13151340
}
13161341
stack["Cleanup"] = func() (out string) {
13171342
envId, ok := varStack["environment_id"]
@@ -1322,6 +1347,10 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
13221347
return
13231348
}
13241349

1350+
timeout := callable.AcquireTimeout(TRG_CLEANUP_TIMEOUT, varStack, "Cleanup", envId)
1351+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
1352+
defer cancel()
1353+
13251354
// runStop if found pending
13261355
runNumberStop, ok := p.pendingRunStops[envId]
13271356
if ok {
@@ -1331,7 +1360,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
13311360
Debug("pending TRG Stop found, performing cleanup")
13321361

13331362
delete(p.pendingRunStops, envId)
1334-
_ = runStopFunc(runNumberStop)
1363+
_ = runStopFunc(ctx, runNumberStop)
13351364

13361365
trgEndTime := strconv.FormatInt(time.Now().UnixMilli(), 10)
13371366
parentRole, ok := call.GetParentRole().(callable.ParentRole)
@@ -1358,7 +1387,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) {
13581387
Debug("pending TRG Unload found, performing cleanup")
13591388

13601389
delete(p.pendingRunUnloads, envId)
1361-
_ = runUnloadFunc(runNumberUnload)
1390+
_ = runUnloadFunc(ctx, runNumberUnload)
13621391
} else {
13631392
log.WithField("partition", envId).
13641393
WithField("level", infologger.IL_Devel).

0 commit comments

Comments
 (0)