From 91a081801ab970b588680b22ce05819170a3afd1 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 16 Feb 2026 17:06:03 +0100 Subject: [PATCH 01/34] WIP#1: working on local machine --- .../configs/workflow-gateway-don-remote.toml | 81 +++++ .../environment/environment/environment.go | 2 +- .../cre/environment/environment/swap.go | 10 +- .../environment/agent/cmd/local-agent/main.go | 34 ++ .../lib/cre/environment/agent/deploy.go | 50 +++ .../lib/cre/environment/agent/server.go | 210 +++++++++++++ .../lib/cre/environment/agent/server_test.go | 48 +++ .../lib/cre/environment/agent/transport.go | 38 +++ .../cre/environment/agent/transport_test.go | 41 +++ .../lib/cre/environment/blockchain_start.go | 291 ++++++++++++++++++ .../cre/environment/blockchain_start_test.go | 21 ++ .../cre/environment/blockchains/evm/evm.go | 33 +- .../lib/cre/environment/config/config.go | 72 ++++- .../lib/cre/environment/environment.go | 18 +- system-tests/lib/cre/environment/state.go | 6 +- .../tests/load/cre/workflow_don_load_test.go | 7 +- .../tests/load/cre/writer_don_load_test.go | 8 +- 17 files changed, 947 insertions(+), 23 deletions(-) create mode 100644 core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml create mode 100644 system-tests/lib/cre/environment/agent/cmd/local-agent/main.go create mode 100644 system-tests/lib/cre/environment/agent/deploy.go create mode 100644 system-tests/lib/cre/environment/agent/server.go create mode 100644 system-tests/lib/cre/environment/agent/server_test.go create mode 100644 system-tests/lib/cre/environment/agent/transport.go create mode 100644 system-tests/lib/cre/environment/agent/transport_test.go create mode 100644 system-tests/lib/cre/environment/blockchain_start.go create mode 100644 system-tests/lib/cre/environment/blockchain_start_test.go diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml new file mode 100644 index 
00000000000..e86df8a625c --- /dev/null +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml @@ -0,0 +1,81 @@ + +[[blockchains]] + type = "anvil" + chain_id = "1337" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + target = "remote" + +[[blockchains]] + type = "anvil" + chain_id = "2337" + port = "8546" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + +[jd] + csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string + # change to your version + image = "job-distributor:0.22.1" + +[fake] + port = 8171 + +[fake_http] + port = 8666 + +#[s3provider] +# # use all defaults +# port = 9000 +# console_port = 9001 + +[infra] + # either "docker" or "kubernetes" + type = "docker" + +[[nodesets]] + nodes = 4 + name = "workflow" + don_types = ["workflow"] + override_mode = "all" + http_port_range_start = 10100 + + env_vars = { CL_EVM_CMD = "" } + capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] + + [nodesets.db] + image = "postgres:12.0" + port = 13000 + +[[nodesets.node_specs]] + roles = ["plugin"] + [nodesets.node_specs.node] + docker_ctx = "../../../.." + docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + # image = "chainlink-tmp:latest" + user_config_overrides = "" + +[[nodesets]] + nodes = 1 + name = "bootstrap-gateway" + don_types = ["bootstrap", "gateway"] + override_mode = "each" + http_port_range_start = 10300 + + env_vars = { CL_EVM_CMD = "" } + supported_evm_chains = [1337, 2337] + + [nodesets.db] + image = "postgres:12.0" + port = 13200 + + [[nodesets.node_specs]] + roles = ["bootstrap", "gateway"] + [nodesets.node_specs.node] + docker_ctx = "../../../.." 
+ docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + # 5002 is the web API capabilities port for incoming requests + # 15002 is the vault port for incoming requests + custom_ports = ["5002:5002","15002:15002"] + # image = "chainlink-tmp:latest" + user_config_overrides = "" diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 34a893cd0ee..9ad0428ce14 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -712,7 +712,7 @@ func StartCLIEnvironment( universalSetupInput := &creenv.SetupInput{ NodeSets: in.NodeSets, - BlockchainsInput: in.Blockchains, + Blockchains: in.Blockchains, ContractVersions: env.ContractVersions(), WithV2Registries: env.WithV2Registries(), JdInput: in.JD, diff --git a/core/scripts/cre/environment/environment/swap.go b/core/scripts/cre/environment/environment/swap.go index 2c973d1ffe5..6cf9b9c012d 100644 --- a/core/scripts/cre/environment/environment/swap.go +++ b/core/scripts/cre/environment/environment/swap.go @@ -253,6 +253,14 @@ func swapNodes(ctx context.Context, forceFlag bool, waitTime time.Duration) erro return fmt.Errorf("failed to set TESTCONTAINERS_RYUK_DISABLED environment variable: %w", setErr) } + effectiveBlockchains, effectiveErr := config.EffectiveBlockchains() + if effectiveErr != nil { + return errors.Wrap(effectiveErr, "failed to resolve blockchain inputs") + } + if len(effectiveBlockchains) == 0 || effectiveBlockchains[0] == nil || effectiveBlockchains[0].Out == nil { + return errors.New("at least one blockchain output is required to restart node sets") + } + nerrg := errgroup.Group{} for _, nodeSet := range config.NodeSets { nerrg.Go(func() error { @@ -290,7 +298,7 @@ func swapNodes(ctx context.Context, forceFlag bool, waitTime time.Duration) erro nodeSet.Out = nil var nodesetErr error nodeSet.Input.NodeSpecs = 
nodeSet.ExtractCTFInputs() - nodeSet.Out, nodesetErr = ns.NewSharedDBNodeSet(nodeSet.Input, config.Blockchains[0].Out) + nodeSet.Out, nodesetErr = ns.NewSharedDBNodeSet(nodeSet.Input, effectiveBlockchains[0].Out) if nodesetErr != nil { framework.L.Error().Msgf("Failed to create node set named %s: %s", nodeSet.Name, nodesetErr) framework.L.Info().Msgf("Waiting %s for the containers to be removed", waitTime.String()) diff --git a/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go b/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go new file mode 100644 index 00000000000..05e7cefb04c --- /dev/null +++ b/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go @@ -0,0 +1,34 @@ +package main + +import ( + "context" + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + blockchainsets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" + "github.com/smartcontractkit/chainlink/system-tests/lib/infra" +) + +func main() { + addr := flag.String("addr", "127.0.0.1:18080", "agent listen address") + flag.Parse() + + lggr := zerolog.New(os.Stderr).With().Timestamp().Logger() + provider := &infra.Provider{Type: infra.Docker} + server := agent.NewServer(lggr, blockchainsets.NewDeployerSet(lggr, provider)) + + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + lggr.Info().Msgf("starting local CRE agent on %s", *addr) + if err := agent.Run(ctx, *addr, server); err != nil { + _, _ = fmt.Fprintf(os.Stderr, "agent failed: %v\n", err) + os.Exit(1) + } +} diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/agent/deploy.go new file mode 100644 index 00000000000..68fb1de3f68 --- /dev/null +++ b/system-tests/lib/cre/environment/agent/deploy.go @@ -0,0 +1,50 @@ +package agent + +import ( + "context" + "fmt" + + 
pkgerrors "github.com/pkg/errors" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" +) + +type OutputDeployer interface { + DeployOutput(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) +} + +func DeployBlockchainComponent( + ctx context.Context, + deployers map[blockchain.ChainFamily]blockchains.Deployer, + input *blockchain.Input, +) (*blockchain.Output, error) { + if input == nil { + return nil, pkgerrors.New("blockchain input is nil") + } + + chainFamily, err := blockchain.TypeToFamily(input.Type) + if err != nil { + return nil, err + } + + deployer, ok := deployers[chainFamily] + if !ok { + return nil, fmt.Errorf("no deployer found for blockchain type %s", input.Type) + } + + if outputDeployer, ok := deployer.(OutputDeployer); ok { + deployedOutput, err := outputDeployer.DeployOutput(ctx, input) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to deploy blockchain output of type %s", input.Type) + } + return deployedOutput, nil + } + + deployed, err := deployer.Deploy(ctx, input) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to deploy blockchain of type %s", input.Type) + } + + return deployed.CtfOutput(), nil +} diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go new file mode 100644 index 00000000000..91457e91a43 --- /dev/null +++ b/system-tests/lib/cre/environment/agent/server.go @@ -0,0 +1,210 @@ +package agent + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "sync" + + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" +) + +const ( + 
SchemaVersionV1 = "v1" + OperationStartComponent = "StartComponent" + OperationHealth = "Health" + ComponentTypeBlockchain = "blockchain" + + ErrCodeMethodNotAllowed = "method_not_allowed" + ErrCodeInvalidRequestBody = "invalid_request_body" + ErrCodeUnsupportedSchema = "unsupported_schema_version" + ErrCodeUnsupportedOperation = "unsupported_operation" + ErrCodeInvalidPayload = "invalid_payload" + ErrCodeUnsupportedComponent = "unsupported_component_type" + ErrCodeMissingComponentInput = "missing_component_input" + ErrCodeDeployFailed = "deployment_failed" + ErrCodeTransportEncodeFailed = "transport_encode_failed" +) + +var frameworkLogCaptureMu sync.Mutex + +type StartComponentEnvelope struct { + SchemaVersion string `json:"schemaVersion"` + Operation string `json:"operation"` + Payload json.RawMessage `json:"payload"` +} + +type StartBlockchainPayload struct { + ComponentType string `json:"componentType"` + Blockchain *blockchain.Input `json:"blockchain"` +} + +type StartComponentResponse struct { + BlockchainOutput map[string]any `json:"blockchainOutput,omitempty"` + AgentLogs []string `json:"agentLogs,omitempty"` + ErrorCode string `json:"errorCode,omitempty"` + Error string `json:"error,omitempty"` +} + +type Server struct { + lggr zerolog.Logger + deployers map[blockchain.ChainFamily]blockchains.Deployer +} + +func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockchains.Deployer) *Server { + return &Server{ + lggr: lggr, + deployers: deployers, + } +} + +func (s *Server) Handler() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/v1/health", s.health) + mux.HandleFunc("/v1/components/start", s.startComponent) + return mux +} + +func (s *Server) health(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) +} + +func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + s.respondError(w, http.StatusMethodNotAllowed, 
ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + var envelope StartComponentEnvelope + if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidRequestBody, fmt.Sprintf("invalid request body: %v", err), nil) + return + } + + if envelope.SchemaVersion != SchemaVersionV1 { + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedSchema, fmt.Sprintf("unsupported schema version: %s", envelope.SchemaVersion), nil) + return + } + if envelope.Operation != OperationStartComponent { + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedOperation, fmt.Sprintf("unsupported operation: %s", envelope.Operation), nil) + return + } + + var payload StartBlockchainPayload + if err := json.Unmarshal(envelope.Payload, &payload); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid payload: %v", err), nil) + return + } + if payload.ComponentType != ComponentTypeBlockchain { + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedComponent, fmt.Sprintf("unsupported component type: %s", payload.ComponentType), nil) + return + } + if payload.Blockchain == nil { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "blockchain payload is required", nil) + return + } + + // Keep this stderr write explicit so startup behavior is visible when agent runs as a subprocess. 
+ requestLog := fmt.Sprintf("[cre-agent] starting component type=%s blockchain=%s chain_id=%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID) + _, _ = fmt.Fprintln(os.Stderr, requestLog) + + var startedOutput *blockchain.Output + capturedFrameworkLogs, startErr := captureFrameworkLogs(func() error { + deployed, err := DeployBlockchainComponent(r.Context(), s.deployers, payload.Blockchain) + if err != nil { + return err + } + startedOutput = deployed + return nil + }) + + agentLogs := make([]string, 0, 1+len(capturedFrameworkLogs)) + agentLogs = append(agentLogs, requestLog) + agentLogs = append(agentLogs, capturedFrameworkLogs...) + + if startErr != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, startErr.Error(), agentLogs) + return + } + + safeOutput, encErr := EncodeForTransport(startedOutput) + if encErr != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeTransportEncodeFailed, encErr.Error(), agentLogs) + return + } + s.respondJSON(w, http.StatusOK, StartComponentResponse{ + BlockchainOutput: safeOutput, + AgentLogs: agentLogs, + }) +} + +func (s *Server) respondJSON(w http.ResponseWriter, code int, body StartComponentResponse) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(body) +} + +func (s *Server) respondError(w http.ResponseWriter, code int, errorCode string, message string, logs []string) { + s.respondJSON(w, code, StartComponentResponse{ + AgentLogs: logs, + ErrorCode: errorCode, + Error: message, + }) +} + +func captureFrameworkLogs(fn func() error) ([]string, error) { + frameworkLogCaptureMu.Lock() + defer frameworkLogCaptureMu.Unlock() + + var buf bytes.Buffer + originalLogger := framework.L + framework.L = originalLogger.Output(io.MultiWriter(os.Stderr, &buf)) + defer func() { + framework.L = originalLogger + }() + + err := fn() + + logs := make([]string, 0) + for _, line := range strings.Split(buf.String(), 
"\n") { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + continue + } + logs = append(logs, trimmed) + } + + return logs, err +} + +func Run(ctx context.Context, addr string, srv *Server) error { + httpSrv := &http.Server{ + Addr: addr, + Handler: srv.Handler(), + } + + errCh := make(chan error, 1) + go func() { + errCh <- httpSrv.ListenAndServe() + }() + + select { + case <-ctx.Done(): + return httpSrv.Shutdown(context.Background()) + case err := <-errCh: + if err == http.ErrServerClosed { + return nil + } + return err + } +} diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/agent/server_test.go new file mode 100644 index 00000000000..54ed1501d64 --- /dev/null +++ b/system-tests/lib/cre/environment/agent/server_test.go @@ -0,0 +1,48 @@ +package agent + +import ( + "bytes" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/rs/zerolog" +) + +func TestStartComponentReturnsErrorCodeForUnsupportedSchema(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + handler := server.Handler() + + req := httptest.NewRequest(http.MethodPost, "/v1/components/start", strings.NewReader(`{"schemaVersion":"v0","operation":"StartComponent","payload":{}}`)) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + + handler.ServeHTTP(rr, req) + + if rr.Code != http.StatusBadRequest { + t.Fatalf("expected bad request, got %d", rr.Code) + } + if !strings.Contains(rr.Body.String(), ErrCodeUnsupportedSchema) { + t.Fatalf("expected response to include error code %q, got body: %s", ErrCodeUnsupportedSchema, rr.Body.String()) + } +} + +func TestStartComponentReturnsErrorCodeForUnsupportedComponent(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + handler := server.Handler() + + body := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":{"componentType":"jd"}}`) + req := httptest.NewRequest(http.MethodPost, "/v1/components/start", 
body) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + + handler.ServeHTTP(rr, req) + + if rr.Code != http.StatusBadRequest { + t.Fatalf("expected bad request, got %d", rr.Code) + } + if !strings.Contains(rr.Body.String(), ErrCodeUnsupportedComponent) { + t.Fatalf("expected response to include error code %q, got body: %s", ErrCodeUnsupportedComponent, rr.Body.String()) + } +} diff --git a/system-tests/lib/cre/environment/agent/transport.go b/system-tests/lib/cre/environment/agent/transport.go new file mode 100644 index 00000000000..8d40af035fd --- /dev/null +++ b/system-tests/lib/cre/environment/agent/transport.go @@ -0,0 +1,38 @@ +package agent + +import ( + "fmt" + + "github.com/pelletier/go-toml/v2" +) + +// EncodeForTransport sanitizes arbitrary structs for JSON transport by round-tripping through TOML. +// This drops fields intentionally excluded from TOML (for example runtime handles with toml:"-"). +func EncodeForTransport(v any) (map[string]any, error) { + b, err := toml.Marshal(v) + if err != nil { + return nil, fmt.Errorf("failed to marshal transport payload to TOML: %w", err) + } + + var payload map[string]any + if err := toml.Unmarshal(b, &payload); err != nil { + return nil, fmt.Errorf("failed to unmarshal transport payload from TOML: %w", err) + } + + return payload, nil +} + +// DecodeFromTransport decodes sanitized transport payload into a target type using TOML round-trip. 
+func DecodeFromTransport[T any](payload map[string]any) (*T, error) { + b, err := toml.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("failed to marshal transport payload to TOML: %w", err) + } + + var out T + if err := toml.Unmarshal(b, &out); err != nil { + return nil, fmt.Errorf("failed to unmarshal transport payload into target: %w", err) + } + + return &out, nil +} diff --git a/system-tests/lib/cre/environment/agent/transport_test.go b/system-tests/lib/cre/environment/agent/transport_test.go new file mode 100644 index 00000000000..5a85f52dbc4 --- /dev/null +++ b/system-tests/lib/cre/environment/agent/transport_test.go @@ -0,0 +1,41 @@ +package agent + +import "testing" + +type testNested struct { + Value string `toml:"value"` +} + +type testRuntimeStruct struct { + Name string `toml:"name"` + Nested *testNested `toml:"nested"` + SkipMe string `toml:"-"` +} + +func TestTransportRoundtripDropsTomlIgnoredFields(t *testing.T) { + input := &testRuntimeStruct{ + Name: "abc", + Nested: &testNested{Value: "x"}, + SkipMe: "should-not-travel", + } + + encoded, err := EncodeForTransport(input) + if err != nil { + t.Fatalf("expected no error encoding transport payload, got %v", err) + } + + decoded, err := DecodeFromTransport[testRuntimeStruct](encoded) + if err != nil { + t.Fatalf("expected no error decoding transport payload, got %v", err) + } + + if decoded.Name != "abc" { + t.Fatalf("expected name to roundtrip, got %q", decoded.Name) + } + if decoded.Nested == nil || decoded.Nested.Value != "x" { + t.Fatalf("expected nested value to roundtrip, got %#v", decoded.Nested) + } + if decoded.SkipMe != "" { + t.Fatalf("expected toml-ignored field to be dropped, got %q", decoded.SkipMe) + } +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go new file mode 100644 index 00000000000..70999ad2b7e --- /dev/null +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -0,0 +1,291 @@ +package 
environment + +import ( + "bytes" + "context" + "errors" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "time" + + pkgerrors "github.com/pkg/errors" + "github.com/rs/zerolog" + + cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +const ( + componentTypeBlockchain = "blockchain" + envLocalAgentURL = "CRE_LOCAL_AGENT_URL" + envAgentMode = "CRE_AGENT_MODE" +) + +type startComponentEnvelope struct { + SchemaVersion string `json:"schemaVersion"` + Operation string `json:"operation"` + Payload json.RawMessage `json:"payload"` +} + +type startBlockchainRequest struct { + ComponentType string `json:"componentType"` + Blockchain *blockchain.Input `json:"blockchain"` +} + +type startBlockchainResult struct { + BlockchainOutput map[string]any `json:"blockchainOutput"` + AgentLogs []string `json:"agentLogs"` + ErrorCode string `json:"errorCode"` + Error string `json:"error"` +} + +type componentClient interface { + StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) +} + +type httpComponentClient struct { + baseURL string + client *http.Client +} + +func newHTTPComponentClient(baseURL string) *httpComponentClient { + return &httpComponentClient{ + baseURL: baseURL, + client: &http.Client{ + Timeout: 4 * time.Minute, + }, + } +} + +func (c *httpComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { + body, err := json.Marshal(envelope) + if err != nil { + return nil, pkgerrors.Wrap(err, 
"failed to encode start component envelope") + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/components/start", bytes.NewReader(body)) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to create start component request") + } + req.Header.Set("Content-Type", "application/json") + + resp, err := c.client.Do(req) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to execute start component request") + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to read start component response") + } + + var startResp startBlockchainResult + if len(respBody) > 0 { + if err := json.Unmarshal(respBody, &startResp); err != nil { + return nil, pkgerrors.Wrap(err, "failed to decode start component response") + } + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + if startResp.Error != "" { + if startResp.ErrorCode != "" { + return nil, fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + } + return nil, errors.New(startResp.Error) + } + return nil, fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) + } + if startResp.Error != "" { + if startResp.ErrorCode != "" { + return nil, fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + } + return nil, errors.New(startResp.Error) + } + + return &startResp, nil +} + +func newStartComponentClient() (componentClient, error) { + if os.Getenv(envAgentMode) == "ec2" { + return &ec2ComponentClient{}, nil + } + + baseURL := os.Getenv(envLocalAgentURL) + if baseURL == "" { + return nil, fmt.Errorf("%s must be set for remote component startup", envLocalAgentURL) + } + return newHTTPComponentClient(baseURL), nil +} + +type ec2ComponentClient struct{} + +func (c *ec2ComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { + return nil, errors.New("ec2 agent client is not implemented yet") 
+} + +func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { + if output == nil { + return nil, pkgerrors.New("blockchain output is nil") + } + + if output.Type != blockchain.TypeAnvil { + return nil, fmt.Errorf("remote blockchain reconstruction supports only %s in phase 2A, got %s", blockchain.TypeAnvil, output.Type) + } + + return evm.FromOutput(testLogger, output) +} + +func prettifyAgentLogLine(line string) string { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + return "" + } + + var payload map[string]any + if err := json.Unmarshal([]byte(trimmed), &payload); err != nil { + return trimmed + } + + message, _ := payload["message"].(string) + if message == "" { + return trimmed + } + + level, _ := payload["level"].(string) + if level == "" { + level = "info" + } + + cmd, _ := payload["Cmd"].(string) + if cmd != "" { + return fmt.Sprintf("[%s] %s (cmd=%s)", level, message, cmd) + } + + return fmt.Sprintf("[%s] %s", level, message) +} + +func validatePhase2ARemoteBlockchainInput(input *blockchain.Input) error { + if input == nil { + return errors.New("blockchain input is nil") + } + if input.Type != blockchain.TypeAnvil { + return fmt.Errorf("remote target in phase 2A supports only %s, got %s", blockchain.TypeAnvil, input.Type) + } + return nil +} + +func startBlockchainsWithTargets( + ctx context.Context, + testLogger zerolog.Logger, + configuredBlockchains []*config.Blockchain, + deployers map[blockchain.ChainFamily]blockchains.Deployer, +) (*blockchains.DeployedBlockchains, error) { + blockchainInputs, err := config.ResolveBlockchainInputs(configuredBlockchains) + if err != nil { + return nil, err + } + + localIdx := make([]int, 0, len(configuredBlockchains)) + localInputs := make([]*blockchain.Input, 0, len(configuredBlockchains)) + remoteIdx := make([]int, 0, len(configuredBlockchains)) + for idx, configuredBlockchain := range configuredBlockchains { + if configuredBlockchain.Target == 
config.TargetRemote { + remoteIdx = append(remoteIdx, idx) + continue + } + localIdx = append(localIdx, idx) + localInputs = append(localInputs, configuredBlockchain.InputRef()) + } + + outputs := make([]blockchains.Blockchain, len(configuredBlockchains)) + + if len(localInputs) > 0 { + for i, idx := range localIdx { + deployedOutput, err := agent.DeployBlockchainComponent(ctx, deployers, localInputs[i]) + if err != nil { + return nil, err + } + reconstructedBlockchain, err := blockchainFromOutput(testLogger, deployedOutput) + if err != nil { + return nil, err + } + outputs[idx] = reconstructedBlockchain + } + } + + if len(remoteIdx) > 0 { + startClient, err := newStartComponentClient() + if err != nil { + return nil, err + } + + for _, idx := range remoteIdx { + input := blockchainInputs[idx] + if err := validatePhase2ARemoteBlockchainInput(input); err != nil { + return nil, err + } + + payload := startBlockchainRequest{ + ComponentType: componentTypeBlockchain, + Blockchain: input, + } + payloadBytes, err := json.Marshal(payload) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to encode blockchain payload") + } + + response, err := startClient.StartComponent(ctx, startComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: payloadBytes, + }) + if err != nil { + return nil, err + } + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + testLogger.Info().Msgf("[agent] %s", pretty) + } + + blockchainOutput, err := agent.DecodeFromTransport[blockchain.Output](response.BlockchainOutput) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to decode blockchain transport payload") + } + + reconstructedBlockchain, err := blockchainFromOutput(testLogger, blockchainOutput) + if err != nil { + return nil, err + } + outputs[idx] = reconstructedBlockchain + } + } + + cldfBlockchains := make([]cldf_chain.BlockChain, 0, 
len(outputs)) + for _, db := range outputs { + if db == nil { + return nil, pkgerrors.New("blockchain output is nil") + } + chain, chainErr := db.ToCldfChain() + if chainErr != nil { + return nil, pkgerrors.Wrap(chainErr, "failed to create cldf chain from blockchain") + } + cldfBlockchains = append(cldfBlockchains, chain) + } + + return &blockchains.DeployedBlockchains{ + Outputs: outputs, + CldfBlockChains: cldf_chain.NewBlockChainsFromSlice(cldfBlockchains), + }, nil +} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go new file mode 100644 index 00000000000..700edc9122b --- /dev/null +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -0,0 +1,21 @@ +package environment + +import ( + "testing" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" +) + +func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { + if err := validatePhase2ARemoteBlockchainInput(nil); err == nil { + t.Fatalf("expected nil input to fail validation") + } + + if err := validatePhase2ARemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeGeth}); err == nil { + t.Fatalf("expected non-anvil input to fail validation") + } + + if err := validatePhase2ARemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeAnvil}); err != nil { + t.Fatalf("expected anvil input to pass validation, got %v", err) + } +} diff --git a/system-tests/lib/cre/environment/blockchains/evm/evm.go b/system-tests/lib/cre/environment/blockchains/evm/evm.go index c4fc95147d0..8ffe9263c4d 100644 --- a/system-tests/lib/cre/environment/blockchains/evm/evm.go +++ b/system-tests/lib/cre/environment/blockchains/evm/evm.go @@ -135,6 +135,15 @@ func (e *Blockchain) ToCldfChain() (cldf_chain.BlockChain, error) { } func (e *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockchains.Blockchain, error) { + bcOut, err := e.DeployOutput(ctx, input) + if err != nil { + 
return nil, err + } + + return FromOutput(e.testLogger, bcOut) +} + +func (e *Deployer) DeployOutput(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) { var bcOut *blockchain.Output var err error @@ -161,13 +170,21 @@ func (e *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockch } } + return bcOut, nil +} + +func FromOutput(testLogger zerolog.Logger, out *blockchain.Output) (*Blockchain, error) { + if out == nil { + return nil, pkgerrors.New("blockchain output is nil") + } + if keyErr := setDefaultPrivateKeyIfEmpty(); keyErr != nil { return nil, keyErr } priv := os.Getenv("PRIVATE_KEY") sethClient, err := seth.NewClientBuilder(). - WithRpcUrl(bcOut.Nodes[0].ExternalWSUrl). + WithRpcUrl(out.Nodes[0].ExternalWSUrl). WithPrivateKeys([]string{priv}). WithProtections(false, false, seth.MustMakeDuration(time.Second)). Build() @@ -180,18 +197,22 @@ func (e *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockch return nil, pkgerrors.Wrapf(err, "failed to get chain selector for chain id %d", sethClient.Cfg.Network.ChainID) } - chainID, err := strconv.ParseUint(bcOut.ChainID, 10, 64) + chainID, err := strconv.ParseUint(out.ChainID, 10, 64) if err != nil { - return nil, pkgerrors.Wrapf(err, "failed to parse chain id %s", bcOut.ChainID) + return nil, pkgerrors.Wrapf(err, "failed to parse chain id %s", out.ChainID) } + return newBlockchainFromOutput(testLogger, out, sethClient, selector, chainID), nil +} + +func newBlockchainFromOutput(testLogger zerolog.Logger, out *blockchain.Output, sethClient *seth.Client, selector uint64, chainID uint64) *Blockchain { return &Blockchain{ - testLogger: e.testLogger, + testLogger: testLogger, chainSelector: selector, chainID: chainID, - ctfOutput: bcOut, + ctfOutput: out, SethClient: sethClient, - }, nil + } } func setDefaultPrivateKeyIfEmpty() error { diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index 
1339c7891e2..96b39edfdc6 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -57,7 +57,7 @@ func (c *Config) SetAddresses(refs []datastore.AddressRef) error { } type Config struct { - Blockchains []*blockchain.Input `toml:"blockchains" validate:"required"` + Blockchains []*Blockchain `toml:"blockchains" validate:"required"` NodeSets []*cre.NodeSet `toml:"nodesets" validate:"required"` JD *jd.Input `toml:"jd" validate:"required"` Infra *infra.Provider `toml:"infra" validate:"required"` @@ -71,6 +71,65 @@ type Config struct { loaded bool } +type ComponentTarget string + +const ( + TargetDocker ComponentTarget = "docker" + TargetRemote ComponentTarget = "remote" +) + +// Blockchain wraps the existing CTF blockchain input and adds placement metadata. +// The embedded input keeps TOML fields backward-compatible. +type Blockchain struct { + blockchain.Input + Target ComponentTarget `toml:"target"` +} + +func (b *Blockchain) Normalize() { + if b.Target == "" { + b.Target = TargetDocker + } +} + +func (b *Blockchain) Validate() error { + if b == nil { + return errors.New("blockchain is nil") + } + + b.Normalize() + if b.Target != TargetDocker && b.Target != TargetRemote { + return fmt.Errorf("invalid blockchain target: %s", b.Target) + } + + return nil +} + +func (b *Blockchain) InputRef() *blockchain.Input { + if b == nil { + return nil + } + return &b.Input +} + +func (c *Config) EffectiveBlockchains() ([]*blockchain.Input, error) { + return ResolveBlockchainInputs(c.Blockchains) +} + +func ResolveBlockchainInputs(blockchains []*Blockchain) ([]*blockchain.Input, error) { + if len(blockchains) == 0 { + return nil, errors.New("at least one blockchain must be configured") + } + + inputs := make([]*blockchain.Input, 0, len(blockchains)) + for _, configuredBlockchain := range blockchains { + if err := configuredBlockchain.Validate(); err != nil { + return nil, err + } + inputs = append(inputs, 
configuredBlockchain.InputRef()) + } + return inputs, nil +} + // Validate performs validation checks on the configuration, ensuring all required fields // are present and all referenced capabilities are known to the system. func (c *Config) Validate(envDependencies cre.CLIEnvironmentDependencies) error { @@ -78,8 +137,8 @@ func (c *Config) Validate(envDependencies cre.CLIEnvironmentDependencies) error return errors.New("jd.csa_encryption_key must be provided") } - if len(c.Blockchains) == 0 { - return errors.New("at least one blockchain must be configured") + if _, err := c.EffectiveBlockchains(); err != nil { + return err } if len(c.NodeSets) == 0 { @@ -182,8 +241,13 @@ func (c *Config) Load(absPath string) error { return errors.Wrap(loadErr, "failed to load environment configuration") } + effectiveBlockchains, effErr := in.EffectiveBlockchains() + if effErr != nil { + return errors.Wrap(effErr, "failed to resolve blockchains") + } + for _, nodeSet := range in.NodeSets { - if err := nodeSet.ValidateChainCapabilities(in.Blockchains); err != nil { + if err := nodeSet.ValidateChainCapabilities(effectiveBlockchains); err != nil { return errors.Wrap(err, "failed to validate chain capabilities") } } diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index ed5d77694a8..fc1b73f168e 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -54,7 +54,7 @@ type SetupOutput struct { type SetupInput struct { NodeSets []*cre.NodeSet - BlockchainsInput []*blockchain.Input + Blockchains []*config.Blockchain JdInput *jd.Input Provider infra.Provider ContractVersions map[cre.ContractType]*semver.Version @@ -87,7 +87,7 @@ func (s *SetupInput) Validate() error { return pkgerrors.New("at least one nodeSet is required") } - if len(s.BlockchainsInput) == 0 { + if len(s.Blockchains) == 0 { return pkgerrors.New("at least one blockchain is required") } @@ -127,13 +127,12 
@@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(s3Err, "failed to start S3 provider") } - fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.BlockchainsInput)))) + fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) - deployedBlockchains, startErr := blockchains.Start( + deployedBlockchains, startErr := startBlockchainsWithTargets( ctx, testLogger, - singleFileLogger, - input.BlockchainsInput, + input.Blockchains, input.BlockchainDeployers, ) if startErr != nil { @@ -459,14 +458,17 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, input.NodeSets[idx].Out = nsOut.Output } - for idx, blockchain := range blockchains { - input.BlockchainsInput[idx].Out = blockchain.CtfOutput() + for idx, deployedBlockchain := range blockchains { + if idx < len(input.Blockchains) && input.Blockchains[idx] != nil { + input.Blockchains[idx].Out = deployedBlockchain.CtfOutput() + } } // append the jd output, so that later it can be stored in the cached output, so that we can use the environment again without running setup input.JdInput.Out = jdOutput } + func newCldfEnvironment(ctx context.Context, singleFileLogger logger.Logger, cldfBlockchains cldf_chain.BlockChains) *cldf.Environment { allChainsCLDEnvironment := &cldf.Environment{ Name: cre.EnvironmentName, diff --git a/system-tests/lib/cre/environment/state.go b/system-tests/lib/cre/environment/state.go index 98bbbd3e17e..e92032b2565 100644 --- a/system-tests/lib/cre/environment/state.go +++ b/system-tests/lib/cre/environment/state.go @@ -38,11 +38,15 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp } blockchainDeployers := blockchain_sets.NewDeployerSet(framework.L, cachedInput.Infra) + effectiveBlockchains, effErr := cachedInput.EffectiveBlockchains() + if effErr != nil { + return nil, nil, errors.Wrap(effErr, "failed to resolve cached blockchain 
inputs") + } deployedBlockchains, startErr := blockchains.Start( ctx, framework.L, cldLogger, - cachedInput.Blockchains, + effectiveBlockchains, blockchainDeployers, ) if startErr != nil { diff --git a/system-tests/tests/load/cre/workflow_don_load_test.go b/system-tests/tests/load/cre/workflow_don_load_test.go index 5b03b6b44d8..ab53166bc84 100644 --- a/system-tests/tests/load/cre/workflow_don_load_test.go +++ b/system-tests/tests/load/cre/workflow_don_load_test.go @@ -132,10 +132,15 @@ func setupLoadTestEnvironment( jobSpecFactoryFns []cretypes.JobSpecFn, workflowJobsFn cretypes.JobSpecFn, ) *loadTestSetupOutput { + blockchains := make([]*envconfig.Blockchain, 0, len(in.Blockchains)) + for _, bc := range in.Blockchains { + blockchains = append(blockchains, &envconfig.Blockchain{Input: *bc}) + } + universalSetupInput := creenv.SetupInput{ NodeSets: mustSetCapabilitiesFn(in.NodeSets), CapabilitiesContractFactoryFunctions: capabilityFactoryFns, - BlockchainsInput: in.Blockchains, + Blockchains: blockchains, JdInput: in.JD, Provider: *in.Infra, JobSpecFactoryFunctions: jobSpecFactoryFns, diff --git a/system-tests/tests/load/cre/writer_don_load_test.go b/system-tests/tests/load/cre/writer_don_load_test.go index 845affbb1c1..c97cbf1deec 100644 --- a/system-tests/tests/load/cre/writer_don_load_test.go +++ b/system-tests/tests/load/cre/writer_don_load_test.go @@ -51,6 +51,7 @@ import ( cretypes "github.com/smartcontractkit/chainlink/system-tests/lib/cre" libcontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" + creenvconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" creevm "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" blockchain_sets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" 
@@ -87,10 +88,15 @@ func setupLoadTestWriterEnvironment( feedIDs []string, workflowNames []string, ) *loadTestSetupOutput { + blockchains := make([]*creenvconfig.Blockchain, 0, len(in.Blockchains)) + for _, bc := range in.Blockchains { + blockchains = append(blockchains, &creenvconfig.Blockchain{Input: *bc}) + } + universalSetupInput := creenv.SetupInput{ NodeSets: mustSetCapabilitiesFn(in.NodeSets), CapabilitiesContractFactoryFunctions: capabilityFactoryFns, - BlockchainsInput: in.Blockchains, + Blockchains: blockchains, JdInput: in.JD, Provider: *in.Infra, JobSpecFactoryFunctions: jobSpecFactoryFns, From e090c1cef189418cba5063ee1558452bc05c37d4 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 17 Feb 2026 18:20:07 +0100 Subject: [PATCH 02/34] working version of anvil-deploying ec2 agent --- .../adapters/blockchain_adapter.go | 127 +++++++ .../adapters/blockchain_adapter_test.go | 50 +++ .../environment/adapters/tunnel_adapter.go | 10 + .../lib/cre/environment/blockchain_start.go | 326 +++++++++++++++++- .../cre/environment/blockchain_start_test.go | 173 ++++++++++ .../lib/cre/environment/environment.go | 36 ++ .../lib/cre/environment/setup_output_test.go | 36 ++ .../cre/environment/tunnel/component_id.go | 27 ++ .../environment/tunnel/component_id_test.go | 17 + .../lib/cre/environment/tunnel/manager.go | 104 ++++++ .../cre/environment/tunnel/manager_test.go | 85 +++++ .../cre/environment/tunnel/noop_manager.go | 24 ++ .../cre/environment/tunnel/provider_ssm.go | 168 +++++++++ .../lib/cre/environment/tunnel/tunnel.go | 30 ++ 14 files changed, 1194 insertions(+), 19 deletions(-) create mode 100644 system-tests/lib/cre/environment/adapters/blockchain_adapter.go create mode 100644 system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go create mode 100644 system-tests/lib/cre/environment/adapters/tunnel_adapter.go create mode 100644 system-tests/lib/cre/environment/setup_output_test.go create mode 100644 
system-tests/lib/cre/environment/tunnel/component_id.go create mode 100644 system-tests/lib/cre/environment/tunnel/component_id_test.go create mode 100644 system-tests/lib/cre/environment/tunnel/manager.go create mode 100644 system-tests/lib/cre/environment/tunnel/manager_test.go create mode 100644 system-tests/lib/cre/environment/tunnel/noop_manager.go create mode 100644 system-tests/lib/cre/environment/tunnel/provider_ssm.go create mode 100644 system-tests/lib/cre/environment/tunnel/tunnel.go diff --git a/system-tests/lib/cre/environment/adapters/blockchain_adapter.go b/system-tests/lib/cre/environment/adapters/blockchain_adapter.go new file mode 100644 index 00000000000..0298203bc3e --- /dev/null +++ b/system-tests/lib/cre/environment/adapters/blockchain_adapter.go @@ -0,0 +1,127 @@ +package adapters + +import ( + "fmt" + "net/url" + "strconv" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +type BlockchainAdapter struct{} + +func NewBlockchainAdapter() *BlockchainAdapter { + return &BlockchainAdapter{} +} + +func (a *BlockchainAdapter) DescribeEndpoints(componentID string, output *blockchain.Output) ([]tunnel.EndpointRef, error) { + if output == nil { + return nil, fmt.Errorf("blockchain output is nil") + } + + refs := make([]tunnel.EndpointRef, 0, len(output.Nodes)*2) + for idx := range output.Nodes { + node := output.Nodes[idx] + + httpRef, err := endpointFromURL(componentID, fmt.Sprintf("node-%d-http", idx), node.ExternalHTTPUrl) + if err != nil { + return nil, err + } + if httpRef != nil { + refs = append(refs, *httpRef) + } + + wsRef, err := endpointFromURL(componentID, fmt.Sprintf("node-%d-ws", idx), node.ExternalWSUrl) + if err != nil { + return nil, err + } + if wsRef != nil { + refs = append(refs, *wsRef) + } + } + + return refs, nil +} + +func (a *BlockchainAdapter) RewriteWithBindings(output *blockchain.Output, bindings 
[]tunnel.TunnelBinding) error { + if output == nil { + return fmt.Errorf("blockchain output is nil") + } + + byName := make(map[string]tunnel.TunnelBinding, len(bindings)) + for _, b := range bindings { + byName[b.EndpointName] = b + } + + for idx := range output.Nodes { + httpKey := fmt.Sprintf("node-%d-http", idx) + if output.Nodes[idx].ExternalHTTPUrl != "" { + b, ok := byName[httpKey] + if !ok { + return fmt.Errorf("missing tunnel binding for %s", httpKey) + } + output.Nodes[idx].ExternalHTTPUrl = b.LocalURL + } + + wsKey := fmt.Sprintf("node-%d-ws", idx) + if output.Nodes[idx].ExternalWSUrl != "" { + b, ok := byName[wsKey] + if !ok { + return fmt.Errorf("missing tunnel binding for %s", wsKey) + } + output.Nodes[idx].ExternalWSUrl = b.LocalURL + } + } + + return nil +} + +func endpointFromURL(componentID, endpointName, rawURL string) (*tunnel.EndpointRef, error) { + if rawURL == "" { + return nil, nil + } + + parsed, err := url.Parse(rawURL) + if err != nil { + return nil, fmt.Errorf("failed to parse endpoint url %q: %w", rawURL, err) + } + + host := parsed.Hostname() + if host == "" { + return nil, fmt.Errorf("endpoint url %q has empty hostname", rawURL) + } + + port, err := resolvePort(parsed) + if err != nil { + return nil, err + } + + return &tunnel.EndpointRef{ + ComponentID: componentID, + EndpointName: endpointName, + Scheme: parsed.Scheme, + Host: host, + Port: port, + OriginalURL: rawURL, + }, nil +} + +func resolvePort(parsed *url.URL) (int, error) { + if parsed.Port() != "" { + port, err := strconv.Atoi(parsed.Port()) + if err != nil || port <= 0 || port > 65535 { + return 0, fmt.Errorf("url %q has invalid port %q", parsed.String(), parsed.Port()) + } + return port, nil + } + + switch parsed.Scheme { + case "http", "ws": + return 80, nil + case "https", "wss": + return 443, nil + default: + return 0, fmt.Errorf("url %q has unsupported scheme %q without explicit port", parsed.String(), parsed.Scheme) + } +} diff --git 
a/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go b/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go new file mode 100644 index 00000000000..c7dc743661c --- /dev/null +++ b/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go @@ -0,0 +1,50 @@ +package adapters + +import ( + "testing" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +func TestBlockchainAdapterDescribeAndRewrite(t *testing.T) { + adapter := NewBlockchainAdapter() + out := &blockchain.Output{ + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: "http://10.0.0.10:8545", + ExternalWSUrl: "ws://10.0.0.10:8546", + }, + }, + } + + refs, err := adapter.DescribeEndpoints("blockchain:0:anvil", out) + if err != nil { + t.Fatalf("expected describe to succeed: %v", err) + } + if len(refs) != 2 { + t.Fatalf("expected two endpoint refs, got %d", len(refs)) + } + + bindings := []tunnel.TunnelBinding{ + { + EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-http"}, + LocalURL: "http://127.0.0.1:18080", + }, + { + EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-ws"}, + LocalURL: "ws://127.0.0.1:18081", + }, + } + + if err := adapter.RewriteWithBindings(out, bindings); err != nil { + t.Fatalf("expected rewrite to succeed: %v", err) + } + + if out.Nodes[0].ExternalHTTPUrl != "http://127.0.0.1:18080" { + t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) + } + if out.Nodes[0].ExternalWSUrl != "ws://127.0.0.1:18081" { + t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) + } +} diff --git a/system-tests/lib/cre/environment/adapters/tunnel_adapter.go b/system-tests/lib/cre/environment/adapters/tunnel_adapter.go new file mode 100644 index 00000000000..9d838831b30 --- /dev/null +++ b/system-tests/lib/cre/environment/adapters/tunnel_adapter.go @@ -0,0 +1,10 @@ +package adapters + +import 
( + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +type TunnelAdapter[T any] interface { + DescribeEndpoints(componentID string, output *T) ([]tunnel.EndpointRef, error) + RewriteWithBindings(output *T, bindings []tunnel.TunnelBinding) error +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 70999ad2b7e..7422b772ef1 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -7,26 +7,39 @@ import ( "encoding/json" "fmt" "io" + "net" "net/http" + "net/url" "os" + "os/exec" + "strconv" "strings" "time" + retry "github.com/avast/retry-go/v4" pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" + "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/adapters" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) const ( componentTypeBlockchain = "blockchain" envLocalAgentURL = "CRE_LOCAL_AGENT_URL" + envEC2AgentURL = "CRE_EC2_AGENT_URL" + envEC2InstanceID = "CRE_EC2_INSTANCE_ID" + envEC2AgentPort = "CRE_EC2_AGENT_PORT" envAgentMode = "CRE_AGENT_MODE" + ec2Region = "us-west-2" + defaultEC2AgentPort = 8080 ) type startComponentEnvelope struct { @@ -52,71 +65,163 @@ type componentClient interface { } type httpComponentClient struct { - baseURL string - client *http.Client + 
baseURL string + client *http.Client + maxAttempts int + retryDelay time.Duration + checkHealth bool } func newHTTPComponentClient(baseURL string) *httpComponentClient { return &httpComponentClient{ - baseURL: baseURL, + baseURL: baseURL, client: &http.Client{ Timeout: 4 * time.Minute, }, + maxAttempts: 1, + retryDelay: 0, + checkHealth: false, + } +} + +func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { + return &httpComponentClient{ + baseURL: baseURL, + client: &http.Client{ + Timeout: 4 * time.Minute, + }, + maxAttempts: 3, + retryDelay: 2 * time.Second, + checkHealth: true, } } func (c *httpComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { + if c.checkHealth { + if err := c.waitForHealth(ctx); err != nil { + return nil, err + } + } + + var result *startBlockchainResult + err := retry.Do( + func() error { + var err error + result, err = c.startComponentOnce(ctx, envelope) + return err + }, + retry.Attempts(uint(c.maxAttempts)), + retry.Delay(c.retryDelay), + retry.Context(ctx), + retry.LastErrorOnly(true), + ) + if err != nil { + return nil, err + } + + return result, nil +} + +func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { body, err := json.Marshal(envelope) if err != nil { - return nil, pkgerrors.Wrap(err, "failed to encode start component envelope") + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to encode start component envelope")) } req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/components/start", bytes.NewReader(body)) if err != nil { - return nil, pkgerrors.Wrap(err, "failed to create start component request") + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create start component request")) } req.Header.Set("Content-Type", "application/json") resp, err := c.client.Do(req) if err != nil { - return nil, 
pkgerrors.Wrap(err, "failed to execute start component request") + if isRetriableNetworkError(err) { + return nil, pkgerrors.Wrap(err, "failed to execute start component request") + } + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to execute start component request")) } defer resp.Body.Close() respBody, err := io.ReadAll(resp.Body) if err != nil { - return nil, pkgerrors.Wrap(err, "failed to read start component response") + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to read start component response")) } var startResp startBlockchainResult if len(respBody) > 0 { if err := json.Unmarshal(respBody, &startResp); err != nil { - return nil, pkgerrors.Wrap(err, "failed to decode start component response") + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start component response")) } } if resp.StatusCode < 200 || resp.StatusCode >= 300 { if startResp.Error != "" { if startResp.ErrorCode != "" { - return nil, fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + err = fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + } else { + err = errors.New(startResp.Error) } - return nil, errors.New(startResp.Error) + } else { + err = fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) + } + + if isRetriableStatus(resp.StatusCode) { + return nil, err } - return nil, fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) + return nil, retry.Unrecoverable(err) } if startResp.Error != "" { if startResp.ErrorCode != "" { - return nil, fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + return nil, retry.Unrecoverable(fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error)) } - return nil, errors.New(startResp.Error) + return nil, retry.Unrecoverable(errors.New(startResp.Error)) } return &startResp, nil } -func newStartComponentClient() (componentClient, error) { +func (c *httpComponentClient) waitForHealth(ctx 
context.Context) error { + return retry.Do( + func() error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+"/v1/health", nil) + if err != nil { + return retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create health request")) + } + + resp, err := c.client.Do(req) + if err != nil { + return pkgerrors.Wrap(err, "failed to execute health request") + } + _ = resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + return fmt.Errorf("agent health check returned status %s", resp.Status) + }, + retry.Attempts(uint(c.maxAttempts)), + retry.Delay(c.retryDelay), + retry.Context(ctx), + retry.LastErrorOnly(true), + ) +} + +func isRetriableStatus(statusCode int) bool { + return statusCode == http.StatusBadGateway || statusCode == http.StatusServiceUnavailable || statusCode == http.StatusGatewayTimeout +} + +func isRetriableNetworkError(err error) bool { + var netErr net.Error + return errors.As(err, &netErr) +} + +func newStartComponentClient(testLogger zerolog.Logger) (componentClient, error) { if os.Getenv(envAgentMode) == "ec2" { - return &ec2ComponentClient{}, nil + baseURL, err := resolveEC2AgentBaseURL(testLogger) + if err != nil { + return nil, err + } + return newEC2HTTPComponentClient(baseURL), nil } baseURL := os.Getenv(envLocalAgentURL) @@ -126,10 +231,87 @@ func newStartComponentClient() (componentClient, error) { return newHTTPComponentClient(baseURL), nil } -type ec2ComponentClient struct{} +func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { + if configured := os.Getenv(envEC2AgentURL); configured != "" { + return configured, nil + } + + instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) + if instanceID == "" { + return "", fmt.Errorf("%s must be set when %s=ec2 and %s is not provided", envEC2InstanceID, envAgentMode, envEC2AgentURL) + } + + remotePort := defaultEC2AgentPort + if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { + parsedPort, err 
:= strconv.Atoi(configuredPort) + if err != nil || parsedPort <= 0 || parsedPort > 65535 { + return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) + } + remotePort = parsedPort + } + + localPort, err := reserveLocalPort() + if err != nil { + return "", pkgerrors.Wrap(err, "failed to allocate local port for ssm tunnel") + } + + if err := startSSMPortForward(testLogger, instanceID, remotePort, localPort); err != nil { + return "", err + } + + testLogger.Info(). + Str("instanceID", instanceID). + Int("remotePort", remotePort). + Int("localPort", localPort). + Msg("Opened SSM tunnel to EC2 agent") + + return fmt.Sprintf("http://127.0.0.1:%d", localPort), nil +} + +func reserveLocalPort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + defer l.Close() + + tcpAddr, ok := l.Addr().(*net.TCPAddr) + if !ok { + return 0, errors.New("listener addr is not tcp") + } + return tcpAddr.Port, nil +} -func (c *ec2ComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { - return nil, errors.New("ec2 agent client is not implemented yet") +func startSSMPortForward(testLogger zerolog.Logger, instanceID string, remotePort, localPort int) error { + cmd := exec.Command( + "aws", + "ssm", + "start-session", + "--region", ec2Region, + "--target", instanceID, + "--document-name", "AWS-StartPortForwardingSession", + "--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", remotePort, localPort), + ) + if testLogger.GetLevel() <= zerolog.DebugLevel { + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + testLogger.Debug(). + Strs("cmd", cmd.Args). + Msg("Starting SSM agent tunnel command") + } + testLogger.Info(). + Str("instanceID", instanceID). + Int("remotePort", remotePort). + Int("localPort", localPort). 
+ Msg("Opening SSM tunnel to EC2 agent") + + if err := cmd.Start(); err != nil { + return pkgerrors.Wrap(err, "failed to start aws ssm port forwarding session") + } + go func() { + _ = cmd.Wait() + }() + return nil } func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { @@ -188,6 +370,7 @@ func startBlockchainsWithTargets( testLogger zerolog.Logger, configuredBlockchains []*config.Blockchain, deployers map[blockchain.ChainFamily]blockchains.Deployer, + tunnelManager tunnel.Manager, ) (*blockchains.DeployedBlockchains, error) { blockchainInputs, err := config.ResolveBlockchainInputs(configuredBlockchains) if err != nil { @@ -223,7 +406,7 @@ func startBlockchainsWithTargets( } if len(remoteIdx) > 0 { - startClient, err := newStartComponentClient() + startClient, err := newStartComponentClient(testLogger) if err != nil { return nil, err } @@ -264,6 +447,10 @@ func startBlockchainsWithTargets( return nil, pkgerrors.Wrap(err, "failed to decode blockchain transport payload") } + if err := rewriteRemoteBlockchainOutputForLocalAccess(ctx, testLogger, tunnelManager, idx, input, blockchainOutput); err != nil { + return nil, err + } + reconstructedBlockchain, err := blockchainFromOutput(testLogger, blockchainOutput) if err != nil { return nil, err @@ -289,3 +476,104 @@ func startBlockchainsWithTargets( CldfBlockChains: cldf_chain.NewBlockChainsFromSlice(cldfBlockchains), }, nil } + +func newEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { + if os.Getenv(envAgentMode) != "ec2" { + return tunnel.NewNoopManager(), nil + } + + instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) + if instanceID == "" { + // Keep compatibility with pure manual-tunneling mode. 
+ return tunnel.NewNoopManager(), nil + } + + return tunnel.NewManager(tunnel.NewSSMProvider(instanceID, ec2Region, testLogger)), nil +} + +func rewriteRemoteBlockchainOutputForLocalAccess( + ctx context.Context, + testLogger zerolog.Logger, + tunnelManager tunnel.Manager, + configuredIndex int, + input *blockchain.Input, + output *blockchain.Output, +) error { + if output == nil { + return nil + } + + componentID := tunnel.CanonicalComponentID(tunnel.KindBlockchain, configuredIndex, input.Type) + adapter := adapters.NewBlockchainAdapter() + + refs, err := adapter.DescribeEndpoints(componentID, output) + if err != nil { + return pkgerrors.Wrap(err, "failed to describe blockchain tunnel endpoints") + } + + bindings, err := tunnelManager.Start(ctx, refs) + if err != nil { + return pkgerrors.Wrap(err, "failed to start tunnels for blockchain output") + } + for _, binding := range bindings { + testLogger.Info(). + Str("componentID", binding.ComponentID). + Str("endpointName", binding.EndpointName). + Str("originalURL", binding.OriginalURL). + Str("localURL", binding.LocalURL). 
+ Msg("Established endpoint tunnel") + } + + if err := adapter.RewriteWithBindings(output, bindings); err != nil { + return pkgerrors.Wrap(err, "failed to rewrite blockchain output with local tunnel bindings") + } + if err := rewriteBlockchainInternalURLsForLocalNodes(output); err != nil { + return pkgerrors.Wrap(err, "failed to rewrite blockchain internal urls for local node containers") + } + + return nil +} + +func rewriteBlockchainInternalURLsForLocalNodes(output *blockchain.Output) error { + if output == nil { + return nil + } + + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + for _, node := range output.Nodes { + if node == nil { + continue + } + + if node.ExternalHTTPUrl != "" { + internal, err := rewriteURLHost(node.ExternalHTTPUrl, dockerHost) + if err != nil { + return err + } + node.InternalHTTPUrl = internal + } + + if node.ExternalWSUrl != "" { + internal, err := rewriteURLHost(node.ExternalWSUrl, dockerHost) + if err != nil { + return err + } + node.InternalWSUrl = internal + } + } + + return nil +} + +func rewriteURLHost(rawURL, host string) (string, error) { + parsed, err := url.Parse(rawURL) + if err != nil { + return "", fmt.Errorf("failed to parse url %q: %w", rawURL, err) + } + if parsed.Port() != "" { + parsed.Host = net.JoinHostPort(host, parsed.Port()) + return parsed.String(), nil + } + parsed.Host = host + return parsed.String(), nil +} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index 700edc9122b..b63c5b1883e 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -1,9 +1,14 @@ package environment import ( + "context" + "os" + "strings" "testing" + "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) 
func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { @@ -19,3 +24,171 @@ func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { t.Fatalf("expected anvil input to pass validation, got %v", err) } } + +func TestNewStartComponentClientEC2Mode(t *testing.T) { + t.Setenv(envAgentMode, "ec2") + t.Setenv(envLocalAgentURL, "") + t.Setenv(envEC2AgentURL, "") + t.Setenv(envEC2InstanceID, "") + + if _, err := newStartComponentClient(zerolog.Nop()); err == nil { + t.Fatalf("expected ec2 mode without %s or %s to fail", envEC2AgentURL, envEC2InstanceID) + } + + t.Setenv(envEC2AgentURL, "http://127.0.0.1:18080") // manual tunnel override + client, err := newStartComponentClient(zerolog.Nop()) + if err != nil { + t.Fatalf("expected ec2 mode client to be created, got %v", err) + } + + httpClient, ok := client.(*httpComponentClient) + if !ok { + t.Fatalf("expected httpComponentClient, got %T", client) + } + if !httpClient.checkHealth { + t.Fatalf("expected ec2 client to enable health checks") + } + if httpClient.maxAttempts != 3 { + t.Fatalf("expected ec2 client retries to be enabled") + } +} + +func TestResolveEC2AgentBaseURLRequiresInstanceIDWhenURLMissing(t *testing.T) { + t.Setenv(envEC2AgentURL, "") + t.Setenv(envEC2InstanceID, "") + t.Setenv(envEC2AgentPort, "") + + _, err := resolveEC2AgentBaseURL(zerolog.Nop()) + if err == nil { + t.Fatalf("expected missing %s to fail when %s is not set", envEC2InstanceID, envEC2AgentURL) + } +} + +func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { + t.Setenv(envEC2AgentURL, "") + t.Setenv(envEC2InstanceID, "i-123") + t.Setenv(envEC2AgentPort, "not-a-port") + + _, err := resolveEC2AgentBaseURL(zerolog.Nop()) + if err == nil { + t.Fatalf("expected invalid %s to fail", envEC2AgentPort) + } + if !strings.Contains(err.Error(), envEC2AgentPort) { + t.Fatalf("expected error to mention %s, got: %v", envEC2AgentPort, err) + } +} + +func TestNewStartComponentClientLocalMode(t *testing.T) { + t.Setenv(envAgentMode, "") + 
t.Setenv(envEC2AgentURL, "") + t.Setenv(envLocalAgentURL, "") + + if _, err := newStartComponentClient(zerolog.Nop()); err == nil { + t.Fatalf("expected local mode without %s to fail", envLocalAgentURL) + } + + t.Setenv(envLocalAgentURL, "http://127.0.0.1:8080") + client, err := newStartComponentClient(zerolog.Nop()) + if err != nil { + t.Fatalf("expected local mode client to be created, got %v", err) + } + + httpClient, ok := client.(*httpComponentClient) + if !ok { + t.Fatalf("expected httpComponentClient, got %T", client) + } + if httpClient.checkHealth { + t.Fatalf("expected local client health checks to be disabled") + } + if httpClient.maxAttempts != 1 { + t.Fatalf("expected local client retries to be disabled") + } + + if os.Getenv(envLocalAgentURL) == "" { + t.Fatalf("expected local agent url to remain set") + } +} + +type fakeTunnelManager struct { + startCalls int +} + +func (f *fakeTunnelManager) Start(_ context.Context, refs []tunnel.EndpointRef) ([]tunnel.TunnelBinding, error) { + f.startCalls++ + bindings := make([]tunnel.TunnelBinding, 0, len(refs)) + for i, ref := range refs { + bindings = append(bindings, tunnel.TunnelBinding{ + EndpointRef: ref, + LocalPort: 19000 + i, + LocalURL: map[string]string{ + "http": "http://127.0.0.1:19000", + "ws": "ws://127.0.0.1:19001", + }[ref.Scheme], + }) + } + return bindings, nil +} + +func (f *fakeTunnelManager) Stop(_ context.Context) error { return nil } +func (f *fakeTunnelManager) IsStarted() bool { return f.startCalls > 0 } + +func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { + out := &blockchain.Output{ + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: "http://10.0.0.10:8545", + ExternalWSUrl: "ws://10.0.0.10:8546", + }, + }, + } + manager := &fakeTunnelManager{} + + if err := rewriteRemoteBlockchainOutputForLocalAccess( + context.Background(), + zerolog.Nop(), + manager, + 0, + &blockchain.Input{Type: blockchain.TypeAnvil}, + out, + ); err != nil { + t.Fatalf("expected rewrite helper 
to succeed: %v", err) + } + + if manager.startCalls != 1 { + t.Fatalf("expected tunnel manager start to be called once, got %d", manager.startCalls) + } + if out.Nodes[0].ExternalHTTPUrl != "http://127.0.0.1:19000" { + t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) + } + if out.Nodes[0].ExternalWSUrl != "ws://127.0.0.1:19001" { + t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) + } + if out.Nodes[0].InternalHTTPUrl == "" || !strings.Contains(out.Nodes[0].InternalHTTPUrl, ":19000") { + t.Fatalf("expected internal http url to be rewritten for docker host access, got %s", out.Nodes[0].InternalHTTPUrl) + } + if out.Nodes[0].InternalWSUrl == "" || !strings.Contains(out.Nodes[0].InternalWSUrl, ":19001") { + t.Fatalf("expected internal ws url to be rewritten for docker host access, got %s", out.Nodes[0].InternalWSUrl) + } +} + +func TestNewEC2TunnelManagerReturnsNoopWhenNotApplicable(t *testing.T) { + t.Setenv(envAgentMode, "") + t.Setenv(envEC2InstanceID, "") + manager, err := newEC2TunnelManager(zerolog.Nop()) + if err != nil { + t.Fatalf("expected noop manager for local mode, got error: %v", err) + } + if manager.IsStarted() { + t.Fatalf("expected noop manager to report not started") + } + + t.Setenv(envAgentMode, "ec2") + t.Setenv(envEC2InstanceID, "") + manager, err = newEC2TunnelManager(zerolog.Nop()) + if err != nil { + t.Fatalf("expected noop manager for ec2 mode without instance, got error: %v", err) + } + if manager.IsStarted() { + t.Fatalf("expected noop manager to report not started") + } +} diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index fc1b73f168e..1d3354e1aac 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -6,6 +6,7 @@ import ( "fmt" "maps" "os" + "sync" "github.com/Masterminds/semver/v3" "github.com/ethereum/go-ethereum/common" @@ -36,6 +37,7 @@ import ( 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/sharding" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" libformat "github.com/smartcontractkit/chainlink/system-tests/lib/format" @@ -50,6 +52,26 @@ type SetupOutput struct { NodeOutput []*cre.NodeSetOutput S3ProviderOutput *s3provider.Output GatewayConnectors *cre.GatewayConnectors + + tunnelManager tunnel.Manager + closeOnce sync.Once + closeErr error +} + +func (s *SetupOutput) Close(ctx context.Context) error { + if s == nil { + return nil + } + manager := s.tunnelManager + if manager == nil { + manager = tunnel.NewNoopManager() + } + + s.closeOnce.Do(func() { + s.closeErr = manager.Stop(ctx) + }) + + return s.closeErr } type SetupInput struct { @@ -127,6 +149,11 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(s3Err, "failed to start S3 provider") } + tunnelManager, tmErr := newEC2TunnelManager(testLogger) + if tmErr != nil { + return nil, pkgerrors.Wrap(tmErr, "failed to initialize tunnel manager") + } + fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) deployedBlockchains, startErr := startBlockchainsWithTargets( @@ -134,10 +161,17 @@ func SetupTestEnvironment( testLogger, input.Blockchains, input.BlockchainDeployers, + tunnelManager, ) if startErr != nil { return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") } + cleanupTunnelsOnError := true + defer func() { + if cleanupTunnelsOnError { + _ = tunnelManager.Stop(ctx) + } + }() creEnvironment := &cre.Environment{ Blockchains: deployedBlockchains.Outputs, @@ -442,6 +476,7 @@ func SetupTestEnvironment( return nil, 
pkgerrors.Wrap(err, "failed to store workflow registry configuration output") } + cleanupTunnelsOnError = false return &SetupOutput{ WorkflowRegistryConfigurationOutput: workflowRegistryConfigurationOutput, // pass to caller, so that it can be optionally attached to TestConfig and saved to disk Dons: dons, @@ -449,6 +484,7 @@ func SetupTestEnvironment( CreEnvironment: creEnvironment, S3ProviderOutput: s3Output, GatewayConnectors: topology.GatewayConnectors, + tunnelManager: tunnelManager, }, nil } diff --git a/system-tests/lib/cre/environment/setup_output_test.go b/system-tests/lib/cre/environment/setup_output_test.go new file mode 100644 index 00000000000..9ec774f04fe --- /dev/null +++ b/system-tests/lib/cre/environment/setup_output_test.go @@ -0,0 +1,36 @@ +package environment + +import ( + "context" + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +type countingTunnelManager struct { + stopCalls int +} + +func (c *countingTunnelManager) Start(_ context.Context, _ []tunnel.EndpointRef) ([]tunnel.TunnelBinding, error) { + return nil, nil +} +func (c *countingTunnelManager) Stop(_ context.Context) error { + c.stopCalls++ + return nil +} +func (c *countingTunnelManager) IsStarted() bool { return false } + +func TestSetupOutputCloseIsIdempotent(t *testing.T) { + manager := &countingTunnelManager{} + out := &SetupOutput{tunnelManager: manager} + + if err := out.Close(context.Background()); err != nil { + t.Fatalf("expected first close to succeed: %v", err) + } + if err := out.Close(context.Background()); err != nil { + t.Fatalf("expected second close to succeed: %v", err) + } + if manager.stopCalls != 1 { + t.Fatalf("expected tunnel manager stop once, got %d", manager.stopCalls) + } +} diff --git a/system-tests/lib/cre/environment/tunnel/component_id.go b/system-tests/lib/cre/environment/tunnel/component_id.go new file mode 100644 index 00000000000..2c2a1c1ce68 --- /dev/null +++ 
b/system-tests/lib/cre/environment/tunnel/component_id.go @@ -0,0 +1,27 @@ +package tunnel + +import ( + "fmt" + "strings" +) + +type ComponentKind string + +const ( + KindBlockchain ComponentKind = "blockchain" + KindNodeSet ComponentKind = "nodeset" + KindJD ComponentKind = "jd" +) + +func CanonicalComponentID(kind ComponentKind, index int, name string) string { + if name == "" { + return fmt.Sprintf("%s:%d", kind, index) + } + + normalized := strings.ToLower(strings.TrimSpace(name)) + if normalized == "" { + return fmt.Sprintf("%s:%d", kind, index) + } + + return fmt.Sprintf("%s:%d:%s", kind, index, normalized) +} diff --git a/system-tests/lib/cre/environment/tunnel/component_id_test.go b/system-tests/lib/cre/environment/tunnel/component_id_test.go new file mode 100644 index 00000000000..e3ac740839f --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/component_id_test.go @@ -0,0 +1,17 @@ +package tunnel + +import "testing" + +func TestCanonicalComponentID(t *testing.T) { + if got := CanonicalComponentID(KindBlockchain, 0, "Anvil-Main"); got != "blockchain:0:anvil-main" { + t.Fatalf("unexpected canonical id: %s", got) + } + + if got := CanonicalComponentID(KindJD, 2, ""); got != "jd:2" { + t.Fatalf("unexpected canonical id for empty name: %s", got) + } + + if got := CanonicalComponentID(KindNodeSet, 1, " "); got != "nodeset:1" { + t.Fatalf("unexpected canonical id for whitespace name: %s", got) + } +} diff --git a/system-tests/lib/cre/environment/tunnel/manager.go b/system-tests/lib/cre/environment/tunnel/manager.go new file mode 100644 index 00000000000..4327724c613 --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/manager.go @@ -0,0 +1,104 @@ +package tunnel + +import ( + "context" + "errors" + "fmt" + "sync" +) + +type manager struct { + provider Provider + + mu sync.Mutex + bindings map[string]TunnelBinding +} + +func NewManager(provider Provider) Manager { + return &manager{ + provider: provider, + bindings: make(map[string]TunnelBinding), 
+ } +} + +func (m *manager) Start(ctx context.Context, refs []EndpointRef) ([]TunnelBinding, error) { + m.mu.Lock() + defer m.mu.Unlock() + + started := make([]TunnelBinding, 0, len(refs)) + newlyOpened := make([]TunnelBinding, 0, len(refs)) + + for _, ref := range refs { + key := endpointKey(ref.ComponentID, ref.EndpointName) + if existing, ok := m.bindings[key]; ok { + started = append(started, existing) + continue + } + + if err := validateEndpointRef(ref); err != nil { + _ = m.closeMany(ctx, newlyOpened) + return nil, err + } + + binding, err := m.provider.Open(ctx, ref) + if err != nil { + _ = m.closeMany(ctx, newlyOpened) + return nil, fmt.Errorf("failed to open tunnel via %s for %s/%s: %w", m.provider.Name(), ref.ComponentID, ref.EndpointName, err) + } + + m.bindings[key] = binding + started = append(started, binding) + newlyOpened = append(newlyOpened, binding) + } + + return started, nil +} + +func (m *manager) Stop(ctx context.Context) error { + m.mu.Lock() + defer m.mu.Unlock() + + bindings := make([]TunnelBinding, 0, len(m.bindings)) + for _, b := range m.bindings { + bindings = append(bindings, b) + } + clear(m.bindings) + + return m.closeMany(ctx, bindings) +} + +func (m *manager) IsStarted() bool { + m.mu.Lock() + defer m.mu.Unlock() + return len(m.bindings) > 0 +} + +func (m *manager) closeMany(ctx context.Context, bindings []TunnelBinding) error { + var joined error + for _, b := range bindings { + if err := m.provider.Close(ctx, b); err != nil { + joined = errors.Join(joined, err) + } + } + return joined +} + +func validateEndpointRef(ref EndpointRef) error { + if ref.ComponentID == "" { + return errors.New("endpoint componentID is required") + } + if ref.EndpointName == "" { + return errors.New("endpoint endpointName is required") + } + if ref.Host == "" { + return errors.New("endpoint host is required") + } + if ref.Port <= 0 || ref.Port > 65535 { + return fmt.Errorf("endpoint port %d is invalid", ref.Port) + } + return nil +} + +func 
endpointKey(componentID, endpointName string) string { + return componentID + ":" + endpointName +} diff --git a/system-tests/lib/cre/environment/tunnel/manager_test.go b/system-tests/lib/cre/environment/tunnel/manager_test.go new file mode 100644 index 00000000000..85700d5ba9d --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/manager_test.go @@ -0,0 +1,85 @@ +package tunnel + +import ( + "context" + "testing" +) + +type fakeProvider struct { + openCount int + closeCount int +} + +func (f *fakeProvider) Open(_ context.Context, ref EndpointRef) (TunnelBinding, error) { + f.openCount++ + return TunnelBinding{ + EndpointRef: ref, + LocalPort: 10000 + f.openCount, + LocalURL: "http://127.0.0.1:10000", + }, nil +} + +func (f *fakeProvider) Close(_ context.Context, _ TunnelBinding) error { + f.closeCount++ + return nil +} + +func (f *fakeProvider) Name() string { return "fake" } + +func TestManagerStartDedupsAndStops(t *testing.T) { + provider := &fakeProvider{} + mgr := NewManager(provider) + + refs := []EndpointRef{ + { + ComponentID: "blockchain:0:anvil", + EndpointName: "node-0-http", + Scheme: "http", + Host: "127.0.0.1", + Port: 8545, + }, + { + ComponentID: "blockchain:0:anvil", + EndpointName: "node-0-ws", + Scheme: "ws", + Host: "127.0.0.1", + Port: 8546, + }, + } + + started, err := mgr.Start(context.Background(), refs) + if err != nil { + t.Fatalf("expected start to succeed: %v", err) + } + if len(started) != 2 { + t.Fatalf("expected 2 bindings, got %d", len(started)) + } + if provider.openCount != 2 { + t.Fatalf("expected 2 opens, got %d", provider.openCount) + } + + startedAgain, err := mgr.Start(context.Background(), refs) + if err != nil { + t.Fatalf("expected dedup start to succeed: %v", err) + } + if len(startedAgain) != 2 { + t.Fatalf("expected 2 dedup bindings, got %d", len(startedAgain)) + } + if provider.openCount != 2 { + t.Fatalf("expected no extra open calls after dedup, got %d", provider.openCount) + } + + if !mgr.IsStarted() { + 
t.Fatalf("expected manager to report started") + } + + if err := mgr.Stop(context.Background()); err != nil { + t.Fatalf("expected idempotent stop to succeed: %v", err) + } + if provider.closeCount != 2 { + t.Fatalf("expected 2 closes from stop, got %d", provider.closeCount) + } + if mgr.IsStarted() { + t.Fatalf("expected manager to report no active tunnels after stop") + } +} diff --git a/system-tests/lib/cre/environment/tunnel/noop_manager.go b/system-tests/lib/cre/environment/tunnel/noop_manager.go new file mode 100644 index 00000000000..c88c2260313 --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/noop_manager.go @@ -0,0 +1,24 @@ +package tunnel + +import "context" + +type noopManager struct{} + +func NewNoopManager() Manager { + return &noopManager{} +} + +func (n *noopManager) Start(_ context.Context, refs []EndpointRef) ([]TunnelBinding, error) { + bindings := make([]TunnelBinding, 0, len(refs)) + for _, ref := range refs { + bindings = append(bindings, TunnelBinding{ + EndpointRef: ref, + LocalURL: ref.OriginalURL, + }) + } + return bindings, nil +} + +func (n *noopManager) Stop(_ context.Context) error { return nil } + +func (n *noopManager) IsStarted() bool { return false } diff --git a/system-tests/lib/cre/environment/tunnel/provider_ssm.go b/system-tests/lib/cre/environment/tunnel/provider_ssm.go new file mode 100644 index 00000000000..5ef2ba2021d --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/provider_ssm.go @@ -0,0 +1,168 @@ +package tunnel + +import ( + "context" + "errors" + "fmt" + "net" + "os" + "os/exec" + "sync" + "time" + + "github.com/rs/zerolog" +) + +type SSMProvider struct { + instanceID string + region string + logger zerolog.Logger + + mu sync.Mutex + sessions map[int]*exec.Cmd +} + +func NewSSMProvider(instanceID, region string, logger zerolog.Logger) Provider { + return &SSMProvider{ + instanceID: instanceID, + region: region, + logger: logger, + sessions: make(map[int]*exec.Cmd), + } +} + +func (p *SSMProvider) 
Name() string { + return "ssm" +} + +func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) { + localPort, err := reserveLocalPort() + if err != nil { + return TunnelBinding{}, fmt.Errorf("failed to reserve local port: %w", err) + } + + cmd := exec.Command( + "aws", + "ssm", + "start-session", + "--region", p.region, + "--target", p.instanceID, + "--document-name", "AWS-StartPortForwardingSession", + "--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", ref.Port, localPort), + ) + if p.logger.GetLevel() <= zerolog.DebugLevel { + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + p.logger.Debug(). + Strs("cmd", cmd.Args). + Msg("Starting SSM endpoint tunnel command") + } + + p.logger.Info(). + Str("componentID", ref.ComponentID). + Str("endpointName", ref.EndpointName). + Int("remotePort", ref.Port). + Int("localPort", localPort). + Msg("Opening SSM endpoint tunnel") + + if err := cmd.Start(); err != nil { + return TunnelBinding{}, fmt.Errorf("failed to start aws ssm session: %w", err) + } + if err := waitForLocalPortReady(ctx, localPort, 12*time.Second); err != nil { + _ = cmd.Process.Kill() + return TunnelBinding{}, fmt.Errorf("ssm local tunnel on port %d did not become ready: %w", localPort, err) + } + + p.mu.Lock() + p.sessions[localPort] = cmd + p.mu.Unlock() + + go func() { + _ = cmd.Wait() + }() + + return TunnelBinding{ + EndpointRef: ref, + LocalPort: localPort, + LocalURL: localURLFromScheme(ref.Scheme, localPort), + }, nil +} + +func (p *SSMProvider) Close(_ context.Context, binding TunnelBinding) error { + p.mu.Lock() + cmd, ok := p.sessions[binding.LocalPort] + if ok { + delete(p.sessions, binding.LocalPort) + } + p.mu.Unlock() + + if !ok || cmd == nil || cmd.Process == nil { + return nil + } + + if err := cmd.Process.Kill(); err != nil { + return fmt.Errorf("failed to kill ssm session on local port %d: %w", binding.LocalPort, err) + } + p.logger.Info(). + Str("componentID", binding.ComponentID). 
+ Str("endpointName", binding.EndpointName). + Int("localPort", binding.LocalPort). + Msg("Closed SSM endpoint tunnel") + return nil +} + +func reserveLocalPort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + defer l.Close() + + tcpAddr, ok := l.Addr().(*net.TCPAddr) + if !ok { + return 0, fmt.Errorf("listener addr %T is not tcp", l.Addr()) + } + return tcpAddr.Port, nil +} + +func localURLFromScheme(scheme string, port int) string { + switch scheme { + case "ws": + return fmt.Sprintf("ws://127.0.0.1:%d", port) + case "wss": + return fmt.Sprintf("wss://127.0.0.1:%d", port) + case "https": + return fmt.Sprintf("https://127.0.0.1:%d", port) + default: + return fmt.Sprintf("http://127.0.0.1:%d", port) + } +} + +func waitForLocalPortReady(ctx context.Context, port int, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + address := fmt.Sprintf("127.0.0.1:%d", port) + var lastErr error + + for time.Now().Before(deadline) { + if ctx != nil { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + } + + conn, err := net.DialTimeout("tcp", address, 300*time.Millisecond) + if err == nil { + _ = conn.Close() + return nil + } + lastErr = err + time.Sleep(200 * time.Millisecond) + } + + if lastErr == nil { + lastErr = errors.New("unknown readiness failure") + } + return lastErr +} diff --git a/system-tests/lib/cre/environment/tunnel/tunnel.go b/system-tests/lib/cre/environment/tunnel/tunnel.go new file mode 100644 index 00000000000..b4ea3803607 --- /dev/null +++ b/system-tests/lib/cre/environment/tunnel/tunnel.go @@ -0,0 +1,30 @@ +package tunnel + +import "context" + +type EndpointRef struct { + ComponentID string + EndpointName string + Scheme string + Host string + Port int + OriginalURL string +} + +type TunnelBinding struct { + EndpointRef + LocalPort int + LocalURL string +} + +type Manager interface { + Start(ctx context.Context, refs []EndpointRef) ([]TunnelBinding, error) + Stop(ctx 
context.Context) error + IsStarted() bool +} + +type Provider interface { + Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) + Close(ctx context.Context, binding TunnelBinding) error + Name() string +} From 0278b3e808b54a8f0695a47420e169ccd2d5be09 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 17 Feb 2026 20:39:18 +0100 Subject: [PATCH 03/34] added remote component reuse policy, fixed tunneling process leak --- .../configs/workflow-gateway-don-remote.toml | 19 +- .../environment/environment/environment.go | 191 ++++++++++++++++++ .../lib/cre/environment/agent/server.go | 61 ++++++ .../lib/cre/environment/agent/server_test.go | 96 +++++++++ .../lib/cre/environment/blockchain_start.go | 95 +++------ .../cre/environment/blockchain_start_test.go | 21 +- .../cre/environment/blockchains/evm/evm.go | 1 + .../lib/cre/environment/config/config.go | 16 +- .../cre/environment/config/tunnel_state.go | 107 ++++++++++ .../lib/cre/environment/environment.go | 7 + .../lib/cre/environment/setup_output_test.go | 1 + .../lib/cre/environment/tunnel/manager.go | 11 + .../cre/environment/tunnel/noop_manager.go | 2 + .../cre/environment/tunnel/provider_ssm.go | 23 ++- .../lib/cre/environment/tunnel/tunnel.go | 2 + 15 files changed, 574 insertions(+), 79 deletions(-) create mode 100644 system-tests/lib/cre/environment/config/tunnel_state.go diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml index e86df8a625c..b023c27173c 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml @@ -3,13 +3,15 @@ type = "anvil" chain_id = "1337" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - target = "remote" + #target = "remote" [[blockchains]] type = "anvil" chain_id = "2337" port = "8546" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + target = "remote" + [jd] 
csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string @@ -48,10 +50,10 @@ [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] - docker_ctx = "../../../.." - docker_file = "core/chainlink.Dockerfile" - docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - # image = "chainlink-tmp:latest" + #docker_ctx = "../../../.." + #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-tmp:latest" user_config_overrides = "" [[nodesets]] @@ -71,9 +73,10 @@ [[nodesets.node_specs]] roles = ["bootstrap", "gateway"] [nodesets.node_specs.node] - docker_ctx = "../../../.." - docker_file = "core/chainlink.Dockerfile" - docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + #docker_ctx = "../../../.." + #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-tmp:latest" # 5002 is the web API capabilities port for incoming requests # 15002 is the vault port for incoming requests custom_ports = ["5002:5002","15002:15002"] diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 9ad0428ce14..1ca638ba4c6 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -1,6 +1,7 @@ package environment import ( + "bufio" "context" "crypto/ecdsa" "crypto/rand" @@ -13,6 +14,7 @@ import ( "path/filepath" "runtime/debug" "slices" + "strconv" "strings" "syscall" "time" @@ -255,6 +257,10 @@ func startCmd() *cobra.Command { return errors.Wrap(err, "failed to set default CTF configs") } + if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels before start") + } + cleanUpErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) if cleanUpErr != nil { 
return errors.Wrap(cleanUpErr, "failed to clean up environment state files") @@ -500,6 +506,9 @@ func startCmd() *cobra.Command { if storeErr != nil { return errors.Wrap(storeErr, "failed to store local CRE state") } + if err := persistTunnelState(relativePathToRepoRoot, output); err != nil { + return errors.Wrap(err, "failed to store tunnel state") + } return nil }, @@ -634,6 +643,10 @@ func stopCmd() *cobra.Command { return errors.Wrap(removeErr, "failed to remove environment containers. Please remove them manually") } + if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels") + } + if allFlag { stopBeholderErr := stopBeholder() if stopBeholderErr != nil { @@ -818,6 +831,184 @@ func oneLineErrorMessage(errOrPanic any) string { return strings.SplitN(fmt.Sprintf("%v", errOrPanic), "\n", 1)[0] } +func cleanupTrackedTunnels(relativePathToRepoRoot string) error { + state, err := envconfig.LoadTunnelState(relativePathToRepoRoot) + if err != nil { + return errors.Wrap(err, "failed to load tracked tunnel state") + } + if len(state.Tunnels) == 0 { + return nil + } + + framework.L.Info().Msgf("Found %d tracked SSM tunnel process(es), cleaning up", len(state.Tunnels)) + failed := 0 + for _, t := range state.Tunnels { + // First, aggressively kill known long-lived plugin children by local forwarded port. 
+ if pluginKilled, pluginErr := killSessionManagerPluginByLocalPort(t.LocalPort); pluginErr != nil { + framework.L.Warn().Err(pluginErr).Msgf("failed to clean session-manager-plugin for localPort=%d", t.LocalPort) + } else if pluginKilled { + framework.L.Info().Msgf("stopped session-manager-plugin for localPort=%d", t.LocalPort) + } + + if t.PID <= 0 { + continue + } + if !processExists(t.PID) { + continue + } + isSSM, checkErr := isSSMStartSessionProcess(t.PID) + if checkErr != nil { + framework.L.Warn().Err(checkErr).Msgf("failed to inspect process pid=%d before tunnel cleanup", t.PID) + failed++ + continue + } + if !isSSM { + framework.L.Warn().Msgf("refusing to kill non-SSM process pid=%d recorded in tunnel state", t.PID) + failed++ + continue + } + + proc, findErr := os.FindProcess(t.PID) + if findErr != nil { + failed++ + continue + } + + _ = proc.Signal(syscall.SIGTERM) + deadline := time.Now().Add(2 * time.Second) + for processExists(t.PID) && time.Now().Before(deadline) { + time.Sleep(150 * time.Millisecond) + } + if processExists(t.PID) { + _ = proc.Kill() + } + if processExists(t.PID) { + failed++ + framework.L.Warn().Msgf("failed to stop tracked tunnel process pid=%d localPort=%d remotePort=%d", t.PID, t.LocalPort, t.RemotePort) + continue + } + + framework.L.Info().Msgf("stopped tracked tunnel process pid=%d localPort=%d remotePort=%d kind=%s", t.PID, t.LocalPort, t.RemotePort, t.Kind) + } + + if clearErr := envconfig.ClearTunnelState(relativePathToRepoRoot); clearErr != nil { + framework.L.Warn().Err(clearErr).Msg("failed to clear tunnel state file after cleanup") + } + + if failed > 0 { + return fmt.Errorf("failed to clean up %d tracked tunnel process(es)", failed) + } + return nil +} + +func processExists(pid int) bool { + if pid <= 0 { + return false + } + proc, err := os.FindProcess(pid) + if err != nil { + return false + } + err = proc.Signal(syscall.Signal(0)) + return err == nil +} + +func isSSMStartSessionProcess(pid int) (bool, error) { + 
out, err := exec.Command("ps", "-o", "command=", "-p", strconv.Itoa(pid)).Output() + if err != nil { + return false, err + } + cmd := strings.TrimSpace(string(out)) + if cmd == "" { + return false, nil + } + + return strings.Contains(cmd, "aws ssm start-session"), nil +} + +func killSessionManagerPluginByLocalPort(localPort int) (bool, error) { + if localPort <= 0 { + return false, nil + } + + out, err := exec.Command("ps", "-axo", "pid=,command=").Output() + if err != nil { + return false, err + } + + pattern := fmt.Sprintf(`"localPortNumber": ["%d"]`, localPort) + killedAny := false + scanner := bufio.NewScanner(strings.NewReader(string(out))) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + if !strings.Contains(line, "session-manager-plugin") || !strings.Contains(line, pattern) { + continue + } + + fields := strings.Fields(line) + if len(fields) == 0 { + continue + } + pid, parseErr := strconv.Atoi(fields[0]) + if parseErr != nil || pid <= 0 { + continue + } + + proc, findErr := os.FindProcess(pid) + if findErr != nil { + continue + } + _ = proc.Signal(syscall.SIGTERM) + deadline := time.Now().Add(2 * time.Second) + for processExists(pid) && time.Now().Before(deadline) { + time.Sleep(100 * time.Millisecond) + } + if processExists(pid) { + _ = proc.Kill() + } + if !processExists(pid) { + killedAny = true + } + } + if scanErr := scanner.Err(); scanErr != nil { + return killedAny, scanErr + } + + return killedAny, nil +} + +func persistTunnelState(relativePathToRepoRoot string, output *creenv.SetupOutput) error { + if output == nil { + return envconfig.ClearTunnelState(relativePathToRepoRoot) + } + + bindings := output.TunnelBindings() + processes := make([]envconfig.TunnelProcess, 0, len(bindings)) + for _, b := range bindings { + if b.PID <= 0 { + continue + } + processes = append(processes, envconfig.TunnelProcess{ + PID: b.PID, + Kind: "ssm", + InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), + Region: 
"us-west-2", + RemotePort: b.Port, + LocalPort: b.LocalPort, + ComponentID: b.ComponentID, + Endpoint: b.EndpointName, + }) + } + + return envconfig.StoreTunnelState(relativePathToRepoRoot, &envconfig.TunnelState{ + Version: 1, + Tunnels: processes, + }) +} + func initDxTracker() { if dxTracker != nil { return diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index 91457e91a43..c32b894d77f 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -3,7 +3,9 @@ package agent import ( "bytes" "context" + "crypto/sha256" "encoding/json" + "encoding/hex" "fmt" "io" "net/http" @@ -33,6 +35,9 @@ const ( ErrCodeMissingComponentInput = "missing_component_input" ErrCodeDeployFailed = "deployment_failed" ErrCodeTransportEncodeFailed = "transport_encode_failed" + + RemoteStartPolicyAlways = "always" + RemoteStartPolicyReuseIdentical = "reuse_if_identical" ) var frameworkLogCaptureMu sync.Mutex @@ -46,6 +51,7 @@ type StartComponentEnvelope struct { type StartBlockchainPayload struct { ComponentType string `json:"componentType"` Blockchain *blockchain.Input `json:"blockchain"` + ReusePolicy string `json:"reusePolicy,omitempty"` } type StartComponentResponse struct { @@ -58,12 +64,20 @@ type StartComponentResponse struct { type Server struct { lggr zerolog.Logger deployers map[blockchain.ChainFamily]blockchains.Deployer + cacheMu sync.Mutex + cache map[string]cachedStart +} + +type cachedStart struct { + PayloadHash string + Output map[string]any } func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockchains.Deployer) *Server { return &Server{ lggr: lggr, deployers: deployers, + cache: make(map[string]cachedStart), } } @@ -113,10 +127,23 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "blockchain payload is required", nil) return } + 
componentKey := fmt.Sprintf("%s:%s:%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID) + payloadHash := hashPayload(envelope.Payload) // Keep this stderr write explicit so startup behavior is visible when agent runs as a subprocess. requestLog := fmt.Sprintf("[cre-agent] starting component type=%s blockchain=%s chain_id=%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID) _, _ = fmt.Fprintln(os.Stderr, requestLog) + if shouldReuseRemoteStart(payload.ReusePolicy) { + if cached, ok := s.lookupCachedStart(componentKey, payloadHash); ok { + reuseLog := fmt.Sprintf("[cre-agent] reusing existing component for key=%s (payload hash matched)", componentKey) + _, _ = fmt.Fprintln(os.Stderr, reuseLog) + s.respondJSON(w, http.StatusOK, StartComponentResponse{ + BlockchainOutput: cached.Output, + AgentLogs: []string{requestLog, reuseLog}, + }) + return + } + } var startedOutput *blockchain.Output capturedFrameworkLogs, startErr := captureFrameworkLogs(func() error { @@ -142,6 +169,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondError(w, http.StatusInternalServerError, ErrCodeTransportEncodeFailed, encErr.Error(), agentLogs) return } + s.cacheSuccessfulStart(componentKey, payloadHash, safeOutput) s.respondJSON(w, http.StatusOK, StartComponentResponse{ BlockchainOutput: safeOutput, AgentLogs: agentLogs, @@ -187,6 +215,39 @@ func captureFrameworkLogs(fn func() error) ([]string, error) { return logs, err } +func (s *Server) lookupCachedStart(componentKey, payloadHash string) (*cachedStart, bool) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + + start, ok := s.cache[componentKey] + if !ok || start.PayloadHash != payloadHash { + return nil, false + } + return &start, true +} + +func (s *Server) cacheSuccessfulStart(componentKey, payloadHash string, output map[string]any) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + s.cache[componentKey] = cachedStart{ + PayloadHash: payloadHash, + 
Output: output, + } +} + +func shouldReuseRemoteStart(policy string) bool { + normalized := strings.TrimSpace(strings.ToLower(policy)) + if normalized == "" { + normalized = RemoteStartPolicyReuseIdentical + } + return normalized == RemoteStartPolicyReuseIdentical +} + +func hashPayload(payload []byte) string { + sum := sha256.Sum256(payload) + return hex.EncodeToString(sum[:]) +} + func Run(ctx context.Context, addr string, srv *Server) error { httpSrv := &http.Server{ Addr: addr, diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/agent/server_test.go index 54ed1501d64..a1154d7c595 100644 --- a/system-tests/lib/cre/environment/agent/server_test.go +++ b/system-tests/lib/cre/environment/agent/server_test.go @@ -2,12 +2,16 @@ package agent import ( "bytes" + "context" + "encoding/json" "net/http" "net/http/httptest" "strings" "testing" "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) func TestStartComponentReturnsErrorCodeForUnsupportedSchema(t *testing.T) { @@ -46,3 +50,95 @@ func TestStartComponentReturnsErrorCodeForUnsupportedComponent(t *testing.T) { t.Fatalf("expected response to include error code %q, got body: %s", ErrCodeUnsupportedComponent, rr.Body.String()) } } + +type fakeOutputDeployer struct { + calls int +} + +func (f *fakeOutputDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { + return nil, nil +} + +func (f *fakeOutputDeployer) DeployOutput(context.Context, *blockchain.Input) (*blockchain.Output, error) { + f.calls++ + return &blockchain.Output{ + Type: blockchain.TypeAnvil, + ChainID: "1337", + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: "http://127.0.0.1:8545", + ExternalWSUrl: "ws://127.0.0.1:8546", + }, + }, + }, nil +} + +func TestStartComponentReuseIfIdenticalPayload(t *testing.T) { + deployer := 
&fakeOutputDeployer{} + server := NewServer(zerolog.Nop(), map[blockchain.ChainFamily]blockchains.Deployer{ + blockchain.FamilyEVM: deployer, + }) + handler := server.Handler() + + payload := `{"componentType":"blockchain","blockchain":{"type":"anvil","chain_id":"1337"}}` + body := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":` + payload + `}`) + + req1 := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(body.Bytes())) + req1.Header.Set("Content-Type", "application/json") + rr1 := httptest.NewRecorder() + handler.ServeHTTP(rr1, req1) + if rr1.Code != http.StatusOK { + t.Fatalf("expected first request OK, got %d: %s", rr1.Code, rr1.Body.String()) + } + + req2 := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(body.Bytes())) + req2.Header.Set("Content-Type", "application/json") + rr2 := httptest.NewRecorder() + handler.ServeHTTP(rr2, req2) + if rr2.Code != http.StatusOK { + t.Fatalf("expected second request OK, got %d: %s", rr2.Code, rr2.Body.String()) + } + + if deployer.calls != 1 { + t.Fatalf("expected deployer to be called once with reuse mode, got %d", deployer.calls) + } + + var resp StartComponentResponse + if err := json.Unmarshal(rr2.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to decode response: %v", err) + } + if len(resp.AgentLogs) == 0 || !strings.Contains(strings.Join(resp.AgentLogs, " "), "reusing existing component") { + t.Fatalf("expected reuse log in response, got: %v", resp.AgentLogs) + } +} + +func TestStartComponentAlwaysPolicyDisablesReuse(t *testing.T) { + deployer := &fakeOutputDeployer{} + server := NewServer(zerolog.Nop(), map[blockchain.ChainFamily]blockchains.Deployer{ + blockchain.FamilyEVM: deployer, + }) + handler := server.Handler() + + payload := `{"componentType":"blockchain","reusePolicy":"always","blockchain":{"type":"anvil","chain_id":"1337"}}` + body := 
bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":` + payload + `}`) + + req1 := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(body.Bytes())) + req1.Header.Set("Content-Type", "application/json") + rr1 := httptest.NewRecorder() + handler.ServeHTTP(rr1, req1) + if rr1.Code != http.StatusOK { + t.Fatalf("expected first request OK, got %d: %s", rr1.Code, rr1.Body.String()) + } + + req2 := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(body.Bytes())) + req2.Header.Set("Content-Type", "application/json") + rr2 := httptest.NewRecorder() + handler.ServeHTTP(rr2, req2) + if rr2.Code != http.StatusOK { + t.Fatalf("expected second request OK, got %d: %s", rr2.Code, rr2.Body.String()) + } + + if deployer.calls != 2 { + t.Fatalf("expected deployer to be called twice with always policy, got %d", deployer.calls) + } +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 7422b772ef1..b81f8fcbb83 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -11,7 +11,6 @@ import ( "net/http" "net/url" "os" - "os/exec" "strconv" "strings" "time" @@ -51,6 +50,7 @@ type startComponentEnvelope struct { type startBlockchainRequest struct { ComponentType string `json:"componentType"` Blockchain *blockchain.Input `json:"blockchain"` + ReusePolicy string `json:"reusePolicy,omitempty"` } type startBlockchainResult struct { @@ -158,9 +158,9 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope s if resp.StatusCode < 200 || resp.StatusCode >= 300 { if startResp.Error != "" { if startResp.ErrorCode != "" { - err = fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error) + err = remoteAgentError(startResp.ErrorCode, startResp.Error) } else { - err = errors.New(startResp.Error) + err = remoteAgentError("remote_agent_error", 
startResp.Error) } } else { err = fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) @@ -173,9 +173,9 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope s } if startResp.Error != "" { if startResp.ErrorCode != "" { - return nil, retry.Unrecoverable(fmt.Errorf("%s: %s", startResp.ErrorCode, startResp.Error)) + return nil, retry.Unrecoverable(remoteAgentError(startResp.ErrorCode, startResp.Error)) } - return nil, retry.Unrecoverable(errors.New(startResp.Error)) + return nil, retry.Unrecoverable(remoteAgentError("remote_agent_error", startResp.Error)) } return &startResp, nil @@ -215,9 +215,9 @@ func isRetriableNetworkError(err error) bool { return errors.As(err, &netErr) } -func newStartComponentClient(testLogger zerolog.Logger) (componentClient, error) { +func newStartComponentClient(testLogger zerolog.Logger, tunnelManager tunnel.Manager) (componentClient, error) { if os.Getenv(envAgentMode) == "ec2" { - baseURL, err := resolveEC2AgentBaseURL(testLogger) + baseURL, err := resolveEC2AgentBaseURL(testLogger, tunnelManager) if err != nil { return nil, err } @@ -231,10 +231,13 @@ func newStartComponentClient(testLogger zerolog.Logger) (componentClient, error) return newHTTPComponentClient(baseURL), nil } -func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { +func resolveEC2AgentBaseURL(testLogger zerolog.Logger, tunnelManager tunnel.Manager) (string, error) { if configured := os.Getenv(envEC2AgentURL); configured != "" { return configured, nil } + if tunnelManager == nil { + return "", errors.New("tunnel manager is required to auto-open ec2 agent tunnel") + } instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) if instanceID == "" { @@ -250,68 +253,30 @@ func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { remotePort = parsedPort } - localPort, err := reserveLocalPort() + bindings, err := tunnelManager.Start(context.Background(), 
[]tunnel.EndpointRef{ + { + ComponentID: "agent", + EndpointName: "api", + Scheme: "http", + Host: "127.0.0.1", + Port: remotePort, + OriginalURL: fmt.Sprintf("http://127.0.0.1:%d", remotePort), + }, + }) if err != nil { - return "", pkgerrors.Wrap(err, "failed to allocate local port for ssm tunnel") + return "", pkgerrors.Wrap(err, "failed to open ssm tunnel to ec2 agent") } - - if err := startSSMPortForward(testLogger, instanceID, remotePort, localPort); err != nil { - return "", err + if len(bindings) == 0 { + return "", errors.New("failed to open ssm tunnel to ec2 agent: no bindings returned") } testLogger.Info(). Str("instanceID", instanceID). Int("remotePort", remotePort). - Int("localPort", localPort). + Int("localPort", bindings[0].LocalPort). Msg("Opened SSM tunnel to EC2 agent") - return fmt.Sprintf("http://127.0.0.1:%d", localPort), nil -} - -func reserveLocalPort() (int, error) { - l, err := net.Listen("tcp", "127.0.0.1:0") - if err != nil { - return 0, err - } - defer l.Close() - - tcpAddr, ok := l.Addr().(*net.TCPAddr) - if !ok { - return 0, errors.New("listener addr is not tcp") - } - return tcpAddr.Port, nil -} - -func startSSMPortForward(testLogger zerolog.Logger, instanceID string, remotePort, localPort int) error { - cmd := exec.Command( - "aws", - "ssm", - "start-session", - "--region", ec2Region, - "--target", instanceID, - "--document-name", "AWS-StartPortForwardingSession", - "--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", remotePort, localPort), - ) - if testLogger.GetLevel() <= zerolog.DebugLevel { - cmd.Stdout = os.Stderr - cmd.Stderr = os.Stderr - testLogger.Debug(). - Strs("cmd", cmd.Args). - Msg("Starting SSM agent tunnel command") - } - testLogger.Info(). - Str("instanceID", instanceID). - Int("remotePort", remotePort). - Int("localPort", localPort). 
- Msg("Opening SSM tunnel to EC2 agent") - - if err := cmd.Start(); err != nil { - return pkgerrors.Wrap(err, "failed to start aws ssm port forwarding session") - } - go func() { - _ = cmd.Wait() - }() - return nil + return bindings[0].LocalURL, nil } func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { @@ -406,13 +371,14 @@ func startBlockchainsWithTargets( } if len(remoteIdx) > 0 { - startClient, err := newStartComponentClient(testLogger) + startClient, err := newStartComponentClient(testLogger, tunnelManager) if err != nil { return nil, err } for _, idx := range remoteIdx { input := blockchainInputs[idx] + configured := configuredBlockchains[idx] if err := validatePhase2ARemoteBlockchainInput(input); err != nil { return nil, err } @@ -420,6 +386,7 @@ func startBlockchainsWithTargets( payload := startBlockchainRequest{ ComponentType: componentTypeBlockchain, Blockchain: input, + ReusePolicy: string(configured.RemoteStartPolicy), } payloadBytes, err := json.Marshal(payload) if err != nil { @@ -577,3 +544,7 @@ func rewriteURLHost(rawURL, host string) (string, error) { parsed.Host = host return parsed.String(), nil } + +func remoteAgentError(code, message string) error { + return fmt.Errorf("remote agent error (%s): %s", code, message) +} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index b63c5b1883e..e8358bcfdf4 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -31,12 +31,12 @@ func TestNewStartComponentClientEC2Mode(t *testing.T) { t.Setenv(envEC2AgentURL, "") t.Setenv(envEC2InstanceID, "") - if _, err := newStartComponentClient(zerolog.Nop()); err == nil { + if _, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}); err == nil { t.Fatalf("expected ec2 mode without %s or %s to fail", envEC2AgentURL, envEC2InstanceID) 
} t.Setenv(envEC2AgentURL, "http://127.0.0.1:18080") // manual tunnel override - client, err := newStartComponentClient(zerolog.Nop()) + client, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}) if err != nil { t.Fatalf("expected ec2 mode client to be created, got %v", err) } @@ -58,7 +58,7 @@ func TestResolveEC2AgentBaseURLRequiresInstanceIDWhenURLMissing(t *testing.T) { t.Setenv(envEC2InstanceID, "") t.Setenv(envEC2AgentPort, "") - _, err := resolveEC2AgentBaseURL(zerolog.Nop()) + _, err := resolveEC2AgentBaseURL(zerolog.Nop(), &fakeTunnelManager{}) if err == nil { t.Fatalf("expected missing %s to fail when %s is not set", envEC2InstanceID, envEC2AgentURL) } @@ -69,7 +69,7 @@ func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { t.Setenv(envEC2InstanceID, "i-123") t.Setenv(envEC2AgentPort, "not-a-port") - _, err := resolveEC2AgentBaseURL(zerolog.Nop()) + _, err := resolveEC2AgentBaseURL(zerolog.Nop(), &fakeTunnelManager{}) if err == nil { t.Fatalf("expected invalid %s to fail", envEC2AgentPort) } @@ -83,12 +83,12 @@ func TestNewStartComponentClientLocalMode(t *testing.T) { t.Setenv(envEC2AgentURL, "") t.Setenv(envLocalAgentURL, "") - if _, err := newStartComponentClient(zerolog.Nop()); err == nil { + if _, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}); err == nil { t.Fatalf("expected local mode without %s to fail", envLocalAgentURL) } t.Setenv(envLocalAgentURL, "http://127.0.0.1:8080") - client, err := newStartComponentClient(zerolog.Nop()) + client, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}) if err != nil { t.Fatalf("expected local mode client to be created, got %v", err) } @@ -131,6 +131,7 @@ func (f *fakeTunnelManager) Start(_ context.Context, refs []tunnel.EndpointRef) func (f *fakeTunnelManager) Stop(_ context.Context) error { return nil } func (f *fakeTunnelManager) IsStarted() bool { return f.startCalls > 0 } +func (f *fakeTunnelManager) Snapshot() []tunnel.TunnelBinding { return 
[]tunnel.TunnelBinding{} } func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { out := &blockchain.Output{ @@ -192,3 +193,11 @@ func TestNewEC2TunnelManagerReturnsNoopWhenNotApplicable(t *testing.T) { t.Fatalf("expected noop manager to report not started") } } + +func TestRemoteAgentErrorFormatting(t *testing.T) { + err := remoteAgentError("deployment_failed", "failed to deploy blockchain output") + want := "remote agent error (deployment_failed): failed to deploy blockchain output" + if err == nil || err.Error() != want { + t.Fatalf("expected %q, got %v", want, err) + } +} diff --git a/system-tests/lib/cre/environment/blockchains/evm/evm.go b/system-tests/lib/cre/environment/blockchains/evm/evm.go index 8ffe9263c4d..3820225ff48 100644 --- a/system-tests/lib/cre/environment/blockchains/evm/evm.go +++ b/system-tests/lib/cre/environment/blockchains/evm/evm.go @@ -187,6 +187,7 @@ func FromOutput(testLogger zerolog.Logger, out *blockchain.Output) (*Blockchain, WithRpcUrl(out.Nodes[0].ExternalWSUrl). WithPrivateKeys([]string{priv}). WithProtections(false, false, seth.MustMakeDuration(time.Second)). + // WithGasPriceEstimations(true, 0, seth.Priority_Auto, 1). Build() if err != nil { return nil, pkgerrors.Wrap(err, "failed to create seth client") diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index 96b39edfdc6..bcad49a1812 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -78,17 +78,28 @@ const ( TargetRemote ComponentTarget = "remote" ) +type RemoteStartPolicy string + +const ( + RemoteStartPolicyReuseIfIdentical RemoteStartPolicy = "reuse_if_identical" + RemoteStartPolicyAlways RemoteStartPolicy = "always" +) + // Blockchain wraps the existing CTF blockchain input and adds placement metadata. // The embedded input keeps TOML fields backward-compatible. 
type Blockchain struct { blockchain.Input - Target ComponentTarget `toml:"target"` + Target ComponentTarget `toml:"target"` + RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } func (b *Blockchain) Normalize() { if b.Target == "" { b.Target = TargetDocker } + if b.RemoteStartPolicy == "" { + b.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical + } } func (b *Blockchain) Validate() error { @@ -100,6 +111,9 @@ func (b *Blockchain) Validate() error { if b.Target != TargetDocker && b.Target != TargetRemote { return fmt.Errorf("invalid blockchain target: %s", b.Target) } + if b.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && b.RemoteStartPolicy != RemoteStartPolicyAlways { + return fmt.Errorf("invalid blockchain remote_start_policy: %s", b.RemoteStartPolicy) + } return nil } diff --git a/system-tests/lib/cre/environment/config/tunnel_state.go b/system-tests/lib/cre/environment/config/tunnel_state.go new file mode 100644 index 00000000000..8ec0b0b8d5c --- /dev/null +++ b/system-tests/lib/cre/environment/config/tunnel_state.go @@ -0,0 +1,107 @@ +package config + +import ( + "fmt" + "os" + "path/filepath" + "sync" + + "github.com/pelletier/go-toml/v2" +) + +const TunnelStateFilename = "tunnels.toml" + +type TunnelProcess struct { + PID int `toml:"pid"` + Kind string `toml:"kind"` + InstanceID string `toml:"instance_id"` + Region string `toml:"region"` + RemotePort int `toml:"remote_port"` + LocalPort int `toml:"local_port"` + ComponentID string `toml:"component_id,omitempty"` + Endpoint string `toml:"endpoint,omitempty"` + CreatedAt string `toml:"created_at,omitempty"` +} + +type TunnelState struct { + Version int `toml:"version"` + Tunnels []TunnelProcess `toml:"tunnels"` +} + +var tunnelStateMu sync.Mutex + +func MustTunnelStateFileAbsPath(relativePathToRepoRoot string) string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, StateDirname, TunnelStateFilename)) + if err != nil { + panic(fmt.Errorf("failed to get absolute 
path for tunnel state file: %w", err)) + } + return absPath +} + +func LoadTunnelState(relativePathToRepoRoot string) (*TunnelState, error) { + tunnelStateMu.Lock() + defer tunnelStateMu.Unlock() + return loadTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot)) +} + +func StoreTunnelState(relativePathToRepoRoot string, state *TunnelState) error { + tunnelStateMu.Lock() + defer tunnelStateMu.Unlock() + return storeTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot), state) +} + +func ClearTunnelState(relativePathToRepoRoot string) error { + tunnelStateMu.Lock() + defer tunnelStateMu.Unlock() + return storeTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot), &TunnelState{ + Version: 1, + Tunnels: []TunnelProcess{}, + }) +} + +func loadTunnelStateUnlocked(path string) (*TunnelState, error) { + if _, err := os.Stat(path); os.IsNotExist(err) { + return &TunnelState{Version: 1, Tunnels: []TunnelProcess{}}, nil + } + + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read tunnel state file: %w", err) + } + + state := &TunnelState{} + if err := toml.Unmarshal(data, state); err != nil { + return nil, fmt.Errorf("failed to unmarshal tunnel state file: %w", err) + } + if state.Version == 0 { + state.Version = 1 + } + if state.Tunnels == nil { + state.Tunnels = []TunnelProcess{} + } + return state, nil +} + +func storeTunnelStateUnlocked(path string, state *TunnelState) error { + if state == nil { + state = &TunnelState{Version: 1, Tunnels: []TunnelProcess{}} + } + if state.Version == 0 { + state.Version = 1 + } + if state.Tunnels == nil { + state.Tunnels = []TunnelProcess{} + } + + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("failed to create tunnel state directory: %w", err) + } + data, err := toml.Marshal(state) + if err != nil { + return fmt.Errorf("failed to marshal tunnel state: %w", err) + } + if err := os.WriteFile(path, data, 0o600); err 
!= nil { + return fmt.Errorf("failed to write tunnel state file: %w", err) + } + return nil +} diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 1d3354e1aac..680be009b24 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -74,6 +74,13 @@ func (s *SetupOutput) Close(ctx context.Context) error { return s.closeErr } +func (s *SetupOutput) TunnelBindings() []tunnel.TunnelBinding { + if s == nil || s.tunnelManager == nil { + return []tunnel.TunnelBinding{} + } + return s.tunnelManager.Snapshot() +} + type SetupInput struct { NodeSets []*cre.NodeSet Blockchains []*config.Blockchain diff --git a/system-tests/lib/cre/environment/setup_output_test.go b/system-tests/lib/cre/environment/setup_output_test.go index 9ec774f04fe..7ef20133d95 100644 --- a/system-tests/lib/cre/environment/setup_output_test.go +++ b/system-tests/lib/cre/environment/setup_output_test.go @@ -19,6 +19,7 @@ func (c *countingTunnelManager) Stop(_ context.Context) error { return nil } func (c *countingTunnelManager) IsStarted() bool { return false } +func (c *countingTunnelManager) Snapshot() []tunnel.TunnelBinding { return []tunnel.TunnelBinding{} } func TestSetupOutputCloseIsIdempotent(t *testing.T) { manager := &countingTunnelManager{} diff --git a/system-tests/lib/cre/environment/tunnel/manager.go b/system-tests/lib/cre/environment/tunnel/manager.go index 4327724c613..4fd01ac363e 100644 --- a/system-tests/lib/cre/environment/tunnel/manager.go +++ b/system-tests/lib/cre/environment/tunnel/manager.go @@ -73,6 +73,17 @@ func (m *manager) IsStarted() bool { return len(m.bindings) > 0 } +func (m *manager) Snapshot() []TunnelBinding { + m.mu.Lock() + defer m.mu.Unlock() + + out := make([]TunnelBinding, 0, len(m.bindings)) + for _, b := range m.bindings { + out = append(out, b) + } + return out +} + func (m *manager) closeMany(ctx context.Context, bindings []TunnelBinding) 
error { var joined error for _, b := range bindings { diff --git a/system-tests/lib/cre/environment/tunnel/noop_manager.go b/system-tests/lib/cre/environment/tunnel/noop_manager.go index c88c2260313..91829c8c040 100644 --- a/system-tests/lib/cre/environment/tunnel/noop_manager.go +++ b/system-tests/lib/cre/environment/tunnel/noop_manager.go @@ -22,3 +22,5 @@ func (n *noopManager) Start(_ context.Context, refs []EndpointRef) ([]TunnelBind func (n *noopManager) Stop(_ context.Context) error { return nil } func (n *noopManager) IsStarted() bool { return false } + +func (n *noopManager) Snapshot() []TunnelBinding { return []TunnelBinding{} } diff --git a/system-tests/lib/cre/environment/tunnel/provider_ssm.go b/system-tests/lib/cre/environment/tunnel/provider_ssm.go index 5ef2ba2021d..9d06d5e7285 100644 --- a/system-tests/lib/cre/environment/tunnel/provider_ssm.go +++ b/system-tests/lib/cre/environment/tunnel/provider_ssm.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "sync" + "syscall" "time" "github.com/rs/zerolog" @@ -50,6 +51,8 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, "--document-name", "AWS-StartPortForwardingSession", "--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", ref.Port, localPort), ) + // Start in a dedicated process group so cleanup can kill aws + session-manager-plugin together. 
+ cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if p.logger.GetLevel() <= zerolog.DebugLevel { cmd.Stdout = os.Stderr cmd.Stderr = os.Stderr @@ -69,7 +72,7 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, return TunnelBinding{}, fmt.Errorf("failed to start aws ssm session: %w", err) } if err := waitForLocalPortReady(ctx, localPort, 12*time.Second); err != nil { - _ = cmd.Process.Kill() + terminateProcessGroup(cmd) return TunnelBinding{}, fmt.Errorf("ssm local tunnel on port %d did not become ready: %w", localPort, err) } @@ -85,6 +88,7 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, EndpointRef: ref, LocalPort: localPort, LocalURL: localURLFromScheme(ref.Scheme, localPort), + PID: cmd.Process.Pid, }, nil } @@ -100,7 +104,7 @@ func (p *SSMProvider) Close(_ context.Context, binding TunnelBinding) error { return nil } - if err := cmd.Process.Kill(); err != nil { + if err := terminateProcessGroup(cmd); err != nil { return fmt.Errorf("failed to kill ssm session on local port %d: %w", binding.LocalPort, err) } p.logger.Info(). @@ -138,6 +142,21 @@ func localURLFromScheme(scheme string, port int) string { } } +func terminateProcessGroup(cmd *exec.Cmd) error { + if cmd == nil || cmd.Process == nil { + return nil + } + + // Negative PID targets the process group when Setpgid=true. + if err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); err != nil { + // Fall back to killing parent process only. 
+ if killErr := cmd.Process.Kill(); killErr != nil { + return killErr + } + } + return nil +} + func waitForLocalPortReady(ctx context.Context, port int, timeout time.Duration) error { deadline := time.Now().Add(timeout) address := fmt.Sprintf("127.0.0.1:%d", port) diff --git a/system-tests/lib/cre/environment/tunnel/tunnel.go b/system-tests/lib/cre/environment/tunnel/tunnel.go index b4ea3803607..cb622b28fd8 100644 --- a/system-tests/lib/cre/environment/tunnel/tunnel.go +++ b/system-tests/lib/cre/environment/tunnel/tunnel.go @@ -15,12 +15,14 @@ type TunnelBinding struct { EndpointRef LocalPort int LocalURL string + PID int } type Manager interface { Start(ctx context.Context, refs []EndpointRef) ([]TunnelBinding, error) Stop(ctx context.Context) error IsStarted() bool + Snapshot() []TunnelBinding } type Provider interface { From 56b241259354fe307d9a77bfc6e7df4bbe526cf6 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 18 Feb 2026 15:24:07 +0100 Subject: [PATCH 04/34] add support for remote startup of JD, add commands for stopping remote env --- core/scripts/cre/environment/.gitignore | 1 + .../configs/workflow-gateway-don-remote.toml | 7 +- .../environment/environment/environment.go | 264 +++++++++++-- .../environment/environment/remote_state.go | 127 +++++++ core/scripts/go.mod | 6 +- core/scripts/go.sum | 12 +- .../lib/cre/environment/agent/deploy.go | 48 +++ .../lib/cre/environment/agent/server.go | 356 ++++++++++++++++-- .../lib/cre/environment/agent/server_test.go | 80 +++- .../lib/cre/environment/blockchain_start.go | 45 ++- .../lib/cre/environment/config/config.go | 49 ++- .../lib/cre/environment/environment.go | 7 +- system-tests/lib/cre/environment/jobs.go | 196 +++++++++- system-tests/lib/cre/environment/jobs_test.go | 68 ++++ .../lib/cre/environment/remote_stop.go | 146 +++++++ .../cre/environment/tunnel/provider_ssm.go | 65 +++- system-tests/lib/go.mod | 4 +- system-tests/lib/go.sum | 12 +- system-tests/tests/go.mod | 2 + system-tests/tests/go.sum 
| 12 +- .../tests/load/cre/workflow_don_load_test.go | 2 +- .../tests/load/cre/writer_don_load_test.go | 2 +- 22 files changed, 1388 insertions(+), 123 deletions(-) create mode 100644 core/scripts/cre/environment/environment/remote_state.go create mode 100644 system-tests/lib/cre/environment/jobs_test.go create mode 100644 system-tests/lib/cre/environment/remote_stop.go diff --git a/core/scripts/cre/environment/.gitignore b/core/scripts/cre/environment/.gitignore index 51e29dba061..9a9366e3448 100644 --- a/core/scripts/cre/environment/.gitignore +++ b/core/scripts/cre/environment/.gitignore @@ -32,6 +32,7 @@ bin/ artifact_paths.json *.b64 state/ +state_remote/ # TS sdk-related bun.lock diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml index b023c27173c..09f5bd76c8a 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml @@ -11,12 +11,15 @@ port = "8546" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] target = "remote" - + remote_start_policy = "always" [jd] csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string # change to your version image = "job-distributor:0.22.1" + target = "remote" + # we need fresh DB on each run to avoid DB-level job name uniqueness violations + remote_start_policy = "always" [fake] port = 8171 @@ -73,7 +76,7 @@ [[nodesets.node_specs]] roles = ["bootstrap", "gateway"] [nodesets.node_specs.node] - #docker_ctx = "../../../.." + #docker_ctx = "../../../.."
#docker_file = "core/chainlink.Dockerfile" #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } image = "chainlink-tmp:latest" diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 1ca638ba4c6..33ee8ca1257 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -86,6 +86,8 @@ var EnvironmentCmd = &cobra.Command{ func init() { EnvironmentCmd.AddCommand(startCmd()) EnvironmentCmd.AddCommand(stopCmd()) + EnvironmentCmd.AddCommand(stopAllCmd()) + EnvironmentCmd.AddCommand(stopRemoteCmd()) EnvironmentCmd.AddCommand(workflowCmds()) EnvironmentCmd.AddCommand(beholderCmds()) EnvironmentCmd.AddCommand(swapCmds()) @@ -492,6 +494,9 @@ func startCmd() *cobra.Command { } fmt.Print(libformat.PurpleText("\nEnvironment setup completed successfully in %.2f seconds\n\n", time.Since(provisioningStartTime).Seconds())) fmt.Print("To terminate execute:`go run . env stop`\n\n") + if remoteSummary := summarizeRemoteComponents(in); remoteSummary.Total > 0 { + fmt.Printf("Remote components started (%d). Use `go run . 
env stop-remote` to stop them.\n\n", remoteSummary.Total) + } addresses, aErr := output.CreEnvironment.CldfEnvironment.DataStore.Addresses().Fetch() if aErr != nil { @@ -506,6 +511,13 @@ func startCmd() *cobra.Command { if storeErr != nil { return errors.Wrap(storeErr, "failed to store local CRE state") } + if remoteSummary := summarizeRemoteComponents(in); remoteSummary.Total > 0 { + if err := storeRemoteStopState(relativePathToRepoRoot, in); err != nil { + return errors.Wrap(err, "failed to store remote component stop state") + } + } else if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to clear stale remote component stop state") + } if err := persistTunnelState(relativePathToRepoRoot, output); err != nil { return errors.Wrap(err, "failed to store tunnel state") } @@ -631,60 +643,230 @@ func trackStartup(success, hasBuiltDockerImage bool, infraType string, errorMess } func stopCmd() *cobra.Command { - var allFlag bool cmd := &cobra.Command{ Use: "stop", - Short: "Stops the environment", - Long: `Stops the local CRE environment (if it's not running, it just fallsthrough)`, + Short: "Stops local environment", + Long: `Stops local CRE resources only (containers, tracked local tunnels, and local state file).`, + Example: "go run . env stop", PersistentPreRun: globalPreRunFunc, RunE: func(cmd *cobra.Command, args []string) error { - removeErr := framework.RemoveTestContainers() - if removeErr != nil { - return errors.Wrap(removeErr, "failed to remove environment containers. Please remove them manually") + if err := stopLocalResources(relativePathToRepoRoot, false); err != nil { + return err } - - if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels") + remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) + if remoteConfiguredSummary.Total > 0 { + framework.L.Warn(). 
+ Int("count", remoteConfiguredSummary.Total). + Msgf("Remote components are still running. Use `env stop-remote` to stop them. Remote stop state: %s", remoteStateFileAbsPath(relativePathToRepoRoot)) } + fmt.Println("Local environment stopped successfully") + return nil + }, + } + return cmd +} - if allFlag { - stopBeholderErr := stopBeholder() - if stopBeholderErr != nil { - framework.L.Warn().Msgf("failed to stop Beholder: %s", stopBeholderErr) - } - - stopBillingErr := stopBilling() - if stopBillingErr != nil { - framework.L.Warn().Msgf("failed to stop Billing: %s", stopBillingErr) - } - - stopObsStack := framework.ObservabilityDown() - if stopObsStack != nil { - framework.L.Warn().Msgf("failed to stop observability stack: %s", stopObsStack) - } +func stopAllCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "stop-all", + Short: "Stops all local resources", + Long: `Stops local CRE resources and extra local services (beholder, billing, observability), then removes local state directory.`, + Example: "go run . env stop-all", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, args []string) error { + if err := stopLocalResources(relativePathToRepoRoot, true); err != nil { + return err + } + remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) + if remoteConfiguredSummary.Total > 0 { + framework.L.Warn(). + Int("count", remoteConfiguredSummary.Total). + Msgf("Remote components are still running. Use `env stop-remote` to stop them. 
Remote stop state: %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } + fmt.Println("All local resources stopped successfully") + return nil + }, + } + return cmd +} - removeCacheErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) - if removeCacheErr != nil { - framework.L.Warn().Msgf("failed to remove local CRE state files: %s", removeCacheErr) - } - } else { - creStateFile := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) - cErr := os.Remove(creStateFile) - if cErr != nil { - framework.L.Warn().Msgf("failed to remove local CRE state file: %s", cErr) - } else { - framework.L.Info().Msgf("removed local CRE state file: %s", creStateFile) - } +func stopRemoteCmd() *cobra.Command { + var dryRunFlag bool + cmd := &cobra.Command{ + Use: "stop-remote", + Short: "Stops remote components only", + Long: `Stops remote CRE components through the agent without performing any local cleanup.`, + Example: strings.TrimSpace(` +go run . env stop-remote +go run . env stop-remote --dry-run +`), + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, args []string) error { + remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) + if dryRunFlag { + framework.L.Info(). + Int("total", remoteConfiguredSummary.Total). + Int("blockchains", remoteConfiguredSummary.Blockchains). + Int("jd", remoteConfiguredSummary.JD). 
+ Msg("Dry-run: remote components that would be stopped") + return nil + } + if remoteConfiguredSummary.Total == 0 { + framework.L.Info().Msg("No remote components recorded; nothing to stop.") + return nil } - fmt.Println("Environment stopped successfully") + if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { + return err + } + fmt.Println("Remote components stopped successfully") return nil }, } + cmd.Flags().BoolVar(&dryRunFlag, "dry-run", false, "Preview what remote components would be stopped") + return cmd +} - cmd.Flags().BoolVarP(&allFlag, "all", "a", false, "Remove also all extra services (beholder, billing)") +func loadRemoteStopTargets(relativePathToRepoRoot string) (remoteComponentSummary, *envconfig.Config) { + var ( + targets *envconfig.Config + summary remoteComponentSummary + ) + if envconfig.LocalCREStateFileExists(relativePathToRepoRoot) { + cached := &envconfig.Config{} + statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + if loadErr := cached.Load(statePath); loadErr != nil { + framework.L.Warn().Err(loadErr).Msgf("failed to load local CRE state from %s", statePath) + } else { + targets = cached + summary = summarizeRemoteComponents(targets) + } + } - return cmd + if summary.Total == 0 && remoteStateFileExists(relativePathToRepoRoot) { + remoteState, loadErr := loadRemoteStopState(relativePathToRepoRoot) + if loadErr != nil { + framework.L.Warn().Err(loadErr).Msgf("failed to load remote component stop state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } else { + targets = remoteState.Config() + summary = summarizeRemoteComponents(targets) + } + } + return summary, targets +} + +func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targets *envconfig.Config) error { + remoteState, loadErr := loadRemoteStopState(relativePathToRepoRoot) + if loadErr == nil { + applyRemoteAgentEnvFallback(framework.L, remoteState) + } + + summary, stopRemoteErr := 
creenv.StopRemoteComponents(ctx, framework.L, targets) + framework.L.Info(). + Int("requested", summary.Requested). + Int("stopped", summary.Stopped). + Int("missing", summary.Missing). + Int("failed", summary.Failed). + Msg("Remote component stop summary") + if stopRemoteErr != nil { + return errors.Wrap(stopRemoteErr, "failed to stop one or more remote components") + } + if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") + } + return nil +} + +func stopLocalResources(relativePathToRepoRoot string, removeAllState bool) error { + removeErr := framework.RemoveTestContainers() + if removeErr != nil { + return errors.Wrap(removeErr, "failed to remove environment containers. Please remove them manually") + } + + if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels") + } + + if removeAllState { + stopBeholderErr := stopBeholder() + if stopBeholderErr != nil { + framework.L.Warn().Msgf("failed to stop Beholder: %s", stopBeholderErr) + } + + stopBillingErr := stopBilling() + if stopBillingErr != nil { + framework.L.Warn().Msgf("failed to stop Billing: %s", stopBillingErr) + } + + stopObsStack := framework.ObservabilityDown() + if stopObsStack != nil { + framework.L.Warn().Msgf("failed to stop observability stack: %s", stopObsStack) + } + + removeCacheErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) + if removeCacheErr != nil { + framework.L.Warn().Msgf("failed to remove local CRE state files: %s", removeCacheErr) + } + return nil + } + + creStateFile := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + cErr := os.Remove(creStateFile) + if cErr != nil && !os.IsNotExist(cErr) { + framework.L.Warn().Msgf("failed to remove local CRE state file: %s", cErr) + } else if cErr != nil && os.IsNotExist(cErr) { + framework.L.Info().Msgf("local CRE state file 
already absent: %s", creStateFile) + } else { + framework.L.Info().Msgf("removed local CRE state file: %s", creStateFile) + } + return nil +} + +type remoteComponentSummary struct { + Total int + Blockchains int + JD int +} + +func summarizeRemoteComponents(cfg *envconfig.Config) remoteComponentSummary { + summary := remoteComponentSummary{} + if cfg == nil { + return summary + } + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && configuredBlockchain.Target == envconfig.TargetRemote { + summary.Blockchains++ + } + } + if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { + summary.JD = 1 + } + summary.Total = summary.Blockchains + summary.JD + return summary +} + +func applyRemoteAgentEnvFallback(logger zerolog.Logger, state *remoteStopState) { + if state == nil { + return + } + setIfEmpty := func(key, value string) { + if strings.TrimSpace(value) == "" { + return + } + if strings.TrimSpace(os.Getenv(key)) != "" { + return + } + if err := os.Setenv(key, value); err != nil { + logger.Warn().Err(err).Msgf("failed to set %s from remote stop state", key) + } + } + + setIfEmpty("CRE_AGENT_MODE", state.Agent.Mode) + setIfEmpty("CRE_LOCAL_AGENT_URL", state.Agent.LocalURL) + setIfEmpty("CRE_EC2_AGENT_URL", state.Agent.EC2URL) + setIfEmpty("CRE_EC2_INSTANCE_ID", state.Agent.EC2InstanceID) + setIfEmpty("CRE_EC2_AGENT_PORT", state.Agent.EC2AgentPort) + setIfEmpty("CRE_AWS_PROFILE", state.Agent.AWSProfile) } func StartCLIEnvironment( @@ -1081,7 +1263,9 @@ func ensureDockerImagesExist(ctx context.Context, logger zerolog.Logger, in *env } if in.JD != nil { - if err := ensureDockerImageExists(ctx, logger, in.JD.Image); err != nil { + if in.JD.Target == envconfig.TargetRemote { + logger.Info().Msg("Skipping local JD image check for remote JD target") + } else if err := ensureDockerImageExists(ctx, logger, in.JD.Image); err != nil { return errors.Wrapf(err, "Job Distributor image '%s' not found. 
Make sure it exists locally or run 'go run . env setup' to pull it and other dependencies that also might be missing", in.JD.Image) } } diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go new file mode 100644 index 00000000000..2bbdb5fe943 --- /dev/null +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -0,0 +1,127 @@ +package environment + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/pelletier/go-toml/v2" + + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +const ( + remoteStateDirname = "core/scripts/cre/environment/state_remote" + remoteStateFilename = "remote_components.toml" +) + +type remoteStopState struct { + Version int `toml:"version"` + Blockchains []*envconfig.Blockchain `toml:"blockchains"` + JD *envconfig.JobDistributor `toml:"jd"` + Agent remoteAgentState `toml:"agent"` +} + +type remoteAgentState struct { + Mode string `toml:"mode,omitempty"` + LocalURL string `toml:"local_url,omitempty"` + EC2URL string `toml:"ec2_url,omitempty"` + EC2InstanceID string `toml:"ec2_instance_id,omitempty"` + EC2AgentPort string `toml:"ec2_agent_port,omitempty"` + AWSProfile string `toml:"aws_profile,omitempty"` +} + +func remoteStateFileAbsPath(relativePathToRepoRoot string) string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, remoteStateDirname, remoteStateFilename)) + if err != nil { + panic(fmt.Errorf("failed to get absolute path for remote CRE state file: %w", err)) + } + return absPath +} + +func remoteStateFileExists(relativePathToRepoRoot string) bool { + _, statErr := os.Stat(remoteStateFileAbsPath(relativePathToRepoRoot)) + return statErr == nil +} + +func loadRemoteStopState(relativePathToRepoRoot string) (*remoteStopState, error) { + data, err := os.ReadFile(remoteStateFileAbsPath(relativePathToRepoRoot)) + if err != nil { + return nil, err + } + state := 
&remoteStopState{} + if err := toml.Unmarshal(data, state); err != nil { + return nil, err + } + if state.Version == 0 { + state.Version = 1 + } + if state.Blockchains == nil { + state.Blockchains = []*envconfig.Blockchain{} + } + return state, nil +} + +func (s *remoteStopState) Config() *envconfig.Config { + if s == nil { + return nil + } + return &envconfig.Config{ + Blockchains: s.Blockchains, + JD: s.JD, + } +} + +func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) error { + if cfg == nil { + return fmt.Errorf("cannot store nil remote stop config") + } + state := &remoteStopState{ + Version: 1, + Blockchains: []*envconfig.Blockchain{}, + Agent: remoteAgentState{ + Mode: os.Getenv("CRE_AGENT_MODE"), + LocalURL: os.Getenv("CRE_LOCAL_AGENT_URL"), + EC2URL: os.Getenv("CRE_EC2_AGENT_URL"), + EC2InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), + EC2AgentPort: os.Getenv("CRE_EC2_AGENT_PORT"), + AWSProfile: firstNonEmpty(os.Getenv("CRE_AWS_PROFILE"), os.Getenv("AWS_PROFILE")), + }, + } + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && configuredBlockchain.Target == envconfig.TargetRemote { + state.Blockchains = append(state.Blockchains, configuredBlockchain) + } + } + if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { + state.JD = cfg.JD + } + data, err := toml.Marshal(state) + if err != nil { + return err + } + path := remoteStateFileAbsPath(relativePathToRepoRoot) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + return os.WriteFile(path, data, 0o600) +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if trimmed := strings.TrimSpace(value); trimmed != "" { + return trimmed + } + } + return "" +} + +func removeRemoteStopConfig(relativePathToRepoRoot string) error { + path := remoteStateFileAbsPath(relativePathToRepoRoot) + err := os.Remove(path) + if err == nil || os.IsNotExist(err) { + return nil + } + return err +} 
diff --git a/core/scripts/go.mod b/core/scripts/go.mod index 4f290dd00f0..4b894362e3a 100644 --- a/core/scripts/go.mod +++ b/core/scripts/go.mod @@ -9,6 +9,8 @@ replace github.com/smartcontractkit/chainlink/deployment => ../../deployment replace github.com/smartcontractkit/chainlink/system-tests/lib => ../../system-tests/lib +replace github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework + replace github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v1/proof-of-reserve/cron-based => ./cre/environment/examples/workflows/v1/proof-of-reserve/cron-based replace github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v1/proof-of-reserve/web-trigger-based => ./cre/environment/examples/workflows/v1/proof-of-reserve/web-trigger-based @@ -29,7 +31,7 @@ require ( github.com/docker/docker v28.5.1+incompatible github.com/docker/go-connections v0.6.0 github.com/ethereum/go-ethereum v1.16.8 - github.com/gkampitakis/go-snaps v0.5.13 + github.com/gkampitakis/go-snaps v0.5.19 github.com/google/go-cmp v0.7.0 github.com/google/uuid v1.6.0 github.com/jmoiron/sqlx v1.4.0 @@ -266,7 +268,6 @@ require ( github.com/gin-contrib/sse v0.1.0 // indirect github.com/gin-gonic/gin v1.10.1 // indirect github.com/gkampitakis/ciinfo v0.3.2 // indirect - github.com/gkampitakis/go-diff v1.3.2 // indirect github.com/go-asn1-ber/asn1-ber v1.5.5 // indirect github.com/go-chi/chi v1.5.5 // indirect github.com/go-errors/errors v1.4.2 // indirect @@ -468,6 +469,7 @@ require ( github.com/sasha-s/go-deadlock v0.3.5 // indirect github.com/scylladb/go-reflectx v1.0.1 // indirect github.com/secure-systems-lab/go-securesystemslib v0.6.0 // indirect + github.com/sergi/go-diff v1.4.0 // indirect github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b // indirect github.com/sethvargo/go-retry v0.3.0 // indirect github.com/shibumi/go-pathspec v1.3.0 // indirect 
diff --git a/core/scripts/go.sum b/core/scripts/go.sum index 7c9b08a5cc2..213072b79e0 100644 --- a/core/scripts/go.sum +++ b/core/scripts/go.sum @@ -654,10 +654,8 @@ github.com/gin-gonic/gin v1.10.1 h1:T0ujvqyCSqRopADpgPgiTT63DUQVSfojyME59Ei63pQ= github.com/gin-gonic/gin v1.10.1/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= -github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= -github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= -github.com/gkampitakis/go-snaps v0.5.13 h1:Hhjmvv1WboSCxkR9iU2mj5PQ8tsz/y8ECGrIbjjPF8Q= -github.com/gkampitakis/go-snaps v0.5.13/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= +github.com/gkampitakis/go-snaps v0.5.19 h1:hUJlCQOpTt1M+kSisMwioDWZDWpDtdAvUhvWCx1YGW0= +github.com/gkampitakis/go-snaps v0.5.19/go.mod h1:gC3YqxQTPyIXvQrw/Vpt3a8VqR1MO8sVpZFWN4DGwNs= github.com/go-asn1-ber/asn1-ber v1.5.5 h1:MNHlNMBDgEKD4TcKr36vQN68BA00aDfjIt3/bD50WnA= github.com/go-asn1-ber/asn1-ber v1.5.5/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE= @@ -1553,8 +1551,8 @@ github.com/secure-systems-lab/go-securesystemslib v0.6.0 h1:T65atpAVCJQK14UA57LM github.com/secure-systems-lab/go-securesystemslib v0.6.0/go.mod h1:8Mtpo9JKks/qhPG4HGZ2LGMvrPbzuxwfz/f/zLfEWkk= github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c= github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= +github.com/sergi/go-diff v1.4.0 
h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b h1:h+3JX2VoWTFuyQEo87pStk/a99dzIO1mM9KxIyLPGTU= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= @@ -1680,8 +1678,6 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9/go.mod h1:KpEWZJMLwbdMHeHQz9rbkES0vRrx4nk6OQXyhlHb9/8= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260124000807-bff5e296dfb7 h1:nC/FJN5iwh/zD5u8R6qwhkx60c/83E9f6EnRonr/RG8= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260124000807-bff5e296dfb7/go.mod h1:FbqbTFP9aBvE/2GDmfcFr/03HEWkzjP7OMmxdib26aY= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1 h1:JijOMT/94w/mt2q69vBQodliDlVfe+jqeaSTQJP3uxo= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1/go.mod h1:IQC7fXKDsFjD1vb0Jh83WWY4BCFhN1fkcn+z3oSuFIA= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.20 h1:8D2DUnn7mLUZOLhPDGGFKKvBrgU6LQd00tq2VOprvfI= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.20/go.mod h1:98jNYBOPuKWJw9a8x0LgQuudp5enrHhQQP5Hq0YwRB8= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/agent/deploy.go index 68fb1de3f68..12e4a2d445c 100644 --- a/system-tests/lib/cre/environment/agent/deploy.go +++ b/system-tests/lib/cre/environment/agent/deploy.go @@ -3,10 +3,13 @@ 
package agent import ( "context" "fmt" + "strings" + dockerclient "github.com/docker/docker/client" pkgerrors "github.com/pkg/errors" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) @@ -48,3 +51,48 @@ func DeployBlockchainComponent( return deployed.CtfOutput(), nil } + +func DeployJDComponent(ctx context.Context, input *jd.Input) (*jd.Output, error) { + if input == nil { + return nil, pkgerrors.New("jd input is nil") + } + if err := ensureJDImagePresent(ctx, input.Image); err != nil { + return nil, err + } + + effectiveInput, err := buildRemoteJDInput(input) + if err != nil { + return nil, err + } + output, err := jd.NewWithContext(ctx, effectiveInput) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to deploy jd component") + } + return output, nil +} + +func buildRemoteJDInput(input *jd.Input) (*jd.Input, error) { + jdInput := *input + // Remote agent deployments require Docker service discovery (jd -> jd-db), + // so keep Docker embedded DNS instead of isolated localhost DNS. 
+ jdInput.DisableDNSIsolation = true + + return &jdInput, nil +} + +func ensureJDImagePresent(ctx context.Context, image string) error { + if strings.TrimSpace(image) == "" { + return nil + } + + client, err := dockerclient.NewClientWithOpts(dockerclient.WithAPIVersionNegotiation()) + if err != nil { + return pkgerrors.Wrap(err, "failed to create docker client for jd image check") + } + defer client.Close() + + if _, err := client.ImageInspect(ctx, image); err != nil { + return fmt.Errorf("jd image %q is not available on remote host; please preload it before starting remote jd", image) + } + return nil +} diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index c32b894d77f..e47093587f5 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -10,21 +10,31 @@ import ( "io" "net/http" "os" + "slices" "strings" "sync" + "time" + cerrdefs "github.com/containerd/errdefs" + "github.com/docker/docker/api/types/container" + dockerevents "github.com/docker/docker/api/types/events" + "github.com/docker/docker/api/types/filters" + dockerclient "github.com/docker/docker/client" "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) const ( SchemaVersionV1 = "v1" OperationStartComponent = "StartComponent" + OperationStopComponent = "StopComponent" OperationHealth = "Health" ComponentTypeBlockchain = "blockchain" + ComponentTypeJD = "jd" ErrCodeMethodNotAllowed = "method_not_allowed" ErrCodeInvalidRequestBody = "invalid_request_body" @@ -48,14 +58,18 @@ type StartComponentEnvelope struct { Payload json.RawMessage `json:"payload"` } -type StartBlockchainPayload 
struct { +type StartComponentPayload struct { ComponentType string `json:"componentType"` Blockchain *blockchain.Input `json:"blockchain"` + JD *jd.Input `json:"jd"` ReusePolicy string `json:"reusePolicy,omitempty"` } type StartComponentResponse struct { - BlockchainOutput map[string]any `json:"blockchainOutput,omitempty"` + ComponentType string `json:"componentType,omitempty"` + Output map[string]any `json:"output,omitempty"` + Found bool `json:"found,omitempty"` + Stopped bool `json:"stopped,omitempty"` AgentLogs []string `json:"agentLogs,omitempty"` ErrorCode string `json:"errorCode,omitempty"` Error string `json:"error,omitempty"` @@ -64,8 +78,10 @@ type StartComponentResponse struct { type Server struct { lggr zerolog.Logger deployers map[blockchain.ChainFamily]blockchains.Deployer + lifecycleMu sync.Mutex cacheMu sync.Mutex cache map[string]cachedStart + runtime map[string]runtimeState } type cachedStart struct { @@ -73,11 +89,17 @@ type cachedStart struct { Output map[string]any } +type runtimeState struct { + ComponentType string + ContainerIDs []string +} + func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockchains.Deployer) *Server { return &Server{ lggr: lggr, deployers: deployers, cache: make(map[string]cachedStart), + runtime: make(map[string]runtimeState), } } @@ -109,70 +131,155 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedSchema, fmt.Sprintf("unsupported schema version: %s", envelope.SchemaVersion), nil) return } - if envelope.Operation != OperationStartComponent { - s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedOperation, fmt.Sprintf("unsupported operation: %s", envelope.Operation), nil) - return - } - - var payload StartBlockchainPayload + var payload StartComponentPayload if err := json.Unmarshal(envelope.Payload, &payload); err != nil { s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, 
fmt.Sprintf("invalid payload: %v", err), nil) return } - if payload.ComponentType != ComponentTypeBlockchain { + if payload.ComponentType != ComponentTypeBlockchain && payload.ComponentType != ComponentTypeJD { s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedComponent, fmt.Sprintf("unsupported component type: %s", payload.ComponentType), nil) return } - if payload.Blockchain == nil { - s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "blockchain payload is required", nil) + + componentKey, inputErr := componentCacheKey(payload) + if inputErr != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, inputErr.Error(), nil) + return + } + if envelope.Operation == OperationStopComponent { + s.stopComponentByKey(w, r, payload.ComponentType, componentKey) + return + } + if envelope.Operation != OperationStartComponent { + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedOperation, fmt.Sprintf("unsupported operation: %s", envelope.Operation), nil) return } - componentKey := fmt.Sprintf("%s:%s:%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID) payloadHash := hashPayload(envelope.Payload) // Keep this stderr write explicit so startup behavior is visible when agent runs as a subprocess. 
- requestLog := fmt.Sprintf("[cre-agent] starting component type=%s blockchain=%s chain_id=%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID) + requestLog := fmt.Sprintf("[cre-agent] starting component type=%s key=%s", payload.ComponentType, componentKey) _, _ = fmt.Fprintln(os.Stderr, requestLog) - if shouldReuseRemoteStart(payload.ReusePolicy) { + preStartLogs := make([]string, 0, 2) + s.lifecycleMu.Lock() + defer s.lifecycleMu.Unlock() + if shouldRestartBeforeStart(payload.ComponentType, payload.ReusePolicy) { + stopped, stopErr := s.stopTrackedComponentLocked(r.Context(), componentKey) + if stopErr != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to stop existing component before start: %v", stopErr), []string{requestLog}) + return + } + if stopped { + preStartLogs = append(preStartLogs, "[cre-agent] stopped existing component before start") + } else { + preStartLogs = append(preStartLogs, "[cre-agent] no existing component to stop before start") + } + } + if shouldReuseRemoteStart(payload.ComponentType, payload.ReusePolicy) { if cached, ok := s.lookupCachedStart(componentKey, payloadHash); ok { reuseLog := fmt.Sprintf("[cre-agent] reusing existing component for key=%s (payload hash matched)", componentKey) _, _ = fmt.Fprintln(os.Stderr, reuseLog) s.respondJSON(w, http.StatusOK, StartComponentResponse{ - BlockchainOutput: cached.Output, - AgentLogs: []string{requestLog, reuseLog}, + ComponentType: payload.ComponentType, + Output: cached.Output, + AgentLogs: []string{requestLog, reuseLog}, }) return } } - var startedOutput *blockchain.Output - capturedFrameworkLogs, startErr := captureFrameworkLogs(func() error { - deployed, err := DeployBlockchainComponent(r.Context(), s.deployers, payload.Blockchain) - if err != nil { - return err - } - startedOutput = deployed - return nil - }) - - agentLogs := make([]string, 0, 1+len(capturedFrameworkLogs)) + agentLogs := make([]string, 0, 8) 
agentLogs = append(agentLogs, requestLog) - agentLogs = append(agentLogs, capturedFrameworkLogs...) + agentLogs = append(agentLogs, preStartLogs...) + var blockchainOutput *blockchain.Output + var jdOutput *jd.Output + trackedContainers, startErr := s.discoverOwnedContainers(r.Context(), func() error { + capturedFrameworkLogs, runErr := captureFrameworkLogs(func() error { + switch payload.ComponentType { + case ComponentTypeBlockchain: + deployed, err := DeployBlockchainComponent(r.Context(), s.deployers, payload.Blockchain) + if err != nil { + return err + } + blockchainOutput = deployed + case ComponentTypeJD: + deployed, err := DeployJDComponent(r.Context(), payload.JD) + if err != nil { + return err + } + jdOutput = deployed + } + return nil + }) + agentLogs = append(agentLogs, capturedFrameworkLogs...) + return runErr + }) if startErr != nil { + if len(trackedContainers) > 0 { + cleanupErr := stopContainers(r.Context(), trackedContainers) + if cleanupErr != nil { + agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] failed startup cleanup for %d tracked container(s): %v", len(trackedContainers), cleanupErr)) + } else { + agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] cleaned up %d tracked container(s) after failed startup", len(trackedContainers))) + } + } s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, startErr.Error(), agentLogs) return } - safeOutput, encErr := EncodeForTransport(startedOutput) + var output map[string]any + var encErr error + if blockchainOutput != nil { + output, encErr = EncodeForTransport(blockchainOutput) + } else if jdOutput != nil { + output, encErr = EncodeForTransport(jdOutput) + } if encErr != nil { s.respondError(w, http.StatusInternalServerError, ErrCodeTransportEncodeFailed, encErr.Error(), agentLogs) return } - s.cacheSuccessfulStart(componentKey, payloadHash, safeOutput) + if shouldReuseRemoteStart(payload.ComponentType, payload.ReusePolicy) { + s.cacheSuccessfulStart(componentKey, 
payloadHash, output) + } + s.storeRuntime(componentKey, runtimeState{ + ComponentType: payload.ComponentType, + ContainerIDs: trackedContainers, + }) + s.respondJSON(w, http.StatusOK, StartComponentResponse{ + ComponentType: payload.ComponentType, + Output: output, + AgentLogs: agentLogs, + }) +} + +func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, componentType, componentKey string) { + s.lifecycleMu.Lock() + defer s.lifecycleMu.Unlock() + + requestLog := fmt.Sprintf("[cre-agent] stopping component type=%s key=%s", componentType, componentKey) + _, _ = fmt.Fprintln(os.Stderr, requestLog) + + stopped, err := s.stopTrackedComponentLocked(r.Context(), componentKey) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to stop component containers: %v", err), []string{requestLog}) + return + } + if !stopped { + s.deleteCachedStart(componentKey) + s.respondJSON(w, http.StatusOK, StartComponentResponse{ + ComponentType: componentType, + Found: false, + Stopped: false, + AgentLogs: []string{requestLog, "[cre-agent] nothing to stop (component not found)"}, + }) + return + } + s.deleteCachedStart(componentKey) s.respondJSON(w, http.StatusOK, StartComponentResponse{ - BlockchainOutput: safeOutput, - AgentLogs: agentLogs, + ComponentType: componentType, + Found: true, + Stopped: true, + AgentLogs: []string{requestLog, "[cre-agent] stopped existing component"}, }) } @@ -235,7 +342,32 @@ func (s *Server) cacheSuccessfulStart(componentKey, payloadHash string, output m } } -func shouldReuseRemoteStart(policy string) bool { +func (s *Server) deleteCachedStart(componentKey string) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + delete(s.cache, componentKey) +} + +func (s *Server) storeRuntime(componentKey string, state runtimeState) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + s.runtime[componentKey] = state +} + +func (s *Server) takeRuntime(componentKey string) (runtimeState, bool) { + 
s.cacheMu.Lock() + defer s.cacheMu.Unlock() + state, ok := s.runtime[componentKey] + if ok { + delete(s.runtime, componentKey) + } + return state, ok +} + +func shouldReuseRemoteStart(componentType, policy string) bool { + if componentType == ComponentTypeJD { + return false + } normalized := strings.TrimSpace(strings.ToLower(policy)) if normalized == "" { normalized = RemoteStartPolicyReuseIdentical @@ -243,11 +375,169 @@ func shouldReuseRemoteStart(policy string) bool { return normalized == RemoteStartPolicyReuseIdentical } +func shouldRestartBeforeStart(componentType, policy string) bool { + if componentType == ComponentTypeJD { + return true + } + normalized := strings.TrimSpace(strings.ToLower(policy)) + return normalized == RemoteStartPolicyAlways +} + +func (s *Server) stopTrackedComponentLocked(ctx context.Context, componentKey string) (bool, error) { + state, ok := s.takeRuntime(componentKey) + if !ok { + return false, nil + } + if err := stopContainers(ctx, state.ContainerIDs); err != nil { + return false, err + } + return true, nil +} + +func (s *Server) discoverOwnedContainers(ctx context.Context, fn func() error) ([]string, error) { + client, err := dockerclient.NewClientWithOpts(dockerclient.WithAPIVersionNegotiation()) + if err != nil { + s.lggr.Warn().Err(err).Msg("Docker unavailable for component ownership tracking; continuing without tracked dependencies") + if runErr := fn(); runErr != nil { + return nil, runErr + } + return []string{}, nil + } + defer client.Close() + + before, err := listContainerIDSet(ctx, client) + if err != nil { + return nil, err + } + + eventsCtx, cancelEvents := context.WithCancel(ctx) + defer cancelEvents() + events, errs := client.Events(eventsCtx, dockerevents.ListOptions{ + Filters: filters.NewArgs(filters.Arg("type", "container")), + }) + + var wg sync.WaitGroup + eventIDs := make([]string, 0) + var eventMu sync.Mutex + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case msg, ok := <-events: + if !ok 
{ + return + } + if msg.Action == "create" || msg.Action == "start" { + eventMu.Lock() + eventIDs = append(eventIDs, msg.ID) + eventMu.Unlock() + } + case err, ok := <-errs: + if !ok || err == nil { + return + } + return + case <-eventsCtx.Done(): + return + } + } + }() + + runErr := fn() + time.Sleep(150 * time.Millisecond) + cancelEvents() + wg.Wait() + + after, err := listContainerIDSet(ctx, client) + if err != nil { + if runErr != nil { + return nil, runErr + } + return nil, err + } + + owned := make([]string, 0) + seen := make(map[string]struct{}) + for id := range after { + if _, existed := before[id]; existed { + continue + } + owned = append(owned, id) + seen[id] = struct{}{} + } + eventMu.Lock() + for _, id := range eventIDs { + if _, ok := after[id]; !ok { + continue + } + if _, ok := seen[id]; ok { + continue + } + owned = append(owned, id) + seen[id] = struct{}{} + } + eventMu.Unlock() + slices.Sort(owned) + if runErr != nil { + return owned, runErr + } + return owned, nil +} + +func listContainerIDSet(ctx context.Context, client *dockerclient.Client) (map[string]struct{}, error) { + containers, err := client.ContainerList(ctx, container.ListOptions{All: true}) + if err != nil { + return nil, fmt.Errorf("failed to list docker containers: %w", err) + } + ids := make(map[string]struct{}, len(containers)) + for _, c := range containers { + ids[c.ID] = struct{}{} + } + return ids, nil +} + +func stopContainers(ctx context.Context, ids []string) error { + if len(ids) == 0 { + return nil + } + client, err := dockerclient.NewClientWithOpts(dockerclient.WithAPIVersionNegotiation()) + if err != nil { + return fmt.Errorf("failed to create docker client for stop: %w", err) + } + defer client.Close() + + for i := len(ids) - 1; i >= 0; i-- { + err := client.ContainerRemove(ctx, ids[i], container.RemoveOptions{Force: true}) + if err != nil && !cerrdefs.IsNotFound(err) { + return fmt.Errorf("failed to remove container %s: %w", ids[i], err) + } + } + return nil +} + 
func hashPayload(payload []byte) string { sum := sha256.Sum256(payload) return hex.EncodeToString(sum[:]) } +func componentCacheKey(payload StartComponentPayload) (string, error) { + switch payload.ComponentType { + case ComponentTypeBlockchain: + if payload.Blockchain == nil { + return "", fmt.Errorf("blockchain payload is required") + } + return fmt.Sprintf("%s:%s:%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID), nil + case ComponentTypeJD: + if payload.JD == nil { + return "", fmt.Errorf("jd payload is required") + } + return fmt.Sprintf("%s:%s", payload.ComponentType, payload.JD.Image), nil + default: + return "", fmt.Errorf("unsupported component type: %s", payload.ComponentType) + } +} + func Run(ctx context.Context, addr string, srv *Server) error { httpSrv := &http.Server{ Addr: addr, diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/agent/server_test.go index a1154d7c595..21855dc3a5d 100644 --- a/system-tests/lib/cre/environment/agent/server_test.go +++ b/system-tests/lib/cre/environment/agent/server_test.go @@ -36,7 +36,7 @@ func TestStartComponentReturnsErrorCodeForUnsupportedComponent(t *testing.T) { server := NewServer(zerolog.Nop(), nil) handler := server.Handler() - body := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":{"componentType":"jd"}}`) + body := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":{"componentType":"not-supported"}}`) req := httptest.NewRequest(http.MethodPost, "/v1/components/start", body) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() @@ -142,3 +142,81 @@ func TestStartComponentAlwaysPolicyDisablesReuse(t *testing.T) { t.Fatalf("expected deployer to be called twice with always policy, got %d", deployer.calls) } } + +func TestStartComponentRequiresJDPayload(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + handler := 
server.Handler() + + body := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":{"componentType":"jd"}}`) + req := httptest.NewRequest(http.MethodPost, "/v1/components/start", body) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + + handler.ServeHTTP(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("expected bad request, got %d", rr.Code) + } + if !strings.Contains(rr.Body.String(), ErrCodeMissingComponentInput) { + t.Fatalf("expected missing input error code in response, got body: %s", rr.Body.String()) + } +} + +func TestShouldReuseRemoteStartDisablesJDReuse(t *testing.T) { + if shouldReuseRemoteStart(ComponentTypeJD, RemoteStartPolicyReuseIdentical) { + t.Fatal("expected JD reuse to be hard disabled") + } + if !shouldReuseRemoteStart(ComponentTypeBlockchain, "") { + t.Fatal("expected blockchain reuse to default to enabled") + } +} + +func TestStopComponentIdempotent(t *testing.T) { + deployer := &fakeOutputDeployer{} + server := NewServer(zerolog.Nop(), map[blockchain.ChainFamily]blockchains.Deployer{ + blockchain.FamilyEVM: deployer, + }) + handler := server.Handler() + + startPayload := `{"componentType":"blockchain","blockchain":{"type":"anvil","chain_id":"1337"}}` + startBody := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StartComponent","payload":` + startPayload + `}`) + startReq := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(startBody.Bytes())) + startReq.Header.Set("Content-Type", "application/json") + startRR := httptest.NewRecorder() + handler.ServeHTTP(startRR, startReq) + if startRR.Code != http.StatusOK { + t.Fatalf("expected start request OK, got %d: %s", startRR.Code, startRR.Body.String()) + } + + stopBody := bytes.NewBufferString(`{"schemaVersion":"v1","operation":"StopComponent","payload":` + startPayload + `}`) + stopReq1 := httptest.NewRequest(http.MethodPost, "/v1/components/start", 
bytes.NewReader(stopBody.Bytes())) + stopReq1.Header.Set("Content-Type", "application/json") + stopRR1 := httptest.NewRecorder() + handler.ServeHTTP(stopRR1, stopReq1) + if stopRR1.Code != http.StatusOK { + t.Fatalf("expected first stop request OK, got %d: %s", stopRR1.Code, stopRR1.Body.String()) + } + + var stopResp1 StartComponentResponse + if err := json.Unmarshal(stopRR1.Body.Bytes(), &stopResp1); err != nil { + t.Fatalf("failed to decode first stop response: %v", err) + } + if !stopResp1.Found || !stopResp1.Stopped { + t.Fatalf("expected first stop to find and stop component, got found=%v stopped=%v", stopResp1.Found, stopResp1.Stopped) + } + + stopReq2 := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(stopBody.Bytes())) + stopReq2.Header.Set("Content-Type", "application/json") + stopRR2 := httptest.NewRecorder() + handler.ServeHTTP(stopRR2, stopReq2) + if stopRR2.Code != http.StatusOK { + t.Fatalf("expected second stop request OK, got %d: %s", stopRR2.Code, stopRR2.Body.String()) + } + + var stopResp2 StartComponentResponse + if err := json.Unmarshal(stopRR2.Body.Bytes(), &stopResp2); err != nil { + t.Fatalf("failed to decode second stop response: %v", err) + } + if stopResp2.Found || stopResp2.Stopped { + t.Fatalf("expected second stop to be no-op, got found=%v stopped=%v", stopResp2.Found, stopResp2.Stopped) + } +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index b81f8fcbb83..2c30a3ddc62 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -22,6 +22,7 @@ import ( cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/adapters" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" @@ -32,6 +33,7 @@ import ( const ( componentTypeBlockchain = "blockchain" + componentTypeJD = "jd" envLocalAgentURL = "CRE_LOCAL_AGENT_URL" envEC2AgentURL = "CRE_EC2_AGENT_URL" envEC2InstanceID = "CRE_EC2_INSTANCE_ID" @@ -47,21 +49,25 @@ type startComponentEnvelope struct { Payload json.RawMessage `json:"payload"` } -type startBlockchainRequest struct { +type startComponentRequest struct { ComponentType string `json:"componentType"` Blockchain *blockchain.Input `json:"blockchain"` + JD *jd.Input `json:"jd"` ReusePolicy string `json:"reusePolicy,omitempty"` } -type startBlockchainResult struct { - BlockchainOutput map[string]any `json:"blockchainOutput"` +type startComponentResult struct { + ComponentType string `json:"componentType"` + Output map[string]any `json:"output"` + Found bool `json:"found"` + Stopped bool `json:"stopped"` AgentLogs []string `json:"agentLogs"` ErrorCode string `json:"errorCode"` Error string `json:"error"` } type componentClient interface { - StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) + StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) } type httpComponentClient struct { @@ -96,14 +102,14 @@ func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { } } -func (c *httpComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { +func (c *httpComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) { if c.checkHealth { if err := c.waitForHealth(ctx); err != nil { return nil, err } } - 
var result *startBlockchainResult + var result *startComponentResult err := retry.Do( func() error { var err error @@ -122,7 +128,7 @@ func (c *httpComponentClient) StartComponent(ctx context.Context, envelope start return result, nil } -func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope startComponentEnvelope) (*startBlockchainResult, error) { +func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) { body, err := json.Marshal(envelope) if err != nil { return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to encode start component envelope")) @@ -148,7 +154,7 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope s return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to read start component response")) } - var startResp startBlockchainResult + var startResp startComponentResult if len(respBody) > 0 { if err := json.Unmarshal(respBody, &startResp); err != nil { return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start component response")) @@ -186,18 +192,18 @@ func (c *httpComponentClient) waitForHealth(ctx context.Context) error { func() error { req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+"/v1/health", nil) if err != nil { - return retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create health request")) + return retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create EC2 agent health request")) } resp, err := c.client.Do(req) if err != nil { - return pkgerrors.Wrap(err, "failed to execute health request") + return pkgerrors.Wrap(err, describeEC2AgentHealthFailure(c.baseURL)) } _ = resp.Body.Close() if resp.StatusCode == http.StatusOK { return nil } - return fmt.Errorf("agent health check returned status %s", resp.Status) + return fmt.Errorf("%s: status %s", describeEC2AgentHealthFailure(c.baseURL), resp.Status) }, retry.Attempts(uint(c.maxAttempts)), 
retry.Delay(c.retryDelay), @@ -206,6 +212,15 @@ func (c *httpComponentClient) waitForHealth(ctx context.Context) error { ) } +func describeEC2AgentHealthFailure(baseURL string) string { + return fmt.Sprintf( + "failed EC2 CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", + baseURL, + envEC2AgentPort, + envEC2AgentURL, + ) +} + func isRetriableStatus(statusCode int) bool { return statusCode == http.StatusBadGateway || statusCode == http.StatusServiceUnavailable || statusCode == http.StatusGatewayTimeout } @@ -383,7 +398,7 @@ func startBlockchainsWithTargets( return nil, err } - payload := startBlockchainRequest{ + payload := startComponentRequest{ ComponentType: componentTypeBlockchain, Blockchain: input, ReusePolicy: string(configured.RemoteStartPolicy), @@ -401,6 +416,9 @@ func startBlockchainsWithTargets( if err != nil { return nil, err } + if response.ComponentType != componentTypeBlockchain { + return nil, fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) + } for _, logLine := range response.AgentLogs { pretty := prettifyAgentLogLine(logLine) if pretty == "" { @@ -408,8 +426,7 @@ func startBlockchainsWithTargets( } testLogger.Info().Msgf("[agent] %s", pretty) } - - blockchainOutput, err := agent.DecodeFromTransport[blockchain.Output](response.BlockchainOutput) + blockchainOutput, err := agent.DecodeFromTransport[blockchain.Output](response.Output) if err != nil { return nil, pkgerrors.Wrap(err, "failed to decode blockchain transport payload") } diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index bcad49a1812..e4e337e0b7a 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -59,7 +59,7 @@ func (c *Config) SetAddresses(refs []datastore.AddressRef) error { type Config struct { Blockchains []*Blockchain 
`toml:"blockchains" validate:"required"` NodeSets []*cre.NodeSet `toml:"nodesets" validate:"required"` - JD *jd.Input `toml:"jd" validate:"required"` + JD *JobDistributor `toml:"jd" validate:"required"` Infra *infra.Provider `toml:"infra" validate:"required"` Fake *fake.Input `toml:"fake" validate:"required"` FakeHTTP *fake.Input `toml:"fake_http" validate:"required"` @@ -93,6 +93,14 @@ type Blockchain struct { RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } +// JobDistributor wraps the existing CTF JD input and adds placement metadata. +// The embedded input keeps TOML fields backward-compatible. +type JobDistributor struct { + jd.Input + Target ComponentTarget `toml:"target"` + RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` +} + func (b *Blockchain) Normalize() { if b.Target == "" { b.Target = TargetDocker @@ -125,6 +133,38 @@ func (b *Blockchain) InputRef() *blockchain.Input { return &b.Input } +func (j *JobDistributor) Normalize() { + if j.Target == "" { + j.Target = TargetDocker + } + if j.RemoteStartPolicy == "" { + j.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical + } +} + +func (j *JobDistributor) Validate() error { + if j == nil { + return errors.New("jd is nil") + } + + j.Normalize() + if j.Target != TargetDocker && j.Target != TargetRemote { + return fmt.Errorf("invalid jd target: %s", j.Target) + } + if j.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && j.RemoteStartPolicy != RemoteStartPolicyAlways { + return fmt.Errorf("invalid jd remote_start_policy: %s", j.RemoteStartPolicy) + } + + return nil +} + +func (j *JobDistributor) InputRef() *jd.Input { + if j == nil { + return nil + } + return &j.Input +} + func (c *Config) EffectiveBlockchains() ([]*blockchain.Input, error) { return ResolveBlockchainInputs(c.Blockchains) } @@ -147,6 +187,13 @@ func ResolveBlockchainInputs(blockchains []*Blockchain) ([]*blockchain.Input, er // Validate performs validation checks on the configuration, ensuring all 
required fields // are present and all referenced capabilities are known to the system. func (c *Config) Validate(envDependencies cre.CLIEnvironmentDependencies) error { + if c.JD == nil { + return errors.New("jd configuration must be provided") + } + if err := c.JD.Validate(); err != nil { + return err + } + if c.JD.CSAEncryptionKey == "" { return errors.New("jd.csa_encryption_key must be provided") } diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 680be009b24..d3b502b11a6 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -84,7 +84,7 @@ func (s *SetupOutput) TunnelBindings() []tunnel.TunnelBinding { type SetupInput struct { NodeSets []*cre.NodeSet Blockchains []*config.Blockchain - JdInput *jd.Input + JdInput *config.JobDistributor Provider infra.Provider ContractVersions map[cre.ContractType]*semver.Version WithV2Registries bool @@ -123,6 +123,9 @@ func (s *SetupInput) Validate() error { if s.JdInput == nil { return pkgerrors.New("jd input is nil") } + if err := s.JdInput.Validate(); err != nil { + return pkgerrors.Wrap(err, "jd input validation failed") + } return nil } @@ -261,7 +264,7 @@ func SetupTestEnvironment( jdStartedFuture := queue.SubmitAny(func(ctx context.Context) (any, error) { // TODO: pass context after we update the CTF to accept context, when creating new JD instance - jdOutput, startJDErr := StartJD(ctx, testLogger, *input.JdInput, input.Provider) + jdOutput, startJDErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager) if startJDErr != nil { return nil, pkgerrors.Wrap(startJDErr, "failed to start Job Distributor") } diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index a12d8f3752c..461100402d5 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -2,8 +2,12 @@ package environment import ( 
"context" + "encoding/json" "errors" "fmt" + "net" + "net/url" + "strconv" "strings" "time" @@ -15,8 +19,12 @@ import ( "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -54,14 +62,62 @@ func getJDCredentials(lggr zerolog.Logger, infraInput infra.Provider, jdOutput * return creds } -func StartJD(ctx context.Context, lggr zerolog.Logger, jdInput jd.Input, infraInput infra.Provider) (*StartedJD, error) { +func StartJD( + ctx context.Context, + lggr zerolog.Logger, + jdConfig *config.JobDistributor, + infraInput infra.Provider, + tunnelManager tunnel.Manager, +) (*StartedJD, error) { startTime := time.Now() lggr.Info().Msg("Starting Job Distributor") + if jdConfig == nil { + return nil, errors.New("jd configuration is nil") + } var jdOutput *jd.Output var jdErr error - if infraInput.IsKubernetes() { + if jdConfig.Target == config.TargetRemote { + startClient, err := newStartComponentClient(lggr, tunnelManager) + if err != nil { + return nil, err + } + payload := startComponentRequest{ + ComponentType: componentTypeJD, + JD: jdConfig.InputRef(), + ReusePolicy: string(jdConfig.RemoteStartPolicy), + } + payloadBytes, err := json.Marshal(payload) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to encode jd payload") + } + response, err := startClient.StartComponent(ctx, startComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: payloadBytes, + }) + if err != nil { + return nil, err + } + if response.ComponentType 
!= componentTypeJD { + return nil, fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) + } + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + lggr.Info().Msgf("[agent] %s", pretty) + } + jdOutput, err = agent.DecodeFromTransport[jd.Output](response.Output) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to decode jd transport payload") + } + if err := rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput); err != nil { + return nil, err + } + } else if infraInput.IsKubernetes() { // For Kubernetes, JD is already running in the cluster, generate service URLs lggr.Info().Msg("Generating Kubernetes service URLs for Job Distributor (already running in cluster)") jdOutput, jdErr = infra.GenerateKubernetesJDOutput(&infraInput, lggr) @@ -72,9 +128,9 @@ func StartJD(ctx context.Context, lggr zerolog.Logger, jdInput jd.Input, infraIn // Only start JD container for Docker provider if jdOutput == nil { - jdOutput, jdErr = jd.NewWithContext(ctx, &jdInput) + jdOutput, jdErr = jd.NewWithContext(ctx, jdConfig.InputRef()) if jdErr != nil { - jdErr = fmt.Errorf("failed to start JD container for image %s: %w", jdInput.Image, jdErr) + jdErr = fmt.Errorf("failed to start JD container for image %s: %w", jdConfig.Image, jdErr) // useful end user messages if strings.Contains(jdErr.Error(), "pull access denied") || strings.Contains(jdErr.Error(), "may require 'docker login'") { @@ -90,7 +146,7 @@ func StartJD(ctx context.Context, lggr zerolog.Logger, jdInput jd.Input, infraIn // Configure gRPC credentials for JD connection creds := getJDCredentials(lggr, infraInput, jdOutput) - jdConfig := cldf_jd.JDConfig{ + jdClientConfig := cldf_jd.JDConfig{ GRPC: jdOutput.ExternalGRPCUrl, WSRPC: jdOutput.InternalWSRPCUrl, Creds: creds, @@ -98,7 +154,7 @@ func StartJD(ctx context.Context, lggr zerolog.Logger, jdInput jd.Input, infraIn lggr.Info().Msgf("Connecting to 
JD GRPC at: %s", jdOutput.ExternalGRPCUrl) - jdClient, jdErr := cldf_jd.NewJDClient(jdConfig) + jdClient, jdErr := cldf_jd.NewJDClient(jdClientConfig) if jdErr != nil { return nil, pkgerrors.Wrap(jdErr, "failed to create JD client") } @@ -110,3 +166,131 @@ func StartJD(ctx context.Context, lggr zerolog.Logger, jdInput jd.Input, infraIn Client: jdClient, }, nil } + +func rewriteRemoteJDOutputForLocalAccess( + ctx context.Context, + lggr zerolog.Logger, + tunnelManager tunnel.Manager, + output *jd.Output, +) error { + if output == nil { + return nil + } + if tunnelManager == nil { + return errors.New("tunnel manager is required for remote jd target") + } + + refs, err := describeJDEndpoints(output) + if err != nil { + return pkgerrors.Wrap(err, "failed to describe jd tunnel endpoints") + } + bindings, err := tunnelManager.Start(ctx, refs) + if err != nil { + return pkgerrors.Wrap(err, "failed to start tunnels for jd output") + } + for _, binding := range bindings { + lggr.Info(). + Str("componentID", binding.ComponentID). + Str("endpointName", binding.EndpointName). + Str("originalURL", binding.OriginalURL). + Str("localURL", binding.LocalURL). 
+ Msg("Established endpoint tunnel") + } + return rewriteJDWithBindings(output, bindings) +} + +func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { + refs := make([]tunnel.EndpointRef, 0, 2) + componentID := tunnel.CanonicalComponentID(tunnel.KindJD, 0, "job-distributor") + + grpcRef, err := jdEndpointFromAddress(componentID, "grpc", output.ExternalGRPCUrl) + if err != nil { + return nil, err + } + if grpcRef != nil { + refs = append(refs, *grpcRef) + } + + wsrpcRef, err := jdEndpointFromAddress(componentID, "wsrpc", output.ExternalWSRPCUrl) + if err != nil { + return nil, err + } + if wsrpcRef != nil { + refs = append(refs, *wsrpcRef) + } + + return refs, nil +} + +func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) error { + byName := make(map[string]tunnel.TunnelBinding, len(bindings)) + for _, binding := range bindings { + byName[binding.EndpointName] = binding + } + + if output.ExternalGRPCUrl != "" { + binding, ok := byName["grpc"] + if !ok { + return fmt.Errorf("missing tunnel binding for jd grpc endpoint") + } + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + output.ExternalGRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) + output.InternalGRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) + } + + if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl != "" { + binding, ok := byName["wsrpc"] + if !ok { + return fmt.Errorf("missing tunnel binding for jd wsrpc endpoint") + } + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + output.InternalWSRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) + output.ExternalWSRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) + } + + return nil +} + +func jdEndpointFromAddress(componentID, endpointName, rawAddress string) (*tunnel.EndpointRef, error) { + trimmed := strings.TrimSpace(rawAddress) + if trimmed 
== "" { + return nil, nil + } + + host := "" + port := "" + + if strings.Contains(trimmed, "://") { + parsedURL, err := url.Parse(trimmed) + if err != nil { + return nil, fmt.Errorf("failed to parse jd endpoint %q: %w", rawAddress, err) + } + host = parsedURL.Hostname() + port = parsedURL.Port() + } else { + parsedHost, parsedPort, err := net.SplitHostPort(trimmed) + if err != nil { + return nil, fmt.Errorf("failed to parse jd host:port endpoint %q: %w", rawAddress, err) + } + host = parsedHost + port = parsedPort + } + + if host == "" || port == "" { + return nil, fmt.Errorf("jd endpoint %q must contain host and port", rawAddress) + } + + portNumber, err := strconv.Atoi(port) + if err != nil || portNumber <= 0 || portNumber > 65535 { + return nil, fmt.Errorf("jd endpoint %q has invalid port %q", rawAddress, port) + } + + return &tunnel.EndpointRef{ + ComponentID: componentID, + EndpointName: endpointName, + Scheme: "tcp", + Host: host, + Port: portNumber, + OriginalURL: trimmed, + }, nil +} diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go new file mode 100644 index 00000000000..4c3f416dc3a --- /dev/null +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -0,0 +1,68 @@ +package environment + +import ( + "strings" + "testing" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +func TestDescribeJDEndpointsUsesExternalWSRPC(t *testing.T) { + output := &jd.Output{ + ExternalGRPCUrl: "127.0.0.1:14231", + ExternalWSRPCUrl: "127.0.0.1:8080", + InternalWSRPCUrl: "job-distributor:8080", + } + + refs, err := describeJDEndpoints(output) + if err != nil { + t.Fatalf("describeJDEndpoints returned error: %v", err) + } + if len(refs) != 2 { + t.Fatalf("expected 2 endpoint refs, got %d", len(refs)) + } + + var wsrpcRef *tunnel.EndpointRef + for i := range refs { + if refs[i].EndpointName == "wsrpc" { 
+ wsrpcRef = &refs[i] + break + } + } + if wsrpcRef == nil { + t.Fatal("missing wsrpc endpoint ref") + } + if wsrpcRef.Host != "127.0.0.1" || wsrpcRef.Port != 8080 { + t.Fatalf("expected wsrpc endpoint to use external address 127.0.0.1:8080, got %s:%d", wsrpcRef.Host, wsrpcRef.Port) + } +} + +func TestRewriteJDWithBindingsRewritesNodeFacingWSRPC(t *testing.T) { + output := &jd.Output{ + ExternalGRPCUrl: "127.0.0.1:14231", + ExternalWSRPCUrl: "127.0.0.1:8080", + InternalWSRPCUrl: "job-distributor:8080", + } + bindings := []tunnel.TunnelBinding{ + { + EndpointRef: tunnel.EndpointRef{EndpointName: "grpc"}, + LocalPort: 61001, + }, + { + EndpointRef: tunnel.EndpointRef{EndpointName: "wsrpc"}, + LocalPort: 61002, + }, + } + + if err := rewriteJDWithBindings(output, bindings); err != nil { + t.Fatalf("rewriteJDWithBindings returned error: %v", err) + } + + if output.ExternalWSRPCUrl != "127.0.0.1:61002" { + t.Fatalf("expected external wsrpc url to be rewritten to 127.0.0.1:61002, got %s", output.ExternalWSRPCUrl) + } + if !strings.HasSuffix(output.InternalWSRPCUrl, ":61002") { + t.Fatalf("expected internal wsrpc url to use tunneled port 61002, got %s", output.InternalWSRPCUrl) + } +} diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go new file mode 100644 index 00000000000..9ab8148823e --- /dev/null +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -0,0 +1,146 @@ +package environment + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + pkgerrors "github.com/pkg/errors" + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +type RemoteStopSummary struct { + Requested int + Stopped int + Missing int + Failed int +} + +// StopRemoteComponents sends StopComponent operations for all remote-targeted components. 
+// It is idempotent from the caller perspective; missing components are treated as success. +func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config.Config) (RemoteStopSummary, error) { + summary := RemoteStopSummary{} + if cfg == nil { + return summary, errors.New("config is nil") + } + summary.Requested = countRemoteStopTargets(cfg) + if summary.Requested == 0 { + return summary, nil + } + + tunnelManager, err := newEC2TunnelManager(lggr) + if err != nil { + return summary, pkgerrors.Wrap(err, "failed to initialize tunnel manager for remote stop") + } + defer func() { _ = tunnelManager.Stop(ctx) }() + + startClient, err := newStartComponentClient(lggr, tunnelManager) + if err != nil { + return summary, pkgerrors.Wrap(err, "failed to initialize remote component client for stop") + } + + var joined error + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain == nil || configuredBlockchain.Target != config.TargetRemote { + continue + } + payload := startComponentRequest{ + ComponentType: componentTypeBlockchain, + Blockchain: configuredBlockchain.InputRef(), + ReusePolicy: string(configuredBlockchain.RemoteStartPolicy), + } + result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeBlockchain) + if err != nil { + summary.Failed++ + joined = errors.Join(joined, err) + continue + } + if result.Stopped { + summary.Stopped++ + } else if !result.Found { + summary.Missing++ + } + } + + if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { + payload := startComponentRequest{ + ComponentType: componentTypeJD, + JD: cfg.JD.InputRef(), + ReusePolicy: string(cfg.JD.RemoteStartPolicy), + } + result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeJD) + if err != nil { + summary.Failed++ + joined = errors.Join(joined, err) + return summary, joined + } + if result.Stopped { + summary.Stopped++ + } else if !result.Found { + summary.Missing++ + } + } + + return summary, joined 
+} + +func countRemoteStopTargets(cfg *config.Config) int { + if cfg == nil { + return 0 + } + count := 0 + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && configuredBlockchain.Target == config.TargetRemote { + count++ + } + } + if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { + count++ + } + return count +} + +func stopRemoteComponent( + ctx context.Context, + lggr zerolog.Logger, + client componentClient, + payload startComponentRequest, + expectedType string, +) (*startComponentResult, error) { + payloadBytes, err := json.Marshal(payload) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to encode stop payload for component type %s", payload.ComponentType) + } + + response, err := client.StartComponent(ctx, startComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStopComponent, + Payload: payloadBytes, + }) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to stop remote component type %s", payload.ComponentType) + } + if response.ComponentType != expectedType { + return nil, fmt.Errorf("unexpected component type in stop response: %s", response.ComponentType) + } + + lggr.Info(). + Str("componentType", response.ComponentType). + Bool("found", response.Found). + Bool("stopped", response.Stopped). 
+ Msg("Processed remote component stop") + + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + lggr.Info().Msgf("[agent] %s", pretty) + } + + return response, nil +} diff --git a/system-tests/lib/cre/environment/tunnel/provider_ssm.go b/system-tests/lib/cre/environment/tunnel/provider_ssm.go index 9d06d5e7285..d8a7207418b 100644 --- a/system-tests/lib/cre/environment/tunnel/provider_ssm.go +++ b/system-tests/lib/cre/environment/tunnel/provider_ssm.go @@ -7,6 +7,7 @@ import ( "net" "os" "os/exec" + "strings" "sync" "syscall" "time" @@ -37,20 +38,28 @@ func (p *SSMProvider) Name() string { } func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) { + profile, authMode := resolveAWSProfileSelection() + if err := validateAWSSession(ctx, p.region, profile, authMode); err != nil { + return TunnelBinding{}, err + } + localPort, err := reserveLocalPort() if err != nil { return TunnelBinding{}, fmt.Errorf("failed to reserve local port: %w", err) } - cmd := exec.Command( - "aws", + args := []string{ "ssm", "start-session", "--region", p.region, "--target", p.instanceID, "--document-name", "AWS-StartPortForwardingSession", "--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", ref.Port, localPort), - ) + } + if profile != "" { + args = append(args, "--profile", profile) + } + cmd := exec.Command("aws", args...) // Start in a dedicated process group so cleanup can kill aws + session-manager-plugin together. cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if p.logger.GetLevel() <= zerolog.DebugLevel { @@ -64,6 +73,8 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, p.logger.Info(). Str("componentID", ref.ComponentID). Str("endpointName", ref.EndpointName). + Str("awsAuthMode", authMode). + Str("awsProfile", profile). Int("remotePort", ref.Port). Int("localPort", localPort). 
Msg("Opening SSM endpoint tunnel") @@ -92,6 +103,54 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, }, nil } +func resolveAWSProfileSelection() (string, string) { + if hasStaticAWSKeys() { + return "", "env-creds" + } + + if profile := strings.TrimSpace(os.Getenv("CRE_AWS_PROFILE")); profile != "" { + return profile, "profile:CRE_AWS_PROFILE" + } + if profile := strings.TrimSpace(os.Getenv("AWS_PROFILE")); profile != "" { + return profile, "profile:AWS_PROFILE" + } + + return "", "default-profile" +} + +func hasStaticAWSKeys() bool { + accessKeyID := strings.TrimSpace(os.Getenv("AWS_ACCESS_KEY_ID")) + secretAccessKey := strings.TrimSpace(os.Getenv("AWS_SECRET_ACCESS_KEY")) + return accessKeyID != "" && secretAccessKey != "" +} + +func validateAWSSession(ctx context.Context, region, profile, authMode string) error { + if ctx == nil { + ctx = context.Background() + } + preflightCtx, cancel := context.WithTimeout(ctx, 8*time.Second) + defer cancel() + + args := []string{"sts", "get-caller-identity", "--region", region} + if profile != "" { + args = append(args, "--profile", profile) + } + out, err := exec.CommandContext(preflightCtx, "aws", args...).CombinedOutput() + if err == nil { + return nil + } + + loginHint := "Verify AWS credentials are configured and valid." + if profile != "" { + loginHint = fmt.Sprintf("Run `aws sso login --profile %s` (or configure profile credentials) and retry.", profile) + } + trimmed := strings.TrimSpace(string(out)) + if trimmed == "" { + return fmt.Errorf("aws authentication check failed for SSM tunnel (mode=%s): %w. %s", authMode, err, loginHint) + } + return fmt.Errorf("aws authentication check failed for SSM tunnel (mode=%s): %w: %s. 
%s", authMode, err, trimmed, loginHint) +} + func (p *SSMProvider) Close(_ context.Context, binding TunnelBinding) error { p.mu.Lock() cmd, ok := p.sessions[binding.LocalPort] diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index 8a04a2e7ccb..70bd24015aa 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -11,11 +11,14 @@ replace github.com/smartcontractkit/chainlink/v2 => ../../ replace github.com/smartcontractkit/chainlink/deployment => ../../deployment +replace github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework + require ( dario.cat/mergo v1.0.2 github.com/Masterminds/semver/v3 v3.4.0 github.com/alitto/pond/v2 v2.5.0 github.com/andybalholm/brotli v1.2.0 + github.com/avast/retry-go/v4 v4.6.1 github.com/cockroachdb/errors v1.11.3 github.com/cosmos/gogoproto v1.7.0 github.com/docker/docker v28.5.1+incompatible @@ -100,7 +103,6 @@ require ( github.com/aptos-labs/aptos-go-sdk v1.11.0 // indirect github.com/atombender/go-jsonschema v0.16.1-0.20240916205339-a74cd4e2851c // indirect github.com/avast/retry-go v3.0.0+incompatible // indirect - github.com/avast/retry-go/v4 v4.6.1 // indirect github.com/awalterschulze/gographviz v2.0.3+incompatible // indirect github.com/aws/aws-sdk-go v1.55.7 // indirect github.com/aws/aws-sdk-go-v2 v1.41.1 // indirect diff --git a/system-tests/lib/go.sum b/system-tests/lib/go.sum index 99dc51b7130..45beae4b082 100644 --- a/system-tests/lib/go.sum +++ b/system-tests/lib/go.sum @@ -643,6 +643,10 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.10.1 h1:T0ujvqyCSqRopADpgPgiTT63DUQVSfojyME59Ei63pQ= github.com/gin-gonic/gin v1.10.1/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= 
+github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-snaps v0.5.19 h1:hUJlCQOpTt1M+kSisMwioDWZDWpDtdAvUhvWCx1YGW0= +github.com/gkampitakis/go-snaps v0.5.19/go.mod h1:gC3YqxQTPyIXvQrw/Vpt3a8VqR1MO8sVpZFWN4DGwNs= github.com/go-asn1-ber/asn1-ber v1.5.5 h1:MNHlNMBDgEKD4TcKr36vQN68BA00aDfjIt3/bD50WnA= github.com/go-asn1-ber/asn1-ber v1.5.5/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE= @@ -1177,6 +1181,8 @@ github.com/manyminds/api2go v0.0.0-20171030193247-e7b693844a6f h1:tVvGiZQFjOXP+9 github.com/manyminds/api2go v0.0.0-20171030193247-e7b693844a6f/go.mod h1:Z60vy0EZVSu0bOugCHdcN5ZxFMKSpjRgsnh0XKPFqqk= github.com/marcboeker/go-duckdb v1.8.5 h1:tkYp+TANippy0DaIOP5OEfBEwbUINqiFqgwMQ44jME0= github.com/marcboeker/go-duckdb v1.8.5/go.mod h1:6mK7+WQE4P4u5AFLvVBmhFxY5fvhymFptghgJX6B+/8= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= @@ -1509,8 +1515,8 @@ github.com/secure-systems-lab/go-securesystemslib v0.6.0 h1:T65atpAVCJQK14UA57LM github.com/secure-systems-lab/go-securesystemslib v0.6.0/go.mod h1:8Mtpo9JKks/qhPG4HGZ2LGMvrPbzuxwfz/f/zLfEWkk= github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c= github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= 
+github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b h1:h+3JX2VoWTFuyQEo87pStk/a99dzIO1mM9KxIyLPGTU= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= @@ -1636,8 +1642,6 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9/go.mod h1:KpEWZJMLwbdMHeHQz9rbkES0vRrx4nk6OQXyhlHb9/8= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260124000807-bff5e296dfb7 h1:nC/FJN5iwh/zD5u8R6qwhkx60c/83E9f6EnRonr/RG8= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260124000807-bff5e296dfb7/go.mod h1:FbqbTFP9aBvE/2GDmfcFr/03HEWkzjP7OMmxdib26aY= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1 h1:JijOMT/94w/mt2q69vBQodliDlVfe+jqeaSTQJP3uxo= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1/go.mod h1:IQC7fXKDsFjD1vb0Jh83WWY4BCFhN1fkcn+z3oSuFIA= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.15 h1:usf6YCNmSO8R1/rU28wUfIdp7zXlqGGOAttXW5mgkXU= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.15/go.mod h1:YqrpawYGRkT/jcvXcmaZeZPOtu0erIenrHl5Mb8+U/c= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= diff --git a/system-tests/tests/go.mod b/system-tests/tests/go.mod index e3a80c33b61..6e7f7b0c751 100644 --- a/system-tests/tests/go.mod +++ b/system-tests/tests/go.mod @@ -39,6 +39,8 @@ replace 
github.com/smartcontractkit/chainlink/system-tests/tests/regression/cre/ replace github.com/smartcontractkit/chainlink/system-tests/tests/smoke/cre/solana/solwrite => ./smoke/cre/solana/solwrite +replace github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework + require ( github.com/Masterminds/semver/v3 v3.4.0 github.com/avast/retry-go/v4 v4.6.1 diff --git a/system-tests/tests/go.sum b/system-tests/tests/go.sum index a80046d8044..e52afa70163 100644 --- a/system-tests/tests/go.sum +++ b/system-tests/tests/go.sum @@ -709,6 +709,10 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.10.1 h1:T0ujvqyCSqRopADpgPgiTT63DUQVSfojyME59Ei63pQ= github.com/gin-gonic/gin v1.10.1/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-snaps v0.5.19 h1:hUJlCQOpTt1M+kSisMwioDWZDWpDtdAvUhvWCx1YGW0= +github.com/gkampitakis/go-snaps v0.5.19/go.mod h1:gC3YqxQTPyIXvQrw/Vpt3a8VqR1MO8sVpZFWN4DGwNs= github.com/go-asn1-ber/asn1-ber v1.5.5 h1:MNHlNMBDgEKD4TcKr36vQN68BA00aDfjIt3/bD50WnA= github.com/go-asn1-ber/asn1-ber v1.5.5/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE= @@ -1334,6 +1338,8 @@ github.com/manyminds/api2go v0.0.0-20171030193247-e7b693844a6f h1:tVvGiZQFjOXP+9 github.com/manyminds/api2go v0.0.0-20171030193247-e7b693844a6f/go.mod h1:Z60vy0EZVSu0bOugCHdcN5ZxFMKSpjRgsnh0XKPFqqk= github.com/marcboeker/go-duckdb v1.8.5 h1:tkYp+TANippy0DaIOP5OEfBEwbUINqiFqgwMQ44jME0= github.com/marcboeker/go-duckdb v1.8.5/go.mod h1:6mK7+WQE4P4u5AFLvVBmhFxY5fvhymFptghgJX6B+/8= 
+github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= @@ -1718,8 +1724,8 @@ github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= github.com/sercand/kuberesolver/v6 v6.0.0 h1:ScvS2Ga9snVkpOahln/BCLySr3/iBAHJf25u66DweZ0= github.com/sercand/kuberesolver/v6 v6.0.0/go.mod h1:Dxkqms3OJadP5zirIBPLi9FV8Qpys3T3w40XPEcVsu0= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= -github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= +github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= +github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b h1:h+3JX2VoWTFuyQEo87pStk/a99dzIO1mM9KxIyLPGTU= github.com/serialx/hashring v0.0.0-20200727003509-22c0c7ab6b1b/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= github.com/sethvargo/go-retry v0.3.0 h1:EEt31A35QhrcRZtrYFDTBg91cqZVnFL2navjDrah2SE= @@ -1845,8 +1851,6 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260205175622-33e65031f9a9/go.mod h1:KpEWZJMLwbdMHeHQz9rbkES0vRrx4nk6OQXyhlHb9/8= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260124000807-bff5e296dfb7 h1:nC/FJN5iwh/zD5u8R6qwhkx60c/83E9f6EnRonr/RG8= github.com/smartcontractkit/chainlink-sui/deployment 
v0.0.0-20260124000807-bff5e296dfb7/go.mod h1:FbqbTFP9aBvE/2GDmfcFr/03HEWkzjP7OMmxdib26aY= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1 h1:JijOMT/94w/mt2q69vBQodliDlVfe+jqeaSTQJP3uxo= -github.com/smartcontractkit/chainlink-testing-framework/framework v0.13.14-0.20260202230832-eb33f42188d1/go.mod h1:IQC7fXKDsFjD1vb0Jh83WWY4BCFhN1fkcn+z3oSuFIA= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.18 h1:1ng+p/+85zcVLHB050PiWUAjOcxyd4KjwkUlJy34rgE= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.18/go.mod h1:2+OrSz56pdgtY0Oc20nCS9LH/bEksFDBQjoR82De5PI= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= diff --git a/system-tests/tests/load/cre/workflow_don_load_test.go b/system-tests/tests/load/cre/workflow_don_load_test.go index ab53166bc84..74b455fd4eb 100644 --- a/system-tests/tests/load/cre/workflow_don_load_test.go +++ b/system-tests/tests/load/cre/workflow_don_load_test.go @@ -141,7 +141,7 @@ func setupLoadTestEnvironment( NodeSets: mustSetCapabilitiesFn(in.NodeSets), CapabilitiesContractFactoryFunctions: capabilityFactoryFns, Blockchains: blockchains, - JdInput: in.JD, + JdInput: &envconfig.JobDistributor{Input: *in.JD}, Provider: *in.Infra, JobSpecFactoryFunctions: jobSpecFactoryFns, ContractVersions: cretypes.NewContractVersionsProvider(envconfig.DefaultContractSet(false)).ContractVersions(), diff --git a/system-tests/tests/load/cre/writer_don_load_test.go b/system-tests/tests/load/cre/writer_don_load_test.go index c97cbf1deec..74699c208cd 100644 --- a/system-tests/tests/load/cre/writer_don_load_test.go +++ b/system-tests/tests/load/cre/writer_don_load_test.go @@ -97,7 +97,7 @@ func setupLoadTestWriterEnvironment( NodeSets: mustSetCapabilitiesFn(in.NodeSets), CapabilitiesContractFactoryFunctions: capabilityFactoryFns, 
Blockchains: blockchains, - JdInput: in.JD, + JdInput: &creenvconfig.JobDistributor{Input: *in.JD}, Provider: *in.Infra, JobSpecFactoryFunctions: jobSpecFactoryFns, BlockchainDeployers: blockchain_sets.NewDeployerSet(testLogger, in.Infra), From 23790bf99144c6cd855cd9b36d1b1efa37ad4324 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Thu, 19 Feb 2026 08:48:41 +0100 Subject: [PATCH 05/34] add support for remote nodesets --- .../configs/workflow-gateway-don-remote.toml | 9 +- .../environment/environment/environment.go | 45 ++- .../environment/environment/remote_state.go | 30 +- .../lib/cre/environment/agent/deploy.go | 16 ++ .../lib/cre/environment/agent/server.go | 43 ++- .../lib/cre/environment/agent/server_test.go | 16 ++ .../lib/cre/environment/blockchain_start.go | 25 +- .../cre/environment/blockchain_start_test.go | 1 + .../lib/cre/environment/config/config.go | 29 ++ system-tests/lib/cre/environment/dons.go | 271 +++++++++++++++++- system-tests/lib/cre/environment/dons_test.go | 62 ++++ .../lib/cre/environment/environment.go | 41 ++- system-tests/lib/cre/environment/jobs.go | 20 +- system-tests/lib/cre/environment/jobs_test.go | 2 +- .../lib/cre/environment/remote_stop.go | 29 ++ system-tests/lib/cre/types.go | 3 + system-tests/lib/go.mod | 2 +- 17 files changed, 610 insertions(+), 34 deletions(-) create mode 100644 system-tests/lib/cre/environment/dons_test.go diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml index 09f5bd76c8a..f482477aa42 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml @@ -3,7 +3,7 @@ type = "anvil" chain_id = "1337" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - #target = "remote" + target = "remote" [[blockchains]] type = "anvil" @@ -11,6 +11,7 @@ port = "8546" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] target = 
"remote" + container_name = "anvil-2337" remote_start_policy = "always" [jd] @@ -42,6 +43,7 @@ don_types = ["workflow"] override_mode = "all" http_port_range_start = 10100 + target = "remote" env_vars = { CL_EVM_CMD = "" } capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] @@ -56,7 +58,7 @@ #docker_ctx = "../../../.." #docker_file = "core/chainlink.Dockerfile" #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-tmp:latest" + image = "chainlink-amd:latest" user_config_overrides = "" [[nodesets]] @@ -65,6 +67,7 @@ don_types = ["bootstrap", "gateway"] override_mode = "each" http_port_range_start = 10300 + target = "remote" env_vars = { CL_EVM_CMD = "" } supported_evm_chains = [1337, 2337] @@ -79,7 +82,7 @@ #ocker_ctx = "../../../.." #docker_file = "core/chainlink.Dockerfile" #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-tmp:latest" + image = "chainlink-amd:latest" # 5002 is the web API capabilities port for incoming requests # 15002 is the vault port for incoming requests custom_ports = ["5002:5002","15002:15002"] diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 33ee8ca1257..eba8d99ac9a 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -707,6 +707,7 @@ go run . env stop-remote --dry-run framework.L.Info(). Int("total", remoteConfiguredSummary.Total). Int("blockchains", remoteConfiguredSummary.Blockchains). + Int("nodesets", remoteConfiguredSummary.NodeSets). Int("jd", remoteConfiguredSummary.JD). 
Msg("Dry-run: remote components that would be stopped") return nil @@ -756,9 +757,11 @@ func loadRemoteStopTargets(relativePathToRepoRoot string) (remoteComponentSummar } func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targets *envconfig.Config) error { - remoteState, loadErr := loadRemoteStopState(relativePathToRepoRoot) - if loadErr == nil { - applyRemoteAgentEnvFallback(framework.L, remoteState) + agentState, agentLoadErr := loadRemoteAgentState(relativePathToRepoRoot) + if agentLoadErr != nil { + framework.L.Warn().Err(agentLoadErr).Msgf("failed to load remote agent state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } else if agentState != nil { + applyRemoteAgentEnvFallback(framework.L, &remoteStopState{Agent: *agentState}) } summary, stopRemoteErr := creenv.StopRemoteComponents(ctx, framework.L, targets) @@ -774,6 +777,14 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") } + if !hasLocalComponents(targets) { + statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + if err := os.Remove(statePath); err == nil { + framework.L.Info().Msgf("removed local CRE state file after remote-only stop: %s", statePath) + } else if err != nil && !os.IsNotExist(err) { + framework.L.Warn().Err(err).Msgf("failed to remove local CRE state file after remote-only stop: %s", statePath) + } + } return nil } @@ -825,6 +836,7 @@ func stopLocalResources(relativePathToRepoRoot string, removeAllState bool) erro type remoteComponentSummary struct { Total int Blockchains int + NodeSets int JD int } @@ -838,13 +850,38 @@ func summarizeRemoteComponents(cfg *envconfig.Config) remoteComponentSummary { summary.Blockchains++ } } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == 
string(envconfig.TargetRemote) { + summary.NodeSets++ + } + } if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { summary.JD = 1 } - summary.Total = summary.Blockchains + summary.JD + summary.Total = summary.Blockchains + summary.NodeSets + summary.JD return summary } +func hasLocalComponents(cfg *envconfig.Config) bool { + if cfg == nil { + return false + } + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && configuredBlockchain.Target != envconfig.TargetRemote { + return true + } + } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) != string(envconfig.TargetRemote) { + return true + } + } + if cfg.JD != nil && cfg.JD.Target != envconfig.TargetRemote { + return true + } + return false +} + func applyRemoteAgentEnvFallback(logger zerolog.Logger, state *remoteStopState) { if state == nil { return diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 2bbdb5fe943..40f98171151 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -8,6 +8,7 @@ import ( "github.com/pelletier/go-toml/v2" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) @@ -17,8 +18,9 @@ const ( ) type remoteStopState struct { - Version int `toml:"version"` + Version int `toml:"version"` Blockchains []*envconfig.Blockchain `toml:"blockchains"` + NodeSets []*cre.NodeSet `toml:"nodesets"` JD *envconfig.JobDistributor `toml:"jd"` Agent remoteAgentState `toml:"agent"` } @@ -32,6 +34,10 @@ type remoteAgentState struct { AWSProfile string `toml:"aws_profile,omitempty"` } +type remoteAgentStateEnvelope struct { + Agent remoteAgentState `toml:"agent"` +} + func remoteStateFileAbsPath(relativePathToRepoRoot string) string { absPath, err := 
filepath.Abs(filepath.Join(relativePathToRepoRoot, remoteStateDirname, remoteStateFilename)) if err != nil { @@ -60,15 +66,31 @@ func loadRemoteStopState(relativePathToRepoRoot string) (*remoteStopState, error if state.Blockchains == nil { state.Blockchains = []*envconfig.Blockchain{} } + if state.NodeSets == nil { + state.NodeSets = []*cre.NodeSet{} + } return state, nil } +func loadRemoteAgentState(relativePathToRepoRoot string) (*remoteAgentState, error) { + data, err := os.ReadFile(remoteStateFileAbsPath(relativePathToRepoRoot)) + if err != nil { + return nil, err + } + envelope := &remoteAgentStateEnvelope{} + if err := toml.Unmarshal(data, envelope); err != nil { + return nil, err + } + return &envelope.Agent, nil +} + func (s *remoteStopState) Config() *envconfig.Config { if s == nil { return nil } return &envconfig.Config{ Blockchains: s.Blockchains, + NodeSets: s.NodeSets, JD: s.JD, } } @@ -80,6 +102,7 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) state := &remoteStopState{ Version: 1, Blockchains: []*envconfig.Blockchain{}, + NodeSets: []*cre.NodeSet{}, Agent: remoteAgentState{ Mode: os.Getenv("CRE_AGENT_MODE"), LocalURL: os.Getenv("CRE_LOCAL_AGENT_URL"), @@ -94,6 +117,11 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) state.Blockchains = append(state.Blockchains, configuredBlockchain) } } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + state.NodeSets = append(state.NodeSets, nodeSet) + } + } if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { state.JD = cfg.JD } diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/agent/deploy.go index 12e4a2d445c..07857b1ea1e 100644 --- a/system-tests/lib/cre/environment/agent/deploy.go +++ b/system-tests/lib/cre/environment/agent/deploy.go @@ -10,6 +10,7 @@ import ( 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) @@ -71,6 +72,21 @@ func DeployJDComponent(ctx context.Context, input *jd.Input) (*jd.Output, error) return output, nil } +func DeployNodeSetComponent(ctx context.Context, input *ns.Input, registryChain *blockchain.Output) (*ns.Output, error) { + if input == nil { + return nil, pkgerrors.New("nodeset input is nil") + } + if registryChain == nil { + return nil, pkgerrors.New("registry blockchain output is nil") + } + inputCopy := *input + output, err := ns.NewSharedDBNodeSetWithContext(ctx, &inputCopy, registryChain) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to deploy nodeset %s", inputCopy.Name) + } + return output, nil +} + func buildRemoteJDInput(input *jd.Input) (*jd.Input, error) { jdInput := *input // Remote agent deployments require Docker service discovery (jd -> jd-db), diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index e47093587f5..c2d87555bff 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -25,6 +25,7 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) @@ -35,6 +36,7 @@ const ( OperationHealth = "Health" ComponentTypeBlockchain = "blockchain" ComponentTypeJD = "jd" + 
ComponentTypeNodeSet = "nodeset" ErrCodeMethodNotAllowed = "method_not_allowed" ErrCodeInvalidRequestBody = "invalid_request_body" @@ -48,6 +50,8 @@ const ( RemoteStartPolicyAlways = "always" RemoteStartPolicyReuseIdentical = "reuse_if_identical" + + EnvKeepFailedContainers = "CRE_AGENT_KEEP_FAILED_CONTAINERS" ) var frameworkLogCaptureMu sync.Mutex @@ -59,10 +63,12 @@ type StartComponentEnvelope struct { } type StartComponentPayload struct { - ComponentType string `json:"componentType"` - Blockchain *blockchain.Input `json:"blockchain"` - JD *jd.Input `json:"jd"` - ReusePolicy string `json:"reusePolicy,omitempty"` + ComponentType string `json:"componentType"` + Blockchain *blockchain.Input `json:"blockchain"` + RegistryBlockchain map[string]any `json:"registryBlockchain,omitempty"` + JD *jd.Input `json:"jd"` + NodeSet *ns.Input `json:"nodeset,omitempty"` + ReusePolicy string `json:"reusePolicy,omitempty"` } type StartComponentResponse struct { @@ -136,7 +142,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid payload: %v", err), nil) return } - if payload.ComponentType != ComponentTypeBlockchain && payload.ComponentType != ComponentTypeJD { + if payload.ComponentType != ComponentTypeBlockchain && payload.ComponentType != ComponentTypeJD && payload.ComponentType != ComponentTypeNodeSet { s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedComponent, fmt.Sprintf("unsupported component type: %s", payload.ComponentType), nil) return } @@ -192,6 +198,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { agentLogs = append(agentLogs, preStartLogs...) 
var blockchainOutput *blockchain.Output var jdOutput *jd.Output + var nodeSetOutput *ns.Output trackedContainers, startErr := s.discoverOwnedContainers(r.Context(), func() error { capturedFrameworkLogs, runErr := captureFrameworkLogs(func() error { switch payload.ComponentType { @@ -207,6 +214,16 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { return err } jdOutput = deployed + case ComponentTypeNodeSet: + registryOutput, err := DecodeFromTransport[blockchain.Output](payload.RegistryBlockchain) + if err != nil { + return fmt.Errorf("failed to decode registry blockchain payload for nodeset: %w", err) + } + deployed, err := DeployNodeSetComponent(r.Context(), payload.NodeSet, registryOutput) + if err != nil { + return err + } + nodeSetOutput = deployed } return nil }) @@ -215,13 +232,15 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { }) if startErr != nil { - if len(trackedContainers) > 0 { + if len(trackedContainers) > 0 && shouldCleanupFailedContainers() { cleanupErr := stopContainers(r.Context(), trackedContainers) if cleanupErr != nil { agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] failed startup cleanup for %d tracked container(s): %v", len(trackedContainers), cleanupErr)) } else { agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] cleaned up %d tracked container(s) after failed startup", len(trackedContainers))) } + } else if len(trackedContainers) > 0 { + agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] preserving %d tracked container(s) after failed startup because %s is enabled", len(trackedContainers), EnvKeepFailedContainers)) } s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, startErr.Error(), agentLogs) return @@ -233,6 +252,8 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { output, encErr = EncodeForTransport(blockchainOutput) } else if jdOutput != nil { output, encErr = EncodeForTransport(jdOutput) + } else if nodeSetOutput != 
nil { + output, encErr = EncodeForTransport(nodeSetOutput) } if encErr != nil { s.respondError(w, http.StatusInternalServerError, ErrCodeTransportEncodeFailed, encErr.Error(), agentLogs) @@ -521,6 +542,11 @@ func hashPayload(payload []byte) string { return hex.EncodeToString(sum[:]) } +func shouldCleanupFailedContainers() bool { + raw := strings.TrimSpace(strings.ToLower(os.Getenv(EnvKeepFailedContainers))) + return raw == "" || (raw != "1" && raw != "true" && raw != "yes" && raw != "on") +} + func componentCacheKey(payload StartComponentPayload) (string, error) { switch payload.ComponentType { case ComponentTypeBlockchain: @@ -533,6 +559,11 @@ func componentCacheKey(payload StartComponentPayload) (string, error) { return "", fmt.Errorf("jd payload is required") } return fmt.Sprintf("%s:%s", payload.ComponentType, payload.JD.Image), nil + case ComponentTypeNodeSet: + if payload.NodeSet == nil { + return "", fmt.Errorf("nodeset payload is required") + } + return fmt.Sprintf("%s:%s", payload.ComponentType, payload.NodeSet.Name), nil default: return "", fmt.Errorf("unsupported component type: %s", payload.ComponentType) } diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/agent/server_test.go index 21855dc3a5d..347b5204263 100644 --- a/system-tests/lib/cre/environment/agent/server_test.go +++ b/system-tests/lib/cre/environment/agent/server_test.go @@ -220,3 +220,19 @@ func TestStopComponentIdempotent(t *testing.T) { t.Fatalf("expected second stop to be no-op, got found=%v stopped=%v", stopResp2.Found, stopResp2.Stopped) } } + +func TestShouldCleanupFailedContainersDefaultsToTrue(t *testing.T) { + t.Setenv(EnvKeepFailedContainers, "") + if !shouldCleanupFailedContainers() { + t.Fatal("expected cleanup to be enabled by default") + } +} + +func TestShouldCleanupFailedContainersCanBeDisabled(t *testing.T) { + for _, value := range []string{"1", "true", "yes", "on", "TRUE"} { + t.Setenv(EnvKeepFailedContainers, value) + if 
shouldCleanupFailedContainers() { + t.Fatalf("expected cleanup to be disabled for value %q", value) + } + } +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 2c30a3ddc62..f8543e3fc94 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -23,6 +23,7 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/adapters" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" @@ -34,6 +35,7 @@ import ( const ( componentTypeBlockchain = "blockchain" componentTypeJD = "jd" + componentTypeNodeSet = "nodeset" envLocalAgentURL = "CRE_LOCAL_AGENT_URL" envEC2AgentURL = "CRE_EC2_AGENT_URL" envEC2InstanceID = "CRE_EC2_INSTANCE_ID" @@ -50,10 +52,12 @@ type startComponentEnvelope struct { } type startComponentRequest struct { - ComponentType string `json:"componentType"` - Blockchain *blockchain.Input `json:"blockchain"` - JD *jd.Input `json:"jd"` - ReusePolicy string `json:"reusePolicy,omitempty"` + ComponentType string `json:"componentType"` + Blockchain *blockchain.Input `json:"blockchain"` + RegistryBlockchain map[string]any `json:"registryBlockchain,omitempty"` + JD *jd.Input `json:"jd"` + NodeSet *ns.Input `json:"nodeset,omitempty"` + ReusePolicy string `json:"reusePolicy,omitempty"` } type startComponentResult struct { @@ -231,7 +235,8 @@ func isRetriableNetworkError(err error) bool { } func newStartComponentClient(testLogger zerolog.Logger, tunnelManager 
tunnel.Manager) (componentClient, error) { - if os.Getenv(envAgentMode) == "ec2" { + agentMode := strings.TrimSpace(os.Getenv(envAgentMode)) + if strings.EqualFold(agentMode, "ec2") { baseURL, err := resolveEC2AgentBaseURL(testLogger, tunnelManager) if err != nil { return nil, err @@ -351,6 +356,7 @@ func startBlockchainsWithTargets( configuredBlockchains []*config.Blockchain, deployers map[blockchain.ChainFamily]blockchains.Deployer, tunnelManager tunnel.Manager, + rewriteInternalForLocalNodes bool, ) (*blockchains.DeployedBlockchains, error) { blockchainInputs, err := config.ResolveBlockchainInputs(configuredBlockchains) if err != nil { @@ -431,7 +437,7 @@ func startBlockchainsWithTargets( return nil, pkgerrors.Wrap(err, "failed to decode blockchain transport payload") } - if err := rewriteRemoteBlockchainOutputForLocalAccess(ctx, testLogger, tunnelManager, idx, input, blockchainOutput); err != nil { + if err := rewriteRemoteBlockchainOutputForLocalAccess(ctx, testLogger, tunnelManager, idx, input, blockchainOutput, rewriteInternalForLocalNodes); err != nil { return nil, err } @@ -482,6 +488,7 @@ func rewriteRemoteBlockchainOutputForLocalAccess( configuredIndex int, input *blockchain.Input, output *blockchain.Output, + rewriteInternalForLocalNodes bool, ) error { if output == nil { return nil @@ -511,8 +518,10 @@ func rewriteRemoteBlockchainOutputForLocalAccess( if err := adapter.RewriteWithBindings(output, bindings); err != nil { return pkgerrors.Wrap(err, "failed to rewrite blockchain output with local tunnel bindings") } - if err := rewriteBlockchainInternalURLsForLocalNodes(output); err != nil { - return pkgerrors.Wrap(err, "failed to rewrite blockchain internal urls for local node containers") + if rewriteInternalForLocalNodes { + if err := rewriteBlockchainInternalURLsForLocalNodes(output); err != nil { + return pkgerrors.Wrap(err, "failed to rewrite blockchain internal urls for local node containers") + } } return nil diff --git 
a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index e8358bcfdf4..b724cc298b7 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -151,6 +151,7 @@ func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { 0, &blockchain.Input{Type: blockchain.TypeAnvil}, out, + true, ); err != nil { t.Fatalf("expected rewrite helper to succeed: %v", err) } diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index e4e337e0b7a..d6775a72fbb 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -211,6 +211,10 @@ func (c *Config) Validate(envDependencies cre.CLIEnvironmentDependencies) error } for _, nodeSet := range c.NodeSets { + normalizeNodeSetPlacement(nodeSet) + if err := validateNodeSetPlacement(nodeSet); err != nil { + return err + } for _, capability := range nodeSet.Capabilities { capability = removeChainIDFromFlag(capability) if !slices.Contains(envDependencies.SupportedCapabilityFlags(), capability) { @@ -226,6 +230,31 @@ func (c *Config) Validate(envDependencies cre.CLIEnvironmentDependencies) error return nil } +func normalizeNodeSetPlacement(nodeSet *cre.NodeSet) { + if nodeSet == nil { + return + } + if strings.TrimSpace(nodeSet.Target) == "" { + nodeSet.Target = string(TargetDocker) + } + if strings.TrimSpace(nodeSet.RemoteStartPolicy) == "" { + nodeSet.RemoteStartPolicy = string(RemoteStartPolicyReuseIfIdentical) + } +} + +func validateNodeSetPlacement(nodeSet *cre.NodeSet) error { + if nodeSet == nil { + return errors.New("nodeset is nil") + } + if nodeSet.Target != string(TargetDocker) && nodeSet.Target != string(TargetRemote) { + return fmt.Errorf("invalid nodeset target: %s", nodeSet.Target) + } + if nodeSet.RemoteStartPolicy != string(RemoteStartPolicyReuseIfIdentical) 
&& nodeSet.RemoteStartPolicy != string(RemoteStartPolicyAlways) { + return fmt.Errorf("invalid nodeset remote_start_policy: %s", nodeSet.RemoteStartPolicy) + } + return nil +} + func removeChainIDFromFlag(flag string) string { lastIdx := strings.LastIndex(flag, "-") if lastIdx == -1 { diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 417d0403052..1a4ddc5e057 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -2,7 +2,11 @@ package environment import ( "context" + "encoding/json" "fmt" + "net/url" + "strconv" + "strings" "sync" "time" @@ -13,12 +17,16 @@ import ( chainselectors "github.com/smartcontractkit/chain-selectors" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" crecapabilities "github.com/smartcontractkit/chainlink/system-tests/lib/cre/capabilities" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -55,6 +63,7 @@ func StartDONs( capabilityConfigs cre.CapabilityConfigs, copyCapabilityBinaries bool, nodeSets []*cre.NodeSet, + tunnelManager tunnel.Manager, ) (*StartedDONs, error) { if infraInput.IsKubernetes() { // For Kubernetes, DONs are already running in the cluster, 
generate service URLs @@ -74,7 +83,8 @@ func StartDONs( } // Skip binary operations for Kubernetes (binaries are in the cluster images) - if infraInput.IsDocker() { + if infraInput.IsDocker() && !hasRemoteNodeSets(nodeSets) { + // TODO in the future check here if don is remote and skip if it is instead of !hasRemoteNodeSets() for donIdx, donMetadata := range topology.DonsMetadata.List() { if !copyCapabilityBinaries { continue @@ -121,6 +131,14 @@ func StartDONs( errGroup, _ := errgroup.WithContext(ctx) var resultMap sync.Map + var startClient componentClient + if hasRemoteNodeSets(nodeSets) { + client, clientErr := newStartComponentClient(lggr, tunnelManager) + if clientErr != nil { + return nil, clientErr + } + startClient = client + } for idx, nodeSet := range nodeSets { errGroup.Go(func() error { @@ -134,6 +152,50 @@ func StartDONs( if nodeSet.Out != nil { lggr.Info().Msgf("Using pre-configured node URLs for DON %s", nodeSet.Name) nodeset = nodeSet.Out + } else if strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) + if err != nil { + return pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") + } + remoteInput, err := buildRemoteNodeSetInput(nodeSet) + if err != nil { + return err + } + payload := startComponentRequest{ + ComponentType: componentTypeNodeSet, + NodeSet: remoteInput, + RegistryBlockchain: registryChainPayload, + ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), + } + payloadBytes, err := json.Marshal(payload) + if err != nil { + return pkgerrors.Wrap(err, "failed to encode nodeset payload") + } + response, err := startClient.StartComponent(ctx, startComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: payloadBytes, + }) + if err != nil { + return err + } + if response.ComponentType != componentTypeNodeSet { + return fmt.Errorf("unexpected component 
type in start response: %s", response.ComponentType) + } + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + lggr.Info().Msgf("[agent] %s", pretty) + } + nodeset, err = agent.DecodeFromTransport[ns.Output](response.Output) + if err != nil { + return pkgerrors.Wrap(err, "failed to decode nodeset transport payload") + } + if err := rewriteRemoteNodeSetOutputForLocalAccess(ctx, lggr, tunnelManager, idx, nodeSet, nodeset); err != nil { + return err + } } else { // For Docker, start the nodes nodeSet.Input.NodeSpecs = nodeSet.ExtractCTFInputs() @@ -181,6 +243,213 @@ func StartDONs( return &startedDONs, nil } +func hasRemoteNodeSets(nodeSets []*cre.NodeSet) bool { + for _, nodeSet := range nodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + return true + } + } + return false +} + +func nodeSetRemoteStartPolicy(nodeSet *cre.NodeSet) string { + if nodeSet == nil || strings.TrimSpace(nodeSet.RemoteStartPolicy) == "" { + return string(config.RemoteStartPolicyReuseIfIdentical) + } + return nodeSet.RemoteStartPolicy +} + +func buildRemoteNodeSetInput(nodeSet *cre.NodeSet) (*ns.Input, error) { + if nodeSet == nil || nodeSet.Input == nil { + return nil, pkgerrors.New("nodeset input is nil for remote target") + } + inputCopy := *nodeSet.Input + inputCopy.NodeSpecs = nodeSet.ExtractCTFInputs() + if err := validateRemoteNodeSetNodeSpecs(inputCopy.Name, inputCopy.NodeSpecs); err != nil { + return nil, err + } + return &inputCopy, nil +} + +func validateRemoteNodeSetNodeSpecs(nodeSetName string, specs []*clnode.Input) error { + for idx, spec := range specs { + if spec == nil || spec.Node == nil { + return fmt.Errorf("remote nodeset %q node_specs[%d] is nil", nodeSetName, idx) + } + hasImage := strings.TrimSpace(spec.Node.Image) != "" + hasBuildConfig := strings.TrimSpace(spec.Node.DockerContext) != "" || + strings.TrimSpace(spec.Node.DockerFilePath) != "" || + 
len(spec.Node.DockerBuildArgs) > 0 + if hasImage && hasBuildConfig { + return fmt.Errorf( + "remote nodeset %q node_specs[%d] must configure either node.image or docker build fields (docker_ctx/docker_file/docker_build_args), not both", + nodeSetName, + idx, + ) + } + if !hasImage && !hasBuildConfig { + return fmt.Errorf( + "remote nodeset %q node_specs[%d] must set node.image or docker build fields (docker_ctx/docker_file/docker_build_args)", + nodeSetName, + idx, + ) + } + } + return nil +} + +func rewriteRemoteNodeSetOutputForLocalAccess( + ctx context.Context, + lggr zerolog.Logger, + tunnelManager tunnel.Manager, + configuredIndex int, + nodeSet *cre.NodeSet, + output *ns.Output, +) error { + if output == nil && (nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0) { + return nil + } + componentID := tunnel.CanonicalComponentID(tunnel.KindNodeSet, configuredIndex, nodeSet.Name) + refs, err := describeNodeSetEndpoints(componentID, nodeSet, output) + if err != nil { + return pkgerrors.Wrap(err, "failed to describe nodeset tunnel endpoints") + } + bindings, err := tunnelManager.Start(ctx, refs) + if err != nil { + return pkgerrors.Wrap(err, "failed to start tunnels for nodeset output") + } + for _, binding := range bindings { + lggr.Info(). + Str("componentID", binding.ComponentID). + Str("endpointName", binding.EndpointName). + Str("originalURL", binding.OriginalURL). + Str("localURL", binding.LocalURL). 
+ Msg("Established endpoint tunnel") + } + return rewriteNodeSetWithBindings(output, nodeSet, bindings) +} + +const nodeSetDBEndpointName = "nodeset-db" + +func describeNodeSetEndpoints(componentID string, nodeSet *cre.NodeSet, output *ns.Output) ([]tunnel.EndpointRef, error) { + sizeHint := 1 + if output != nil { + sizeHint += len(output.CLNodes) + } + refs := make([]tunnel.EndpointRef, 0, sizeHint) + if output != nil { + for idx := range output.CLNodes { + endpointName := fmt.Sprintf("node-%d-api", idx) + rawURL := output.CLNodes[idx].Node.ExternalURL + ref, err := nodeSetEndpointFromURL(componentID, endpointName, rawURL) + if err != nil { + return nil, err + } + if ref != nil { + refs = append(refs, *ref) + } + } + } + dbRef, err := nodeSetDBEndpointRef(componentID, nodeSet) + if err != nil { + return nil, err + } + if dbRef != nil { + refs = append(refs, *dbRef) + } + return refs, nil +} + +func nodeSetDBEndpointRef(componentID string, nodeSet *cre.NodeSet) (*tunnel.EndpointRef, error) { + if nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0 { + return nil, nil + } + if nodeSet.DbInput.Port < 0 || nodeSet.DbInput.Port > 65535 { + return nil, fmt.Errorf("nodeset db port %d is invalid", nodeSet.DbInput.Port) + } + return &tunnel.EndpointRef{ + ComponentID: componentID, + EndpointName: nodeSetDBEndpointName, + Scheme: "tcp", + Host: "127.0.0.1", + Port: nodeSet.DbInput.Port, + OriginalURL: fmt.Sprintf("tcp://127.0.0.1:%d", nodeSet.DbInput.Port), + }, nil +} + +func rewriteNodeSetWithBindings(output *ns.Output, nodeSet *cre.NodeSet, bindings []tunnel.TunnelBinding) error { + byName := make(map[string]tunnel.TunnelBinding, len(bindings)) + for _, binding := range bindings { + byName[binding.EndpointName] = binding + } + if output != nil { + for idx := range output.CLNodes { + endpointName := fmt.Sprintf("node-%d-api", idx) + rawURL := output.CLNodes[idx].Node.ExternalURL + if rawURL == "" { + continue + } + binding, ok := byName[endpointName] + 
if !ok { + return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", endpointName) + } + output.CLNodes[idx].Node.ExternalURL = binding.LocalURL + } + } + if nodeSet != nil && nodeSet.DbInput != nil && nodeSet.DbInput.Port != 0 { + binding, ok := byName[nodeSetDBEndpointName] + if !ok { + return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", nodeSetDBEndpointName) + } + nodeSet.DbInput.Port = binding.LocalPort + } + return nil +} + +func nodeSetEndpointFromURL(componentID, endpointName, rawURL string) (*tunnel.EndpointRef, error) { + if strings.TrimSpace(rawURL) == "" { + return nil, nil + } + parsed, err := url.Parse(rawURL) + if err != nil { + return nil, fmt.Errorf("failed to parse endpoint url %q: %w", rawURL, err) + } + host := parsed.Hostname() + if host == "" { + return nil, fmt.Errorf("endpoint url %q has empty hostname", rawURL) + } + port, err := nodeSetResolveURLPort(parsed) + if err != nil { + return nil, err + } + return &tunnel.EndpointRef{ + ComponentID: componentID, + EndpointName: endpointName, + Scheme: parsed.Scheme, + Host: host, + Port: port, + OriginalURL: rawURL, + }, nil +} + +func nodeSetResolveURLPort(parsed *url.URL) (int, error) { + if parsed.Port() != "" { + port, err := strconv.Atoi(parsed.Port()) + if err != nil || port <= 0 || port > 65535 { + return 0, fmt.Errorf("url %q has invalid port %q", parsed.String(), parsed.Port()) + } + return port, nil + } + switch parsed.Scheme { + case "http", "ws": + return 80, nil + case "https", "wss": + return 443, nil + default: + return 0, fmt.Errorf("url %q has unsupported scheme %q without explicit port", parsed.String(), parsed.Scheme) + } +} + func FundNodes(ctx context.Context, testLogger zerolog.Logger, dons *cre.Dons, blockchains []blockchains.Blockchain, fundingAmountPerChainFamily map[string]uint64) error { for _, don := range dons.List() { testLogger.Info().Msgf("Funding nodes for DON %s", don.Name) diff --git a/system-tests/lib/cre/environment/dons_test.go 
b/system-tests/lib/cre/environment/dons_test.go new file mode 100644 index 00000000000..026a36c3b1a --- /dev/null +++ b/system-tests/lib/cre/environment/dons_test.go @@ -0,0 +1,62 @@ +package environment + +import ( + "strings" + "testing" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" +) + +func TestBuildRemoteNodeSetInputRequiresImageOrBuildFields(t *testing.T) { + nodeSet := &cre.NodeSet{ + Input: &simple_node_set.Input{ + Name: "remote-don", + }, + NodeSpecs: []*cre.NodeSpecWithRole{ + { + Input: &clnode.Input{ + Node: &clnode.NodeInput{ + Image: "", + }, + }, + }, + }, + } + + _, err := buildRemoteNodeSetInput(nodeSet) + if err == nil { + t.Fatal("expected missing image/build validation error") + } + if !strings.Contains(err.Error(), "must set node.image or docker build fields") { + t.Fatalf("expected image validation error, got: %v", err) + } +} + +func TestBuildRemoteNodeSetInputRejectsImageAndBuildFieldsTogether(t *testing.T) { + nodeSet := &cre.NodeSet{ + Input: &simple_node_set.Input{ + Name: "remote-don", + }, + NodeSpecs: []*cre.NodeSpecWithRole{ + { + Input: &clnode.Input{ + Node: &clnode.NodeInput{ + Image: "repo/chainlink:tag", + DockerContext: "../../../..", + DockerFilePath: "core/chainlink.Dockerfile", + }, + }, + }, + }, + } + + _, err := buildRemoteNodeSetInput(nodeSet) + if err == nil { + t.Fatal("expected image+build conflict validation error") + } + if !strings.Contains(err.Error(), "either node.image or docker build fields") { + t.Fatalf("expected image/build conflict error, got: %v", err) + } +} diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index d3b502b11a6..8ca31c3af80 100644 --- a/system-tests/lib/cre/environment/environment.go +++ 
b/system-tests/lib/cre/environment/environment.go @@ -6,6 +6,7 @@ import ( "fmt" "maps" "os" + "strings" "sync" "github.com/Masterminds/semver/v3" @@ -153,6 +154,10 @@ func SetupTestEnvironment( if err := input.Validate(); err != nil { return nil, pkgerrors.Wrap(err, "input validation failed") } + nodeSetPlacement, err := summarizeNodeSetPlacement(input.NodeSets) + if err != nil { + return nil, pkgerrors.Wrap(err, "nodeset placement validation failed") + } s3Output, s3Err := workflow.StartS3(testLogger, input.S3ProviderInput, input.StageGen) if s3Err != nil { @@ -172,6 +177,7 @@ func SetupTestEnvironment( input.Blockchains, input.BlockchainDeployers, tunnelManager, + nodeSetPlacement.HasLocalTargets, ) if startErr != nil { return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") @@ -264,7 +270,7 @@ func SetupTestEnvironment( jdStartedFuture := queue.SubmitAny(func(ctx context.Context) (any, error) { // TODO: pass context after we update the CTF to accept context, when creating new JD instance - jdOutput, startJDErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager) + jdOutput, startJDErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager, nodeSetPlacement.HasLocalTargets) if startJDErr != nil { return nil, pkgerrors.Wrap(startJDErr, "failed to start Job Distributor") } @@ -272,7 +278,7 @@ func SetupTestEnvironment( }) donsStartedFuture := queue.SubmitAny(func(ctx context.Context) (any, error) { - nodeSetOutput, startDonsErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets) + nodeSetOutput, startDonsErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, tunnelManager) if startDonsErr != nil { return nil, pkgerrors.Wrap(startDonsErr, "failed to start DONs") } @@ 
-514,6 +520,37 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, input.JdInput.Out = jdOutput } +type nodeSetPlacementSummary struct { + HasLocalTargets bool + HasRemoteTargets bool +} + +func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummary, error) { + summary := &nodeSetPlacementSummary{} + for _, nodeSet := range nodeSets { + if nodeSet == nil { + continue + } + configTarget := strings.TrimSpace(nodeSet.Target) + if configTarget == "" || configTarget == string(config.TargetDocker) { + summary.HasLocalTargets = true + continue + } + if configTarget == string(config.TargetRemote) { + summary.HasRemoteTargets = true + continue + } + return nil, fmt.Errorf("invalid nodeset target: %s", nodeSet.Target) + } + + // Mixed local and remote nodeset targets need per-DON node-facing URL config selection. + // Current PrepareNodeTOMLs builds one node-facing URL shape, so keep this unsupported for now. + if summary.HasLocalTargets && summary.HasRemoteTargets { + return nil, errors.New("mixed nodeset targets are not supported yet; set all nodesets target=docker or all target=remote") + } + return summary, nil +} + func newCldfEnvironment(ctx context.Context, singleFileLogger logger.Logger, cldfBlockchains cldf_chain.BlockChains) *cldf.Environment { allChainsCLDEnvironment := &cldf.Environment{ diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index 461100402d5..b15517df44c 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -68,6 +68,7 @@ func StartJD( jdConfig *config.JobDistributor, infraInput infra.Provider, tunnelManager tunnel.Manager, + rewriteInternalForLocalNodes bool, ) (*StartedJD, error) { startTime := time.Now() lggr.Info().Msg("Starting Job Distributor") @@ -114,7 +115,7 @@ func StartJD( if err != nil { return nil, pkgerrors.Wrap(err, "failed to decode jd transport payload") } - if err := 
rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput); err != nil { + if err := rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput, rewriteInternalForLocalNodes); err != nil { return nil, err } } else if infraInput.IsKubernetes() { @@ -172,6 +173,7 @@ func rewriteRemoteJDOutputForLocalAccess( lggr zerolog.Logger, tunnelManager tunnel.Manager, output *jd.Output, + rewriteInternalForLocalNodes bool, ) error { if output == nil { return nil @@ -196,7 +198,7 @@ func rewriteRemoteJDOutputForLocalAccess( Str("localURL", binding.LocalURL). Msg("Established endpoint tunnel") } - return rewriteJDWithBindings(output, bindings) + return rewriteJDWithBindings(output, bindings, rewriteInternalForLocalNodes) } func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { @@ -222,7 +224,7 @@ func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { return refs, nil } -func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) error { +func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding, rewriteInternalForLocalNodes bool) error { byName := make(map[string]tunnel.TunnelBinding, len(bindings)) for _, binding := range bindings { byName[binding.EndpointName] = binding @@ -233,9 +235,11 @@ func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) e if !ok { return fmt.Errorf("missing tunnel binding for jd grpc endpoint") } - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") output.ExternalGRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) - output.InternalGRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) + if rewriteInternalForLocalNodes { + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + output.InternalGRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) + } } if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl 
!= "" { @@ -243,9 +247,11 @@ func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) e if !ok { return fmt.Errorf("missing tunnel binding for jd wsrpc endpoint") } - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - output.InternalWSRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) output.ExternalWSRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) + if rewriteInternalForLocalNodes { + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + output.InternalWSRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) + } } return nil diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go index 4c3f416dc3a..d2d84fa529d 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -55,7 +55,7 @@ func TestRewriteJDWithBindingsRewritesNodeFacingWSRPC(t *testing.T) { }, } - if err := rewriteJDWithBindings(output, bindings); err != nil { + if err := rewriteJDWithBindings(output, bindings, true); err != nil { t.Fatalf("rewriteJDWithBindings returned error: %v", err) } diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index 9ab8148823e..4a36dc3f6cc 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -5,10 +5,12 @@ import ( "encoding/json" "errors" "fmt" + "strings" pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) @@ -66,6 +68,28 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
} } + for _, nodeSet := range cfg.NodeSets { + if nodeSet == nil || strings.TrimSpace(nodeSet.Target) != string(config.TargetRemote) { + continue + } + payload := startComponentRequest{ + ComponentType: componentTypeNodeSet, + NodeSet: &simple_node_set.Input{Name: nodeSet.Name}, + ReusePolicy: nodeSet.RemoteStartPolicy, + } + result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeNodeSet) + if err != nil { + summary.Failed++ + joined = errors.Join(joined, err) + continue + } + if result.Stopped { + summary.Stopped++ + } else if !result.Found { + summary.Missing++ + } + } + if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { payload := startComponentRequest{ ComponentType: componentTypeJD, @@ -98,6 +122,11 @@ func countRemoteStopTargets(cfg *config.Config) int { count++ } } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + count++ + } + } if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { count++ } diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index 8853bfb967d..f149320246e 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -1184,6 +1184,9 @@ type NodeSpecWithRole struct { type NodeSet struct { *ns.Input + Target string `toml:"target"` // docker (default) or remote + RemoteStartPolicy string `toml:"remote_start_policy"` // reuse_if_identical (default) or always + // Our role-aware node specs (shadows ns.Input.NodeSpecs) NodeSpecs []*NodeSpecWithRole `toml:"node_specs" validate:"required"` diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index 70bd24015aa..4161c743426 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -20,6 +20,7 @@ require ( github.com/andybalholm/brotli v1.2.0 github.com/avast/retry-go/v4 v4.6.1 github.com/cockroachdb/errors v1.11.3 + github.com/containerd/errdefs v1.0.0 github.com/cosmos/gogoproto v1.7.0 github.com/docker/docker 
v28.5.1+incompatible github.com/ethereum/go-ethereum v1.16.8 @@ -170,7 +171,6 @@ require ( github.com/containerd/containerd/api v1.9.0 // indirect github.com/containerd/containerd/v2 v2.1.5 // indirect github.com/containerd/continuity v0.4.5 // indirect - github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/containerd/platforms v1.0.0-rc.1 // indirect From 23f96ebb79d8789a1b7703361fc65dbb68416e6c Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Thu, 19 Feb 2026 18:40:17 +0100 Subject: [PATCH 06/34] add support for sending artifacts to remote dons --- .../cre/environment/environment/workflow.go | 95 ++++++++++-- .../lib/cre/environment/agent/server.go | 141 ++++++++++++++---- .../lib/cre/environment/artifacts_remote.go | 86 +++++++++++ .../lib/cre/environment/blockchain_start.go | 65 +++----- system-tests/lib/cre/environment/jobs.go | 4 +- .../lib/cre/environment/remote_stop.go | 14 +- .../lib/cre/internal/dockerops/files.go | 97 ++++++++++++ .../lib/cre/workflow/deploy_artifacts.go | 42 ++++++ system-tests/lib/cre/workflow/docker.go | 113 +++----------- .../tests/smoke/cre/v2_grpc_source_test.go | 32 +++- system-tests/tests/test-helpers/t_helpers.go | 49 +++++- 11 files changed, 552 insertions(+), 186 deletions(-) create mode 100644 system-tests/lib/cre/environment/artifacts_remote.go create mode 100644 system-tests/lib/cre/internal/dockerops/files.go create mode 100644 system-tests/lib/cre/workflow/deploy_artifacts.go diff --git a/core/scripts/cre/environment/environment/workflow.go b/core/scripts/cre/environment/environment/workflow.go index aaaf7d5f88b..50f54bba72f 100644 --- a/core/scripts/cre/environment/environment/workflow.go +++ b/core/scripts/cre/environment/environment/workflow.go @@ -18,12 +18,15 @@ import ( "github.com/smartcontractkit/chainlink-deployments-framework/datastore" "github.com/smartcontractkit/chainlink-deployments-framework/deployment" 
+ "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink-testing-framework/seth" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" ) @@ -113,6 +116,7 @@ func deployWorkflowCmd() *cobra.Command { compileWorkflowFlag bool containerTargetDirFlag string containerNamePatternFlag string + nodeSetNameFlag string workflowNameFlag string workflowOwnerAddressFlag string workflowRegistryAddressFlag string @@ -188,7 +192,7 @@ func deployWorkflowCmd() *cobra.Command { capabilitiesRegistryVersion = addrRef.Version } - regErr = deployWorkflow(cmd.Context(), workflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, deleteWorkflowFileFlag) + regErr = deployWorkflow(cmd.Context(), workflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, nodeSetNameFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, deleteWorkflowFileFlag) return regErr }, @@ -200,6 +204,7 @@ func deployWorkflowCmd() *cobra.Command { 
cmd.Flags().StringVarP(&secretsOutputFilePathFlag, "secrets-output-file-path", "o", "", "Path to encrypted secrets output file (default \"./encrypted.secrets.json\")") cmd.Flags().StringVarP(&containerTargetDirFlag, "container-target-dir", "t", creworkflow.DefaultWorkflowTargetDir, "Path to the target directory in the Docker container") cmd.Flags().StringVarP(&containerNamePatternFlag, "container-name-pattern", "p", creworkflow.DefaultWorkflowNodePattern, "Pattern to match Docker containers workkflow DON containers (e.g. 'workflow-node')") + cmd.Flags().StringVar(&nodeSetNameFlag, "nodeset-name", "", "NodeSet name for remote artifact deployment (optional; auto-detected if omitted)") cmd.Flags().StringVarP(&rpcURLFlag, "rpc-url", "r", "http://localhost:8545", "RPC URL") cmd.Flags().StringVarP(&workflowOwnerAddressFlag, "workflow-owner-address", "d", DefaultWorkflowOwnerAddress, "Workflow owner address") cmd.Flags().StringVarP(&workflowRegistryAddressFlag, "workflow-registry-address", "a", "", "Workflow registry address (if not provided, address from the state file will be used)") @@ -383,14 +388,43 @@ func compileWorkflow(ctx context.Context, workflowFilePathFlag, workflowNameFlag func deployWorkflow( ctx context.Context, - wasmWorkflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag string, + wasmWorkflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, nodeSetNameFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag string, workflowRegistryVersion, capabilitiesRegistryVersion *semver.Version, donIDFlag uint32, deleteWorkflowFile bool, ) error { - copyErr := creworkflow.CopyArtifactsToDockerContainers(containerTargetDirFlag, 
containerNamePatternFlag, wasmWorkflowFilePathFlag) + mode, resolvedNodeSetName, modeErr := resolveWorkflowArtifactDeployModeFromState(containerNamePatternFlag, nodeSetNameFlag) + if modeErr != nil { + return modeErr + } + var remoteTunnelManager tunnel.Manager + if mode == creworkflow.ArtifactDeployModeRemote { + manager, err := environment.NewEC2TunnelManager(framework.L) + if err != nil { + return errors.Wrap(err, "failed to initialize tunnel manager for remote workflow artifact deploy") + } + remoteTunnelManager = manager + defer func() { _ = remoteTunnelManager.Stop(ctx) }() + } + deployArtifacts := func(files ...string) error { + return creworkflow.DeployArtifacts( + ctx, + creworkflow.DeployArtifactsOptions{ + Mode: mode, + NodeSetName: resolvedNodeSetName, + ContainerNamePattern: containerNamePatternFlag, + ContainerTargetDir: containerTargetDirFlag, + Files: files, + RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { + return environment.DeployArtifactsToRemoteNodeSet(ctx, framework.L, remoteTunnelManager, nodeSetName, containerTargetDir, files) + }, + }, + ) + } + + copyErr := deployArtifacts(wasmWorkflowFilePathFlag) if copyErr != nil { - return errors.Wrap(copyErr, "❌ failed to copy workflow to Docker container") + return errors.Wrap(copyErr, "❌ failed to deploy workflow artifact") } fmt.Printf("\n✅ Workflow copied to Docker containers\n") @@ -417,9 +451,9 @@ func deployWorkflow( return errors.Wrap(configPathAbsErr, "failed to get absolute path of the config file") } - configCopyErr := creworkflow.CopyArtifactsToDockerContainers(containerTargetDirFlag, containerNamePatternFlag, configFilePathFlag) + configCopyErr := deployArtifacts(configFilePathFlag) if configCopyErr != nil { - return errors.Wrap(configCopyErr, "❌ failed to copy config file to Docker container") + return errors.Wrap(configCopyErr, "❌ failed to deploy config artifact") } configPathAbs = "file://" + configPathAbs @@ -444,9 +478,9 @@ 
func deployWorkflow( fmt.Printf("\n✅ Encrypted workflow secrets file created at: %s\n\n", secretPathAbs) fmt.Printf("\n⚙️ Copying encrypted secrets file to Docker container\n") - secretsCopyErr := creworkflow.CopyArtifactsToDockerContainers(containerTargetDirFlag, containerNamePatternFlag, secretPathAbs) + secretsCopyErr := deployArtifacts(secretPathAbs) if secretsCopyErr != nil { - return errors.Wrap(secretsCopyErr, "❌ failed to copy encrypted secrets file to Docker container") + return errors.Wrap(secretsCopyErr, "❌ failed to deploy encrypted secrets artifact") } secretPathAbs = "file://" + secretPathAbs @@ -497,7 +531,7 @@ func compileCopyAndRegisterWorkflow(ctx context.Context, workflowFilePathFlag, w return errors.Wrap(compileErr, "❌ failed to compile workflow") } - return deployWorkflow(ctx, compressedWorkflowWasmPath, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, true) + return deployWorkflow(ctx, compressedWorkflowWasmPath, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, "", containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, true) } func isBase64File(filename string) error { @@ -542,6 +576,49 @@ func isBase64Content(content string) bool { return err == nil } +func resolveWorkflowArtifactDeployModeFromState(containerNamePattern, nodeSetName string) (creworkflow.ArtifactDeployMode, string, error) { + cfg := &envconfig.Config{} + if err := cfg.Load(envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot)); err != nil { + if nodeSetName != "" { + return "", "", errors.Wrap(err, "failed to load local CRE state for remote 
artifact deployment") + } + return creworkflow.ArtifactDeployModeLocal, "", nil + } + + if nodeSetName != "" { + for _, cfgNodeSet := range cfg.NodeSets { + if cfgNodeSet == nil || cfgNodeSet.Name != nodeSetName { + continue + } + if cfgNodeSet.Target == string(envconfig.TargetRemote) { + return creworkflow.ArtifactDeployModeRemote, nodeSetName, nil + } + return creworkflow.ArtifactDeployModeLocal, nodeSetName, nil + } + return "", "", fmt.Errorf("nodeset %q not found in local CRE state", nodeSetName) + } + + matches := make([]string, 0) + for _, cfgNodeSet := range cfg.NodeSets { + if cfgNodeSet == nil || cfgNodeSet.Target != string(envconfig.TargetRemote) { + continue + } + prefix := ns.NodeNamePrefix(cfgNodeSet.Name) + if strings.Contains(prefix, containerNamePattern) || strings.Contains(containerNamePattern, prefix) { + matches = append(matches, cfgNodeSet.Name) + } + } + + switch len(matches) { + case 0: + return creworkflow.ArtifactDeployModeLocal, "", nil + case 1: + return creworkflow.ArtifactDeployModeRemote, matches[0], nil + default: + return "", "", fmt.Errorf("container pattern %q matches multiple remote nodesets %v; specify --nodeset-name", containerNamePattern, matches) + } +} + func addressRefFromStateFile(contractType deployment.ContractType) (*datastore.AddressRef, error) { in := &envconfig.Config{} err := in.Load(envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot)) diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index c2d87555bff..eee0f683d24 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -4,12 +4,14 @@ import ( "bytes" "context" "crypto/sha256" - "encoding/json" + "encoding/base64" "encoding/hex" + "encoding/json" "fmt" "io" "net/http" "os" + "path/filepath" "slices" "strings" "sync" @@ -27,28 +29,30 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ns 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/internal/dockerops" ) const ( SchemaVersionV1 = "v1" OperationStartComponent = "StartComponent" OperationStopComponent = "StopComponent" + OperationDeployArtifacts = "DeployArtifacts" OperationHealth = "Health" ComponentTypeBlockchain = "blockchain" ComponentTypeJD = "jd" ComponentTypeNodeSet = "nodeset" - ErrCodeMethodNotAllowed = "method_not_allowed" - ErrCodeInvalidRequestBody = "invalid_request_body" - ErrCodeUnsupportedSchema = "unsupported_schema_version" - ErrCodeUnsupportedOperation = "unsupported_operation" - ErrCodeInvalidPayload = "invalid_payload" - ErrCodeUnsupportedComponent = "unsupported_component_type" - ErrCodeMissingComponentInput = "missing_component_input" - ErrCodeDeployFailed = "deployment_failed" - ErrCodeTransportEncodeFailed = "transport_encode_failed" - - RemoteStartPolicyAlways = "always" + ErrCodeMethodNotAllowed = "method_not_allowed" + ErrCodeInvalidRequestBody = "invalid_request_body" + ErrCodeUnsupportedSchema = "unsupported_schema_version" + ErrCodeUnsupportedOperation = "unsupported_operation" + ErrCodeInvalidPayload = "invalid_payload" + ErrCodeUnsupportedComponent = "unsupported_component_type" + ErrCodeMissingComponentInput = "missing_component_input" + ErrCodeDeployFailed = "deployment_failed" + ErrCodeTransportEncodeFailed = "transport_encode_failed" + + RemoteStartPolicyAlways = "always" RemoteStartPolicyReuseIdentical = "reuse_if_identical" EnvKeepFailedContainers = "CRE_AGENT_KEEP_FAILED_CONTAINERS" @@ -63,12 +67,23 @@ type StartComponentEnvelope struct { } type StartComponentPayload struct { - ComponentType string `json:"componentType"` - Blockchain *blockchain.Input `json:"blockchain"` - RegistryBlockchain map[string]any `json:"registryBlockchain,omitempty"` - JD *jd.Input 
`json:"jd"` - NodeSet *ns.Input `json:"nodeset,omitempty"` - ReusePolicy string `json:"reusePolicy,omitempty"` + ComponentType string `json:"componentType"` + Blockchain *blockchain.Input `json:"blockchain"` + RegistryBlockchain map[string]any `json:"registryBlockchain,omitempty"` + JD *jd.Input `json:"jd"` + NodeSet *ns.Input `json:"nodeset,omitempty"` + ReusePolicy string `json:"reusePolicy,omitempty"` +} + +type DeployArtifactsPayload struct { + NodeSetName string `json:"nodeSetName"` + TargetDir string `json:"targetDir"` + Files []DeployArtifactsFile `json:"files"` +} + +type DeployArtifactsFile struct { + Name string `json:"name"` + ContentBase64 string `json:"contentBase64"` } type StartComponentResponse struct { @@ -76,18 +91,18 @@ type StartComponentResponse struct { Output map[string]any `json:"output,omitempty"` Found bool `json:"found,omitempty"` Stopped bool `json:"stopped,omitempty"` - AgentLogs []string `json:"agentLogs,omitempty"` - ErrorCode string `json:"errorCode,omitempty"` - Error string `json:"error,omitempty"` + AgentLogs []string `json:"agentLogs,omitempty"` + ErrorCode string `json:"errorCode,omitempty"` + Error string `json:"error,omitempty"` } type Server struct { - lggr zerolog.Logger - deployers map[blockchain.ChainFamily]blockchains.Deployer + lggr zerolog.Logger + deployers map[blockchain.ChainFamily]blockchains.Deployer lifecycleMu sync.Mutex - cacheMu sync.Mutex - cache map[string]cachedStart - runtime map[string]runtimeState + cacheMu sync.Mutex + cache map[string]cachedStart + runtime map[string]runtimeState } type cachedStart struct { @@ -137,6 +152,10 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedSchema, fmt.Sprintf("unsupported schema version: %s", envelope.SchemaVersion), nil) return } + if envelope.Operation == OperationDeployArtifacts { + s.deployArtifacts(w, r, envelope.Payload) + return + } var payload StartComponentPayload if err := 
json.Unmarshal(envelope.Payload, &payload); err != nil { s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid payload: %v", err), nil) @@ -187,7 +206,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { s.respondJSON(w, http.StatusOK, StartComponentResponse{ ComponentType: payload.ComponentType, Output: cached.Output, - AgentLogs: []string{requestLog, reuseLog}, + AgentLogs: []string{requestLog, reuseLog}, }) return } @@ -273,6 +292,74 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { }) } +func (s *Server) deployArtifacts(w http.ResponseWriter, r *http.Request, rawPayload json.RawMessage) { + var payload DeployArtifactsPayload + if err := json.Unmarshal(rawPayload, &payload); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid payload: %v", err), nil) + return + } + if strings.TrimSpace(payload.NodeSetName) == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "nodeset name is required", nil) + return + } + if strings.TrimSpace(payload.TargetDir) == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "target dir is required", nil) + return + } + if len(payload.Files) == 0 { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "at least one artifact file is required", nil) + return + } + + containerPrefix := ns.NodeNamePrefix(payload.NodeSetName) + containerNames, err := dockerops.FindContainerNames(r.Context(), containerPrefix) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to list nodeset containers: %v", err), nil) + return + } + if len(containerNames) == 0 { + s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, fmt.Sprintf("no nodeset containers found for pattern %s", containerPrefix), nil) + return + } + + tmpDir, err := os.MkdirTemp("", "cre-agent-artifacts") + if err != nil { + 
s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to create temp dir: %v", err), nil) + return + } + defer os.RemoveAll(tmpDir) + + filePaths := make([]string, 0, len(payload.Files)) + for idx, f := range payload.Files { + if strings.TrimSpace(f.Name) == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("artifact %d has empty name", idx), nil) + return + } + decoded, err := base64.StdEncoding.DecodeString(f.ContentBase64) + if err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("artifact %s has invalid base64 content: %v", f.Name, err), nil) + return + } + target := filepath.Join(tmpDir, filepath.Base(f.Name)) + if err := os.WriteFile(target, decoded, 0o600); err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to write artifact %s: %v", f.Name, err), nil) + return + } + filePaths = append(filePaths, target) + } + + if err := dockerops.CopyFilesToContainers(r.Context(), containerNames, payload.TargetDir, filePaths); err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to copy artifacts to containers: %v", err), nil) + return + } + + s.respondJSON(w, http.StatusOK, StartComponentResponse{ + AgentLogs: []string{ + fmt.Sprintf("[cre-agent] copied %d artifact(s) to %d container(s) for nodeset %s", len(filePaths), len(containerNames), payload.NodeSetName), + }, + }) +} + func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, componentType, componentKey string) { s.lifecycleMu.Lock() defer s.lifecycleMu.Unlock() diff --git a/system-tests/lib/cre/environment/artifacts_remote.go b/system-tests/lib/cre/environment/artifacts_remote.go new file mode 100644 index 00000000000..0bb388c678c --- /dev/null +++ b/system-tests/lib/cre/environment/artifacts_remote.go @@ -0,0 +1,86 @@ +package environment + +import ( + "context" + "encoding/base64" + 
"encoding/json" + "fmt" + "os" + "path/filepath" + + pkgerrors "github.com/pkg/errors" + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" +) + +func DeployArtifactsToRemoteNodeSet( + ctx context.Context, + lggr zerolog.Logger, + tunnelManager tunnel.Manager, + nodeSetName string, + containerTargetDir string, + files []string, +) error { + if nodeSetName == "" { + return fmt.Errorf("nodeset name is required") + } + if containerTargetDir == "" { + return fmt.Errorf("container target dir is required") + } + + if tunnelManager == nil { + return fmt.Errorf("tunnel manager is required for remote artifact deploy") + } + + startClient, err := newStartComponentClient(lggr, tunnelManager) + if err != nil { + return pkgerrors.Wrap(err, "failed to initialize remote component client for artifact deploy") + } + + payloadFiles := make([]agent.DeployArtifactsFile, 0, len(files)) + for _, path := range files { + if path == "" { + continue + } + data, readErr := os.ReadFile(path) + if readErr != nil { + return pkgerrors.Wrapf(readErr, "failed to read artifact file %s", path) + } + payloadFiles = append(payloadFiles, agent.DeployArtifactsFile{ + Name: filepath.Base(path), + ContentBase64: base64.StdEncoding.EncodeToString(data), + }) + } + if len(payloadFiles) == 0 { + return fmt.Errorf("no artifact files to deploy") + } + + payloadBytes, err := json.Marshal(agent.DeployArtifactsPayload{ + NodeSetName: nodeSetName, + TargetDir: containerTargetDir, + Files: payloadFiles, + }) + if err != nil { + return pkgerrors.Wrap(err, "failed to encode deploy artifacts payload") + } + + response, err := startClient.StartComponent(ctx, agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationDeployArtifacts, + Payload: payloadBytes, + }) + if err != nil { + return pkgerrors.Wrapf(err, "failed to deploy artifacts to 
remote nodeset %s", nodeSetName) + } + + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + lggr.Info().Msgf("[agent] %s", pretty) + } + return nil +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index f8543e3fc94..6500cdff703 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -3,8 +3,8 @@ package environment import ( "bytes" "context" - "errors" "encoding/json" + "errors" "fmt" "io" "net" @@ -22,10 +22,8 @@ import ( cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" - ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/adapters" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" @@ -45,46 +43,25 @@ const ( defaultEC2AgentPort = 8080 ) -type startComponentEnvelope struct { - SchemaVersion string `json:"schemaVersion"` - Operation string `json:"operation"` - Payload json.RawMessage `json:"payload"` -} - -type startComponentRequest struct { - ComponentType string `json:"componentType"` - Blockchain *blockchain.Input `json:"blockchain"` - RegistryBlockchain map[string]any 
`json:"registryBlockchain,omitempty"` - JD *jd.Input `json:"jd"` - NodeSet *ns.Input `json:"nodeset,omitempty"` - ReusePolicy string `json:"reusePolicy,omitempty"` -} - -type startComponentResult struct { - ComponentType string `json:"componentType"` - Output map[string]any `json:"output"` - Found bool `json:"found"` - Stopped bool `json:"stopped"` - AgentLogs []string `json:"agentLogs"` - ErrorCode string `json:"errorCode"` - Error string `json:"error"` -} +type startComponentEnvelope = agent.StartComponentEnvelope +type startComponentRequest = agent.StartComponentPayload +type startComponentResult = agent.StartComponentResponse type componentClient interface { - StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) + StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) } type httpComponentClient struct { - baseURL string - client *http.Client - maxAttempts int - retryDelay time.Duration - checkHealth bool + baseURL string + client *http.Client + maxAttempts int + retryDelay time.Duration + checkHealth bool } func newHTTPComponentClient(baseURL string) *httpComponentClient { return &httpComponentClient{ - baseURL: baseURL, + baseURL: baseURL, client: &http.Client{ Timeout: 4 * time.Minute, }, @@ -96,7 +73,7 @@ func newHTTPComponentClient(baseURL string) *httpComponentClient { func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { return &httpComponentClient{ - baseURL: baseURL, + baseURL: baseURL, client: &http.Client{ Timeout: 4 * time.Minute, }, @@ -106,14 +83,14 @@ func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { } } -func (c *httpComponentClient) StartComponent(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) { +func (c *httpComponentClient) StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { if c.checkHealth { if err := 
c.waitForHealth(ctx); err != nil { return nil, err } } - var result *startComponentResult + var result *agent.StartComponentResponse err := retry.Do( func() error { var err error @@ -132,7 +109,7 @@ func (c *httpComponentClient) StartComponent(ctx context.Context, envelope start return result, nil } -func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope startComponentEnvelope) (*startComponentResult, error) { +func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { body, err := json.Marshal(envelope) if err != nil { return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to encode start component envelope")) @@ -158,7 +135,7 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope s return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to read start component response")) } - var startResp startComponentResult + var startResp agent.StartComponentResponse if len(respBody) > 0 { if err := json.Unmarshal(respBody, &startResp); err != nil { return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start component response")) @@ -404,7 +381,7 @@ func startBlockchainsWithTargets( return nil, err } - payload := startComponentRequest{ + payload := agent.StartComponentPayload{ ComponentType: componentTypeBlockchain, Blockchain: input, ReusePolicy: string(configured.RemoteStartPolicy), @@ -414,7 +391,7 @@ func startBlockchainsWithTargets( return nil, pkgerrors.Wrap(err, "failed to encode blockchain payload") } - response, err := startClient.StartComponent(ctx, startComponentEnvelope{ + response, err := startClient.StartComponent(ctx, agent.StartComponentEnvelope{ SchemaVersion: agent.SchemaVersionV1, Operation: agent.OperationStartComponent, Payload: payloadBytes, @@ -481,6 +458,10 @@ func newEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { return 
tunnel.NewManager(tunnel.NewSSMProvider(instanceID, ec2Region, testLogger)), nil } +func NewEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { + return newEC2TunnelManager(testLogger) +} + func rewriteRemoteBlockchainOutputForLocalAccess( ctx context.Context, testLogger zerolog.Logger, diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index b15517df44c..6a5fc1082b9 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -84,7 +84,7 @@ func StartJD( if err != nil { return nil, err } - payload := startComponentRequest{ + payload := agent.StartComponentPayload{ ComponentType: componentTypeJD, JD: jdConfig.InputRef(), ReusePolicy: string(jdConfig.RemoteStartPolicy), @@ -93,7 +93,7 @@ func StartJD( if err != nil { return nil, pkgerrors.Wrap(err, "failed to encode jd payload") } - response, err := startClient.StartComponent(ctx, startComponentEnvelope{ + response, err := startClient.StartComponent(ctx, agent.StartComponentEnvelope{ SchemaVersion: agent.SchemaVersionV1, Operation: agent.OperationStartComponent, Payload: payloadBytes, diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index 4a36dc3f6cc..9c67d81dcc1 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -50,7 +50,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. if configuredBlockchain == nil || configuredBlockchain.Target != config.TargetRemote { continue } - payload := startComponentRequest{ + payload := agent.StartComponentPayload{ ComponentType: componentTypeBlockchain, Blockchain: configuredBlockchain.InputRef(), ReusePolicy: string(configuredBlockchain.RemoteStartPolicy), @@ -72,9 +72,9 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
if nodeSet == nil || strings.TrimSpace(nodeSet.Target) != string(config.TargetRemote) { continue } - payload := startComponentRequest{ + payload := agent.StartComponentPayload{ ComponentType: componentTypeNodeSet, - NodeSet: &simple_node_set.Input{Name: nodeSet.Name}, + NodeSet: &simple_node_set.Input{Name: nodeSet.Name}, ReusePolicy: nodeSet.RemoteStartPolicy, } result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeNodeSet) @@ -91,7 +91,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. } if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { - payload := startComponentRequest{ + payload := agent.StartComponentPayload{ ComponentType: componentTypeJD, JD: cfg.JD.InputRef(), ReusePolicy: string(cfg.JD.RemoteStartPolicy), @@ -137,15 +137,15 @@ func stopRemoteComponent( ctx context.Context, lggr zerolog.Logger, client componentClient, - payload startComponentRequest, + payload agent.StartComponentPayload, expectedType string, -) (*startComponentResult, error) { +) (*agent.StartComponentResponse, error) { payloadBytes, err := json.Marshal(payload) if err != nil { return nil, pkgerrors.Wrapf(err, "failed to encode stop payload for component type %s", payload.ComponentType) } - response, err := client.StartComponent(ctx, startComponentEnvelope{ + response, err := client.StartComponent(ctx, agent.StartComponentEnvelope{ SchemaVersion: agent.SchemaVersionV1, Operation: agent.OperationStopComponent, Payload: payloadBytes, diff --git a/system-tests/lib/cre/internal/dockerops/files.go b/system-tests/lib/cre/internal/dockerops/files.go new file mode 100644 index 00000000000..a8567004b53 --- /dev/null +++ b/system-tests/lib/cre/internal/dockerops/files.go @@ -0,0 +1,97 @@ +package dockerops + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + ctypes "github.com/docker/docker/api/types/container" + dc "github.com/docker/docker/client" + "github.com/pkg/errors" + 
"github.com/smartcontractkit/chainlink-testing-framework/framework" +) + +func FindContainerNames(ctx context.Context, pattern string) ([]string, error) { + dockerClient, dockerClientErr := dc.NewClientWithOpts(dc.FromEnv, dc.WithAPIVersionNegotiation()) + if dockerClientErr != nil { + return nil, errors.Wrap(dockerClientErr, "failed to create Docker client") + } + defer dockerClient.Close() + + containers, containersErr := dockerClient.ContainerList(ctx, ctypes.ListOptions{}) + if containersErr != nil { + return nil, errors.Wrap(containersErr, "failed to list Docker containers") + } + + containerNames := make([]string, 0) + for _, container := range containers { + for _, name := range container.Names { + if strings.Contains(name, pattern) { + containerNames = append(containerNames, strings.TrimPrefix(name, "/")) + } + } + } + + return containerNames, nil +} + +func CopyFilesToContainers(ctx context.Context, containerNames []string, targetDir string, files []string) error { + frameworkDockerClient, frameworkDockerClientErr := framework.NewDockerClient() + if frameworkDockerClientErr != nil { + return errors.Wrap(frameworkDockerClientErr, "failed to create framework Docker client") + } + + dockerClient, dockerClientErr := dc.NewClientWithOpts(dc.FromEnv, dc.WithAPIVersionNegotiation()) + if dockerClientErr != nil { + return errors.Wrap(dockerClientErr, "failed to create Docker client") + } + defer dockerClient.Close() + + for _, containerName := range containerNames { + execOutput, execOutputErr := frameworkDockerClient.ExecContainer(containerName, []string{"mkdir", "-p", targetDir}) + if execOutputErr != nil { + fmt.Fprint(os.Stderr, execOutput) + return errors.Wrap(execOutputErr, "failed to execute mkdir command in Docker container") + } + + for _, filePath := range files { + framework.L.Info().Msgf("Copying file '%s' to Docker container %s", filePath, containerName) + copyErr := frameworkDockerClient.CopyFile(containerName, filePath, targetDir) + if copyErr != 
nil { + fmt.Fprint(os.Stderr, execOutput) + return errors.Wrap(copyErr, "failed to copy artifact to Docker container") + } + } + + inspectCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + containerJSON, inspectErr := dockerClient.ContainerInspect(inspectCtx, containerName) + cancel() + if inspectErr != nil { + return errors.Wrap(inspectErr, "failed to inspect Docker container") + } + + user := containerJSON.Config.User + if user == "" { + continue + } + for _, filePath := range files { + targetFilePath := filepath.Join(targetDir, filepath.Base(filePath)) + execConfig := ctypes.ExecOptions{ + Cmd: []string{"chown", user, targetFilePath}, + AttachStdout: true, + AttachStderr: true, + User: "root", + } + execOutput, execOutputErr := frameworkDockerClient.ExecContainerOptions(containerName, execConfig) + if execOutputErr != nil { + fmt.Fprint(os.Stderr, execOutput) + return errors.Wrap(execOutputErr, "failed to execute chown command in Docker container") + } + } + } + + return nil +} diff --git a/system-tests/lib/cre/workflow/deploy_artifacts.go b/system-tests/lib/cre/workflow/deploy_artifacts.go new file mode 100644 index 00000000000..410b8fd7078 --- /dev/null +++ b/system-tests/lib/cre/workflow/deploy_artifacts.go @@ -0,0 +1,42 @@ +package workflow + +import ( + "context" + "fmt" +) + +type ArtifactDeployMode string + +const ( + ArtifactDeployModeLocal ArtifactDeployMode = "local" + ArtifactDeployModeRemote ArtifactDeployMode = "remote" +) + +type DeployArtifactsOptions struct { + Mode ArtifactDeployMode + NodeSetName string + ContainerNamePattern string + ContainerTargetDir string + Files []string + RemoteDeployer func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error +} + +func DeployArtifacts(ctx context.Context, opts DeployArtifactsOptions) error { + switch opts.Mode { + case ArtifactDeployModeRemote: + if opts.RemoteDeployer == nil { + return fmt.Errorf("remote artifact deployer is required for mode=%s", opts.Mode) + } + 
if opts.NodeSetName == "" { + return fmt.Errorf("nodeset name is required for mode=%s", opts.Mode) + } + return opts.RemoteDeployer(ctx, opts.NodeSetName, opts.ContainerTargetDir, opts.Files) + case ArtifactDeployModeLocal: + fallthrough + default: + if opts.ContainerNamePattern == "" { + return fmt.Errorf("container name pattern is required for mode=%s", opts.Mode) + } + return CopyArtifactsToDockerContainers(opts.ContainerTargetDir, opts.ContainerNamePattern, opts.Files...) + } +} diff --git a/system-tests/lib/cre/workflow/docker.go b/system-tests/lib/cre/workflow/docker.go index fa44dce4eeb..d1f03b8473b 100644 --- a/system-tests/lib/cre/workflow/docker.go +++ b/system-tests/lib/cre/workflow/docker.go @@ -3,16 +3,10 @@ package workflow import ( "context" "fmt" - "os" - "path/filepath" - "strings" - "time" - - ctypes "github.com/docker/docker/api/types/container" - dc "github.com/docker/docker/client" "github.com/pkg/errors" + "os" - "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/internal/dockerops" ) var ( @@ -21,105 +15,36 @@ var ( ) func findAllDockerContainerNames(pattern string) ([]string, error) { - dockerClient, dockerClientErr := dc.NewClientWithOpts(dc.FromEnv, dc.WithAPIVersionNegotiation()) - if dockerClientErr != nil { - return nil, errors.Wrap(dockerClientErr, "failed to create Docker client") - } - - containers, containersErr := dockerClient.ContainerList(context.Background(), ctypes.ListOptions{}) - if containersErr != nil { - return nil, errors.Wrap(containersErr, "failed to list Docker containers") - } - - containerNames := []string{} - for _, container := range containers { - for _, name := range container.Names { - if strings.Contains(name, pattern) { - // Remove leading slash from container name - cleanName := strings.TrimPrefix(name, "/") - containerNames = append(containerNames, cleanName) - } - } - } - - return containerNames, nil + return 
FindDockerContainerNames(context.Background(), pattern) } -func CopyArtifactsToDockerContainers(containerTargetDir string, containerNamePattern string, filesToCopy ...string) error { - for _, file := range filesToCopy { - if _, err := os.Stat(file); err != nil { - fmt.Fprintf(os.Stderr, "Warning: File '%s' does not exist. Skipping file copying to docker containers\n", file) - continue - } - - workflowCopyErr := copyArtifactToDockerContainers(file, containerNamePattern, containerTargetDir) - if workflowCopyErr != nil { - return errors.Wrapf(workflowCopyErr, "failed to copy a file (%s) to docker containers", file) - } - } - return nil +func FindDockerContainerNames(ctx context.Context, pattern string) ([]string, error) { + return dockerops.FindContainerNames(ctx, pattern) } -func copyArtifactToDockerContainers(filePath string, containerNamePattern string, targetDir string) error { - framework.L.Info().Msgf("Copying file '%s' to Docker containers", filePath) +func CopyArtifactsToDockerContainers(containerTargetDir string, containerNamePattern string, filesToCopy ...string) error { containerNames, containerNamesErr := findAllDockerContainerNames(containerNamePattern) if containerNamesErr != nil { return errors.Wrap(containerNamesErr, "failed to find Docker containers") } - if len(containerNames) == 0 { return fmt.Errorf("no Docker containers found with name pattern %s", containerNamePattern) } - frameworkDockerClient, frameworkDockerClientErr := framework.NewDockerClient() - if frameworkDockerClientErr != nil { - return errors.Wrap(frameworkDockerClientErr, "failed to create framework Docker client") - } - - for _, containerName := range containerNames { - execOutput, execOutputErr := frameworkDockerClient.ExecContainer(containerName, []string{"mkdir", "-p", targetDir}) - if execOutputErr != nil { - fmt.Fprint(os.Stderr, execOutput) - return errors.Wrap(execOutputErr, "failed to execute mkdir command in Docker container") - } - - copyErr := 
frameworkDockerClient.CopyFile(containerName, filePath, targetDir) - if copyErr != nil { - fmt.Fprint(os.Stderr, execOutput) - return errors.Wrap(copyErr, "failed to copy artifact to Docker container") - } - - dockerClient, dockerClientErr := dc.NewClientWithOpts(dc.FromEnv, dc.WithAPIVersionNegotiation()) - if dockerClientErr != nil { - return errors.Wrap(dockerClientErr, "failed to create Docker client") - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - - containerJSON, ispectErr := dockerClient.ContainerInspect(ctx, containerName) - if ispectErr != nil { - cancel() - return errors.Wrap(ispectErr, "failed to inspect Docker container") - } - cancel() - user := containerJSON.Config.User - // if not running as root, change ownership to user that is running the container to avoid permission issues - if user != "" { - targetFilePath := filepath.Join(targetDir, filepath.Base(filePath)) - execConfig := ctypes.ExecOptions{ - Cmd: []string{"chown", user, targetFilePath}, - AttachStdout: true, - AttachStderr: true, - User: "root", - } - execOutput, execOutputErr := frameworkDockerClient.ExecContainerOptions(containerName, execConfig) - if execOutputErr != nil { - fmt.Fprint(os.Stderr, execOutput) - return errors.Wrap(execOutputErr, "failed to execute mkdir command in Docker container") - } - fmt.Println("output " + execOutput) + existingFiles := make([]string, 0, len(filesToCopy)) + for _, file := range filesToCopy { + if _, err := os.Stat(file); err != nil { + fmt.Fprintf(os.Stderr, "Warning: File '%s' does not exist. 
Skipping file copying to docker containers\n", file) + continue } + existingFiles = append(existingFiles, file) } + if len(existingFiles) == 0 { + return nil + } + return CopyFilesToDockerContainers(context.Background(), containerNames, containerTargetDir, existingFiles) +} - return nil +func CopyFilesToDockerContainers(ctx context.Context, containerNames []string, targetDir string, files []string) error { + return dockerops.CopyFilesToContainers(ctx, containerNames, targetDir, files) } diff --git a/system-tests/tests/smoke/cre/v2_grpc_source_test.go b/system-tests/tests/smoke/cre/v2_grpc_source_test.go index 7eaf091b511..66389b1de25 100644 --- a/system-tests/tests/smoke/cre/v2_grpc_source_test.go +++ b/system-tests/tests/smoke/cre/v2_grpc_source_test.go @@ -24,6 +24,9 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/workflows/privateregistry" crontypes "github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v2/cron/types" + creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" grpcsourcemock "github.com/smartcontractkit/chainlink/system-tests/lib/cre/grpc_source_mock" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" @@ -590,10 +593,35 @@ func compileAndCopyWorkflow(t *testing.T, testEnv *ttypes.TestEnvironment, workf } require.NotEmpty(t, workflowDONName, "failed to find workflow DON name") - // Copy to containers + // Copy workflow artifacts to local or remote workflow DON targets. 
testLogger.Info().Str("workflowName", workflowName).Str("donName", workflowDONName).Msg("Copying workflow artifacts to containers...") containerTargetDir := creworkflow.DefaultWorkflowTargetDir - err = creworkflow.CopyArtifactsToDockerContainers(containerTargetDir, ns.NodeNamePrefix(workflowDONName), compressedWasmPath, configFilePath) + mode := creworkflow.ArtifactDeployModeLocal + for _, nodeSet := range testEnv.Config.NodeSets { + if nodeSet != nil && nodeSet.Name == workflowDONName && nodeSet.Target == string(envconfig.TargetRemote) { + mode = creworkflow.ArtifactDeployModeRemote + break + } + } + var remoteTunnelManager tunnel.Manager + if mode == creworkflow.ArtifactDeployModeRemote { + remoteTunnelManager, err = creenv.NewEC2TunnelManager(testLogger) + require.NoError(t, err, "failed to initialize tunnel manager for remote artifact deploy") + defer func() { _ = remoteTunnelManager.Stop(ctx) }() + } + err = creworkflow.DeployArtifacts( + ctx, + creworkflow.DeployArtifactsOptions{ + Mode: mode, + NodeSetName: workflowDONName, + ContainerNamePattern: ns.NodeNamePrefix(workflowDONName), + ContainerTargetDir: containerTargetDir, + Files: []string{compressedWasmPath, configFilePath}, + RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { + return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, remoteTunnelManager, nodeSetName, containerTargetDir, files) + }, + }, + ) require.NoError(t, err, "failed to copy workflow artifacts to containers") // Return the file:// URLs that nodes will use to fetch the artifacts diff --git a/system-tests/tests/test-helpers/t_helpers.go b/system-tests/tests/test-helpers/t_helpers.go index e73824c14ae..b820ec634d9 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -59,8 +59,11 @@ import ( keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" + creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" crecrypto "github.com/smartcontractkit/chainlink/system-tests/lib/crypto" @@ -327,7 +330,7 @@ It returns the paths to: 1. the compressed WASM file; 2. the workflow config file. */ -func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog.Logger, workflowName string, workflowDONs []*cre.Don, workflowConfig *T, workflowFileLocation string) (string, string) { +func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog.Logger, testEnv *ttypes.TestEnvironment, workflowName string, workflowDONs []*cre.Don, workflowConfig *T, workflowFileLocation string) (string, string) { t.Helper() workflowConfigFilePath := workflowConfigFactory(t, testLogger, workflowName, workflowConfig) @@ -337,8 +340,32 @@ func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog. 
// Copy workflow artifacts to Docker containers to use blockchain client running inside for workflow registration testLogger.Info().Msg("Copying workflow artifacts to Docker containers.") + var remoteTunnelManager tunnel.Manager + defer func() { + if remoteTunnelManager != nil { + _ = remoteTunnelManager.Stop(t.Context()) + } + }() for _, don := range workflowDONs { - copyErr := creworkflow.CopyArtifactsToDockerContainers(creworkflow.DefaultWorkflowTargetDir, ns.NodeNamePrefix(don.Name), compressedWorkflowWasmPath, workflowConfigFilePath) + mode, nodeSetName := resolveWorkflowDONArtifactMode(testEnv.Config, don.Name) + if mode == creworkflow.ArtifactDeployModeRemote && remoteTunnelManager == nil { + manager, managerErr := creenv.NewEC2TunnelManager(testLogger) + require.NoError(t, managerErr, "failed to initialize tunnel manager for remote artifact deploy") + remoteTunnelManager = manager + } + copyErr := creworkflow.DeployArtifacts( + t.Context(), + creworkflow.DeployArtifactsOptions{ + Mode: mode, + NodeSetName: nodeSetName, + ContainerNamePattern: ns.NodeNamePrefix(don.Name), + ContainerTargetDir: creworkflow.DefaultWorkflowTargetDir, + Files: []string{compressedWorkflowWasmPath, workflowConfigFilePath}, + RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { + return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, remoteTunnelManager, nodeSetName, containerTargetDir, files) + }, + }, + ) require.NoError(t, copyErr, "failed to copy workflow artifacts to docker containers") } testLogger.Info().Msg("Workflow artifacts successfully copied to the Docker containers.") @@ -639,7 +666,7 @@ func CompileAndDeployWorkflow[T WorkflowConfig](t *testing.T, workflowDONs = append(workflowDONs, don) } - compressedWorkflowWasmPath, workflowConfigPath := createWorkflowArtifacts(t, testLogger, workflowName, workflowDONs, workflowConfig, workflowFileLocation) + compressedWorkflowWasmPath, workflowConfigPath := 
createWorkflowArtifacts(t, testLogger, testEnv, workflowName, workflowDONs, workflowConfig, workflowFileLocation) require.NotEmpty(t, compressedWorkflowWasmPath, "failed to find workflow DON in the topology") workflowRegistryAddress := crecontracts.MustGetAddressRefFromDataStore(testEnv.CreEnvironment.CldfEnvironment.DataStore, testEnv.CreEnvironment.Blockchains[0].ChainSelector(), keystone_changeset.WorkflowRegistry.String(), testEnv.CreEnvironment.ContractVersions[keystone_changeset.WorkflowRegistry.String()], "") @@ -660,3 +687,19 @@ func CompileAndDeployWorkflow[T WorkflowConfig](t *testing.T, workflowID := registerWorkflow(t.Context(), t, workflowRegConfig, testEnv.CreEnvironment.Blockchains[0].(*evm.Blockchain).SethClient, testLogger) return workflowID } + +func resolveWorkflowDONArtifactMode(cfg *envconfig.Config, donName string) (creworkflow.ArtifactDeployMode, string) { + if cfg == nil { + return creworkflow.ArtifactDeployModeLocal, donName + } + for _, nodeSet := range cfg.NodeSets { + if nodeSet == nil || nodeSet.Name != donName { + continue + } + if strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + return creworkflow.ArtifactDeployModeRemote, nodeSet.Name + } + return creworkflow.ArtifactDeployModeLocal, nodeSet.Name + } + return creworkflow.ArtifactDeployModeLocal, donName +} From 9d878208de6583d7f3ee003de542ee2eb141c269 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Fri, 20 Feb 2026 19:31:35 +0100 Subject: [PATCH 07/34] add direct mode for ec2 agent that doesn't create any tunnels, add untested mixed mode --- .../configs/workflow-gateway-don-mixed.toml | 90 +++++ .../configs/workflow-gateway-don.toml | 16 +- system-tests/lib/cre/connectivity/chooser.go | 122 +++++++ .../lib/cre/connectivity/chooser_test.go | 86 +++++ system-tests/lib/cre/don/config/config.go | 163 +++++++-- .../environment/agent/cmd/local-agent/main.go | 7 +- .../lib/cre/environment/agent/relay.go | 290 ++++++++++++++++ .../lib/cre/environment/agent/server.go 
| 10 + .../lib/cre/environment/blockchain_start.go | 89 ++++- .../cre/environment/blockchain_start_test.go | 66 ++++ .../lib/cre/environment/config/config.go | 34 +- system-tests/lib/cre/environment/dons.go | 195 ++++++++++- .../lib/cre/environment/environment.go | 20 +- system-tests/lib/cre/environment/jobs.go | 64 ++++ system-tests/lib/cre/gateway.go | 1 + .../lib/cre/runtimecfg/access_mode.go | 38 +++ system-tests/lib/cre/types.go | 1 + system-tests/lib/cre/vault/vault.go | 34 +- system-tests/lib/cre/workflow/registry.go | 28 +- .../cre/v2_http_trigger_regression_test.go | 1 + system-tests/tests/smoke/cre/README.md | 1 + .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 51 +++ .../tests/smoke/cre/billing_helpers.go | 6 +- .../tests/smoke/cre/cre_suite_test.go | 8 + system-tests/tests/smoke/cre/por_test.go | 1 + .../tests/smoke/cre/v2_http_action_test.go | 2 + .../smoke/cre/v2_http_trigger_action_test.go | 1 + .../tests/smoke/cre/v2_vault_don_test.go | 8 +- .../test-helpers/chip_testsink_helpers.go | 4 + .../test-helpers/fixture_relay_helpers.go | 317 ++++++++++++++++++ 30 files changed, 1693 insertions(+), 61 deletions(-) create mode 100644 core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml create mode 100644 system-tests/lib/cre/connectivity/chooser.go create mode 100644 system-tests/lib/cre/connectivity/chooser_test.go create mode 100644 system-tests/lib/cre/environment/agent/relay.go create mode 100644 system-tests/lib/cre/runtimecfg/access_mode.go create mode 100644 system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md create mode 100644 system-tests/tests/test-helpers/fixture_relay_helpers.go diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml new file mode 100644 index 00000000000..a9e04343326 --- /dev/null +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml @@ -0,0 +1,90 @@ + +[[blockchains]] + type = "anvil" + 
chain_id = "1337" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + target = "remote" + +[[blockchains]] + type = "anvil" + chain_id = "2337" + port = "8546" + docker_cmd_params = ["-b", "0.5", "--mixed-mining"] + target = "remote" + container_name = "anvil-2337" + remote_start_policy = "always" + +[jd] + csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string + # change to your version + image = "job-distributor:0.22.1" + target = "local" + # we need fresh DB on each run to avoid DB-level job name uniquness violations + remote_start_policy = "always" + +[fake] + port = 8171 + +[fake_http] + port = 8666 + +#[s3provider] +# # use all defaults +# port = 9000 +# console_port = 9001 + +[infra] + # either "docker" or "kubernetes" + type = "docker" + +[[nodesets]] + nodes = 4 + name = "workflow" + don_types = ["workflow"] + override_mode = "all" + http_port_range_start = 10100 + target = "remote" + + env_vars = { CL_EVM_CMD = "" } + capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] + + [nodesets.db] + image = "postgres:12.0" + port = 13000 + +[[nodesets.node_specs]] + roles = ["plugin"] + [nodesets.node_specs.node] + #docker_ctx = "../../../.." 
+ #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-amd:latest" + user_config_overrides = "" + +[[nodesets]] + nodes = 1 + name = "bootstrap-gateway" + don_types = ["bootstrap", "gateway"] + override_mode = "each" + http_port_range_start = 10300 + target = "remote" + + env_vars = { CL_EVM_CMD = "" } + supported_evm_chains = [1337, 2337] + + [nodesets.db] + image = "postgres:12.0" + port = 13200 + + [[nodesets.node_specs]] + roles = ["bootstrap", "gateway"] + [nodesets.node_specs.node] + #ocker_ctx = "../../../.." + #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-amd:latest" + # 5002 is the web API capabilities port for incoming requests + # 15002 is the vault port for incoming requests + custom_ports = ["5002:5002","15002:15002"] + # image = "chainlink-tmp:latest" + user_config_overrides = "" diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don.toml b/core/scripts/cre/environment/configs/workflow-gateway-don.toml index ef5ddbb45ff..60a9a49665f 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don.toml @@ -47,10 +47,10 @@ [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] - docker_ctx = "../../../.." - docker_file = "core/chainlink.Dockerfile" - docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - # image = "chainlink-tmp:latest" + #docker_ctx = "../../../.." + #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-tmp:latest" user_config_overrides = "" [[nodesets]] @@ -70,11 +70,11 @@ [[nodesets.node_specs]] roles = ["bootstrap", "gateway"] [nodesets.node_specs.node] - docker_ctx = "../../../.." - docker_file = "core/chainlink.Dockerfile" - docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + #docker_ctx = "../../../.." 
+ #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } # 5002 is the web API capabilities port for incoming requests # 15002 is the vault port for incoming requests custom_ports = ["5002:5002","15002:15002"] - # image = "chainlink-tmp:latest" + image = "chainlink-tmp:latest" user_config_overrides = "" diff --git a/system-tests/lib/cre/connectivity/chooser.go b/system-tests/lib/cre/connectivity/chooser.go new file mode 100644 index 00000000000..550d69c9515 --- /dev/null +++ b/system-tests/lib/cre/connectivity/chooser.go @@ -0,0 +1,122 @@ +package connectivity + +import ( + "context" + "fmt" + "net" + "net/url" + "strconv" + "strings" +) + +type Placement string + +const ( + PlacementLocal Placement = "local" + PlacementRemote Placement = "remote" +) + +type EndpointPair struct { + Name string + Internal string + External string +} + +type Resolution struct { + URL string + SelectedKind string + RequiresBridge bool + BridgePort int +} + +type BridgeEnsurer func(ctx context.Context, endpoint EndpointPair, port int) error + +func Resolve(caller, target Placement, endpoint EndpointPair) (*Resolution, error) { + if caller == "" || target == "" { + return nil, fmt.Errorf("caller and target placement must be set") + } + + selectedKind := "internal" + selectedURL := strings.TrimSpace(endpoint.Internal) + if caller != target { + selectedKind = "external" + selectedURL = strings.TrimSpace(endpoint.External) + } + if selectedURL == "" { + return nil, fmt.Errorf("missing %s url for endpoint %q", selectedKind, endpoint.Name) + } + + res := &Resolution{URL: selectedURL, SelectedKind: selectedKind} + if caller == PlacementRemote && target == PlacementLocal { + port, err := endpointPort(selectedURL) + if err != nil { + return nil, fmt.Errorf("failed to resolve bridge port for endpoint %q: %w", endpoint.Name, err) + } + res.RequiresBridge = true + res.BridgePort = port + } + return res, nil +} + +func ResolveAndEnsureReachable( + ctx 
context.Context, + caller, target Placement, + endpoint EndpointPair, + ensureBridge BridgeEnsurer, +) (*Resolution, error) { + res, err := Resolve(caller, target, endpoint) + if err != nil { + return nil, err + } + if !res.RequiresBridge { + return res, nil + } + if ensureBridge == nil { + return nil, fmt.Errorf("bridge required for endpoint %q (remote caller -> local target) but no bridge ensurer was provided", endpoint.Name) + } + if err := ensureBridge(ctx, endpoint, res.BridgePort); err != nil { + return nil, fmt.Errorf("ensure bridge for endpoint %q on port %d: %w", endpoint.Name, res.BridgePort, err) + } + return res, nil +} + +func PlacementFromTarget(target string) (Placement, error) { + switch strings.ToLower(strings.TrimSpace(target)) { + case "", "docker", "local": + return PlacementLocal, nil + case "remote": + return PlacementRemote, nil + default: + return "", fmt.Errorf("unsupported component target %q", target) + } +} + +func endpointPort(raw string) (int, error) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return 0, fmt.Errorf("endpoint is empty") + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil { + return 0, fmt.Errorf("parse url: %w", err) + } + if parsed.Port() == "" { + return 0, fmt.Errorf("url has no explicit port") + } + port, err := strconv.Atoi(parsed.Port()) + if err != nil || port <= 0 || port > 65535 { + return 0, fmt.Errorf("invalid port %q", parsed.Port()) + } + return port, nil + } + _, portRaw, err := net.SplitHostPort(trimmed) + if err != nil { + return 0, fmt.Errorf("parse host:port: %w", err) + } + port, err := strconv.Atoi(portRaw) + if err != nil || port <= 0 || port > 65535 { + return 0, fmt.Errorf("invalid port %q", portRaw) + } + return port, nil +} diff --git a/system-tests/lib/cre/connectivity/chooser_test.go b/system-tests/lib/cre/connectivity/chooser_test.go new file mode 100644 index 00000000000..632c5b14bfb --- /dev/null +++ 
b/system-tests/lib/cre/connectivity/chooser_test.go @@ -0,0 +1,86 @@ +package connectivity + +import ( + "context" + "errors" + "testing" +) + +func TestResolveSamePlacementUsesInternal(t *testing.T) { + r, err := Resolve(PlacementLocal, PlacementLocal, EndpointPair{ + Name: "evm-rpc", + Internal: "http://anvil:8545", + External: "http://10.0.0.1:8545", + }) + if err != nil { + t.Fatalf("expected resolve to succeed: %v", err) + } + if r.URL != "http://anvil:8545" || r.SelectedKind != "internal" { + t.Fatalf("unexpected resolution: %+v", r) + } + if r.RequiresBridge { + t.Fatalf("did not expect bridge requirement for same placement") + } +} + +func TestResolveRemoteToLocalRequiresBridge(t *testing.T) { + r, err := Resolve(PlacementRemote, PlacementLocal, EndpointPair{ + Name: "jd-grpc", + Internal: "jd:14231", + External: "127.0.0.1:14231", + }) + if err != nil { + t.Fatalf("expected resolve to succeed: %v", err) + } + if !r.RequiresBridge || r.BridgePort != 14231 { + t.Fatalf("expected bridge requirement with port 14231, got %+v", r) + } +} + +func TestResolveAndEnsureReachableCallsEnsurer(t *testing.T) { + called := false + r, err := ResolveAndEnsureReachable(context.Background(), PlacementRemote, PlacementLocal, EndpointPair{ + Name: "jd-grpc", + Internal: "jd:14231", + External: "127.0.0.1:14231", + }, func(_ context.Context, endpoint EndpointPair, port int) error { + called = true + if endpoint.Name != "jd-grpc" || port != 14231 { + t.Fatalf("unexpected bridge args: endpoint=%s port=%d", endpoint.Name, port) + } + return nil + }) + if err != nil { + t.Fatalf("expected resolve+ensure to succeed: %v", err) + } + if !called { + t.Fatalf("expected bridge ensurer to be called") + } + if r.URL != "127.0.0.1:14231" { + t.Fatalf("unexpected resolution URL: %s", r.URL) + } +} + +func TestResolveAndEnsureReachableFailsWithoutEnsurer(t *testing.T) { + _, err := ResolveAndEnsureReachable(context.Background(), PlacementRemote, PlacementLocal, EndpointPair{ + Name: 
"jd-grpc", + Internal: "jd:14231", + External: "127.0.0.1:14231", + }, nil) + if err == nil { + t.Fatalf("expected missing bridge ensurer to fail") + } +} + +func TestResolveAndEnsureReachablePropagatesEnsurerError(t *testing.T) { + _, err := ResolveAndEnsureReachable(context.Background(), PlacementRemote, PlacementLocal, EndpointPair{ + Name: "jd-grpc", + Internal: "jd:14231", + External: "127.0.0.1:14231", + }, func(_ context.Context, _ EndpointPair, _ int) error { + return errors.New("boom") + }) + if err == nil { + t.Fatalf("expected ensurer error to be returned") + } +} diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 0cb87111214..b14087b2436 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -33,6 +33,7 @@ import ( libc "github.com/smartcontractkit/chainlink/system-tests/lib/conversions" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creblockchains "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" @@ -46,6 +47,7 @@ func PrepareNodeTOMLs( topology *cre.Topology, creEnv *cre.Environment, nodeSets []*cre.NodeSet, + blockchainTargetBySelector map[uint64]string, capabilities []cre.InstallableCapability, // Deprecated, use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -104,16 +106,17 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, - Blockchains: chainPerSelector, - Flags: 
donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + Datastore: creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, + Blockchains: chainPerSelector, + BlockchainTargetBySelector: blockchainTargetBySelector, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -487,16 +490,42 @@ func addWorkerNodeConfig( if !ok { return existingConfig, fmt.Errorf("failed to get EVM key (chainID %d, node index %d)", commonInputs.registryChainID, m.Index) } + callerPlacement, placementErr := connectivity.PlacementFromTarget(donMetadata.MustNodeSet().Target) + if placementErr != nil { + return existingConfig, placementErr + } + placementByGatewayNodeUUID, placementMapErr := gatewayPlacementByNodeUUID(topology) + if placementMapErr != nil { + return existingConfig, placementMapErr + } gateways := []coretoml.ConnectorGateway{} if topology != nil && len(topology.GatewayConnectors.Configurations) > 0 { for _, gateway := range topology.GatewayConnectors.Configurations { + gatewayPlacement, ok := placementByGatewayNodeUUID[gateway.NodeUUID] + if !ok { + return existingConfig, fmt.Errorf("failed to resolve placement for gateway node UUID %s", gateway.NodeUUID) + } + internalURL := fmt.Sprintf("ws://%s:%d%s", gateway.Outgoing.Host, gateway.Outgoing.Port, gateway.Outgoing.Path) + externalURL := gatewayExternalConnectorURL(gateway) + resolvedGateway, err := connectivity.ResolveAndEnsureReachable( + context.Background(), + callerPlacement, + gatewayPlacement, + connectivity.EndpointPair{ + Name: fmt.Sprintf("gateway-%s", gateway.AuthGatewayID), + Internal: internalURL, + External: 
externalURL, + }, + // Bridge creation for remote->local gateway is handled outside config generation. + func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }, + ) + if err != nil { + return existingConfig, err + } gateways = append(gateways, coretoml.ConnectorGateway{ - ID: ptr.Ptr(gateway.AuthGatewayID), - URL: ptr.Ptr(fmt.Sprintf("ws://%s:%d%s", - gateway.Outgoing.Host, - gateway.Outgoing.Port, - gateway.Outgoing.Path)), + ID: ptr.Ptr(gateway.AuthGatewayID), + URL: ptr.Ptr(resolvedGateway.URL), }) } @@ -612,7 +641,10 @@ func gatherCommonInputs(input cre.GenerateConfigsInput) (*commonInputs, error) { return nil, errors.Wrap(homeErr, "failed to get home chain ID") } - evmChains := findEVMChains(input) + evmChains, evmErr := findEVMChains(input) + if evmErr != nil { + return nil, errors.Wrap(evmErr, "failed to resolve EVM chain endpoints for node config") + } solanaChain, solErr := findOneSolanaChain(input) if solErr != nil { return nil, errors.Wrap(solErr, "failed to find Solana chain in the environment configuration") @@ -645,8 +677,12 @@ type evmChain struct { WSRPC string } -func findEVMChains(input cre.GenerateConfigsInput) []*evmChain { +func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { evmChains := make([]*evmChain, 0) + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Target) + if err != nil { + return nil, err + } for chainSelector, bcOut := range input.Blockchains { if bcOut.IsFamily(chain_selectors.FamilySolana) { continue @@ -657,14 +693,39 @@ func findEVMChains(input cre.GenerateConfigsInput) []*evmChain { continue } + targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainTargetBySelector[chainSelector]) + if err != nil { + return nil, err + } + resolvedHTTP, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("evm-http-%d", bcOut.ChainID()), + 
Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { + return fmt.Errorf("bridge is required for node->blockchain HTTP endpoint on chain %d (remote caller -> local target), automatic component bridge is not implemented yet", bcOut.ChainID()) + }) + if err != nil { + return nil, err + } + resolvedWS, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), + Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { + return fmt.Errorf("bridge is required for node->blockchain WS endpoint on chain %d (remote caller -> local target), automatic component bridge is not implemented yet", bcOut.ChainID()) + }) + if err != nil { + return nil, err + } + evmChains = append(evmChains, &evmChain{ Name: fmt.Sprintf("node-%d", chainSelector), ChainID: bcOut.ChainID(), - HTTPRPC: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, - WSRPC: bcOut.CtfOutput().Nodes[0].InternalWSUrl, + HTTPRPC: resolvedHTTP.URL, + WSRPC: resolvedWS.URL, }) } - return evmChains + return evmChains, nil } type solanaChain struct { @@ -676,6 +737,10 @@ type solanaChain struct { func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { var solChain *solanaChain chainsFound := 0 + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Target) + if err != nil { + return nil, err + } for _, bcOut := range input.Blockchains { if !bcOut.IsFamily(chain_selectors.FamilySolana) { @@ -688,6 +753,20 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { } solBc := bcOut.(*solana.Blockchain) + targetPlacement, err := 
connectivity.PlacementFromTarget(input.BlockchainTargetBySelector[solBc.ChainSelector()]) + if err != nil { + return nil, err + } + resolvedNodeURL, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: "solana-rpc", + Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { + return errors.New("bridge is required for node->solana RPC endpoint (remote caller -> local target), automatic component bridge is not implemented yet") + }) + if err != nil { + return nil, err + } ctx, cancelFn := context.WithTimeout(context.Background(), 15*time.Second) chainID, err := solBc.SolClient.GetGenesisHash(ctx) @@ -700,13 +779,55 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { solChain = &solanaChain{ Name: fmt.Sprintf("node-%d", solBc.ChainSelector()), ChainID: chainID.String(), - NodeURL: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + NodeURL: resolvedNodeURL.URL, } } return solChain, nil } +func gatewayPlacementByNodeUUID(topology *cre.Topology) (map[string]connectivity.Placement, error) { + out := make(map[string]connectivity.Placement) + if topology == nil { + return out, nil + } + for _, don := range topology.DonsMetadata.List() { + placement, err := connectivity.PlacementFromTarget(don.MustNodeSet().Target) + if err != nil { + return nil, err + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + out[node.UUID] = placement + } + } + return out, nil +} + +func gatewayExternalConnectorURL(gateway *cre.DonGatewayConfiguration) string { + if gateway == nil || gateway.GatewayConfiguration == nil { + return "" + } + scheme := "ws" + switch strings.ToLower(strings.TrimSpace(gateway.Incoming.Protocol)) { + case "https": + scheme = "wss" + case "wss": + scheme = "wss" + case 
"http": + scheme = "ws" + } + path := strings.TrimSpace(gateway.Incoming.Path) + if path == "" || path == "/" { + path = "/node" + } else if !strings.HasSuffix(path, "/node") { + path = strings.TrimRight(path, "/") + "/node" + } + return fmt.Sprintf("%s://%s:%d%s", scheme, gateway.Incoming.Host, gateway.Incoming.ExternalPort, path) +} + func buildTronEVMConfig(evmChain *evmChain) evmconfigtoml.EVMConfig { tronRPC := strings.Replace(evmChain.HTTPRPC, "jsonrpc", "wallet", 1) return evmconfigtoml.EVMConfig{ diff --git a/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go b/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go index 05e7cefb04c..55c4ac6c655 100644 --- a/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go +++ b/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go @@ -12,11 +12,16 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" blockchainsets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) func main() { - addr := flag.String("addr", "127.0.0.1:18080", "agent listen address") + defaultAddr := "127.0.0.1:18080" + if runtimecfg.IsDirectMode() { + defaultAddr = "0.0.0.0:18080" + } + addr := flag.String("addr", defaultAddr, "agent listen address") flag.Parse() lggr := zerolog.New(os.Stderr).With().Timestamp().Logger() diff --git a/system-tests/lib/cre/environment/agent/relay.go b/system-tests/lib/cre/environment/agent/relay.go new file mode 100644 index 00000000000..67ceb856c6b --- /dev/null +++ b/system-tests/lib/cre/environment/agent/relay.go @@ -0,0 +1,290 @@ +package agent + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "net" + "net/http" + "strconv" + "strings" + "sync/atomic" + "time" + + "github.com/gorilla/websocket" +) + +const relayIncomingQueueSize = 64 + +var relayIDSeq uint64 
+ +type relayRegistration struct { + ID string + Name string + RequestedPort int + Listener net.Listener + Incoming chan net.Conn + Closed chan struct{} +} + +type openRelayRequest struct { + Name string `json:"name"` + RequestedPort int `json:"requestedPort"` +} + +type openRelayResponse struct { + RelayID string `json:"relayId"` + RequestedPort int `json:"requestedPort"` + BoundPort int `json:"boundPort"` +} + +type closeRelayRequest struct { + RelayID string `json:"relayId"` +} + +var relayWSUpgrader = websocket.Upgrader{ + CheckOrigin: func(_ *http.Request) bool { return true }, +} + +func (s *Server) openRelay(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + var req openRelayRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidRequestBody, fmt.Sprintf("invalid relay open request body: %v", err), nil) + return + } + req.Name = strings.TrimSpace(req.Name) + if req.Name == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "relay name is required", nil) + return + } + if req.RequestedPort < 0 || req.RequestedPort > 65535 { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid requestedPort %d", req.RequestedPort), nil) + return + } + + // Idempotent open: same name+port returns the existing relay. 
+ s.relayMu.Lock() + for _, relay := range s.relays { + if relay.Name == req.Name && relay.RequestedPort == req.RequestedPort { + s.relayMu.Unlock() + s.respondJSONAny(w, http.StatusOK, openRelayResponse{ + RelayID: relay.ID, + RequestedPort: relay.RequestedPort, + BoundPort: listenerPort(relay.Listener), + }) + return + } + } + s.relayMu.Unlock() + + listenAddr := fmt.Sprintf("0.0.0.0:%d", req.RequestedPort) + ln, err := net.Listen("tcp", listenAddr) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to open relay listener: %v", err), nil) + return + } + + relayID := fmt.Sprintf("relay-%x", atomic.AddUint64(&relayIDSeq, 1)) + reg := &relayRegistration{ + ID: relayID, + Name: req.Name, + RequestedPort: req.RequestedPort, + Listener: ln, + Incoming: make(chan net.Conn, relayIncomingQueueSize), + Closed: make(chan struct{}), + } + + s.relayMu.Lock() + s.relays[relayID] = reg + s.relayMu.Unlock() + + go s.acceptRelayConnections(reg) + + s.respondJSONAny(w, http.StatusOK, openRelayResponse{ + RelayID: relayID, + RequestedPort: req.RequestedPort, + BoundPort: listenerPort(ln), + }) +} + +func (s *Server) closeRelay(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + var req closeRelayRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidRequestBody, fmt.Sprintf("invalid relay close request body: %v", err), nil) + return + } + relayID := strings.TrimSpace(req.RelayID) + if relayID == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "relayId is required", nil) + return + } + + s.relayMu.Lock() + relay, ok := s.relays[relayID] + if ok { + delete(s.relays, relayID) + } + s.relayMu.Unlock() + + if !ok { + s.respondJSONAny(w, http.StatusOK, map[string]any{"relayId": relayID, 
"closed": false, "found": false}) + return + } + close(relay.Closed) + _ = relay.Listener.Close() + drainAndCloseIncoming(relay.Incoming) + + s.respondJSONAny(w, http.StatusOK, map[string]any{"relayId": relayID, "closed": true, "found": true}) +} + +func (s *Server) connectRelay(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + relayID := strings.TrimSpace(r.URL.Query().Get("relayId")) + if relayID == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "relayId query parameter is required", nil) + return + } + + s.relayMu.Lock() + relay, ok := s.relays[relayID] + s.relayMu.Unlock() + if !ok { + s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, fmt.Sprintf("relay not found: %s", relayID), nil) + return + } + + wsConn, err := relayWSUpgrader.Upgrade(w, r, nil) + if err != nil { + return + } + defer wsConn.Close() + + var incoming net.Conn + select { + case incoming = <-relay.Incoming: + case <-relay.Closed: + _ = wsConn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, "relay closed"), time.Now().Add(2*time.Second)) + return + case <-r.Context().Done(): + return + } + if incoming == nil { + return + } + defer incoming.Close() + + _ = bridgeWebSocketAndTCP(wsConn, incoming) +} + +func (s *Server) acceptRelayConnections(relay *relayRegistration) { + for { + conn, err := relay.Listener.Accept() + if err != nil { + select { + case <-relay.Closed: + return + default: + } + if ne, ok := err.(net.Error); ok && ne.Temporary() { + time.Sleep(50 * time.Millisecond) + continue + } + return + } + + select { + case relay.Incoming <- conn: + default: + _ = conn.Close() + } + } +} + +func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) error { + errCh := make(chan error, 2) + + go func() { + buf := make([]byte, 32*1024) + for { + n, err := tcp.Read(buf) 
+ if n > 0 { + if wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]); wErr != nil { + errCh <- wErr + return + } + } + if err != nil { + errCh <- err + return + } + } + }() + + go func() { + for { + msgType, payload, err := ws.ReadMessage() + if err != nil { + errCh <- err + return + } + if msgType != websocket.BinaryMessage && msgType != websocket.TextMessage { + continue + } + if len(payload) == 0 { + continue + } + if _, wErr := tcp.Write(payload); wErr != nil { + errCh <- wErr + return + } + } + }() + + err := <-errCh + if err == nil || errors.Is(err, io.EOF) || websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { + return nil + } + return err +} + +func drainAndCloseIncoming(ch chan net.Conn) { + for { + select { + case conn := <-ch: + if conn != nil { + _ = conn.Close() + } + default: + return + } + } +} + +func listenerPort(ln net.Listener) int { + if ln == nil { + return 0 + } + _, portRaw, err := net.SplitHostPort(ln.Addr().String()) + if err != nil { + return 0 + } + port, err := strconv.Atoi(portRaw) + if err != nil { + return 0 + } + return port +} diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index eee0f683d24..274dc4779f5 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -103,6 +103,8 @@ type Server struct { cacheMu sync.Mutex cache map[string]cachedStart runtime map[string]runtimeState + relayMu sync.Mutex + relays map[string]*relayRegistration } type cachedStart struct { @@ -121,6 +123,7 @@ func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockch deployers: deployers, cache: make(map[string]cachedStart), runtime: make(map[string]runtimeState), + relays: make(map[string]*relayRegistration), } } @@ -128,6 +131,9 @@ func (s *Server) Handler() http.Handler { mux := http.NewServeMux() mux.HandleFunc("/v1/health", s.health) 
mux.HandleFunc("/v1/components/start", s.startComponent) + mux.HandleFunc("/v1/relay/open", s.openRelay) + mux.HandleFunc("/v1/relay/close", s.closeRelay) + mux.HandleFunc("/v1/relay/connect", s.connectRelay) return mux } @@ -392,6 +398,10 @@ func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, comp } func (s *Server) respondJSON(w http.ResponseWriter, code int, body StartComponentResponse) { + s.respondJSONAny(w, code, body) +} + +func (s *Server) respondJSONAny(w http.ResponseWriter, code int, body any) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(code) _ = json.NewEncoder(w).Encode(body) diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 6500cdff703..67c0f0227de 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -28,6 +28,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) const ( @@ -232,22 +233,24 @@ func resolveEC2AgentBaseURL(testLogger zerolog.Logger, tunnelManager tunnel.Mana if configured := os.Getenv(envEC2AgentURL); configured != "" { return configured, nil } - if tunnelManager == nil { - return "", errors.New("tunnel manager is required to auto-open ec2 agent tunnel") + remotePort, err := resolveEC2AgentPort() + if err != nil { + return "", err + } + if isRemoteAccessDirectMode() { + hostIP, err := resolveDirectAccessHostIP() + if err != nil { + return "", err + } + return fmt.Sprintf("http://%s:%d", hostIP, remotePort), nil } instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) if instanceID == "" { return "", fmt.Errorf("%s must be set when %s=ec2 and %s is not 
provided", envEC2InstanceID, envAgentMode, envEC2AgentURL) } - - remotePort := defaultEC2AgentPort - if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { - parsedPort, err := strconv.Atoi(configuredPort) - if err != nil || parsedPort <= 0 || parsedPort > 65535 { - return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) - } - remotePort = parsedPort + if tunnelManager == nil { + return "", errors.New("tunnel manager is required to auto-open ec2 agent tunnel") } bindings, err := tunnelManager.Start(context.Background(), []tunnel.EndpointRef{ @@ -276,6 +279,18 @@ func resolveEC2AgentBaseURL(testLogger zerolog.Logger, tunnelManager tunnel.Mana return bindings[0].LocalURL, nil } +func resolveEC2AgentPort() (int, error) { + remotePort := defaultEC2AgentPort + if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { + parsedPort, err := strconv.Atoi(configuredPort) + if err != nil || parsedPort <= 0 || parsedPort > 65535 { + return 0, fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) + } + remotePort = parsedPort + } + return remotePort, nil +} + func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { if output == nil { return nil, pkgerrors.New("blockchain output is nil") @@ -448,6 +463,9 @@ func newEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { if os.Getenv(envAgentMode) != "ec2" { return tunnel.NewNoopManager(), nil } + if isRemoteAccessDirectMode() { + return tunnel.NewNoopManager(), nil + } instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) if instanceID == "" { @@ -474,6 +492,13 @@ func rewriteRemoteBlockchainOutputForLocalAccess( if output == nil { return nil } + if isRemoteAccessDirectMode() { + hostIP, err := resolveDirectAccessHostIP() + if err != nil { + return err + } + return rewriteRemoteBlockchainOutputForDirectAccess(output, hostIP, rewriteInternalForLocalNodes) + } 
componentID := tunnel.CanonicalComponentID(tunnel.KindBlockchain, configuredIndex, input.Type) adapter := adapters.NewBlockchainAdapter() @@ -508,6 +533,42 @@ func rewriteRemoteBlockchainOutputForLocalAccess( return nil } +func rewriteRemoteBlockchainOutputForDirectAccess( + output *blockchain.Output, + hostIP string, + rewriteInternalForLocalNodes bool, +) error { + if output == nil { + return nil + } + for _, node := range output.Nodes { + if node == nil { + continue + } + if node.ExternalHTTPUrl != "" { + rewritten, err := rewriteURLHost(node.ExternalHTTPUrl, hostIP) + if err != nil { + return err + } + node.ExternalHTTPUrl = rewritten + if rewriteInternalForLocalNodes { + node.InternalHTTPUrl = rewritten + } + } + if node.ExternalWSUrl != "" { + rewritten, err := rewriteURLHost(node.ExternalWSUrl, hostIP) + if err != nil { + return err + } + node.ExternalWSUrl = rewritten + if rewriteInternalForLocalNodes { + node.InternalWSUrl = rewritten + } + } + } + return nil +} + func rewriteBlockchainInternalURLsForLocalNodes(output *blockchain.Output) error { if output == nil { return nil @@ -552,6 +613,14 @@ func rewriteURLHost(rawURL, host string) (string, error) { return parsed.String(), nil } +func isRemoteAccessDirectMode() bool { + return runtimecfg.IsDirectMode() +} + +func resolveDirectAccessHostIP() (string, error) { + return runtimecfg.DirectHostIP() +} + func remoteAgentError(code, message string) error { return fmt.Errorf("remote agent error (%s): %s", code, message) } diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index b724cc298b7..1e2b7cbb2f8 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -8,6 +8,7 @@ import ( "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) @@ -66,6 +67,7 @@ func TestResolveEC2AgentBaseURLRequiresInstanceIDWhenURLMissing(t *testing.T) { func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { t.Setenv(envEC2AgentURL, "") + t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeSSM) t.Setenv(envEC2InstanceID, "i-123") t.Setenv(envEC2AgentPort, "not-a-port") @@ -78,6 +80,25 @@ func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { } } +func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { + t.Setenv(envEC2AgentURL, "") + t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) + t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(envEC2AgentPort, "18080") + + manager := &fakeTunnelManager{} + baseURL, err := resolveEC2AgentBaseURL(zerolog.Nop(), manager) + if err != nil { + t.Fatalf("expected direct mode url resolution to succeed, got %v", err) + } + if baseURL != "http://10.193.28.183:18080" { + t.Fatalf("unexpected direct mode base url: %s", baseURL) + } + if manager.startCalls != 0 { + t.Fatalf("expected direct mode to skip tunnel manager") + } +} + func TestNewStartComponentClientLocalMode(t *testing.T) { t.Setenv(envAgentMode, "") t.Setenv(envEC2AgentURL, "") @@ -134,6 +155,7 @@ func (f *fakeTunnelManager) IsStarted() bool { return f.startCalls > 0 } func (f *fakeTunnelManager) Snapshot() []tunnel.TunnelBinding { return []tunnel.TunnelBinding{} } func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { + t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeSSM) out := &blockchain.Output{ Nodes: []*blockchain.Node{ { @@ -173,6 +195,50 @@ func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { } } +func TestRewriteRemoteBlockchainOutputForLocalAccessDirectMode(t *testing.T) { + t.Setenv(runtimecfg.EnvRemoteAccessMode, 
runtimecfg.RemoteAccessModeDirect) + t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + + out := &blockchain.Output{ + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: "http://anvil-1337:8545", + ExternalWSUrl: "ws://anvil-1337:8546", + InternalHTTPUrl: "http://anvil-1337:8545", + InternalWSUrl: "ws://anvil-1337:8546", + }, + }, + } + manager := &fakeTunnelManager{} + + if err := rewriteRemoteBlockchainOutputForLocalAccess( + context.Background(), + zerolog.Nop(), + manager, + 0, + &blockchain.Input{Type: blockchain.TypeAnvil}, + out, + true, + ); err != nil { + t.Fatalf("expected direct mode rewrite helper to succeed: %v", err) + } + if manager.startCalls != 0 { + t.Fatalf("expected direct mode to skip tunnel manager, got %d calls", manager.startCalls) + } + if out.Nodes[0].ExternalHTTPUrl != "http://10.193.28.183:8545" { + t.Fatalf("unexpected rewritten http url in direct mode: %s", out.Nodes[0].ExternalHTTPUrl) + } + if out.Nodes[0].ExternalWSUrl != "ws://10.193.28.183:8546" { + t.Fatalf("unexpected rewritten ws url in direct mode: %s", out.Nodes[0].ExternalWSUrl) + } + if out.Nodes[0].InternalHTTPUrl != "http://10.193.28.183:8545" { + t.Fatalf("unexpected rewritten internal http url in direct mode: %s", out.Nodes[0].InternalHTTPUrl) + } + if out.Nodes[0].InternalWSUrl != "ws://10.193.28.183:8546" { + t.Fatalf("unexpected rewritten internal ws url in direct mode: %s", out.Nodes[0].InternalWSUrl) + } +} + func TestNewEC2TunnelManagerReturnsNoopWhenNotApplicable(t *testing.T) { t.Setenv(envAgentMode, "") t.Setenv(envEC2InstanceID, "") diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index d6775a72fbb..6b6e8674d5d 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -74,6 +74,8 @@ type Config struct { type ComponentTarget string const ( + TargetLocal ComponentTarget = "local" + // TargetDocker is a backward-compatible alias for 
local placement. TargetDocker ComponentTarget = "docker" TargetRemote ComponentTarget = "remote" ) @@ -102,8 +104,9 @@ type JobDistributor struct { } func (b *Blockchain) Normalize() { + b.Target = normalizeComponentTarget(b.Target) if b.Target == "" { - b.Target = TargetDocker + b.Target = TargetLocal } if b.RemoteStartPolicy == "" { b.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical @@ -116,7 +119,7 @@ func (b *Blockchain) Validate() error { } b.Normalize() - if b.Target != TargetDocker && b.Target != TargetRemote { + if b.Target != TargetLocal && b.Target != TargetRemote { return fmt.Errorf("invalid blockchain target: %s", b.Target) } if b.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && b.RemoteStartPolicy != RemoteStartPolicyAlways { @@ -134,8 +137,9 @@ func (b *Blockchain) InputRef() *blockchain.Input { } func (j *JobDistributor) Normalize() { + j.Target = normalizeComponentTarget(j.Target) if j.Target == "" { - j.Target = TargetDocker + j.Target = TargetLocal } if j.RemoteStartPolicy == "" { j.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical @@ -148,7 +152,7 @@ func (j *JobDistributor) Validate() error { } j.Normalize() - if j.Target != TargetDocker && j.Target != TargetRemote { + if j.Target != TargetLocal && j.Target != TargetRemote { return fmt.Errorf("invalid jd target: %s", j.Target) } if j.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && j.RemoteStartPolicy != RemoteStartPolicyAlways { @@ -234,8 +238,9 @@ func normalizeNodeSetPlacement(nodeSet *cre.NodeSet) { if nodeSet == nil { return } + nodeSet.Target = normalizeNodeSetTarget(nodeSet.Target) if strings.TrimSpace(nodeSet.Target) == "" { - nodeSet.Target = string(TargetDocker) + nodeSet.Target = string(TargetLocal) } if strings.TrimSpace(nodeSet.RemoteStartPolicy) == "" { nodeSet.RemoteStartPolicy = string(RemoteStartPolicyReuseIfIdentical) @@ -246,7 +251,7 @@ func validateNodeSetPlacement(nodeSet *cre.NodeSet) error { if nodeSet == nil { return errors.New("nodeset is nil") 
} - if nodeSet.Target != string(TargetDocker) && nodeSet.Target != string(TargetRemote) { + if nodeSet.Target != string(TargetLocal) && nodeSet.Target != string(TargetRemote) { return fmt.Errorf("invalid nodeset target: %s", nodeSet.Target) } if nodeSet.RemoteStartPolicy != string(RemoteStartPolicyReuseIfIdentical) && nodeSet.RemoteStartPolicy != string(RemoteStartPolicyAlways) { @@ -270,6 +275,23 @@ func removeChainIDFromFlag(flag string) string { return flag[:lastIdx] } +func normalizeComponentTarget(target ComponentTarget) ComponentTarget { + switch strings.ToLower(strings.TrimSpace(string(target))) { + case "": + return "" + case string(TargetRemote): + return TargetRemote + case string(TargetLocal), string(TargetDocker): + return TargetLocal + default: + return target + } +} + +func normalizeNodeSetTarget(target string) string { + return string(normalizeComponentTarget(ComponentTarget(target))) +} + func validateContractVersions(envDependencies cre.CLIEnvironmentDependencies) error { supportedSet := DefaultContractSet(envDependencies.WithV2Registries()) cv := envDependencies.ContractVersions() diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 1a4ddc5e057..84ae380c0b5 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -193,7 +193,7 @@ func StartDONs( if err != nil { return pkgerrors.Wrap(err, "failed to decode nodeset transport payload") } - if err := rewriteRemoteNodeSetOutputForLocalAccess(ctx, lggr, tunnelManager, idx, nodeSet, nodeset); err != nil { + if err := rewriteRemoteNodeSetOutputForLocalAccess(ctx, lggr, tunnelManager, topology, idx, nodeSet, nodeset); err != nil { return err } } else { @@ -302,6 +302,7 @@ func rewriteRemoteNodeSetOutputForLocalAccess( ctx context.Context, lggr zerolog.Logger, tunnelManager tunnel.Manager, + topology *cre.Topology, configuredIndex int, nodeSet *cre.NodeSet, output *ns.Output, @@ -309,6 +310,17 @@ func 
rewriteRemoteNodeSetOutputForLocalAccess( if output == nil && (nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0) { return nil } + if isRemoteAccessDirectMode() { + hostIP, err := resolveDirectAccessHostIP() + if err != nil { + return err + } + if err := rewriteNodeSetForDirectAccess(output, hostIP); err != nil { + return err + } + rewriteGatewayIncomingForDirectAccess(topology, configuredIndex, hostIP) + return nil + } componentID := tunnel.CanonicalComponentID(tunnel.KindNodeSet, configuredIndex, nodeSet.Name) refs, err := describeNodeSetEndpoints(componentID, nodeSet, output) if err != nil { @@ -326,9 +338,28 @@ func rewriteRemoteNodeSetOutputForLocalAccess( Str("localURL", binding.LocalURL). Msg("Established endpoint tunnel") } + rewriteGatewayIncomingForNodeSetBindings(topology, configuredIndex, nodeSet, bindings) return rewriteNodeSetWithBindings(output, nodeSet, bindings) } +func rewriteNodeSetForDirectAccess(output *ns.Output, hostIP string) error { + if output == nil { + return nil + } + for idx := range output.CLNodes { + rawURL := output.CLNodes[idx].Node.ExternalURL + if strings.TrimSpace(rawURL) == "" { + continue + } + rewritten, err := rewriteURLHost(rawURL, hostIP) + if err != nil { + return err + } + output.CLNodes[idx].Node.ExternalURL = rewritten + } + return nil +} + const nodeSetDBEndpointName = "nodeset-db" func describeNodeSetEndpoints(componentID string, nodeSet *cre.NodeSet, output *ns.Output) ([]tunnel.EndpointRef, error) { @@ -336,6 +367,14 @@ func describeNodeSetEndpoints(componentID string, nodeSet *cre.NodeSet, output * if output != nil { sizeHint += len(output.CLNodes) } + if nodeSet != nil { + for _, spec := range nodeSet.NodeSpecs { + if spec == nil || spec.Node == nil { + continue + } + sizeHint += len(spec.Node.CustomPorts) + } + } refs := make([]tunnel.EndpointRef, 0, sizeHint) if output != nil { for idx := range output.CLNodes { @@ -350,6 +389,15 @@ func describeNodeSetEndpoints(componentID string, nodeSet 
*cre.NodeSet, output * } } } + if nodeSet != nil { + for nodeIdx, spec := range nodeSet.NodeSpecs { + customRefs, err := nodeSetCustomPortEndpointRefs(componentID, nodeIdx, spec) + if err != nil { + return nil, err + } + refs = append(refs, customRefs...) + } + } dbRef, err := nodeSetDBEndpointRef(componentID, nodeSet) if err != nil { return nil, err @@ -384,8 +432,8 @@ func rewriteNodeSetWithBindings(output *ns.Output, nodeSet *cre.NodeSet, binding } if output != nil { for idx := range output.CLNodes { - endpointName := fmt.Sprintf("node-%d-api", idx) - rawURL := output.CLNodes[idx].Node.ExternalURL + endpointName := fmt.Sprintf("node-%d-api", idx) + rawURL := output.CLNodes[idx].Node.ExternalURL if rawURL == "" { continue } @@ -403,9 +451,150 @@ func rewriteNodeSetWithBindings(output *ns.Output, nodeSet *cre.NodeSet, binding } nodeSet.DbInput.Port = binding.LocalPort } + if nodeSet != nil { + for nodeIdx, spec := range nodeSet.NodeSpecs { + if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { + continue + } + for portIdx, mapping := range spec.Input.Node.CustomPorts { + _, containerPort, err := parseCustomPortMapping(mapping) + if err != nil { + return fmt.Errorf("invalid custom_ports entry %q for node %d: %w", mapping, nodeIdx, err) + } + binding, ok := byName[nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort)] + if !ok { + return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort)) + } + spec.Input.Node.CustomPorts[portIdx] = rewriteCustomPortMappingHostPort(mapping, binding.LocalPort) + } + } + } return nil } +func nodeSetCustomPortEndpointRefs(componentID string, nodeIdx int, spec *cre.NodeSpecWithRole) ([]tunnel.EndpointRef, error) { + if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { + return nil, nil + } + refs := make([]tunnel.EndpointRef, 0, 
len(spec.Input.Node.CustomPorts)) + for portIdx, mapping := range spec.Input.Node.CustomPorts { + hostPort, containerPort, err := parseCustomPortMapping(mapping) + if err != nil { + return nil, fmt.Errorf("invalid custom_ports entry %q for node %d: %w", mapping, nodeIdx, err) + } + refs = append(refs, tunnel.EndpointRef{ + ComponentID: componentID, + EndpointName: nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort), + Scheme: "tcp", + Host: "127.0.0.1", + Port: hostPort, + OriginalURL: fmt.Sprintf("tcp://127.0.0.1:%d", hostPort), + }) + } + return refs, nil +} + +func nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort int) string { + return fmt.Sprintf("node-%d-custom-%d-%d", nodeIdx, portIdx, containerPort) +} + +func parseCustomPortMapping(mapping string) (hostPort int, containerPort int, err error) { + parts := strings.Split(strings.TrimSpace(mapping), ":") + if len(parts) < 2 { + return 0, 0, fmt.Errorf("expected hostPort:containerPort, got %q", mapping) + } + hostPortRaw := parts[len(parts)-2] + containerPortRaw := parts[len(parts)-1] + hostPort, err = strconv.Atoi(hostPortRaw) + if err != nil || hostPort <= 0 || hostPort > 65535 { + return 0, 0, fmt.Errorf("invalid host port %q", hostPortRaw) + } + containerPort, err = strconv.Atoi(containerPortRaw) + if err != nil || containerPort <= 0 || containerPort > 65535 { + return 0, 0, fmt.Errorf("invalid container port %q", containerPortRaw) + } + return hostPort, containerPort, nil +} + +func rewriteCustomPortMappingHostPort(mapping string, newHostPort int) string { + parts := strings.Split(strings.TrimSpace(mapping), ":") + if len(parts) < 2 { + return mapping + } + parts[len(parts)-2] = strconv.Itoa(newHostPort) + return strings.Join(parts, ":") +} + +func rewriteGatewayIncomingForNodeSetBindings( + topology *cre.Topology, + configuredIndex int, + nodeSet *cre.NodeSet, + bindings []tunnel.TunnelBinding, +) { + if topology == nil || topology.GatewayConnectors == nil || 
len(topology.GatewayConnectors.Configurations) == 0 || nodeSet == nil { + return + } + if configuredIndex < 0 || configuredIndex >= len(topology.DonsMetadata.List()) { + return + } + donMeta := topology.DonsMetadata.List()[configuredIndex] + gatewayNode, hasGateway := donMeta.Gateway() + if !hasGateway { + return + } + if gatewayNode.Index < 0 || gatewayNode.Index >= len(nodeSet.NodeSpecs) { + return + } + spec := nodeSet.NodeSpecs[gatewayNode.Index] + if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { + return + } + + for _, cfg := range topology.GatewayConnectors.Configurations { + if cfg == nil || cfg.GatewayConfiguration == nil || cfg.NodeUUID != gatewayNode.UUID { + continue + } + // Test process reaches gateway via local port (direct for local runs, tunneled for remote runs). + cfg.Incoming.Host = "127.0.0.1" + // Resolve tunnel by gateway container port (e.g. 5002), not by possibly stale host-side custom port. + if localPort, ok := gatewayLocalPortFromBindings(gatewayNode.Index, cfg.Incoming.ExternalPort, bindings); ok { + cfg.Incoming.ExternalPort = localPort + } + } +} + +func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredIndex int, hostIP string) { + if topology == nil || topology.GatewayConnectors == nil || len(topology.GatewayConnectors.Configurations) == 0 { + return + } + if configuredIndex < 0 || configuredIndex >= len(topology.DonsMetadata.List()) { + return + } + donMeta := topology.DonsMetadata.List()[configuredIndex] + gatewayNode, hasGateway := donMeta.Gateway() + if !hasGateway { + return + } + for _, cfg := range topology.GatewayConnectors.Configurations { + if cfg == nil || cfg.GatewayConfiguration == nil || cfg.NodeUUID != gatewayNode.UUID { + continue + } + cfg.Incoming.Host = hostIP + } +} + +func gatewayLocalPortFromBindings(gatewayNodeIndex, gatewayContainerPort int, bindings []tunnel.TunnelBinding) (int, bool) { + for _, binding := range bindings { + if 
!strings.HasPrefix(binding.EndpointName, fmt.Sprintf("node-%d-custom-", gatewayNodeIndex)) { + continue + } + if strings.HasSuffix(binding.EndpointName, fmt.Sprintf("-%d", gatewayContainerPort)) { + return binding.LocalPort, true + } + } + return 0, false +} + func nodeSetEndpointFromURL(componentID, endpointName, rawURL string) (*tunnel.EndpointRef, error) { if strings.TrimSpace(rawURL) == "" { return nil, nil diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 8ca31c3af80..79cd3337212 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -222,12 +222,14 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } + blockchainTargetBySelector := blockchainTargetsBySelector(input.Blockchains) updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, topology, creEnvironment, input.NodeSets, + blockchainTargetBySelector, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -504,6 +506,23 @@ func SetupTestEnvironment( }, nil } +func blockchainTargetsBySelector(blockchains []*config.Blockchain) map[uint64]string { + bySelector := make(map[uint64]string, len(blockchains)) + for _, blockchainCfg := range blockchains { + if blockchainCfg == nil { + continue + } + input := blockchainCfg.InputRef() + if input == nil { + continue + } + for _, chainID := range input.ChainID { + bySelector[uint64(chainID)] = string(blockchainCfg.Target) + } + } + return bySelector +} + func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, blockchains []blockchains.Blockchain, jdOutput *jd.Output) { // append the nodeset output, so that later it can be stored in the cached output, so that we can use the environment again without running setup for idx, nsOut := range nodeSetOutput { @@ -551,7 +570,6 @@ func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummar 
return summary, nil } - func newCldfEnvironment(ctx context.Context, singleFileLogger logger.Logger, cldfBlockchains cldf_chain.BlockChains) *cldf.Environment { allChainsCLDEnvironment := &cldf.Environment{ Name: cre.EnvironmentName, diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index 6a5fc1082b9..d9779627bb4 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -178,6 +178,13 @@ func rewriteRemoteJDOutputForLocalAccess( if output == nil { return nil } + if isRemoteAccessDirectMode() { + hostIP, err := resolveDirectAccessHostIP() + if err != nil { + return err + } + return rewriteJDForDirectAccess(output, hostIP, rewriteInternalForLocalNodes) + } if tunnelManager == nil { return errors.New("tunnel manager is required for remote jd target") } @@ -201,6 +208,39 @@ func rewriteRemoteJDOutputForLocalAccess( return rewriteJDWithBindings(output, bindings, rewriteInternalForLocalNodes) } +func rewriteJDForDirectAccess(output *jd.Output, hostIP string, rewriteInternalForLocalNodes bool) error { + if output == nil { + return nil + } + + if output.ExternalGRPCUrl != "" { + rewritten, err := rewriteAddressHost(output.ExternalGRPCUrl, hostIP) + if err != nil { + return err + } + output.ExternalGRPCUrl = rewritten + if rewriteInternalForLocalNodes { + output.InternalGRPCUrl = rewritten + } + } + + if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl != "" { + source := output.ExternalWSRPCUrl + if source == "" { + source = output.InternalWSRPCUrl + } + rewritten, err := rewriteAddressHost(source, hostIP) + if err != nil { + return err + } + output.ExternalWSRPCUrl = rewritten + if rewriteInternalForLocalNodes { + output.InternalWSRPCUrl = rewritten + } + } + return nil +} + func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { refs := make([]tunnel.EndpointRef, 0, 2) componentID := tunnel.CanonicalComponentID(tunnel.KindJD, 0, "job-distributor") @@ 
-300,3 +340,27 @@ func jdEndpointFromAddress(componentID, endpointName, rawAddress string) (*tunne OriginalURL: trimmed, }, nil } + +func rewriteAddressHost(rawAddress, host string) (string, error) { + trimmed := strings.TrimSpace(rawAddress) + if trimmed == "" { + return "", nil + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil { + return "", fmt.Errorf("failed to parse address %q: %w", rawAddress, err) + } + port := parsed.Port() + if port == "" { + return "", fmt.Errorf("address %q must include a port", rawAddress) + } + parsed.Host = net.JoinHostPort(host, port) + return parsed.String(), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil { + return "", fmt.Errorf("failed to parse host:port %q: %w", rawAddress, err) + } + return net.JoinHostPort(host, port), nil +} diff --git a/system-tests/lib/cre/gateway.go b/system-tests/lib/cre/gateway.go index ca007dddb4c..9a7e2c80f4d 100644 --- a/system-tests/lib/cre/gateway.go +++ b/system-tests/lib/cre/gateway.go @@ -21,6 +21,7 @@ func NewGatewayConfig(p infra.Provider, id, gatewayNodeIdx int, isBootstrap bool }, Incoming: Incoming{ Protocol: "http", + Host: p.ExternalGatewayHost(), Path: "/", InternalPort: gatewayIncomingPort, ExternalPort: p.ExternalGatewayPort(gatewayIncomingPort), diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go new file mode 100644 index 00000000000..1415c3c0404 --- /dev/null +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -0,0 +1,38 @@ +package runtimecfg + +import ( + "fmt" + "os" + "strings" +) + +const ( + EnvRemoteAccessMode = "CRE_REMOTE_ACCESS_MODE" + EnvEC2HostIP = "CRE_EC2_HOST_IP" + + RemoteAccessModeSSM = "ssm" + RemoteAccessModeDirect = "direct" +) + +func RemoteAccessMode() string { + mode := strings.ToLower(strings.TrimSpace(os.Getenv(EnvRemoteAccessMode))) + if mode == "" { + return RemoteAccessModeSSM + } + if mode == RemoteAccessModeDirect { + return 
mode + } + return RemoteAccessModeSSM +} + +func IsDirectMode() bool { + return RemoteAccessMode() == RemoteAccessModeDirect +} + +func DirectHostIP() (string, error) { + hostIP := strings.TrimSpace(os.Getenv(EnvEC2HostIP)) + if hostIP == "" { + return "", fmt.Errorf("%s must be set when %s=%s", EnvEC2HostIP, EnvRemoteAccessMode, RemoteAccessModeDirect) + } + return hostIP, nil +} diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index f149320246e..765b8cfb39b 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -456,6 +456,7 @@ type GenerateConfigsInput struct { Datastore datastore.DataStore DonMetadata *DonMetadata Blockchains map[uint64]blockchains.Blockchain + BlockchainTargetBySelector map[uint64]string RegistryChainSelector uint64 Flags []string CapabilitiesPeeringData CapabilitiesPeeringData diff --git a/system-tests/lib/cre/vault/vault.go b/system-tests/lib/cre/vault/vault.go index e5ae8a8a7ab..ab5ae5464ae 100644 --- a/system-tests/lib/cre/vault/vault.go +++ b/system-tests/lib/cre/vault/vault.go @@ -8,11 +8,14 @@ import ( "github.com/scylladb/go-reflectx" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/postgres" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/vault" ) -func newVaultORM(nodeIndex, externalPort int) (vault.ORM, *sqlx.DB, error) { - dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", "127.0.0.1", externalPort, postgres.User, postgres.Password, fmt.Sprintf("db_%d", nodeIndex)) +const defaultDBHost = "127.0.0.1" + +func newVaultORM(nodeIndex int, host string, externalPort int) (vault.ORM, *sqlx.DB, error) { + dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", host, externalPort, postgres.User, postgres.Password, fmt.Sprintf("db_%d", nodeIndex)) db, err := sqlx.Open("postgres", dsn) if err != nil { return 
nil, db, err @@ -23,7 +26,7 @@ func newVaultORM(nodeIndex, externalPort int) (vault.ORM, *sqlx.DB, error) { } func GetResultPackageCount(ctx context.Context, nodeIndex, externalPort int) (int64, error) { - orm, db, err := newVaultORM(nodeIndex, externalPort) + orm, db, err := newVaultORM(nodeIndex, defaultDBHost, externalPort) if err != nil { return 0, err } @@ -31,3 +34,28 @@ func GetResultPackageCount(ctx context.Context, nodeIndex, externalPort int) (in defer db.Close() return orm.GetResultPackageCount(ctx) } + +func GetResultPackageCountRemoteAware(ctx context.Context, nodeIndex, externalPort int, isRemoteNodeSet bool) (int64, error) { + host, err := resolveDBHostForNodeSet(isRemoteNodeSet) + if err != nil { + return 0, err + } + + orm, db, err := newVaultORM(nodeIndex, host, externalPort) + if err != nil { + return 0, err + } + + defer db.Close() + return orm.GetResultPackageCount(ctx) +} + +func resolveDBHostForNodeSet(isRemoteNodeSet bool) (string, error) { + if !isRemoteNodeSet { + return defaultDBHost, nil + } + if !runtimecfg.IsDirectMode() { + return defaultDBHost, nil + } + return runtimecfg.DirectHostIP() +} diff --git a/system-tests/lib/cre/workflow/registry.go b/system-tests/lib/cre/workflow/registry.go index b08826f9633..1c0ff1b5945 100644 --- a/system-tests/lib/cre/workflow/registry.go +++ b/system-tests/lib/cre/workflow/registry.go @@ -28,6 +28,7 @@ import ( ks_contracts_op "github.com/smartcontractkit/chainlink/deployment/keystone/changeset/operations/contracts" libc "github.com/smartcontractkit/chainlink/system-tests/lib/conversions" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" @@ -285,7 +286,13 @@ func 
WaitForAllNodesToHaveExpectedFiltersRegistered(ctx context.Context, singleF } testLogger.Info().Msgf("Checking if all WorkflowRegistry filters are registered for worker node %d", workerNode.Index) - allFilters, filtersErr := getAllFilters(checkCtx, singleFileLogger, big.NewInt(libc.MustSafeInt64(registryChainID)), workerNode.Index, nodeSet[donIdx].DbInput.Port) + dbHost, hostErr := resolveNodeSetDBHost(nodeSet[donIdx]) + if hostErr != nil { + cancel() + ticker.Stop() + return errors.Wrap(hostErr, "failed to resolve nodeset db host") + } + allFilters, filtersErr := getAllFilters(checkCtx, singleFileLogger, big.NewInt(libc.MustSafeInt64(registryChainID)), workerNode.Index, dbHost, nodeSet[donIdx].DbInput.Port) if filtersErr != nil { cancel() ticker.Stop() @@ -337,8 +344,8 @@ func StartS3(testLogger zerolog.Logger, input *s3provider.Input, stageGen *stage return s3ProviderOutput, nil } -func newORM(logger logger.Logger, chainID *big.Int, nodeIndex, externalPort int) (logpoller.ORM, *sqlx.DB, error) { - dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", "127.0.0.1", externalPort, postgres.User, postgres.Password, fmt.Sprintf("db_%d", nodeIndex)) +func newORM(logger logger.Logger, chainID *big.Int, nodeIndex int, host string, externalPort int) (logpoller.ORM, *sqlx.DB, error) { + dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", host, externalPort, postgres.User, postgres.Password, fmt.Sprintf("db_%d", nodeIndex)) db, err := sqlx.Open("postgres", dsn) if err != nil { return nil, db, err @@ -348,8 +355,8 @@ func newORM(logger logger.Logger, chainID *big.Int, nodeIndex, externalPort int) return logpoller.NewORM(chainID, db, logger), db, nil } -func getAllFilters(ctx context.Context, logger logger.Logger, chainID *big.Int, nodeIndex, externalPort int) (map[string]logpoller.Filter, error) { - orm, db, err := newORM(logger, chainID, nodeIndex, externalPort) +func getAllFilters(ctx context.Context, logger 
logger.Logger, chainID *big.Int, nodeIndex int, host string, externalPort int) (map[string]logpoller.Filter, error) { + orm, db, err := newORM(logger, chainID, nodeIndex, host, externalPort) if err != nil { return nil, err } @@ -357,3 +364,14 @@ func getAllFilters(ctx context.Context, logger logger.Logger, chainID *big.Int, defer db.Close() return orm.LoadFilters(ctx) } + +func resolveNodeSetDBHost(nodeSet *cre.NodeSet) (string, error) { + defaultHost := "127.0.0.1" + if nodeSet == nil || strings.TrimSpace(nodeSet.Target) != string(config.TargetRemote) { + return defaultHost, nil + } + if !runtimecfg.IsDirectMode() { + return defaultHost, nil + } + return runtimecfg.DirectHostIP() +} diff --git a/system-tests/tests/regression/cre/v2_http_trigger_regression_test.go b/system-tests/tests/regression/cre/v2_http_trigger_regression_test.go index 160096d9a0a..d92af1f3930 100644 --- a/system-tests/tests/regression/cre/v2_http_trigger_regression_test.go +++ b/system-tests/tests/regression/cre/v2_http_trigger_regression_test.go @@ -90,6 +90,7 @@ func HTTPTriggerFailsTest(t *testing.T, testEnv *ttypes.TestEnvironment, httpNeg testID := uuid.New().String()[0:8] fakeServer, err := startTestOrderServer(t, freePort, testID) require.NoError(t, err, "failed to start fake HTTP server") + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "http-trigger-regression-order-server", freePort) // Ensure cleanup of the fake server defer func() { diff --git a/system-tests/tests/smoke/cre/README.md b/system-tests/tests/smoke/cre/README.md index 9fed56009ac..384bdb85c38 100644 --- a/system-tests/tests/smoke/cre/README.md +++ b/system-tests/tests/smoke/cre/README.md @@ -51,6 +51,7 @@ This guide explains how to set up and run system tests for Chainlink workflows u --- For more information about the local CRE check its [README.md](../../../../core/scripts/cre/environment/README.md). +For remote/hybrid EC2 execution details, see [REMOTE_HYBRID_RUNBOOK.md](./REMOTE_HYBRID_RUNBOOK.md). 
--- diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md new file mode 100644 index 00000000000..1d66da6a942 --- /dev/null +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -0,0 +1,51 @@ +# CRE Remote Hybrid Runbook + +This runbook covers the EC2-based remote mode for CRE where components can run either locally or remotely. + +## Scope + +- Remote backend is EC2 + Docker (no Kubernetes path). +- Remote control plane is the CRE agent. +- Access modes: + - `ssm`: control and endpoint reachability via SSM tunnels. + - `direct`: endpoint reachability via EC2 host IP, with SSM optional for agent only. + +## Core Environment Variables + +- `CRE_AGENT_MODE=ec2` +- `CRE_EC2_INSTANCE_ID=` (required for SSM mode) +- `CRE_EC2_AGENT_PORT=` (defaults to `8080`) +- `CRE_EC2_AGENT_URL=` (optional explicit override) +- `CRE_REMOTE_ACCESS_MODE=ssm|direct` +- `CRE_EC2_HOST_IP=` (required when `CRE_REMOTE_ACCESS_MODE=direct`) +- `CRE_AWS_PROFILE=` (optional SSM auth profile) + +## Agent Startup + +- In `ssm` mode, bind agent to loopback (for example `127.0.0.1:18080`). +- In `direct` mode, bind agent to all interfaces (for example `0.0.0.0:18080`). + +## Placement Rules + +- Same placement (`local->local`, `remote->remote`) uses **internal** URLs. +- Cross placement (`local->remote`, `remote->local`) uses **external** URLs. +- Remote NodeSets targeting local gateway are allowed when bridge/tunnel plumbing for gateway ingress is present. + +## Bridge and Fixture Relay + +- Remote components cannot directly call local in-process fixtures. +- Use fixture relay for local fixtures (CHiP testsink, fake HTTP, billing/PoR mocks). +- Relay is opened per fixture port and uses fixed remote port parity. + +## Recommended Test Order + +1. All remote. +2. All local. +3. Mixed (for example JD local + NodeSet remote). + +## Fast Triage Checklist + +- Agent unreachable: verify bind address/port vs chosen access mode. 
+- `invalid jd target: local`: use `target=local` (supported; `docker` is alias). +- Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. +- Mixed remote->local gateway case: expected failure for now. diff --git a/system-tests/tests/smoke/cre/billing_helpers.go b/system-tests/tests/smoke/cre/billing_helpers.go index 299680a3c69..0bbd6c4f920 100644 --- a/system-tests/tests/smoke/cre/billing_helpers.go +++ b/system-tests/tests/smoke/cre/billing_helpers.go @@ -24,6 +24,7 @@ import ( libcre "github.com/smartcontractkit/chainlink/system-tests/lib/cre" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" ) @@ -76,7 +77,7 @@ func loadBillingStackCache(relativePathToRepoRoot string) (*config.BillingConfig func startBillingStackIfIsNotRunning(t *testing.T, relativePathToRepoRoot, environmentDir string, testEnv *ttypes.TestEnvironment) error { if !config.BillingStateFileExists(relativePathToRepoRoot) { - priceURL := setupFakeBillingPriceProvider(t, testEnv.Config.Fake) + priceURL := setupFakeBillingPriceProvider(t, testEnv.Config.Fake, testEnv) t.Cleanup(func() { /* @@ -279,7 +280,7 @@ func queryCredits(t *testing.T, db *sql.DB) []billingCredit { return credits } -func setupFakeBillingPriceProvider(t *testing.T, input *fake.Input) string { +func setupFakeBillingPriceProvider(t *testing.T, input *fake.Input, testEnv *ttypes.TestEnvironment) string { t.Helper() fakeProviderStarted.Do(func() { @@ -319,6 +320,7 @@ func setupFakeBillingPriceProvider(t *testing.T, input *fake.Input) string { }) require.NoError(t, err) + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "billing-fake-price-provider", input.Port) return url } diff --git a/system-tests/tests/smoke/cre/cre_suite_test.go b/system-tests/tests/smoke/cre/cre_suite_test.go index 
1cee094ef73..3c9f1e33790 100644 --- a/system-tests/tests/smoke/cre/cre_suite_test.go +++ b/system-tests/tests/smoke/cre/cre_suite_test.go @@ -89,35 +89,41 @@ func Test_CRE_V2_Suite(t *testing.T) { }) t.Run("[v2] Vault DON - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteVaultTest(t, testEnv) }) t.Run("[v2] Cron Beholder - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteCronBeholderTest(t, testEnv) }) t.Run("[v2] HTTP Trigger Action - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteHTTPTriggerActionTest(t, testEnv) }) t.Run("[v2] HTTP Action CRUD - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteHTTPActionCRUDSuccessTest(t, testEnv) }) t.Run("[v2] DON Time - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteDonTimeTest(t, testEnv) }) t.Run("[v2] Consensus - "+topology, func(t *testing.T) { + t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteConsensusTest(t, testEnv) @@ -129,6 +135,7 @@ func Test_CRE_V2_EVM_Suite(t *testing.T) { testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) 
t.Run("[v2] EVM Write - "+topology, func(t *testing.T) { + t.Skip() priceProvider, porWfCfg := beforePoRTest(t, testEnv, "por-workflowV2", PoRWFV2Location) ExecutePoRTest(t, testEnv, priceProvider, porWfCfg, false) }) @@ -138,6 +145,7 @@ func Test_CRE_V2_EVM_Suite(t *testing.T) { }) t.Run("[v2] EVM LogTrigger - "+topology, func(t *testing.T) { + t.Skip() ExecuteEVMLogTriggerTest(t, testEnv) }) } diff --git a/system-tests/tests/smoke/cre/por_test.go b/system-tests/tests/smoke/cre/por_test.go index de647078c46..300cbf166d1 100644 --- a/system-tests/tests/smoke/cre/por_test.go +++ b/system-tests/tests/smoke/cre/por_test.go @@ -66,6 +66,7 @@ func beforePoRTest(t *testing.T, testEnv *ttypes.TestEnvironment, workflowName, AuthorizationKey := "" // required by FakePriceProvider priceProvider, err := NewFakePriceProvider(testLogger, testEnv.Config.Fake, AuthorizationKey, porWfCfg.FeedIDs) require.NoError(t, err, "failed to create fake price provider") + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "por-fake-price-provider", testEnv.Config.Fake.Port) return priceProvider, porWfCfg } diff --git a/system-tests/tests/smoke/cre/v2_http_action_test.go b/system-tests/tests/smoke/cre/v2_http_action_test.go index 1de69a01df9..162ac448c78 100644 --- a/system-tests/tests/smoke/cre/v2_http_action_test.go +++ b/system-tests/tests/smoke/cre/v2_http_action_test.go @@ -91,6 +91,7 @@ func ExecuteHTTPActionRegressionTest(t *testing.T, testEnv *ttypes.TestEnvironme fakeHTTP, err := fake.NewFakeDataProvider(testEnv.Config.FakeHTTP) require.NoError(t, err, "Failed to start fake HTTP") + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "http-action-regression-fake-http", testEnv.Config.FakeHTTP.Port) testLogger.Info().Msg("Fake HTTP started for regression test") defer func() { testLogger.Info().Msgf("Cleaning up fake server on port %d", testEnv.Config.FakeHTTP.Port) @@ -147,6 +148,7 @@ func ExecuteHTTPActionCRUDSuccessTest(t *testing.T, testEnv *ttypes.TestEnvironm } else { 
testLogger.Info().Msg("Fake HTTP started successfully") } + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "http-action-smoke-fake-http", testEnv.Config.FakeHTTP.Port) // Set up a unique endpoint for this test response := map[string]any{ diff --git a/system-tests/tests/smoke/cre/v2_http_trigger_action_test.go b/system-tests/tests/smoke/cre/v2_http_trigger_action_test.go index d01d03dc7b4..e81c10de716 100644 --- a/system-tests/tests/smoke/cre/v2_http_trigger_action_test.go +++ b/system-tests/tests/smoke/cre/v2_http_trigger_action_test.go @@ -50,6 +50,7 @@ func ExecuteHTTPTriggerActionTest(t *testing.T, testEnv *ttypes.TestEnvironment) fakeServer, err := startTestOrderServer(t, testEnv.Config.Fake.Port) require.NoError(t, err, "failed to start fake HTTP server") + t_helpers.EnsureFixtureRelayForPort(t, testEnv, "http-trigger-order-server", testEnv.Config.Fake.Port) uniqueWorkflowName := "http-trigger-action-test-" + uuid.New().String()[0:8] httpWorkflowConfig := t_helpers.HTTPWorkflowConfig{ diff --git a/system-tests/tests/smoke/cre/v2_vault_don_test.go b/system-tests/tests/smoke/cre/v2_vault_don_test.go index 83fcdca0515..55485e3deb5 100644 --- a/system-tests/tests/smoke/cre/v2_vault_don_test.go +++ b/system-tests/tests/smoke/cre/v2_vault_don_test.go @@ -24,6 +24,7 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/seth" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" + creconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" "github.com/smartcontractkit/chainlink/v2/core/capabilities/vault/vaulttypes" @@ -50,7 +51,12 @@ func ExecuteVaultTest(t *testing.T, testEnv *ttypes.TestEnvironment) { if 
slices.Contains(nodeSet.Capabilities, cre.VaultCapability) { for i, node := range nodeSet.NodeSpecs { if !slices.Contains(node.Roles, cre.BootstrapNode) { - packageCount, err := vault.GetResultPackageCount(t.Context(), i, nodeSet.DbInput.Port) + packageCount, err := vault.GetResultPackageCountRemoteAware( + t.Context(), + i, + nodeSet.DbInput.Port, + nodeSet.Target == string(creconfig.TargetRemote), + ) if err != nil || packageCount != 1 { return false } diff --git a/system-tests/tests/test-helpers/chip_testsink_helpers.go b/system-tests/tests/test-helpers/chip_testsink_helpers.go index a7f09a97d3f..b1e19078437 100644 --- a/system-tests/tests/test-helpers/chip_testsink_helpers.go +++ b/system-tests/tests/test-helpers/chip_testsink_helpers.go @@ -6,6 +6,7 @@ import ( "net" "os" "path/filepath" + "strconv" "strings" "sync" "testing" @@ -277,6 +278,9 @@ func StartChipTestSink(t *testing.T, publishFn chiptestsink.PublishFn) *chiptest If you want to use both together start ChIP Ingress on a different port with '--grpc-port' flag and make sure that the sink is pointing to correct upstream endpoint ('localhost:' in most cases)`, chipingressset.DEFAULT_CHIP_INGRESS_GRPC_PORT) } + grpcPort, convErr := strconv.Atoi(chipingressset.DEFAULT_CHIP_INGRESS_GRPC_PORT) + require.NoError(t, convErr, "invalid default chip ingress grpc port") + EnsureFixtureRelayForPort(t, nil, "chip-testsink", grpcPort) startCh := make(chan struct{}, 1) server, err := chiptestsink.NewServer(chiptestsink.Config{ diff --git a/system-tests/tests/test-helpers/fixture_relay_helpers.go b/system-tests/tests/test-helpers/fixture_relay_helpers.go new file mode 100644 index 00000000000..e019def7a47 --- /dev/null +++ b/system-tests/tests/test-helpers/fixture_relay_helpers.go @@ -0,0 +1,317 @@ +package helpers + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net" + "net/http" + "net/url" + "os" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/gorilla/websocket" 
+ "github.com/stretchr/testify/require" + + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + ttypes "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/configuration" +) + +const ( + envLocalAgentURL = "CRE_LOCAL_AGENT_URL" + envEC2AgentURL = "CRE_EC2_AGENT_URL" + envEC2AgentPort = "CRE_EC2_AGENT_PORT" +) + +type relayOpenResponse struct { + RelayID string `json:"relayId"` +} + +type relayCloseResponse struct { + Found bool `json:"found"` +} + +type fixtureRelayHandle struct { + relayID string + cancel context.CancelFunc +} + +var ( + fixtureRelayMu sync.Mutex + fixtureRelayHandles = make(map[string]*fixtureRelayHandle) +) + +// EnsureFixtureRelayForPort ensures a local fixture port is reachable from remote components. +// It is a no-op when no remote NodeSets are configured. +func EnsureFixtureRelayForPort(t *testing.T, testEnv *ttypes.TestEnvironment, relayName string, localPort int) { + t.Helper() + require.Greater(t, localPort, 0, "fixture relay local port must be > 0") + + cfg := resolveEnvConfigForRelay(t, testEnv) + if !hasRemoteNodeSets(cfg) { + return + } + + agentBaseURL, err := resolveAgentBaseURLForRelay() + require.NoError(t, err, "failed to resolve agent base URL for fixture relay") + + key := fmt.Sprintf("%s|%s|%d", strings.TrimSpace(relayName), agentBaseURL, localPort) + fixtureRelayMu.Lock() + if _, exists := fixtureRelayHandles[key]; exists { + fixtureRelayMu.Unlock() + return + } + fixtureRelayMu.Unlock() + + relayID, err := openRelay(context.Background(), agentBaseURL, relayName, localPort) + require.NoError(t, err, "failed to open fixture relay on agent") + + ctx, cancel := context.WithCancel(context.Background()) + localFixtureAddr := net.JoinHostPort("127.0.0.1", strconv.Itoa(localPort)) + for i := 0; i < 4; i++ { + go relayWorker(ctx, agentBaseURL, relayID, localFixtureAddr) + } + + fixtureRelayMu.Lock() + 
fixtureRelayHandles[key] = &fixtureRelayHandle{relayID: relayID, cancel: cancel} + fixtureRelayMu.Unlock() + + t.Cleanup(func() { + fixtureRelayMu.Lock() + handle, ok := fixtureRelayHandles[key] + if ok { + delete(fixtureRelayHandles, key) + } + fixtureRelayMu.Unlock() + if !ok { + return + } + handle.cancel() + _, _ = closeRelay(context.Background(), agentBaseURL, handle.relayID) + }) +} + +func resolveEnvConfigForRelay(t *testing.T, testEnv *ttypes.TestEnvironment) *envconfig.Config { + t.Helper() + if testEnv != nil && testEnv.Config != nil { + return testEnv.Config + } + configPath := strings.TrimSpace(os.Getenv("CTF_CONFIGS")) + if configPath == "" { + return nil + } + cfg := &envconfig.Config{} + if err := cfg.Load(configPath); err != nil { + return nil + } + return cfg +} + +func hasRemoteNodeSets(cfg *envconfig.Config) bool { + if cfg == nil { + return false + } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Target), string(envconfig.TargetRemote)) { + return true + } + } + return false +} + +func resolveAgentBaseURLForRelay() (string, error) { + if v := strings.TrimSpace(os.Getenv(envEC2AgentURL)); v != "" { + return v, nil + } + if runtimecfg.IsDirectMode() { + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err + } + port := 8080 + if rawPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); rawPort != "" { + parsed, err := strconv.Atoi(rawPort) + if err != nil || parsed <= 0 || parsed > 65535 { + return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, rawPort) + } + port = parsed + } + return fmt.Sprintf("http://%s:%d", hostIP, port), nil + } + if v := strings.TrimSpace(os.Getenv(envLocalAgentURL)); v != "" { + return v, nil + } + return "", fmt.Errorf("missing agent URL for fixture relay (set %s, or set %s/%s for direct mode)", envEC2AgentURL, runtimecfg.EnvRemoteAccessMode, runtimecfg.EnvEC2HostIP) +} + +func openRelay(ctx context.Context, agentBaseURL, name string, 
requestedPort int) (string, error) { + body, _ := json.Marshal(map[string]any{ + "name": name, + "requestedPort": requestedPort, + }) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(agentBaseURL, "/")+"/v1/relay/open", bytes.NewReader(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("open relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + + var out relayOpenResponse + if err := json.Unmarshal(respBody, &out); err != nil { + return "", err + } + if strings.TrimSpace(out.RelayID) == "" { + return "", fmt.Errorf("open relay returned empty relayId") + } + return out.RelayID, nil +} + +func closeRelay(ctx context.Context, agentBaseURL, relayID string) (*relayCloseResponse, error) { + body, _ := json.Marshal(map[string]any{"relayId": relayID}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(agentBaseURL, "/")+"/v1/relay/close", bytes.NewReader(body)) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("close relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + out := &relayCloseResponse{} + if len(respBody) > 0 { + _ = json.Unmarshal(respBody, out) + } + return out, nil +} + +func relayWorker(ctx context.Context, agentBaseURL, relayID, localFixtureAddr string) { + backoff := 250 * time.Millisecond + for { + select { + case <-ctx.Done(): + return + default: + } + + wsURL, err := 
relayConnectWSURL(agentBaseURL, relayID) + if err != nil { + time.Sleep(backoff) + continue + } + ws, _, err := websocket.DefaultDialer.Dial(wsURL, nil) + if err != nil { + time.Sleep(backoff) + continue + } + + localConn, err := net.DialTimeout("tcp", localFixtureAddr, 2*time.Second) + if err != nil { + _ = ws.Close() + time.Sleep(backoff) + continue + } + + _ = bridgeFixtureRelayStream(ctx, ws, localConn) + _ = localConn.Close() + _ = ws.Close() + + if backoff < 2*time.Second { + backoff *= 2 + } + } +} + +func relayConnectWSURL(agentBaseURL, relayID string) (string, error) { + base := strings.TrimRight(agentBaseURL, "/") + u, err := url.Parse(base) + if err != nil { + return "", err + } + switch u.Scheme { + case "http": + u.Scheme = "ws" + case "https": + u.Scheme = "wss" + default: + return "", fmt.Errorf("unsupported agent url scheme: %s", u.Scheme) + } + u.Path = "/v1/relay/connect" + q := u.Query() + q.Set("relayId", relayID) + u.RawQuery = q.Encode() + return u.String(), nil +} + +func bridgeFixtureRelayStream(ctx context.Context, ws *websocket.Conn, localConn net.Conn) error { + errCh := make(chan error, 2) + + go func() { + buf := make([]byte, 32*1024) + for { + n, err := localConn.Read(buf) + if n > 0 { + if wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]); wErr != nil { + errCh <- wErr + return + } + } + if err != nil { + errCh <- err + return + } + } + }() + + go func() { + for { + msgType, payload, err := ws.ReadMessage() + if err != nil { + errCh <- err + return + } + if msgType != websocket.BinaryMessage && msgType != websocket.TextMessage { + continue + } + if len(payload) == 0 { + continue + } + if _, wErr := localConn.Write(payload); wErr != nil { + errCh <- wErr + return + } + } + }() + + select { + case <-ctx.Done(): + return ctx.Err() + case err := <-errCh: + if err == nil || errors.Is(err, io.EOF) || websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { + return nil + } + return err + } +} From 
1a729683860fc36d47662410dc3e112b3ece6ac0 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 23 Feb 2026 12:28:54 +0100 Subject: [PATCH 08/34] update runbook, make direct mode default, auto-evaluate instance's IP, make sure all remote and all local works --- system-tests/lib/cre/connectivity/chooser.go | 2 +- .../lib/cre/environment/config/config.go | 8 +- .../lib/cre/environment/environment.go | 20 ++-- .../cre/environment/tunnel/provider_ssm.go | 24 +---- .../lib/cre/runtimecfg/access_mode.go | 96 +++++++++++++++++-- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 35 ++++++- 6 files changed, 135 insertions(+), 50 deletions(-) diff --git a/system-tests/lib/cre/connectivity/chooser.go b/system-tests/lib/cre/connectivity/chooser.go index 550d69c9515..daedac0f525 100644 --- a/system-tests/lib/cre/connectivity/chooser.go +++ b/system-tests/lib/cre/connectivity/chooser.go @@ -82,7 +82,7 @@ func ResolveAndEnsureReachable( func PlacementFromTarget(target string) (Placement, error) { switch strings.ToLower(strings.TrimSpace(target)) { - case "", "docker", "local": + case "", "local": return PlacementLocal, nil case "remote": return PlacementRemote, nil diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index 6b6e8674d5d..c56051dd07d 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -57,7 +57,7 @@ func (c *Config) SetAddresses(refs []datastore.AddressRef) error { } type Config struct { - Blockchains []*Blockchain `toml:"blockchains" validate:"required"` + Blockchains []*Blockchain `toml:"blockchains" validate:"required"` NodeSets []*cre.NodeSet `toml:"nodesets" validate:"required"` JD *JobDistributor `toml:"jd" validate:"required"` Infra *infra.Provider `toml:"infra" validate:"required"` @@ -74,9 +74,7 @@ type Config struct { type ComponentTarget string const ( - TargetLocal ComponentTarget = "local" - // TargetDocker is a 
backward-compatible alias for local placement. - TargetDocker ComponentTarget = "docker" + TargetLocal ComponentTarget = "local" TargetRemote ComponentTarget = "remote" ) @@ -281,7 +279,7 @@ func normalizeComponentTarget(target ComponentTarget) ComponentTarget { return "" case string(TargetRemote): return TargetRemote - case string(TargetLocal), string(TargetDocker): + case string(TargetLocal): return TargetLocal default: return target diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 79cd3337212..28436e71fa5 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -222,7 +222,7 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } - blockchainTargetBySelector := blockchainTargetsBySelector(input.Blockchains) + blockchainTargetBySelector := blockchainTargetsBySelector(input.Blockchains, deployedBlockchains.Outputs) updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, @@ -506,19 +506,17 @@ func SetupTestEnvironment( }, nil } -func blockchainTargetsBySelector(blockchains []*config.Blockchain) map[uint64]string { - bySelector := make(map[uint64]string, len(blockchains)) - for _, blockchainCfg := range blockchains { +func blockchainTargetsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { + bySelector := make(map[uint64]string, len(deployed)) + for idx, blockchainCfg := range configured { if blockchainCfg == nil { continue } - input := blockchainCfg.InputRef() - if input == nil { + if idx >= len(deployed) || deployed[idx] == nil { continue } - for _, chainID := range input.ChainID { - bySelector[uint64(chainID)] = string(blockchainCfg.Target) - } + selector := deployed[idx].ChainSelector() + bySelector[selector] = string(blockchainCfg.Target) } return bySelector } @@ -551,7 +549,7 @@ func summarizeNodeSetPlacement(nodeSets 
[]*cre.NodeSet) (*nodeSetPlacementSummar continue } configTarget := strings.TrimSpace(nodeSet.Target) - if configTarget == "" || configTarget == string(config.TargetDocker) { + if configTarget == "" || configTarget == string(config.TargetLocal) { summary.HasLocalTargets = true continue } @@ -565,7 +563,7 @@ func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummar // Mixed local and remote nodeset targets need per-DON node-facing URL config selection. // Current PrepareNodeTOMLs builds one node-facing URL shape, so keep this unsupported for now. if summary.HasLocalTargets && summary.HasRemoteTargets { - return nil, errors.New("mixed nodeset targets are not supported yet; set all nodesets target=docker or all target=remote") + return nil, errors.New("mixed nodeset targets are not supported yet; set all nodesets target=local or all target=remote") } return summary, nil } diff --git a/system-tests/lib/cre/environment/tunnel/provider_ssm.go b/system-tests/lib/cre/environment/tunnel/provider_ssm.go index d8a7207418b..6300f49c58d 100644 --- a/system-tests/lib/cre/environment/tunnel/provider_ssm.go +++ b/system-tests/lib/cre/environment/tunnel/provider_ssm.go @@ -13,6 +13,7 @@ import ( "time" "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) type SSMProvider struct { @@ -38,7 +39,7 @@ func (p *SSMProvider) Name() string { } func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) { - profile, authMode := resolveAWSProfileSelection() + profile, authMode := runtimecfg.ResolveAWSCLIProfileSelection() if err := validateAWSSession(ctx, p.region, profile, authMode); err != nil { return TunnelBinding{}, err } @@ -103,27 +104,6 @@ func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, }, nil } -func resolveAWSProfileSelection() (string, string) { - if hasStaticAWSKeys() { - return "", "env-creds" - } - - if profile := 
strings.TrimSpace(os.Getenv("CRE_AWS_PROFILE")); profile != "" { - return profile, "profile:CRE_AWS_PROFILE" - } - if profile := strings.TrimSpace(os.Getenv("AWS_PROFILE")); profile != "" { - return profile, "profile:AWS_PROFILE" - } - - return "", "default-profile" -} - -func hasStaticAWSKeys() bool { - accessKeyID := strings.TrimSpace(os.Getenv("AWS_ACCESS_KEY_ID")) - secretAccessKey := strings.TrimSpace(os.Getenv("AWS_SECRET_ACCESS_KEY")) - return accessKeyID != "" && secretAccessKey != "" -} - func validateAWSSession(ctx context.Context, region, profile, authMode string) error { if ctx == nil { ctx = context.Background() diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go index 1415c3c0404..cf8b0e0cc13 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode.go +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -1,28 +1,34 @@ package runtimecfg import ( + "context" "fmt" "os" + "os/exec" "strings" + "time" ) const ( EnvRemoteAccessMode = "CRE_REMOTE_ACCESS_MODE" EnvEC2HostIP = "CRE_EC2_HOST_IP" + EnvEC2InstanceID = "CRE_EC2_INSTANCE_ID" + EnvAWSProfile = "CRE_AWS_PROFILE" RemoteAccessModeSSM = "ssm" RemoteAccessModeDirect = "direct" + defaultEC2Region = "us-west-2" ) func RemoteAccessMode() string { mode := strings.ToLower(strings.TrimSpace(os.Getenv(EnvRemoteAccessMode))) if mode == "" { - return RemoteAccessModeSSM + return RemoteAccessModeDirect } - if mode == RemoteAccessModeDirect { + if mode == RemoteAccessModeDirect || mode == RemoteAccessModeSSM { return mode } - return RemoteAccessModeSSM + return RemoteAccessModeDirect } func IsDirectMode() bool { @@ -31,8 +37,86 @@ func IsDirectMode() bool { func DirectHostIP() (string, error) { hostIP := strings.TrimSpace(os.Getenv(EnvEC2HostIP)) - if hostIP == "" { - return "", fmt.Errorf("%s must be set when %s=%s", EnvEC2HostIP, EnvRemoteAccessMode, RemoteAccessModeDirect) + if hostIP != "" { + return hostIP, nil } - return hostIP, nil + + instanceID 
:= strings.TrimSpace(os.Getenv(EnvEC2InstanceID)) + if instanceID == "" { + return "", fmt.Errorf("%s must be set when %s=%s (or set %s explicitly)", EnvEC2InstanceID, EnvRemoteAccessMode, RemoteAccessModeDirect, EnvEC2HostIP) + } + return discoverEC2HostIP(instanceID) +} + +func ResolveAWSCLIProfileSelection() (string, string) { + if hasStaticAWSKeys() { + return "", "env-creds" + } + if hasWebIdentityCreds() { + return "", "web-identity" + } + if profile := strings.TrimSpace(os.Getenv(EnvAWSProfile)); profile != "" { + return profile, "profile:CRE_AWS_PROFILE" + } + if profile := strings.TrimSpace(os.Getenv("AWS_PROFILE")); profile != "" { + return profile, "profile:AWS_PROFILE" + } + if profile := strings.TrimSpace(os.Getenv("AWS_DEFAULT_PROFILE")); profile != "" { + return profile, "profile:AWS_DEFAULT_PROFILE" + } + return "", "default-profile" +} + +func hasStaticAWSKeys() bool { + return strings.TrimSpace(os.Getenv("AWS_ACCESS_KEY_ID")) != "" && strings.TrimSpace(os.Getenv("AWS_SECRET_ACCESS_KEY")) != "" +} + +func hasWebIdentityCreds() bool { + return strings.TrimSpace(os.Getenv("AWS_WEB_IDENTITY_TOKEN_FILE")) != "" && strings.TrimSpace(os.Getenv("AWS_ROLE_ARN")) != "" +} + +func awsRegion() string { + if region := strings.TrimSpace(os.Getenv("AWS_REGION")); region != "" { + return region + } + if region := strings.TrimSpace(os.Getenv("AWS_DEFAULT_REGION")); region != "" { + return region + } + return defaultEC2Region +} + +func discoverEC2HostIP(instanceID string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + profile, authMode := ResolveAWSCLIProfileSelection() + args := []string{ + "ec2", "describe-instances", + "--instance-ids", instanceID, + "--region", awsRegion(), + "--query", "Reservations[0].Instances[0].[PrivateIpAddress,PublicIpAddress]", + "--output", "text", + } + if profile != "" { + args = append(args, "--profile", profile) + } + + out, err := exec.CommandContext(ctx, "aws", 
args...).CombinedOutput() + if err != nil { + msg := strings.TrimSpace(string(out)) + if msg == "" { + msg = err.Error() + } + return "", fmt.Errorf("failed to resolve EC2 host IP via aws cli (auth mode=%s, instance=%s): %s", authMode, instanceID, msg) + } + + parts := strings.Fields(strings.TrimSpace(string(out))) + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "" || strings.EqualFold(part, "none") { + continue + } + return part, nil + } + return "", fmt.Errorf("no private/public IP found for instance %s", instanceID) } diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index 1d66da6a942..0ee940f043a 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -6,6 +6,7 @@ This runbook covers the EC2-based remote mode for CRE where components can run e - Remote backend is EC2 + Docker (no Kubernetes path). - Remote control plane is the CRE agent. +- Default access mode is `direct`. - Access modes: - `ssm`: control and endpoint reachability via SSM tunnels. - `direct`: endpoint reachability via EC2 host IP, with SSM optional for agent only. 
@@ -13,17 +14,40 @@ This runbook covers the EC2-based remote mode for CRE where components can run e ## Core Environment Variables - `CRE_AGENT_MODE=ec2` -- `CRE_EC2_INSTANCE_ID=` (required for SSM mode) +- `CRE_EC2_INSTANCE_ID=` (required for SSM mode; also used by direct mode auto IP lookup) - `CRE_EC2_AGENT_PORT=` (defaults to `8080`) - `CRE_EC2_AGENT_URL=` (optional explicit override) -- `CRE_REMOTE_ACCESS_MODE=ssm|direct` -- `CRE_EC2_HOST_IP=` (required when `CRE_REMOTE_ACCESS_MODE=direct`) +- `CRE_REMOTE_ACCESS_MODE=ssm|direct` (defaults to `direct`) +- `CRE_EC2_HOST_IP=` (optional in direct mode; if missing, resolved from AWS CLI using instance ID) - `CRE_AWS_PROFILE=` (optional SSM auth profile) +## Direct Mode Defaults and IP Resolution + +- If `CRE_REMOTE_ACCESS_MODE` is unset, CRE defaults to `direct`. +- In direct mode, host IP resolution is: + 1. `CRE_EC2_HOST_IP` if set. + 2. Otherwise, resolve from AWS CLI using `CRE_EC2_INSTANCE_ID`: + - `aws ec2 describe-instances --instance-ids --query ...` + - prefers private IP; falls back to public IP if needed. +- Region defaults to `us-west-2` unless AWS env region overrides are present. +- If no explicit host IP and no instance ID are available, startup fails with a clear error. + +## AWS Credentials Resolution (CLI) + +For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: + +1. Static env credentials (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`) +2. Web identity (`AWS_WEB_IDENTITY_TOKEN_FILE` + `AWS_ROLE_ARN`) +3. `CRE_AWS_PROFILE` +4. `AWS_PROFILE` +5. `AWS_DEFAULT_PROFILE` +6. AWS CLI default credential chain/profile + ## Agent Startup - In `ssm` mode, bind agent to loopback (for example `127.0.0.1:18080`). - In `direct` mode, bind agent to all interfaces (for example `0.0.0.0:18080`). +- With defaults, agent starts in direct mode unless `CRE_REMOTE_ACCESS_MODE=ssm` is set. 
## Placement Rules @@ -46,6 +70,7 @@ This runbook covers the EC2-based remote mode for CRE where components can run e ## Fast Triage Checklist - Agent unreachable: verify bind address/port vs chosen access mode. -- `invalid jd target: local`: use `target=local` (supported; `docker` is alias). +- Direct mode cannot resolve EC2 IP: ensure `CRE_EC2_INSTANCE_ID` is set and AWS CLI credentials are valid, or set `CRE_EC2_HOST_IP` explicitly. +- `invalid jd target`: use `target=local` or `target=remote` (only supported values). - Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. -- Mixed remote->local gateway case: expected failure for now. +- Mixed remote->local gateway from NodeSets is supported when bridge plumbing is present. From b65bbd2d9dffc6e50b8dc0a7ff6e8cd9f825290e Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 23 Feb 2026 17:28:16 +0100 Subject: [PATCH 09/34] add a local relayer for remote -> local traffic --- .../environment/environment/environment.go | 62 +- .../environment/relay_supervisor.go | 1111 +++++++++++++++++ .../environment/environment/remote_state.go | 100 +- system-tests/lib/cre/don.go | 5 +- system-tests/lib/cre/don/config/config.go | 67 +- .../lib/cre/environment/agent/relay.go | 148 ++- .../lib/cre/environment/component_relay.go | 315 +++++ .../lib/cre/environment/environment.go | 206 ++- system-tests/lib/cre/environment/jobs.go | 33 +- .../tests/test-helpers/before_suite.go | 118 +- 10 files changed, 2016 insertions(+), 149 deletions(-) create mode 100644 core/scripts/cre/environment/environment/relay_supervisor.go create mode 100644 system-tests/lib/cre/environment/component_relay.go diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index eba8d99ac9a..8a52de030d8 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -88,6 +88,7 @@ func init() { 
EnvironmentCmd.AddCommand(stopCmd()) EnvironmentCmd.AddCommand(stopAllCmd()) EnvironmentCmd.AddCommand(stopRemoteCmd()) + EnvironmentCmd.AddCommand(relaySupervisorCmd()) EnvironmentCmd.AddCommand(workflowCmds()) EnvironmentCmd.AddCommand(beholderCmds()) EnvironmentCmd.AddCommand(swapCmds()) @@ -262,6 +263,9 @@ func startCmd() *cobra.Command { if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels before start") } + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop tracked relay supervisor before start") + } cleanUpErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) if cleanUpErr != nil { @@ -307,6 +311,10 @@ func startCmd() *cobra.Command { sig := <-sigCh fmt.Printf("\nReceived signal: %s\n", sig) + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor during signal cleanup") + } + // Only cleanup Docker containers if using Docker provider if isDocker { removeErr := framework.RemoveTestContainers() @@ -745,11 +753,11 @@ func loadRemoteStopTargets(relativePathToRepoRoot string) (remoteComponentSummar } if summary.Total == 0 && remoteStateFileExists(relativePathToRepoRoot) { - remoteState, loadErr := loadRemoteStopState(relativePathToRepoRoot) + remoteCfg, loadErr := loadRemoteStopConfig(relativePathToRepoRoot) if loadErr != nil { framework.L.Warn().Err(loadErr).Msgf("failed to load remote component stop state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) } else { - targets = remoteState.Config() + targets = remoteCfg summary = summarizeRemoteComponents(targets) } } @@ -761,7 +769,7 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe if agentLoadErr != nil { framework.L.Warn().Err(agentLoadErr).Msgf("failed to load remote agent state from %s", 
remoteStateFileAbsPath(relativePathToRepoRoot)) } else if agentState != nil { - applyRemoteAgentEnvFallback(framework.L, &remoteStopState{Agent: *agentState}) + applyRemoteAgentEnvFallback(framework.L, agentState) } summary, stopRemoteErr := creenv.StopRemoteComponents(ctx, framework.L, targets) @@ -774,6 +782,9 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe if stopRemoteErr != nil { return errors.Wrap(stopRemoteErr, "failed to stop one or more remote components") } + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor after remote stop") + } if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") } @@ -781,7 +792,7 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) if err := os.Remove(statePath); err == nil { framework.L.Info().Msgf("removed local CRE state file after remote-only stop: %s", statePath) - } else if err != nil && !os.IsNotExist(err) { + } else if !os.IsNotExist(err) { framework.L.Warn().Err(err).Msgf("failed to remove local CRE state file after remote-only stop: %s", statePath) } } @@ -789,6 +800,10 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe } func stopLocalResources(relativePathToRepoRoot string, removeAllState bool) error { + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor") + } + removeErr := framework.RemoveTestContainers() if removeErr != nil { return errors.Wrap(removeErr, "failed to remove environment containers. 
Please remove them manually") @@ -882,8 +897,8 @@ func hasLocalComponents(cfg *envconfig.Config) bool { return false } -func applyRemoteAgentEnvFallback(logger zerolog.Logger, state *remoteStopState) { - if state == nil { +func applyRemoteAgentEnvFallback(logger zerolog.Logger, agentState *remoteAgentState) { + if agentState == nil { return } setIfEmpty := func(key, value string) { @@ -898,12 +913,12 @@ func applyRemoteAgentEnvFallback(logger zerolog.Logger, state *remoteStopState) } } - setIfEmpty("CRE_AGENT_MODE", state.Agent.Mode) - setIfEmpty("CRE_LOCAL_AGENT_URL", state.Agent.LocalURL) - setIfEmpty("CRE_EC2_AGENT_URL", state.Agent.EC2URL) - setIfEmpty("CRE_EC2_INSTANCE_ID", state.Agent.EC2InstanceID) - setIfEmpty("CRE_EC2_AGENT_PORT", state.Agent.EC2AgentPort) - setIfEmpty("CRE_AWS_PROFILE", state.Agent.AWSProfile) + setIfEmpty("CRE_AGENT_MODE", agentState.Mode) + setIfEmpty("CRE_LOCAL_AGENT_URL", agentState.LocalURL) + setIfEmpty("CRE_EC2_AGENT_URL", agentState.EC2URL) + setIfEmpty("CRE_EC2_INSTANCE_ID", agentState.EC2InstanceID) + setIfEmpty("CRE_EC2_AGENT_PORT", agentState.EC2AgentPort) + setIfEmpty("CRE_AWS_PROFILE", agentState.AWSProfile) } func StartCLIEnvironment( @@ -918,6 +933,10 @@ func StartCLIEnvironment( gatewayWhitelistConfig gateway.WhitelistConfig, ) (*creenv.SetupOutput, error) { testLogger := framework.L + relaySupervisorStarted := false + defer func() { + _ = os.Unsetenv("CRE_USE_PERSISTENT_RELAY_SUPERVISOR") + }() // unset DockerFilePath and DockerContext as we cannot use them with existing images if withPluginsDockerImageFlag != "" { @@ -958,12 +977,31 @@ func StartCLIEnvironment( Features: features, GatewayWhitelistConfig: gatewayWhitelistConfig, BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), + PreDONsStartHook: func(context.Context) error { + if relaySupervisorStarted { + return nil + } + started, err := maybeStartRelaySupervisor(relativePathToRepoRoot, in) + if err != nil { + return errors.Wrap(err, "failed 
to start persistent relay supervisor") + } + if started { + relaySupervisorStarted = true + _ = os.Setenv("CRE_USE_PERSISTENT_RELAY_SUPERVISOR", "true") + } + return nil + }, } ctx, cancel := context.WithTimeout(cmdContext, 10*time.Minute) defer cancel() universalSetupOutput, setupErr := creenv.SetupTestEnvironment(ctx, testLogger, singleFileLogger, universalSetupInput, relativePathToRepoRoot) if setupErr != nil { + if relaySupervisorStarted { + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor during startup rollback") + } + } return nil, fmt.Errorf("failed to setup test environment: %w", setupErr) } diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go new file mode 100644 index 00000000000..0e908c0ed5b --- /dev/null +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -0,0 +1,1111 @@ +package environment + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "net/url" + "os" + "os/exec" + "os/signal" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/gorilla/websocket" + "github.com/pelletier/go-toml/v2" + "github.com/pkg/errors" + "github.com/rs/zerolog" + "github.com/spf13/cobra" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +const ( + relaySupervisorStateFilename = "relay_supervisor.toml" + relaySupervisorLogFilename = "relay_supervisor.log" + relaySupervisorLockFilename = "relay_supervisor.lock" + 
defaultEC2AgentPort = 18080 + defaultRelayWorkerPoolSize = 16 + + envRelaySupervisorLockPath = "CRE_RELAY_SUPERVISOR_LOCK_PATH" +) + +var relaySupervisorLockFile *os.File + +type relaySpec struct { + Name string + Port int +} + +type relaySupervisorState struct { + Version int `toml:"version"` + PID int `toml:"pid"` + Ports []int `toml:"ports"` + StartedAt string `toml:"started_at,omitempty"` + LogPath string `toml:"log_path,omitempty"` +} + +type localComponentRelayManager struct { + lggr zerolog.Logger + baseURL string + + mu sync.Mutex + handles map[string]*relayHandle +} + +type relayHandle struct { + mu sync.RWMutex + relayID string + name string + port int + cancel context.CancelFunc +} + +type relayOpenResponse struct { + RelayID string `json:"relayId"` +} + +type localBridgeStats struct { + WSMessages uint64 + WSToTCPBytes uint64 + TCPToWSBytes uint64 + LocalDialed bool + LocalDialFails uint64 +} + +func relaySupervisorCmd() *cobra.Command { + var portsRaw string + var relaySpecsRaw string + cmd := &cobra.Command{ + Use: "relay-supervisor", + Short: "Run detached mixed-mode relay supervisor", + Hidden: true, + RunE: func(cmd *cobra.Command, args []string) error { + lockPath, err := resolveRelaySupervisorLockPath() + if err != nil { + return err + } + if err := acquireRelaySupervisorLock(lockPath); err != nil { + return err + } + defer releaseRelaySupervisorLock() + + specs, err := parseRelaySpecsCSV(relaySpecsRaw) + if err != nil { + return err + } + if len(specs) == 0 { + ports, perr := parsePortsCSV(portsRaw) + if perr != nil { + return perr + } + for _, p := range ports { + specs = append(specs, relaySpec{ + Name: fmt.Sprintf("component-%d", p), + Port: p, + }) + } + } + if len(specs) == 0 { + return fmt.Errorf("no relay specs or ports were provided") + } + + manager, err := newLocalComponentRelayManager(framework.L) + if err != nil { + return err + } + ctx := cmd.Context() + for _, spec := range specs { + if err := manager.EnsurePort(ctx, spec.Name, 
spec.Port); err != nil { + _ = manager.Close(ctx) + return err + } + } + + sigCtx, stop := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stop() + <-sigCtx.Done() + + closeCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return manager.Close(closeCtx) + }, + } + cmd.Flags().StringVar(&portsRaw, "ports", "", "Comma-separated list of local ports to bridge") + cmd.Flags().StringVar(&relaySpecsRaw, "relay-specs", "", "Comma-separated list of relay specs in form name:port") + return cmd +} + +func maybeStartRelaySupervisor(relativePathToRepoRoot string, cfg *envconfig.Config) (bool, error) { + specs := relaySpecsFromConfig(cfg) + if len(specs) == 0 { + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop stale relay supervisor") + } + return false, nil + } + return true, startRelaySupervisor(relativePathToRepoRoot, specs) +} + +func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { + if cfg == nil { + return nil + } + hasRemoteNodeSets := false + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + hasRemoteNodeSets = true + break + } + } + if !hasRemoteNodeSets { + return nil + } + + specByPort := map[int]relaySpec{} + addSpec := func(name string, port int) { + if port <= 0 || port > 65535 { + return + } + if _, exists := specByPort[port]; exists { + return + } + specByPort[port] = relaySpec{Name: name, Port: port} + } + for _, blockchainCfg := range cfg.Blockchains { + if blockchainCfg == nil || blockchainCfg.Target != envconfig.TargetLocal { + continue + } + if blockchainCfg.Out != nil { + for nodeIdx, node := range blockchainCfg.Out.Nodes { + if node == nil { + continue + } + if p, ok := endpointPort(node.ExternalHTTPUrl); ok { + addSpec(fmt.Sprintf("blockchain-%s-http-%d", blockchainCfg.ChainID, nodeIdx), p) + } + if p, ok := 
endpointPort(node.ExternalWSUrl); ok { + addSpec(fmt.Sprintf("blockchain-%s-ws-%d", blockchainCfg.ChainID, nodeIdx), p) + } + } + continue + } + for _, p := range inferLocalBlockchainPortsFromInput(blockchainCfg.Input) { + addSpec(fmt.Sprintf("blockchain-%s-port-%d", blockchainCfg.ChainID, p), p) + } + } + + if cfg.JD != nil && cfg.JD.Target == envconfig.TargetLocal { + if cfg.JD.Out != nil { + if p, ok := endpointPort(cfg.JD.Out.ExternalGRPCUrl); ok { + addSpec("jd-grpc", p) + } + if p, ok := endpointPort(cfg.JD.Out.ExternalWSRPCUrl); ok { + addSpec("jd-wsrpc", p) + } + } else { + ports := inferLocalJDPortsFromInput(cfg.JD.Input) + for idx, p := range ports { + if idx == 0 { + addSpec("jd-grpc", p) + continue + } + if idx == 1 { + addSpec("jd-wsrpc", p) + continue + } + addSpec(fmt.Sprintf("jd-port-%d", p), p) + } + } + } + + specs := make([]relaySpec, 0, len(specByPort)) + for _, spec := range specByPort { + specs = append(specs, spec) + } + sort.Slice(specs, func(i, j int) bool { + if specs[i].Port == specs[j].Port { + return specs[i].Name < specs[j].Name + } + return specs[i].Port < specs[j].Port + }) + return specs +} + +func inferLocalBlockchainPortsFromInput(in blockchain.Input) []int { + portSet := map[int]struct{}{} + add := func(raw string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return + } + p, err := strconv.Atoi(raw) + if err == nil && p > 0 && p <= 65535 { + portSet[p] = struct{}{} + } + } + chainType := strings.ToLower(strings.TrimSpace(in.Type)) + switch chainType { + case "anvil", "": + add(firstNonEmpty(in.Port, "8545")) + // Anvil WS is served on the same port. + add(firstNonEmpty(in.WSPort, in.Port, "8545")) + default: + // Best effort for other families: infer from explicit configured ports only. 
+ add(in.Port) + add(in.WSPort) + } + out := make([]int, 0, len(portSet)) + for p := range portSet { + out = append(out, p) + } + sort.Ints(out) + return out +} + +func inferLocalJDPortsFromInput(in jd.Input) []int { + const ( + defaultJDGRPC = "14231" + defaultJDWSRPC = "8080" + ) + portSet := map[int]struct{}{} + add := func(raw string) { + raw = strings.TrimSpace(raw) + if raw == "" { + return + } + p, err := strconv.Atoi(raw) + if err == nil && p > 0 && p <= 65535 { + portSet[p] = struct{}{} + } + } + add(firstNonEmpty(in.GRPCPort, defaultJDGRPC)) + add(firstNonEmpty(in.WSRPCPort, defaultJDWSRPC)) + out := make([]int, 0, len(portSet)) + for p := range portSet { + out = append(out, p) + } + sort.Ints(out) + return out +} + +func endpointPort(raw string) (int, bool) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return 0, false + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil || parsed.Port() == "" { + return 0, false + } + port, convErr := strconv.Atoi(parsed.Port()) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true + } + _, portRaw, err := net.SplitHostPort(trimmed) + if err != nil { + return 0, false + } + port, convErr := strconv.Atoi(portRaw) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true +} + +func startRelaySupervisor(relativePathToRepoRoot string, specs []relaySpec) error { + if len(specs) == 0 { + return nil + } + ports := make([]int, 0, len(specs)) + for _, spec := range specs { + ports = append(ports, spec.Port) + } + ports = uniqueSortedPorts(ports) + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop existing relay supervisor before restart") + } + + executablePath, err := os.Executable() + if err != nil { + return errors.Wrap(err, "resolve executable path for relay supervisor") + } + + statePath := 
relaySupervisorStatePath(relativePathToRepoRoot) + if err := os.MkdirAll(filepath.Dir(statePath), 0o755); err != nil { + return errors.Wrap(err, "create relay supervisor state directory") + } + logPath := filepath.Join(filepath.Dir(statePath), relaySupervisorLogFilename) + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) + if err != nil { + return errors.Wrap(err, "open relay supervisor log file") + } + defer logFile.Close() + + cmd := exec.Command(executablePath, "env", "relay-supervisor", "--relay-specs", relaySpecsCSV(specs)) + lockPath := filepath.Join(filepath.Dir(statePath), relaySupervisorLockFilename) + cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", envRelaySupervisorLockPath, lockPath)) + cmd.Stdout = logFile + cmd.Stderr = logFile + cmd.Stdin = nil + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + if err := cmd.Start(); err != nil { + return errors.Wrap(err, "start relay supervisor process") + } + pid := cmd.Process.Pid + _ = cmd.Process.Release() + + if !waitForPIDAlive(pid, 1500*time.Millisecond) { + return fmt.Errorf("relay supervisor process exited too quickly (pid=%d)", pid) + } + + state := relaySupervisorState{ + Version: 1, + PID: pid, + Ports: ports, + StartedAt: time.Now().UTC().Format(time.RFC3339Nano), + LogPath: logPath, + } + return storeRelaySupervisorState(relativePathToRepoRoot, &state) +} + +func stopRelaySupervisor(relativePathToRepoRoot string) error { + state, err := loadRelaySupervisorState(relativePathToRepoRoot) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if state.PID <= 0 { + return removeRelaySupervisorState(relativePathToRepoRoot) + } + if !processExists(state.PID) { + return removeRelaySupervisorState(relativePathToRepoRoot) + } + isRelayProc, verifyErr := isRelaySupervisorProcess(state.PID) + if verifyErr != nil { + return verifyErr + } + if !isRelayProc { + return fmt.Errorf("refusing to kill non-relay process pid=%d from relay supervisor state", 
state.PID) + } + proc, findErr := os.FindProcess(state.PID) + if findErr != nil { + return findErr + } + _ = proc.Signal(syscall.SIGTERM) + deadline := time.Now().Add(2 * time.Second) + for processExists(state.PID) && time.Now().Before(deadline) { + time.Sleep(100 * time.Millisecond) + } + if processExists(state.PID) { + _ = proc.Signal(syscall.SIGKILL) + } + if processExists(state.PID) { + return fmt.Errorf("relay supervisor pid=%d did not stop", state.PID) + } + return removeRelaySupervisorState(relativePathToRepoRoot) +} + +func loadRelaySupervisorState(relativePathToRepoRoot string) (*relaySupervisorState, error) { + data, err := os.ReadFile(relaySupervisorStatePath(relativePathToRepoRoot)) + if err != nil { + return nil, err + } + state := &relaySupervisorState{} + if err := toml.Unmarshal(data, state); err != nil { + return nil, err + } + if state.Version == 0 { + state.Version = 1 + } + return state, nil +} + +func storeRelaySupervisorState(relativePathToRepoRoot string, state *relaySupervisorState) error { + data, err := toml.Marshal(state) + if err != nil { + return err + } + path := relaySupervisorStatePath(relativePathToRepoRoot) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + return os.WriteFile(path, data, 0o600) +} + +func removeRelaySupervisorState(relativePathToRepoRoot string) error { + path := relaySupervisorStatePath(relativePathToRepoRoot) + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +func relaySupervisorStatePath(relativePathToRepoRoot string) string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, remoteStateDirname, relaySupervisorStateFilename)) + if err != nil { + panic(fmt.Errorf("failed to get absolute path for relay supervisor state file: %w", err)) + } + return absPath +} + +func resolveRelaySupervisorLockPath() (string, error) { + if configured := strings.TrimSpace(os.Getenv(envRelaySupervisorLockPath)); configured != "" { + 
return configured, nil + } + wd, err := os.Getwd() + if err != nil { + return "", errors.Wrap(err, "resolve working directory for relay supervisor lock") + } + return filepath.Join(wd, remoteStateDirname, relaySupervisorLockFilename), nil +} + +func acquireRelaySupervisorLock(lockPath string) error { + if relaySupervisorLockFile != nil { + return nil + } + if err := os.MkdirAll(filepath.Dir(lockPath), 0o755); err != nil { + return errors.Wrap(err, "create relay supervisor lock directory") + } + f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + return errors.Wrap(err, "open relay supervisor lock file") + } + if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); err != nil { + _ = f.Close() + if errors.Is(err, syscall.EWOULDBLOCK) { + return fmt.Errorf("relay supervisor already running (lock file in use: %s)", lockPath) + } + return errors.Wrap(err, "acquire relay supervisor file lock") + } + if err := f.Truncate(0); err != nil { + _ = syscall.Flock(int(f.Fd()), syscall.LOCK_UN) + _ = f.Close() + return errors.Wrap(err, "truncate relay supervisor lock file") + } + if _, err := f.Seek(0, io.SeekStart); err != nil { + _ = syscall.Flock(int(f.Fd()), syscall.LOCK_UN) + _ = f.Close() + return errors.Wrap(err, "seek relay supervisor lock file") + } + _, _ = f.WriteString(fmt.Sprintf("pid=%d\nstarted_at=%s\n", os.Getpid(), time.Now().UTC().Format(time.RFC3339Nano))) + _ = f.Sync() + relaySupervisorLockFile = f + return nil +} + +func releaseRelaySupervisorLock() { + if relaySupervisorLockFile == nil { + return + } + _ = syscall.Flock(int(relaySupervisorLockFile.Fd()), syscall.LOCK_UN) + _ = relaySupervisorLockFile.Close() + relaySupervisorLockFile = nil +} + +func isRelaySupervisorProcess(pid int) (bool, error) { + out, err := exec.Command("ps", "-o", "command=", "-p", strconv.Itoa(pid)).Output() + if err != nil { + return false, err + } + cmd := strings.TrimSpace(string(out)) + if cmd == "" { + return false, nil + } + return 
strings.Contains(cmd, "relay-supervisor"), nil +} + +func waitForPIDAlive(pid int, maxWait time.Duration) bool { + deadline := time.Now().Add(maxWait) + for time.Now().Before(deadline) { + if processExists(pid) { + return true + } + time.Sleep(50 * time.Millisecond) + } + return processExists(pid) +} + +func portsCSV(ports []int) string { + if len(ports) == 0 { + return "" + } + parts := make([]string, 0, len(ports)) + for _, port := range ports { + parts = append(parts, strconv.Itoa(port)) + } + return strings.Join(parts, ",") +} + +func parsePortsCSV(raw string) ([]int, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, nil + } + parts := strings.Split(raw, ",") + out := make([]int, 0, len(parts)) + for _, part := range parts { + port, err := strconv.Atoi(strings.TrimSpace(part)) + if err != nil { + return nil, fmt.Errorf("invalid port %q: %w", part, err) + } + if port <= 0 || port > 65535 { + return nil, fmt.Errorf("invalid port %d", port) + } + out = append(out, port) + } + return uniqueSortedPorts(out), nil +} + +func parseRelaySpecsCSV(raw string) ([]relaySpec, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, nil + } + parts := strings.Split(raw, ",") + specByPort := make(map[int]relaySpec, len(parts)) + for _, part := range parts { + token := strings.TrimSpace(part) + if token == "" { + continue + } + idx := strings.LastIndex(token, ":") + if idx <= 0 || idx >= len(token)-1 { + return nil, fmt.Errorf("invalid relay spec %q; expected name:port", token) + } + name := strings.TrimSpace(token[:idx]) + portRaw := strings.TrimSpace(token[idx+1:]) + if name == "" { + return nil, fmt.Errorf("invalid relay spec %q; name is empty", token) + } + port, err := strconv.Atoi(portRaw) + if err != nil || port <= 0 || port > 65535 { + return nil, fmt.Errorf("invalid relay port %q in spec %q", portRaw, token) + } + if _, exists := specByPort[port]; exists { + continue + } + specByPort[port] = relaySpec{Name: name, Port: port} + } + 
specs := make([]relaySpec, 0, len(specByPort)) + for _, spec := range specByPort { + specs = append(specs, spec) + } + sort.Slice(specs, func(i, j int) bool { + if specs[i].Port == specs[j].Port { + return specs[i].Name < specs[j].Name + } + return specs[i].Port < specs[j].Port + }) + return specs, nil +} + +func relaySpecsCSV(specs []relaySpec) string { + if len(specs) == 0 { + return "" + } + parts := make([]string, 0, len(specs)) + for _, spec := range specs { + if spec.Port <= 0 || spec.Port > 65535 { + continue + } + name := strings.TrimSpace(spec.Name) + if name == "" { + name = fmt.Sprintf("component-%d", spec.Port) + } + parts = append(parts, fmt.Sprintf("%s:%d", name, spec.Port)) + } + return strings.Join(parts, ",") +} + +func uniqueSortedPorts(in []int) []int { + if len(in) == 0 { + return nil + } + set := make(map[int]struct{}, len(in)) + for _, p := range in { + if p > 0 && p <= 65535 { + set[p] = struct{}{} + } + } + out := make([]int, 0, len(set)) + for p := range set { + out = append(out, p) + } + sort.Ints(out) + return out +} + +func newLocalComponentRelayManager(lggr zerolog.Logger) (*localComponentRelayManager, error) { + baseURL, err := resolveAgentBaseURLForRelay() + if err != nil { + return nil, err + } + return &localComponentRelayManager{ + lggr: lggr, + baseURL: baseURL, + handles: make(map[string]*relayHandle), + }, nil +} + +func (m *localComponentRelayManager) EnsurePort(ctx context.Context, relayName string, localPort int) error { + if m == nil || localPort <= 0 { + return nil + } + // Deduplicate by port. HTTP and WS for the same endpoint can share one listener. 
+ key := strconv.Itoa(localPort) + + m.mu.Lock() + if _, ok := m.handles[key]; ok { + m.mu.Unlock() + return nil + } + m.mu.Unlock() + + relayID, err := openRelay(ctx, m.baseURL, relayName, localPort) + if err != nil { + return err + } + + workerCtx, cancel := context.WithCancel(context.Background()) + localAddr := net.JoinHostPort("127.0.0.1", strconv.Itoa(localPort)) + handle := &relayHandle{ + relayID: relayID, + name: relayName, + port: localPort, + cancel: cancel, + } + for i := 0; i < defaultRelayWorkerPoolSize; i++ { + go relayWorker(workerCtx, m.lggr, m.baseURL, handle, localAddr, i) + } + + m.mu.Lock() + m.handles[key] = handle + m.mu.Unlock() + m.lggr.Info().Str("relayName", relayName).Int("port", localPort).Msg("ensured persistent mixed component relay") + return nil +} + +func (m *localComponentRelayManager) Close(ctx context.Context) error { + if m == nil { + return nil + } + m.mu.Lock() + handles := make([]*relayHandle, 0, len(m.handles)) + for _, h := range m.handles { + handles = append(handles, h) + } + m.handles = map[string]*relayHandle{} + m.mu.Unlock() + + var firstErr error + for _, h := range handles { + h.cancel() + if err := closeRelay(ctx, m.baseURL, h.getRelayID()); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +func (h *relayHandle) getRelayID() string { + if h == nil { + return "" + } + h.mu.RLock() + defer h.mu.RUnlock() + return h.relayID +} + +func (h *relayHandle) setRelayID(relayID string) { + if h == nil { + return + } + h.mu.Lock() + h.relayID = relayID + h.mu.Unlock() +} + +func resolveAgentBaseURLForRelay() (string, error) { + if v := strings.TrimSpace(os.Getenv("CRE_EC2_AGENT_URL")); v != "" { + return v, nil + } + if strings.EqualFold(strings.TrimSpace(os.Getenv("CRE_AGENT_MODE")), "ec2") && runtimecfg.IsDirectMode() { + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err + } + port, err := resolveEC2AgentPortForRelay() + if err != nil { + return "", err + } + return 
fmt.Sprintf("http://%s:%d", hostIP, port), nil + } + if v := strings.TrimSpace(os.Getenv("CRE_LOCAL_AGENT_URL")); v != "" { + return v, nil + } + return "", fmt.Errorf("cannot resolve agent base URL for relay; set CRE_EC2_AGENT_URL or CRE_LOCAL_AGENT_URL") +} + +func resolveEC2AgentPortForRelay() (int, error) { + raw := strings.TrimSpace(os.Getenv("CRE_EC2_AGENT_PORT")) + if raw == "" { + return defaultEC2AgentPort, nil + } + port, err := strconv.Atoi(raw) + if err != nil || port <= 0 || port > 65535 { + return 0, fmt.Errorf("invalid CRE_EC2_AGENT_PORT: %q", raw) + } + return port, nil +} + +func openRelay(ctx context.Context, baseURL, name string, requestedPort int) (string, error) { + body, _ := json.Marshal(map[string]any{"name": name, "requestedPort": requestedPort}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/open", bytes.NewReader(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("open relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + + var out relayOpenResponse + if err := json.Unmarshal(respBody, &out); err != nil { + return "", err + } + if strings.TrimSpace(out.RelayID) == "" { + return "", fmt.Errorf("open relay returned empty relayId") + } + return out.RelayID, nil +} + +func closeRelay(ctx context.Context, baseURL, relayID string) error { + body, _ := json.Marshal(map[string]any{"relayId": relayID}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/close", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { 
+ return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("close relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + return nil +} + +func relayWorker(ctx context.Context, lggr zerolog.Logger, baseURL string, handle *relayHandle, localAddr string, workerIndex int) { + backoff := 250 * time.Millisecond + for { + select { + case <-ctx.Done(): + return + default: + } + + relayID := handle.getRelayID() + wsURL, err := relayConnectWSURL(baseURL, relayID) + if err != nil { + lggr.Warn(). + Err(err). + Str("relayId", relayID). + Str("relayName", handle.name). + Int("workerIndex", workerIndex). + Msg("relay worker failed to construct websocket URL") + time.Sleep(backoff) + continue + } + ws, _, err := websocket.DefaultDialer.Dial(wsURL, nil) + if err != nil { + if isBadHandshakeError(err) { + reopenCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + newRelayID, reopenErr := openRelay(reopenCtx, baseURL, handle.name, handle.port) + cancel() + if reopenErr != nil { + lggr.Warn(). + Err(reopenErr). + Str("relayId", relayID). + Str("relayName", handle.name). + Int("requestedPort", handle.port). + Int("workerIndex", workerIndex). + Msg("relay worker failed to reopen relay after websocket bad handshake") + } else { + handle.setRelayID(newRelayID) + lggr.Info(). + Str("oldRelayId", relayID). + Str("newRelayId", newRelayID). + Str("relayName", handle.name). + Int("requestedPort", handle.port). + Int("workerIndex", workerIndex). + Msg("relay worker refreshed relay id after websocket bad handshake") + backoff = 250 * time.Millisecond + continue + } + } + lggr.Warn(). + Err(err). + Str("relayId", relayID). + Str("relayName", handle.name). + Int("workerIndex", workerIndex). + Msg("relay worker failed to connect websocket bridge") + time.Sleep(backoff) + continue + } + lggr.Info(). + Str("relayId", relayID). + Str("relayName", handle.name). 
+ Str("localAddr", localAddr). + Int("workerIndex", workerIndex). + Msg("relay worker established websocket bridge; waiting for payload to dial local endpoint") + bridgeStarted := time.Now() + stats, bridgeErr := bridgeRelayStream(ctx, lggr, handle.name, relayID, workerIndex, ws, localAddr) + _ = ws.Close() + if bridgeErr != nil && !errors.Is(bridgeErr, context.Canceled) { + lggr.Warn(). + Err(bridgeErr). + Str("relayId", relayID). + Str("relayName", handle.name). + Int("workerIndex", workerIndex). + Uint64("wsMessages", stats.WSMessages). + Uint64("wsToTCPBytes", stats.WSToTCPBytes). + Uint64("tcpToWSBytes", stats.TCPToWSBytes). + Bool("localDialed", stats.LocalDialed). + Uint64("localDialFails", stats.LocalDialFails). + Dur("duration", time.Since(bridgeStarted)). + Msg("relay worker bridge ended with error") + } else { + lggr.Info(). + Str("relayId", relayID). + Str("relayName", handle.name). + Int("workerIndex", workerIndex). + Uint64("wsMessages", stats.WSMessages). + Uint64("wsToTCPBytes", stats.WSToTCPBytes). + Uint64("tcpToWSBytes", stats.TCPToWSBytes). + Bool("localDialed", stats.LocalDialed). + Uint64("localDialFails", stats.LocalDialFails). + Dur("duration", time.Since(bridgeStarted)). 
+ Msg("relay worker bridge ended") + } + if backoff < 2*time.Second { + backoff *= 2 + } + } +} + +func isBadHandshakeError(err error) bool { + if err == nil { + return false + } + return strings.Contains(strings.ToLower(err.Error()), "bad handshake") +} + +func relayConnectWSURL(baseURL, relayID string) (string, error) { + u, err := url.Parse(strings.TrimRight(baseURL, "/")) + if err != nil { + return "", err + } + switch u.Scheme { + case "http": + u.Scheme = "ws" + case "https": + u.Scheme = "wss" + default: + return "", fmt.Errorf("unsupported agent url scheme: %s", u.Scheme) + } + u.Path = "/v1/relay/connect" + q := u.Query() + q.Set("relayId", relayID) + u.RawQuery = q.Encode() + return u.String(), nil +} + +func bridgeRelayStream( + ctx context.Context, + lggr zerolog.Logger, + relayName, relayID string, + workerIndex int, + ws *websocket.Conn, + localAddr string, +) (*localBridgeStats, error) { + errCh := make(chan error, 2) + stats := &localBridgeStats{} + writeMu := &sync.Mutex{} + localReady := make(chan net.Conn, 1) + var localConn net.Conn + var localConnMu sync.Mutex + keepAliveCtx, keepAliveCancel := context.WithCancel(ctx) + defer keepAliveCancel() + go relayKeepAlive(keepAliveCtx, ws, writeMu, errCh) + getLocalConn := func() net.Conn { + localConnMu.Lock() + defer localConnMu.Unlock() + return localConn + } + setLocalConn := func(conn net.Conn) { + localConnMu.Lock() + localConn = conn + localConnMu.Unlock() + } + ensureLocalConn := func() (net.Conn, error) { + if existing := getLocalConn(); existing != nil { + return existing, nil + } + conn, err := net.DialTimeout("tcp", localAddr, 2*time.Second) + if err != nil { + atomic.AddUint64(&stats.LocalDialFails, 1) + lggr.Warn(). + Err(err). + Str("relayId", relayID). + Str("relayName", relayName). + Int("workerIndex", workerIndex). + Str("localAddr", localAddr). + Msg("relay worker lazy local dial failed") + return nil, err + } + stats.LocalDialed = true + lggr.Info(). + Str("relayId", relayID). 
+ Str("relayName", relayName). + Int("workerIndex", workerIndex). + Str("localAddr", localAddr). + Msg("relay worker lazy local dial succeeded") + setLocalConn(conn) + select { + case localReady <- conn: + default: + } + return conn, nil + } + defer func() { + if conn := getLocalConn(); conn != nil { + _ = conn.Close() + } + }() + + go func() { + var conn net.Conn + select { + case conn = <-localReady: + case <-ctx.Done(): + errCh <- ctx.Err() + return + } + if conn == nil { + errCh <- fmt.Errorf("local relay connection was nil") + return + } + buf := make([]byte, 32*1024) + for { + n, err := conn.Read(buf) + if n > 0 { + atomic.AddUint64(&stats.TCPToWSBytes, uint64(n)) + writeMu.Lock() + wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]) + writeMu.Unlock() + if wErr != nil { + errCh <- wErr + return + } + } + if err != nil { + errCh <- err + return + } + } + }() + go func() { + for { + msgType, payload, err := ws.ReadMessage() + if err != nil { + errCh <- err + return + } + if msgType != websocket.BinaryMessage && msgType != websocket.TextMessage { + continue + } + if len(payload) == 0 { + continue + } + atomic.AddUint64(&stats.WSMessages, 1) + atomic.AddUint64(&stats.WSToTCPBytes, uint64(len(payload))) + if stats.WSMessages == 1 { + lggr.Info(). + Str("relayId", relayID). + Str("relayName", relayName). + Int("workerIndex", workerIndex). + Int("payloadBytes", len(payload)). 
+ Msg("relay worker received first websocket payload") + } + conn, dialErr := ensureLocalConn() + if dialErr != nil { + errCh <- dialErr + return + } + if _, wErr := conn.Write(payload); wErr != nil { + errCh <- wErr + return + } + } + }() + select { + case <-ctx.Done(): + return stats, ctx.Err() + case err := <-errCh: + return stats, err + } +} + +func relayKeepAlive(ctx context.Context, ws *websocket.Conn, writeMu *sync.Mutex, errCh chan<- error) { + ticker := time.NewTicker(20 * time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + writeMu.Lock() + err := ws.WriteControl(websocket.PingMessage, []byte("keepalive"), time.Now().Add(5*time.Second)) + writeMu.Unlock() + if err != nil { + select { + case errCh <- fmt.Errorf("keepalive ping failed: %w", err): + default: + } + return + } + } + } +} + diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 40f98171151..5a6cf8d8dc9 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -15,16 +15,9 @@ import ( const ( remoteStateDirname = "core/scripts/cre/environment/state_remote" remoteStateFilename = "remote_components.toml" + remoteAgentFilename = "remote_agent.toml" ) -type remoteStopState struct { - Version int `toml:"version"` - Blockchains []*envconfig.Blockchain `toml:"blockchains"` - NodeSets []*cre.NodeSet `toml:"nodesets"` - JD *envconfig.JobDistributor `toml:"jd"` - Agent remoteAgentState `toml:"agent"` -} - type remoteAgentState struct { Mode string `toml:"mode,omitempty"` LocalURL string `toml:"local_url,omitempty"` @@ -51,29 +44,16 @@ func remoteStateFileExists(relativePathToRepoRoot string) bool { return statErr == nil } -func loadRemoteStopState(relativePathToRepoRoot string) (*remoteStopState, error) { - data, err := os.ReadFile(remoteStateFileAbsPath(relativePathToRepoRoot)) - if err != nil { - 
return nil, err - } - state := &remoteStopState{} - if err := toml.Unmarshal(data, state); err != nil { +func loadRemoteStopConfig(relativePathToRepoRoot string) (*envconfig.Config, error) { + cfg := &envconfig.Config{} + if err := cfg.Load(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { return nil, err } - if state.Version == 0 { - state.Version = 1 - } - if state.Blockchains == nil { - state.Blockchains = []*envconfig.Blockchain{} - } - if state.NodeSets == nil { - state.NodeSets = []*cre.NodeSet{} - } - return state, nil + return cfg, nil } func loadRemoteAgentState(relativePathToRepoRoot string) (*remoteAgentState, error) { - data, err := os.ReadFile(remoteStateFileAbsPath(relativePathToRepoRoot)) + data, err := os.ReadFile(remoteAgentFileAbsPath(relativePathToRepoRoot)) if err != nil { return nil, err } @@ -84,52 +64,45 @@ func loadRemoteAgentState(relativePathToRepoRoot string) (*remoteAgentState, err return &envelope.Agent, nil } -func (s *remoteStopState) Config() *envconfig.Config { - if s == nil { - return nil - } - return &envconfig.Config{ - Blockchains: s.Blockchains, - NodeSets: s.NodeSets, - JD: s.JD, - } -} - func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) error { if cfg == nil { return fmt.Errorf("cannot store nil remote stop config") } - state := &remoteStopState{ - Version: 1, + stopCfg := &envconfig.Config{ Blockchains: []*envconfig.Blockchain{}, NodeSets: []*cre.NodeSet{}, - Agent: remoteAgentState{ - Mode: os.Getenv("CRE_AGENT_MODE"), - LocalURL: os.Getenv("CRE_LOCAL_AGENT_URL"), - EC2URL: os.Getenv("CRE_EC2_AGENT_URL"), - EC2InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), - EC2AgentPort: os.Getenv("CRE_EC2_AGENT_PORT"), - AWSProfile: firstNonEmpty(os.Getenv("CRE_AWS_PROFILE"), os.Getenv("AWS_PROFILE")), - }, } for _, configuredBlockchain := range cfg.Blockchains { if configuredBlockchain != nil && configuredBlockchain.Target == envconfig.TargetRemote { - state.Blockchains = append(state.Blockchains, 
configuredBlockchain) + stopCfg.Blockchains = append(stopCfg.Blockchains, configuredBlockchain) } } for _, nodeSet := range cfg.NodeSets { if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { - state.NodeSets = append(state.NodeSets, nodeSet) + stopCfg.NodeSets = append(stopCfg.NodeSets, nodeSet) } } if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { - state.JD = cfg.JD + stopCfg.JD = cfg.JD + } + if err := stopCfg.Store(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { + return err } - data, err := toml.Marshal(state) + agentEnvelope := &remoteAgentStateEnvelope{ + Agent: remoteAgentState{ + Mode: os.Getenv("CRE_AGENT_MODE"), + LocalURL: os.Getenv("CRE_LOCAL_AGENT_URL"), + EC2URL: os.Getenv("CRE_EC2_AGENT_URL"), + EC2InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), + EC2AgentPort: os.Getenv("CRE_EC2_AGENT_PORT"), + AWSProfile: firstNonEmpty(os.Getenv("CRE_AWS_PROFILE"), os.Getenv("AWS_PROFILE")), + }, + } + data, err := toml.Marshal(agentEnvelope) if err != nil { return err } - path := remoteStateFileAbsPath(relativePathToRepoRoot) + path := remoteAgentFileAbsPath(relativePathToRepoRoot) if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return err } @@ -146,10 +119,23 @@ func firstNonEmpty(values ...string) string { } func removeRemoteStopConfig(relativePathToRepoRoot string) error { - path := remoteStateFileAbsPath(relativePathToRepoRoot) - err := os.Remove(path) - if err == nil || os.IsNotExist(err) { - return nil + for _, path := range []string{ + remoteStateFileAbsPath(relativePathToRepoRoot), + remoteAgentFileAbsPath(relativePathToRepoRoot), + } { + err := os.Remove(path) + if err == nil || os.IsNotExist(err) { + continue + } + return err } - return err + return nil +} + +func remoteAgentFileAbsPath(relativePathToRepoRoot string) string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, remoteStateDirname, remoteAgentFilename)) + if err != nil { + 
panic(fmt.Errorf("failed to get absolute path for remote agent state file: %w", err)) + } + return absPath } diff --git a/system-tests/lib/cre/don.go b/system-tests/lib/cre/don.go index 70dea24b21b..7a2958587cf 100644 --- a/system-tests/lib/cre/don.go +++ b/system-tests/lib/cre/don.go @@ -43,6 +43,7 @@ const ( LabelNodeTypeValuePlugin = "plugin" LabelNodeP2PIDKey = "p2p_id" + ) type Role string @@ -704,12 +705,12 @@ func (n *Node) setUpAndLinkJobDistributor(ctx context.Context, cldfEnv *cldf.Env return fmt.Errorf("no node found for node id %s", n.JobDistributorDetails.NodeID) } if !getRes.GetNode().IsConnected { - return retry.RetryableError(fmt.Errorf("node %s not connected to job distributor", n.Name)) + return retry.RetryableError(fmt.Errorf("node %s not connected to job distributor (jd_uri=%s)", n.Name, jd.WSRPC)) } return nil }) if err != nil { - return fmt.Errorf("failed to connect node %s to job distributor: %w", n.Name, err) + return fmt.Errorf("failed to connect node %s to job distributor (jd_uri=%s): %w", n.Name, jd.WSRPC, err) } n.JobDistributorDetails.JDID = id return nil diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index b14087b2436..9dd940c81c8 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -5,6 +5,8 @@ import ( "fmt" "maps" "math/big" + "net" + "net/url" "slices" "strconv" "strings" @@ -523,6 +525,13 @@ func addWorkerNodeConfig( if err != nil { return existingConfig, err } + if resolvedGateway.RequiresBridge { + bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedGateway.URL) + if bridgeErr != nil { + return existingConfig, bridgeErr + } + resolvedGateway.URL = bridgeURL + } gateways = append(gateways, coretoml.ConnectorGateway{ ID: ptr.Ptr(gateway.AuthGatewayID), URL: ptr.Ptr(resolvedGateway.URL), @@ -701,22 +710,32 @@ func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { Name: fmt.Sprintf("evm-http-%d", 
bcOut.ChainID()), Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { - return fmt.Errorf("bridge is required for node->blockchain HTTP endpoint on chain %d (remote caller -> local target), automatic component bridge is not implemented yet", bcOut.ChainID()) - }) + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) if err != nil { return nil, err } + if resolvedHTTP.RequiresBridge { + bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedHTTP.URL) + if bridgeErr != nil { + return nil, fmt.Errorf("bridge url rewrite failed for node->blockchain HTTP endpoint on chain %d: %w", bcOut.ChainID(), bridgeErr) + } + resolvedHTTP.URL = bridgeURL + } resolvedWS, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { - return fmt.Errorf("bridge is required for node->blockchain WS endpoint on chain %d (remote caller -> local target), automatic component bridge is not implemented yet", bcOut.ChainID()) - }) + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) if err != nil { return nil, err } + if resolvedWS.RequiresBridge { + bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedWS.URL) + if bridgeErr != nil { + return nil, fmt.Errorf("bridge url rewrite failed for node->blockchain WS endpoint on chain %d: %w", bcOut.ChainID(), bridgeErr) + } + resolvedWS.URL = bridgeURL + } evmChains = append(evmChains, &evmChain{ Name: fmt.Sprintf("node-%d", chainSelector), @@ -761,12 +780,17 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { Name: 
"solana-rpc", Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { - return errors.New("bridge is required for node->solana RPC endpoint (remote caller -> local target), automatic component bridge is not implemented yet") - }) + }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) if err != nil { return nil, err } + if resolvedNodeURL.RequiresBridge { + bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedNodeURL.URL) + if bridgeErr != nil { + return nil, fmt.Errorf("bridge url rewrite failed for node->solana RPC endpoint: %w", bridgeErr) + } + resolvedNodeURL.URL = bridgeURL + } ctx, cancelFn := context.WithTimeout(context.Background(), 15*time.Second) chainID, err := solBc.SolClient.GetGenesisHash(ctx) @@ -828,6 +852,31 @@ func gatewayExternalConnectorURL(gateway *cre.DonGatewayConfiguration) string { return fmt.Sprintf("%s://%s:%d%s", scheme, gateway.Incoming.Host, gateway.Incoming.ExternalPort, path) } +func rewriteEndpointForRemoteCaller(raw string) (string, error) { + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return "", fmt.Errorf("endpoint is empty") + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil { + return "", fmt.Errorf("parse url %q: %w", raw, err) + } + if parsed.Port() != "" { + parsed.Host = net.JoinHostPort(dockerHost, parsed.Port()) + return parsed.String(), nil + } + parsed.Host = dockerHost + return parsed.String(), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil { + return "", fmt.Errorf("parse host:port %q: %w", raw, err) + } + return net.JoinHostPort(dockerHost, port), nil +} + func buildTronEVMConfig(evmChain *evmChain) evmconfigtoml.EVMConfig { tronRPC := strings.Replace(evmChain.HTTPRPC, "jsonrpc", "wallet", 1) 
return evmconfigtoml.EVMConfig{ diff --git a/system-tests/lib/cre/environment/agent/relay.go b/system-tests/lib/cre/environment/agent/relay.go index 67ceb856c6b..caf2314de3f 100644 --- a/system-tests/lib/cre/environment/agent/relay.go +++ b/system-tests/lib/cre/environment/agent/relay.go @@ -43,6 +43,11 @@ type closeRelayRequest struct { RelayID string `json:"relayId"` } +type relayBridgeStats struct { + TCPToWSBytes uint64 + WSToTCPBytes uint64 +} + var relayWSUpgrader = websocket.Upgrader{ CheckOrigin: func(_ *http.Request) bool { return true }, } @@ -68,15 +73,38 @@ func (s *Server) openRelay(w http.ResponseWriter, r *http.Request) { return } - // Idempotent open: same name+port returns the existing relay. + // Idempotent open: + // - for fixed ports, any existing relay on the same requested port is reusable + // - fallback to same name+port for compatibility with older callers s.relayMu.Lock() for _, relay := range s.relays { + if req.RequestedPort > 0 && relay.RequestedPort == req.RequestedPort { + s.relayMu.Unlock() + s.lggr.Info(). + Str("relayId", relay.ID). + Str("name", relay.Name). + Int("requestedPort", relay.RequestedPort). + Int("boundPort", listenerPort(relay.Listener)). + Msg("reusing existing relay by requested port") + s.respondJSONAny(w, http.StatusOK, openRelayResponse{ + RelayID: relay.ID, + RequestedPort: relay.RequestedPort, + BoundPort: listenerPort(relay.Listener), + }) + return + } if relay.Name == req.Name && relay.RequestedPort == req.RequestedPort { s.relayMu.Unlock() + s.lggr.Info(). + Str("relayId", relay.ID). + Str("name", relay.Name). + Int("requestedPort", relay.RequestedPort). + Int("boundPort", listenerPort(relay.Listener)). 
+ Msg("reusing existing relay by name+port") s.respondJSONAny(w, http.StatusOK, openRelayResponse{ - RelayID: relay.ID, + RelayID: relay.ID, RequestedPort: relay.RequestedPort, - BoundPort: listenerPort(relay.Listener), + BoundPort: listenerPort(relay.Listener), }) return } @@ -106,6 +134,13 @@ func (s *Server) openRelay(w http.ResponseWriter, r *http.Request) { go s.acceptRelayConnections(reg) + s.lggr.Info(). + Str("relayId", relayID). + Str("name", req.Name). + Int("requestedPort", req.RequestedPort). + Int("boundPort", listenerPort(ln)). + Msg("opened relay listener") + s.respondJSONAny(w, http.StatusOK, openRelayResponse{ RelayID: relayID, RequestedPort: req.RequestedPort, @@ -145,6 +180,12 @@ func (s *Server) closeRelay(w http.ResponseWriter, r *http.Request) { _ = relay.Listener.Close() drainAndCloseIncoming(relay.Incoming) + s.lggr.Info(). + Str("relayId", relayID). + Str("name", relay.Name). + Int("requestedPort", relay.RequestedPort). + Msg("closed relay listener") + s.respondJSONAny(w, http.StatusOK, map[string]any{"relayId": relayID, "closed": true, "found": true}) } @@ -169,25 +210,76 @@ func (s *Server) connectRelay(w http.ResponseWriter, r *http.Request) { wsConn, err := relayWSUpgrader.Upgrade(w, r, nil) if err != nil { + s.lggr.Warn().Err(err).Str("relayId", relayID).Msg("failed to upgrade relay websocket") return } defer wsConn.Close() + s.lggr.Debug(). + Str("relayId", relayID). + Str("name", relay.Name). + Str("wsRemoteAddr", wsConn.RemoteAddr().String()). 
+ Msg("relay websocket bridge client connected") var incoming net.Conn - select { - case incoming = <-relay.Incoming: - case <-relay.Closed: - _ = wsConn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, "relay closed"), time.Now().Add(2*time.Second)) - return - case <-r.Context().Done(): - return + waitStarted := time.Now() + nextWaitLogAt := 30 * time.Second + for { + select { + case incoming = <-relay.Incoming: + goto bridge + case <-relay.Closed: + s.lggr.Info().Str("relayId", relayID).Str("name", relay.Name).Msg("relay closed while waiting for incoming tcp connection") + _ = wsConn.WriteControl(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.CloseNormalClosure, "relay closed"), time.Now().Add(2*time.Second)) + return + case <-r.Context().Done(): + s.lggr.Info().Str("relayId", relayID).Str("name", relay.Name).Msg("relay websocket request context cancelled while waiting for incoming tcp connection") + return + case <-time.After(5 * time.Second): + waited := time.Since(waitStarted) + if waited >= nextWaitLogAt { + s.lggr.Info(). + Str("relayId", relayID). + Str("name", relay.Name). + Dur("waited", waited). + Int("queued", len(relay.Incoming)). + Msg("relay websocket still waiting for incoming tcp connection") + nextWaitLogAt += 30 * time.Second + } + } } + +bridge: if incoming == nil { + s.lggr.Warn().Str("relayId", relayID).Str("name", relay.Name).Msg("relay incoming connection was nil") return } defer incoming.Close() - - _ = bridgeWebSocketAndTCP(wsConn, incoming) + s.lggr.Info(). + Str("relayId", relayID). + Str("name", relay.Name). + Str("tcpRemoteAddr", incoming.RemoteAddr().String()). + Msg("bridging relay tcp connection to websocket client") + + bridgeStarted := time.Now() + stats, err := bridgeWebSocketAndTCP(wsConn, incoming) + if err != nil { + s.lggr.Warn(). + Err(err). + Str("relayId", relayID). + Str("name", relay.Name). + Uint64("tcpToWSBytes", stats.TCPToWSBytes). 
+ Uint64("wsToTCPBytes", stats.WSToTCPBytes). + Dur("duration", time.Since(bridgeStarted)). + Msg("relay bridge ended with error") + } else { + s.lggr.Info(). + Str("relayId", relayID). + Str("name", relay.Name). + Uint64("tcpToWSBytes", stats.TCPToWSBytes). + Uint64("wsToTCPBytes", stats.WSToTCPBytes). + Dur("duration", time.Since(bridgeStarted)). + Msg("relay bridge stream ended") + } } func (s *Server) acceptRelayConnections(relay *relayRegistration) { @@ -205,30 +297,47 @@ func (s *Server) acceptRelayConnections(relay *relayRegistration) { } return } + s.lggr.Info(). + Str("relayId", relay.ID). + Str("name", relay.Name). + Int("requestedPort", relay.RequestedPort). + Str("tcpRemoteAddr", conn.RemoteAddr().String()). + Msg("accepted relay tcp connection") select { case relay.Incoming <- conn: + s.lggr.Info(). + Str("relayId", relay.ID). + Str("name", relay.Name). + Int("queued", len(relay.Incoming)). + Msg("queued relay tcp connection for websocket bridge") default: + s.lggr.Warn(). + Str("relayId", relay.ID). + Str("name", relay.Name). 
+ Msg("dropping relay tcp connection: incoming queue is full") _ = conn.Close() } } } -func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) error { +func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) (*relayBridgeStats, error) { errCh := make(chan error, 2) + stats := &relayBridgeStats{} go func() { buf := make([]byte, 32*1024) for { n, err := tcp.Read(buf) if n > 0 { + atomic.AddUint64(&stats.TCPToWSBytes, uint64(n)) if wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]); wErr != nil { - errCh <- wErr + errCh <- fmt.Errorf("tcp->ws write: %w", wErr) return } } if err != nil { - errCh <- err + errCh <- fmt.Errorf("tcp read: %w", err) return } } @@ -238,7 +347,7 @@ func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) error { for { msgType, payload, err := ws.ReadMessage() if err != nil { - errCh <- err + errCh <- fmt.Errorf("ws read: %w", err) return } if msgType != websocket.BinaryMessage && msgType != websocket.TextMessage { @@ -247,8 +356,9 @@ func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) error { if len(payload) == 0 { continue } + atomic.AddUint64(&stats.WSToTCPBytes, uint64(len(payload))) if _, wErr := tcp.Write(payload); wErr != nil { - errCh <- wErr + errCh <- fmt.Errorf("ws->tcp write: %w", wErr) return } } @@ -256,9 +366,9 @@ func bridgeWebSocketAndTCP(ws *websocket.Conn, tcp net.Conn) error { err := <-errCh if err == nil || errors.Is(err, io.EOF) || websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) { - return nil + return stats, nil } - return err + return stats, err } func drainAndCloseIncoming(ch chan net.Conn) { diff --git a/system-tests/lib/cre/environment/component_relay.go b/system-tests/lib/cre/environment/component_relay.go new file mode 100644 index 00000000000..4196b372350 --- /dev/null +++ b/system-tests/lib/cre/environment/component_relay.go @@ -0,0 +1,315 @@ +package environment + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + 
"net/http" + "net/url" + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/gorilla/websocket" + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +type componentRelayManager struct { + lggr zerolog.Logger + baseURL string + + mu sync.Mutex + handles map[string]*componentRelayHandle +} + +type componentRelayHandle struct { + relayID string + cancel context.CancelFunc +} + +type relayOpenResponse struct { + RelayID string `json:"relayId"` +} + +func newComponentRelayManager(lggr zerolog.Logger) (*componentRelayManager, error) { + baseURL, err := resolveAgentBaseURLForRelay() + if err != nil { + return nil, err + } + return &componentRelayManager{ + lggr: lggr, + baseURL: baseURL, + handles: make(map[string]*componentRelayHandle), + }, nil +} + +func (m *componentRelayManager) EnsurePort(ctx context.Context, relayName string, localPort int) error { + if m == nil || localPort <= 0 { + return nil + } + // Deduplicate by port. HTTP and WS for the same endpoint can share one listener. 
+ key := strconv.Itoa(localPort) + + m.mu.Lock() + if _, ok := m.handles[key]; ok { + m.mu.Unlock() + return nil + } + m.mu.Unlock() + + relayID, err := openRelay(ctx, m.baseURL, relayName, localPort) + if err != nil { + return err + } + + workerCtx, cancel := context.WithCancel(context.Background()) + localAddr := net.JoinHostPort("127.0.0.1", strconv.Itoa(localPort)) + for i := 0; i < 4; i++ { + go relayWorker(workerCtx, m.baseURL, relayID, localAddr) + } + + m.mu.Lock() + m.handles[key] = &componentRelayHandle{relayID: relayID, cancel: cancel} + m.mu.Unlock() + m.lggr.Info().Str("relayName", relayName).Int("port", localPort).Msg("ensured mixed component relay") + return nil +} + +func (m *componentRelayManager) Close(ctx context.Context) error { + if m == nil { + return nil + } + m.mu.Lock() + handles := make([]*componentRelayHandle, 0, len(m.handles)) + for _, h := range m.handles { + handles = append(handles, h) + } + m.handles = map[string]*componentRelayHandle{} + m.mu.Unlock() + + var firstErr error + for _, h := range handles { + h.cancel() + if err := closeRelay(ctx, m.baseURL, h.relayID); err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +func resolveAgentBaseURLForRelay() (string, error) { + if v := strings.TrimSpace(os.Getenv(envEC2AgentURL)); v != "" { + return v, nil + } + if strings.EqualFold(strings.TrimSpace(os.Getenv(envAgentMode)), "ec2") && runtimecfg.IsDirectMode() { + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err + } + port, err := resolveEC2AgentPort() + if err != nil { + return "", err + } + return fmt.Sprintf("http://%s:%d", hostIP, port), nil + } + if v := strings.TrimSpace(os.Getenv(envLocalAgentURL)); v != "" { + return v, nil + } + return "", fmt.Errorf("cannot resolve agent base URL for relay; set %s (or use direct mode with EC2 host resolution)", envEC2AgentURL) +} + +func openRelay(ctx context.Context, baseURL, name string, requestedPort int) (string, error) { + body, _ := 
json.Marshal(map[string]any{"name": name, "requestedPort": requestedPort}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/open", bytes.NewReader(body)) + if err != nil { + return "", err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("open relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + + var out relayOpenResponse + if err := json.Unmarshal(respBody, &out); err != nil { + return "", err + } + if strings.TrimSpace(out.RelayID) == "" { + return "", fmt.Errorf("open relay returned empty relayId") + } + return out.RelayID, nil +} + +func closeRelay(ctx context.Context, baseURL, relayID string) error { + body, _ := json.Marshal(map[string]any{"relayId": relayID}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/close", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("close relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) + } + return nil +} + +func relayWorker(ctx context.Context, baseURL, relayID, localAddr string) { + backoff := 250 * time.Millisecond + for { + select { + case <-ctx.Done(): + return + default: + } + + wsURL, err := relayConnectWSURL(baseURL, relayID) + if err != nil { + time.Sleep(backoff) + continue + } + ws, _, err := websocket.DefaultDialer.Dial(wsURL, nil) + if err != nil { + time.Sleep(backoff) + continue + } + _ = bridgeRelayStream(ctx, ws, 
localAddr) + _ = ws.Close() + if backoff < 2*time.Second { + backoff *= 2 + } + } +} + +func relayConnectWSURL(baseURL, relayID string) (string, error) { + u, err := url.Parse(strings.TrimRight(baseURL, "/")) + if err != nil { + return "", err + } + switch u.Scheme { + case "http": + u.Scheme = "ws" + case "https": + u.Scheme = "wss" + default: + return "", fmt.Errorf("unsupported agent url scheme: %s", u.Scheme) + } + u.Path = "/v1/relay/connect" + q := u.Query() + q.Set("relayId", relayID) + u.RawQuery = q.Encode() + return u.String(), nil +} + +func bridgeRelayStream(ctx context.Context, ws *websocket.Conn, localAddr string) error { + errCh := make(chan error, 2) + localReady := make(chan net.Conn, 1) + var localConn net.Conn + var localConnMu sync.Mutex + getLocalConn := func() net.Conn { + localConnMu.Lock() + defer localConnMu.Unlock() + return localConn + } + setLocalConn := func(conn net.Conn) { + localConnMu.Lock() + localConn = conn + localConnMu.Unlock() + } + ensureLocalConn := func() (net.Conn, error) { + if existing := getLocalConn(); existing != nil { + return existing, nil + } + conn, err := net.DialTimeout("tcp", localAddr, 2*time.Second) + if err != nil { + return nil, err + } + setLocalConn(conn) + select { + case localReady <- conn: + default: + } + return conn, nil + } + defer func() { + if conn := getLocalConn(); conn != nil { + _ = conn.Close() + } + }() + go func() { + var conn net.Conn + select { + case conn = <-localReady: + case <-ctx.Done(): + errCh <- ctx.Err() + return + } + if conn == nil { + errCh <- fmt.Errorf("local relay connection was nil") + return + } + buf := make([]byte, 32*1024) + for { + n, err := conn.Read(buf) + if n > 0 { + if wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]); wErr != nil { + errCh <- wErr + return + } + } + if err != nil { + errCh <- err + return + } + } + }() + go func() { + for { + msgType, payload, err := ws.ReadMessage() + if err != nil { + errCh <- err + return + } + if msgType != 
websocket.BinaryMessage && msgType != websocket.TextMessage { + continue + } + if len(payload) == 0 { + continue + } + conn, dialErr := ensureLocalConn() + if dialErr != nil { + errCh <- dialErr + return + } + if _, wErr := conn.Write(payload); wErr != nil { + errCh <- wErr + return + } + } + }() + select { + case <-ctx.Done(): + return ctx.Err() + case <-errCh: + return nil + } +} diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 28436e71fa5..bd25cbc8a5c 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -5,7 +5,10 @@ import ( "errors" "fmt" "maps" + "net" + "net/url" "os" + "strconv" "strings" "sync" @@ -43,9 +46,10 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" libformat "github.com/smartcontractkit/chainlink/system-tests/lib/format" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" - "github.com/smartcontractkit/chainlink/system-tests/lib/worker" ) +const envUsePersistentRelaySupervisor = "CRE_USE_PERSISTENT_RELAY_SUPERVISOR" + type SetupOutput struct { WorkflowRegistryConfigurationOutput *cre.WorkflowRegistryOutput CreEnvironment *cre.Environment @@ -55,6 +59,7 @@ type SetupOutput struct { GatewayConnectors *cre.GatewayConnectors tunnelManager tunnel.Manager + relayManager *componentRelayManager closeOnce sync.Once closeErr error } @@ -69,6 +74,9 @@ func (s *SetupOutput) Close(ctx context.Context) error { } s.closeOnce.Do(func() { + if s.relayManager != nil { + _ = s.relayManager.Close(ctx) + } s.closeErr = manager.Stop(ctx) }) @@ -106,6 +114,10 @@ type SetupInput struct { CapabilitiesContractFactoryFunctions []cre.CapabilityRegistryConfigFn StageGen *stagegen.StageGen + + // Optional hook executed after local dependencies are started (including JD), + // and right before DON containers are started. 
+ PreDONsStartHook func(ctx context.Context) error } func (s *SetupInput) Validate() error { @@ -158,6 +170,9 @@ func SetupTestEnvironment( if err != nil { return nil, pkgerrors.Wrap(err, "nodeset placement validation failed") } + if err := validateUnsupportedPlacements(input.Blockchains, nodeSetPlacement); err != nil { + return nil, pkgerrors.Wrap(err, "invalid component placement") + } s3Output, s3Err := workflow.StartS3(testLogger, input.S3ProviderInput, input.StageGen) if s3Err != nil { @@ -168,6 +183,16 @@ func SetupTestEnvironment( if tmErr != nil { return nil, pkgerrors.Wrap(tmErr, "failed to initialize tunnel manager") } + var relayManager *componentRelayManager + if !strings.EqualFold(strings.TrimSpace(os.Getenv(envUsePersistentRelaySupervisor)), "true") { + rm, rmErr := newComponentRelayManager(testLogger) + if rmErr != nil && nodeSetPlacement.HasRemoteTargets { + return nil, pkgerrors.Wrap(rmErr, "failed to initialize relay manager") + } + relayManager = rm + } else { + testLogger.Info().Msg("persistent relay supervisor enabled; skipping in-process relay manager") + } fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) @@ -183,10 +208,14 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") } cleanupTunnelsOnError := true + cleanupRelaysOnError := true defer func() { if cleanupTunnelsOnError { _ = tunnelManager.Stop(ctx) } + if cleanupRelaysOnError && relayManager != nil { + _ = relayManager.Close(ctx) + } }() creEnvironment := &cre.Environment{ @@ -238,6 +267,12 @@ func SetupTestEnvironment( } fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("DONs configuration prepared in %.2f seconds", input.StageGen.Elapsed().Seconds()))) + if nodeSetPlacement.HasRemoteTargets && relayManager != nil { + if err := ensureMixedRelaysForLocalBlockchains(ctx, relayManager, input.Blockchains, deployedBlockchains.Outputs); err != nil { + return nil, 
pkgerrors.Wrap(err, "failed to ensure mixed relays for local blockchains") + } + } + fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Applying Features before environment startup"))) var donsCapabilities = make(map[uint64][]keystone_changeset.DONCapabilityWithConfig) var capabilityToOCR3Config = make(map[string]*ocr3.OracleConfig) @@ -267,47 +302,24 @@ func SetupTestEnvironment( fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("Applied Features in %.2f seconds", input.StageGen.Elapsed().Seconds()))) - queue := worker.New(ctx, 10) - defer queue.StopAndWait() // Ensure cleanup on any exit path - - jdStartedFuture := queue.SubmitAny(func(ctx context.Context) (any, error) { - // TODO: pass context after we update the CTF to accept context, when creating new JD instance - jdOutput, startJDErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager, nodeSetPlacement.HasLocalTargets) - if startJDErr != nil { - return nil, pkgerrors.Wrap(startJDErr, "failed to start Job Distributor") - } - return jdOutput, nil - }) - - donsStartedFuture := queue.SubmitAny(func(ctx context.Context) (any, error) { - nodeSetOutput, startDonsErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, tunnelManager) - if startDonsErr != nil { - return nil, pkgerrors.Wrap(startDonsErr, "failed to start DONs") - } - - return nodeSetOutput, nil - }) - - // Await both futures to ensure proper cleanup even if one fails - startedJD, jdStartErr := worker.AwaitAs[*StartedJD](ctx, jdStartedFuture) - startedDONs, donStartErr := worker.AwaitAs[*StartedDONs](ctx, donsStartedFuture) - - // Check errors after both awaits complete - // If both failed, prefer the non-context-cancelled error as it's likely the root cause - if jdStartErr != nil && donStartErr != nil { - // If one is context.Canceled, it was likely caused by the other task's error - if 
pkgerrors.Is(jdStartErr, context.Canceled) && !pkgerrors.Is(donStartErr, context.Canceled) { - return nil, pkgerrors.Wrap(donStartErr, "failed to start DONs") - } - if pkgerrors.Is(donStartErr, context.Canceled) && !pkgerrors.Is(jdStartErr, context.Canceled) { - return nil, pkgerrors.Wrap(jdStartErr, "failed to start Job Distributor") - } - // Both real errors - return nil, pkgerrors.Wrap(errors.Join(fmt.Errorf("JD failed to start: %w", jdStartErr), fmt.Errorf("DONs failed to start: %w", donStartErr)), "failed to start Job Distributor AND Dons") - } + // Start JD first when we need to expose local JD endpoints to remote nodesets. + requireJDRelayBootstrap := nodeSetPlacement.HasRemoteTargets && input.JdInput != nil && input.JdInput.Target == config.TargetLocal + startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager, nodeSetPlacement.HasLocalTargets) if jdStartErr != nil { return nil, pkgerrors.Wrap(jdStartErr, "failed to start Job Distributor") } + if requireJDRelayBootstrap && relayManager != nil { + if err := ensureMixedRelaysForLocalJD(ctx, relayManager, startedJD.JDOutput); err != nil { + return nil, pkgerrors.Wrap(err, "failed to ensure mixed relays for local JD") + } + } + if input.PreDONsStartHook != nil { + if err := input.PreDONsStartHook(ctx); err != nil { + return nil, pkgerrors.Wrap(err, "failed to execute pre-DON startup hook") + } + } + + startedDONs, donStartErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, tunnelManager) if donStartErr != nil { return nil, pkgerrors.Wrap(donStartErr, "failed to start DONs") } @@ -398,7 +410,7 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(wfErr, "failed to configure workflow registry") } - wfFiltersFuture := queue.SubmitErr(func(ctx context.Context) error { + waitForWorkflowFilters := func(ctx context.Context) error { // we currently 
have no way of checking if filters were registered in Kubernetes mode // as we don't have a way to get its database connection string if !input.Provider.IsDocker() { @@ -416,7 +428,7 @@ func SetupTestEnvironment( default: return workflow.WaitForAllNodesToHaveExpectedFiltersRegistered(ctx, singleFileLogger, testLogger, deployedBlockchains.RegistryChain().ChainID(), dons, updatedNodeSets) } - }) + } capRegInput := cre.ConfigureCapabilityRegistryInput{ ChainSelector: deployedBlockchains.RegistryChain().ChainSelector(), @@ -484,7 +496,7 @@ func SetupTestEnvironment( fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("Sharding setup in %.2f seconds", input.StageGen.Elapsed().Seconds()))) } - if err := worker.AwaitErr(ctx, wfFiltersFuture); err != nil { + if err := waitForWorkflowFilters(ctx); err != nil { return nil, pkgerrors.Wrap(err, "failed while waiting for workflow registry filters registration") } @@ -503,9 +515,98 @@ func SetupTestEnvironment( S3ProviderOutput: s3Output, GatewayConnectors: topology.GatewayConnectors, tunnelManager: tunnelManager, + relayManager: relayManager, }, nil } +func ensureMixedRelaysForLocalBlockchains( + ctx context.Context, + relayManager *componentRelayManager, + configuredBlockchains []*config.Blockchain, + deployedBlockchains []blockchains.Blockchain, +) error { + attempted := 0 + for idx, configured := range configuredBlockchains { + if configured == nil || configured.Target != config.TargetLocal { + continue + } + if idx >= len(deployedBlockchains) || deployedBlockchains[idx] == nil { + continue + } + for nodeIdx, node := range deployedBlockchains[idx].CtfOutput().Nodes { + if node == nil { + continue + } + if p, ok := extractEndpointPort(node.ExternalHTTPUrl); ok { + attempted++ + if err := relayManager.EnsurePort(ctx, fmt.Sprintf("blockchain-http-%d-%d", idx, nodeIdx), p); err != nil { + return err + } + } + if p, ok := extractEndpointPort(node.ExternalWSUrl); ok { + attempted++ + if err := 
relayManager.EnsurePort(ctx, fmt.Sprintf("blockchain-ws-%d-%d", idx, nodeIdx), p); err != nil { + return err + } + } + } + } + if attempted == 0 { + relayManager.lggr.Warn().Msg("no local blockchain relay ports were detected; mixed remote nodesets may not reach local blockchains") + } + return nil +} + +func ensureMixedRelaysForLocalJD(ctx context.Context, relayManager *componentRelayManager, jdOutput *jd.Output) error { + if jdOutput == nil { + return nil + } + attempted := 0 + if p, ok := extractEndpointPort(jdOutput.ExternalGRPCUrl); ok { + attempted++ + if err := relayManager.EnsurePort(ctx, "jd-grpc", p); err != nil { + return err + } + } + if p, ok := extractEndpointPort(jdOutput.ExternalWSRPCUrl); ok { + attempted++ + if err := relayManager.EnsurePort(ctx, "jd-wsrpc", p); err != nil { + return err + } + } + if attempted == 0 { + relayManager.lggr.Warn().Msg("no local JD relay ports were detected; mixed remote nodesets may not reach local JD") + } + return nil +} + +func extractEndpointPort(raw string) (int, bool) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return 0, false + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil || parsed.Port() == "" { + return 0, false + } + port, convErr := strconv.Atoi(parsed.Port()) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true + } + _, portRaw, err := net.SplitHostPort(trimmed) + if err != nil { + return 0, false + } + port, convErr := strconv.Atoi(portRaw) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true +} + func blockchainTargetsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { bySelector := make(map[uint64]string, len(deployed)) for idx, blockchainCfg := range configured { @@ -568,6 +669,27 @@ func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummar return summary, nil } +func 
validateUnsupportedPlacements( + configuredBlockchains []*config.Blockchain, + nodeSetPlacement *nodeSetPlacementSummary, +) error { + if nodeSetPlacement == nil || !nodeSetPlacement.HasRemoteTargets { + return nil + } + for _, bc := range configuredBlockchains { + if bc == nil { + continue + } + if bc.Target == config.TargetLocal { + return errors.New( + "remote nodesets with local blockchains are not supported in this PoC. " + + "Set all blockchains to target=remote, or run nodesets with target=local so nodes stay colocated with local blockchains", + ) + } + } + return nil +} + func newCldfEnvironment(ctx context.Context, singleFileLogger logger.Logger, cldfBlockchains cldf_chain.BlockChains) *cldf.Environment { allChainsCLDEnvironment := &cldf.Environment{ Name: cre.EnvironmentName, diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index d9779627bb4..d05b8338a4e 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -147,13 +147,24 @@ func StartJD( // Configure gRPC credentials for JD connection creds := getJDCredentials(lggr, infraInput, jdOutput) + nodeFacingWSRPC, wsrpcErr := resolveNodeFacingJDWSRPC(jdOutput, rewriteInternalForLocalNodes) + if wsrpcErr != nil { + return nil, pkgerrors.Wrap(wsrpcErr, "failed to resolve node-facing JD WSRPC endpoint") + } + jdClientConfig := cldf_jd.JDConfig{ GRPC: jdOutput.ExternalGRPCUrl, - WSRPC: jdOutput.InternalWSRPCUrl, + WSRPC: nodeFacingWSRPC, Creds: creds, } lggr.Info().Msgf("Connecting to JD GRPC at: %s", jdOutput.ExternalGRPCUrl) + lggr.Info(). + Str("nodeFacingWSRPC", nodeFacingWSRPC). + Str("internalWSRPC", jdOutput.InternalWSRPCUrl). + Str("externalWSRPC", jdOutput.ExternalWSRPCUrl). + Bool("hasLocalNodeSets", rewriteInternalForLocalNodes). 
+ Msg("Resolved JD WSRPC endpoint for node registration") jdClient, jdErr := cldf_jd.NewJDClient(jdClientConfig) if jdErr != nil { @@ -168,6 +179,26 @@ func StartJD( }, nil } +func resolveNodeFacingJDWSRPC(output *jd.Output, rewriteInternalForLocalNodes bool) (string, error) { + if output == nil { + return "", fmt.Errorf("jd output is nil") + } + // Local nodesets can resolve JD on the Docker network directly. + if rewriteInternalForLocalNodes { + return output.InternalWSRPCUrl, nil + } + // Remote nodesets need the relay endpoint on the remote Docker host. + source := strings.TrimSpace(output.ExternalWSRPCUrl) + if source == "" { + source = strings.TrimSpace(output.InternalWSRPCUrl) + } + if source == "" { + return "", fmt.Errorf("jd output does not include WSRPC endpoint") + } + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + return rewriteAddressHost(source, dockerHost) +} + func rewriteRemoteJDOutputForLocalAccess( ctx context.Context, lggr zerolog.Logger, diff --git a/system-tests/tests/test-helpers/before_suite.go b/system-tests/tests/test-helpers/before_suite.go index 7318eddf7c7..a52dfc90cb4 100644 --- a/system-tests/tests/test-helpers/before_suite.go +++ b/system-tests/tests/test-helpers/before_suite.go @@ -2,9 +2,13 @@ package helpers import ( "context" + "net" + "net/url" "os" "os/exec" "path/filepath" + "strconv" + "strings" "testing" "github.com/pkg/errors" @@ -31,6 +35,15 @@ func SetupTestEnvironmentWithConfig(t *testing.T, tconf *ttypes.TestConfig, flag creEnvironment, dons, err := environment.BuildFromSavedState(t.Context(), cldlogger.NewSingleFileLogger(t), in) require.NoError(t, err, "failed to load environment") + testEnv := &ttypes.TestEnvironment{ + Config: in, + TestConfig: tconf, + Logger: framework.L, + CreEnvironment: creEnvironment, + Dons: dons, + } + ensureMixedModeComponentRelays(t, testEnv) + t.Cleanup(func() { if t.Failed() { framework.L.Warn().Msg("Test failed - checking for panics in Docker 
containers...") @@ -43,13 +56,7 @@ func SetupTestEnvironmentWithConfig(t *testing.T, tconf *ttypes.TestConfig, flag } }) - return &ttypes.TestEnvironment{ - Config: in, - TestConfig: tconf, - Logger: framework.L, - CreEnvironment: creEnvironment, - Dons: dons, - } + return testEnv } func GetDefaultTestConfig(t *testing.T) *ttypes.TestConfig { @@ -124,3 +131,100 @@ func createEnvironmentIfNotExists(ctx context.Context, relativePathToRepoRoot, e return nil } + +func ensureMixedModeComponentRelays(t *testing.T, testEnv *ttypes.TestEnvironment) { + t.Helper() + if testEnv == nil || testEnv.Config == nil || !hasRemoteNodeSets(testEnv.Config) { + return + } + nodeSetTargetsByName := map[string]string{} + for _, nsCfg := range testEnv.Config.NodeSets { + if nsCfg == nil { + continue + } + name := strings.TrimSpace(nsCfg.Name) + if name == "" { + continue + } + nodeSetTargetsByName[name] = strings.TrimSpace(nsCfg.Target) + } + + // Local blockchain endpoints used by remote nodesets. + for idx, bcCfg := range testEnv.Config.Blockchains { + if bcCfg == nil || strings.TrimSpace(string(bcCfg.Target)) != string(envconfig.TargetLocal) { + continue + } + if idx >= len(testEnv.CreEnvironment.Blockchains) || testEnv.CreEnvironment.Blockchains[idx] == nil { + continue + } + for nodeIdx, node := range testEnv.CreEnvironment.Blockchains[idx].CtfOutput().Nodes { + if node == nil { + continue + } + if p, ok := extractPort(node.ExternalHTTPUrl); ok { + EnsureFixtureRelayForPort(t, testEnv, "blockchain-http-"+strconv.Itoa(idx)+"-"+strconv.Itoa(nodeIdx), p) + } + if p, ok := extractPort(node.ExternalWSUrl); ok { + EnsureFixtureRelayForPort(t, testEnv, "blockchain-ws-"+strconv.Itoa(idx)+"-"+strconv.Itoa(nodeIdx), p) + } + } + } + + // Local JD endpoints used by remote nodesets. 
+ if testEnv.Config.JD != nil && strings.TrimSpace(string(testEnv.Config.JD.Target)) == string(envconfig.TargetLocal) && testEnv.Config.JD.Out != nil { + if p, ok := extractPort(testEnv.Config.JD.Out.ExternalGRPCUrl); ok { + EnsureFixtureRelayForPort(t, testEnv, "jd-grpc", p) + } + if p, ok := extractPort(testEnv.Config.JD.Out.ExternalWSRPCUrl); ok { + EnsureFixtureRelayForPort(t, testEnv, "jd-wsrpc", p) + } + } + + // Local gateway incoming ports used by remote workflow nodesets. + if testEnv.Dons != nil && testEnv.Dons.GatewayConnectors != nil { + for _, cfg := range testEnv.Dons.GatewayConnectors.Configurations { + if cfg == nil || cfg.GatewayConfiguration == nil { + continue + } + node, found := testEnv.Dons.NodeWithUUID(cfg.NodeUUID) + if !found || node == nil || node.DON == nil { + continue + } + donName := strings.TrimSpace(node.DON.Name) + target := nodeSetTargetsByName[donName] + if target != string(envconfig.TargetLocal) { + continue + } + if cfg.Incoming.ExternalPort > 0 { + EnsureFixtureRelayForPort(t, testEnv, "gateway-"+cfg.AuthGatewayID, cfg.Incoming.ExternalPort) + } + } + } +} + +func extractPort(raw string) (int, bool) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return 0, false + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil || parsed.Port() == "" { + return 0, false + } + port, convErr := strconv.Atoi(parsed.Port()) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true + } + _, portRaw, err := net.SplitHostPort(trimmed) + if err != nil { + return 0, false + } + port, convErr := strconv.Atoi(portRaw) + if convErr != nil || port <= 0 || port > 65535 { + return 0, false + } + return port, true +} From 96ac85d2d9817422386c000d7263c3bed318fac1 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 23 Feb 2026 18:23:08 +0100 Subject: [PATCH 10/34] better cleanup --- .../environment/environment/environment.go | 59 +++++---- 
.../environment/environment/remote_state.go | 23 ++-- .../lib/cre/environment/agent/server.go | 113 +++++++++++++++++- .../lib/cre/environment/environment.go | 8 +- .../lib/cre/environment/remote_stop.go | 44 +++++++ 5 files changed, 208 insertions(+), 39 deletions(-) diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 8a52de030d8..66aa356ed8b 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -701,9 +701,9 @@ func stopAllCmd() *cobra.Command { func stopRemoteCmd() *cobra.Command { var dryRunFlag bool cmd := &cobra.Command{ - Use: "stop-remote", - Short: "Stops remote components only", - Long: `Stops remote CRE components through the agent without performing any local cleanup.`, + Use: "stop-remote", + Short: "Stops remote components only", + Long: `Stops remote CRE components through the agent without performing any local cleanup.`, Example: strings.TrimSpace(` go run . env stop-remote go run . env stop-remote --dry-run @@ -779,14 +779,32 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe Int("missing", summary.Missing). Int("failed", summary.Failed). Msg("Remote component stop summary") + if summary.ResidualQueryError != "" { + framework.L.Warn().Msgf("failed to query remote residual CTF resources: %s", summary.ResidualQueryError) + } else { + framework.L.Info(). + Int("containers", len(summary.ResidualContainers)). + Int("volumes", len(summary.ResidualVolumes)). 
+ Msg("Remote residual CTF resources after stop") + if len(summary.ResidualContainers) > 0 { + framework.L.Warn().Msgf("residual remote CTF containers: %s", strings.Join(summary.ResidualContainers, ", ")) + } + if len(summary.ResidualVolumes) > 0 { + framework.L.Warn().Msgf("residual remote CTF volumes: %s", strings.Join(summary.ResidualVolumes, ", ")) + } + } if stopRemoteErr != nil { return errors.Wrap(stopRemoteErr, "failed to stop one or more remote components") } if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to stop relay supervisor after remote stop") + } else { + framework.L.Info().Msg("stopped local relay supervisor after remote stop") } if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") + } else { + framework.L.Info().Msgf("removed remote state directory: %s", filepath.Join(relativePathToRepoRoot, remoteStateDirname)) } if !hasLocalComponents(targets) { statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) @@ -934,9 +952,6 @@ func StartCLIEnvironment( ) (*creenv.SetupOutput, error) { testLogger := framework.L relaySupervisorStarted := false - defer func() { - _ = os.Unsetenv("CRE_USE_PERSISTENT_RELAY_SUPERVISOR") - }() // unset DockerFilePath and DockerContext as we cannot use them with existing images if withPluginsDockerImageFlag != "" { @@ -962,21 +977,22 @@ func StartCLIEnvironment( singleFileLogger := cldlogger.NewSingleFileLogger(nil) universalSetupInput := &creenv.SetupInput{ - NodeSets: in.NodeSets, - Blockchains: in.Blockchains, - ContractVersions: env.ContractVersions(), - WithV2Registries: env.WithV2Registries(), - JdInput: in.JD, - Provider: *in.Infra, - S3ProviderInput: in.S3ProviderInput, - CapabilityConfigs: in.CapabilityConfigs, - CopyCapabilityBinaries: withPluginsDockerImageFlag == "", // do not copy any binaries to the containers, if we are using plugins 
image (they already have them) - Capabilities: capabilities, - JobSpecFactoryFunctions: extraJobSpecFunctions, - StageGen: initLocalCREStageGen(in), - Features: features, - GatewayWhitelistConfig: gatewayWhitelistConfig, - BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), + NodeSets: in.NodeSets, + Blockchains: in.Blockchains, + ContractVersions: env.ContractVersions(), + WithV2Registries: env.WithV2Registries(), + JdInput: in.JD, + Provider: *in.Infra, + S3ProviderInput: in.S3ProviderInput, + CapabilityConfigs: in.CapabilityConfigs, + CopyCapabilityBinaries: withPluginsDockerImageFlag == "", // do not copy any binaries to the containers, if we are using plugins image (they already have them) + Capabilities: capabilities, + JobSpecFactoryFunctions: extraJobSpecFunctions, + StageGen: initLocalCREStageGen(in), + Features: features, + GatewayWhitelistConfig: gatewayWhitelistConfig, + BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), + UsePersistentRelaySupervisor: true, PreDONsStartHook: func(context.Context) error { if relaySupervisorStarted { return nil @@ -987,7 +1003,6 @@ func StartCLIEnvironment( } if started { relaySupervisorStarted = true - _ = os.Setenv("CRE_USE_PERSISTENT_RELAY_SUPERVISOR", "true") } return nil }, diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 5a6cf8d8dc9..a6ee736c99e 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -19,12 +19,12 @@ const ( ) type remoteAgentState struct { - Mode string `toml:"mode,omitempty"` - LocalURL string `toml:"local_url,omitempty"` - EC2URL string `toml:"ec2_url,omitempty"` + Mode string `toml:"mode,omitempty"` + LocalURL string `toml:"local_url,omitempty"` + EC2URL string `toml:"ec2_url,omitempty"` EC2InstanceID string `toml:"ec2_instance_id,omitempty"` - EC2AgentPort string 
`toml:"ec2_agent_port,omitempty"` - AWSProfile string `toml:"aws_profile,omitempty"` + EC2AgentPort string `toml:"ec2_agent_port,omitempty"` + AWSProfile string `toml:"aws_profile,omitempty"` } type remoteAgentStateEnvelope struct { @@ -119,16 +119,13 @@ func firstNonEmpty(values ...string) string { } func removeRemoteStopConfig(relativePathToRepoRoot string) error { - for _, path := range []string{ - remoteStateFileAbsPath(relativePathToRepoRoot), - remoteAgentFileAbsPath(relativePathToRepoRoot), - } { - err := os.Remove(path) - if err == nil || os.IsNotExist(err) { - continue - } + stateDir, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, remoteStateDirname)) + if err != nil { return err } + if removeErr := os.RemoveAll(stateDir); removeErr != nil { + return removeErr + } return nil } diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/agent/server.go index 274dc4779f5..5f9f9401c2c 100644 --- a/system-tests/lib/cre/environment/agent/server.go +++ b/system-tests/lib/cre/environment/agent/server.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "encoding/hex" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -21,6 +22,8 @@ import ( "github.com/docker/docker/api/types/container" dockerevents "github.com/docker/docker/api/types/events" "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/api/types/mount" + "github.com/docker/docker/api/types/volume" dockerclient "github.com/docker/docker/client" "github.com/rs/zerolog" @@ -96,6 +99,11 @@ type StartComponentResponse struct { Error string `json:"error,omitempty"` } +type CTFResourcesResponse struct { + Containers []string `json:"containers,omitempty"` + Volumes []string `json:"volumes,omitempty"` +} + type Server struct { lggr zerolog.Logger deployers map[blockchain.ChainFamily]blockchains.Deployer @@ -134,6 +142,7 @@ func (s *Server) Handler() http.Handler { mux.HandleFunc("/v1/relay/open", s.openRelay) mux.HandleFunc("/v1/relay/close", 
s.closeRelay) mux.HandleFunc("/v1/relay/connect", s.connectRelay) + mux.HandleFunc("/v1/resources/ctf", s.listCTFResources) return mux } @@ -142,6 +151,60 @@ func (s *Server) health(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte("ok")) } +func (s *Server) listCTFResources(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + client, err := dockerclient.NewClientWithOpts(dockerclient.WithAPIVersionNegotiation()) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to create docker client: %v", err), nil) + return + } + defer client.Close() + + filterArgs := filters.NewArgs(filters.Arg("label", "framework=ctf")) + containers, err := client.ContainerList(r.Context(), container.ListOptions{ + All: true, + Filters: filterArgs, + }) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to list ctf containers: %v", err), nil) + return + } + containerNames := make([]string, 0, len(containers)) + for _, c := range containers { + if len(c.Names) > 0 { + containerNames = append(containerNames, strings.TrimPrefix(c.Names[0], "/")) + continue + } + containerNames = append(containerNames, c.ID) + } + slices.Sort(containerNames) + + volResp, err := client.VolumeList(r.Context(), volume.ListOptions{ + Filters: filterArgs, + }) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to list ctf volumes: %v", err), nil) + return + } + volumeNames := make([]string, 0, len(volResp.Volumes)) + for _, v := range volResp.Volumes { + if v == nil || strings.TrimSpace(v.Name) == "" { + continue + } + volumeNames = append(volumeNames, v.Name) + } + slices.Sort(volumeNames) + + s.respondJSONAny(w, http.StatusOK, CTFResourcesResponse{ + Containers: containerNames, + 
Volumes: volumeNames, + }) +} + func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) @@ -625,15 +688,63 @@ func stopContainers(ctx context.Context, ids []string) error { } defer client.Close() + namedVolumes, err := discoverNamedVolumesForContainers(ctx, client, ids) + if err != nil { + return err + } + for i := len(ids) - 1; i >= 0; i-- { - err := client.ContainerRemove(ctx, ids[i], container.RemoveOptions{Force: true}) + err := client.ContainerRemove(ctx, ids[i], container.RemoveOptions{ + Force: true, + RemoveVolumes: true, + }) if err != nil && !cerrdefs.IsNotFound(err) { return fmt.Errorf("failed to remove container %s: %w", ids[i], err) } } + + var removeVolumeErrors []error + for _, volumeName := range namedVolumes { + err := client.VolumeRemove(ctx, volumeName, true) + if err != nil && !cerrdefs.IsNotFound(err) { + removeVolumeErrors = append(removeVolumeErrors, fmt.Errorf("remove volume %s: %w", volumeName, err)) + } + } + if len(removeVolumeErrors) > 0 { + return fmt.Errorf("failed to remove one or more named volumes: %w", errors.Join(removeVolumeErrors...)) + } return nil } +func discoverNamedVolumesForContainers(ctx context.Context, client *dockerclient.Client, ids []string) ([]string, error) { + volumes := make(map[string]struct{}) + for _, id := range ids { + inspect, err := client.ContainerInspect(ctx, id) + if err != nil { + if cerrdefs.IsNotFound(err) { + continue + } + return nil, fmt.Errorf("inspect container %s before removal: %w", id, err) + } + for _, mountPoint := range inspect.Mounts { + if mountPoint.Type != mount.TypeVolume { + continue + } + name := strings.TrimSpace(mountPoint.Name) + if name == "" { + continue + } + volumes[name] = struct{}{} + } + } + out := make([]string, 0, len(volumes)) + for name := range volumes { + out = append(out, name) + } + slices.Sort(out) + return out, nil 
+} + func hashPayload(payload []byte) string { sum := sha256.Sum256(payload) return hex.EncodeToString(sum[:]) diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index bd25cbc8a5c..9031856eae4 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -48,8 +48,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) -const envUsePersistentRelaySupervisor = "CRE_USE_PERSISTENT_RELAY_SUPERVISOR" - type SetupOutput struct { WorkflowRegistryConfigurationOutput *cre.WorkflowRegistryOutput CreEnvironment *cre.Environment @@ -118,6 +116,10 @@ type SetupInput struct { // Optional hook executed after local dependencies are started (including JD), // and right before DON containers are started. PreDONsStartHook func(ctx context.Context) error + + // When true, SetupTestEnvironment skips the in-process relay manager + // because a persistent external relay supervisor owns mixed component relays. 
+ UsePersistentRelaySupervisor bool } func (s *SetupInput) Validate() error { @@ -184,7 +186,7 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(tmErr, "failed to initialize tunnel manager") } var relayManager *componentRelayManager - if !strings.EqualFold(strings.TrimSpace(os.Getenv(envUsePersistentRelaySupervisor)), "true") { + if !input.UsePersistentRelaySupervisor { rm, rmErr := newComponentRelayManager(testLogger) if rmErr != nil && nodeSetPlacement.HasRemoteTargets { return nil, pkgerrors.Wrap(rmErr, "failed to initialize relay manager") diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index 9c67d81dcc1..a05eabd99ef 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -5,6 +5,8 @@ import ( "encoding/json" "errors" "fmt" + "io" + "net/http" "strings" pkgerrors "github.com/pkg/errors" @@ -13,6 +15,7 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) type RemoteStopSummary struct { @@ -20,6 +23,10 @@ type RemoteStopSummary struct { Stopped int Missing int Failed int + + ResidualContainers []string + ResidualVolumes []string + ResidualQueryError string } // StopRemoteComponents sends StopComponent operations for all remote-targeted components. @@ -109,6 +116,14 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
} } + containers, volumes, listErr := listRemoteCTFResources(ctx, lggr, tunnelManager) + if listErr != nil { + summary.ResidualQueryError = listErr.Error() + } else { + summary.ResidualContainers = containers + summary.ResidualVolumes = volumes + } + return summary, joined } @@ -173,3 +188,32 @@ func stopRemoteComponent( return response, nil } + +func listRemoteCTFResources( + ctx context.Context, + lggr zerolog.Logger, + tunnelManager tunnel.Manager, +) ([]string, []string, error) { + baseURL, err := resolveEC2AgentBaseURL(lggr, tunnelManager) + if err != nil { + return nil, nil, pkgerrors.Wrap(err, "resolve agent base url for ctf resource query") + } + req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimRight(baseURL, "/")+"/v1/resources/ctf", nil) + if err != nil { + return nil, nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, nil, err + } + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, nil, fmt.Errorf("ctf resource query failed: status %s body %s", resp.Status, strings.TrimSpace(string(body))) + } + var out agent.CTFResourcesResponse + if err := json.Unmarshal(body, &out); err != nil { + return nil, nil, err + } + return out.Containers, out.Volumes, nil +} From bf78874fea5cd5afbdb9954664636670ad773640 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 23 Feb 2026 18:58:41 +0100 Subject: [PATCH 11/34] simplify relayer --- .../environment/environment/environment.go | 31 +- .../environment/environment/remote_state.go | 6 +- .../lib/cre/environment/component_relay.go | 315 ------------------ .../lib/cre/environment/environment.go | 128 +------ 4 files changed, 21 insertions(+), 459 deletions(-) delete mode 100644 system-tests/lib/cre/environment/component_relay.go diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 66aa356ed8b..9eec9e772f3 
100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -977,22 +977,21 @@ func StartCLIEnvironment( singleFileLogger := cldlogger.NewSingleFileLogger(nil) universalSetupInput := &creenv.SetupInput{ - NodeSets: in.NodeSets, - Blockchains: in.Blockchains, - ContractVersions: env.ContractVersions(), - WithV2Registries: env.WithV2Registries(), - JdInput: in.JD, - Provider: *in.Infra, - S3ProviderInput: in.S3ProviderInput, - CapabilityConfigs: in.CapabilityConfigs, - CopyCapabilityBinaries: withPluginsDockerImageFlag == "", // do not copy any binaries to the containers, if we are using plugins image (they already have them) - Capabilities: capabilities, - JobSpecFactoryFunctions: extraJobSpecFunctions, - StageGen: initLocalCREStageGen(in), - Features: features, - GatewayWhitelistConfig: gatewayWhitelistConfig, - BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), - UsePersistentRelaySupervisor: true, + NodeSets: in.NodeSets, + Blockchains: in.Blockchains, + ContractVersions: env.ContractVersions(), + WithV2Registries: env.WithV2Registries(), + JdInput: in.JD, + Provider: *in.Infra, + S3ProviderInput: in.S3ProviderInput, + CapabilityConfigs: in.CapabilityConfigs, + CopyCapabilityBinaries: withPluginsDockerImageFlag == "", // do not copy any binaries to the containers, if we are using plugins image (they already have them) + Capabilities: capabilities, + JobSpecFactoryFunctions: extraJobSpecFunctions, + StageGen: initLocalCREStageGen(in), + Features: features, + GatewayWhitelistConfig: gatewayWhitelistConfig, + BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), PreDONsStartHook: func(context.Context) error { if relaySupervisorStarted { return nil diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index a6ee736c99e..339d9e5ae35 100644 --- 
a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -45,8 +45,12 @@ func remoteStateFileExists(relativePathToRepoRoot string) bool { } func loadRemoteStopConfig(relativePathToRepoRoot string) (*envconfig.Config, error) { + data, err := os.ReadFile(remoteStateFileAbsPath(relativePathToRepoRoot)) + if err != nil { + return nil, err + } cfg := &envconfig.Config{} - if err := cfg.Load(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { + if err := toml.Unmarshal(data, cfg); err != nil { return nil, err } return cfg, nil diff --git a/system-tests/lib/cre/environment/component_relay.go b/system-tests/lib/cre/environment/component_relay.go deleted file mode 100644 index 4196b372350..00000000000 --- a/system-tests/lib/cre/environment/component_relay.go +++ /dev/null @@ -1,315 +0,0 @@ -package environment - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "net" - "net/http" - "net/url" - "os" - "strconv" - "strings" - "sync" - "time" - - "github.com/gorilla/websocket" - "github.com/rs/zerolog" - - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" -) - -type componentRelayManager struct { - lggr zerolog.Logger - baseURL string - - mu sync.Mutex - handles map[string]*componentRelayHandle -} - -type componentRelayHandle struct { - relayID string - cancel context.CancelFunc -} - -type relayOpenResponse struct { - RelayID string `json:"relayId"` -} - -func newComponentRelayManager(lggr zerolog.Logger) (*componentRelayManager, error) { - baseURL, err := resolveAgentBaseURLForRelay() - if err != nil { - return nil, err - } - return &componentRelayManager{ - lggr: lggr, - baseURL: baseURL, - handles: make(map[string]*componentRelayHandle), - }, nil -} - -func (m *componentRelayManager) EnsurePort(ctx context.Context, relayName string, localPort int) error { - if m == nil || localPort <= 0 { - return nil - } - // Deduplicate by port. 
HTTP and WS for the same endpoint can share one listener. - key := strconv.Itoa(localPort) - - m.mu.Lock() - if _, ok := m.handles[key]; ok { - m.mu.Unlock() - return nil - } - m.mu.Unlock() - - relayID, err := openRelay(ctx, m.baseURL, relayName, localPort) - if err != nil { - return err - } - - workerCtx, cancel := context.WithCancel(context.Background()) - localAddr := net.JoinHostPort("127.0.0.1", strconv.Itoa(localPort)) - for i := 0; i < 4; i++ { - go relayWorker(workerCtx, m.baseURL, relayID, localAddr) - } - - m.mu.Lock() - m.handles[key] = &componentRelayHandle{relayID: relayID, cancel: cancel} - m.mu.Unlock() - m.lggr.Info().Str("relayName", relayName).Int("port", localPort).Msg("ensured mixed component relay") - return nil -} - -func (m *componentRelayManager) Close(ctx context.Context) error { - if m == nil { - return nil - } - m.mu.Lock() - handles := make([]*componentRelayHandle, 0, len(m.handles)) - for _, h := range m.handles { - handles = append(handles, h) - } - m.handles = map[string]*componentRelayHandle{} - m.mu.Unlock() - - var firstErr error - for _, h := range handles { - h.cancel() - if err := closeRelay(ctx, m.baseURL, h.relayID); err != nil && firstErr == nil { - firstErr = err - } - } - return firstErr -} - -func resolveAgentBaseURLForRelay() (string, error) { - if v := strings.TrimSpace(os.Getenv(envEC2AgentURL)); v != "" { - return v, nil - } - if strings.EqualFold(strings.TrimSpace(os.Getenv(envAgentMode)), "ec2") && runtimecfg.IsDirectMode() { - hostIP, err := runtimecfg.DirectHostIP() - if err != nil { - return "", err - } - port, err := resolveEC2AgentPort() - if err != nil { - return "", err - } - return fmt.Sprintf("http://%s:%d", hostIP, port), nil - } - if v := strings.TrimSpace(os.Getenv(envLocalAgentURL)); v != "" { - return v, nil - } - return "", fmt.Errorf("cannot resolve agent base URL for relay; set %s (or use direct mode with EC2 host resolution)", envEC2AgentURL) -} - -func openRelay(ctx context.Context, baseURL, name 
string, requestedPort int) (string, error) { - body, _ := json.Marshal(map[string]any{"name": name, "requestedPort": requestedPort}) - req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/open", bytes.NewReader(body)) - if err != nil { - return "", err - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return "", err - } - defer resp.Body.Close() - respBody, _ := io.ReadAll(resp.Body) - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return "", fmt.Errorf("open relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) - } - - var out relayOpenResponse - if err := json.Unmarshal(respBody, &out); err != nil { - return "", err - } - if strings.TrimSpace(out.RelayID) == "" { - return "", fmt.Errorf("open relay returned empty relayId") - } - return out.RelayID, nil -} - -func closeRelay(ctx context.Context, baseURL, relayID string) error { - body, _ := json.Marshal(map[string]any{"relayId": relayID}) - req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/close", bytes.NewReader(body)) - if err != nil { - return err - } - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return err - } - defer resp.Body.Close() - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - respBody, _ := io.ReadAll(resp.Body) - return fmt.Errorf("close relay failed: status %s body %s", resp.Status, strings.TrimSpace(string(respBody))) - } - return nil -} - -func relayWorker(ctx context.Context, baseURL, relayID, localAddr string) { - backoff := 250 * time.Millisecond - for { - select { - case <-ctx.Done(): - return - default: - } - - wsURL, err := relayConnectWSURL(baseURL, relayID) - if err != nil { - time.Sleep(backoff) - continue - } - ws, _, err := websocket.DefaultDialer.Dial(wsURL, nil) - if err != nil { - 
time.Sleep(backoff) - continue - } - _ = bridgeRelayStream(ctx, ws, localAddr) - _ = ws.Close() - if backoff < 2*time.Second { - backoff *= 2 - } - } -} - -func relayConnectWSURL(baseURL, relayID string) (string, error) { - u, err := url.Parse(strings.TrimRight(baseURL, "/")) - if err != nil { - return "", err - } - switch u.Scheme { - case "http": - u.Scheme = "ws" - case "https": - u.Scheme = "wss" - default: - return "", fmt.Errorf("unsupported agent url scheme: %s", u.Scheme) - } - u.Path = "/v1/relay/connect" - q := u.Query() - q.Set("relayId", relayID) - u.RawQuery = q.Encode() - return u.String(), nil -} - -func bridgeRelayStream(ctx context.Context, ws *websocket.Conn, localAddr string) error { - errCh := make(chan error, 2) - localReady := make(chan net.Conn, 1) - var localConn net.Conn - var localConnMu sync.Mutex - getLocalConn := func() net.Conn { - localConnMu.Lock() - defer localConnMu.Unlock() - return localConn - } - setLocalConn := func(conn net.Conn) { - localConnMu.Lock() - localConn = conn - localConnMu.Unlock() - } - ensureLocalConn := func() (net.Conn, error) { - if existing := getLocalConn(); existing != nil { - return existing, nil - } - conn, err := net.DialTimeout("tcp", localAddr, 2*time.Second) - if err != nil { - return nil, err - } - setLocalConn(conn) - select { - case localReady <- conn: - default: - } - return conn, nil - } - defer func() { - if conn := getLocalConn(); conn != nil { - _ = conn.Close() - } - }() - go func() { - var conn net.Conn - select { - case conn = <-localReady: - case <-ctx.Done(): - errCh <- ctx.Err() - return - } - if conn == nil { - errCh <- fmt.Errorf("local relay connection was nil") - return - } - buf := make([]byte, 32*1024) - for { - n, err := conn.Read(buf) - if n > 0 { - if wErr := ws.WriteMessage(websocket.BinaryMessage, buf[:n]); wErr != nil { - errCh <- wErr - return - } - } - if err != nil { - errCh <- err - return - } - } - }() - go func() { - for { - msgType, payload, err := ws.ReadMessage() - 
if err != nil { - errCh <- err - return - } - if msgType != websocket.BinaryMessage && msgType != websocket.TextMessage { - continue - } - if len(payload) == 0 { - continue - } - conn, dialErr := ensureLocalConn() - if dialErr != nil { - errCh <- dialErr - return - } - if _, wErr := conn.Write(payload); wErr != nil { - errCh <- wErr - return - } - } - }() - select { - case <-ctx.Done(): - return ctx.Err() - case <-errCh: - return nil - } -} diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 9031856eae4..c26f2c73b25 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -5,10 +5,7 @@ import ( "errors" "fmt" "maps" - "net" - "net/url" "os" - "strconv" "strings" "sync" @@ -57,7 +54,6 @@ type SetupOutput struct { GatewayConnectors *cre.GatewayConnectors tunnelManager tunnel.Manager - relayManager *componentRelayManager closeOnce sync.Once closeErr error } @@ -72,9 +68,6 @@ func (s *SetupOutput) Close(ctx context.Context) error { } s.closeOnce.Do(func() { - if s.relayManager != nil { - _ = s.relayManager.Close(ctx) - } s.closeErr = manager.Stop(ctx) }) @@ -116,10 +109,6 @@ type SetupInput struct { // Optional hook executed after local dependencies are started (including JD), // and right before DON containers are started. PreDONsStartHook func(ctx context.Context) error - - // When true, SetupTestEnvironment skips the in-process relay manager - // because a persistent external relay supervisor owns mixed component relays. 
- UsePersistentRelaySupervisor bool } func (s *SetupInput) Validate() error { @@ -185,16 +174,7 @@ func SetupTestEnvironment( if tmErr != nil { return nil, pkgerrors.Wrap(tmErr, "failed to initialize tunnel manager") } - var relayManager *componentRelayManager - if !input.UsePersistentRelaySupervisor { - rm, rmErr := newComponentRelayManager(testLogger) - if rmErr != nil && nodeSetPlacement.HasRemoteTargets { - return nil, pkgerrors.Wrap(rmErr, "failed to initialize relay manager") - } - relayManager = rm - } else { - testLogger.Info().Msg("persistent relay supervisor enabled; skipping in-process relay manager") - } + testLogger.Info().Msg("using persistent relay supervisor for mixed component relays") fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) @@ -210,14 +190,10 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") } cleanupTunnelsOnError := true - cleanupRelaysOnError := true defer func() { if cleanupTunnelsOnError { _ = tunnelManager.Stop(ctx) } - if cleanupRelaysOnError && relayManager != nil { - _ = relayManager.Close(ctx) - } }() creEnvironment := &cre.Environment{ @@ -269,12 +245,6 @@ func SetupTestEnvironment( } fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("DONs configuration prepared in %.2f seconds", input.StageGen.Elapsed().Seconds()))) - if nodeSetPlacement.HasRemoteTargets && relayManager != nil { - if err := ensureMixedRelaysForLocalBlockchains(ctx, relayManager, input.Blockchains, deployedBlockchains.Outputs); err != nil { - return nil, pkgerrors.Wrap(err, "failed to ensure mixed relays for local blockchains") - } - } - fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Applying Features before environment startup"))) var donsCapabilities = make(map[uint64][]keystone_changeset.DONCapabilityWithConfig) var capabilityToOCR3Config = make(map[string]*ocr3.OracleConfig) @@ -304,17 +274,10 @@ func SetupTestEnvironment( 
fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("Applied Features in %.2f seconds", input.StageGen.Elapsed().Seconds()))) - // Start JD first when we need to expose local JD endpoints to remote nodesets. - requireJDRelayBootstrap := nodeSetPlacement.HasRemoteTargets && input.JdInput != nil && input.JdInput.Target == config.TargetLocal startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager, nodeSetPlacement.HasLocalTargets) if jdStartErr != nil { return nil, pkgerrors.Wrap(jdStartErr, "failed to start Job Distributor") } - if requireJDRelayBootstrap && relayManager != nil { - if err := ensureMixedRelaysForLocalJD(ctx, relayManager, startedJD.JDOutput); err != nil { - return nil, pkgerrors.Wrap(err, "failed to ensure mixed relays for local JD") - } - } if input.PreDONsStartHook != nil { if err := input.PreDONsStartHook(ctx); err != nil { return nil, pkgerrors.Wrap(err, "failed to execute pre-DON startup hook") @@ -517,98 +480,9 @@ func SetupTestEnvironment( S3ProviderOutput: s3Output, GatewayConnectors: topology.GatewayConnectors, tunnelManager: tunnelManager, - relayManager: relayManager, }, nil } -func ensureMixedRelaysForLocalBlockchains( - ctx context.Context, - relayManager *componentRelayManager, - configuredBlockchains []*config.Blockchain, - deployedBlockchains []blockchains.Blockchain, -) error { - attempted := 0 - for idx, configured := range configuredBlockchains { - if configured == nil || configured.Target != config.TargetLocal { - continue - } - if idx >= len(deployedBlockchains) || deployedBlockchains[idx] == nil { - continue - } - for nodeIdx, node := range deployedBlockchains[idx].CtfOutput().Nodes { - if node == nil { - continue - } - if p, ok := extractEndpointPort(node.ExternalHTTPUrl); ok { - attempted++ - if err := relayManager.EnsurePort(ctx, fmt.Sprintf("blockchain-http-%d-%d", idx, nodeIdx), p); err != nil { - return err - } - } - if p, ok := extractEndpointPort(node.ExternalWSUrl); ok { 
- attempted++ - if err := relayManager.EnsurePort(ctx, fmt.Sprintf("blockchain-ws-%d-%d", idx, nodeIdx), p); err != nil { - return err - } - } - } - } - if attempted == 0 { - relayManager.lggr.Warn().Msg("no local blockchain relay ports were detected; mixed remote nodesets may not reach local blockchains") - } - return nil -} - -func ensureMixedRelaysForLocalJD(ctx context.Context, relayManager *componentRelayManager, jdOutput *jd.Output) error { - if jdOutput == nil { - return nil - } - attempted := 0 - if p, ok := extractEndpointPort(jdOutput.ExternalGRPCUrl); ok { - attempted++ - if err := relayManager.EnsurePort(ctx, "jd-grpc", p); err != nil { - return err - } - } - if p, ok := extractEndpointPort(jdOutput.ExternalWSRPCUrl); ok { - attempted++ - if err := relayManager.EnsurePort(ctx, "jd-wsrpc", p); err != nil { - return err - } - } - if attempted == 0 { - relayManager.lggr.Warn().Msg("no local JD relay ports were detected; mixed remote nodesets may not reach local JD") - } - return nil -} - -func extractEndpointPort(raw string) (int, bool) { - trimmed := strings.TrimSpace(raw) - if trimmed == "" { - return 0, false - } - if strings.Contains(trimmed, "://") { - parsed, err := url.Parse(trimmed) - if err != nil || parsed.Port() == "" { - return 0, false - } - port, convErr := strconv.Atoi(parsed.Port()) - if convErr != nil || port <= 0 || port > 65535 { - return 0, false - } - return port, true - } - _, portRaw, err := net.SplitHostPort(trimmed) - if err != nil { - return 0, false - } - port, convErr := strconv.Atoi(portRaw) - if convErr != nil || port <= 0 || port > 65535 { - return 0, false - } - return port, true -} - func blockchainTargetsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { bySelector := make(map[uint64]string, len(deployed)) for idx, blockchainCfg := range configured { From 0d86834657ceaa7671b83336f0444f27c0c27ad7 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 23 Feb 2026 20:28:09 +0100 
Subject: [PATCH 12/34] rename target to placement, add mixed mode for nodesets --- .../configs/workflow-gateway-don-mixed.toml | 43 ++++++-- .../configs/workflow-gateway-don-remote.toml | 10 +- .../environment/environment/environment.go | 14 +-- .../environment/relay_supervisor.go | 39 +++++-- .../environment/environment/remote_state.go | 6 +- .../cre/environment/environment/workflow.go | 4 +- system-tests/lib/cre/bootstrap_peer.go | 101 ++++++++++++++++++ system-tests/lib/cre/don.go | 81 ++++++++++++-- system-tests/lib/cre/don/config/config.go | 54 ++++++++-- system-tests/lib/cre/don_jd_placement_test.go | 37 +++++++ .../lib/cre/environment/blockchain_start.go | 16 +-- .../lib/cre/environment/config/config.go | 58 +++++----- system-tests/lib/cre/environment/dons.go | 4 +- .../lib/cre/environment/environment.go | 30 +++--- .../environment/environment_placement_test.go | 22 ++++ system-tests/lib/cre/environment/jobs.go | 60 ++--------- system-tests/lib/cre/environment/jobs_test.go | 7 +- .../lib/cre/environment/remote_stop.go | 12 +-- .../cre/features/consensus/v1/consensus.go | 7 +- .../cre/features/consensus/v2/consensus.go | 23 ++-- .../lib/cre/features/don_time/don_time.go | 7 +- system-tests/lib/cre/features/evm/v2/evm.go | 7 +- system-tests/lib/cre/features/vault/vault.go | 6 +- system-tests/lib/cre/sharding/sharding.go | 14 +-- system-tests/lib/cre/types.go | 11 +- system-tests/lib/cre/workflow/registry.go | 4 +- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 3 +- .../tests/smoke/cre/v2_grpc_source_test.go | 2 +- .../tests/smoke/cre/v2_vault_don_test.go | 2 +- .../tests/test-helpers/before_suite.go | 8 +- .../test-helpers/fixture_relay_helpers.go | 2 +- system-tests/tests/test-helpers/t_helpers.go | 2 +- 32 files changed, 484 insertions(+), 212 deletions(-) create mode 100644 system-tests/lib/cre/bootstrap_peer.go create mode 100644 system-tests/lib/cre/don_jd_placement_test.go create mode 100644 system-tests/lib/cre/environment/environment_placement_test.go 
diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml index a9e04343326..1f705bfa7bd 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml @@ -3,14 +3,14 @@ type = "anvil" chain_id = "1337" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - target = "remote" + placement = "remote" [[blockchains]] type = "anvil" chain_id = "2337" port = "8546" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - target = "remote" + placement = "remote" container_name = "anvil-2337" remote_start_policy = "always" @@ -18,7 +18,7 @@ csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string # change to your version image = "job-distributor:0.22.1" - target = "local" + placement = "remote" # we need fresh DB on each run to avoid DB-level job name uniquness violations remote_start_policy = "always" @@ -43,16 +43,46 @@ don_types = ["workflow"] override_mode = "all" http_port_range_start = 10100 - target = "remote" + placement = "local" env_vars = { CL_EVM_CMD = "" } - capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] + capabilities = ["ocr3", "custom-compute", "web-api-trigger", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "evm-1337", "read-contract-1337"] [nodesets.db] image = "postgres:12.0" port = 13000 [[nodesets.node_specs]] + roles = ["plugin"] + [nodesets.node_specs.node] + #docker_ctx = "../../../.." 
+ #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-tmp:latest" + user_config_overrides = "" + +[[nodesets]] + nodes = 4 + name = "capabilities" + don_types = ["capabilities"] + exposes_remote_capabilities = true + override_mode = "all" + http_port_range_start = 10200 + placement = "remote" + remote_start_policy = "always" + + # we need to have chain 1337 configured (even if no capability uses it), because we use node addresses on chain 1337 + # to identify nodes in the gateway configuration (required by both web-api-target and vault capabilities) + supported_evm_chains = [1337, 2337] + + env_vars = { CL_EVM_CMD = "" } + capabilities = ["web-api-target", "vault", "write-evm-2337", "read-contract-2337", "evm-2337"] + + [nodesets.db] + image = "postgres:12.0" + port = 13100 + + [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] #docker_ctx = "../../../.." @@ -67,7 +97,8 @@ don_types = ["bootstrap", "gateway"] override_mode = "each" http_port_range_start = 10300 - target = "remote" + placement = "remote" + remote_start_policy = "always" env_vars = { CL_EVM_CMD = "" } supported_evm_chains = [1337, 2337] diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml index f482477aa42..cbddfc6b7ab 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml @@ -3,14 +3,14 @@ type = "anvil" chain_id = "1337" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - target = "remote" + placement = "remote" [[blockchains]] type = "anvil" chain_id = "2337" port = "8546" docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - target = "remote" + placement = "remote" container_name = "anvil-2337" remote_start_policy = "always" @@ -18,7 +18,7 @@ csa_encryption_key = 
"d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string # change to your version image = "job-distributor:0.22.1" - target = "remote" + placement = "remote" # we need fresh DB on each run to avoid DB-level job name uniquness violations remote_start_policy = "always" @@ -43,7 +43,7 @@ don_types = ["workflow"] override_mode = "all" http_port_range_start = 10100 - target = "remote" + placement = "remote" env_vars = { CL_EVM_CMD = "" } capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] @@ -67,7 +67,7 @@ don_types = ["bootstrap", "gateway"] override_mode = "each" http_port_range_start = 10300 - target = "remote" + placement = "remote" env_vars = { CL_EVM_CMD = "" } supported_evm_chains = [1337, 2337] diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 9eec9e772f3..db37e91d6da 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -879,16 +879,16 @@ func summarizeRemoteComponents(cfg *envconfig.Config) remoteComponentSummary { return summary } for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && configuredBlockchain.Target == envconfig.TargetRemote { + if configuredBlockchain != nil && configuredBlockchain.Placement == envconfig.PlacementRemote { summary.Blockchains++ } } for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { summary.NodeSets++ } } - if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { + if cfg.JD != nil && cfg.JD.Placement == 
envconfig.PlacementRemote { summary.JD = 1 } summary.Total = summary.Blockchains + summary.NodeSets + summary.JD @@ -900,16 +900,16 @@ func hasLocalComponents(cfg *envconfig.Config) bool { return false } for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && configuredBlockchain.Target != envconfig.TargetRemote { + if configuredBlockchain != nil && configuredBlockchain.Placement != envconfig.PlacementRemote { return true } } for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) != string(envconfig.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) != string(envconfig.PlacementRemote) { return true } } - if cfg.JD != nil && cfg.JD.Target != envconfig.TargetRemote { + if cfg.JD != nil && cfg.JD.Placement != envconfig.PlacementRemote { return true } return false @@ -1352,7 +1352,7 @@ func ensureDockerImagesExist(ctx context.Context, logger zerolog.Logger, in *env } if in.JD != nil { - if in.JD.Target == envconfig.TargetRemote { + if in.JD.Placement == envconfig.PlacementRemote { logger.Info().Msg("Skipping local JD image check for remote JD target") } else if err := ensureDockerImageExists(ctx, logger, in.JD.Image); err != nil { return errors.Wrapf(err, "Job Distributor image '%s' not found. Make sure it exists locally or run 'go run . 
env setup' to pull it and other dependencies that also might be missing", in.JD.Image) diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 0e908c0ed5b..5b45ae340a1 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -30,8 +30,8 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) const ( @@ -80,10 +80,10 @@ type relayOpenResponse struct { } type localBridgeStats struct { - WSMessages uint64 - WSToTCPBytes uint64 - TCPToWSBytes uint64 - LocalDialed bool + WSMessages uint64 + WSToTCPBytes uint64 + TCPToWSBytes uint64 + LocalDialed bool LocalDialFails uint64 } @@ -158,6 +158,7 @@ func maybeStartRelaySupervisor(relativePathToRepoRoot string, cfg *envconfig.Con } return false, nil } + framework.L.Info().Int("relaySpecs", len(specs)).Msgf("starting persistent relay supervisor with specs: %s", relaySpecsCSV(specs)) return true, startRelaySupervisor(relativePathToRepoRoot, specs) } @@ -167,7 +168,7 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { } hasRemoteNodeSets := false for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { hasRemoteNodeSets = true break } @@ -187,7 +188,7 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { specByPort[port] = 
relaySpec{Name: name, Port: port} } for _, blockchainCfg := range cfg.Blockchains { - if blockchainCfg == nil || blockchainCfg.Target != envconfig.TargetLocal { + if blockchainCfg == nil || blockchainCfg.Placement != envconfig.PlacementLocal { continue } if blockchainCfg.Out != nil { @@ -209,7 +210,7 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { } } - if cfg.JD != nil && cfg.JD.Target == envconfig.TargetLocal { + if cfg.JD != nil && cfg.JD.Placement == envconfig.PlacementLocal { if cfg.JD.Out != nil { if p, ok := endpointPort(cfg.JD.Out.ExternalGRPCUrl); ok { addSpec("jd-grpc", p) @@ -232,6 +233,18 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { } } } + for _, nodeSet := range cfg.NodeSets { + if nodeSet == nil || strings.TrimSpace(nodeSet.Placement) != string(envconfig.PlacementLocal) { + continue + } + for _, nodeSpec := range nodeSet.NodeSpecs { + if nodeSpec == nil || !hasBootstrapRole(nodeSpec.Roles) { + continue + } + addSpec("ocr-bootstrap", 5001) + break + } + } specs := make([]relaySpec, 0, len(specByPort)) for _, spec := range specByPort { @@ -303,6 +316,15 @@ func inferLocalJDPortsFromInput(in jd.Input) []int { return out } +func hasBootstrapRole(roles []string) bool { + for _, role := range roles { + if strings.EqualFold(strings.TrimSpace(role), "bootstrap") { + return true + } + } + return false +} + func endpointPort(raw string) (int, bool) { trimmed := strings.TrimSpace(raw) if trimmed == "" { @@ -1108,4 +1130,3 @@ func relayKeepAlive(ctx context.Context, ws *websocket.Conn, writeMu *sync.Mutex } } } - diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 339d9e5ae35..143c9ef6972 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -77,16 +77,16 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) NodeSets: 
[]*cre.NodeSet{}, } for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && configuredBlockchain.Target == envconfig.TargetRemote { + if configuredBlockchain != nil && configuredBlockchain.Placement == envconfig.PlacementRemote { stopCfg.Blockchains = append(stopCfg.Blockchains, configuredBlockchain) } } for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { stopCfg.NodeSets = append(stopCfg.NodeSets, nodeSet) } } - if cfg.JD != nil && cfg.JD.Target == envconfig.TargetRemote { + if cfg.JD != nil && cfg.JD.Placement == envconfig.PlacementRemote { stopCfg.JD = cfg.JD } if err := stopCfg.Store(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { diff --git a/core/scripts/cre/environment/environment/workflow.go b/core/scripts/cre/environment/environment/workflow.go index 50f54bba72f..8fc354af0fe 100644 --- a/core/scripts/cre/environment/environment/workflow.go +++ b/core/scripts/cre/environment/environment/workflow.go @@ -590,7 +590,7 @@ func resolveWorkflowArtifactDeployModeFromState(containerNamePattern, nodeSetNam if cfgNodeSet == nil || cfgNodeSet.Name != nodeSetName { continue } - if cfgNodeSet.Target == string(envconfig.TargetRemote) { + if cfgNodeSet.Placement == string(envconfig.PlacementRemote) { return creworkflow.ArtifactDeployModeRemote, nodeSetName, nil } return creworkflow.ArtifactDeployModeLocal, nodeSetName, nil @@ -600,7 +600,7 @@ func resolveWorkflowArtifactDeployModeFromState(containerNamePattern, nodeSetNam matches := make([]string, 0) for _, cfgNodeSet := range cfg.NodeSets { - if cfgNodeSet == nil || cfgNodeSet.Target != string(envconfig.TargetRemote) { + if cfgNodeSet == nil || cfgNodeSet.Placement != string(envconfig.PlacementRemote) { continue } prefix := ns.NodeNamePrefix(cfgNodeSet.Name) diff --git 
a/system-tests/lib/cre/bootstrap_peer.go b/system-tests/lib/cre/bootstrap_peer.go new file mode 100644 index 00000000000..3c31e2d0a8a --- /dev/null +++ b/system-tests/lib/cre/bootstrap_peer.go @@ -0,0 +1,101 @@ +package cre + +import ( + "fmt" + "net" + "net/url" + "strconv" + "strings" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +func ResolveBootstrapAddress(callerTarget, bootstrapTarget, internalHost string, port int) (string, error) { + if strings.TrimSpace(internalHost) == "" { + return "", fmt.Errorf("bootstrap internal host is empty") + } + if port <= 0 || port > 65535 { + return "", fmt.Errorf("invalid bootstrap port: %d", port) + } + + callerPlacement, err := connectivity.PlacementFromTarget(callerTarget) + if err != nil { + return "", err + } + targetPlacement, err := connectivity.PlacementFromTarget(bootstrapTarget) + if err != nil { + return "", err + } + + internal := net.JoinHostPort(strings.TrimSpace(internalHost), strconv.Itoa(port)) + external, err := resolveBootstrapExternalAddress(targetPlacement, port) + if err != nil { + return "", err + } + + resolved, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: "ocr-bootstrap", + Internal: internal, + External: external, + }) + if err != nil { + return "", err + } + if !resolved.RequiresBridge { + return resolved.URL, nil + } + return rewriteEndpointForRemoteCaller(resolved.URL) +} + +func ResolveBootstrapPeerURL(callerTarget, bootstrapTarget, peerID, internalHost string, port int) (string, error) { + address, err := ResolveBootstrapAddress(callerTarget, bootstrapTarget, internalHost, port) + if err != nil { + return "", err + } + trimmedPeerID := strings.TrimSpace(strings.TrimPrefix(peerID, "p2p_")) + if trimmedPeerID == "" { + return "", fmt.Errorf("bootstrap peerID is empty") 
+ } + return trimmedPeerID + "@" + address, nil +} + +func resolveBootstrapExternalAddress(targetPlacement connectivity.Placement, port int) (string, error) { + if targetPlacement == connectivity.PlacementLocal { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(port)), nil + } + if !runtimecfg.IsDirectMode() { + return "", fmt.Errorf("mixed DON bootstrap resolution requires direct access mode for remote bootstrap targets") + } + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err + } + return net.JoinHostPort(hostIP, strconv.Itoa(port)), nil +} + +func rewriteEndpointForRemoteCaller(raw string) (string, error) { + dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return "", fmt.Errorf("endpoint is empty") + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil { + return "", fmt.Errorf("parse url %q: %w", raw, err) + } + if parsed.Port() != "" { + parsed.Host = net.JoinHostPort(dockerHost, parsed.Port()) + return parsed.String(), nil + } + parsed.Host = dockerHost + return parsed.String(), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil { + return "", fmt.Errorf("parse host:port %q: %w", raw, err) + } + return net.JoinHostPort(dockerHost, port), nil +} diff --git a/system-tests/lib/cre/don.go b/system-tests/lib/cre/don.go index 7a2958587cf..9e34b0f6cfb 100644 --- a/system-tests/lib/cre/don.go +++ b/system-tests/lib/cre/don.go @@ -31,6 +31,7 @@ import ( chainselectors "github.com/smartcontractkit/chain-selectors" "golang.org/x/sync/errgroup" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/don/secrets" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" @@ -43,7 +44,6 @@ const ( 
LabelNodeTypeValuePlugin = "plugin" LabelNodeP2PIDKey = "p2p_id" - ) type Role string @@ -108,6 +108,7 @@ type Don struct { Name string `toml:"name" json:"name"` ID uint64 `toml:"id" json:"id"` F uint8 `toml:"f" json:"f"` // max faulty nodes + Placement string `toml:"placement" json:"placement"` Nodes []*Node `toml:"nodes" json:"nodes"` @@ -230,6 +231,7 @@ func NewDON(ctx context.Context, donMetadata *DonMetadata, ctfNodes []*clnode.Ou Name: donMetadata.Name, ID: donMetadata.ID, Flags: donMetadata.Flags, + Placement: donMetadata.MustNodeSet().Placement, capabilityConfigs: donMetadata.ns.CapabilityConfigs, chainCapabilityIndex: donMetadata.ns.chainCapabilityIndex, } @@ -271,19 +273,38 @@ func NewDON(ctx context.Context, donMetadata *DonMetadata, ctfNodes []*clnode.Ou return don, nil } -func registerWithJD(ctx context.Context, d *Don, supportedChains []blockchains.Blockchain, cldfEnv *cldf.Environment) error { +func registerWithJD( + ctx context.Context, + d *Don, + donMetadata *DonMetadata, + supportedChains []blockchains.Blockchain, + cldfEnv *cldf.Environment, + jdPlacement string, + jdInternalWSRPC string, + jdExternalWSRPC string, +) error { mu := &sync.Mutex{} jd, ok := cldfEnv.Offchain.(*jd.JobDistributor) if !ok { return fmt.Errorf("offchain environment is not a *.jd.JobDistributor, but %T", cldfEnv.Offchain) } + internalWSRPC := strings.TrimSpace(jdInternalWSRPC) + externalWSRPC := strings.TrimSpace(jdExternalWSRPC) + if internalWSRPC == "" && externalWSRPC == "" { + internalWSRPC = jd.WSRPC + externalWSRPC = jd.WSRPC + } errgroup := errgroup.Group{} + nodeFacingJDUri, uriErr := resolveNodeFacingJDUriForDON(donMetadata, jdPlacement, internalWSRPC, externalWSRPC) + if uriErr != nil { + return uriErr + } for idx, node := range d.Nodes { errgroup.Go(func() error { // Set up Job distributor in node and register node with the job distributor - setupErr := node.setUpAndLinkJobDistributor(ctx, cldfEnv) + setupErr := node.setUpAndLinkJobDistributor(ctx, cldfEnv, 
nodeFacingJDUri) if setupErr != nil { return fmt.Errorf("failed to set up job distributor in node %s: %w", node.Name, setupErr) } @@ -649,7 +670,7 @@ func (n *Node) RegisterNodeToJobDistributor(ctx context.Context, cldfEnv *cldf.E // CreateJobDistributor fetches the keypairs from the job distributor and creates the job distributor in the node // and returns the job distributor id -func (n *Node) CreateJobDistributor(ctx context.Context, jd *jd.JobDistributor) (string, error) { +func (n *Node) CreateJobDistributor(ctx context.Context, jd *jd.JobDistributor, jdWSRPC string) (string, error) { // Get the keypairs from the job distributor csaKey, err := jd.GetCSAPublicKey(ctx) if err != nil { @@ -669,14 +690,14 @@ func (n *Node) CreateJobDistributor(ctx context.Context, jd *jd.JobDistributor) } return n.Clients.GQLClient.CreateJobDistributor(ctx, client.JobDistributorInput{ Name: "Job Distributor", - Uri: jd.WSRPC, + Uri: jdWSRPC, PublicKey: csaKey, }) } // setUpAndLinkJobDistributor sets up the job distributor in the node and registers the node with the job distributor // it sets the job distributor id for node -func (n *Node) setUpAndLinkJobDistributor(ctx context.Context, cldfEnv *cldf.Environment) error { +func (n *Node) setUpAndLinkJobDistributor(ctx context.Context, cldfEnv *cldf.Environment, jdWSRPC string) error { err := n.RegisterNodeToJobDistributor(ctx, cldfEnv) if err != nil { return err @@ -688,7 +709,7 @@ func (n *Node) setUpAndLinkJobDistributor(ctx context.Context, cldfEnv *cldf.Env } // now create the job distributor in the node - id, err := n.CreateJobDistributor(ctx, jd) + id, err := n.CreateJobDistributor(ctx, jd, jdWSRPC) if err != nil && !strings.Contains(err.Error(), "DuplicateFeedsManagerError") { return fmt.Errorf("failed to create job distributor in node %s: %w", n.Name, err) @@ -705,12 +726,12 @@ func (n *Node) setUpAndLinkJobDistributor(ctx context.Context, cldfEnv *cldf.Env return fmt.Errorf("no node found for node id %s", 
n.JobDistributorDetails.NodeID) } if !getRes.GetNode().IsConnected { - return retry.RetryableError(fmt.Errorf("node %s not connected to job distributor (jd_uri=%s)", n.Name, jd.WSRPC)) + return retry.RetryableError(fmt.Errorf("node %s not connected to job distributor (jd_uri=%s)", n.Name, jdWSRPC)) } return nil }) if err != nil { - return fmt.Errorf("failed to connect node %s to job distributor (jd_uri=%s): %w", n.Name, jd.WSRPC, err) + return fmt.Errorf("failed to connect node %s to job distributor (jd_uri=%s): %w", n.Name, jdWSRPC, err) } n.JobDistributorDetails.JDID = id return nil @@ -779,7 +800,16 @@ func LinkToJobDistributor(ctx context.Context, input *LinkDonsToJDInput) error { return errors.Wrap(schErr, "failed to find supported chains for DON") } - if err := registerWithJD(ctx, don, supportedChains, input.CldfEnvironment); err != nil { + if err := registerWithJD( + ctx, + don, + input.Topology.DonsMetadata.List()[idx], + supportedChains, + input.CldfEnvironment, + input.JDPlacement, + input.JDInternalWSRPC, + input.JDExternalWSRPC, + ); err != nil { return fmt.Errorf("failed to register DON with JD: %w", err) } nodeIDs = append(nodeIDs, don.JDNodeIDs()...) 
@@ -790,6 +820,37 @@ func LinkToJobDistributor(ctx context.Context, input *LinkDonsToJDInput) error { return nil } +func resolveNodeFacingJDUriForDON(donMetadata *DonMetadata, jdPlacement, internalWSRPC, externalWSRPC string) (string, error) { + if donMetadata == nil { + return "", fmt.Errorf("don metadata is nil") + } + nodeSet := donMetadata.MustNodeSet() + callerPlacement, err := connectivity.PlacementFromTarget(nodeSet.Placement) + if err != nil { + return "", err + } + targetPlacement, err := connectivity.PlacementFromTarget(jdPlacement) + if err != nil { + return "", err + } + resolved, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: "jd-wsrpc", + Internal: strings.TrimSpace(internalWSRPC), + External: strings.TrimSpace(externalWSRPC), + }) + if err != nil { + return "", err + } + if !resolved.RequiresBridge { + return resolved.URL, nil + } + bridgeURL, err := rewriteEndpointForRemoteCaller(resolved.URL) + if err != nil { + return "", err + } + return bridgeURL, nil +} + // copied from flags package to avoid circular dependency func HasFlag(values []string, capability string) bool { if slices.Contains(values, capability) { diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 9dd940c81c8..28c1e5c5597 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -62,6 +62,11 @@ func PrepareNodeTOMLs( if peeringErr != nil { return nil, errors.Wrap(peeringErr, "failed to find peering data") } + ocrBootstrapPlacement, placementErr := resolveBootstrapPlacement(topology, bt.UUID) + if placementErr != nil { + return nil, placementErr + } + framework.L.Info().Str("placement", strings.TrimSpace(ocrBootstrapPlacement)).Str("bootstrapNodeUUID", bt.UUID).Msg("resolved OCR bootstrap placement") localNodeSets := topology.NodeSets() chainPerSelector := make(map[uint64]creblockchains.Blockchain) @@ -112,7 +117,8 @@ func 
PrepareNodeTOMLs( ContractVersions: creEnv.ContractVersions, DonMetadata: donMetadata, Blockchains: chainPerSelector, - BlockchainTargetBySelector: blockchainTargetBySelector, + BlockchainPlacementBySelector: blockchainTargetBySelector, + OCRBootstrapPlacement: ocrBootstrapPlacement, Flags: donMetadata.Flags, CapabilitiesPeeringData: capabilitiesPeeringData, OCRPeeringData: ocrPeeringData, @@ -224,7 +230,7 @@ func generateNodeTomlConfig(input cre.GenerateConfigsInput, nodeConfigTransforme } case cre.WorkerNode: var cErr error - nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, commonInputs, input.DonMetadata, nodeMetadata) + nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, input.OCRBootstrapPlacement, commonInputs, input.DonMetadata, nodeMetadata) if cErr != nil { return nil, errors.Wrapf(cErr, "failed to add worker node config for node at index %d in DON %s", nodeIdx, input.DonMetadata.Name) } @@ -379,11 +385,16 @@ func addWorkerNodeConfig( existingConfig corechainlink.Config, topology *cre.Topology, ocrPeeringData cre.OCRPeeringData, + ocrBootstrapPlacement string, commonInputs *commonInputs, donMetadata *cre.DonMetadata, m *cre.NodeMetadata, ) (corechainlink.Config, error) { - ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{ocrPeeringData.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringData.Port)}) + bootstrapAddress, bootstrapAddressErr := cre.ResolveBootstrapAddress(donMetadata.MustNodeSet().Placement, ocrBootstrapPlacement, ocrPeeringData.OCRBootstraperHost, ocrPeeringData.Port) + if bootstrapAddressErr != nil { + return existingConfig, errors.Wrap(bootstrapAddressErr, "failed to resolve bootstrap address for worker node") + } + ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{bootstrapAddress}) if ocrBErr != nil { return existingConfig, 
errors.Wrap(ocrBErr, "failed to create OCR bootstrapper locator") } @@ -492,7 +503,7 @@ func addWorkerNodeConfig( if !ok { return existingConfig, fmt.Errorf("failed to get EVM key (chainID %d, node index %d)", commonInputs.registryChainID, m.Index) } - callerPlacement, placementErr := connectivity.PlacementFromTarget(donMetadata.MustNodeSet().Target) + callerPlacement, placementErr := connectivity.PlacementFromTarget(donMetadata.MustNodeSet().Placement) if placementErr != nil { return existingConfig, placementErr } @@ -688,7 +699,7 @@ type evmChain struct { func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { evmChains := make([]*evmChain, 0) - callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Target) + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) if err != nil { return nil, err } @@ -702,7 +713,7 @@ func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { continue } - targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainTargetBySelector[chainSelector]) + targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainPlacementBySelector[chainSelector]) if err != nil { return nil, err } @@ -756,7 +767,7 @@ type solanaChain struct { func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { var solChain *solanaChain chainsFound := 0 - callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Target) + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) if err != nil { return nil, err } @@ -772,7 +783,7 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { } solBc := bcOut.(*solana.Blockchain) - targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainTargetBySelector[solBc.ChainSelector()]) + targetPlacement, err := 
connectivity.PlacementFromTarget(input.BlockchainPlacementBySelector[solBc.ChainSelector()]) if err != nil { return nil, err } @@ -816,7 +827,7 @@ func gatewayPlacementByNodeUUID(topology *cre.Topology) (map[string]connectivity return out, nil } for _, don := range topology.DonsMetadata.List() { - placement, err := connectivity.PlacementFromTarget(don.MustNodeSet().Target) + placement, err := connectivity.PlacementFromTarget(don.MustNodeSet().Placement) if err != nil { return nil, err } @@ -830,6 +841,31 @@ func gatewayPlacementByNodeUUID(topology *cre.Topology) (map[string]connectivity return out, nil } +func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { + if topology == nil { + return "", fmt.Errorf("topology is nil") + } + bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) + if bootstrapNodeUUID == "" { + return "", fmt.Errorf("bootstrap node UUID is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != bootstrapNodeUUID { + continue + } + return strings.TrimSpace(don.MustNodeSet().Placement), nil + } + } + return "", fmt.Errorf("failed to resolve bootstrap placement for node UUID %s", bootstrapNodeUUID) +} + func gatewayExternalConnectorURL(gateway *cre.DonGatewayConfiguration) string { if gateway == nil || gateway.GatewayConfiguration == nil { return "" diff --git a/system-tests/lib/cre/don_jd_placement_test.go b/system-tests/lib/cre/don_jd_placement_test.go new file mode 100644 index 00000000000..ca3d15bf780 --- /dev/null +++ b/system-tests/lib/cre/don_jd_placement_test.go @@ -0,0 +1,37 @@ +package cre + +import "testing" + +func TestResolveNodeFacingJDUriForDON_LocalDonToLocalJD_UsesInternal(t *testing.T) { + donMeta := &DonMetadata{ + Name: "workflow", + ns: &NodeSet{ + Placement: "local", + }, + } + + got, err := 
resolveNodeFacingJDUriForDON(donMeta, "local", "jd:8080", "127.0.0.1:8080") + if err != nil { + t.Fatalf("resolveNodeFacingJDUriForDON returned error: %v", err) + } + if got != "jd:8080" { + t.Fatalf("expected internal JD URI jd:8080, got %s", got) + } +} + +func TestResolveNodeFacingJDUriForDON_RemoteDonToLocalJD_RewritesForBridge(t *testing.T) { + donMeta := &DonMetadata{ + Name: "workflow", + ns: &NodeSet{ + Placement: "remote", + }, + } + + got, err := resolveNodeFacingJDUriForDON(donMeta, "local", "jd:8080", "127.0.0.1:8080") + if err != nil { + t.Fatalf("resolveNodeFacingJDUriForDON returned error: %v", err) + } + if got != "host.docker.internal:8080" { + t.Fatalf("expected bridged JD URI host.docker.internal:8080, got %s", got) + } +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 67c0f0227de..239bbe69dd1 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -359,7 +359,7 @@ func startBlockchainsWithTargets( localInputs := make([]*blockchain.Input, 0, len(configuredBlockchains)) remoteIdx := make([]int, 0, len(configuredBlockchains)) for idx, configuredBlockchain := range configuredBlockchains { - if configuredBlockchain.Target == config.TargetRemote { + if configuredBlockchain.Placement == config.PlacementRemote { remoteIdx = append(remoteIdx, idx) continue } @@ -497,7 +497,7 @@ func rewriteRemoteBlockchainOutputForLocalAccess( if err != nil { return err } - return rewriteRemoteBlockchainOutputForDirectAccess(output, hostIP, rewriteInternalForLocalNodes) + return rewriteRemoteBlockchainOutputForDirectAccess(output, hostIP) } componentID := tunnel.CanonicalComponentID(tunnel.KindBlockchain, configuredIndex, input.Type) @@ -533,11 +533,7 @@ func rewriteRemoteBlockchainOutputForLocalAccess( return nil } -func rewriteRemoteBlockchainOutputForDirectAccess( - output *blockchain.Output, - hostIP string, - 
rewriteInternalForLocalNodes bool, -) error { +func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, hostIP string) error { if output == nil { return nil } @@ -551,9 +547,6 @@ func rewriteRemoteBlockchainOutputForDirectAccess( return err } node.ExternalHTTPUrl = rewritten - if rewriteInternalForLocalNodes { - node.InternalHTTPUrl = rewritten - } } if node.ExternalWSUrl != "" { rewritten, err := rewriteURLHost(node.ExternalWSUrl, hostIP) @@ -561,9 +554,6 @@ func rewriteRemoteBlockchainOutputForDirectAccess( return err } node.ExternalWSUrl = rewritten - if rewriteInternalForLocalNodes { - node.InternalWSUrl = rewritten - } } } return nil diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index c56051dd07d..de92561e10a 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -71,11 +71,11 @@ type Config struct { loaded bool } -type ComponentTarget string +type ComponentPlacement string const ( - TargetLocal ComponentTarget = "local" - TargetRemote ComponentTarget = "remote" + PlacementLocal ComponentPlacement = "local" + PlacementRemote ComponentPlacement = "remote" ) type RemoteStartPolicy string @@ -89,7 +89,7 @@ const ( // The embedded input keeps TOML fields backward-compatible. type Blockchain struct { blockchain.Input - Target ComponentTarget `toml:"target"` + Placement ComponentPlacement `toml:"placement"` RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } @@ -97,14 +97,14 @@ type Blockchain struct { // The embedded input keeps TOML fields backward-compatible. 
type JobDistributor struct { jd.Input - Target ComponentTarget `toml:"target"` + Placement ComponentPlacement `toml:"placement"` RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } func (b *Blockchain) Normalize() { - b.Target = normalizeComponentTarget(b.Target) - if b.Target == "" { - b.Target = TargetLocal + b.Placement = normalizeComponentPlacement(b.Placement) + if b.Placement == "" { + b.Placement = PlacementLocal } if b.RemoteStartPolicy == "" { b.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical @@ -117,8 +117,8 @@ func (b *Blockchain) Validate() error { } b.Normalize() - if b.Target != TargetLocal && b.Target != TargetRemote { - return fmt.Errorf("invalid blockchain target: %s", b.Target) + if b.Placement != PlacementLocal && b.Placement != PlacementRemote { + return fmt.Errorf("invalid blockchain placement: %s", b.Placement) } if b.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && b.RemoteStartPolicy != RemoteStartPolicyAlways { return fmt.Errorf("invalid blockchain remote_start_policy: %s", b.RemoteStartPolicy) @@ -135,9 +135,9 @@ func (b *Blockchain) InputRef() *blockchain.Input { } func (j *JobDistributor) Normalize() { - j.Target = normalizeComponentTarget(j.Target) - if j.Target == "" { - j.Target = TargetLocal + j.Placement = normalizeComponentPlacement(j.Placement) + if j.Placement == "" { + j.Placement = PlacementLocal } if j.RemoteStartPolicy == "" { j.RemoteStartPolicy = RemoteStartPolicyReuseIfIdentical @@ -150,8 +150,8 @@ func (j *JobDistributor) Validate() error { } j.Normalize() - if j.Target != TargetLocal && j.Target != TargetRemote { - return fmt.Errorf("invalid jd target: %s", j.Target) + if j.Placement != PlacementLocal && j.Placement != PlacementRemote { + return fmt.Errorf("invalid jd placement: %s", j.Placement) } if j.RemoteStartPolicy != RemoteStartPolicyReuseIfIdentical && j.RemoteStartPolicy != RemoteStartPolicyAlways { return fmt.Errorf("invalid jd remote_start_policy: %s", j.RemoteStartPolicy) @@ 
-236,9 +236,9 @@ func normalizeNodeSetPlacement(nodeSet *cre.NodeSet) { if nodeSet == nil { return } - nodeSet.Target = normalizeNodeSetTarget(nodeSet.Target) - if strings.TrimSpace(nodeSet.Target) == "" { - nodeSet.Target = string(TargetLocal) + nodeSet.Placement = normalizeNodeSetPlacementValue(nodeSet.Placement) + if strings.TrimSpace(nodeSet.Placement) == "" { + nodeSet.Placement = string(PlacementLocal) } if strings.TrimSpace(nodeSet.RemoteStartPolicy) == "" { nodeSet.RemoteStartPolicy = string(RemoteStartPolicyReuseIfIdentical) @@ -249,8 +249,8 @@ func validateNodeSetPlacement(nodeSet *cre.NodeSet) error { if nodeSet == nil { return errors.New("nodeset is nil") } - if nodeSet.Target != string(TargetLocal) && nodeSet.Target != string(TargetRemote) { - return fmt.Errorf("invalid nodeset target: %s", nodeSet.Target) + if nodeSet.Placement != string(PlacementLocal) && nodeSet.Placement != string(PlacementRemote) { + return fmt.Errorf("invalid nodeset placement: %s", nodeSet.Placement) } if nodeSet.RemoteStartPolicy != string(RemoteStartPolicyReuseIfIdentical) && nodeSet.RemoteStartPolicy != string(RemoteStartPolicyAlways) { return fmt.Errorf("invalid nodeset remote_start_policy: %s", nodeSet.RemoteStartPolicy) @@ -273,21 +273,21 @@ func removeChainIDFromFlag(flag string) string { return flag[:lastIdx] } -func normalizeComponentTarget(target ComponentTarget) ComponentTarget { - switch strings.ToLower(strings.TrimSpace(string(target))) { +func normalizeComponentPlacement(placement ComponentPlacement) ComponentPlacement { + switch strings.ToLower(strings.TrimSpace(string(placement))) { case "": return "" - case string(TargetRemote): - return TargetRemote - case string(TargetLocal): - return TargetLocal + case string(PlacementRemote): + return PlacementRemote + case string(PlacementLocal): + return PlacementLocal default: - return target + return placement } } -func normalizeNodeSetTarget(target string) string { - return 
string(normalizeComponentTarget(ComponentTarget(target))) +func normalizeNodeSetPlacementValue(placement string) string { + return string(normalizeComponentPlacement(ComponentPlacement(placement))) } func validateContractVersions(envDependencies cre.CLIEnvironmentDependencies) error { diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 84ae380c0b5..bb8baf4ebdf 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -152,7 +152,7 @@ func StartDONs( if nodeSet.Out != nil { lggr.Info().Msgf("Using pre-configured node URLs for DON %s", nodeSet.Name) nodeset = nodeSet.Out - } else if strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + } else if strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) if err != nil { return pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") @@ -245,7 +245,7 @@ func StartDONs( func hasRemoteNodeSets(nodeSets []*cre.NodeSet) bool { for _, nodeSet := range nodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { return true } } diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index c26f2c73b25..3d9f7fa6a8d 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -229,14 +229,14 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } - blockchainTargetBySelector := blockchainTargetsBySelector(input.Blockchains, deployedBlockchains.Outputs) + blockchainPlacementBySelector := blockchainPlacementsBySelector(input.Blockchains, deployedBlockchains.Outputs) updatedNodeSets, topoErr 
:= donconfig.PrepareNodeTOMLs( ctx, topology, creEnvironment, input.NodeSets, - blockchainTargetBySelector, + blockchainPlacementBySelector, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -274,7 +274,7 @@ func SetupTestEnvironment( fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("Applied Features in %.2f seconds", input.StageGen.Elapsed().Seconds()))) - startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager, nodeSetPlacement.HasLocalTargets) + startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager) if jdStartErr != nil { return nil, pkgerrors.Wrap(jdStartErr, "failed to start Job Distributor") } @@ -296,6 +296,9 @@ func SetupTestEnvironment( CldfEnvironment: deployKeystoneContractsOutput.Env, Topology: topology, Dons: dons, + JDPlacement: string(input.JdInput.Placement), + JDInternalWSRPC: startedJD.JDOutput.InternalWSRPCUrl, + JDExternalWSRPC: startedJD.JDOutput.ExternalWSRPCUrl, } cldErr := cre.LinkToJobDistributor(ctx, linkDonsToJDInput) @@ -483,7 +486,7 @@ func SetupTestEnvironment( }, nil } -func blockchainTargetsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { +func blockchainPlacementsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { bySelector := make(map[uint64]string, len(deployed)) for idx, blockchainCfg := range configured { if blockchainCfg == nil { @@ -493,7 +496,7 @@ func blockchainTargetsBySelector(configured []*config.Blockchain, deployed []blo continue } selector := deployed[idx].ChainSelector() - bySelector[selector] = string(blockchainCfg.Target) + bySelector[selector] = string(blockchainCfg.Placement) } return bySelector } @@ -525,23 +528,18 @@ func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummar if nodeSet == nil { continue } - configTarget := strings.TrimSpace(nodeSet.Target) - if configTarget == "" || configTarget 
== string(config.TargetLocal) { + configPlacement := strings.TrimSpace(nodeSet.Placement) + if configPlacement == "" || configPlacement == string(config.PlacementLocal) { summary.HasLocalTargets = true continue } - if configTarget == string(config.TargetRemote) { + if configPlacement == string(config.PlacementRemote) { summary.HasRemoteTargets = true continue } - return nil, fmt.Errorf("invalid nodeset target: %s", nodeSet.Target) + return nil, fmt.Errorf("invalid nodeset placement: %s", nodeSet.Placement) } - // Mixed local and remote nodeset targets need per-DON node-facing URL config selection. - // Current PrepareNodeTOMLs builds one node-facing URL shape, so keep this unsupported for now. - if summary.HasLocalTargets && summary.HasRemoteTargets { - return nil, errors.New("mixed nodeset targets are not supported yet; set all nodesets target=local or all target=remote") - } return summary, nil } @@ -556,10 +554,10 @@ func validateUnsupportedPlacements( if bc == nil { continue } - if bc.Target == config.TargetLocal { + if bc.Placement == config.PlacementLocal { return errors.New( "remote nodesets with local blockchains are not supported in this PoC. 
" + - "Set all blockchains to target=remote, or run nodesets with target=local so nodes stay colocated with local blockchains", + "Set all blockchains to placement=remote, or run nodesets with placement=local so nodes stay colocated with local blockchains", ) } } diff --git a/system-tests/lib/cre/environment/environment_placement_test.go b/system-tests/lib/cre/environment/environment_placement_test.go new file mode 100644 index 00000000000..0d16d589a47 --- /dev/null +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -0,0 +1,22 @@ +package environment + +import ( + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" +) + +func TestSummarizeNodeSetPlacement_AllowsMixedPlacements(t *testing.T) { + nodeSets := []*cre.NodeSet{ + {Placement: "local"}, + {Placement: "remote"}, + } + + summary, err := summarizeNodeSetPlacement(nodeSets) + if err != nil { + t.Fatalf("summarizeNodeSetPlacement returned error: %v", err) + } + if !summary.HasLocalTargets || !summary.HasRemoteTargets { + t.Fatalf("expected both local and remote placements, got %+v", summary) + } +} diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index d05b8338a4e..771d752b705 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -19,7 +19,6 @@ import ( "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" - "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" @@ -68,7 +67,6 @@ func StartJD( jdConfig *config.JobDistributor, infraInput infra.Provider, tunnelManager tunnel.Manager, - rewriteInternalForLocalNodes bool, ) (*StartedJD, error) { startTime := time.Now() lggr.Info().Msg("Starting Job Distributor") @@ -79,7 +77,7 @@ func StartJD( var jdOutput *jd.Output var jdErr 
error - if jdConfig.Target == config.TargetRemote { + if jdConfig.Placement == config.PlacementRemote { startClient, err := newStartComponentClient(lggr, tunnelManager) if err != nil { return nil, err @@ -115,7 +113,7 @@ func StartJD( if err != nil { return nil, pkgerrors.Wrap(err, "failed to decode jd transport payload") } - if err := rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput, rewriteInternalForLocalNodes); err != nil { + if err := rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput); err != nil { return nil, err } } else if infraInput.IsKubernetes() { @@ -147,24 +145,17 @@ func StartJD( // Configure gRPC credentials for JD connection creds := getJDCredentials(lggr, infraInput, jdOutput) - nodeFacingWSRPC, wsrpcErr := resolveNodeFacingJDWSRPC(jdOutput, rewriteInternalForLocalNodes) - if wsrpcErr != nil { - return nil, pkgerrors.Wrap(wsrpcErr, "failed to resolve node-facing JD WSRPC endpoint") - } - jdClientConfig := cldf_jd.JDConfig{ GRPC: jdOutput.ExternalGRPCUrl, - WSRPC: nodeFacingWSRPC, + WSRPC: jdOutput.ExternalWSRPCUrl, Creds: creds, } lggr.Info().Msgf("Connecting to JD GRPC at: %s", jdOutput.ExternalGRPCUrl) lggr.Info(). - Str("nodeFacingWSRPC", nodeFacingWSRPC). Str("internalWSRPC", jdOutput.InternalWSRPCUrl). Str("externalWSRPC", jdOutput.ExternalWSRPCUrl). - Bool("hasLocalNodeSets", rewriteInternalForLocalNodes). - Msg("Resolved JD WSRPC endpoint for node registration") + Msg("Resolved JD endpoints") jdClient, jdErr := cldf_jd.NewJDClient(jdClientConfig) if jdErr != nil { @@ -179,32 +170,11 @@ func StartJD( }, nil } -func resolveNodeFacingJDWSRPC(output *jd.Output, rewriteInternalForLocalNodes bool) (string, error) { - if output == nil { - return "", fmt.Errorf("jd output is nil") - } - // Local nodesets can resolve JD on the Docker network directly. - if rewriteInternalForLocalNodes { - return output.InternalWSRPCUrl, nil - } - // Remote nodesets need the relay endpoint on the remote Docker host. 
- source := strings.TrimSpace(output.ExternalWSRPCUrl) - if source == "" { - source = strings.TrimSpace(output.InternalWSRPCUrl) - } - if source == "" { - return "", fmt.Errorf("jd output does not include WSRPC endpoint") - } - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - return rewriteAddressHost(source, dockerHost) -} - func rewriteRemoteJDOutputForLocalAccess( ctx context.Context, lggr zerolog.Logger, tunnelManager tunnel.Manager, output *jd.Output, - rewriteInternalForLocalNodes bool, ) error { if output == nil { return nil @@ -214,7 +184,7 @@ func rewriteRemoteJDOutputForLocalAccess( if err != nil { return err } - return rewriteJDForDirectAccess(output, hostIP, rewriteInternalForLocalNodes) + return rewriteJDForDirectAccess(output, hostIP) } if tunnelManager == nil { return errors.New("tunnel manager is required for remote jd target") @@ -236,10 +206,10 @@ func rewriteRemoteJDOutputForLocalAccess( Str("localURL", binding.LocalURL). Msg("Established endpoint tunnel") } - return rewriteJDWithBindings(output, bindings, rewriteInternalForLocalNodes) + return rewriteJDWithBindings(output, bindings) } -func rewriteJDForDirectAccess(output *jd.Output, hostIP string, rewriteInternalForLocalNodes bool) error { +func rewriteJDForDirectAccess(output *jd.Output, hostIP string) error { if output == nil { return nil } @@ -250,9 +220,6 @@ func rewriteJDForDirectAccess(output *jd.Output, hostIP string, rewriteInternalF return err } output.ExternalGRPCUrl = rewritten - if rewriteInternalForLocalNodes { - output.InternalGRPCUrl = rewritten - } } if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl != "" { @@ -265,9 +232,6 @@ func rewriteJDForDirectAccess(output *jd.Output, hostIP string, rewriteInternalF return err } output.ExternalWSRPCUrl = rewritten - if rewriteInternalForLocalNodes { - output.InternalWSRPCUrl = rewritten - } } return nil } @@ -295,7 +259,7 @@ func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { 
return refs, nil } -func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding, rewriteInternalForLocalNodes bool) error { +func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) error { byName := make(map[string]tunnel.TunnelBinding, len(bindings)) for _, binding := range bindings { byName[binding.EndpointName] = binding @@ -307,10 +271,6 @@ func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding, r return fmt.Errorf("missing tunnel binding for jd grpc endpoint") } output.ExternalGRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) - if rewriteInternalForLocalNodes { - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - output.InternalGRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) - } } if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl != "" { @@ -319,10 +279,6 @@ func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding, r return fmt.Errorf("missing tunnel binding for jd wsrpc endpoint") } output.ExternalWSRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) - if rewriteInternalForLocalNodes { - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - output.InternalWSRPCUrl = net.JoinHostPort(dockerHost, fmt.Sprintf("%d", binding.LocalPort)) - } } return nil diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go index d2d84fa529d..e986aca5b96 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -1,7 +1,6 @@ package environment import ( - "strings" "testing" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" @@ -55,14 +54,14 @@ func TestRewriteJDWithBindingsRewritesNodeFacingWSRPC(t *testing.T) { }, } - if err := rewriteJDWithBindings(output, bindings, true); err != nil { + if err := 
rewriteJDWithBindings(output, bindings); err != nil { t.Fatalf("rewriteJDWithBindings returned error: %v", err) } if output.ExternalWSRPCUrl != "127.0.0.1:61002" { t.Fatalf("expected external wsrpc url to be rewritten to 127.0.0.1:61002, got %s", output.ExternalWSRPCUrl) } - if !strings.HasSuffix(output.InternalWSRPCUrl, ":61002") { - t.Fatalf("expected internal wsrpc url to use tunneled port 61002, got %s", output.InternalWSRPCUrl) + if output.InternalWSRPCUrl != "job-distributor:8080" { + t.Fatalf("expected internal wsrpc url to remain unchanged, got %s", output.InternalWSRPCUrl) } } diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index a05eabd99ef..f218ee430d9 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -54,7 +54,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. var joined error for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain == nil || configuredBlockchain.Target != config.TargetRemote { + if configuredBlockchain == nil || configuredBlockchain.Placement != config.PlacementRemote { continue } payload := agent.StartComponentPayload{ @@ -76,7 +76,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. } for _, nodeSet := range cfg.NodeSets { - if nodeSet == nil || strings.TrimSpace(nodeSet.Target) != string(config.TargetRemote) { + if nodeSet == nil || strings.TrimSpace(nodeSet.Placement) != string(config.PlacementRemote) { continue } payload := agent.StartComponentPayload{ @@ -97,7 +97,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
} } - if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { + if cfg.JD != nil && cfg.JD.Placement == config.PlacementRemote { payload := agent.StartComponentPayload{ ComponentType: componentTypeJD, JD: cfg.JD.InputRef(), @@ -133,16 +133,16 @@ func countRemoteStopTargets(cfg *config.Config) int { } count := 0 for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && configuredBlockchain.Target == config.TargetRemote { + if configuredBlockchain != nil && configuredBlockchain.Placement == config.PlacementRemote { count++ } } for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Target) == string(config.TargetRemote) { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { count++ } } - if cfg.JD != nil && cfg.JD.Target == config.TargetRemote { + if cfg.JD != nil && cfg.JD.Placement == config.PlacementRemote { count++ } return count diff --git a/system-tests/lib/cre/features/consensus/v1/consensus.go b/system-tests/lib/cre/features/consensus/v1/consensus.go index 27111084a9f..f75d4d19ab1 100644 --- a/system-tests/lib/cre/features/consensus/v1/consensus.go +++ b/system-tests/lib/cre/features/consensus/v1/consensus.go @@ -3,7 +3,6 @@ package v1 import ( "context" "fmt" - "strconv" "dario.cat/mergo" "github.com/pkg/errors" @@ -148,9 +147,9 @@ func createJobs( specs := make(map[string][]string) - _, ocrPeeringCfg, err := cre.PeeringCfgs(bootstrap) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { - return errors.Wrap(err, "failed to get peering configs") + return errors.Wrap(err, "failed to resolve bootstrap peer URL") } workerInput := cre_jobs.ProposeJobSpecInput{ @@ -167,7 +166,7 @@ func createJobs( "chainSelectorEVM": creEnv.RegistryChainSelector, "contractQualifier": ContractQualifier, "templateName": "worker-ocr3", - "bootstrapperOCR3Urls": 
[]string{ocrPeeringCfg.OCRBootstraperPeerID + "@" + ocrPeeringCfg.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringCfg.Port)}, + "bootstrapperOCR3Urls": []string{bootstrapPeerURL}, }, } diff --git a/system-tests/lib/cre/features/consensus/v2/consensus.go b/system-tests/lib/cre/features/consensus/v2/consensus.go index 3572ccac312..c4f7027dd67 100644 --- a/system-tests/lib/cre/features/consensus/v2/consensus.go +++ b/system-tests/lib/cre/features/consensus/v2/consensus.go @@ -138,8 +138,12 @@ func createJobs( specs := make(map[string][]string) + bootstrapPeer, bootstrapErr := formatBootstrapPeer(don, bootstrap) + if bootstrapErr != nil { + return bootstrapErr + } // Create node job - if nodeSpecs, err := proposeNodeJob(creEnv, don, command, []string{formatBootstrapPeer(bootstrap)}, configStr); err != nil { + if nodeSpecs, err := proposeNodeJob(creEnv, don, command, []string{bootstrapPeer}, configStr); err != nil { return err } else if err := mergo.Merge(&specs, nodeSpecs); err != nil { return fmt.Errorf("failed to merge node job specs: %w", err) @@ -195,11 +199,18 @@ func buildCapabilityConfig( return configStr, nil } -func formatBootstrapPeer(bootstrap *cre.Node) string { - return fmt.Sprintf("%s@%s:%d", - strings.TrimPrefix(bootstrap.Keys.PeerID(), "p2p_"), - bootstrap.Host, - cre.OCRPeeringPort) +func formatBootstrapPeer(caller *cre.Don, bootstrap *cre.Node) (string, error) { + if caller == nil { + return "", errors.New("caller don is nil") + } + if bootstrap == nil || bootstrap.DON == nil { + return "", errors.New("bootstrap node is nil") + } + peerURL, err := cre.ResolveBootstrapPeerURL(caller.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + if err != nil { + return "", fmt.Errorf("resolve bootstrap peer url: %w", err) + } + return peerURL, nil } func proposeNodeJob(creEnv *cre.Environment, don *cre.Don, command string, bootstrapPeers []string, configStr string) (map[string][]string, error) { diff --git 
a/system-tests/lib/cre/features/don_time/don_time.go b/system-tests/lib/cre/features/don_time/don_time.go index 3d13859a44d..76b58d2d922 100644 --- a/system-tests/lib/cre/features/don_time/don_time.go +++ b/system-tests/lib/cre/features/don_time/don_time.go @@ -3,7 +3,6 @@ package dontime import ( "context" "fmt" - "strconv" "dario.cat/mergo" "github.com/pkg/errors" @@ -121,9 +120,9 @@ func createJobs( return errors.New("could not find bootstrap node in topology, exactly one bootstrap node is required") } - _, ocrPeeringCfg, err := cre.PeeringCfgs(bootstrap) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { - return errors.Wrap(err, "failed to get peering configs") + return errors.Wrap(err, "failed to resolve bootstrap peer URL") } workerInput := cre_jobs.ProposeJobSpecInput{ @@ -140,7 +139,7 @@ func createJobs( "chainSelectorEVM": creEnv.RegistryChainSelector, "contractQualifier": ContractQualifier, "templateName": "don-time", - "bootstrapperOCR3Urls": []string{ocrPeeringCfg.OCRBootstraperPeerID + "@" + ocrPeeringCfg.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringCfg.Port)}, + "bootstrapperOCR3Urls": []string{bootstrapPeerURL}, }, } diff --git a/system-tests/lib/cre/features/evm/v2/evm.go b/system-tests/lib/cre/features/evm/v2/evm.go index 87f03f510f8..1dcfc889261 100644 --- a/system-tests/lib/cre/features/evm/v2/evm.go +++ b/system-tests/lib/cre/features/evm/v2/evm.go @@ -5,7 +5,6 @@ import ( "context" "fmt" "strconv" - "strings" "text/template" "time" @@ -216,6 +215,10 @@ func createJobs( if !isBootstrap { return errors.New("could not find bootstrap node in topology, exactly one bootstrap node is required") } + bootstrapPeerURL, peerErr := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + if peerErr != nil { + return errors.Wrap(peerErr, "failed to resolve bootstrap peer 
URL") + } workerNodes, wErr := don.Workers() if wErr != nil { @@ -303,7 +306,7 @@ func createJobs( return errors.New("failed to get key bundle id for evm family") } - bootstrapPeers := []string{fmt.Sprintf("%s@%s:%d", strings.TrimPrefix(bootstrap.Keys.PeerID(), "p2p_"), bootstrap.Host, cre.OCRPeeringPort)} + bootstrapPeers := []string{bootstrapPeerURL} strategyName := "single-chain" if len(workerNode.Keys.OCR2BundleIDs) > 1 { diff --git a/system-tests/lib/cre/features/vault/vault.go b/system-tests/lib/cre/features/vault/vault.go index f36bc931e8f..5d249748685 100644 --- a/system-tests/lib/cre/features/vault/vault.go +++ b/system-tests/lib/cre/features/vault/vault.go @@ -276,9 +276,9 @@ func createJobs( specs := make(map[string][]string) - _, ocrPeeringCfg, err := cre.PeeringCfgs(bootstrap) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { - return errors.Wrap(err, "failed to get peering configs") + return errors.Wrap(err, "failed to resolve bootstrap peer URL") } workerInput := cre_jobs.ProposeJobSpecInput{ @@ -296,7 +296,7 @@ func createJobs( "contractQualifier": ContractQualifier + "_plugin", "dkgContractQualifier": ContractQualifier + "_dkg", "templateName": "worker-vault", - "bootstrapperOCR3Urls": []string{ocrPeeringCfg.OCRBootstraperPeerID + "@" + ocrPeeringCfg.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringCfg.Port)}, + "bootstrapperOCR3Urls": []string{bootstrapPeerURL}, }, } diff --git a/system-tests/lib/cre/sharding/sharding.go b/system-tests/lib/cre/sharding/sharding.go index 94a53ba9847..c61463caccb 100644 --- a/system-tests/lib/cre/sharding/sharding.go +++ b/system-tests/lib/cre/sharding/sharding.go @@ -3,7 +3,6 @@ package sharding import ( "context" "fmt" - "strconv" "time" "github.com/Masterminds/semver/v3" @@ -62,7 +61,7 @@ func SetupSharding(ctx context.Context, input SetupShardingInput) error { } // 3. 
Get bootstrap URLs for Ring P2P - bootstrapURLs, err := getBootstrapURLs(input.Dons) + bootstrapURLs, err := getBootstrapURLs(shardLeaderDON, input.Dons) if err != nil { return fmt.Errorf("failed to get bootstrap URLs: %w", err) } @@ -164,18 +163,19 @@ func deployRingOCR3Contract(creEnv *cre.Environment, logger zerolog.Logger) (com } // getBootstrapURLs extracts P2P bootstrap URLs from the topology's bootstrap node -func getBootstrapURLs(dons *cre.Dons) ([]string, error) { +func getBootstrapURLs(callerDON *cre.Don, dons *cre.Dons) ([]string, error) { + if callerDON == nil { + return nil, errors.New("caller DON is nil") + } bootstrap, ok := dons.Bootstrap() if !ok { return nil, errors.New("no bootstrap node found in dons") } - _, ocrPeeringCfg, err := cre.PeeringCfgs(bootstrap) + bootstrapURL, err := cre.ResolveBootstrapPeerURL(callerDON.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { - return nil, fmt.Errorf("failed to get peering configs: %w", err) + return nil, fmt.Errorf("failed to resolve bootstrap peer URL: %w", err) } - - bootstrapURL := ocrPeeringCfg.OCRBootstraperPeerID + "@" + ocrPeeringCfg.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringCfg.Port) return []string{bootstrapURL}, nil } diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index 765b8cfb39b..74bf48bde7e 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -456,7 +456,8 @@ type GenerateConfigsInput struct { Datastore datastore.DataStore DonMetadata *DonMetadata Blockchains map[uint64]blockchains.Blockchain - BlockchainTargetBySelector map[uint64]string + BlockchainPlacementBySelector map[uint64]string + OCRBootstrapPlacement string RegistryChainSelector uint64 Flags []string CapabilitiesPeeringData CapabilitiesPeeringData @@ -476,6 +477,9 @@ func (g *GenerateConfigsInput) Validate() error { if g.RegistryChainSelector == 0 { return errors.New("home chain selector not set") } + if 
strings.TrimSpace(g.OCRBootstrapPlacement) == "" { + return errors.New("ocr bootstrap placement not set") + } if len(g.Flags) == 0 { return errors.New("flags not set") } @@ -1185,7 +1189,7 @@ type NodeSpecWithRole struct { type NodeSet struct { *ns.Input - Target string `toml:"target"` // docker (default) or remote + Placement string `toml:"placement"` // local (default) or remote RemoteStartPolicy string `toml:"remote_start_policy"` // reuse_if_identical (default) or always // Our role-aware node specs (shadows ns.Input.NodeSpecs) @@ -1457,6 +1461,9 @@ type LinkDonsToJDInput struct { Dons *Dons Topology *Topology CldfEnvironment *cldf.Environment + JDPlacement string + JDInternalWSRPC string + JDExternalWSRPC string } type Environment struct { diff --git a/system-tests/lib/cre/workflow/registry.go b/system-tests/lib/cre/workflow/registry.go index 1c0ff1b5945..63c1079bf77 100644 --- a/system-tests/lib/cre/workflow/registry.go +++ b/system-tests/lib/cre/workflow/registry.go @@ -28,10 +28,10 @@ import ( ks_contracts_op "github.com/smartcontractkit/chainlink/deployment/keystone/changeset/operations/contracts" libc "github.com/smartcontractkit/chainlink/system-tests/lib/conversions" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" libformat "github.com/smartcontractkit/chainlink/system-tests/lib/format" ) @@ -367,7 +367,7 @@ func getAllFilters(ctx context.Context, logger logger.Logger, chainID *big.Int, func resolveNodeSetDBHost(nodeSet *cre.NodeSet) (string, error) { defaultHost := "127.0.0.1" - if nodeSet == nil || strings.TrimSpace(nodeSet.Target) != string(config.TargetRemote) { + if 
nodeSet == nil || strings.TrimSpace(nodeSet.Placement) != string(config.PlacementRemote) { return defaultHost, nil } if !runtimecfg.IsDirectMode() { diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index 0ee940f043a..ade32ddd15e 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -51,6 +51,7 @@ For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: ## Placement Rules +- Use `placement = "local" | "remote"` in CRE component config (NodeSets, JD, Blockchains). - Same placement (`local->local`, `remote->remote`) uses **internal** URLs. - Cross placement (`local->remote`, `remote->local`) uses **external** URLs. - Remote NodeSets targeting local gateway are allowed when bridge/tunnel plumbing for gateway ingress is present. @@ -71,6 +72,6 @@ For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: - Agent unreachable: verify bind address/port vs chosen access mode. - Direct mode cannot resolve EC2 IP: ensure `CRE_EC2_INSTANCE_ID` is set and AWS CLI credentials are valid, or set `CRE_EC2_HOST_IP` explicitly. -- `invalid jd target`: use `target=local` or `target=remote` (only supported values). +- `invalid jd placement`: use `placement=local` or `placement=remote` (only supported values). - Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. - Mixed remote->local gateway from NodeSets is supported when bridge plumbing is present. 
diff --git a/system-tests/tests/smoke/cre/v2_grpc_source_test.go b/system-tests/tests/smoke/cre/v2_grpc_source_test.go index 66389b1de25..fc6acbeec4a 100644 --- a/system-tests/tests/smoke/cre/v2_grpc_source_test.go +++ b/system-tests/tests/smoke/cre/v2_grpc_source_test.go @@ -598,7 +598,7 @@ func compileAndCopyWorkflow(t *testing.T, testEnv *ttypes.TestEnvironment, workf containerTargetDir := creworkflow.DefaultWorkflowTargetDir mode := creworkflow.ArtifactDeployModeLocal for _, nodeSet := range testEnv.Config.NodeSets { - if nodeSet != nil && nodeSet.Name == workflowDONName && nodeSet.Target == string(envconfig.TargetRemote) { + if nodeSet != nil && nodeSet.Name == workflowDONName && nodeSet.Placement == string(envconfig.PlacementRemote) { mode = creworkflow.ArtifactDeployModeRemote break } diff --git a/system-tests/tests/smoke/cre/v2_vault_don_test.go b/system-tests/tests/smoke/cre/v2_vault_don_test.go index 55485e3deb5..d55d82b537f 100644 --- a/system-tests/tests/smoke/cre/v2_vault_don_test.go +++ b/system-tests/tests/smoke/cre/v2_vault_don_test.go @@ -55,7 +55,7 @@ func ExecuteVaultTest(t *testing.T, testEnv *ttypes.TestEnvironment) { t.Context(), i, nodeSet.DbInput.Port, - nodeSet.Target == string(creconfig.TargetRemote), + nodeSet.Placement == string(creconfig.PlacementRemote), ) if err != nil || packageCount != 1 { return false diff --git a/system-tests/tests/test-helpers/before_suite.go b/system-tests/tests/test-helpers/before_suite.go index a52dfc90cb4..0d207826c67 100644 --- a/system-tests/tests/test-helpers/before_suite.go +++ b/system-tests/tests/test-helpers/before_suite.go @@ -146,12 +146,12 @@ func ensureMixedModeComponentRelays(t *testing.T, testEnv *ttypes.TestEnvironmen if name == "" { continue } - nodeSetTargetsByName[name] = strings.TrimSpace(nsCfg.Target) + nodeSetTargetsByName[name] = strings.TrimSpace(nsCfg.Placement) } // Local blockchain endpoints used by remote nodesets. 
for idx, bcCfg := range testEnv.Config.Blockchains { - if bcCfg == nil || strings.TrimSpace(string(bcCfg.Target)) != string(envconfig.TargetLocal) { + if bcCfg == nil || strings.TrimSpace(string(bcCfg.Placement)) != string(envconfig.PlacementLocal) { continue } if idx >= len(testEnv.CreEnvironment.Blockchains) || testEnv.CreEnvironment.Blockchains[idx] == nil { @@ -171,7 +171,7 @@ func ensureMixedModeComponentRelays(t *testing.T, testEnv *ttypes.TestEnvironmen } // Local JD endpoints used by remote nodesets. - if testEnv.Config.JD != nil && strings.TrimSpace(string(testEnv.Config.JD.Target)) == string(envconfig.TargetLocal) && testEnv.Config.JD.Out != nil { + if testEnv.Config.JD != nil && strings.TrimSpace(string(testEnv.Config.JD.Placement)) == string(envconfig.PlacementLocal) && testEnv.Config.JD.Out != nil { if p, ok := extractPort(testEnv.Config.JD.Out.ExternalGRPCUrl); ok { EnsureFixtureRelayForPort(t, testEnv, "jd-grpc", p) } @@ -192,7 +192,7 @@ func ensureMixedModeComponentRelays(t *testing.T, testEnv *ttypes.TestEnvironmen } donName := strings.TrimSpace(node.DON.Name) target := nodeSetTargetsByName[donName] - if target != string(envconfig.TargetLocal) { + if target != string(envconfig.PlacementLocal) { continue } if cfg.Incoming.ExternalPort > 0 { diff --git a/system-tests/tests/test-helpers/fixture_relay_helpers.go b/system-tests/tests/test-helpers/fixture_relay_helpers.go index e019def7a47..1c769fd16d7 100644 --- a/system-tests/tests/test-helpers/fixture_relay_helpers.go +++ b/system-tests/tests/test-helpers/fixture_relay_helpers.go @@ -120,7 +120,7 @@ func hasRemoteNodeSets(cfg *envconfig.Config) bool { return false } for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Target), string(envconfig.TargetRemote)) { + if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Placement), string(envconfig.PlacementRemote)) { return true } } diff --git a/system-tests/tests/test-helpers/t_helpers.go 
b/system-tests/tests/test-helpers/t_helpers.go index b820ec634d9..4f0e3889d43 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -696,7 +696,7 @@ func resolveWorkflowDONArtifactMode(cfg *envconfig.Config, donName string) (crew if nodeSet == nil || nodeSet.Name != donName { continue } - if strings.TrimSpace(nodeSet.Target) == string(envconfig.TargetRemote) { + if strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { return creworkflow.ArtifactDeployModeRemote, nodeSet.Name } return creworkflow.ArtifactDeployModeLocal, nodeSet.Name From b43813faf191e9ffafe8b34c88145d5790f680ea Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 24 Feb 2026 08:39:34 +0100 Subject: [PATCH 13/34] fix compilation after code disappeared, start working on p2p connectivity across local <-> remote DONs --- .gitignore | 3 + .../environment/relay_supervisor_test.go | 59 ++++++++++++++ system-tests/lib/cre/bootstrap_peer.go | 54 +++++++++++++ system-tests/lib/cre/bootstrap_peer_test.go | 80 +++++++++++++++++++ system-tests/lib/cre/don/config/config.go | 71 +++++++++++++--- .../lib/cre/environment/environment.go | 71 ++++++++++++++++ .../cre/features/consensus/v1/consensus.go | 2 +- .../cre/features/consensus/v2/consensus.go | 2 +- .../lib/cre/features/don_time/don_time.go | 2 +- system-tests/lib/cre/features/evm/v2/evm.go | 2 +- system-tests/lib/cre/features/vault/vault.go | 2 +- system-tests/lib/cre/sharding/sharding.go | 2 +- system-tests/lib/cre/types.go | 20 ++--- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 19 +++++ 14 files changed, 361 insertions(+), 28 deletions(-) create mode 100644 core/scripts/cre/environment/environment/relay_supervisor_test.go create mode 100644 system-tests/lib/cre/bootstrap_peer_test.go diff --git a/.gitignore b/.gitignore index 7e2524c1e9d..24ff4ae2e51 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ core/scripts/cre/environment/logs/ 
core/scripts/cre/environment/cron core/scripts/cre/environment/binaries/* *.br.b64 + +# TODO remove later +system-tests/lib/cre/environment/agent/cre_agent \ No newline at end of file diff --git a/core/scripts/cre/environment/environment/relay_supervisor_test.go b/core/scripts/cre/environment/environment/relay_supervisor_test.go new file mode 100644 index 00000000000..6b0fc4e384a --- /dev/null +++ b/core/scripts/cre/environment/environment/relay_supervisor_test.go @@ -0,0 +1,59 @@ +package environment + +import ( + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +func TestRelaySpecsFromConfig_AddsBootstrapPeeringPortForRemoteToLocalMixedDONs(t *testing.T) { + cfg := &envconfig.Config{ + NodeSets: []*cre.NodeSet{ + { + Placement: "local", + NodeSpecs: []*cre.NodeSpecWithRole{ + {Roles: []string{cre.BootstrapNode}}, + }, + }, + { + Placement: "remote", + NodeSpecs: []*cre.NodeSpecWithRole{ + {Roles: []string{cre.WorkerNode}}, + }, + }, + }, + } + + specs := relaySpecsFromConfig(cfg) + foundBootstrap := false + for _, spec := range specs { + if spec.Name == "ocr-bootstrap" && spec.Port == 5001 { + foundBootstrap = true + break + } + } + if !foundBootstrap { + t.Fatalf("expected relay specs to include ocr-bootstrap:5001, got %#v", specs) + } +} + +func TestRelaySpecsFromConfig_DoesNotAddBootstrapWhenNoRemoteNodeSets(t *testing.T) { + cfg := &envconfig.Config{ + NodeSets: []*cre.NodeSet{ + { + Placement: "local", + NodeSpecs: []*cre.NodeSpecWithRole{ + {Roles: []string{cre.BootstrapNode}}, + }, + }, + }, + } + + specs := relaySpecsFromConfig(cfg) + for _, spec := range specs { + if spec.Name == "ocr-bootstrap" && spec.Port == 5001 { + t.Fatalf("did not expect ocr-bootstrap relay spec without remote nodesets, got %#v", specs) + } + } +} diff --git a/system-tests/lib/cre/bootstrap_peer.go b/system-tests/lib/cre/bootstrap_peer.go index 
3c31e2d0a8a..05ec74dacc8 100644 --- a/system-tests/lib/cre/bootstrap_peer.go +++ b/system-tests/lib/cre/bootstrap_peer.go @@ -12,6 +12,8 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) +const placeholderHostForPortRewrite = "127.0.0.1" + func ResolveBootstrapAddress(callerTarget, bootstrapTarget, internalHost string, port int) (string, error) { if strings.TrimSpace(internalHost) == "" { return "", fmt.Errorf("bootstrap internal host is empty") @@ -61,6 +63,58 @@ func ResolveBootstrapPeerURL(callerTarget, bootstrapTarget, peerID, internalHost return trimmedPeerID + "@" + address, nil } +func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, internalHost string, port int) ([]string, error) { + if strings.TrimSpace(internalHost) == "" { + return nil, fmt.Errorf("p2p internal host is empty") + } + if port <= 0 || port > 65535 { + return nil, fmt.Errorf("invalid p2p port: %d", port) + } + + placement, err := connectivity.PlacementFromTarget(nodePlacement) + if err != nil { + return nil, err + } + + internal := net.JoinHostPort(strings.TrimSpace(internalHost), strconv.Itoa(port)) + addresses := []string{internal} + seen := map[string]struct{}{internal: {}} + add := func(addr string) { + addr = strings.TrimSpace(addr) + if addr == "" { + return + } + if _, ok := seen[addr]; ok { + return + } + seen[addr] = struct{}{} + addresses = append(addresses, addr) + } + + switch placement { + case connectivity.PlacementLocal: + if hasRemoteNodeSets { + // rewriteEndpointForRemoteCaller only uses the port for host:port inputs. + // The host here is an explicit placeholder and is not part of the final address. 
+ bridged, bridgeErr := rewriteEndpointForRemoteCaller(net.JoinHostPort(placeholderHostForPortRewrite, strconv.Itoa(port))) + if bridgeErr != nil { + return nil, bridgeErr + } + add(bridged) + } + case connectivity.PlacementRemote: + external, externalErr := resolveBootstrapExternalAddress(connectivity.PlacementRemote, port) + if externalErr != nil { + return nil, externalErr + } + add(external) + default: + return nil, fmt.Errorf("unsupported node placement: %s", nodePlacement) + } + + return addresses, nil +} + func resolveBootstrapExternalAddress(targetPlacement connectivity.Placement, port int) (string, error) { if targetPlacement == connectivity.PlacementLocal { return net.JoinHostPort("127.0.0.1", strconv.Itoa(port)), nil diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go new file mode 100644 index 00000000000..b93a9ae66cd --- /dev/null +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -0,0 +1,80 @@ +package cre + +import ( + "os" + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { + addresses, err := ResolveP2PAnnounceAddresses("local", false, "workflow-node0", 5001) + if err != nil { + t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) + } + if len(addresses) != 1 { + t.Fatalf("expected a single announce address, got %d (%v)", len(addresses), addresses) + } + if addresses[0] != "workflow-node0:5001" { + t.Fatalf("expected workflow-node0:5001, got %s", addresses[0]) + } +} + +func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { + addresses, err := ResolveP2PAnnounceAddresses("local", true, "bootstrap-gateway-node0", 5001) + if err != nil { + t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) + } + if len(addresses) != 2 { + t.Fatalf("expected two announce addresses for mixed mode, got %d (%v)", len(addresses), addresses) + } + if 
addresses[0] != "bootstrap-gateway-node0:5001" { + t.Fatalf("expected first address to stay internal, got %s", addresses[0]) + } + if addresses[1] != "host.docker.internal:5001" { + t.Fatalf("expected bridged host.docker.internal:5001, got %s", addresses[1]) + } +} + +func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { + prevMode, hadMode := os.LookupEnv(runtimecfg.EnvRemoteAccessMode) + prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) + t.Cleanup(func() { + if hadMode { + _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, prevMode) + } else { + _ = os.Unsetenv(runtimecfg.EnvRemoteAccessMode) + } + if hadIP { + _ = os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) + } else { + _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) + } + }) + _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) + _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") + + addresses, err := ResolveP2PAnnounceAddresses("remote", true, "workflow-node0", 5001) + if err != nil { + t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) + } + if len(addresses) != 2 { + t.Fatalf("expected two announce addresses for remote node, got %d (%v)", len(addresses), addresses) + } + if addresses[0] != "workflow-node0:5001" { + t.Fatalf("expected first address to stay internal, got %s", addresses[0]) + } + if addresses[1] != "10.1.2.3:5001" { + t.Fatalf("expected external EC2 address 10.1.2.3:5001, got %s", addresses[1]) + } +} + +func TestResolveBootstrapPeerURL_RemoteCallerToLocalBootstrap_UsesBridgedHost(t *testing.T) { + peerURL, err := ResolveBootstrapPeerURL("remote", "local", "p2p_testPeer", "bootstrap-gateway-node0", 5001) + if err != nil { + t.Fatalf("ResolveBootstrapPeerURL returned error: %v", err) + } + if peerURL != "testPeer@host.docker.internal:5001" { + t.Fatalf("expected bridged bootstrap peer URL, got %s", peerURL) + } +} diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 
28c1e5c5597..2cd9f9d1a32 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -113,18 +113,18 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, - Blockchains: chainPerSelector, + Datastore: creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, + Blockchains: chainPerSelector, BlockchainPlacementBySelector: blockchainTargetBySelector, - OCRBootstrapPlacement: ocrBootstrapPlacement, - Flags: donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + OCRBootstrapPlacement: ocrBootstrapPlacement, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -224,7 +224,14 @@ func generateNodeTomlConfig(input cre.GenerateConfigsInput, nodeConfigTransforme switch role { case cre.BootstrapNode: var cErr error - nodeConfig, cErr = addBootstrapNodeConfig(nodeConfig, input.OCRPeeringData, commonInputs) + nodeConfig, cErr = addBootstrapNodeConfig( + nodeConfig, + input.OCRPeeringData, + commonInputs, + input.DonMetadata, + nodeMetadata, + input.Topology, + ) if cErr != nil { return nil, errors.Wrapf(cErr, "failed to add bootstrap node config for node at index %d in DON %s", nodeIdx, input.DonMetadata.Name) } @@ -311,6 +318,9 @@ func addBootstrapNodeConfig( existingConfig corechainlink.Config, ocrPeeringData cre.OCRPeeringData, commonInputs *commonInputs, + donMetadata *cre.DonMetadata, + nodeMetadata *cre.NodeMetadata, + topology *cre.Topology, ) 
(corechainlink.Config, error) { existingConfig.OCR2 = coretoml.OCR2{ Enabled: ptr.Ptr(true), @@ -331,6 +341,18 @@ func addBootstrapNodeConfig( }, EnableExperimentalRageP2P: ptr.Ptr(true), } + if donMetadata != nil && nodeMetadata != nil { + announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( + donMetadata.MustNodeSet().Placement, + hasRemoteNodeSets(topology), + nodeMetadata.Host, + ocrPeeringData.Port, + ) + if announceErr != nil { + return existingConfig, errors.Wrap(announceErr, "failed to resolve P2P announce addresses for bootstrap node") + } + existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) + } if commonInputs.provider.IsDocker() { existingConfig.CRE.WorkflowFetcher = &coretoml.WorkflowFetcherConfig{ @@ -413,6 +435,16 @@ func addWorkerNodeConfig( }, EnableExperimentalRageP2P: ptr.Ptr(true), } + announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( + donMetadata.MustNodeSet().Placement, + hasRemoteNodeSets(topology), + m.Host, + ocrPeeringData.Port, + ) + if announceErr != nil { + return existingConfig, errors.Wrap(announceErr, "failed to resolve P2P announce addresses for worker node") + } + existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) if commonInputs.provider.IsDocker() { existingConfig.CRE.WorkflowFetcher = &coretoml.WorkflowFetcherConfig{ @@ -841,6 +873,21 @@ func gatewayPlacementByNodeUUID(topology *cre.Topology) (map[string]connectivity return out, nil } +func hasRemoteNodeSets(topology *cre.Topology) bool { + if topology == nil { + return false + } + for _, don := range topology.DonsMetadata.List() { + if don == nil || don.MustNodeSet() == nil { + continue + } + if strings.EqualFold(strings.TrimSpace(don.MustNodeSet().Placement), string(connectivity.PlacementRemote)) { + return true + } + } + return false +} + func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { if topology == nil { return "", fmt.Errorf("topology is nil") diff --git 
a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 3d9f7fa6a8d..2c2034fe83e 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -5,9 +5,12 @@ import ( "errors" "fmt" "maps" + "net" "os" + "strconv" "strings" "sync" + "time" "github.com/Masterminds/semver/v3" "github.com/ethereum/go-ethereum/common" @@ -39,6 +42,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/sharding" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" libformat "github.com/smartcontractkit/chainlink/system-tests/lib/format" @@ -283,6 +287,9 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(err, "failed to execute pre-DON startup hook") } } + if err := verifyRemoteToLocalBootstrapReachability(ctx, testLogger, topology); err != nil { + return nil, pkgerrors.Wrap(err, "bootstrap reachability sanity check failed") + } startedDONs, donStartErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, tunnelManager) if donStartErr != nil { @@ -564,6 +571,70 @@ func validateUnsupportedPlacements( return nil } +func verifyRemoteToLocalBootstrapReachability(ctx context.Context, lggr zerolog.Logger, topology *cre.Topology) error { + if topology == nil { + return nil + } + hasRemoteDONs := false + hasLocalBootstrap := false + for _, don := range topology.DonsMetadata.List() { + if don == nil || don.MustNodeSet() == nil { + continue + } + placement := strings.TrimSpace(don.MustNodeSet().Placement) + if placement 
== string(config.PlacementRemote) { + hasRemoteDONs = true + } + if placement == string(config.PlacementLocal) { + for _, node := range don.NodesMetadata { + if node != nil && node.HasRole(cre.BootstrapNode) { + hasLocalBootstrap = true + break + } + } + } + } + if !hasRemoteDONs || !hasLocalBootstrap { + return nil + } + if !runtimecfg.IsDirectMode() { + return nil + } + + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return fmt.Errorf("resolve direct EC2 host ip: %w", err) + } + remoteRelayAddr := net.JoinHostPort(hostIP, strconv.Itoa(cre.OCRPeeringPort)) + if err := waitForTCPReachable(ctx, remoteRelayAddr, 6*time.Second); err != nil { + return fmt.Errorf("remote relay listener for bootstrap peering is not reachable at %s: %w", remoteRelayAddr, err) + } + lggr.Info().Str("remoteRelay", remoteRelayAddr).Msg("verified remote->local bootstrap relay listener reachability") + return nil +} + +func waitForTCPReachable(ctx context.Context, addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var lastErr error + for { + dialer := net.Dialer{Timeout: 600 * time.Millisecond} + conn, err := dialer.DialContext(ctx, "tcp", addr) + if err == nil { + _ = conn.Close() + return nil + } + lastErr = err + if time.Now().After(deadline) { + return lastErr + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(250 * time.Millisecond): + } + } +} + func newCldfEnvironment(ctx context.Context, singleFileLogger logger.Logger, cldfBlockchains cldf_chain.BlockChains) *cldf.Environment { allChainsCLDEnvironment := &cldf.Environment{ Name: cre.EnvironmentName, diff --git a/system-tests/lib/cre/features/consensus/v1/consensus.go b/system-tests/lib/cre/features/consensus/v1/consensus.go index f75d4d19ab1..6530f4d7d2c 100644 --- a/system-tests/lib/cre/features/consensus/v1/consensus.go +++ b/system-tests/lib/cre/features/consensus/v1/consensus.go @@ -147,7 +147,7 @@ func createJobs( specs := make(map[string][]string) - 
bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { return errors.Wrap(err, "failed to resolve bootstrap peer URL") } diff --git a/system-tests/lib/cre/features/consensus/v2/consensus.go b/system-tests/lib/cre/features/consensus/v2/consensus.go index c4f7027dd67..6705ac626bf 100644 --- a/system-tests/lib/cre/features/consensus/v2/consensus.go +++ b/system-tests/lib/cre/features/consensus/v2/consensus.go @@ -206,7 +206,7 @@ func formatBootstrapPeer(caller *cre.Don, bootstrap *cre.Node) (string, error) { if bootstrap == nil || bootstrap.DON == nil { return "", errors.New("bootstrap node is nil") } - peerURL, err := cre.ResolveBootstrapPeerURL(caller.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + peerURL, err := cre.ResolveBootstrapPeerURL(caller.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { return "", fmt.Errorf("resolve bootstrap peer url: %w", err) } diff --git a/system-tests/lib/cre/features/don_time/don_time.go b/system-tests/lib/cre/features/don_time/don_time.go index 76b58d2d922..86486a18c60 100644 --- a/system-tests/lib/cre/features/don_time/don_time.go +++ b/system-tests/lib/cre/features/don_time/don_time.go @@ -120,7 +120,7 @@ func createJobs( return errors.New("could not find bootstrap node in topology, exactly one bootstrap node is required") } - bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { return 
errors.Wrap(err, "failed to resolve bootstrap peer URL") } diff --git a/system-tests/lib/cre/features/evm/v2/evm.go b/system-tests/lib/cre/features/evm/v2/evm.go index 1dcfc889261..a2a980d03f6 100644 --- a/system-tests/lib/cre/features/evm/v2/evm.go +++ b/system-tests/lib/cre/features/evm/v2/evm.go @@ -215,7 +215,7 @@ func createJobs( if !isBootstrap { return errors.New("could not find bootstrap node in topology, exactly one bootstrap node is required") } - bootstrapPeerURL, peerErr := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + bootstrapPeerURL, peerErr := cre.ResolveBootstrapPeerURL(don.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if peerErr != nil { return errors.Wrap(peerErr, "failed to resolve bootstrap peer URL") } diff --git a/system-tests/lib/cre/features/vault/vault.go b/system-tests/lib/cre/features/vault/vault.go index 5d249748685..e6bf3f61ca8 100644 --- a/system-tests/lib/cre/features/vault/vault.go +++ b/system-tests/lib/cre/features/vault/vault.go @@ -276,7 +276,7 @@ func createJobs( specs := make(map[string][]string) - bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + bootstrapPeerURL, err := cre.ResolveBootstrapPeerURL(don.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { return errors.Wrap(err, "failed to resolve bootstrap peer URL") } diff --git a/system-tests/lib/cre/sharding/sharding.go b/system-tests/lib/cre/sharding/sharding.go index c61463caccb..e129e071c9d 100644 --- a/system-tests/lib/cre/sharding/sharding.go +++ b/system-tests/lib/cre/sharding/sharding.go @@ -172,7 +172,7 @@ func getBootstrapURLs(callerDON *cre.Don, dons *cre.Dons) ([]string, error) { return nil, errors.New("no bootstrap node found in dons") } - bootstrapURL, err := 
cre.ResolveBootstrapPeerURL(callerDON.Target, bootstrap.DON.Target, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) + bootstrapURL, err := cre.ResolveBootstrapPeerURL(callerDON.Placement, bootstrap.DON.Placement, bootstrap.Keys.PeerID(), bootstrap.Host, cre.OCRPeeringPort) if err != nil { return nil, fmt.Errorf("failed to resolve bootstrap peer URL: %w", err) } diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index 74bf48bde7e..c8989435f0c 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -453,18 +453,18 @@ type ( ) type GenerateConfigsInput struct { - Datastore datastore.DataStore - DonMetadata *DonMetadata - Blockchains map[uint64]blockchains.Blockchain + Datastore datastore.DataStore + DonMetadata *DonMetadata + Blockchains map[uint64]blockchains.Blockchain BlockchainPlacementBySelector map[uint64]string OCRBootstrapPlacement string - RegistryChainSelector uint64 - Flags []string - CapabilitiesPeeringData CapabilitiesPeeringData - OCRPeeringData OCRPeeringData - ContractVersions map[ContractType]*semver.Version - Topology *Topology - Provider infra.Provider + RegistryChainSelector uint64 + Flags []string + CapabilitiesPeeringData CapabilitiesPeeringData + OCRPeeringData OCRPeeringData + ContractVersions map[ContractType]*semver.Version + Topology *Topology + Provider infra.Provider } func (g *GenerateConfigsInput) Validate() error { diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index ade32ddd15e..6f4f70f8c17 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -56,6 +56,24 @@ For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: - Cross placement (`local->remote`, `remote->local`) uses **external** URLs. - Remote NodeSets targeting local gateway are allowed when bridge/tunnel plumbing for gateway ingress is present. 
+## P2P Peering Rules (SharedPeering) + +- `P2P.V2.ListenAddresses` is the **bind** interface used by the node process (CRE sets `0.0.0.0:5001`). +- `P2P.V2.AnnounceAddresses` is the **routable** endpoint set peers learn via discovery. +- In mixed placement: + - local node announce set includes internal node host and a bridged host (`host.docker.internal:5001`) for remote callers. + - remote node announce set includes internal node host and direct EC2 host IP address (`:5001`) in direct mode. +- If announce addresses are not routable from the caller placement, DON2DON discovery can succeed but stream establishment will fail. + +## Mixed Bootstrap Reachability + +- When remote DONs and a local bootstrap node are both present, CRE starts persistent relay plumbing for bootstrap peering on `5001`. +- Before DON startup, CRE performs a fail-fast sanity check that the remote relay listener for bootstrap peering is reachable. +- If startup fails on bootstrap reachability: + - ensure relay supervisor was started, + - ensure EC2 agent is reachable and has relay open for `5001`, + - verify direct mode host IP resolution (`CRE_EC2_HOST_IP` or `CRE_EC2_INSTANCE_ID` + AWS CLI auth). + ## Bridge and Fixture Relay - Remote components cannot directly call local in-process fixtures. @@ -75,3 +93,4 @@ For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: - `invalid jd placement`: use `placement=local` or `placement=remote` (only supported values). - Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. - Mixed remote->local gateway from NodeSets is supported when bridge plumbing is present. +- DON2DON flakiness in mixed mode: check generated node TOML includes `P2P.V2.AnnounceAddresses` that are routable from the opposite placement. 
From 3c50b410930242553f666867e8f5a2db265add87 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 24 Feb 2026 10:24:47 +0100 Subject: [PATCH 14/34] mixed-mode WIP --- .../configs/workflow-gateway-don-mixed.toml | 61 ++-- .../environment/relay_supervisor.go | 40 ++- .../environment/relay_supervisor_test.go | 25 +- system-tests/lib/cre/bootstrap_peer.go | 47 ++- system-tests/lib/cre/bootstrap_peer_test.go | 52 +-- system-tests/lib/cre/don/config/config.go | 319 ++++-------------- .../lib/cre/environment/environment.go | 17 - system-tests/lib/go.mod | 2 +- 8 files changed, 200 insertions(+), 363 deletions(-) diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml index 1f705bfa7bd..8dd7ef6d6fc 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml @@ -37,12 +37,43 @@ # either "docker" or "kubernetes" type = "docker" +[[nodesets]] + nodes = 1 + name = "bootstrap-gateway" + don_types = ["bootstrap", "gateway"] + override_mode = "each" + http_port_range_start = 10000 + ocr2_p2p_port_range_start = 10050 + placement = "remote" + remote_start_policy = "always" + + env_vars = { CL_EVM_CMD = "" } + supported_evm_chains = [1337, 2337] + + [nodesets.db] + image = "postgres:12.0" + port = 13200 + + [[nodesets.node_specs]] + roles = ["bootstrap", "gateway"] + [nodesets.node_specs.node] + #ocker_ctx = "../../../.." 
+ #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-amd:latest" + # 5002 is the web API capabilities port for incoming requests + # 15002 is the vault port for incoming requests + custom_ports = ["5002:5002","15002:15002"] + # image = "chainlink-tmp:latest" + user_config_overrides = "" + [[nodesets]] nodes = 4 name = "workflow" don_types = ["workflow"] override_mode = "all" http_port_range_start = 10100 + ocr2_p2p_port_range_start = 10150 placement = "local" env_vars = { CL_EVM_CMD = "" } @@ -68,6 +99,7 @@ exposes_remote_capabilities = true override_mode = "all" http_port_range_start = 10200 + ocr2_p2p_port_range_start = 10250 placement = "remote" remote_start_policy = "always" @@ -90,32 +122,3 @@ #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } image = "chainlink-amd:latest" user_config_overrides = "" - -[[nodesets]] - nodes = 1 - name = "bootstrap-gateway" - don_types = ["bootstrap", "gateway"] - override_mode = "each" - http_port_range_start = 10300 - placement = "remote" - remote_start_policy = "always" - - env_vars = { CL_EVM_CMD = "" } - supported_evm_chains = [1337, 2337] - - [nodesets.db] - image = "postgres:12.0" - port = 13200 - - [[nodesets.node_specs]] - roles = ["bootstrap", "gateway"] - [nodesets.node_specs.node] - #ocker_ctx = "../../../.." 
- #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-amd:latest" - # 5002 is the web API capabilities port for incoming requests - # 15002 is the vault port for incoming requests - custom_ports = ["5002:5002","15002:15002"] - # image = "chainlink-tmp:latest" - user_config_overrides = "" diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 5b45ae340a1..547d3c92fa7 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -30,6 +30,8 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) @@ -237,12 +239,8 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { if nodeSet == nil || strings.TrimSpace(nodeSet.Placement) != string(envconfig.PlacementLocal) { continue } - for _, nodeSpec := range nodeSet.NodeSpecs { - if nodeSpec == nil || !hasBootstrapRole(nodeSpec.Roles) { - continue - } - addSpec("ocr-bootstrap", 5001) - break + for idx, p := range inferLocalNodeSetOCR2Ports(nodeSet) { + addSpec(fmt.Sprintf("%s-ocr-%d", strings.TrimSpace(nodeSet.Name), idx), p) } } @@ -325,6 +323,36 @@ func hasBootstrapRole(roles []string) bool { return false } +func inferLocalNodeSetOCR2Ports(nodeSet *cre.NodeSet) []int { + if nodeSet == nil { + return nil + } + nodeCount := nodeSet.Nodes + if nodeCount <= 0 { + 
nodeCount = len(nodeSet.NodeSpecs) + } + if nodeCount <= 0 { + return nil + } + base := nodeSet.OCR2P2PRangeStart + if base == 0 { + httpStart := nodeSet.HTTPPortRangeStart + if httpStart == 0 { + httpStart = ns.DefaultHTTPPortStaticRangeStart + } + base = httpStart + (ns.DefaultOCR2P2PStaticRangeStart - ns.DefaultHTTPPortStaticRangeStart) + } + out := make([]int, 0, nodeCount) + for i := 0; i < nodeCount; i++ { + p := base + i + if p <= 0 || p > 65535 { + continue + } + out = append(out, p) + } + return out +} + func endpointPort(raw string) (int, bool) { trimmed := strings.TrimSpace(raw) if trimmed == "" { diff --git a/core/scripts/cre/environment/environment/relay_supervisor_test.go b/core/scripts/cre/environment/environment/relay_supervisor_test.go index 6b0fc4e384a..aa3274a1c85 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor_test.go +++ b/core/scripts/cre/environment/environment/relay_supervisor_test.go @@ -3,6 +3,7 @@ package environment import ( "testing" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) @@ -11,12 +12,21 @@ func TestRelaySpecsFromConfig_AddsBootstrapPeeringPortForRemoteToLocalMixedDONs( cfg := &envconfig.Config{ NodeSets: []*cre.NodeSet{ { + Input: &ns.Input{ + Name: "workflow", + Nodes: 2, + HTTPPortRangeStart: 10100, + }, Placement: "local", NodeSpecs: []*cre.NodeSpecWithRole{ {Roles: []string{cre.BootstrapNode}}, }, }, { + Input: &ns.Input{ + Name: "capabilities", + Nodes: 1, + }, Placement: "remote", NodeSpecs: []*cre.NodeSpecWithRole{ {Roles: []string{cre.WorkerNode}}, @@ -26,15 +36,12 @@ func TestRelaySpecsFromConfig_AddsBootstrapPeeringPortForRemoteToLocalMixedDONs( } specs := relaySpecsFromConfig(cfg) - foundBootstrap := false + got := map[int]bool{} for _, spec := range specs { - if spec.Name == "ocr-bootstrap" 
&& spec.Port == 5001 { - foundBootstrap = true - break - } + got[spec.Port] = true } - if !foundBootstrap { - t.Fatalf("expected relay specs to include ocr-bootstrap:5001, got %#v", specs) + if !got[14100] || !got[14101] { + t.Fatalf("expected relay specs to include per-node OCR relay ports 14100/14101, got %#v", specs) } } @@ -52,8 +59,8 @@ func TestRelaySpecsFromConfig_DoesNotAddBootstrapWhenNoRemoteNodeSets(t *testing specs := relaySpecsFromConfig(cfg) for _, spec := range specs { - if spec.Name == "ocr-bootstrap" && spec.Port == 5001 { - t.Fatalf("did not expect ocr-bootstrap relay spec without remote nodesets, got %#v", specs) + if spec.Port == 14100 || spec.Port == 5001 { + t.Fatalf("did not expect OCR relay specs without remote nodesets, got %#v", specs) } } } diff --git a/system-tests/lib/cre/bootstrap_peer.go b/system-tests/lib/cre/bootstrap_peer.go index 05ec74dacc8..46b17ccf6f6 100644 --- a/system-tests/lib/cre/bootstrap_peer.go +++ b/system-tests/lib/cre/bootstrap_peer.go @@ -12,8 +12,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) -const placeholderHostForPortRewrite = "127.0.0.1" - func ResolveBootstrapAddress(callerTarget, bootstrapTarget, internalHost string, port int) (string, error) { if strings.TrimSpace(internalHost) == "" { return "", fmt.Errorf("bootstrap internal host is empty") @@ -63,12 +61,9 @@ func ResolveBootstrapPeerURL(callerTarget, bootstrapTarget, peerID, internalHost return trimmedPeerID + "@" + address, nil } -func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, internalHost string, port int) ([]string, error) { - if strings.TrimSpace(internalHost) == "" { - return nil, fmt.Errorf("p2p internal host is empty") - } - if port <= 0 || port > 65535 { - return nil, fmt.Errorf("invalid p2p port: %d", port) +func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, advertisedPort int) ([]string, error) { + if advertisedPort <= 0 || advertisedPort > 
65535 { + return nil, fmt.Errorf("invalid p2p announce port: %d", advertisedPort) } placement, err := connectivity.PlacementFromTarget(nodePlacement) @@ -76,34 +71,30 @@ func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, i return nil, err } - internal := net.JoinHostPort(strings.TrimSpace(internalHost), strconv.Itoa(port)) - addresses := []string{internal} - seen := map[string]struct{}{internal: {}} + addresses := []string{} add := func(addr string) { - addr = strings.TrimSpace(addr) - if addr == "" { - return + trimmed := strings.TrimSpace(addr) + if trimmed != "" { + addresses = append(addresses, trimmed) } - if _, ok := seen[addr]; ok { - return - } - seen[addr] = struct{}{} - addresses = append(addresses, addr) } switch placement { case connectivity.PlacementLocal: - if hasRemoteNodeSets { - // rewriteEndpointForRemoteCaller only uses the port for host:port inputs. - // The host here is an explicit placeholder and is not part of the final address. - bridged, bridgeErr := rewriteEndpointForRemoteCaller(net.JoinHostPort(placeholderHostForPortRewrite, strconv.Itoa(port))) - if bridgeErr != nil { - return nil, bridgeErr - } - add(bridged) + if !hasRemoteNodeSets { + // Keep AnnounceAddresses unset for local-only setups. + // This lets libocr use default/local-network behavior. + return addresses, nil } + // In mixed mode, local nodes are reached through EC2 relay listeners. + external, externalErr := resolveBootstrapExternalAddress(connectivity.PlacementRemote, advertisedPort) + if externalErr != nil { + return nil, externalErr + } + add(external) case connectivity.PlacementRemote: - external, externalErr := resolveBootstrapExternalAddress(connectivity.PlacementRemote, port) + // Remote nodes advertise direct EC2-reachable host ports. 
+ external, externalErr := resolveBootstrapExternalAddress(connectivity.PlacementRemote, advertisedPort) if externalErr != nil { return nil, externalErr } diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index b93a9ae66cd..04b08f7ecee 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -8,31 +8,42 @@ import ( ) func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { - addresses, err := ResolveP2PAnnounceAddresses("local", false, "workflow-node0", 5001) + addresses, err := ResolveP2PAnnounceAddresses("local", false, 15001) if err != nil { t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) } - if len(addresses) != 1 { - t.Fatalf("expected a single announce address, got %d (%v)", len(addresses), addresses) - } - if addresses[0] != "workflow-node0:5001" { - t.Fatalf("expected workflow-node0:5001, got %s", addresses[0]) + if len(addresses) != 0 { + t.Fatalf("expected local-only setup to leave announce addresses unset, got %v", addresses) } } func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { - addresses, err := ResolveP2PAnnounceAddresses("local", true, "bootstrap-gateway-node0", 5001) + prevMode, hadMode := os.LookupEnv(runtimecfg.EnvRemoteAccessMode) + prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) + t.Cleanup(func() { + if hadMode { + _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, prevMode) + } else { + _ = os.Unsetenv(runtimecfg.EnvRemoteAccessMode) + } + if hadIP { + _ = os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) + } else { + _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) + } + }) + _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) + _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") + + addresses, err := ResolveP2PAnnounceAddresses("local", true, 15002) if err != nil { t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) } - if len(addresses) != 2 
{ - t.Fatalf("expected two announce addresses for mixed mode, got %d (%v)", len(addresses), addresses) - } - if addresses[0] != "bootstrap-gateway-node0:5001" { - t.Fatalf("expected first address to stay internal, got %s", addresses[0]) + if len(addresses) != 1 { + t.Fatalf("expected one announce address for mixed mode, got %d (%v)", len(addresses), addresses) } - if addresses[1] != "host.docker.internal:5001" { - t.Fatalf("expected bridged host.docker.internal:5001, got %s", addresses[1]) + if addresses[0] != "10.1.2.3:15002" { + t.Fatalf("expected external EC2 address 10.1.2.3:15002, got %s", addresses[0]) } } @@ -54,18 +65,15 @@ func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") - addresses, err := ResolveP2PAnnounceAddresses("remote", true, "workflow-node0", 5001) + addresses, err := ResolveP2PAnnounceAddresses("remote", true, 16001) if err != nil { t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) } - if len(addresses) != 2 { - t.Fatalf("expected two announce addresses for remote node, got %d (%v)", len(addresses), addresses) - } - if addresses[0] != "workflow-node0:5001" { - t.Fatalf("expected first address to stay internal, got %s", addresses[0]) + if len(addresses) != 1 { + t.Fatalf("expected one announce address for remote node, got %d (%v)", len(addresses), addresses) } - if addresses[1] != "10.1.2.3:5001" { - t.Fatalf("expected external EC2 address 10.1.2.3:5001, got %s", addresses[1]) + if addresses[0] != "10.1.2.3:16001" { + t.Fatalf("expected external EC2 address 10.1.2.3:16001, got %s", addresses[0]) } } diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 2cd9f9d1a32..8a6ebddf437 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -5,8 +5,6 @@ import ( "fmt" "maps" "math/big" 
- "net" - "net/url" "slices" "strconv" "strings" @@ -27,6 +25,7 @@ import ( solcfg "github.com/smartcontractkit/chainlink-solana/pkg/solana/config" "github.com/smartcontractkit/chainlink-testing-framework/framework" chipingressset "github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose/chip_ingress_set" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink-testing-framework/lib/utils/ptr" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" @@ -35,7 +34,6 @@ import ( libc "github.com/smartcontractkit/chainlink/system-tests/lib/conversions" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creblockchains "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" @@ -49,7 +47,6 @@ func PrepareNodeTOMLs( topology *cre.Topology, creEnv *cre.Environment, nodeSets []*cre.NodeSet, - blockchainTargetBySelector map[uint64]string, capabilities []cre.InstallableCapability, // Deprecated, use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -62,11 +59,6 @@ func PrepareNodeTOMLs( if peeringErr != nil { return nil, errors.Wrap(peeringErr, "failed to find peering data") } - ocrBootstrapPlacement, placementErr := resolveBootstrapPlacement(topology, bt.UUID) - if placementErr != nil { - return nil, placementErr - } - framework.L.Info().Str("placement", strings.TrimSpace(ocrBootstrapPlacement)).Str("bootstrapNodeUUID", bt.UUID).Msg("resolved OCR bootstrap placement") localNodeSets := topology.NodeSets() chainPerSelector := 
make(map[uint64]creblockchains.Blockchain) @@ -113,18 +105,16 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, - Blockchains: chainPerSelector, - BlockchainPlacementBySelector: blockchainTargetBySelector, - OCRBootstrapPlacement: ocrBootstrapPlacement, - Flags: donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + Datastore: creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, + Blockchains: chainPerSelector, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -237,7 +227,7 @@ func generateNodeTomlConfig(input cre.GenerateConfigsInput, nodeConfigTransforme } case cre.WorkerNode: var cErr error - nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, input.OCRBootstrapPlacement, commonInputs, input.DonMetadata, nodeMetadata) + nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, commonInputs, input.DonMetadata, nodeMetadata) if cErr != nil { return nil, errors.Wrapf(cErr, "failed to add worker node config for node at index %d in DON %s", nodeIdx, input.DonMetadata.Name) } @@ -342,16 +332,18 @@ func addBootstrapNodeConfig( EnableExperimentalRageP2P: ptr.Ptr(true), } if donMetadata != nil && nodeMetadata != nil { + announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), nodeMetadata.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, 
hasRemoteNodeSets(topology), - nodeMetadata.Host, - ocrPeeringData.Port, + announcePort, ) if announceErr != nil { return existingConfig, errors.Wrap(announceErr, "failed to resolve P2P announce addresses for bootstrap node") } - existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) + if len(announceAddresses) > 0 { + existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) + } } if commonInputs.provider.IsDocker() { @@ -407,16 +399,11 @@ func addWorkerNodeConfig( existingConfig corechainlink.Config, topology *cre.Topology, ocrPeeringData cre.OCRPeeringData, - ocrBootstrapPlacement string, commonInputs *commonInputs, donMetadata *cre.DonMetadata, m *cre.NodeMetadata, ) (corechainlink.Config, error) { - bootstrapAddress, bootstrapAddressErr := cre.ResolveBootstrapAddress(donMetadata.MustNodeSet().Placement, ocrBootstrapPlacement, ocrPeeringData.OCRBootstraperHost, ocrPeeringData.Port) - if bootstrapAddressErr != nil { - return existingConfig, errors.Wrap(bootstrapAddressErr, "failed to resolve bootstrap address for worker node") - } - ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{bootstrapAddress}) + ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{ocrPeeringData.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringData.Port)}) if ocrBErr != nil { return existingConfig, errors.Wrap(ocrBErr, "failed to create OCR bootstrapper locator") } @@ -435,16 +422,18 @@ func addWorkerNodeConfig( }, EnableExperimentalRageP2P: ptr.Ptr(true), } + announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), m.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, hasRemoteNodeSets(topology), - m.Host, - ocrPeeringData.Port, + announcePort, ) if announceErr != nil { return existingConfig, errors.Wrap(announceErr, "failed to resolve P2P announce addresses for 
worker node") } - existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) + if len(announceAddresses) > 0 { + existingConfig.P2P.V2.AnnounceAddresses = ptr.Ptr(announceAddresses) + } if commonInputs.provider.IsDocker() { existingConfig.CRE.WorkflowFetcher = &coretoml.WorkflowFetcherConfig{ @@ -535,49 +524,16 @@ func addWorkerNodeConfig( if !ok { return existingConfig, fmt.Errorf("failed to get EVM key (chainID %d, node index %d)", commonInputs.registryChainID, m.Index) } - callerPlacement, placementErr := connectivity.PlacementFromTarget(donMetadata.MustNodeSet().Placement) - if placementErr != nil { - return existingConfig, placementErr - } - placementByGatewayNodeUUID, placementMapErr := gatewayPlacementByNodeUUID(topology) - if placementMapErr != nil { - return existingConfig, placementMapErr - } gateways := []coretoml.ConnectorGateway{} if topology != nil && len(topology.GatewayConnectors.Configurations) > 0 { for _, gateway := range topology.GatewayConnectors.Configurations { - gatewayPlacement, ok := placementByGatewayNodeUUID[gateway.NodeUUID] - if !ok { - return existingConfig, fmt.Errorf("failed to resolve placement for gateway node UUID %s", gateway.NodeUUID) - } - internalURL := fmt.Sprintf("ws://%s:%d%s", gateway.Outgoing.Host, gateway.Outgoing.Port, gateway.Outgoing.Path) - externalURL := gatewayExternalConnectorURL(gateway) - resolvedGateway, err := connectivity.ResolveAndEnsureReachable( - context.Background(), - callerPlacement, - gatewayPlacement, - connectivity.EndpointPair{ - Name: fmt.Sprintf("gateway-%s", gateway.AuthGatewayID), - Internal: internalURL, - External: externalURL, - }, - // Bridge creation for remote->local gateway is handled outside config generation. 
- func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }, - ) - if err != nil { - return existingConfig, err - } - if resolvedGateway.RequiresBridge { - bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedGateway.URL) - if bridgeErr != nil { - return existingConfig, bridgeErr - } - resolvedGateway.URL = bridgeURL - } gateways = append(gateways, coretoml.ConnectorGateway{ - ID: ptr.Ptr(gateway.AuthGatewayID), - URL: ptr.Ptr(resolvedGateway.URL), + ID: ptr.Ptr(gateway.AuthGatewayID), + URL: ptr.Ptr(fmt.Sprintf("ws://%s:%d%s", + gateway.Outgoing.Host, + gateway.Outgoing.Port, + gateway.Outgoing.Path)), }) } @@ -693,10 +649,7 @@ func gatherCommonInputs(input cre.GenerateConfigsInput) (*commonInputs, error) { return nil, errors.Wrap(homeErr, "failed to get home chain ID") } - evmChains, evmErr := findEVMChains(input) - if evmErr != nil { - return nil, errors.Wrap(evmErr, "failed to resolve EVM chain endpoints for node config") - } + evmChains := findEVMChains(input) solanaChain, solErr := findOneSolanaChain(input) if solErr != nil { return nil, errors.Wrap(solErr, "failed to find Solana chain in the environment configuration") @@ -729,12 +682,8 @@ type evmChain struct { WSRPC string } -func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { +func findEVMChains(input cre.GenerateConfigsInput) []*evmChain { evmChains := make([]*evmChain, 0) - callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) - if err != nil { - return nil, err - } for chainSelector, bcOut := range input.Blockchains { if bcOut.IsFamily(chain_selectors.FamilySolana) { continue @@ -745,49 +694,14 @@ func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { continue } - targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainPlacementBySelector[chainSelector]) - if err != nil { - return nil, err - } - resolvedHTTP, err := 
connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ - Name: fmt.Sprintf("evm-http-%d", bcOut.ChainID()), - Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, - External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) - if err != nil { - return nil, err - } - if resolvedHTTP.RequiresBridge { - bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedHTTP.URL) - if bridgeErr != nil { - return nil, fmt.Errorf("bridge url rewrite failed for node->blockchain HTTP endpoint on chain %d: %w", bcOut.ChainID(), bridgeErr) - } - resolvedHTTP.URL = bridgeURL - } - resolvedWS, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ - Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), - Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, - External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) - if err != nil { - return nil, err - } - if resolvedWS.RequiresBridge { - bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedWS.URL) - if bridgeErr != nil { - return nil, fmt.Errorf("bridge url rewrite failed for node->blockchain WS endpoint on chain %d: %w", bcOut.ChainID(), bridgeErr) - } - resolvedWS.URL = bridgeURL - } - evmChains = append(evmChains, &evmChain{ Name: fmt.Sprintf("node-%d", chainSelector), ChainID: bcOut.ChainID(), - HTTPRPC: resolvedHTTP.URL, - WSRPC: resolvedWS.URL, + HTTPRPC: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + WSRPC: bcOut.CtfOutput().Nodes[0].InternalWSUrl, }) } - return evmChains, nil + return evmChains } type solanaChain struct { @@ -799,10 +713,6 @@ type solanaChain struct { func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { var solChain *solanaChain chainsFound := 0 - callerPlacement, err := 
connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) - if err != nil { - return nil, err - } for _, bcOut := range input.Blockchains { if !bcOut.IsFamily(chain_selectors.FamilySolana) { @@ -815,25 +725,6 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { } solBc := bcOut.(*solana.Blockchain) - targetPlacement, err := connectivity.PlacementFromTarget(input.BlockchainPlacementBySelector[solBc.ChainSelector()]) - if err != nil { - return nil, err - } - resolvedNodeURL, err := connectivity.ResolveAndEnsureReachable(context.Background(), callerPlacement, targetPlacement, connectivity.EndpointPair{ - Name: "solana-rpc", - Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, - External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, - }, func(_ context.Context, _ connectivity.EndpointPair, _ int) error { return nil }) - if err != nil { - return nil, err - } - if resolvedNodeURL.RequiresBridge { - bridgeURL, bridgeErr := rewriteEndpointForRemoteCaller(resolvedNodeURL.URL) - if bridgeErr != nil { - return nil, fmt.Errorf("bridge url rewrite failed for node->solana RPC endpoint: %w", bridgeErr) - } - resolvedNodeURL.URL = bridgeURL - } ctx, cancelFn := context.WithTimeout(context.Background(), 15*time.Second) chainID, err := solBc.SolClient.GetGenesisHash(ctx) @@ -846,120 +737,13 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { solChain = &solanaChain{ Name: fmt.Sprintf("node-%d", solBc.ChainSelector()), ChainID: chainID.String(), - NodeURL: resolvedNodeURL.URL, + NodeURL: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, } } return solChain, nil } -func gatewayPlacementByNodeUUID(topology *cre.Topology) (map[string]connectivity.Placement, error) { - out := make(map[string]connectivity.Placement) - if topology == nil { - return out, nil - } - for _, don := range topology.DonsMetadata.List() { - placement, err := connectivity.PlacementFromTarget(don.MustNodeSet().Placement) - if err != nil { - 
return nil, err - } - for _, node := range don.NodesMetadata { - if node == nil || strings.TrimSpace(node.UUID) == "" { - continue - } - out[node.UUID] = placement - } - } - return out, nil -} - -func hasRemoteNodeSets(topology *cre.Topology) bool { - if topology == nil { - return false - } - for _, don := range topology.DonsMetadata.List() { - if don == nil || don.MustNodeSet() == nil { - continue - } - if strings.EqualFold(strings.TrimSpace(don.MustNodeSet().Placement), string(connectivity.PlacementRemote)) { - return true - } - } - return false -} - -func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { - if topology == nil { - return "", fmt.Errorf("topology is nil") - } - bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) - if bootstrapNodeUUID == "" { - return "", fmt.Errorf("bootstrap node UUID is empty") - } - for _, don := range topology.DonsMetadata.List() { - if don == nil { - continue - } - for _, node := range don.NodesMetadata { - if node == nil || strings.TrimSpace(node.UUID) == "" { - continue - } - if node.UUID != bootstrapNodeUUID { - continue - } - return strings.TrimSpace(don.MustNodeSet().Placement), nil - } - } - return "", fmt.Errorf("failed to resolve bootstrap placement for node UUID %s", bootstrapNodeUUID) -} - -func gatewayExternalConnectorURL(gateway *cre.DonGatewayConfiguration) string { - if gateway == nil || gateway.GatewayConfiguration == nil { - return "" - } - scheme := "ws" - switch strings.ToLower(strings.TrimSpace(gateway.Incoming.Protocol)) { - case "https": - scheme = "wss" - case "wss": - scheme = "wss" - case "http": - scheme = "ws" - } - path := strings.TrimSpace(gateway.Incoming.Path) - if path == "" || path == "/" { - path = "/node" - } else if !strings.HasSuffix(path, "/node") { - path = strings.TrimRight(path, "/") + "/node" - } - return fmt.Sprintf("%s://%s:%d%s", scheme, gateway.Incoming.Host, gateway.Incoming.ExternalPort, path) -} - -func 
rewriteEndpointForRemoteCaller(raw string) (string, error) { - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - trimmed := strings.TrimSpace(raw) - if trimmed == "" { - return "", fmt.Errorf("endpoint is empty") - } - if strings.Contains(trimmed, "://") { - parsed, err := url.Parse(trimmed) - if err != nil { - return "", fmt.Errorf("parse url %q: %w", raw, err) - } - if parsed.Port() != "" { - parsed.Host = net.JoinHostPort(dockerHost, parsed.Port()) - return parsed.String(), nil - } - parsed.Host = dockerHost - return parsed.String(), nil - } - _, port, err := net.SplitHostPort(trimmed) - if err != nil { - return "", fmt.Errorf("parse host:port %q: %w", raw, err) - } - return net.JoinHostPort(dockerHost, port), nil -} - func buildTronEVMConfig(evmChain *evmChain) evmconfigtoml.EVMConfig { tronRPC := strings.Replace(evmChain.HTTPRPC, "jsonrpc", "wallet", 1) return evmconfigtoml.EVMConfig{ @@ -1008,7 +792,7 @@ func appendEVMChain(existingConfig *evmconfigtoml.EVMConfigs, evmChain *evmChain // add only unconfigured chains, since other roles might have already added some chains for _, existingEVM := range *existingConfig { - if existingEVM.ChainID.Cmp(chainlinkbig.New(big.NewInt(libc.MustSafeInt64(evmChain.ChainID)))) == 0 { + if existingEVM.ChainID.ToInt().Cmp(big.NewInt(libc.MustSafeInt64(evmChain.ChainID))) == 0 { return } } @@ -1035,6 +819,39 @@ func appendSolanaChain(existingConfig *solcfg.TOMLConfigs, solChain *solanaChain }) } +func hasRemoteNodeSets(topology *cre.Topology) bool { + if topology == nil { + return false + } + for _, nodeSet := range topology.NodeSets() { + if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Placement), "remote") { + return true + } + } + return false +} + +func resolveNodeOCR2AnnouncePort(nodeSet *cre.NodeSet, nodeIndex int) int { + base := 0 + if nodeSet != nil { + base = nodeSet.OCR2P2PRangeStart + if base == 0 { + httpStart := nodeSet.HTTPPortRangeStart + if httpStart == 0 { + 
httpStart = ns.DefaultHTTPPortStaticRangeStart + } + base = httpStart + (ns.DefaultOCR2P2PStaticRangeStart - ns.DefaultHTTPPortStaticRangeStart) + } + } + if base == 0 { + base = ns.DefaultOCR2P2PStaticRangeStart + } + if nodeIndex < 0 { + nodeIndex = 0 + } + return base + nodeIndex +} + // transformAdditionalSourceURLs transforms URLs in AdditionalSourcesConfig to use // platform-specific Docker host addresses. This handles differences between macOS // (host.docker.internal) and Linux (172.17.0.1 or similar) Docker host resolution. diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 2c2034fe83e..9aa3612daa1 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -233,14 +233,12 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } - blockchainPlacementBySelector := blockchainPlacementsBySelector(input.Blockchains, deployedBlockchains.Outputs) updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, topology, creEnvironment, input.NodeSets, - blockchainPlacementBySelector, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -493,21 +491,6 @@ func SetupTestEnvironment( }, nil } -func blockchainPlacementsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { - bySelector := make(map[uint64]string, len(deployed)) - for idx, blockchainCfg := range configured { - if blockchainCfg == nil { - continue - } - if idx >= len(deployed) || deployed[idx] == nil { - continue - } - selector := deployed[idx].ChainSelector() - bySelector[selector] = string(blockchainCfg.Placement) - } - return bySelector -} - func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, blockchains []blockchains.Blockchain, jdOutput *jd.Output) { // append the nodeset output, so that later it can be stored in the cached output, so that we can use the 
environment again without running setup for idx, nsOut := range nodeSetOutput { diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index 4161c743426..3a59bc63a3e 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -28,6 +28,7 @@ require ( github.com/gagliardetto/solana-go v1.13.0 github.com/goccy/go-yaml v1.18.0 github.com/google/uuid v1.6.0 + github.com/gorilla/websocket v1.5.3 github.com/jmoiron/sqlx v1.4.0 github.com/pelletier/go-toml/v2 v2.2.4 github.com/pkg/errors v0.9.1 @@ -285,7 +286,6 @@ require ( github.com/gorilla/mux v1.8.1 // indirect github.com/gorilla/securecookie v1.1.2 // indirect github.com/gorilla/sessions v1.2.2 // indirect - github.com/gorilla/websocket v1.5.3 // indirect github.com/grafana/pyroscope-go v1.2.7 // indirect github.com/grafana/pyroscope-go/godeltaprof v0.1.9 // indirect github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect From a47d2eef634f838de285a463039221221e5160b2 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 24 Feb 2026 11:11:18 +0100 Subject: [PATCH 15/34] mixed-mode WIP#2 --- system-tests/lib/cre/bootstrap_peer.go | 27 ++- system-tests/lib/cre/bootstrap_peer_test.go | 18 +- system-tests/lib/cre/don/config/config.go | 187 ++++++++++++++++-- .../lib/cre/environment/environment.go | 17 ++ .../lib/cre/runtimecfg/access_mode.go | 60 ++++++ system-tests/lib/cre/types.go | 4 + 6 files changed, 286 insertions(+), 27 deletions(-) diff --git a/system-tests/lib/cre/bootstrap_peer.go b/system-tests/lib/cre/bootstrap_peer.go index 46b17ccf6f6..48181e8e5bb 100644 --- a/system-tests/lib/cre/bootstrap_peer.go +++ b/system-tests/lib/cre/bootstrap_peer.go @@ -72,21 +72,31 @@ func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, a } addresses := []string{} + seen := map[string]struct{}{} add := func(addr string) { trimmed := strings.TrimSpace(addr) - if trimmed != "" { - addresses = append(addresses, trimmed) + if trimmed == "" { + return } + if _, exists 
:= seen[trimmed]; exists { + return + } + seen[trimmed] = struct{}{} + addresses = append(addresses, trimmed) } switch placement { case connectivity.PlacementLocal: if !hasRemoteNodeSets { - // Keep AnnounceAddresses unset for local-only setups. - // This lets libocr use default/local-network behavior. + // Keep local announce addresses unset for local-only topologies. return addresses, nil } - // In mixed mode, local nodes are reached through EC2 relay listeners. + localHostIP, localErr := resolveLocalAnnounceHostIP() + if localErr != nil { + return nil, localErr + } + add(net.JoinHostPort(localHostIP, strconv.Itoa(advertisedPort))) + // In mixed mode, remote peers must reach local nodes through EC2 relay listeners. external, externalErr := resolveBootstrapExternalAddress(connectivity.PlacementRemote, advertisedPort) if externalErr != nil { return nil, externalErr @@ -106,6 +116,13 @@ func ResolveP2PAnnounceAddresses(nodePlacement string, hasRemoteNodeSets bool, a return addresses, nil } +func resolveLocalAnnounceHostIP() (string, error) { + if hostIP := strings.TrimSpace(runtimecfg.LocalHostIP()); hostIP != "" { + return hostIP, nil + } + return "", fmt.Errorf("failed to auto-resolve local docker-host gateway IP for mixed local/remote P2P announce; set %s to override", runtimecfg.EnvLocalHostIP) +} + func resolveBootstrapExternalAddress(targetPlacement connectivity.Placement, port int) (string, error) { if targetPlacement == connectivity.PlacementLocal { return net.JoinHostPort("127.0.0.1", strconv.Itoa(port)), nil diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index 04b08f7ecee..203ac0d3d2b 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -20,6 +20,7 @@ func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { prevMode, hadMode := 
os.LookupEnv(runtimecfg.EnvRemoteAccessMode) prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) + prevLocalIP, hadLocalIP := os.LookupEnv(runtimecfg.EnvLocalHostIP) t.Cleanup(func() { if hadMode { _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, prevMode) @@ -31,19 +32,28 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { } else { _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) } + if hadLocalIP { + _ = os.Setenv(runtimecfg.EnvLocalHostIP, prevLocalIP) + } else { + _ = os.Unsetenv(runtimecfg.EnvLocalHostIP) + } }) _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") + _ = os.Setenv(runtimecfg.EnvLocalHostIP, "192.168.1.10") addresses, err := ResolveP2PAnnounceAddresses("local", true, 15002) if err != nil { t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) } - if len(addresses) != 1 { - t.Fatalf("expected one announce address for mixed mode, got %d (%v)", len(addresses), addresses) + if len(addresses) != 2 { + t.Fatalf("expected two announce addresses for mixed mode, got %d (%v)", len(addresses), addresses) + } + if addresses[0] != "192.168.1.10:15002" { + t.Fatalf("expected local host address 192.168.1.10:15002, got %s", addresses[0]) } - if addresses[0] != "10.1.2.3:15002" { - t.Fatalf("expected external EC2 address 10.1.2.3:15002, got %s", addresses[0]) + if addresses[1] != "10.1.2.3:15002" { + t.Fatalf("expected external EC2 address 10.1.2.3:15002, got %s", addresses[1]) } } diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 8a6ebddf437..6f91b0437fa 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -5,6 +5,7 @@ import ( "fmt" "maps" "math/big" + "net" "slices" "strconv" "strings" @@ -34,9 +35,11 @@ import ( libc "github.com/smartcontractkit/chainlink/system-tests/lib/conversions" 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creblockchains "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -47,6 +50,7 @@ func PrepareNodeTOMLs( topology *cre.Topology, creEnv *cre.Environment, nodeSets []*cre.NodeSet, + blockchainPlacementBySelector map[uint64]string, capabilities []cre.InstallableCapability, // Deprecated, use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -59,6 +63,14 @@ func PrepareNodeTOMLs( if peeringErr != nil { return nil, errors.Wrap(peeringErr, "failed to find peering data") } + ocrBootstrapPlacement, placementErr := resolveBootstrapPlacement(topology, bt.UUID) + if placementErr != nil { + return nil, placementErr + } + ocrBootstrapAnnouncePort, announcePortErr := resolveBootstrapAnnouncePort(topology, bt.UUID) + if announcePortErr != nil { + return nil, announcePortErr + } localNodeSets := topology.NodeSets() chainPerSelector := make(map[uint64]creblockchains.Blockchain) @@ -105,16 +117,19 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, - Blockchains: chainPerSelector, - Flags: donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + Datastore: 
creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, + Blockchains: chainPerSelector, + BlockchainPlacementBySelector: blockchainPlacementBySelector, + OCRBootstrapPlacement: ocrBootstrapPlacement, + OCRBootstrapAnnouncePort: ocrBootstrapAnnouncePort, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -227,7 +242,7 @@ func generateNodeTomlConfig(input cre.GenerateConfigsInput, nodeConfigTransforme } case cre.WorkerNode: var cErr error - nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, commonInputs, input.DonMetadata, nodeMetadata) + nodeConfig, cErr = addWorkerNodeConfig(nodeConfig, input.Topology, input.OCRPeeringData, input.OCRBootstrapPlacement, input.OCRBootstrapAnnouncePort, commonInputs, input.DonMetadata, nodeMetadata) if cErr != nil { return nil, errors.Wrapf(cErr, "failed to add worker node config for node at index %d in DON %s", nodeIdx, input.DonMetadata.Name) } @@ -399,11 +414,23 @@ func addWorkerNodeConfig( existingConfig corechainlink.Config, topology *cre.Topology, ocrPeeringData cre.OCRPeeringData, + ocrBootstrapPlacement string, + ocrBootstrapAnnouncePort int, commonInputs *commonInputs, donMetadata *cre.DonMetadata, m *cre.NodeMetadata, ) (corechainlink.Config, error) { - ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{ocrPeeringData.OCRBootstraperHost + ":" + strconv.Itoa(ocrPeeringData.Port)}) + bootstrapAddress, bootstrapAddressErr := resolveNodeFacingBootstrapAddress( + donMetadata.MustNodeSet().Placement, + ocrBootstrapPlacement, + ocrPeeringData.OCRBootstraperHost, + ocrPeeringData.Port, + ocrBootstrapAnnouncePort, + ) + if bootstrapAddressErr != nil { + return existingConfig, 
errors.Wrap(bootstrapAddressErr, "failed to resolve OCR bootstrapper address") + } + ocrBoostrapperLocator, ocrBErr := commontypes.NewBootstrapperLocator(ocrPeeringData.OCRBootstraperPeerID, []string{bootstrapAddress}) if ocrBErr != nil { return existingConfig, errors.Wrap(ocrBErr, "failed to create OCR bootstrapper locator") } @@ -649,7 +676,10 @@ func gatherCommonInputs(input cre.GenerateConfigsInput) (*commonInputs, error) { return nil, errors.Wrap(homeErr, "failed to get home chain ID") } - evmChains := findEVMChains(input) + evmChains, evmErr := findEVMChains(input) + if evmErr != nil { + return nil, errors.Wrap(evmErr, "failed to resolve EVM chain endpoints for node config") + } solanaChain, solErr := findOneSolanaChain(input) if solErr != nil { return nil, errors.Wrap(solErr, "failed to find Solana chain in the environment configuration") @@ -682,8 +712,12 @@ type evmChain struct { WSRPC string } -func findEVMChains(input cre.GenerateConfigsInput) []*evmChain { +func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { evmChains := make([]*evmChain, 0) + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) + if err != nil { + return nil, err + } for chainSelector, bcOut := range input.Blockchains { if bcOut.IsFamily(chain_selectors.FamilySolana) { continue @@ -694,14 +728,38 @@ func findEVMChains(input cre.GenerateConfigsInput) []*evmChain { continue } + targetPlacementRaw, ok := input.BlockchainPlacementBySelector[chainSelector] + if !ok || strings.TrimSpace(targetPlacementRaw) == "" { + targetPlacementRaw = string(connectivity.PlacementLocal) + } + targetPlacement, err := connectivity.PlacementFromTarget(targetPlacementRaw) + if err != nil { + return nil, err + } + resolvedHTTP, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("evm-http-%d", bcOut.ChainID()), + Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + External: 
bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, + }) + if err != nil { + return nil, err + } + resolvedWS, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), + Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, + }) + if err != nil { + return nil, err + } evmChains = append(evmChains, &evmChain{ Name: fmt.Sprintf("node-%d", chainSelector), ChainID: bcOut.ChainID(), - HTTPRPC: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, - WSRPC: bcOut.CtfOutput().Nodes[0].InternalWSUrl, + HTTPRPC: resolvedHTTP.URL, + WSRPC: resolvedWS.URL, }) } - return evmChains + return evmChains, nil } type solanaChain struct { @@ -713,6 +771,10 @@ type solanaChain struct { func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { var solChain *solanaChain chainsFound := 0 + callerPlacement, err := connectivity.PlacementFromTarget(input.DonMetadata.MustNodeSet().Placement) + if err != nil { + return nil, err + } for _, bcOut := range input.Blockchains { if !bcOut.IsFamily(chain_selectors.FamilySolana) { @@ -725,6 +787,22 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { } solBc := bcOut.(*solana.Blockchain) + targetPlacementRaw, ok := input.BlockchainPlacementBySelector[solBc.ChainSelector()] + if !ok || strings.TrimSpace(targetPlacementRaw) == "" { + targetPlacementRaw = string(connectivity.PlacementLocal) + } + targetPlacement, err := connectivity.PlacementFromTarget(targetPlacementRaw) + if err != nil { + return nil, err + } + resolvedNodeURL, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: "solana-rpc", + Internal: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalHTTPUrl, + }) + if err != nil { + return nil, err + } ctx, cancelFn := context.WithTimeout(context.Background(), 15*time.Second) chainID, err := 
solBc.SolClient.GetGenesisHash(ctx) @@ -737,7 +815,7 @@ func findOneSolanaChain(input cre.GenerateConfigsInput) (*solanaChain, error) { solChain = &solanaChain{ Name: fmt.Sprintf("node-%d", solBc.ChainSelector()), ChainID: chainID.String(), - NodeURL: bcOut.CtfOutput().Nodes[0].InternalHTTPUrl, + NodeURL: resolvedNodeURL.URL, } } @@ -852,6 +930,79 @@ func resolveNodeOCR2AnnouncePort(nodeSet *cre.NodeSet, nodeIndex int) int { return base + nodeIndex } +func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { + if topology == nil { + return "", fmt.Errorf("topology is nil") + } + bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) + if bootstrapNodeUUID == "" { + return "", fmt.Errorf("bootstrap node UUID is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != bootstrapNodeUUID { + continue + } + return strings.TrimSpace(don.MustNodeSet().Placement), nil + } + } + return "", fmt.Errorf("failed to resolve bootstrap placement for node UUID %s", bootstrapNodeUUID) +} + +func resolveBootstrapAnnouncePort(topology *cre.Topology, bootstrapNodeUUID string) (int, error) { + if topology == nil { + return 0, fmt.Errorf("topology is nil") + } + bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) + if bootstrapNodeUUID == "" { + return 0, fmt.Errorf("bootstrap node UUID is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != bootstrapNodeUUID { + continue + } + return resolveNodeOCR2AnnouncePort(don.MustNodeSet(), node.Index), nil + } + } + return 0, fmt.Errorf("failed to resolve bootstrap announce port for node UUID %s", bootstrapNodeUUID) +} + +func 
resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost string, internalPort, externalPort int) (string, error) { + caller, err := connectivity.PlacementFromTarget(callerPlacement) + if err != nil { + return "", err + } + target, err := connectivity.PlacementFromTarget(bootstrapPlacement) + if err != nil { + return "", err + } + // Local callers need EC2-host reachable port for remote bootstrap nodes. + if caller == connectivity.PlacementLocal && target == connectivity.PlacementRemote { + if !runtimecfg.IsDirectMode() { + return "", fmt.Errorf("mixed DON bootstrap resolution requires direct mode") + } + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err + } + return net.JoinHostPort(hostIP, strconv.Itoa(externalPort)), nil + } + return cre.ResolveBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost, internalPort) +} + // transformAdditionalSourceURLs transforms URLs in AdditionalSourcesConfig to use // platform-specific Docker host addresses. This handles differences between macOS // (host.docker.internal) and Linux (172.17.0.1 or similar) Docker host resolution. 
diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 9aa3612daa1..a836843ef59 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -233,12 +233,14 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } + blockchainPlacementBySelector := blockchainPlacementsBySelector(input.Blockchains, deployedBlockchains.Outputs) updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, topology, creEnvironment, input.NodeSets, + blockchainPlacementBySelector, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -507,6 +509,21 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, input.JdInput.Out = jdOutput } +func blockchainPlacementsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { + bySelector := make(map[uint64]string, len(deployed)) + for idx, blockchainCfg := range configured { + if blockchainCfg == nil { + continue + } + if idx >= len(deployed) || deployed[idx] == nil { + continue + } + selector := deployed[idx].ChainSelector() + bySelector[selector] = string(blockchainCfg.Placement) + } + return bySelector +} + type nodeSetPlacementSummary struct { HasLocalTargets bool HasRemoteTargets bool diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go index cf8b0e0cc13..ffe677aecef 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode.go +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -2,16 +2,21 @@ package runtimecfg import ( "context" + "encoding/json" "fmt" + "net" "os" "os/exec" "strings" "time" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" ) const ( EnvRemoteAccessMode = "CRE_REMOTE_ACCESS_MODE" EnvEC2HostIP = "CRE_EC2_HOST_IP" + EnvLocalHostIP = "CRE_LOCAL_HOST_IP" EnvEC2InstanceID = "CRE_EC2_INSTANCE_ID" EnvAWSProfile 
= "CRE_AWS_PROFILE" @@ -48,6 +53,61 @@ func DirectHostIP() (string, error) { return discoverEC2HostIP(instanceID) } +func LocalHostIP() string { + raw := strings.TrimSpace(os.Getenv(EnvLocalHostIP)) + if ip := net.ParseIP(raw); ip != nil { + return ip.String() + } + // Best-effort ensure the default CTF network exists before gateway discovery. + // This avoids startup-order coupling where announce resolution runs before first container start. + _ = framework.DefaultNetwork(nil) + if gatewayIP := discoverDockerNetworkGatewayIP(framework.DefaultNetworkName); gatewayIP != "" { + return gatewayIP + } + ips, err := net.LookupIP("host.docker.internal") + if err != nil { + return "" + } + for _, ip := range ips { + if ipv4 := ip.To4(); ipv4 != nil { + return ipv4.String() + } + } + return "" +} + +func discoverDockerNetworkGatewayIP(networkName string) string { + name := strings.TrimSpace(networkName) + if name == "" { + return "" + } + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + out, err := exec.CommandContext(ctx, "docker", "network", "inspect", name).Output() + if err != nil { + return "" + } + var inspected []struct { + IPAM struct { + Config []struct { + Gateway string `json:"Gateway"` + } `json:"Config"` + } `json:"IPAM"` + } + if jsonErr := json.Unmarshal(out, &inspected); jsonErr != nil { + return "" + } + for _, netCfg := range inspected { + for _, ipamCfg := range netCfg.IPAM.Config { + if ip := net.ParseIP(strings.TrimSpace(ipamCfg.Gateway)); ip != nil && ip.To4() != nil { + return ip.String() + } + } + } + return "" +} + func ResolveAWSCLIProfileSelection() (string, string) { if hasStaticAWSKeys() { return "", "env-creds" diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index c8989435f0c..a0dbf35751f 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -458,6 +458,7 @@ type GenerateConfigsInput struct { Blockchains map[uint64]blockchains.Blockchain 
BlockchainPlacementBySelector map[uint64]string OCRBootstrapPlacement string + OCRBootstrapAnnouncePort int RegistryChainSelector uint64 Flags []string CapabilitiesPeeringData CapabilitiesPeeringData @@ -480,6 +481,9 @@ func (g *GenerateConfigsInput) Validate() error { if strings.TrimSpace(g.OCRBootstrapPlacement) == "" { return errors.New("ocr bootstrap placement not set") } + if g.OCRBootstrapAnnouncePort <= 0 || g.OCRBootstrapAnnouncePort > 65535 { + return errors.New("ocr bootstrap announce port not set") + } if len(g.Flags) == 0 { return errors.New("flags not set") } From 1d406bf2fa2bc9116e6d4de45ece6bd342fdfac0 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 24 Feb 2026 16:40:32 +0100 Subject: [PATCH 16/34] working version on which I executed PoR V1 workflow using remote capabilities from capabilities don in EC2 --- .../configs/workflow-gateway-don-mixed.toml | 61 +++++------ system-tests/lib/cre/don/config/config.go | 100 +++++++++++++++--- system-tests/lib/cre/environment/dons.go | 8 +- system-tests/tests/test-helpers/t_helpers.go | 2 +- 4 files changed, 123 insertions(+), 48 deletions(-) diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml index 8dd7ef6d6fc..2146731fdc8 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml @@ -37,36 +37,6 @@ # either "docker" or "kubernetes" type = "docker" -[[nodesets]] - nodes = 1 - name = "bootstrap-gateway" - don_types = ["bootstrap", "gateway"] - override_mode = "each" - http_port_range_start = 10000 - ocr2_p2p_port_range_start = 10050 - placement = "remote" - remote_start_policy = "always" - - env_vars = { CL_EVM_CMD = "" } - supported_evm_chains = [1337, 2337] - - [nodesets.db] - image = "postgres:12.0" - port = 13200 - - [[nodesets.node_specs]] - roles = ["bootstrap", "gateway"] - [nodesets.node_specs.node] - 
#ocker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-amd:latest" - # 5002 is the web API capabilities port for incoming requests - # 15002 is the vault port for incoming requests - custom_ports = ["5002:5002","15002:15002"] - # image = "chainlink-tmp:latest" - user_config_overrides = "" - [[nodesets]] nodes = 4 name = "workflow" @@ -122,3 +92,34 @@ #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } image = "chainlink-amd:latest" user_config_overrides = "" + +[[nodesets]] + nodes = 1 + name = "bootstrap-gateway" + don_types = ["bootstrap", "gateway"] + override_mode = "each" + http_port_range_start = 10000 + ocr2_p2p_port_range_start = 10050 + placement = "remote" + remote_start_policy = "always" + + env_vars = { CL_EVM_CMD = "" } + supported_evm_chains = [1337, 2337] + + [nodesets.db] + image = "postgres:12.0" + port = 13200 + + [[nodesets.node_specs]] + roles = ["bootstrap", "gateway"] + [nodesets.node_specs.node] + #ocker_ctx = "../../../.." 
+ #docker_file = "core/chainlink.Dockerfile" + #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + image = "chainlink-amd:latest" + # 5002 is the web API capabilities port for incoming requests + # 5003 is the gateway port for outgoing connections + # 15002 is the vault port for incoming requests + custom_ports = ["5002:5002","5003:5003","15002:15002"] + # image = "chainlink-tmp:latest" + user_config_overrides = "" \ No newline at end of file diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 6f91b0437fa..ba3decf01b6 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -117,19 +117,19 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, + Datastore: creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, Blockchains: chainPerSelector, BlockchainPlacementBySelector: blockchainPlacementBySelector, OCRBootstrapPlacement: ocrBootstrapPlacement, OCRBootstrapAnnouncePort: ocrBootstrapAnnouncePort, - Flags: donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -555,12 +555,13 @@ func addWorkerNodeConfig( gateways := []coretoml.ConnectorGateway{} if topology != nil && len(topology.GatewayConnectors.Configurations) > 0 { for _, gateway := range topology.GatewayConnectors.Configurations { + connectorURL, urlErr := 
resolveGatewayConnectorURL(donMetadata.MustNodeSet().Placement, topology, gateway) + if urlErr != nil { + return existingConfig, errors.Wrap(urlErr, "failed to resolve gateway connector url") + } gateways = append(gateways, coretoml.ConnectorGateway{ - ID: ptr.Ptr(gateway.AuthGatewayID), - URL: ptr.Ptr(fmt.Sprintf("ws://%s:%d%s", - gateway.Outgoing.Host, - gateway.Outgoing.Port, - gateway.Outgoing.Path)), + ID: ptr.Ptr(gateway.AuthGatewayID), + URL: ptr.Ptr(connectorURL), }) } @@ -1003,6 +1004,77 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot return cre.ResolveBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost, internalPort) } +func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topology, gateway *cre.DonGatewayConfiguration) (string, error) { + if gateway == nil || gateway.GatewayConfiguration == nil { + return "", fmt.Errorf("gateway configuration is nil") + } + callerPlacement, err := connectivity.PlacementFromTarget(callerPlacementRaw) + if err != nil { + return "", err + } + targetPlacement, err := resolveNodePlacement(topology, gateway.NodeUUID) + if err != nil { + return "", err + } + + internalURL := fmt.Sprintf("ws://%s:%d%s", gateway.Outgoing.Host, gateway.Outgoing.Port, gateway.Outgoing.Path) + + externalHost, err := gatewayExternalHost(targetPlacement) + if err != nil { + return "", err + } + externalURL := fmt.Sprintf("ws://%s:%d%s", externalHost, gateway.Outgoing.Port, gateway.Outgoing.Path) + + resolved, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("gateway-%s-outgoing", gateway.AuthGatewayID), + Internal: internalURL, + External: externalURL, + }) + if err != nil { + return "", err + } + return resolved.URL, nil +} + +func resolveNodePlacement(topology *cre.Topology, nodeUUID string) (connectivity.Placement, error) { + if topology == nil { + return "", fmt.Errorf("topology is nil") + } + trimmedUUID := 
strings.TrimSpace(nodeUUID) + if trimmedUUID == "" { + return "", fmt.Errorf("node uuid is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != trimmedUUID { + continue + } + return connectivity.PlacementFromTarget(don.MustNodeSet().Placement) + } + } + return "", fmt.Errorf("failed to resolve placement for node uuid %s", trimmedUUID) +} + +func gatewayExternalHost(targetPlacement connectivity.Placement) (string, error) { + switch targetPlacement { + case connectivity.PlacementRemote: + if !runtimecfg.IsDirectMode() { + return "", fmt.Errorf("gateway connector resolution for remote targets requires direct mode") + } + return runtimecfg.DirectHostIP() + case connectivity.PlacementLocal: + return strings.TrimPrefix(framework.HostDockerInternal(), "http://"), nil + default: + return "", fmt.Errorf("unsupported gateway placement: %s", targetPlacement) + } +} + // transformAdditionalSourceURLs transforms URLs in AdditionalSourcesConfig to use // platform-specific Docker host addresses. This handles differences between macOS // (host.docker.internal) and Linux (172.17.0.1 or similar) Docker host resolution. 
diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index bb8baf4ebdf..7f3a6455378 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -82,13 +82,15 @@ func StartDONs( } } - // Skip binary operations for Kubernetes (binaries are in the cluster images) - if infraInput.IsDocker() && !hasRemoteNodeSets(nodeSets) { - // TODO in the future check here if don is remote and skip if it is instead of !hasRemoteNodeSets() + // Skip binary operations for Kubernetes (binaries are in the cluster images) and for remote DONs + if infraInput.IsDocker() { for donIdx, donMetadata := range topology.DonsMetadata.List() { if !copyCapabilityBinaries { continue } + if donMetadata.MustNodeSet().Placement == string(config.PlacementRemote) { + continue + } customBinariesPaths := make(map[cre.CapabilityFlag]string) for flag, config := range capabilityConfigs { diff --git a/system-tests/tests/test-helpers/t_helpers.go b/system-tests/tests/test-helpers/t_helpers.go index 4f0e3889d43..bd7c850cc71 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -679,7 +679,7 @@ func CompileAndDeployWorkflow[T WorkflowConfig](t *testing.T, WorkflowRegistryAddr: common.HexToAddress(workflowRegistryAddress.Address), WorkflowRegistryVersion: workflowRegistryAddress.Version, ChainID: registryChainSelector, - DonID: testEnv.Dons.List()[0].ID, + DonID: workflowDONs[0].ID, //TODO think how to make this more robust, we are naively assuming that the first workflow DON is the one we want to register the workflow for ContainerTargetDir: creworkflow.DefaultWorkflowTargetDir, Blockchains: testEnv.CreEnvironment.Blockchains, } From 727028ab2b7ddc16249ac656c8b27afeb83dac3a Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 24 Feb 2026 17:48:14 +0100 Subject: [PATCH 17/34] simplification: remove ssm agent mode and a couple of env vars --- 
.../environment/environment/environment.go | 2 - .../environment/relay_supervisor.go | 12 +- .../environment/environment/remote_state.go | 4 - system-tests/lib/cre/bootstrap_peer_test.go | 14 -- .../lib/cre/environment/blockchain_start.go | 83 ++------ .../cre/environment/blockchain_start_test.go | 182 ++++-------------- .../lib/cre/environment/remote_stop.go | 3 +- .../lib/cre/runtimecfg/access_mode.go | 31 +-- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 18 +- .../test-helpers/fixture_relay_helpers.go | 33 ++-- 10 files changed, 79 insertions(+), 303 deletions(-) diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index db37e91d6da..9e216a0d203 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -931,8 +931,6 @@ func applyRemoteAgentEnvFallback(logger zerolog.Logger, agentState *remoteAgentS } } - setIfEmpty("CRE_AGENT_MODE", agentState.Mode) - setIfEmpty("CRE_LOCAL_AGENT_URL", agentState.LocalURL) setIfEmpty("CRE_EC2_AGENT_URL", agentState.EC2URL) setIfEmpty("CRE_EC2_INSTANCE_ID", agentState.EC2InstanceID) setIfEmpty("CRE_EC2_AGENT_PORT", agentState.EC2AgentPort) diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 547d3c92fa7..88b855b11b5 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -799,21 +799,15 @@ func resolveAgentBaseURLForRelay() (string, error) { if v := strings.TrimSpace(os.Getenv("CRE_EC2_AGENT_URL")); v != "" { return v, nil } - if strings.EqualFold(strings.TrimSpace(os.Getenv("CRE_AGENT_MODE")), "ec2") && runtimecfg.IsDirectMode() { - hostIP, err := runtimecfg.DirectHostIP() - if err != nil { - return "", err - } + hostIP, err := runtimecfg.DirectHostIP() + if err == nil { port, err := 
resolveEC2AgentPortForRelay() if err != nil { return "", err } return fmt.Sprintf("http://%s:%d", hostIP, port), nil } - if v := strings.TrimSpace(os.Getenv("CRE_LOCAL_AGENT_URL")); v != "" { - return v, nil - } - return "", fmt.Errorf("cannot resolve agent base URL for relay; set CRE_EC2_AGENT_URL or CRE_LOCAL_AGENT_URL") + return "", fmt.Errorf("cannot resolve agent base URL for relay; set CRE_EC2_AGENT_URL or provide EC2 discovery envs: %w", err) } func resolveEC2AgentPortForRelay() (int, error) { diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 143c9ef6972..1e0f83f9bf7 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -19,8 +19,6 @@ const ( ) type remoteAgentState struct { - Mode string `toml:"mode,omitempty"` - LocalURL string `toml:"local_url,omitempty"` EC2URL string `toml:"ec2_url,omitempty"` EC2InstanceID string `toml:"ec2_instance_id,omitempty"` EC2AgentPort string `toml:"ec2_agent_port,omitempty"` @@ -94,8 +92,6 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) } agentEnvelope := &remoteAgentStateEnvelope{ Agent: remoteAgentState{ - Mode: os.Getenv("CRE_AGENT_MODE"), - LocalURL: os.Getenv("CRE_LOCAL_AGENT_URL"), EC2URL: os.Getenv("CRE_EC2_AGENT_URL"), EC2InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), EC2AgentPort: os.Getenv("CRE_EC2_AGENT_PORT"), diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index 203ac0d3d2b..a54a47d9320 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -18,15 +18,9 @@ func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { } func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { - prevMode, hadMode := os.LookupEnv(runtimecfg.EnvRemoteAccessMode) prevIP, hadIP := 
os.LookupEnv(runtimecfg.EnvEC2HostIP) prevLocalIP, hadLocalIP := os.LookupEnv(runtimecfg.EnvLocalHostIP) t.Cleanup(func() { - if hadMode { - _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, prevMode) - } else { - _ = os.Unsetenv(runtimecfg.EnvRemoteAccessMode) - } if hadIP { _ = os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) } else { @@ -38,7 +32,6 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { _ = os.Unsetenv(runtimecfg.EnvLocalHostIP) } }) - _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") _ = os.Setenv(runtimecfg.EnvLocalHostIP, "192.168.1.10") @@ -58,21 +51,14 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { } func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { - prevMode, hadMode := os.LookupEnv(runtimecfg.EnvRemoteAccessMode) prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) t.Cleanup(func() { - if hadMode { - _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, prevMode) - } else { - _ = os.Unsetenv(runtimecfg.EnvRemoteAccessMode) - } if hadIP { _ = os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) } else { _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) } }) - _ = os.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") addresses, err := ResolveP2PAnnounceAddresses("remote", true, 16001) diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 239bbe69dd1..e72ff333f84 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -35,12 +35,8 @@ const ( componentTypeBlockchain = "blockchain" componentTypeJD = "jd" componentTypeNodeSet = "nodeset" - envLocalAgentURL = "CRE_LOCAL_AGENT_URL" envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2InstanceID = "CRE_EC2_INSTANCE_ID" envEC2AgentPort = 
"CRE_EC2_AGENT_PORT" - envAgentMode = "CRE_AGENT_MODE" - ec2Region = "us-west-2" defaultEC2AgentPort = 8080 ) @@ -213,70 +209,29 @@ func isRetriableNetworkError(err error) bool { } func newStartComponentClient(testLogger zerolog.Logger, tunnelManager tunnel.Manager) (componentClient, error) { - agentMode := strings.TrimSpace(os.Getenv(envAgentMode)) - if strings.EqualFold(agentMode, "ec2") { - baseURL, err := resolveEC2AgentBaseURL(testLogger, tunnelManager) - if err != nil { - return nil, err - } - return newEC2HTTPComponentClient(baseURL), nil - } + _ = tunnelManager // legacy parameter retained for call-site compatibility - baseURL := os.Getenv(envLocalAgentURL) - if baseURL == "" { - return nil, fmt.Errorf("%s must be set for remote component startup", envLocalAgentURL) + baseURL, err := resolveEC2AgentBaseURL(testLogger) + if err != nil { + return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) } - return newHTTPComponentClient(baseURL), nil + return newEC2HTTPComponentClient(baseURL), nil } -func resolveEC2AgentBaseURL(testLogger zerolog.Logger, tunnelManager tunnel.Manager) (string, error) { - if configured := os.Getenv(envEC2AgentURL); configured != "" { +func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { + if configured := strings.TrimSpace(os.Getenv(envEC2AgentURL)); configured != "" { return configured, nil } remotePort, err := resolveEC2AgentPort() if err != nil { return "", err } - if isRemoteAccessDirectMode() { - hostIP, err := resolveDirectAccessHostIP() - if err != nil { - return "", err - } - return fmt.Sprintf("http://%s:%d", hostIP, remotePort), nil - } - - instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) - if instanceID == "" { - return "", fmt.Errorf("%s must be set when %s=ec2 and %s is not provided", envEC2InstanceID, envAgentMode, envEC2AgentURL) - } - if tunnelManager == nil { - return "", errors.New("tunnel manager is required to auto-open ec2 agent tunnel") - } - - bindings, err := 
tunnelManager.Start(context.Background(), []tunnel.EndpointRef{ - { - ComponentID: "agent", - EndpointName: "api", - Scheme: "http", - Host: "127.0.0.1", - Port: remotePort, - OriginalURL: fmt.Sprintf("http://127.0.0.1:%d", remotePort), - }, - }) + hostIP, err := resolveDirectAccessHostIP() if err != nil { - return "", pkgerrors.Wrap(err, "failed to open ssm tunnel to ec2 agent") - } - if len(bindings) == 0 { - return "", errors.New("failed to open ssm tunnel to ec2 agent: no bindings returned") + return "", err } - - testLogger.Info(). - Str("instanceID", instanceID). - Int("remotePort", remotePort). - Int("localPort", bindings[0].LocalPort). - Msg("Opened SSM tunnel to EC2 agent") - - return bindings[0].LocalURL, nil + testLogger.Debug().Str("hostIP", hostIP).Int("port", remotePort).Msg("resolved EC2 CRE agent base URL from direct mode host") + return fmt.Sprintf("http://%s:%d", hostIP, remotePort), nil } func resolveEC2AgentPort() (int, error) { @@ -460,20 +415,8 @@ func startBlockchainsWithTargets( } func newEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { - if os.Getenv(envAgentMode) != "ec2" { - return tunnel.NewNoopManager(), nil - } - if isRemoteAccessDirectMode() { - return tunnel.NewNoopManager(), nil - } - - instanceID := strings.TrimSpace(os.Getenv(envEC2InstanceID)) - if instanceID == "" { - // Keep compatibility with pure manual-tunneling mode. 
- return tunnel.NewNoopManager(), nil - } - - return tunnel.NewManager(tunnel.NewSSMProvider(instanceID, ec2Region, testLogger)), nil + _ = testLogger + return tunnel.NewNoopManager(), nil } func NewEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index 1e2b7cbb2f8..ba14ade3d21 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -1,15 +1,13 @@ package environment import ( - "context" - "os" "strings" "testing" "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { @@ -26,20 +24,14 @@ func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { } } -func TestNewStartComponentClientEC2Mode(t *testing.T) { - t.Setenv(envAgentMode, "ec2") - t.Setenv(envLocalAgentURL, "") +func TestNewStartComponentClientPrefersEC2(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(envEC2InstanceID, "") - - if _, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}); err == nil { - t.Fatalf("expected ec2 mode without %s or %s to fail", envEC2AgentURL, envEC2InstanceID) - } + t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(envEC2AgentPort, "18080") - t.Setenv(envEC2AgentURL, "http://127.0.0.1:18080") // manual tunnel override - client, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}) + client, err := newStartComponentClient(zerolog.Nop(), tunnel.NewNoopManager()) if err != nil { - t.Fatalf("expected ec2 mode client to be created, got %v", err) + 
t.Fatalf("expected ec2-first client to be created, got %v", err) } httpClient, ok := client.(*httpComponentClient) @@ -52,26 +44,29 @@ func TestNewStartComponentClientEC2Mode(t *testing.T) { if httpClient.maxAttempts != 3 { t.Fatalf("expected ec2 client retries to be enabled") } + if httpClient.baseURL != "http://10.193.28.183:18080" { + t.Fatalf("unexpected ec2 base url: %s", httpClient.baseURL) + } } -func TestResolveEC2AgentBaseURLRequiresInstanceIDWhenURLMissing(t *testing.T) { +func TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(envEC2InstanceID, "") + t.Setenv(runtimecfg.EnvEC2HostIP, "") + t.Setenv(runtimecfg.EnvEC2InstanceID, "") t.Setenv(envEC2AgentPort, "") - _, err := resolveEC2AgentBaseURL(zerolog.Nop(), &fakeTunnelManager{}) + _, err := resolveEC2AgentBaseURL(zerolog.Nop()) if err == nil { - t.Fatalf("expected missing %s to fail when %s is not set", envEC2InstanceID, envEC2AgentURL) + t.Fatalf("expected missing direct host resolution inputs to fail when %s is not set", envEC2AgentURL) } } func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeSSM) - t.Setenv(envEC2InstanceID, "i-123") + t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") t.Setenv(envEC2AgentPort, "not-a-port") - _, err := resolveEC2AgentBaseURL(zerolog.Nop(), &fakeTunnelManager{}) + _, err := resolveEC2AgentBaseURL(zerolog.Nop()) if err == nil { t.Fatalf("expected invalid %s to fail", envEC2AgentPort) } @@ -82,123 +77,30 @@ func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") t.Setenv(envEC2AgentPort, "18080") - manager := &fakeTunnelManager{} - baseURL, err := 
resolveEC2AgentBaseURL(zerolog.Nop(), manager) + baseURL, err := resolveEC2AgentBaseURL(zerolog.Nop()) if err != nil { t.Fatalf("expected direct mode url resolution to succeed, got %v", err) } if baseURL != "http://10.193.28.183:18080" { t.Fatalf("unexpected direct mode base url: %s", baseURL) } - if manager.startCalls != 0 { - t.Fatalf("expected direct mode to skip tunnel manager") - } } -func TestNewStartComponentClientLocalMode(t *testing.T) { - t.Setenv(envAgentMode, "") +func TestNewStartComponentClientRequiresEC2Resolution(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(envLocalAgentURL, "") + t.Setenv(runtimecfg.EnvEC2HostIP, "") + t.Setenv(runtimecfg.EnvEC2InstanceID, "") - if _, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}); err == nil { - t.Fatalf("expected local mode without %s to fail", envLocalAgentURL) - } - - t.Setenv(envLocalAgentURL, "http://127.0.0.1:8080") - client, err := newStartComponentClient(zerolog.Nop(), &fakeTunnelManager{}) - if err != nil { - t.Fatalf("expected local mode client to be created, got %v", err) - } - - httpClient, ok := client.(*httpComponentClient) - if !ok { - t.Fatalf("expected httpComponentClient, got %T", client) + if _, err := newStartComponentClient(zerolog.Nop(), tunnel.NewNoopManager()); err == nil { + t.Fatalf("expected client creation without EC2 resolution to fail") } - if httpClient.checkHealth { - t.Fatalf("expected local client health checks to be disabled") - } - if httpClient.maxAttempts != 1 { - t.Fatalf("expected local client retries to be disabled") - } - - if os.Getenv(envLocalAgentURL) == "" { - t.Fatalf("expected local agent url to remain set") - } -} - -type fakeTunnelManager struct { - startCalls int -} - -func (f *fakeTunnelManager) Start(_ context.Context, refs []tunnel.EndpointRef) ([]tunnel.TunnelBinding, error) { - f.startCalls++ - bindings := make([]tunnel.TunnelBinding, 0, len(refs)) - for i, ref := range refs { - bindings = append(bindings, tunnel.TunnelBinding{ 
- EndpointRef: ref, - LocalPort: 19000 + i, - LocalURL: map[string]string{ - "http": "http://127.0.0.1:19000", - "ws": "ws://127.0.0.1:19001", - }[ref.Scheme], - }) - } - return bindings, nil } -func (f *fakeTunnelManager) Stop(_ context.Context) error { return nil } -func (f *fakeTunnelManager) IsStarted() bool { return f.startCalls > 0 } -func (f *fakeTunnelManager) Snapshot() []tunnel.TunnelBinding { return []tunnel.TunnelBinding{} } - func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { - t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeSSM) - out := &blockchain.Output{ - Nodes: []*blockchain.Node{ - { - ExternalHTTPUrl: "http://10.0.0.10:8545", - ExternalWSUrl: "ws://10.0.0.10:8546", - }, - }, - } - manager := &fakeTunnelManager{} - - if err := rewriteRemoteBlockchainOutputForLocalAccess( - context.Background(), - zerolog.Nop(), - manager, - 0, - &blockchain.Input{Type: blockchain.TypeAnvil}, - out, - true, - ); err != nil { - t.Fatalf("expected rewrite helper to succeed: %v", err) - } - - if manager.startCalls != 1 { - t.Fatalf("expected tunnel manager start to be called once, got %d", manager.startCalls) - } - if out.Nodes[0].ExternalHTTPUrl != "http://127.0.0.1:19000" { - t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) - } - if out.Nodes[0].ExternalWSUrl != "ws://127.0.0.1:19001" { - t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) - } - if out.Nodes[0].InternalHTTPUrl == "" || !strings.Contains(out.Nodes[0].InternalHTTPUrl, ":19000") { - t.Fatalf("expected internal http url to be rewritten for docker host access, got %s", out.Nodes[0].InternalHTTPUrl) - } - if out.Nodes[0].InternalWSUrl == "" || !strings.Contains(out.Nodes[0].InternalWSUrl, ":19001") { - t.Fatalf("expected internal ws url to be rewritten for docker host access, got %s", out.Nodes[0].InternalWSUrl) - } -} - -func TestRewriteRemoteBlockchainOutputForLocalAccessDirectMode(t *testing.T) { - 
t.Setenv(runtimecfg.EnvRemoteAccessMode, runtimecfg.RemoteAccessModeDirect) t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") - out := &blockchain.Output{ Nodes: []*blockchain.Node{ { @@ -209,52 +111,36 @@ func TestRewriteRemoteBlockchainOutputForLocalAccessDirectMode(t *testing.T) { }, }, } - manager := &fakeTunnelManager{} - if err := rewriteRemoteBlockchainOutputForLocalAccess( - context.Background(), + t.Context(), zerolog.Nop(), - manager, + tunnel.NewNoopManager(), 0, &blockchain.Input{Type: blockchain.TypeAnvil}, out, true, ); err != nil { - t.Fatalf("expected direct mode rewrite helper to succeed: %v", err) - } - if manager.startCalls != 0 { - t.Fatalf("expected direct mode to skip tunnel manager, got %d calls", manager.startCalls) + t.Fatalf("expected rewrite helper to succeed: %v", err) } + if out.Nodes[0].ExternalHTTPUrl != "http://10.193.28.183:8545" { - t.Fatalf("unexpected rewritten http url in direct mode: %s", out.Nodes[0].ExternalHTTPUrl) + t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) } if out.Nodes[0].ExternalWSUrl != "ws://10.193.28.183:8546" { - t.Fatalf("unexpected rewritten ws url in direct mode: %s", out.Nodes[0].ExternalWSUrl) + t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) } - if out.Nodes[0].InternalHTTPUrl != "http://10.193.28.183:8545" { - t.Fatalf("unexpected rewritten internal http url in direct mode: %s", out.Nodes[0].InternalHTTPUrl) + if out.Nodes[0].InternalHTTPUrl != "http://anvil-1337:8545" { + t.Fatalf("expected internal http url unchanged in direct mode, got %s", out.Nodes[0].InternalHTTPUrl) } - if out.Nodes[0].InternalWSUrl != "ws://10.193.28.183:8546" { - t.Fatalf("unexpected rewritten internal ws url in direct mode: %s", out.Nodes[0].InternalWSUrl) + if out.Nodes[0].InternalWSUrl != "ws://anvil-1337:8546" { + t.Fatalf("expected internal ws url unchanged in direct mode, got %s", out.Nodes[0].InternalWSUrl) } } -func 
TestNewEC2TunnelManagerReturnsNoopWhenNotApplicable(t *testing.T) { - t.Setenv(envAgentMode, "") - t.Setenv(envEC2InstanceID, "") +func TestNewEC2TunnelManagerAlwaysReturnsNoop(t *testing.T) { manager, err := newEC2TunnelManager(zerolog.Nop()) if err != nil { - t.Fatalf("expected noop manager for local mode, got error: %v", err) - } - if manager.IsStarted() { - t.Fatalf("expected noop manager to report not started") - } - - t.Setenv(envAgentMode, "ec2") - t.Setenv(envEC2InstanceID, "") - manager, err = newEC2TunnelManager(zerolog.Nop()) - if err != nil { - t.Fatalf("expected noop manager for ec2 mode without instance, got error: %v", err) + t.Fatalf("expected noop manager, got error: %v", err) } if manager.IsStarted() { t.Fatalf("expected noop manager to report not started") diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index f218ee430d9..776540b9bc4 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -194,7 +194,8 @@ func listRemoteCTFResources( lggr zerolog.Logger, tunnelManager tunnel.Manager, ) ([]string, []string, error) { - baseURL, err := resolveEC2AgentBaseURL(lggr, tunnelManager) + _ = tunnelManager + baseURL, err := resolveEC2AgentBaseURL(lggr) if err != nil { return nil, nil, pkgerrors.Wrap(err, "resolve agent base url for ctf resource query") } diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go index ffe677aecef..79eafe0902a 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode.go +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -14,30 +14,17 @@ import ( ) const ( - EnvRemoteAccessMode = "CRE_REMOTE_ACCESS_MODE" - EnvEC2HostIP = "CRE_EC2_HOST_IP" - EnvLocalHostIP = "CRE_LOCAL_HOST_IP" - EnvEC2InstanceID = "CRE_EC2_INSTANCE_ID" - EnvAWSProfile = "CRE_AWS_PROFILE" - - RemoteAccessModeSSM = "ssm" - RemoteAccessModeDirect = "direct" - defaultEC2Region = 
"us-west-2" -) + EnvEC2HostIP = "CRE_EC2_HOST_IP" + EnvLocalHostIP = "CRE_LOCAL_HOST_IP" + EnvEC2InstanceID = "CRE_EC2_INSTANCE_ID" + EnvAWSProfile = "CRE_AWS_PROFILE" -func RemoteAccessMode() string { - mode := strings.ToLower(strings.TrimSpace(os.Getenv(EnvRemoteAccessMode))) - if mode == "" { - return RemoteAccessModeDirect - } - if mode == RemoteAccessModeDirect || mode == RemoteAccessModeSSM { - return mode - } - return RemoteAccessModeDirect -} + defaultEC2Region = "us-west-2" +) +// IsDirectMode is retained for compatibility; CRE now only supports direct mode. func IsDirectMode() bool { - return RemoteAccessMode() == RemoteAccessModeDirect + return true } func DirectHostIP() (string, error) { @@ -48,7 +35,7 @@ func DirectHostIP() (string, error) { instanceID := strings.TrimSpace(os.Getenv(EnvEC2InstanceID)) if instanceID == "" { - return "", fmt.Errorf("%s must be set when %s=%s (or set %s explicitly)", EnvEC2InstanceID, EnvRemoteAccessMode, RemoteAccessModeDirect, EnvEC2HostIP) + return "", fmt.Errorf("%s must be set (or set %s explicitly)", EnvEC2InstanceID, EnvEC2HostIP) } return discoverEC2HostIP(instanceID) } diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index 6f4f70f8c17..df3aa571c88 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -6,25 +6,19 @@ This runbook covers the EC2-based remote mode for CRE where components can run e - Remote backend is EC2 + Docker (no Kubernetes path). - Remote control plane is the CRE agent. -- Default access mode is `direct`. -- Access modes: - - `ssm`: control and endpoint reachability via SSM tunnels. - - `direct`: endpoint reachability via EC2 host IP, with SSM optional for agent only. +- Access mode is direct-only. 
## Core Environment Variables -- `CRE_AGENT_MODE=ec2` -- `CRE_EC2_INSTANCE_ID=` (required for SSM mode; also used by direct mode auto IP lookup) +- `CRE_EC2_INSTANCE_ID=` (used by direct mode auto IP lookup) - `CRE_EC2_AGENT_PORT=` (defaults to `8080`) - `CRE_EC2_AGENT_URL=` (optional explicit override) -- `CRE_REMOTE_ACCESS_MODE=ssm|direct` (defaults to `direct`) - `CRE_EC2_HOST_IP=` (optional in direct mode; if missing, resolved from AWS CLI using instance ID) -- `CRE_AWS_PROFILE=` (optional SSM auth profile) +- `CRE_AWS_PROFILE=` (optional AWS auth profile) ## Direct Mode Defaults and IP Resolution -- If `CRE_REMOTE_ACCESS_MODE` is unset, CRE defaults to `direct`. -- In direct mode, host IP resolution is: +- Host IP resolution is: 1. `CRE_EC2_HOST_IP` if set. 2. Otherwise, resolve from AWS CLI using `CRE_EC2_INSTANCE_ID`: - `aws ec2 describe-instances --instance-ids --query ...` @@ -34,7 +28,7 @@ This runbook covers the EC2-based remote mode for CRE where components can run e ## AWS Credentials Resolution (CLI) -For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: +For direct-mode auto IP lookup, AWS CLI auth selection follows: 1. Static env credentials (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`) 2. Web identity (`AWS_WEB_IDENTITY_TOKEN_FILE` + `AWS_ROLE_ARN`) @@ -45,9 +39,7 @@ For both SSM and direct-mode auto IP lookup, AWS CLI auth selection follows: ## Agent Startup -- In `ssm` mode, bind agent to loopback (for example `127.0.0.1:18080`). - In `direct` mode, bind agent to all interfaces (for example `0.0.0.0:18080`). -- With defaults, agent starts in direct mode unless `CRE_REMOTE_ACCESS_MODE=ssm` is set. 
## Placement Rules diff --git a/system-tests/tests/test-helpers/fixture_relay_helpers.go b/system-tests/tests/test-helpers/fixture_relay_helpers.go index 1c769fd16d7..09e1ac04603 100644 --- a/system-tests/tests/test-helpers/fixture_relay_helpers.go +++ b/system-tests/tests/test-helpers/fixture_relay_helpers.go @@ -26,9 +26,8 @@ import ( ) const ( - envLocalAgentURL = "CRE_LOCAL_AGENT_URL" - envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2AgentPort = "CRE_EC2_AGENT_PORT" + envEC2AgentURL = "CRE_EC2_AGENT_URL" + envEC2AgentPort = "CRE_EC2_AGENT_PORT" ) type relayOpenResponse struct { @@ -131,25 +130,19 @@ func resolveAgentBaseURLForRelay() (string, error) { if v := strings.TrimSpace(os.Getenv(envEC2AgentURL)); v != "" { return v, nil } - if runtimecfg.IsDirectMode() { - hostIP, err := runtimecfg.DirectHostIP() - if err != nil { - return "", err - } - port := 8080 - if rawPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); rawPort != "" { - parsed, err := strconv.Atoi(rawPort) - if err != nil || parsed <= 0 || parsed > 65535 { - return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, rawPort) - } - port = parsed - } - return fmt.Sprintf("http://%s:%d", hostIP, port), nil + hostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return "", err } - if v := strings.TrimSpace(os.Getenv(envLocalAgentURL)); v != "" { - return v, nil + port := 8080 + if rawPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); rawPort != "" { + parsed, err := strconv.Atoi(rawPort) + if err != nil || parsed <= 0 || parsed > 65535 { + return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, rawPort) + } + port = parsed } - return "", fmt.Errorf("missing agent URL for fixture relay (set %s, or set %s/%s for direct mode)", envEC2AgentURL, runtimecfg.EnvRemoteAccessMode, runtimecfg.EnvEC2HostIP) + return fmt.Sprintf("http://%s:%d", hostIP, port), nil } func openRelay(ctx context.Context, agentBaseURL, name string, requestedPort int) (string, error) { From 
5dd405c0e18e6254591ce98dd5a24379323d955a Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 09:00:54 +0100 Subject: [PATCH 18/34] remove tunnel state persistance --- .../environment/environment/environment.go | 191 ------------------ .../environment/relay_supervisor.go | 12 ++ .../cre/environment/config/tunnel_state.go | 107 ---------- 3 files changed, 12 insertions(+), 298 deletions(-) delete mode 100644 system-tests/lib/cre/environment/config/tunnel_state.go diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 9e216a0d203..02cbb7e82f9 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -1,7 +1,6 @@ package environment import ( - "bufio" "context" "crypto/ecdsa" "crypto/rand" @@ -14,7 +13,6 @@ import ( "path/filepath" "runtime/debug" "slices" - "strconv" "strings" "syscall" "time" @@ -260,9 +258,6 @@ func startCmd() *cobra.Command { return errors.Wrap(err, "failed to set default CTF configs") } - if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels before start") - } if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to stop tracked relay supervisor before start") } @@ -526,10 +521,6 @@ func startCmd() *cobra.Command { } else if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { framework.L.Warn().Err(err).Msg("failed to clear stale remote component stop state") } - if err := persistTunnelState(relativePathToRepoRoot, output); err != nil { - return errors.Wrap(err, "failed to store tunnel state") - } - return nil }, } @@ -827,10 +818,6 @@ func stopLocalResources(relativePathToRepoRoot string, removeAllState bool) erro return errors.Wrap(removeErr, "failed to remove environment containers. 
Please remove them manually") } - if err := cleanupTrackedTunnels(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to clean up tracked SSM tunnels") - } - if removeAllState { stopBeholderErr := stopBeholder() if stopBeholderErr != nil { @@ -1100,184 +1087,6 @@ func oneLineErrorMessage(errOrPanic any) string { return strings.SplitN(fmt.Sprintf("%v", errOrPanic), "\n", 1)[0] } -func cleanupTrackedTunnels(relativePathToRepoRoot string) error { - state, err := envconfig.LoadTunnelState(relativePathToRepoRoot) - if err != nil { - return errors.Wrap(err, "failed to load tracked tunnel state") - } - if len(state.Tunnels) == 0 { - return nil - } - - framework.L.Info().Msgf("Found %d tracked SSM tunnel process(es), cleaning up", len(state.Tunnels)) - failed := 0 - for _, t := range state.Tunnels { - // First, aggressively kill known long-lived plugin children by local forwarded port. - if pluginKilled, pluginErr := killSessionManagerPluginByLocalPort(t.LocalPort); pluginErr != nil { - framework.L.Warn().Err(pluginErr).Msgf("failed to clean session-manager-plugin for localPort=%d", t.LocalPort) - } else if pluginKilled { - framework.L.Info().Msgf("stopped session-manager-plugin for localPort=%d", t.LocalPort) - } - - if t.PID <= 0 { - continue - } - if !processExists(t.PID) { - continue - } - isSSM, checkErr := isSSMStartSessionProcess(t.PID) - if checkErr != nil { - framework.L.Warn().Err(checkErr).Msgf("failed to inspect process pid=%d before tunnel cleanup", t.PID) - failed++ - continue - } - if !isSSM { - framework.L.Warn().Msgf("refusing to kill non-SSM process pid=%d recorded in tunnel state", t.PID) - failed++ - continue - } - - proc, findErr := os.FindProcess(t.PID) - if findErr != nil { - failed++ - continue - } - - _ = proc.Signal(syscall.SIGTERM) - deadline := time.Now().Add(2 * time.Second) - for processExists(t.PID) && time.Now().Before(deadline) { - time.Sleep(150 * time.Millisecond) - } - if processExists(t.PID) { - _ = 
proc.Kill() - } - if processExists(t.PID) { - failed++ - framework.L.Warn().Msgf("failed to stop tracked tunnel process pid=%d localPort=%d remotePort=%d", t.PID, t.LocalPort, t.RemotePort) - continue - } - - framework.L.Info().Msgf("stopped tracked tunnel process pid=%d localPort=%d remotePort=%d kind=%s", t.PID, t.LocalPort, t.RemotePort, t.Kind) - } - - if clearErr := envconfig.ClearTunnelState(relativePathToRepoRoot); clearErr != nil { - framework.L.Warn().Err(clearErr).Msg("failed to clear tunnel state file after cleanup") - } - - if failed > 0 { - return fmt.Errorf("failed to clean up %d tracked tunnel process(es)", failed) - } - return nil -} - -func processExists(pid int) bool { - if pid <= 0 { - return false - } - proc, err := os.FindProcess(pid) - if err != nil { - return false - } - err = proc.Signal(syscall.Signal(0)) - return err == nil -} - -func isSSMStartSessionProcess(pid int) (bool, error) { - out, err := exec.Command("ps", "-o", "command=", "-p", strconv.Itoa(pid)).Output() - if err != nil { - return false, err - } - cmd := strings.TrimSpace(string(out)) - if cmd == "" { - return false, nil - } - - return strings.Contains(cmd, "aws ssm start-session"), nil -} - -func killSessionManagerPluginByLocalPort(localPort int) (bool, error) { - if localPort <= 0 { - return false, nil - } - - out, err := exec.Command("ps", "-axo", "pid=,command=").Output() - if err != nil { - return false, err - } - - pattern := fmt.Sprintf(`"localPortNumber": ["%d"]`, localPort) - killedAny := false - scanner := bufio.NewScanner(strings.NewReader(string(out))) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if line == "" { - continue - } - if !strings.Contains(line, "session-manager-plugin") || !strings.Contains(line, pattern) { - continue - } - - fields := strings.Fields(line) - if len(fields) == 0 { - continue - } - pid, parseErr := strconv.Atoi(fields[0]) - if parseErr != nil || pid <= 0 { - continue - } - - proc, findErr := os.FindProcess(pid) - if 
findErr != nil { - continue - } - _ = proc.Signal(syscall.SIGTERM) - deadline := time.Now().Add(2 * time.Second) - for processExists(pid) && time.Now().Before(deadline) { - time.Sleep(100 * time.Millisecond) - } - if processExists(pid) { - _ = proc.Kill() - } - if !processExists(pid) { - killedAny = true - } - } - if scanErr := scanner.Err(); scanErr != nil { - return killedAny, scanErr - } - - return killedAny, nil -} - -func persistTunnelState(relativePathToRepoRoot string, output *creenv.SetupOutput) error { - if output == nil { - return envconfig.ClearTunnelState(relativePathToRepoRoot) - } - - bindings := output.TunnelBindings() - processes := make([]envconfig.TunnelProcess, 0, len(bindings)) - for _, b := range bindings { - if b.PID <= 0 { - continue - } - processes = append(processes, envconfig.TunnelProcess{ - PID: b.PID, - Kind: "ssm", - InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), - Region: "us-west-2", - RemotePort: b.Port, - LocalPort: b.LocalPort, - ComponentID: b.ComponentID, - Endpoint: b.EndpointName, - }) - } - - return envconfig.StoreTunnelState(relativePathToRepoRoot, &envconfig.TunnelState{ - Version: 1, - Tunnels: processes, - }) -} - func initDxTracker() { if dxTracker != nil { return diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 88b855b11b5..3a31fd525c5 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -595,6 +595,18 @@ func waitForPIDAlive(pid int, maxWait time.Duration) bool { return processExists(pid) } +func processExists(pid int) bool { + if pid <= 0 { + return false + } + proc, err := os.FindProcess(pid) + if err != nil { + return false + } + err = proc.Signal(syscall.Signal(0)) + return err == nil +} + func portsCSV(ports []int) string { if len(ports) == 0 { return "" diff --git a/system-tests/lib/cre/environment/config/tunnel_state.go 
b/system-tests/lib/cre/environment/config/tunnel_state.go deleted file mode 100644 index 8ec0b0b8d5c..00000000000 --- a/system-tests/lib/cre/environment/config/tunnel_state.go +++ /dev/null @@ -1,107 +0,0 @@ -package config - -import ( - "fmt" - "os" - "path/filepath" - "sync" - - "github.com/pelletier/go-toml/v2" -) - -const TunnelStateFilename = "tunnels.toml" - -type TunnelProcess struct { - PID int `toml:"pid"` - Kind string `toml:"kind"` - InstanceID string `toml:"instance_id"` - Region string `toml:"region"` - RemotePort int `toml:"remote_port"` - LocalPort int `toml:"local_port"` - ComponentID string `toml:"component_id,omitempty"` - Endpoint string `toml:"endpoint,omitempty"` - CreatedAt string `toml:"created_at,omitempty"` -} - -type TunnelState struct { - Version int `toml:"version"` - Tunnels []TunnelProcess `toml:"tunnels"` -} - -var tunnelStateMu sync.Mutex - -func MustTunnelStateFileAbsPath(relativePathToRepoRoot string) string { - absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, StateDirname, TunnelStateFilename)) - if err != nil { - panic(fmt.Errorf("failed to get absolute path for tunnel state file: %w", err)) - } - return absPath -} - -func LoadTunnelState(relativePathToRepoRoot string) (*TunnelState, error) { - tunnelStateMu.Lock() - defer tunnelStateMu.Unlock() - return loadTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot)) -} - -func StoreTunnelState(relativePathToRepoRoot string, state *TunnelState) error { - tunnelStateMu.Lock() - defer tunnelStateMu.Unlock() - return storeTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot), state) -} - -func ClearTunnelState(relativePathToRepoRoot string) error { - tunnelStateMu.Lock() - defer tunnelStateMu.Unlock() - return storeTunnelStateUnlocked(MustTunnelStateFileAbsPath(relativePathToRepoRoot), &TunnelState{ - Version: 1, - Tunnels: []TunnelProcess{}, - }) -} - -func loadTunnelStateUnlocked(path string) (*TunnelState, error) { - if _, err := 
os.Stat(path); os.IsNotExist(err) { - return &TunnelState{Version: 1, Tunnels: []TunnelProcess{}}, nil - } - - data, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("failed to read tunnel state file: %w", err) - } - - state := &TunnelState{} - if err := toml.Unmarshal(data, state); err != nil { - return nil, fmt.Errorf("failed to unmarshal tunnel state file: %w", err) - } - if state.Version == 0 { - state.Version = 1 - } - if state.Tunnels == nil { - state.Tunnels = []TunnelProcess{} - } - return state, nil -} - -func storeTunnelStateUnlocked(path string, state *TunnelState) error { - if state == nil { - state = &TunnelState{Version: 1, Tunnels: []TunnelProcess{}} - } - if state.Version == 0 { - state.Version = 1 - } - if state.Tunnels == nil { - state.Tunnels = []TunnelProcess{} - } - - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return fmt.Errorf("failed to create tunnel state directory: %w", err) - } - data, err := toml.Marshal(state) - if err != nil { - return fmt.Errorf("failed to marshal tunnel state: %w", err) - } - if err := os.WriteFile(path, data, 0o600); err != nil { - return fmt.Errorf("failed to write tunnel state file: %w", err) - } - return nil -} From c3ca61a6a1ee57829ebab7a94e807077fe54ab51 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 09:41:36 +0100 Subject: [PATCH 19/34] remove tunnel manager and non-ec2 agent clients --- .../environment/environment/environment.go | 30 +- .../cre/environment/environment/workflow.go | 12 +- .../lib/cre/environment/agent_log_format.go | 36 ++ .../lib/cre/environment/artifacts_remote.go | 10 +- .../lib/cre/environment/blockchain_start.go | 369 +----------------- .../cre/environment/blockchain_start_test.go | 55 ++- system-tests/lib/cre/environment/dons.go | 61 +-- .../lib/cre/environment/environment.go | 67 ++-- system-tests/lib/cre/environment/jobs.go | 149 +------ system-tests/lib/cre/environment/jobs_test.go | 53 +-- 
.../environment/remote_component_client.go | 243 ++++++++++++ .../lib/cre/environment/remote_stop.go | 19 +- .../lib/cre/environment/setup_output_test.go | 22 +- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 5 +- .../tests/smoke/cre/v2_grpc_source_test.go | 9 +- system-tests/tests/test-helpers/t_helpers.go | 14 +- 16 files changed, 423 insertions(+), 731 deletions(-) create mode 100644 system-tests/lib/cre/environment/agent_log_format.go create mode 100644 system-tests/lib/cre/environment/remote_component_client.go diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 02cbb7e82f9..bd4adf3e6e3 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -649,7 +649,7 @@ func stopCmd() *cobra.Command { Example: "go run . env stop", PersistentPreRun: globalPreRunFunc, RunE: func(cmd *cobra.Command, args []string) error { - if err := stopLocalResources(relativePathToRepoRoot, false); err != nil { + if err := stopLocalResources(relativePathToRepoRoot, false, false); err != nil { return err } remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) @@ -668,21 +668,21 @@ func stopCmd() *cobra.Command { func stopAllCmd() *cobra.Command { cmd := &cobra.Command{ Use: "stop-all", - Short: "Stops all local resources", - Long: `Stops local CRE resources and extra local services (beholder, billing, observability), then removes local state directory.`, + Short: "Stops local and remote resources", + Long: `Stops remote CRE components (when configured), then stops local CRE resources and extra local services (beholder, billing, observability), and removes local state directory.`, Example: "go run . 
env stop-all", PersistentPreRun: globalPreRunFunc, RunE: func(cmd *cobra.Command, args []string) error { - if err := stopLocalResources(relativePathToRepoRoot, true); err != nil { - return err - } - remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) + remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) if remoteConfiguredSummary.Total > 0 { - framework.L.Warn(). - Int("count", remoteConfiguredSummary.Total). - Msgf("Remote components are still running. Use `env stop-remote` to stop them. Remote stop state: %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { + return err + } + } + if err := stopLocalResources(relativePathToRepoRoot, true, false); err != nil { + return err } - fmt.Println("All local resources stopped successfully") + fmt.Println("All resources stopped successfully") return nil }, } @@ -808,9 +808,11 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe return nil } -func stopLocalResources(relativePathToRepoRoot string, removeAllState bool) error { - if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to stop relay supervisor") +func stopLocalResources(relativePathToRepoRoot string, removeAllState bool, stopRelay bool) error { + if stopRelay { + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor") + } } removeErr := framework.RemoveTestContainers() diff --git a/core/scripts/cre/environment/environment/workflow.go b/core/scripts/cre/environment/environment/workflow.go index 8fc354af0fe..6d15686aaa8 100644 --- a/core/scripts/cre/environment/environment/workflow.go +++ b/core/scripts/cre/environment/environment/workflow.go @@ -26,7 +26,6 @@ import ( keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" ) @@ -397,15 +396,6 @@ func deployWorkflow( if modeErr != nil { return modeErr } - var remoteTunnelManager tunnel.Manager - if mode == creworkflow.ArtifactDeployModeRemote { - manager, err := environment.NewEC2TunnelManager(framework.L) - if err != nil { - return errors.Wrap(err, "failed to initialize tunnel manager for remote workflow artifact deploy") - } - remoteTunnelManager = manager - defer func() { _ = remoteTunnelManager.Stop(ctx) }() - } deployArtifacts := func(files ...string) error { return creworkflow.DeployArtifacts( ctx, @@ -416,7 +406,7 @@ func deployWorkflow( ContainerTargetDir: containerTargetDirFlag, Files: files, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return environment.DeployArtifactsToRemoteNodeSet(ctx, framework.L, remoteTunnelManager, nodeSetName, containerTargetDir, files) + return environment.DeployArtifactsToRemoteNodeSet(ctx, framework.L, nodeSetName, containerTargetDir, files) }, }, ) diff --git a/system-tests/lib/cre/environment/agent_log_format.go b/system-tests/lib/cre/environment/agent_log_format.go new file mode 100644 index 00000000000..f95bf8df852 --- /dev/null +++ b/system-tests/lib/cre/environment/agent_log_format.go @@ -0,0 +1,36 @@ +package environment + +import ( + "encoding/json" + "fmt" + "strings" +) + +func prettifyAgentLogLine(line string) string { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + return "" + } + + var payload map[string]any + if err := json.Unmarshal([]byte(trimmed), &payload); err != nil { + return trimmed + } + + message, _ := payload["message"].(string) + if message == "" { + return trimmed + } + + 
level, _ := payload["level"].(string) + if level == "" { + level = "info" + } + + cmd, _ := payload["Cmd"].(string) + if cmd != "" { + return fmt.Sprintf("[%s] %s (cmd=%s)", level, message, cmd) + } + + return fmt.Sprintf("[%s] %s", level, message) +} diff --git a/system-tests/lib/cre/environment/artifacts_remote.go b/system-tests/lib/cre/environment/artifacts_remote.go index 0bb388c678c..1f343526649 100644 --- a/system-tests/lib/cre/environment/artifacts_remote.go +++ b/system-tests/lib/cre/environment/artifacts_remote.go @@ -12,13 +12,11 @@ import ( "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) func DeployArtifactsToRemoteNodeSet( ctx context.Context, lggr zerolog.Logger, - tunnelManager tunnel.Manager, nodeSetName string, containerTargetDir string, files []string, @@ -30,11 +28,11 @@ func DeployArtifactsToRemoteNodeSet( return fmt.Errorf("container target dir is required") } - if tunnelManager == nil { - return fmt.Errorf("tunnel manager is required for remote artifact deploy") + remoteRuntime, err := resolveRemoteRuntime(lggr) + if err != nil { + return pkgerrors.Wrap(err, "failed to resolve remote runtime settings for artifact deploy") } - - startClient, err := newStartComponentClient(lggr, tunnelManager) + startClient, err := newRemoteComponentClient(remoteRuntime) if err != nil { return pkgerrors.Wrap(err, "failed to initialize remote component client for artifact deploy") } diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index e72ff333f84..a978bb71ec4 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -1,250 +1,26 @@ package environment import ( - "bytes" "context" "encoding/json" "errors" "fmt" - "io" "net" - "net/http" "net/url" - "os" - "strconv" - "strings" - "time" - 
retry "github.com/avast/retry-go/v4" pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" - "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/adapters" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" -) - -const ( - componentTypeBlockchain = "blockchain" - componentTypeJD = "jd" - componentTypeNodeSet = "nodeset" - envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2AgentPort = "CRE_EC2_AGENT_PORT" - defaultEC2AgentPort = 8080 ) type startComponentEnvelope = agent.StartComponentEnvelope type startComponentRequest = agent.StartComponentPayload -type startComponentResult = agent.StartComponentResponse - -type componentClient interface { - StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) -} - -type httpComponentClient struct { - baseURL string - client *http.Client - maxAttempts int - retryDelay time.Duration - checkHealth bool -} - -func newHTTPComponentClient(baseURL string) *httpComponentClient { - return &httpComponentClient{ - baseURL: baseURL, - client: &http.Client{ - Timeout: 4 * time.Minute, - }, - maxAttempts: 1, - retryDelay: 0, - checkHealth: false, - } -} - -func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { - return &httpComponentClient{ - baseURL: baseURL, - client: &http.Client{ - 
Timeout: 4 * time.Minute, - }, - maxAttempts: 3, - retryDelay: 2 * time.Second, - checkHealth: true, - } -} - -func (c *httpComponentClient) StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { - if c.checkHealth { - if err := c.waitForHealth(ctx); err != nil { - return nil, err - } - } - - var result *agent.StartComponentResponse - err := retry.Do( - func() error { - var err error - result, err = c.startComponentOnce(ctx, envelope) - return err - }, - retry.Attempts(uint(c.maxAttempts)), - retry.Delay(c.retryDelay), - retry.Context(ctx), - retry.LastErrorOnly(true), - ) - if err != nil { - return nil, err - } - - return result, nil -} - -func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { - body, err := json.Marshal(envelope) - if err != nil { - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to encode start component envelope")) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/components/start", bytes.NewReader(body)) - if err != nil { - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create start component request")) - } - req.Header.Set("Content-Type", "application/json") - - resp, err := c.client.Do(req) - if err != nil { - if isRetriableNetworkError(err) { - return nil, pkgerrors.Wrap(err, "failed to execute start component request") - } - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to execute start component request")) - } - defer resp.Body.Close() - - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to read start component response")) - } - - var startResp agent.StartComponentResponse - if len(respBody) > 0 { - if err := json.Unmarshal(respBody, &startResp); err != nil { - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start 
component response")) - } - } - - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - if startResp.Error != "" { - if startResp.ErrorCode != "" { - err = remoteAgentError(startResp.ErrorCode, startResp.Error) - } else { - err = remoteAgentError("remote_agent_error", startResp.Error) - } - } else { - err = fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) - } - - if isRetriableStatus(resp.StatusCode) { - return nil, err - } - return nil, retry.Unrecoverable(err) - } - if startResp.Error != "" { - if startResp.ErrorCode != "" { - return nil, retry.Unrecoverable(remoteAgentError(startResp.ErrorCode, startResp.Error)) - } - return nil, retry.Unrecoverable(remoteAgentError("remote_agent_error", startResp.Error)) - } - - return &startResp, nil -} - -func (c *httpComponentClient) waitForHealth(ctx context.Context) error { - return retry.Do( - func() error { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+"/v1/health", nil) - if err != nil { - return retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create EC2 agent health request")) - } - - resp, err := c.client.Do(req) - if err != nil { - return pkgerrors.Wrap(err, describeEC2AgentHealthFailure(c.baseURL)) - } - _ = resp.Body.Close() - if resp.StatusCode == http.StatusOK { - return nil - } - return fmt.Errorf("%s: status %s", describeEC2AgentHealthFailure(c.baseURL), resp.Status) - }, - retry.Attempts(uint(c.maxAttempts)), - retry.Delay(c.retryDelay), - retry.Context(ctx), - retry.LastErrorOnly(true), - ) -} - -func describeEC2AgentHealthFailure(baseURL string) string { - return fmt.Sprintf( - "failed EC2 CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", - baseURL, - envEC2AgentPort, - envEC2AgentURL, - ) -} - -func isRetriableStatus(statusCode int) bool { - return statusCode == http.StatusBadGateway || statusCode == http.StatusServiceUnavailable || statusCode == 
http.StatusGatewayTimeout -} - -func isRetriableNetworkError(err error) bool { - var netErr net.Error - return errors.As(err, &netErr) -} - -func newStartComponentClient(testLogger zerolog.Logger, tunnelManager tunnel.Manager) (componentClient, error) { - _ = tunnelManager // legacy parameter retained for call-site compatibility - - baseURL, err := resolveEC2AgentBaseURL(testLogger) - if err != nil { - return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) - } - return newEC2HTTPComponentClient(baseURL), nil -} - -func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { - if configured := strings.TrimSpace(os.Getenv(envEC2AgentURL)); configured != "" { - return configured, nil - } - remotePort, err := resolveEC2AgentPort() - if err != nil { - return "", err - } - hostIP, err := resolveDirectAccessHostIP() - if err != nil { - return "", err - } - testLogger.Debug().Str("hostIP", hostIP).Int("port", remotePort).Msg("resolved EC2 CRE agent base URL from direct mode host") - return fmt.Sprintf("http://%s:%d", hostIP, remotePort), nil -} - -func resolveEC2AgentPort() (int, error) { - remotePort := defaultEC2AgentPort - if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { - parsedPort, err := strconv.Atoi(configuredPort) - if err != nil || parsedPort <= 0 || parsedPort > 65535 { - return 0, fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) - } - remotePort = parsedPort - } - return remotePort, nil -} func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { if output == nil { @@ -258,41 +34,12 @@ func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) return evm.FromOutput(testLogger, output) } -func prettifyAgentLogLine(line string) string { - trimmed := strings.TrimSpace(line) - if trimmed == "" { - return "" - } - - var payload map[string]any - if err := json.Unmarshal([]byte(trimmed), &payload); err != nil 
{ - return trimmed - } - - message, _ := payload["message"].(string) - if message == "" { - return trimmed - } - - level, _ := payload["level"].(string) - if level == "" { - level = "info" - } - - cmd, _ := payload["Cmd"].(string) - if cmd != "" { - return fmt.Sprintf("[%s] %s (cmd=%s)", level, message, cmd) - } - - return fmt.Sprintf("[%s] %s", level, message) -} - -func validatePhase2ARemoteBlockchainInput(input *blockchain.Input) error { +func validateRemoteBlockchainInput(input *blockchain.Input) error { if input == nil { return errors.New("blockchain input is nil") } if input.Type != blockchain.TypeAnvil { - return fmt.Errorf("remote target in phase 2A supports only %s, got %s", blockchain.TypeAnvil, input.Type) + return fmt.Errorf("remote target supports only %s, got %s", blockchain.TypeAnvil, input.Type) } return nil } @@ -302,7 +49,7 @@ func startBlockchainsWithTargets( testLogger zerolog.Logger, configuredBlockchains []*config.Blockchain, deployers map[blockchain.ChainFamily]blockchains.Deployer, - tunnelManager tunnel.Manager, + remoteRuntime *resolvedRemoteRuntime, rewriteInternalForLocalNodes bool, ) (*blockchains.DeployedBlockchains, error) { blockchainInputs, err := config.ResolveBlockchainInputs(configuredBlockchains) @@ -339,7 +86,10 @@ func startBlockchainsWithTargets( } if len(remoteIdx) > 0 { - startClient, err := newStartComponentClient(testLogger, tunnelManager) + if remoteRuntime == nil { + return nil, errors.New("remote runtime is required when starting remote blockchains") + } + startClient, err := newRemoteComponentClient(remoteRuntime) if err != nil { return nil, err } @@ -347,7 +97,7 @@ func startBlockchainsWithTargets( for _, idx := range remoteIdx { input := blockchainInputs[idx] configured := configuredBlockchains[idx] - if err := validatePhase2ARemoteBlockchainInput(input); err != nil { + if err := validateRemoteBlockchainInput(input); err != nil { return nil, err } @@ -384,7 +134,7 @@ func startBlockchainsWithTargets( return nil, 
pkgerrors.Wrap(err, "failed to decode blockchain transport payload") } - if err := rewriteRemoteBlockchainOutputForLocalAccess(ctx, testLogger, tunnelManager, idx, input, blockchainOutput, rewriteInternalForLocalNodes); err != nil { + if err := rewriteRemoteBlockchainOutputForLocalAccess(blockchainOutput, remoteRuntime.EC2HostIP, rewriteInternalForLocalNodes); err != nil { return nil, err } @@ -414,69 +164,19 @@ func startBlockchainsWithTargets( }, nil } -func newEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { - _ = testLogger - return tunnel.NewNoopManager(), nil -} - -func NewEC2TunnelManager(testLogger zerolog.Logger) (tunnel.Manager, error) { - return newEC2TunnelManager(testLogger) -} - func rewriteRemoteBlockchainOutputForLocalAccess( - ctx context.Context, - testLogger zerolog.Logger, - tunnelManager tunnel.Manager, - configuredIndex int, - input *blockchain.Input, output *blockchain.Output, + ec2HostIP string, rewriteInternalForLocalNodes bool, ) error { + _ = rewriteInternalForLocalNodes // direct mode keeps internal URLs unchanged if output == nil { return nil } - if isRemoteAccessDirectMode() { - hostIP, err := resolveDirectAccessHostIP() - if err != nil { - return err - } - return rewriteRemoteBlockchainOutputForDirectAccess(output, hostIP) - } - - componentID := tunnel.CanonicalComponentID(tunnel.KindBlockchain, configuredIndex, input.Type) - adapter := adapters.NewBlockchainAdapter() - - refs, err := adapter.DescribeEndpoints(componentID, output) - if err != nil { - return pkgerrors.Wrap(err, "failed to describe blockchain tunnel endpoints") - } - - bindings, err := tunnelManager.Start(ctx, refs) - if err != nil { - return pkgerrors.Wrap(err, "failed to start tunnels for blockchain output") - } - for _, binding := range bindings { - testLogger.Info(). - Str("componentID", binding.ComponentID). - Str("endpointName", binding.EndpointName). - Str("originalURL", binding.OriginalURL). - Str("localURL", binding.LocalURL). 
- Msg("Established endpoint tunnel") - } - - if err := adapter.RewriteWithBindings(output, bindings); err != nil { - return pkgerrors.Wrap(err, "failed to rewrite blockchain output with local tunnel bindings") - } - if rewriteInternalForLocalNodes { - if err := rewriteBlockchainInternalURLsForLocalNodes(output); err != nil { - return pkgerrors.Wrap(err, "failed to rewrite blockchain internal urls for local node containers") - } - } - - return nil + return rewriteRemoteBlockchainOutputForDirectAccess(output, ec2HostIP) } -func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, hostIP string) error { +func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, ec2HostIP string) error { if output == nil { return nil } @@ -485,14 +185,14 @@ func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, hos continue } if node.ExternalHTTPUrl != "" { - rewritten, err := rewriteURLHost(node.ExternalHTTPUrl, hostIP) + rewritten, err := rewriteURLHost(node.ExternalHTTPUrl, ec2HostIP) if err != nil { return err } node.ExternalHTTPUrl = rewritten } if node.ExternalWSUrl != "" { - rewritten, err := rewriteURLHost(node.ExternalWSUrl, hostIP) + rewritten, err := rewriteURLHost(node.ExternalWSUrl, ec2HostIP) if err != nil { return err } @@ -502,37 +202,6 @@ func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, hos return nil } -func rewriteBlockchainInternalURLsForLocalNodes(output *blockchain.Output) error { - if output == nil { - return nil - } - - dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") - for _, node := range output.Nodes { - if node == nil { - continue - } - - if node.ExternalHTTPUrl != "" { - internal, err := rewriteURLHost(node.ExternalHTTPUrl, dockerHost) - if err != nil { - return err - } - node.InternalHTTPUrl = internal - } - - if node.ExternalWSUrl != "" { - internal, err := rewriteURLHost(node.ExternalWSUrl, dockerHost) - if err != nil { - return err - } - 
node.InternalWSUrl = internal - } - } - - return nil -} - func rewriteURLHost(rawURL, host string) (string, error) { parsed, err := url.Parse(rawURL) if err != nil { @@ -546,14 +215,6 @@ func rewriteURLHost(rawURL, host string) (string, error) { return parsed.String(), nil } -func isRemoteAccessDirectMode() bool { - return runtimecfg.IsDirectMode() -} - -func resolveDirectAccessHostIP() (string, error) { - return runtimecfg.DirectHostIP() -} - func remoteAgentError(code, message string) error { return fmt.Errorf("remote agent error (%s): %s", code, message) } diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index ba14ade3d21..42c85742f84 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -6,30 +6,33 @@ import ( "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) -func TestValidatePhase2ARemoteBlockchainInput(t *testing.T) { - if err := validatePhase2ARemoteBlockchainInput(nil); err == nil { +func TestValidateRemoteBlockchainInput(t *testing.T) { + if err := validateRemoteBlockchainInput(nil); err == nil { t.Fatalf("expected nil input to fail validation") } - if err := validatePhase2ARemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeGeth}); err == nil { + if err := validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeGeth}); err == nil { t.Fatalf("expected non-anvil input to fail validation") } - if err := validatePhase2ARemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeAnvil}); err != nil { + if err := validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeAnvil}); err != nil { t.Fatalf("expected anvil input to pass validation, got 
%v", err) } } -func TestNewStartComponentClientPrefersEC2(t *testing.T) { +func TestNewRemoteComponentClientPrefersEC2(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") t.Setenv(envEC2AgentPort, "18080") - client, err := newStartComponentClient(zerolog.Nop(), tunnel.NewNoopManager()) + runtime, err := resolveRemoteRuntime(zerolog.Nop()) + if err != nil { + t.Fatalf("expected remote runtime to resolve, got %v", err) + } + client, err := newRemoteComponentClient(runtime) if err != nil { t.Fatalf("expected ec2-first client to be created, got %v", err) } @@ -44,7 +47,7 @@ func TestNewStartComponentClientPrefersEC2(t *testing.T) { if httpClient.maxAttempts != 3 { t.Fatalf("expected ec2 client retries to be enabled") } - if httpClient.baseURL != "http://10.193.28.183:18080" { + if httpClient.baseURL != "http://203.0.113.10:18080" { t.Fatalf("unexpected ec2 base url: %s", httpClient.baseURL) } } @@ -63,7 +66,7 @@ func TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testi func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") t.Setenv(envEC2AgentPort, "not-a-port") _, err := resolveEC2AgentBaseURL(zerolog.Nop()) @@ -77,30 +80,30 @@ func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { t.Setenv(envEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") t.Setenv(envEC2AgentPort, "18080") baseURL, err := resolveEC2AgentBaseURL(zerolog.Nop()) if err != nil { t.Fatalf("expected direct mode url resolution to succeed, got %v", err) } - if baseURL != "http://10.193.28.183:18080" { + if baseURL != "http://203.0.113.10:18080" { t.Fatalf("unexpected direct mode base url: %s", baseURL) } } 
-func TestNewStartComponentClientRequiresEC2Resolution(t *testing.T) { +func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { t.Setenv(envEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "") t.Setenv(runtimecfg.EnvEC2InstanceID, "") - if _, err := newStartComponentClient(zerolog.Nop(), tunnel.NewNoopManager()); err == nil { - t.Fatalf("expected client creation without EC2 resolution to fail") + if _, err := resolveRemoteRuntime(zerolog.Nop()); err == nil { + t.Fatalf("expected runtime resolution without EC2 inputs to fail") } } func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { - t.Setenv(runtimecfg.EnvEC2HostIP, "10.193.28.183") + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") out := &blockchain.Output{ Nodes: []*blockchain.Node{ { @@ -112,21 +115,17 @@ func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { }, } if err := rewriteRemoteBlockchainOutputForLocalAccess( - t.Context(), - zerolog.Nop(), - tunnel.NewNoopManager(), - 0, - &blockchain.Input{Type: blockchain.TypeAnvil}, out, + "203.0.113.10", true, ); err != nil { t.Fatalf("expected rewrite helper to succeed: %v", err) } - if out.Nodes[0].ExternalHTTPUrl != "http://10.193.28.183:8545" { + if out.Nodes[0].ExternalHTTPUrl != "http://203.0.113.10:8545" { t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) } - if out.Nodes[0].ExternalWSUrl != "ws://10.193.28.183:8546" { + if out.Nodes[0].ExternalWSUrl != "ws://203.0.113.10:8546" { t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) } if out.Nodes[0].InternalHTTPUrl != "http://anvil-1337:8545" { @@ -137,16 +136,6 @@ func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { } } -func TestNewEC2TunnelManagerAlwaysReturnsNoop(t *testing.T) { - manager, err := newEC2TunnelManager(zerolog.Nop()) - if err != nil { - t.Fatalf("expected noop manager, got error: %v", err) - } - if manager.IsStarted() { - t.Fatalf("expected noop manager to report not started") - } -} - 
func TestRemoteAgentErrorFormatting(t *testing.T) { err := remoteAgentError("deployment_failed", "failed to deploy blockchain output") want := "remote agent error (deployment_failed): failed to deploy blockchain output" diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 7f3a6455378..fae986a0370 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -3,6 +3,7 @@ package environment import ( "context" "encoding/json" + "errors" "fmt" "net/url" "strconv" @@ -63,7 +64,7 @@ func StartDONs( capabilityConfigs cre.CapabilityConfigs, copyCapabilityBinaries bool, nodeSets []*cre.NodeSet, - tunnelManager tunnel.Manager, + remoteRuntime *resolvedRemoteRuntime, ) (*StartedDONs, error) { if infraInput.IsKubernetes() { // For Kubernetes, DONs are already running in the cluster, generate service URLs @@ -135,7 +136,10 @@ func StartDONs( var resultMap sync.Map var startClient componentClient if hasRemoteNodeSets(nodeSets) { - client, clientErr := newStartComponentClient(lggr, tunnelManager) + if remoteRuntime == nil { + return nil, errors.New("remote runtime is required when starting remote nodesets") + } + client, clientErr := newRemoteComponentClient(remoteRuntime) if clientErr != nil { return nil, clientErr } @@ -195,7 +199,7 @@ func StartDONs( if err != nil { return pkgerrors.Wrap(err, "failed to decode nodeset transport payload") } - if err := rewriteRemoteNodeSetOutputForLocalAccess(ctx, lggr, tunnelManager, topology, idx, nodeSet, nodeset); err != nil { + if err := rewriteRemoteNodeSetOutputForLocalAccess(topology, idx, nodeSet, nodeset, remoteRuntime.EC2HostIP); err != nil { return err } } else { @@ -300,51 +304,18 @@ func validateRemoteNodeSetNodeSpecs(nodeSetName string, specs []*clnode.Input) e return nil } -func rewriteRemoteNodeSetOutputForLocalAccess( - ctx context.Context, - lggr zerolog.Logger, - tunnelManager tunnel.Manager, - topology *cre.Topology, - 
configuredIndex int, - nodeSet *cre.NodeSet, - output *ns.Output, -) error { +func rewriteRemoteNodeSetOutputForLocalAccess(topology *cre.Topology, configuredIndex int, nodeSet *cre.NodeSet, output *ns.Output, ec2HostIP string) error { if output == nil && (nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0) { return nil } - if isRemoteAccessDirectMode() { - hostIP, err := resolveDirectAccessHostIP() - if err != nil { - return err - } - if err := rewriteNodeSetForDirectAccess(output, hostIP); err != nil { - return err - } - rewriteGatewayIncomingForDirectAccess(topology, configuredIndex, hostIP) - return nil - } - componentID := tunnel.CanonicalComponentID(tunnel.KindNodeSet, configuredIndex, nodeSet.Name) - refs, err := describeNodeSetEndpoints(componentID, nodeSet, output) - if err != nil { - return pkgerrors.Wrap(err, "failed to describe nodeset tunnel endpoints") + if err := rewriteNodeSetForDirectAccess(output, ec2HostIP); err != nil { + return err } - bindings, err := tunnelManager.Start(ctx, refs) - if err != nil { - return pkgerrors.Wrap(err, "failed to start tunnels for nodeset output") - } - for _, binding := range bindings { - lggr.Info(). - Str("componentID", binding.ComponentID). - Str("endpointName", binding.EndpointName). - Str("originalURL", binding.OriginalURL). - Str("localURL", binding.LocalURL). 
- Msg("Established endpoint tunnel") - } - rewriteGatewayIncomingForNodeSetBindings(topology, configuredIndex, nodeSet, bindings) - return rewriteNodeSetWithBindings(output, nodeSet, bindings) + rewriteGatewayIncomingForDirectAccess(topology, configuredIndex, ec2HostIP) + return nil } -func rewriteNodeSetForDirectAccess(output *ns.Output, hostIP string) error { +func rewriteNodeSetForDirectAccess(output *ns.Output, ec2HostIP string) error { if output == nil { return nil } @@ -353,7 +324,7 @@ func rewriteNodeSetForDirectAccess(output *ns.Output, hostIP string) error { if strings.TrimSpace(rawURL) == "" { continue } - rewritten, err := rewriteURLHost(rawURL, hostIP) + rewritten, err := rewriteURLHost(rawURL, ec2HostIP) if err != nil { return err } @@ -565,7 +536,7 @@ func rewriteGatewayIncomingForNodeSetBindings( } } -func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredIndex int, hostIP string) { +func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredIndex int, ec2HostIP string) { if topology == nil || topology.GatewayConnectors == nil || len(topology.GatewayConnectors.Configurations) == 0 { return } @@ -581,7 +552,7 @@ func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredInd if cfg == nil || cfg.GatewayConfiguration == nil || cfg.NodeUUID != gatewayNode.UUID { continue } - cfg.Incoming.Host = hostIP + cfg.Incoming.Host = ec2HostIP } } diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index a836843ef59..eeaf12b13bc 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -41,7 +41,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" - 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/sharding" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" @@ -56,35 +55,21 @@ type SetupOutput struct { NodeOutput []*cre.NodeSetOutput S3ProviderOutput *s3provider.Output GatewayConnectors *cre.GatewayConnectors - - tunnelManager tunnel.Manager - closeOnce sync.Once - closeErr error + closeOnce sync.Once + closeErr error } func (s *SetupOutput) Close(ctx context.Context) error { if s == nil { return nil } - manager := s.tunnelManager - if manager == nil { - manager = tunnel.NewNoopManager() - } - s.closeOnce.Do(func() { - s.closeErr = manager.Stop(ctx) + s.closeErr = nil }) return s.closeErr } -func (s *SetupOutput) TunnelBindings() []tunnel.TunnelBinding { - if s == nil || s.tunnelManager == nil { - return []tunnel.TunnelBinding{} - } - return s.tunnelManager.Snapshot() -} - type SetupInput struct { NodeSets []*cre.NodeSet Blockchains []*config.Blockchain @@ -173,11 +158,14 @@ func SetupTestEnvironment( if s3Err != nil { return nil, pkgerrors.Wrap(s3Err, "failed to start S3 provider") } - - tunnelManager, tmErr := newEC2TunnelManager(testLogger) - if tmErr != nil { - return nil, pkgerrors.Wrap(tmErr, "failed to initialize tunnel manager") + var remoteRuntime *resolvedRemoteRuntime + if hasRemoteComponents(input.Blockchains, input.JdInput, input.NodeSets) { + remoteRuntime, err = resolveRemoteRuntime(testLogger) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to resolve remote runtime settings") + } } + testLogger.Info().Msg("using persistent relay supervisor for mixed component relays") fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) @@ -187,18 +175,12 @@ func SetupTestEnvironment( testLogger, input.Blockchains, input.BlockchainDeployers, - tunnelManager, + 
remoteRuntime, nodeSetPlacement.HasLocalTargets, ) if startErr != nil { return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") } - cleanupTunnelsOnError := true - defer func() { - if cleanupTunnelsOnError { - _ = tunnelManager.Stop(ctx) - } - }() creEnvironment := &cre.Environment{ Blockchains: deployedBlockchains.Outputs, @@ -278,7 +260,7 @@ func SetupTestEnvironment( fmt.Print(libformat.PurpleText("%s", input.StageGen.WrapAndNext("Applied Features in %.2f seconds", input.StageGen.Elapsed().Seconds()))) - startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, tunnelManager) + startedJD, jdStartErr := StartJD(ctx, testLogger, input.JdInput, input.Provider, remoteRuntime) if jdStartErr != nil { return nil, pkgerrors.Wrap(jdStartErr, "failed to start Job Distributor") } @@ -291,7 +273,7 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(err, "bootstrap reachability sanity check failed") } - startedDONs, donStartErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, tunnelManager) + startedDONs, donStartErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, remoteRuntime) if donStartErr != nil { return nil, pkgerrors.Wrap(donStartErr, "failed to start DONs") } @@ -481,7 +463,6 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(err, "failed to store workflow registry configuration output") } - cleanupTunnelsOnError = false return &SetupOutput{ WorkflowRegistryConfigurationOutput: workflowRegistryConfigurationOutput, // pass to caller, so that it can be optionally attached to TestConfig and saved to disk Dons: dons, @@ -489,7 +470,6 @@ func SetupTestEnvironment( CreEnvironment: creEnvironment, S3ProviderOutput: s3Output, GatewayConnectors: topology.GatewayConnectors, - 
tunnelManager: tunnelManager, }, nil } @@ -524,6 +504,23 @@ func blockchainPlacementsBySelector(configured []*config.Blockchain, deployed [] return bySelector } +func hasRemoteComponents(blockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet) bool { + for _, configuredBlockchain := range blockchains { + if configuredBlockchain != nil && configuredBlockchain.Placement == config.PlacementRemote { + return true + } + } + if jdInput != nil && jdInput.Placement == config.PlacementRemote { + return true + } + for _, nodeSet := range nodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { + return true + } + } + return false +} + type nodeSetPlacementSummary struct { HasLocalTargets bool HasRemoteTargets bool @@ -601,11 +598,11 @@ func verifyRemoteToLocalBootstrapReachability(ctx context.Context, lggr zerolog. return nil } - hostIP, err := runtimecfg.DirectHostIP() + ec2HostIP, err := runtimecfg.DirectHostIP() if err != nil { return fmt.Errorf("resolve direct EC2 host ip: %w", err) } - remoteRelayAddr := net.JoinHostPort(hostIP, strconv.Itoa(cre.OCRPeeringPort)) + remoteRelayAddr := net.JoinHostPort(ec2HostIP, strconv.Itoa(cre.OCRPeeringPort)) if err := waitForTCPReachable(ctx, remoteRelayAddr, 6*time.Second); err != nil { return fmt.Errorf("remote relay listener for bootstrap peering is not reachable at %s: %w", remoteRelayAddr, err) } diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index 771d752b705..e8a1a2d933b 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -7,7 +7,6 @@ import ( "fmt" "net" "net/url" - "strconv" "strings" "time" @@ -23,7 +22,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -66,7 +64,7 @@ func StartJD( lggr zerolog.Logger, jdConfig *config.JobDistributor, infraInput infra.Provider, - tunnelManager tunnel.Manager, + remoteRuntime *resolvedRemoteRuntime, ) (*StartedJD, error) { startTime := time.Now() lggr.Info().Msg("Starting Job Distributor") @@ -78,7 +76,10 @@ func StartJD( var jdErr error if jdConfig.Placement == config.PlacementRemote { - startClient, err := newStartComponentClient(lggr, tunnelManager) + if remoteRuntime == nil { + return nil, errors.New("remote runtime is required when starting remote jd") + } + startClient, err := newRemoteComponentClient(remoteRuntime) if err != nil { return nil, err } @@ -113,7 +114,7 @@ func StartJD( if err != nil { return nil, pkgerrors.Wrap(err, "failed to decode jd transport payload") } - if err := rewriteRemoteJDOutputForLocalAccess(ctx, lggr, tunnelManager, jdOutput); err != nil { + if err := rewriteRemoteJDOutputForLocalAccess(jdOutput, remoteRuntime.EC2HostIP); err != nil { return nil, err } } else if infraInput.IsKubernetes() { @@ -170,52 +171,16 @@ func StartJD( }, nil } -func rewriteRemoteJDOutputForLocalAccess( - ctx context.Context, - lggr zerolog.Logger, - tunnelManager tunnel.Manager, - output *jd.Output, -) error { +func rewriteRemoteJDOutputForLocalAccess(output *jd.Output, ec2HostIP string) error { if output == nil { return nil } - if isRemoteAccessDirectMode() { - hostIP, err := resolveDirectAccessHostIP() - if err != nil { - return err - } - return rewriteJDForDirectAccess(output, hostIP) - } - if tunnelManager == nil { - return errors.New("tunnel manager is required for remote jd target") - } - - refs, err := describeJDEndpoints(output) - if err != nil { - return pkgerrors.Wrap(err, "failed to describe jd tunnel endpoints") - } - bindings, err := tunnelManager.Start(ctx, refs) - if err != nil { - return pkgerrors.Wrap(err, 
"failed to start tunnels for jd output") - } - for _, binding := range bindings { - lggr.Info(). - Str("componentID", binding.ComponentID). - Str("endpointName", binding.EndpointName). - Str("originalURL", binding.OriginalURL). - Str("localURL", binding.LocalURL). - Msg("Established endpoint tunnel") - } - return rewriteJDWithBindings(output, bindings) + return rewriteJDForDirectAccess(output, ec2HostIP) } -func rewriteJDForDirectAccess(output *jd.Output, hostIP string) error { - if output == nil { - return nil - } - +func rewriteJDForDirectAccess(output *jd.Output, ec2HostIP string) error { if output.ExternalGRPCUrl != "" { - rewritten, err := rewriteAddressHost(output.ExternalGRPCUrl, hostIP) + rewritten, err := rewriteAddressHost(output.ExternalGRPCUrl, ec2HostIP) if err != nil { return err } @@ -227,7 +192,7 @@ func rewriteJDForDirectAccess(output *jd.Output, hostIP string) error { if source == "" { source = output.InternalWSRPCUrl } - rewritten, err := rewriteAddressHost(source, hostIP) + rewritten, err := rewriteAddressHost(source, ec2HostIP) if err != nil { return err } @@ -236,98 +201,6 @@ func rewriteJDForDirectAccess(output *jd.Output, hostIP string) error { return nil } -func describeJDEndpoints(output *jd.Output) ([]tunnel.EndpointRef, error) { - refs := make([]tunnel.EndpointRef, 0, 2) - componentID := tunnel.CanonicalComponentID(tunnel.KindJD, 0, "job-distributor") - - grpcRef, err := jdEndpointFromAddress(componentID, "grpc", output.ExternalGRPCUrl) - if err != nil { - return nil, err - } - if grpcRef != nil { - refs = append(refs, *grpcRef) - } - - wsrpcRef, err := jdEndpointFromAddress(componentID, "wsrpc", output.ExternalWSRPCUrl) - if err != nil { - return nil, err - } - if wsrpcRef != nil { - refs = append(refs, *wsrpcRef) - } - - return refs, nil -} - -func rewriteJDWithBindings(output *jd.Output, bindings []tunnel.TunnelBinding) error { - byName := make(map[string]tunnel.TunnelBinding, len(bindings)) - for _, binding := range bindings { - 
byName[binding.EndpointName] = binding - } - - if output.ExternalGRPCUrl != "" { - binding, ok := byName["grpc"] - if !ok { - return fmt.Errorf("missing tunnel binding for jd grpc endpoint") - } - output.ExternalGRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) - } - - if output.ExternalWSRPCUrl != "" || output.InternalWSRPCUrl != "" { - binding, ok := byName["wsrpc"] - if !ok { - return fmt.Errorf("missing tunnel binding for jd wsrpc endpoint") - } - output.ExternalWSRPCUrl = net.JoinHostPort("127.0.0.1", fmt.Sprintf("%d", binding.LocalPort)) - } - - return nil -} - -func jdEndpointFromAddress(componentID, endpointName, rawAddress string) (*tunnel.EndpointRef, error) { - trimmed := strings.TrimSpace(rawAddress) - if trimmed == "" { - return nil, nil - } - - host := "" - port := "" - - if strings.Contains(trimmed, "://") { - parsedURL, err := url.Parse(trimmed) - if err != nil { - return nil, fmt.Errorf("failed to parse jd endpoint %q: %w", rawAddress, err) - } - host = parsedURL.Hostname() - port = parsedURL.Port() - } else { - parsedHost, parsedPort, err := net.SplitHostPort(trimmed) - if err != nil { - return nil, fmt.Errorf("failed to parse jd host:port endpoint %q: %w", rawAddress, err) - } - host = parsedHost - port = parsedPort - } - - if host == "" || port == "" { - return nil, fmt.Errorf("jd endpoint %q must contain host and port", rawAddress) - } - - portNumber, err := strconv.Atoi(port) - if err != nil || portNumber <= 0 || portNumber > 65535 { - return nil, fmt.Errorf("jd endpoint %q has invalid port %q", rawAddress, port) - } - - return &tunnel.EndpointRef{ - ComponentID: componentID, - EndpointName: endpointName, - Scheme: "tcp", - Host: host, - Port: portNumber, - OriginalURL: trimmed, - }, nil -} - func rewriteAddressHost(rawAddress, host string) (string, error) { trimmed := strings.TrimSpace(rawAddress) if trimmed == "" { diff --git a/system-tests/lib/cre/environment/jobs_test.go 
b/system-tests/lib/cre/environment/jobs_test.go index e986aca5b96..943632ef930 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -4,62 +4,41 @@ import ( "testing" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) -func TestDescribeJDEndpointsUsesExternalWSRPC(t *testing.T) { +func TestRewriteJDForDirectAccessRewritesExternalEndpoints(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1:14231", - ExternalWSRPCUrl: "127.0.0.1:8080", + ExternalWSRPCUrl: "127.0.0.1:9080", InternalWSRPCUrl: "job-distributor:8080", } - refs, err := describeJDEndpoints(output) - if err != nil { - t.Fatalf("describeJDEndpoints returned error: %v", err) + if err := rewriteJDForDirectAccess(output, "10.20.30.40"); err != nil { + t.Fatalf("rewriteJDForDirectAccess returned error: %v", err) } - if len(refs) != 2 { - t.Fatalf("expected 2 endpoint refs, got %d", len(refs)) + if output.ExternalGRPCUrl != "10.20.30.40:14231" { + t.Fatalf("expected external grpc url to be rewritten, got %s", output.ExternalGRPCUrl) } - - var wsrpcRef *tunnel.EndpointRef - for i := range refs { - if refs[i].EndpointName == "wsrpc" { - wsrpcRef = &refs[i] - break - } - } - if wsrpcRef == nil { - t.Fatal("missing wsrpc endpoint ref") + if output.ExternalWSRPCUrl != "10.20.30.40:9080" { + t.Fatalf("expected external wsrpc url to be rewritten, got %s", output.ExternalWSRPCUrl) } - if wsrpcRef.Host != "127.0.0.1" || wsrpcRef.Port != 8080 { - t.Fatalf("expected wsrpc endpoint to use external address 127.0.0.1:8080, got %s:%d", wsrpcRef.Host, wsrpcRef.Port) + if output.InternalWSRPCUrl != "job-distributor:8080" { + t.Fatalf("expected internal wsrpc url to remain unchanged, got %s", output.InternalWSRPCUrl) } } -func TestRewriteJDWithBindingsRewritesNodeFacingWSRPC(t *testing.T) { +func 
TestRewriteJDForDirectAccessFallsBackToInternalWSRPCSource(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1:14231", - ExternalWSRPCUrl: "127.0.0.1:8080", + ExternalWSRPCUrl: "", InternalWSRPCUrl: "job-distributor:8080", } - bindings := []tunnel.TunnelBinding{ - { - EndpointRef: tunnel.EndpointRef{EndpointName: "grpc"}, - LocalPort: 61001, - }, - { - EndpointRef: tunnel.EndpointRef{EndpointName: "wsrpc"}, - LocalPort: 61002, - }, - } - if err := rewriteJDWithBindings(output, bindings); err != nil { - t.Fatalf("rewriteJDWithBindings returned error: %v", err) + if err := rewriteJDForDirectAccess(output, "10.20.30.40"); err != nil { + t.Fatalf("rewriteJDForDirectAccess returned error: %v", err) } - - if output.ExternalWSRPCUrl != "127.0.0.1:61002" { - t.Fatalf("expected external wsrpc url to be rewritten to 127.0.0.1:61002, got %s", output.ExternalWSRPCUrl) + if output.ExternalWSRPCUrl != "10.20.30.40:8080" { + t.Fatalf("expected external wsrpc url to be derived from internal source, got %s", output.ExternalWSRPCUrl) } if output.InternalWSRPCUrl != "job-distributor:8080" { t.Fatalf("expected internal wsrpc url to remain unchanged, got %s", output.InternalWSRPCUrl) diff --git a/system-tests/lib/cre/environment/remote_component_client.go b/system-tests/lib/cre/environment/remote_component_client.go new file mode 100644 index 00000000000..b94513d4a39 --- /dev/null +++ b/system-tests/lib/cre/environment/remote_component_client.go @@ -0,0 +1,243 @@ +package environment + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net" + "net/http" + "os" + "strconv" + "strings" + "time" + + retry "github.com/avast/retry-go/v4" + pkgerrors "github.com/pkg/errors" + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +const ( + componentTypeBlockchain = "blockchain" + componentTypeJD = "jd" + 
componentTypeNodeSet = "nodeset" + envEC2AgentURL = "CRE_EC2_AGENT_URL" + envEC2AgentPort = "CRE_EC2_AGENT_PORT" + defaultEC2AgentPort = 8080 +) + +type componentClient interface { + StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) +} + +type httpComponentClient struct { + baseURL string + client *http.Client + maxAttempts int + retryDelay time.Duration + checkHealth bool +} + +type resolvedRemoteRuntime struct { + AgentBaseURL string + EC2HostIP string +} + +func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { + return &httpComponentClient{ + baseURL: baseURL, + client: &http.Client{ + Timeout: 4 * time.Minute, + }, + maxAttempts: 3, + retryDelay: 2 * time.Second, + checkHealth: true, + } +} + +func resolveRemoteRuntime(testLogger zerolog.Logger) (*resolvedRemoteRuntime, error) { + baseURL, err := resolveEC2AgentBaseURL(testLogger) + if err != nil { + return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) + } + ec2HostIP, err := resolveEC2HostIP() + if err != nil { + return nil, err + } + return &resolvedRemoteRuntime{ + AgentBaseURL: baseURL, + EC2HostIP: ec2HostIP, + }, nil +} + +func newRemoteComponentClient(runtime *resolvedRemoteRuntime) (componentClient, error) { + if runtime == nil || strings.TrimSpace(runtime.AgentBaseURL) == "" { + return nil, errors.New("resolved runtime is nil or missing agent base url") + } + return newEC2HTTPComponentClient(runtime.AgentBaseURL), nil +} + +func (c *httpComponentClient) StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { + if c.checkHealth { + if err := c.waitForHealth(ctx); err != nil { + return nil, err + } + } + + var result *agent.StartComponentResponse + err := retry.Do( + func() error { + var err error + result, err = c.startComponentOnce(ctx, envelope) + return err + }, + retry.Attempts(uint(c.maxAttempts)), + retry.Delay(c.retryDelay), + 
retry.Context(ctx), + retry.LastErrorOnly(true), + ) + if err != nil { + return nil, err + } + + return result, nil +} + +func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { + body, err := json.Marshal(envelope) + if err != nil { + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to encode start component envelope")) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/components/start", bytes.NewReader(body)) + if err != nil { + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to create start component request")) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := c.client.Do(req) + if err != nil { + if isRetriableNetworkError(err) { + return nil, pkgerrors.Wrap(err, "failed to execute start component request") + } + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to execute start component request")) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to read start component response")) + } + + var startResp agent.StartComponentResponse + if len(respBody) > 0 { + if err := json.Unmarshal(respBody, &startResp); err != nil { + return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start component response")) + } + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + if startResp.Error != "" { + if startResp.ErrorCode != "" { + err = remoteAgentError(startResp.ErrorCode, startResp.Error) + } else { + err = remoteAgentError("remote_agent_error", startResp.Error) + } + } else { + err = fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) + } + + if isRetriableStatus(resp.StatusCode) { + return nil, err + } + return nil, retry.Unrecoverable(err) + } + if startResp.Error != "" { + if startResp.ErrorCode != "" { + return nil, 
retry.Unrecoverable(remoteAgentError(startResp.ErrorCode, startResp.Error)) + } + return nil, retry.Unrecoverable(remoteAgentError("remote_agent_error", startResp.Error)) + } + + return &startResp, nil +} + +func (c *httpComponentClient) waitForHealth(ctx context.Context) error { + healthURL := c.baseURL + "/v1/health" + return retry.Do( + func() error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) + if err != nil { + return retry.Unrecoverable(err) + } + resp, err := c.client.Do(req) + if err != nil { + return err + } + _ = resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + return fmt.Errorf("%s: status %s", describeEC2AgentHealthFailure(c.baseURL), resp.Status) + }, + retry.Attempts(uint(c.maxAttempts)), + retry.Delay(c.retryDelay), + retry.Context(ctx), + retry.LastErrorOnly(true), + ) +} + +func describeEC2AgentHealthFailure(baseURL string) string { + return fmt.Sprintf( + "failed EC2 CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", + baseURL, + envEC2AgentPort, + envEC2AgentURL, + ) +} + +func isRetriableStatus(statusCode int) bool { + return statusCode == http.StatusBadGateway || statusCode == http.StatusServiceUnavailable || statusCode == http.StatusGatewayTimeout +} + +func isRetriableNetworkError(err error) bool { + var netErr net.Error + return errors.As(err, &netErr) +} + +func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { + if configured := strings.TrimSpace(os.Getenv(envEC2AgentURL)); configured != "" { + return configured, nil + } + remotePort, err := resolveEC2AgentPort() + if err != nil { + return "", err + } + ec2HostIP, err := resolveEC2HostIP() + if err != nil { + return "", err + } + testLogger.Debug().Str("ec2HostIP", ec2HostIP).Int("port", remotePort).Msg("resolved EC2 CRE agent base URL") + return fmt.Sprintf("http://%s:%d", ec2HostIP, remotePort), nil +} + +func 
resolveEC2AgentPort() (int, error) { + remotePort := defaultEC2AgentPort + if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { + parsedPort, err := strconv.Atoi(configuredPort) + if err != nil || parsedPort <= 0 || parsedPort > 65535 { + return 0, fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) + } + remotePort = parsedPort + } + return remotePort, nil +} + +func resolveEC2HostIP() (string, error) { + return runtimecfg.DirectHostIP() +} diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index 776540b9bc4..157e1dd54cc 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -15,7 +15,6 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) type RemoteStopSummary struct { @@ -41,13 +40,11 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. return summary, nil } - tunnelManager, err := newEC2TunnelManager(lggr) + remoteRuntime, err := resolveRemoteRuntime(lggr) if err != nil { - return summary, pkgerrors.Wrap(err, "failed to initialize tunnel manager for remote stop") + return summary, pkgerrors.Wrap(err, "failed to resolve remote runtime settings for stop") } - defer func() { _ = tunnelManager.Stop(ctx) }() - - startClient, err := newStartComponentClient(lggr, tunnelManager) + startClient, err := newRemoteComponentClient(remoteRuntime) if err != nil { return summary, pkgerrors.Wrap(err, "failed to initialize remote component client for stop") } @@ -116,7 +113,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
} } - containers, volumes, listErr := listRemoteCTFResources(ctx, lggr, tunnelManager) + containers, volumes, listErr := listRemoteCTFResources(ctx, remoteRuntime.AgentBaseURL) if listErr != nil { summary.ResidualQueryError = listErr.Error() } else { @@ -191,14 +188,8 @@ func stopRemoteComponent( func listRemoteCTFResources( ctx context.Context, - lggr zerolog.Logger, - tunnelManager tunnel.Manager, + baseURL string, ) ([]string, []string, error) { - _ = tunnelManager - baseURL, err := resolveEC2AgentBaseURL(lggr) - if err != nil { - return nil, nil, pkgerrors.Wrap(err, "resolve agent base url for ctf resource query") - } req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimRight(baseURL, "/")+"/v1/resources/ctf", nil) if err != nil { return nil, nil, err diff --git a/system-tests/lib/cre/environment/setup_output_test.go b/system-tests/lib/cre/environment/setup_output_test.go index 7ef20133d95..a8a77b7c42b 100644 --- a/system-tests/lib/cre/environment/setup_output_test.go +++ b/system-tests/lib/cre/environment/setup_output_test.go @@ -3,27 +3,10 @@ package environment import ( "context" "testing" - - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" ) -type countingTunnelManager struct { - stopCalls int -} - -func (c *countingTunnelManager) Start(_ context.Context, _ []tunnel.EndpointRef) ([]tunnel.TunnelBinding, error) { - return nil, nil -} -func (c *countingTunnelManager) Stop(_ context.Context) error { - c.stopCalls++ - return nil -} -func (c *countingTunnelManager) IsStarted() bool { return false } -func (c *countingTunnelManager) Snapshot() []tunnel.TunnelBinding { return []tunnel.TunnelBinding{} } - func TestSetupOutputCloseIsIdempotent(t *testing.T) { - manager := &countingTunnelManager{} - out := &SetupOutput{tunnelManager: manager} + out := &SetupOutput{} if err := out.Close(context.Background()); err != nil { t.Fatalf("expected first close to succeed: %v", err) @@ -31,7 +14,4 @@ func 
TestSetupOutputCloseIsIdempotent(t *testing.T) { if err := out.Close(context.Background()); err != nil { t.Fatalf("expected second close to succeed: %v", err) } - if manager.stopCalls != 1 { - t.Fatalf("expected tunnel manager stop once, got %d", manager.stopCalls) - } } diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index df3aa571c88..89c1174b149 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -7,6 +7,7 @@ This runbook covers the EC2-based remote mode for CRE where components can run e - Remote backend is EC2 + Docker (no Kubernetes path). - Remote control plane is the CRE agent. - Access mode is direct-only. +- Runtime no longer uses tunnel-manager orchestration for component endpoint access. ## Core Environment Variables @@ -46,7 +47,7 @@ For direct-mode auto IP lookup, AWS CLI auth selection follows: - Use `placement = "local" | "remote"` in CRE component config (NodeSets, JD, Blockchains). - Same placement (`local->local`, `remote->remote`) uses **internal** URLs. - Cross placement (`local->remote`, `remote->local`) uses **external** URLs. -- Remote NodeSets targeting local gateway are allowed when bridge/tunnel plumbing for gateway ingress is present. +- Remote NodeSets targeting local gateway are allowed when relay plumbing for gateway ingress is present. ## P2P Peering Rules (SharedPeering) @@ -80,7 +81,7 @@ For direct-mode auto IP lookup, AWS CLI auth selection follows: ## Fast Triage Checklist -- Agent unreachable: verify bind address/port vs chosen access mode. +- Agent unreachable: verify `CRE_EC2_AGENT_URL` (if set), or `CRE_EC2_INSTANCE_ID`/AWS credentials + `CRE_EC2_AGENT_PORT`. - Direct mode cannot resolve EC2 IP: ensure `CRE_EC2_INSTANCE_ID` is set and AWS CLI credentials are valid, or set `CRE_EC2_HOST_IP` explicitly. 
- `invalid jd placement`: use `placement=local` or `placement=remote` (only supported values). - Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. diff --git a/system-tests/tests/smoke/cre/v2_grpc_source_test.go b/system-tests/tests/smoke/cre/v2_grpc_source_test.go index fc6acbeec4a..df06fc46b98 100644 --- a/system-tests/tests/smoke/cre/v2_grpc_source_test.go +++ b/system-tests/tests/smoke/cre/v2_grpc_source_test.go @@ -26,7 +26,6 @@ import ( crontypes "github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v2/cron/types" creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" grpcsourcemock "github.com/smartcontractkit/chainlink/system-tests/lib/cre/grpc_source_mock" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" @@ -603,12 +602,6 @@ func compileAndCopyWorkflow(t *testing.T, testEnv *ttypes.TestEnvironment, workf break } } - var remoteTunnelManager tunnel.Manager - if mode == creworkflow.ArtifactDeployModeRemote { - remoteTunnelManager, err = creenv.NewEC2TunnelManager(testLogger) - require.NoError(t, err, "failed to initialize tunnel manager for remote artifact deploy") - defer func() { _ = remoteTunnelManager.Stop(ctx) }() - } err = creworkflow.DeployArtifacts( ctx, creworkflow.DeployArtifactsOptions{ @@ -618,7 +611,7 @@ func compileAndCopyWorkflow(t *testing.T, testEnv *ttypes.TestEnvironment, workf ContainerTargetDir: containerTargetDir, Files: []string{compressedWasmPath, configFilePath}, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, remoteTunnelManager, nodeSetName, 
containerTargetDir, files) + return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) }, }, ) diff --git a/system-tests/tests/test-helpers/t_helpers.go b/system-tests/tests/test-helpers/t_helpers.go index bd7c850cc71..5c9c15caa3f 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -63,7 +63,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" crecrypto "github.com/smartcontractkit/chainlink/system-tests/lib/crypto" @@ -340,19 +339,8 @@ func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog. // Copy workflow artifacts to Docker containers to use blockchain client running inside for workflow registration testLogger.Info().Msg("Copying workflow artifacts to Docker containers.") - var remoteTunnelManager tunnel.Manager - defer func() { - if remoteTunnelManager != nil { - _ = remoteTunnelManager.Stop(t.Context()) - } - }() for _, don := range workflowDONs { mode, nodeSetName := resolveWorkflowDONArtifactMode(testEnv.Config, don.Name) - if mode == creworkflow.ArtifactDeployModeRemote && remoteTunnelManager == nil { - manager, managerErr := creenv.NewEC2TunnelManager(testLogger) - require.NoError(t, managerErr, "failed to initialize tunnel manager for remote artifact deploy") - remoteTunnelManager = manager - } copyErr := creworkflow.DeployArtifacts( t.Context(), creworkflow.DeployArtifactsOptions{ @@ -362,7 +350,7 @@ func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog. 
ContainerTargetDir: creworkflow.DefaultWorkflowTargetDir, Files: []string{compressedWorkflowWasmPath, workflowConfigFilePath}, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, remoteTunnelManager, nodeSetName, containerTargetDir, files) + return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) }, }, ) From 818cb1824fe0002f6bd598ba08821be0ac713c3c Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 11:05:04 +0100 Subject: [PATCH 20/34] increase test coverage --- system-tests/lib/cre/bootstrap_peer_test.go | 90 ++++--- .../lib/cre/don/config/config_test.go | 94 +++++++ .../lib/cre/environment/agent/deploy.go | 12 +- .../lib/cre/environment/agent/deploy_test.go | 236 ++++++++++++++++++ .../lib/cre/environment/agent/relay_test.go | 147 +++++++++++ .../environment/agent/server_handlers_test.go | 116 +++++++++ .../cre/environment/artifacts_remote_test.go | 86 +++++++ .../cre/environment/blockchain_start_test.go | 135 +++++----- .../lib/cre/environment/config/config_test.go | 74 ++++++ system-tests/lib/cre/environment/dons_test.go | 207 ++++++++++++++- .../environment/environment_placement_test.go | 52 +++- system-tests/lib/cre/environment/jobs_test.go | 60 +++-- .../remote_component_client_test.go | 158 ++++++++++++ .../lib/cre/environment/remote_stop_test.go | 214 ++++++++++++++++ .../lib/cre/runtimecfg/access_mode_test.go | 78 ++++++ .../lib/cre/workflow/deploy_artifacts_test.go | 71 ++++++ 16 files changed, 1689 insertions(+), 141 deletions(-) create mode 100644 system-tests/lib/cre/don/config/config_test.go create mode 100644 system-tests/lib/cre/environment/agent/deploy_test.go create mode 100644 system-tests/lib/cre/environment/agent/relay_test.go create mode 100644 system-tests/lib/cre/environment/agent/server_handlers_test.go create mode 100644 
system-tests/lib/cre/environment/artifacts_remote_test.go create mode 100644 system-tests/lib/cre/environment/config/config_test.go create mode 100644 system-tests/lib/cre/environment/remote_component_client_test.go create mode 100644 system-tests/lib/cre/environment/remote_stop_test.go create mode 100644 system-tests/lib/cre/runtimecfg/access_mode_test.go create mode 100644 system-tests/lib/cre/workflow/deploy_artifacts_test.go diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index a54a47d9320..601c63099c4 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -4,17 +4,14 @@ import ( "os" "testing" + "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { addresses, err := ResolveP2PAnnounceAddresses("local", false, 15001) - if err != nil { - t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) - } - if len(addresses) != 0 { - t.Fatalf("expected local-only setup to leave announce addresses unset, got %v", addresses) - } + require.NoError(t, err, "ResolveP2PAnnounceAddresses should not fail") + require.Len(t, addresses, 0, "expected local-only setup to leave announce addresses unset") } func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { @@ -36,18 +33,10 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { _ = os.Setenv(runtimecfg.EnvLocalHostIP, "192.168.1.10") addresses, err := ResolveP2PAnnounceAddresses("local", true, 15002) - if err != nil { - t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) - } - if len(addresses) != 2 { - t.Fatalf("expected two announce addresses for mixed mode, got %d (%v)", len(addresses), addresses) - } - if addresses[0] != "192.168.1.10:15002" { - t.Fatalf("expected local host address 192.168.1.10:15002, got %s", 
addresses[0]) - } - if addresses[1] != "10.1.2.3:15002" { - t.Fatalf("expected external EC2 address 10.1.2.3:15002, got %s", addresses[1]) - } + require.NoError(t, err, "ResolveP2PAnnounceAddresses should not fail") + require.Len(t, addresses, 2, "expected two announce addresses for mixed mode") + require.Equal(t, "192.168.1.10:15002", addresses[0], "unexpected local host announce address") + require.Equal(t, "10.1.2.3:15002", addresses[1], "unexpected external EC2 announce address") } func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { @@ -62,23 +51,62 @@ func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") addresses, err := ResolveP2PAnnounceAddresses("remote", true, 16001) - if err != nil { - t.Fatalf("ResolveP2PAnnounceAddresses returned error: %v", err) - } - if len(addresses) != 1 { - t.Fatalf("expected one announce address for remote node, got %d (%v)", len(addresses), addresses) - } - if addresses[0] != "10.1.2.3:16001" { - t.Fatalf("expected external EC2 address 10.1.2.3:16001, got %s", addresses[0]) - } + require.NoError(t, err, "ResolveP2PAnnounceAddresses should not fail") + require.Len(t, addresses, 1, "expected one announce address for remote node") + require.Equal(t, "10.1.2.3:16001", addresses[0], "unexpected external EC2 announce address") } func TestResolveBootstrapPeerURL_RemoteCallerToLocalBootstrap_UsesBridgedHost(t *testing.T) { peerURL, err := ResolveBootstrapPeerURL("remote", "local", "p2p_testPeer", "bootstrap-gateway-node0", 5001) - if err != nil { - t.Fatalf("ResolveBootstrapPeerURL returned error: %v", err) + require.NoError(t, err, "ResolveBootstrapPeerURL should not fail") + require.Equal(t, "testPeer@host.docker.internal:5001", peerURL, "unexpected bridged bootstrap peer URL") +} + +func TestResolveBootstrapAddress_Matrix(t *testing.T) { + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + tests := []struct { + name string + 
callerTarget string + bootstrapTarget string + wantAddress string + }{ + { + name: "local to local uses internal", + callerTarget: "local", + bootstrapTarget: "local", + wantAddress: "bootstrap-gateway-node0:5001", + }, + { + name: "local to remote uses external ec2", + callerTarget: "local", + bootstrapTarget: "remote", + wantAddress: "203.0.113.10:5001", + }, + { + name: "remote to local uses bridged host", + callerTarget: "remote", + bootstrapTarget: "local", + wantAddress: "host.docker.internal:5001", + }, + { + name: "remote to remote uses internal", + callerTarget: "remote", + bootstrapTarget: "remote", + wantAddress: "bootstrap-gateway-node0:5001", + }, } - if peerURL != "testPeer@host.docker.internal:5001" { - t.Fatalf("expected bridged bootstrap peer URL, got %s", peerURL) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + address, err := ResolveBootstrapAddress(tt.callerTarget, tt.bootstrapTarget, "bootstrap-gateway-node0", 5001) + require.NoError(t, err, "ResolveBootstrapAddress should not fail") + require.Equalf(t, tt.wantAddress, address, "expected ResolveBootstrapAddress() for %s", tt.name) + }) } } + +func TestResolveBootstrapPeerURL_RejectsEmptyPeerID(t *testing.T) { + _, err := ResolveBootstrapPeerURL("local", "local", "", "bootstrap-gateway-node0", 5001) + require.Error(t, err, "expected empty peer id to fail") +} diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go new file mode 100644 index 00000000000..47189efd86a --- /dev/null +++ b/system-tests/lib/cre/don/config/config_test.go @@ -0,0 +1,94 @@ +package config + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + "github.com/smartcontractkit/chainlink/system-tests/lib/infra" +) + +func TestResolveGatewayConnectorURL_PlacementMatrix(t *testing.T) { + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + tests := []struct { + name string + callerPlacement string + targetPlacement string + wantURL string + }{ + { + name: "local caller local target uses internal", + callerPlacement: "local", + targetPlacement: "local", + wantURL: "ws://bootstrap-gateway-node0:5003/node", + }, + { + name: "local caller remote target uses external ec2", + callerPlacement: "local", + targetPlacement: "remote", + wantURL: "ws://203.0.113.10:5003/node", + }, + { + name: "remote caller local target uses docker host external", + callerPlacement: "remote", + targetPlacement: "local", + wantURL: "ws://" + strings.TrimPrefix(framework.HostDockerInternal(), "http://") + ":5003/node", + }, + { + name: "remote caller remote target uses internal", + callerPlacement: "remote", + targetPlacement: "remote", + wantURL: "ws://bootstrap-gateway-node0:5003/node", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + topology, gateway := mustBuildGatewayTopology(t, tt.targetPlacement) + + gotURL, err := resolveGatewayConnectorURL(tt.callerPlacement, topology, gateway) + require.NoError(t, err, "resolveGatewayConnectorURL should not fail") + require.Equal(t, tt.wantURL, gotURL, "unexpected gateway connector URL") + }) + } +} + +func mustBuildGatewayTopology(t *testing.T, targetPlacement string) (*cre.Topology, *cre.DonGatewayConfiguration) { + t.Helper() + + provider := infra.Provider{Type: infra.Docker} + nodeSet := &cre.NodeSet{ + Input: &ns.Input{Name: "workflow"}, + NodeSpecs: []*cre.NodeSpecWithRole{ + { + Input: &clnode.Input{Node: &clnode.NodeInput{}}, + Roles: []cre.NodeType{cre.BootstrapNode}, + }, + }, + Placement: targetPlacement, + } + 
donMetadata, err := cre.NewDonMetadata(nodeSet, 1, provider, nil) + require.NoError(t, err, "failed to build DonMetadata") + donsMetadata, err := cre.NewDonsMetadata([]*cre.DonMetadata{donMetadata}, provider) + require.NoError(t, err, "failed to build DonsMetadata") + + gateway := &cre.DonGatewayConfiguration{ + GatewayConfiguration: &cre.GatewayConfiguration{ + NodeUUID: donMetadata.NodesMetadata[0].UUID, + Outgoing: cre.Outgoing{ + Host: "bootstrap-gateway-node0", + Port: 5003, + Path: "/node", + }, + AuthGatewayID: "gateway-node-0", + }, + } + + return &cre.Topology{DonsMetadata: donsMetadata}, gateway +} diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/agent/deploy.go index 07857b1ea1e..743b6e53f8b 100644 --- a/system-tests/lib/cre/environment/agent/deploy.go +++ b/system-tests/lib/cre/environment/agent/deploy.go @@ -14,6 +14,12 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) +var ( + newJDWithContext = jd.NewWithContext + newSharedDBNodeSetWithContext = ns.NewSharedDBNodeSetWithContext + ensureJDImagePresentFn = ensureJDImagePresent +) + type OutputDeployer interface { DeployOutput(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) } @@ -57,7 +63,7 @@ func DeployJDComponent(ctx context.Context, input *jd.Input) (*jd.Output, error) if input == nil { return nil, pkgerrors.New("jd input is nil") } - if err := ensureJDImagePresent(ctx, input.Image); err != nil { + if err := ensureJDImagePresentFn(ctx, input.Image); err != nil { return nil, err } @@ -65,7 +71,7 @@ func DeployJDComponent(ctx context.Context, input *jd.Input) (*jd.Output, error) if err != nil { return nil, err } - output, err := jd.NewWithContext(ctx, effectiveInput) + output, err := newJDWithContext(ctx, effectiveInput) if err != nil { return nil, pkgerrors.Wrap(err, "failed to deploy jd component") } @@ -80,7 +86,7 @@ func DeployNodeSetComponent(ctx context.Context, input 
*ns.Input, registryChain return nil, pkgerrors.New("registry blockchain output is nil") } inputCopy := *input - output, err := ns.NewSharedDBNodeSetWithContext(ctx, &inputCopy, registryChain) + output, err := newSharedDBNodeSetWithContext(ctx, &inputCopy, registryChain) if err != nil { return nil, pkgerrors.Wrapf(err, "failed to deploy nodeset %s", inputCopy.Name) } diff --git a/system-tests/lib/cre/environment/agent/deploy_test.go b/system-tests/lib/cre/environment/agent/deploy_test.go new file mode 100644 index 00000000000..36d1599a19f --- /dev/null +++ b/system-tests/lib/cre/environment/agent/deploy_test.go @@ -0,0 +1,236 @@ +package agent + +import ( + "context" + "errors" + "testing" + + cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" + "github.com/stretchr/testify/require" +) + +type fakeBlockchain struct { + out *blockchain.Output +} + +func (f *fakeBlockchain) ChainSelector() uint64 { return 0 } +func (f *fakeBlockchain) ChainID() uint64 { return 0 } +func (f *fakeBlockchain) ChainFamily() string { return "" } +func (f *fakeBlockchain) IsFamily(string) bool { return false } +func (f *fakeBlockchain) Fund(context.Context, string, uint64) error { return nil } +func (f *fakeBlockchain) CtfOutput() *blockchain.Output { return f.out } +func (f *fakeBlockchain) ToCldfChain() (cldf_chain.BlockChain, error) { return nil, nil } + +type outputPreferringDeployer struct { + deployCalls int + deployOutputCalls int +} + +func (d *outputPreferringDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { + d.deployCalls++ + return &fakeBlockchain{out: 
&blockchain.Output{ChainID: "fallback"}}, nil +} + +func (d *outputPreferringDeployer) DeployOutput(context.Context, *blockchain.Input) (*blockchain.Output, error) { + d.deployOutputCalls++ + return &blockchain.Output{ChainID: "1337", Type: blockchain.TypeAnvil}, nil +} + +type fallbackOnlyDeployer struct { + deployCalls int +} + +func (d *fallbackOnlyDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { + d.deployCalls++ + return &fakeBlockchain{ + out: &blockchain.Output{ + ChainID: "2337", + Type: blockchain.TypeAnvil, + }, + }, nil +} + +func TestBuildRemoteJDInputEnablesDNSIsolationOverride(t *testing.T) { + original := &jd.Input{Image: "job-distributor:0.22.1", DisableDNSIsolation: false} + + effective, err := buildRemoteJDInput(original) + require.NoError(t, err) + require.NotSame(t, original, effective, "expected a defensive copy") + require.True(t, effective.DisableDNSIsolation, "remote agent input should force Docker DNS") + require.False(t, original.DisableDNSIsolation, "original input should remain unchanged") +} + +func TestDeployBlockchainComponentNilInputFails(t *testing.T) { + _, err := DeployBlockchainComponent(context.Background(), nil, nil) + require.Error(t, err) + require.Contains(t, err.Error(), "blockchain input is nil") +} + +func TestDeployBlockchainComponentNoDeployerFails(t *testing.T) { + _, err := DeployBlockchainComponent(context.Background(), map[blockchain.ChainFamily]blockchains.Deployer{}, &blockchain.Input{Type: blockchain.TypeAnvil}) + require.Error(t, err) + require.Contains(t, err.Error(), "no deployer found") +} + +func TestDeployBlockchainComponentPrefersOutputDeployer(t *testing.T) { + deployer := &outputPreferringDeployer{} + output, err := DeployBlockchainComponent( + context.Background(), + map[blockchain.ChainFamily]blockchains.Deployer{blockchain.FamilyEVM: deployer}, + &blockchain.Input{Type: blockchain.TypeAnvil}, + ) + require.NoError(t, err) + require.Equal(t, "1337", output.ChainID) 
+ require.Equal(t, 1, deployer.deployOutputCalls, "DeployOutput should be used when available") + require.Equal(t, 0, deployer.deployCalls, "Deploy fallback should not be called") +} + +func TestDeployBlockchainComponentFallsBackToDeploy(t *testing.T) { + deployer := &fallbackOnlyDeployer{} + output, err := DeployBlockchainComponent( + context.Background(), + map[blockchain.ChainFamily]blockchains.Deployer{blockchain.FamilyEVM: deployer}, + &blockchain.Input{Type: blockchain.TypeAnvil}, + ) + require.NoError(t, err) + require.Equal(t, "2337", output.ChainID) + require.Equal(t, 1, deployer.deployCalls, "Deploy should be called for non-output deployers") +} + +func TestDeployJDComponentNilInputFails(t *testing.T) { + _, err := DeployJDComponent(context.Background(), nil) + require.Error(t, err) + require.Contains(t, err.Error(), "jd input is nil") +} + +func TestDeployJDComponentSuccessUsesSeams(t *testing.T) { + prevEnsure := ensureJDImagePresentFn + prevNewJD := newJDWithContext + t.Cleanup(func() { + ensureJDImagePresentFn = prevEnsure + newJDWithContext = prevNewJD + }) + + imageChecked := "" + ensureJDImagePresentFn = func(_ context.Context, image string) error { + imageChecked = image + return nil + } + + var captured *jd.Input + expectedOutput := &jd.Output{} + newJDWithContext = func(_ context.Context, in *jd.Input) (*jd.Output, error) { + captured = in + return expectedOutput, nil + } + + out, err := DeployJDComponent(context.Background(), &jd.Input{ + Image: "job-distributor:0.22.1", + DisableDNSIsolation: false, + }) + require.NoError(t, err) + require.Same(t, expectedOutput, out) + require.Equal(t, "job-distributor:0.22.1", imageChecked) + require.NotNil(t, captured) + require.True(t, captured.DisableDNSIsolation, "remote JD deploy should force Docker DNS") +} + +func TestDeployJDComponentImageCheckFailureStopsEarly(t *testing.T) { + prevEnsure := ensureJDImagePresentFn + prevNewJD := newJDWithContext + t.Cleanup(func() { + ensureJDImagePresentFn = 
prevEnsure + newJDWithContext = prevNewJD + }) + + ensureJDImagePresentFn = func(context.Context, string) error { + return errors.New("image check failed") + } + + constructorCalled := false + newJDWithContext = func(context.Context, *jd.Input) (*jd.Output, error) { + constructorCalled = true + return &jd.Output{}, nil + } + + _, err := DeployJDComponent(context.Background(), &jd.Input{Image: "jd:latest"}) + require.Error(t, err) + require.Contains(t, err.Error(), "image check failed") + require.False(t, constructorCalled, "jd constructor should not be called when image check fails") +} + +func TestDeployJDComponentConstructorFailureIsWrapped(t *testing.T) { + prevEnsure := ensureJDImagePresentFn + prevNewJD := newJDWithContext + t.Cleanup(func() { + ensureJDImagePresentFn = prevEnsure + newJDWithContext = prevNewJD + }) + + ensureJDImagePresentFn = func(context.Context, string) error { return nil } + newJDWithContext = func(context.Context, *jd.Input) (*jd.Output, error) { + return nil, errors.New("constructor failed") + } + + _, err := DeployJDComponent(context.Background(), &jd.Input{Image: "jd:latest"}) + require.Error(t, err) + require.Contains(t, err.Error(), "failed to deploy jd component") +} + +func TestDeployNodeSetComponentNilInputsFail(t *testing.T) { + _, err := DeployNodeSetComponent(context.Background(), nil, &blockchain.Output{}) + require.Error(t, err) + require.Contains(t, err.Error(), "nodeset input is nil") + + _, err = DeployNodeSetComponent(context.Background(), &ns.Input{Name: "workflow"}, nil) + require.Error(t, err) + require.Contains(t, err.Error(), "registry blockchain output is nil") +} + +func TestDeployNodeSetComponentSuccessUsesSeam(t *testing.T) { + prevNewNodeSet := newSharedDBNodeSetWithContext + t.Cleanup(func() { + newSharedDBNodeSetWithContext = prevNewNodeSet + }) + + expected := &ns.Output{} + var capturedInput *ns.Input + var capturedRegistry *blockchain.Output + newSharedDBNodeSetWithContext = func(_ context.Context, in 
*ns.Input, registry *blockchain.Output) (*ns.Output, error) { + capturedInput = in + capturedRegistry = registry + return expected, nil + } + + registry := &blockchain.Output{ChainID: "1337"} + input := &ns.Input{Name: "workflow"} + out, err := DeployNodeSetComponent(context.Background(), input, registry) + require.NoError(t, err) + require.Same(t, expected, out) + require.NotNil(t, capturedInput) + require.Equal(t, "workflow", capturedInput.Name) + require.Same(t, registry, capturedRegistry) +} + +func TestDeployNodeSetComponentConstructorFailureIsWrapped(t *testing.T) { + prevNewNodeSet := newSharedDBNodeSetWithContext + t.Cleanup(func() { + newSharedDBNodeSetWithContext = prevNewNodeSet + }) + + newSharedDBNodeSetWithContext = func(context.Context, *ns.Input, *blockchain.Output) (*ns.Output, error) { + return nil, errors.New("nodeset constructor failed") + } + + _, err := DeployNodeSetComponent( + context.Background(), + &ns.Input{Name: "workflow"}, + &blockchain.Output{ChainID: "1337"}, + ) + require.Error(t, err) + require.Contains(t, err.Error(), "failed to deploy nodeset workflow") +} diff --git a/system-tests/lib/cre/environment/agent/relay_test.go b/system-tests/lib/cre/environment/agent/relay_test.go new file mode 100644 index 00000000000..766b993414f --- /dev/null +++ b/system-tests/lib/cre/environment/agent/relay_test.go @@ -0,0 +1,147 @@ +package agent + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "net/http/httptest" + "net/url" + "testing" + "time" + + "github.com/gorilla/websocket" + "github.com/rs/zerolog" + "github.com/stretchr/testify/require" +) + +func TestRelay_OpenConnectBridgeAndClose(t *testing.T) { + srv := NewServer(zerolog.Nop(), nil) + httpServer := httptest.NewServer(srv.Handler()) + defer httpServer.Close() + + openResp := mustOpenRelay(t, httpServer.URL, openRelayRequest{ + Name: "relay-critical-path", + RequestedPort: 0, + }) + require.NotEmpty(t, openResp.RelayID) + require.Greater(t, 
openResp.BoundPort, 0) + + wsConn := mustConnectRelayWS(t, httpServer.URL, openResp.RelayID) + defer wsConn.Close() + + tcpConn, err := net.Dial("tcp", fmt.Sprintf("127.0.0.1:%d", openResp.BoundPort)) + require.NoError(t, err, "tcp client should connect to opened relay port") + defer tcpConn.Close() + + _ = tcpConn.SetDeadline(time.Now().Add(3 * time.Second)) + _ = wsConn.SetReadDeadline(time.Now().Add(3 * time.Second)) + _ = wsConn.SetWriteDeadline(time.Now().Add(3 * time.Second)) + + _, err = tcpConn.Write([]byte("hello-from-tcp")) + require.NoError(t, err, "writing to relay tcp side should succeed") + + msgType, payload, err := wsConn.ReadMessage() + require.NoError(t, err, "relay should forward tcp payload to websocket") + require.Equal(t, websocket.BinaryMessage, msgType) + require.Equal(t, "hello-from-tcp", string(payload)) + + err = wsConn.WriteMessage(websocket.BinaryMessage, []byte("hello-from-ws")) + require.NoError(t, err, "writing to relay websocket side should succeed") + + buf := make([]byte, 64) + n, err := tcpConn.Read(buf) + require.NoError(t, err, "relay should forward websocket payload to tcp") + require.Equal(t, "hello-from-ws", string(buf[:n])) + + closeResult := mustCloseRelay(t, httpServer.URL, openResp.RelayID) + require.Equal(t, openResp.RelayID, closeResult["relayId"]) + require.Equal(t, true, closeResult["closed"]) + require.Equal(t, true, closeResult["found"]) +} + +func TestRelay_OpenIdempotentByRequestedPort(t *testing.T) { + srv := NewServer(zerolog.Nop(), nil) + httpServer := httptest.NewServer(srv.Handler()) + defer httpServer.Close() + + requestedPort := reserveFreePort(t) + + first := mustOpenRelay(t, httpServer.URL, openRelayRequest{ + Name: "relay-first", + RequestedPort: requestedPort, + }) + second := mustOpenRelay(t, httpServer.URL, openRelayRequest{ + Name: "relay-second", + RequestedPort: requestedPort, + }) + + require.Equal(t, first.RelayID, second.RelayID, "same requested port should reuse existing relay") + 
require.Equal(t, first.BoundPort, second.BoundPort) + + closeResult := mustCloseRelay(t, httpServer.URL, first.RelayID) + require.Equal(t, true, closeResult["closed"]) + require.Equal(t, true, closeResult["found"]) +} + +func TestRelay_ConnectMissingRelayIDFails(t *testing.T) { + srv := NewServer(zerolog.Nop(), nil) + req := httptest.NewRequest(http.MethodGet, "/v1/relay/connect", nil) + rr := httptest.NewRecorder() + + srv.Handler().ServeHTTP(rr, req) + require.Equal(t, http.StatusBadRequest, rr.Code) + require.Contains(t, rr.Body.String(), ErrCodeMissingComponentInput) +} + +func mustOpenRelay(t *testing.T, baseURL string, req openRelayRequest) openRelayResponse { + t.Helper() + body, err := json.Marshal(req) + require.NoError(t, err) + resp, err := http.Post(baseURL+"/v1/relay/open", "application/json", bytes.NewReader(body)) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + + var out openRelayResponse + require.NoError(t, json.NewDecoder(resp.Body).Decode(&out)) + return out +} + +func mustCloseRelay(t *testing.T, baseURL, relayID string) map[string]any { + t.Helper() + body, err := json.Marshal(closeRelayRequest{RelayID: relayID}) + require.NoError(t, err) + resp, err := http.Post(baseURL+"/v1/relay/close", "application/json", bytes.NewReader(body)) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + + raw, err := io.ReadAll(resp.Body) + require.NoError(t, err) + var out map[string]any + require.NoError(t, json.Unmarshal(raw, &out)) + return out +} + +func mustConnectRelayWS(t *testing.T, baseURL, relayID string) *websocket.Conn { + t.Helper() + parsed, err := url.Parse(baseURL) + require.NoError(t, err) + wsURL := fmt.Sprintf("ws://%s/v1/relay/connect?relayId=%s", parsed.Host, relayID) + conn, _, err := websocket.DefaultDialer.Dial(wsURL, nil) + require.NoError(t, err, "websocket bridge should connect") + return conn +} + +func reserveFreePort(t 
*testing.T) int { + t.Helper() + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer ln.Close() + addr, ok := ln.Addr().(*net.TCPAddr) + require.True(t, ok) + return addr.Port +} diff --git a/system-tests/lib/cre/environment/agent/server_handlers_test.go b/system-tests/lib/cre/environment/agent/server_handlers_test.go new file mode 100644 index 00000000000..9aec0574b68 --- /dev/null +++ b/system-tests/lib/cre/environment/agent/server_handlers_test.go @@ -0,0 +1,116 @@ +package agent + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/stretchr/testify/require" +) + +func TestHealthEndpointReturnsOK(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + req := httptest.NewRequest(http.MethodGet, "/v1/health", nil) + rr := httptest.NewRecorder() + + server.Handler().ServeHTTP(rr, req) + require.Equal(t, http.StatusOK, rr.Code) + require.Equal(t, "ok", rr.Body.String()) +} + +func TestListCTFResourcesMethodNotAllowed(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + req := httptest.NewRequest(http.MethodPost, "/v1/resources/ctf", nil) + rr := httptest.NewRecorder() + + server.Handler().ServeHTTP(rr, req) + require.Equal(t, http.StatusMethodNotAllowed, rr.Code) + require.Contains(t, rr.Body.String(), ErrCodeMethodNotAllowed) +} + +func TestDeployArtifactsValidationErrors(t *testing.T) { + tests := []struct { + name string + payload DeployArtifactsPayload + wantCode int + wantErrCode string + wantMsg string + }{ + { + name: "missing nodeset name", + payload: DeployArtifactsPayload{NodeSetName: "", TargetDir: "/tmp", Files: []DeployArtifactsFile{{Name: "a.txt", ContentBase64: base64.StdEncoding.EncodeToString([]byte("x"))}}}, + wantCode: 
http.StatusBadRequest, + wantErrCode: ErrCodeMissingComponentInput, + wantMsg: "nodeset name is required", + }, + { + name: "missing target dir", + payload: DeployArtifactsPayload{NodeSetName: "workflow", TargetDir: "", Files: []DeployArtifactsFile{{Name: "a.txt", ContentBase64: base64.StdEncoding.EncodeToString([]byte("x"))}}}, + wantCode: http.StatusBadRequest, + wantErrCode: ErrCodeMissingComponentInput, + wantMsg: "target dir is required", + }, + { + name: "no files", + payload: DeployArtifactsPayload{NodeSetName: "workflow", TargetDir: "/tmp"}, + wantCode: http.StatusBadRequest, + wantErrCode: ErrCodeMissingComponentInput, + wantMsg: "at least one artifact file is required", + }, + } + + server := NewServer(zerolog.Nop(), nil) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + envelope := StartComponentEnvelope{ + SchemaVersion: SchemaVersionV1, + Operation: OperationDeployArtifacts, + } + payloadRaw, err := json.Marshal(tt.payload) + require.NoError(t, err) + envelope.Payload = payloadRaw + + reqBody, err := json.Marshal(envelope) + require.NoError(t, err) + + req := httptest.NewRequest(http.MethodPost, "/v1/components/start", bytes.NewReader(reqBody)) + req.Header.Set("Content-Type", "application/json") + rr := httptest.NewRecorder() + + server.Handler().ServeHTTP(rr, req) + require.Equal(t, tt.wantCode, rr.Code) + require.Contains(t, rr.Body.String(), tt.wantErrCode) + require.Contains(t, rr.Body.String(), tt.wantMsg) + }) + } +} + +func TestComponentCacheKeyVariants(t *testing.T) { + key, err := componentCacheKey(StartComponentPayload{ + ComponentType: ComponentTypeJD, + JD: &jd.Input{Image: "job-distributor:0.22.1"}, + }) + require.NoError(t, err) + require.Contains(t, key, ComponentTypeJD) + + key, err = componentCacheKey(StartComponentPayload{ + ComponentType: ComponentTypeNodeSet, + NodeSet: &ns.Input{Name: "workflow"}, + }) + require.NoError(t, err) + require.Equal(t, "nodeset:workflow", key) + + _, err = 
componentCacheKey(StartComponentPayload{ComponentType: ComponentTypeNodeSet}) + require.Error(t, err) + require.Contains(t, err.Error(), "nodeset payload is required") + + _, err = componentCacheKey(StartComponentPayload{ComponentType: "unknown"}) + require.Error(t, err) + require.Contains(t, err.Error(), "unsupported component type") +} diff --git a/system-tests/lib/cre/environment/artifacts_remote_test.go b/system-tests/lib/cre/environment/artifacts_remote_test.go new file mode 100644 index 00000000000..7b1dfb2c9be --- /dev/null +++ b/system-tests/lib/cre/environment/artifacts_remote_test.go @@ -0,0 +1,86 @@ +package environment + +import ( + "context" + "encoding/base64" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + "github.com/stretchr/testify/require" + "github.com/rs/zerolog" +) + +func TestDeployArtifactsToRemoteNodeSetValidation(t *testing.T) { + err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "", "/tmp", nil) + require.Error(t, err) + require.Contains(t, err.Error(), "nodeset name is required") + + err = DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "", nil) + require.Error(t, err) + require.Contains(t, err.Error(), "container target dir is required") +} + +func TestDeployArtifactsToRemoteNodeSetNoFilesFails(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/v1/health" { + w.WriteHeader(http.StatusOK) + return + } + t.Fatalf("unexpected path %s", r.URL.Path) + })) + defer server.Close() + + t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{"", ""}) + 
require.Error(t, err) + require.Contains(t, err.Error(), "no artifact files to deploy") +} + +func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { + tmpDir := t.TempDir() + artifactPath := filepath.Join(tmpDir, "artifact.wasm") + require.NoError(t, os.WriteFile(artifactPath, []byte("artifact-content"), 0o600)) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/v1/health": + w.WriteHeader(http.StatusOK) + case "/v1/components/start": + var envelope agent.StartComponentEnvelope + require.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) + require.Equal(t, agent.OperationDeployArtifacts, envelope.Operation) + + var payload agent.DeployArtifactsPayload + require.NoError(t, json.Unmarshal(envelope.Payload, &payload)) + require.Equal(t, "workflow", payload.NodeSetName) + require.Equal(t, "/home/chainlink/workflows", payload.TargetDir) + require.Len(t, payload.Files, 1) + require.Equal(t, "artifact.wasm", payload.Files[0].Name) + raw, err := base64.StdEncoding.DecodeString(payload.Files[0].ContentBase64) + require.NoError(t, err) + require.Equal(t, "artifact-content", string(raw)) + + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ + ComponentType: componentTypeNodeSet, + AgentLogs: []string{"artifact deployed"}, + }) + default: + t.Fatalf("unexpected path %s", r.URL.Path) + } + })) + defer server.Close() + + t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{artifactPath}) + require.NoError(t, err) +} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index 42c85742f84..be94fb603b6 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -1,26 
+1,23 @@ package environment import ( - "strings" "testing" "github.com/rs/zerolog" + "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) func TestValidateRemoteBlockchainInput(t *testing.T) { - if err := validateRemoteBlockchainInput(nil); err == nil { - t.Fatalf("expected nil input to fail validation") - } + err := validateRemoteBlockchainInput(nil) + require.Error(t, err, "expected nil input to fail validation") - if err := validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeGeth}); err == nil { - t.Fatalf("expected non-anvil input to fail validation") - } + err = validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeGeth}) + require.Error(t, err, "expected non-anvil input to fail validation") - if err := validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeAnvil}); err != nil { - t.Fatalf("expected anvil input to pass validation, got %v", err) - } + err = validateRemoteBlockchainInput(&blockchain.Input{Type: blockchain.TypeAnvil}) + require.NoError(t, err, "expected anvil input to pass validation") } func TestNewRemoteComponentClientPrefersEC2(t *testing.T) { @@ -29,27 +26,15 @@ func TestNewRemoteComponentClientPrefersEC2(t *testing.T) { t.Setenv(envEC2AgentPort, "18080") runtime, err := resolveRemoteRuntime(zerolog.Nop()) - if err != nil { - t.Fatalf("expected remote runtime to resolve, got %v", err) - } + require.NoError(t, err, "expected remote runtime to resolve") client, err := newRemoteComponentClient(runtime) - if err != nil { - t.Fatalf("expected ec2-first client to be created, got %v", err) - } + require.NoError(t, err, "expected ec2-first client to be created") httpClient, ok := client.(*httpComponentClient) - if !ok { - t.Fatalf("expected httpComponentClient, got %T", client) - } - if !httpClient.checkHealth { - t.Fatalf("expected ec2 client to enable 
health checks") - } - if httpClient.maxAttempts != 3 { - t.Fatalf("expected ec2 client retries to be enabled") - } - if httpClient.baseURL != "http://203.0.113.10:18080" { - t.Fatalf("unexpected ec2 base url: %s", httpClient.baseURL) - } + require.True(t, ok, "expected httpComponentClient, got %T", client) + require.True(t, httpClient.checkHealth, "expected ec2 client to enable health checks") + require.Equal(t, 3, httpClient.maxAttempts, "expected ec2 client retries to be enabled") + require.Equal(t, "http://203.0.113.10:18080", httpClient.baseURL, "unexpected ec2 base url") } func TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testing.T) { @@ -59,9 +44,7 @@ func TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testi t.Setenv(envEC2AgentPort, "") _, err := resolveEC2AgentBaseURL(zerolog.Nop()) - if err == nil { - t.Fatalf("expected missing direct host resolution inputs to fail when %s is not set", envEC2AgentURL) - } + require.Error(t, err, "expected missing direct host resolution inputs to fail when %s is not set", envEC2AgentURL) } func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { @@ -70,12 +53,8 @@ func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { t.Setenv(envEC2AgentPort, "not-a-port") _, err := resolveEC2AgentBaseURL(zerolog.Nop()) - if err == nil { - t.Fatalf("expected invalid %s to fail", envEC2AgentPort) - } - if !strings.Contains(err.Error(), envEC2AgentPort) { - t.Fatalf("expected error to mention %s, got: %v", envEC2AgentPort, err) - } + require.Error(t, err, "expected invalid %s to fail", envEC2AgentPort) + require.Contains(t, err.Error(), envEC2AgentPort, "expected error to mention %s", envEC2AgentPort) } func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { @@ -84,12 +63,8 @@ func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { t.Setenv(envEC2AgentPort, "18080") baseURL, err := resolveEC2AgentBaseURL(zerolog.Nop()) - if err != nil { - t.Fatalf("expected direct mode 
url resolution to succeed, got %v", err) - } - if baseURL != "http://203.0.113.10:18080" { - t.Fatalf("unexpected direct mode base url: %s", baseURL) - } + require.NoError(t, err, "expected direct mode url resolution to succeed") + require.Equal(t, "http://203.0.113.10:18080", baseURL, "unexpected direct mode base url") } func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { @@ -97,49 +72,65 @@ func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { t.Setenv(runtimecfg.EnvEC2HostIP, "") t.Setenv(runtimecfg.EnvEC2InstanceID, "") - if _, err := resolveRemoteRuntime(zerolog.Nop()); err == nil { - t.Fatalf("expected runtime resolution without EC2 inputs to fail") - } + _, err := resolveRemoteRuntime(zerolog.Nop()) + require.Error(t, err, "expected runtime resolution without EC2 inputs to fail") } -func TestRewriteRemoteBlockchainOutputForLocalAccess(t *testing.T) { +func TestRewriteRemoteBlockchainOutputForLocalAccess_DirectMode(t *testing.T) { t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + tests := []struct { + name string + rewriteInternalForLocalNodes bool + }{ + {name: "remote only path keeps internal URLs", rewriteInternalForLocalNodes: false}, + {name: "mixed path still keeps internal URLs in direct mode", rewriteInternalForLocalNodes: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + out := &blockchain.Output{ + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: "http://anvil-1337:8545", + ExternalWSUrl: "ws://anvil-1337:8546", + InternalHTTPUrl: "http://anvil-1337:8545", + InternalWSUrl: "ws://anvil-1337:8546", + }, + }, + } + err := rewriteRemoteBlockchainOutputForLocalAccess(out, "203.0.113.10", tt.rewriteInternalForLocalNodes) + require.NoError(t, err, "expected rewrite helper to succeed") + + require.Equal(t, "http://203.0.113.10:8545", out.Nodes[0].ExternalHTTPUrl, "unexpected rewritten http url") + require.Equal(t, "ws://203.0.113.10:8546", out.Nodes[0].ExternalWSUrl, "unexpected rewritten ws 
url") + require.Equal(t, "http://anvil-1337:8545", out.Nodes[0].InternalHTTPUrl, "internal http url should remain unchanged in direct mode") + require.Equal(t, "ws://anvil-1337:8546", out.Nodes[0].InternalWSUrl, "internal ws url should remain unchanged in direct mode") + }) + } +} + +func TestRewriteRemoteBlockchainOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { + err := rewriteRemoteBlockchainOutputForLocalAccess(nil, "203.0.113.10", false) + require.NoError(t, err, "expected nil output rewrite to be a no-op") +} + +func TestRewriteRemoteBlockchainOutputForLocalAccess_InvalidExternalURL(t *testing.T) { out := &blockchain.Output{ Nodes: []*blockchain.Node{ { - ExternalHTTPUrl: "http://anvil-1337:8545", + ExternalHTTPUrl: "://bad-url", ExternalWSUrl: "ws://anvil-1337:8546", - InternalHTTPUrl: "http://anvil-1337:8545", - InternalWSUrl: "ws://anvil-1337:8546", }, }, } - if err := rewriteRemoteBlockchainOutputForLocalAccess( - out, - "203.0.113.10", - true, - ); err != nil { - t.Fatalf("expected rewrite helper to succeed: %v", err) - } - if out.Nodes[0].ExternalHTTPUrl != "http://203.0.113.10:8545" { - t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) - } - if out.Nodes[0].ExternalWSUrl != "ws://203.0.113.10:8546" { - t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) - } - if out.Nodes[0].InternalHTTPUrl != "http://anvil-1337:8545" { - t.Fatalf("expected internal http url unchanged in direct mode, got %s", out.Nodes[0].InternalHTTPUrl) - } - if out.Nodes[0].InternalWSUrl != "ws://anvil-1337:8546" { - t.Fatalf("expected internal ws url unchanged in direct mode, got %s", out.Nodes[0].InternalWSUrl) - } + err := rewriteRemoteBlockchainOutputForLocalAccess(out, "203.0.113.10", false) + require.Error(t, err, "expected invalid external URL to fail rewrite") + require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") } func TestRemoteAgentErrorFormatting(t *testing.T) { err := 
remoteAgentError("deployment_failed", "failed to deploy blockchain output") want := "remote agent error (deployment_failed): failed to deploy blockchain output" - if err == nil || err.Error() != want { - t.Fatalf("expected %q, got %v", want, err) - } + require.EqualError(t, err, want, "unexpected remote agent error formatting") } diff --git a/system-tests/lib/cre/environment/config/config_test.go b/system-tests/lib/cre/environment/config/config_test.go new file mode 100644 index 00000000000..e46abbbefed --- /dev/null +++ b/system-tests/lib/cre/environment/config/config_test.go @@ -0,0 +1,74 @@ +package config + +import ( + "testing" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/stretchr/testify/require" +) + +func TestBlockchainNormalizeAndValidate(t *testing.T) { + b := &Blockchain{Input: blockchain.Input{Type: blockchain.TypeAnvil}} + b.Normalize() + require.Equal(t, PlacementLocal, b.Placement) + require.Equal(t, RemoteStartPolicyReuseIfIdentical, b.RemoteStartPolicy) + require.NoError(t, b.Validate()) + + b = &Blockchain{Input: blockchain.Input{Type: blockchain.TypeAnvil}, Placement: ComponentPlacement("invalid")} + err := b.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "invalid blockchain placement") +} + +func TestJobDistributorNormalizeAndValidate(t *testing.T) { + j := &JobDistributor{Input: jd.Input{}} + j.Normalize() + require.Equal(t, PlacementLocal, j.Placement) + require.Equal(t, RemoteStartPolicyReuseIfIdentical, j.RemoteStartPolicy) + require.NoError(t, j.Validate()) + + j = &JobDistributor{Input: jd.Input{}, Placement: PlacementRemote, RemoteStartPolicy: RemoteStartPolicy("bad")} + err := j.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "invalid jd remote_start_policy") +} + +func 
TestNodeSetPlacementNormalizeAndValidate(t *testing.T) { + nodeSet := &cre.NodeSet{} + normalizeNodeSetPlacement(nodeSet) + require.Equal(t, string(PlacementLocal), nodeSet.Placement) + require.Equal(t, string(RemoteStartPolicyReuseIfIdentical), nodeSet.RemoteStartPolicy) + require.NoError(t, validateNodeSetPlacement(nodeSet)) + + nodeSet.Placement = "bad" + err := validateNodeSetPlacement(nodeSet) + require.Error(t, err) + require.Contains(t, err.Error(), "invalid nodeset placement") +} + +func TestResolveBlockchainInputs(t *testing.T) { + _, err := ResolveBlockchainInputs(nil) + require.Error(t, err) + require.Contains(t, err.Error(), "at least one blockchain") + + out, err := ResolveBlockchainInputs([]*Blockchain{ + {Input: blockchain.Input{Type: blockchain.TypeAnvil}, Placement: PlacementRemote}, + }) + require.NoError(t, err) + require.Len(t, out, 1) + require.Equal(t, blockchain.TypeAnvil, out[0].Type) +} + +func TestRemoveChainIDFromFlag(t *testing.T) { + require.Equal(t, "write-evm", removeChainIDFromFlag("write-evm-1337")) + require.Equal(t, "write-evm-mainnet", removeChainIDFromFlag("write-evm-mainnet")) + require.Equal(t, "cron", removeChainIDFromFlag("cron")) +} + +func TestNormalizeComponentPlacement(t *testing.T) { + require.Equal(t, PlacementLocal, normalizeComponentPlacement(ComponentPlacement(" LOCAL "))) + require.Equal(t, PlacementRemote, normalizeComponentPlacement(ComponentPlacement("REMOTE"))) + require.Equal(t, ComponentPlacement("weird"), normalizeComponentPlacement(ComponentPlacement("weird"))) +} diff --git a/system-tests/lib/cre/environment/dons_test.go b/system-tests/lib/cre/environment/dons_test.go index 026a36c3b1a..37e5ec68f83 100644 --- a/system-tests/lib/cre/environment/dons_test.go +++ b/system-tests/lib/cre/environment/dons_test.go @@ -1,12 +1,16 @@ package environment import ( + "net/url" "strings" "testing" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" + "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) func TestBuildRemoteNodeSetInputRequiresImageOrBuildFields(t *testing.T) { @@ -26,12 +30,8 @@ func TestBuildRemoteNodeSetInputRequiresImageOrBuildFields(t *testing.T) { } _, err := buildRemoteNodeSetInput(nodeSet) - if err == nil { - t.Fatal("expected missing image/build validation error") - } - if !strings.Contains(err.Error(), "must set node.image or docker build fields") { - t.Fatalf("expected image validation error, got: %v", err) - } + require.Error(t, err, "expected missing image/build validation error") + require.Contains(t, err.Error(), "must set node.image or docker build fields", "expected image validation error") } func TestBuildRemoteNodeSetInputRejectsImageAndBuildFieldsTogether(t *testing.T) { @@ -53,10 +53,197 @@ func TestBuildRemoteNodeSetInputRejectsImageAndBuildFieldsTogether(t *testing.T) } _, err := buildRemoteNodeSetInput(nodeSet) - if err == nil { - t.Fatal("expected image+build conflict validation error") + require.Error(t, err, "expected image+build conflict validation error") + require.Contains(t, err.Error(), "either node.image or docker build fields", "expected image/build conflict error") +} + +func TestRewriteRemoteNodeSetOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { + err := rewriteRemoteNodeSetOutputForLocalAccess(nil, 0, nil, nil, "203.0.113.10") + require.NoError(t, err, "expected local-only no-op rewrite to succeed") +} + +func TestRewriteRemoteNodeSetOutputForLocalAccess_RemoteRewritesGatewayIncomingHost(t *testing.T) { + topology, nodeSet := mustBuildRemoteGatewayTopology(t) + output := &simple_node_set.Output{} + + err := rewriteRemoteNodeSetOutputForLocalAccess(topology, 0, nodeSet, 
output, "203.0.113.10") + require.NoError(t, err, "expected remote rewrite to succeed") + + require.NotNil(t, topology.GatewayConnectors) + require.Len(t, topology.GatewayConnectors.Configurations, 1) + require.Equal( + t, + "203.0.113.10", + topology.GatewayConnectors.Configurations[0].Incoming.Host, + "expected remote nodeset rewrite to expose gateway incoming via EC2 host", + ) +} + +func TestRewriteRemoteNodeSetOutputForLocalAccess_InvalidNodeExternalURLFails(t *testing.T) { + topology, nodeSet := mustBuildRemoteGatewayTopology(t) + output := &simple_node_set.Output{ + CLNodes: []*clnode.Output{ + { + Node: &clnode.NodeOut{ + ExternalURL: "://bad-url", + }, + }, + }, + } + + err := rewriteRemoteNodeSetOutputForLocalAccess(topology, 0, nodeSet, output, "203.0.113.10") + require.Error(t, err, "expected invalid node external URL to fail rewrite") + require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") +} + +func TestParseCustomPortMapping(t *testing.T) { + t.Run("valid mapping", func(t *testing.T) { + hostPort, containerPort, err := parseCustomPortMapping("127.0.0.1:18080:8080") + require.NoError(t, err, "expected valid mapping to parse") + require.Equal(t, 18080, hostPort) + require.Equal(t, 8080, containerPort) + }) + + t.Run("missing separator", func(t *testing.T) { + _, _, err := parseCustomPortMapping("8080") + require.Error(t, err, "expected malformed mapping to fail") + require.Contains(t, err.Error(), "expected hostPort:containerPort") + }) + + t.Run("invalid host port", func(t *testing.T) { + _, _, err := parseCustomPortMapping("bad:8080") + require.Error(t, err, "expected invalid host port to fail") + require.Contains(t, err.Error(), "invalid host port") + }) + + t.Run("invalid container port", func(t *testing.T) { + _, _, err := parseCustomPortMapping("18080:bad") + require.Error(t, err, "expected invalid container port to fail") + require.Contains(t, err.Error(), "invalid container port") + }) +} + +func 
TestNodeSetResolveURLPort(t *testing.T) { + tests := []struct { + name string + rawURL string + wantPort int + wantError string + }{ + {name: "explicit port", rawURL: "http://node:1234", wantPort: 1234}, + {name: "http default", rawURL: "http://node", wantPort: 80}, + {name: "https default", rawURL: "https://node", wantPort: 443}, + {name: "unsupported scheme without port", rawURL: "tcp://node", wantError: "unsupported scheme"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + parsed, err := url.Parse(tt.rawURL) + require.NoError(t, err, "test setup should parse URL") + port, err := nodeSetResolveURLPort(parsed) + if tt.wantError != "" { + require.Error(t, err, "expected port resolution failure") + require.Contains(t, err.Error(), tt.wantError) + return + } + require.NoError(t, err, "expected port resolution success") + require.Equal(t, tt.wantPort, port) + }) + } +} + +func TestNodeSetEndpointFromURL(t *testing.T) { + ref, err := nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", "http://node-0:8081") + require.NoError(t, err, "expected endpoint ref to parse") + require.Equal(t, "nodeset:workflow", ref.ComponentID) + require.Equal(t, "node-0-api", ref.EndpointName) + require.Equal(t, "http", ref.Scheme) + require.Equal(t, "node-0", ref.Host) + require.Equal(t, 8081, ref.Port) + + ref, err = nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", " ") + require.NoError(t, err, "expected blank URL to be ignored") + require.Nil(t, ref, "expected nil endpoint for blank URL") + + _, err = nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", "http://") + require.Error(t, err, "expected empty hostname to fail") + require.Contains(t, err.Error(), "empty hostname") +} + +func TestGatewayLocalPortFromBindings(t *testing.T) { + bindings := []tunnel.TunnelBinding{ + {EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-custom-0-5002"}, LocalPort: 22002}, + {EndpointRef: tunnel.EndpointRef{EndpointName: "node-1-custom-0-5002"}, LocalPort: 
22012}, } - if !strings.Contains(err.Error(), "either node.image or docker build fields") { - t.Fatalf("expected image/build conflict error, got: %v", err) + + port, ok := gatewayLocalPortFromBindings(0, 5002, bindings) + require.True(t, ok, "expected matching binding") + require.Equal(t, 22002, port) + + _, ok = gatewayLocalPortFromBindings(0, 6000, bindings) + require.False(t, ok, "expected non-matching container port to return false") +} + +func TestRewriteGatewayIncomingForNodeSetBindings(t *testing.T) { + topology, nodeSet := mustBuildRemoteGatewayTopology(t) + nodeSet.NodeSpecs[0].Input.Node.CustomPorts = []string{"18080:5002"} + + bindings := []tunnel.TunnelBinding{ + {EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-custom-0-5002"}, LocalPort: 22002}, + } + + rewriteGatewayIncomingForNodeSetBindings(topology, 0, nodeSet, bindings) + cfg := topology.GatewayConnectors.Configurations[0].GatewayConfiguration + require.Equal(t, "127.0.0.1", cfg.Incoming.Host, "incoming host should be local during binding mode") + require.Equal(t, 22002, cfg.Incoming.ExternalPort, "incoming external port should be rewritten from binding") +} + +func TestRewriteCustomPortMappingHostPort(t *testing.T) { + rewritten := rewriteCustomPortMappingHostPort("127.0.0.1:18080:8080", 22080) + require.Equal(t, "127.0.0.1:22080:8080", rewritten) + + unchanged := rewriteCustomPortMappingHostPort("bad", 22080) + require.True(t, strings.EqualFold("bad", unchanged), "malformed mapping should remain unchanged") +} + +func mustBuildRemoteGatewayTopology(t *testing.T) (*cre.Topology, *cre.NodeSet) { + t.Helper() + + provider := infra.Provider{Type: infra.Docker} + nodeSet := &cre.NodeSet{ + Input: &simple_node_set.Input{Name: "workflow"}, + NodeSpecs: []*cre.NodeSpecWithRole{ + { + Input: &clnode.Input{Node: &clnode.NodeInput{}}, + Roles: []cre.NodeType{cre.BootstrapNode, cre.GatewayNode}, + }, + }, + Placement: "remote", + } + + donMetadata, err := cre.NewDonMetadata(nodeSet, 1, provider, nil) 
+ require.NoError(t, err, "failed to build DonMetadata") + donsMetadata, err := cre.NewDonsMetadata([]*cre.DonMetadata{donMetadata}, provider) + require.NoError(t, err, "failed to build DonsMetadata") + + gatewayNode, hasGateway := donMetadata.Gateway() + require.True(t, hasGateway, "expected gateway node in metadata") + + topology := &cre.Topology{ + DonsMetadata: donsMetadata, + GatewayConnectors: &cre.GatewayConnectors{ + Configurations: []*cre.DonGatewayConfiguration{ + { + GatewayConfiguration: &cre.GatewayConfiguration{ + NodeUUID: gatewayNode.UUID, + Incoming: cre.Incoming{ + Host: "bootstrap-gateway-node0", + ExternalPort: 5002, + }, + }, + }, + }, + }, } + return topology, nodeSet } diff --git a/system-tests/lib/cre/environment/environment_placement_test.go b/system-tests/lib/cre/environment/environment_placement_test.go index 0d16d589a47..0bdd9f111e8 100644 --- a/system-tests/lib/cre/environment/environment_placement_test.go +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -3,6 +3,9 @@ package environment import ( "testing" + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" ) @@ -13,10 +16,51 @@ func TestSummarizeNodeSetPlacement_AllowsMixedPlacements(t *testing.T) { } summary, err := summarizeNodeSetPlacement(nodeSets) - if err != nil { - t.Fatalf("summarizeNodeSetPlacement returned error: %v", err) + require.NoError(t, err, "summarizeNodeSetPlacement should succeed") + require.True(t, summary.HasLocalTargets, "expected local placement to be detected") + require.True(t, summary.HasRemoteTargets, "expected remote placement to be detected") +} + +func TestHasRemoteComponents(t *testing.T) { + tests := []struct { + name string + blockchains []*config.Blockchain + jd *config.JobDistributor + nodeSets []*cre.NodeSet + want bool + }{ + { + name: "none remote", + blockchains: []*config.Blockchain{ + 
{Placement: config.PlacementLocal}, + }, + jd: &config.JobDistributor{Placement: config.PlacementLocal}, + nodeSets: []*cre.NodeSet{{Placement: "local"}}, + want: false, + }, + { + name: "remote blockchain", + blockchains: []*config.Blockchain{ + {Placement: config.PlacementRemote}, + }, + want: true, + }, + { + name: "remote jd", + jd: &config.JobDistributor{Placement: config.PlacementRemote}, + want: true, + }, + { + name: "remote nodeset", + nodeSets: []*cre.NodeSet{{Placement: "remote"}}, + want: true, + }, } - if !summary.HasLocalTargets || !summary.HasRemoteTargets { - t.Fatalf("expected both local and remote placements, got %+v", summary) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := hasRemoteComponents(tt.blockchains, tt.jd, tt.nodeSets) + require.Equalf(t, tt.want, got, "expected hasRemoteComponents() to return %v", tt.want) + }) } } diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go index 943632ef930..47f858fd469 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -3,9 +3,16 @@ package environment import ( "testing" + "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ) +func TestRewriteRemoteJDOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { + var output *jd.Output + err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") + require.NoError(t, err, "expected nil output rewrite to be a no-op") +} + func TestRewriteJDForDirectAccessRewritesExternalEndpoints(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1:14231", @@ -13,34 +20,45 @@ func TestRewriteJDForDirectAccessRewritesExternalEndpoints(t *testing.T) { InternalWSRPCUrl: "job-distributor:8080", } - if err := rewriteJDForDirectAccess(output, "10.20.30.40"); err != nil { - t.Fatalf("rewriteJDForDirectAccess returned error: %v", err) - } - if 
output.ExternalGRPCUrl != "10.20.30.40:14231" { - t.Fatalf("expected external grpc url to be rewritten, got %s", output.ExternalGRPCUrl) - } - if output.ExternalWSRPCUrl != "10.20.30.40:9080" { - t.Fatalf("expected external wsrpc url to be rewritten, got %s", output.ExternalWSRPCUrl) - } - if output.InternalWSRPCUrl != "job-distributor:8080" { - t.Fatalf("expected internal wsrpc url to remain unchanged, got %s", output.InternalWSRPCUrl) - } + err := rewriteJDForDirectAccess(output, "10.20.30.40") + require.NoError(t, err, "rewriteJDForDirectAccess should succeed") + require.Equal(t, "10.20.30.40:14231", output.ExternalGRPCUrl, "external grpc url should be rewritten") + require.Equal(t, "10.20.30.40:9080", output.ExternalWSRPCUrl, "external wsrpc url should be rewritten") + require.Equal(t, "job-distributor:8080", output.InternalWSRPCUrl, "internal wsrpc url should remain unchanged") } -func TestRewriteJDForDirectAccessFallsBackToInternalWSRPCSource(t *testing.T) { +func TestRewriteRemoteJDOutputForLocalAccess_MixedFallsBackToInternalWSRPCSource(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1:14231", ExternalWSRPCUrl: "", InternalWSRPCUrl: "job-distributor:8080", } - if err := rewriteJDForDirectAccess(output, "10.20.30.40"); err != nil { - t.Fatalf("rewriteJDForDirectAccess returned error: %v", err) - } - if output.ExternalWSRPCUrl != "10.20.30.40:8080" { - t.Fatalf("expected external wsrpc url to be derived from internal source, got %s", output.ExternalWSRPCUrl) - } - if output.InternalWSRPCUrl != "job-distributor:8080" { - t.Fatalf("expected internal wsrpc url to remain unchanged, got %s", output.InternalWSRPCUrl) + err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") + require.NoError(t, err, "rewriteRemoteJDOutputForLocalAccess should succeed") + require.Equal(t, "10.20.30.40:8080", output.ExternalWSRPCUrl, "external wsrpc url should be derived from internal source") + require.Equal(t, "job-distributor:8080", 
output.InternalWSRPCUrl, "internal wsrpc url should remain unchanged") +} + +func TestRewriteRemoteJDOutputForLocalAccess_InvalidAddressFails(t *testing.T) { + output := &jd.Output{ + ExternalGRPCUrl: "127.0.0.1", + ExternalWSRPCUrl: "127.0.0.1:9080", } + + err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") + require.Error(t, err, "expected invalid host:port to fail rewrite") + require.Contains(t, err.Error(), "failed to parse host:port", "expected parse failure context") +} + +func TestRewriteAddressHost_UnsupportedURLWithoutPortFails(t *testing.T) { + _, err := rewriteAddressHost("http://job-distributor", "10.20.30.40") + require.Error(t, err, "expected address without port to fail") + require.Contains(t, err.Error(), "must include a port", "expected missing-port context") +} + +func TestRewriteAddressHost_EmptyInputNoop(t *testing.T) { + rewritten, err := rewriteAddressHost(" ", "10.20.30.40") + require.NoError(t, err, "expected empty input to be a no-op") + require.Equal(t, "", rewritten, "expected empty output for empty input") } diff --git a/system-tests/lib/cre/environment/remote_component_client_test.go b/system-tests/lib/cre/environment/remote_component_client_test.go new file mode 100644 index 00000000000..c2d33aa8766 --- /dev/null +++ b/system-tests/lib/cre/environment/remote_component_client_test.go @@ -0,0 +1,158 @@ +package environment + +import ( + "context" + "encoding/json" + "errors" + "net" + "net/http" + "net/http/httptest" + "testing" + + "github.com/rs/zerolog" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { + t.Setenv(envEC2AgentURL, "http://198.51.100.20:19090") + t.Setenv(runtimecfg.EnvEC2HostIP, "198.51.100.20") + t.Setenv(envEC2AgentPort, "19090") + + runtime, err := resolveRemoteRuntime(zerolog.Nop()) + 
require.NoError(t, err, "expected runtime resolution to succeed") + require.Equal(t, "http://198.51.100.20:19090", runtime.AgentBaseURL, "unexpected agent base url") + require.Equal(t, "198.51.100.20", runtime.EC2HostIP, "unexpected ec2 host ip") +} + +func TestResolveRemoteRuntimeRequiresHostResolution(t *testing.T) { + t.Setenv(envEC2AgentURL, "http://198.51.100.20:19090") + t.Setenv(runtimecfg.EnvEC2HostIP, "") + t.Setenv(runtimecfg.EnvEC2InstanceID, "") + + _, err := resolveRemoteRuntime(zerolog.Nop()) + require.Error(t, err, "expected runtime resolution without EC2 host inputs to fail") +} + +func TestNewRemoteComponentClientRequiresResolvedRuntime(t *testing.T) { + _, err := newRemoteComponentClient(nil) + require.Error(t, err, "expected nil runtime to fail") + + _, err = newRemoteComponentClient(&resolvedRemoteRuntime{}) + require.Error(t, err, "expected missing agent base URL to fail") +} + +func TestDescribeEC2AgentHealthFailureMentionsResolutionHints(t *testing.T) { + msg := describeEC2AgentHealthFailure("http://203.0.113.10:8080") + require.Contains(t, msg, "/v1/health") + require.Contains(t, msg, envEC2AgentPort) + require.Contains(t, msg, envEC2AgentURL) +} + +func TestIsRetriableStatus(t *testing.T) { + require.True(t, isRetriableStatus(502)) + require.True(t, isRetriableStatus(503)) + require.True(t, isRetriableStatus(504)) + require.False(t, isRetriableStatus(500)) +} + +func TestIsRetriableNetworkError(t *testing.T) { + var netErr net.Error = timeoutError{} + require.True(t, isRetriableNetworkError(netErr), "expected net.Error to be retriable") + require.False(t, isRetriableNetworkError(errors.New("plain error")), "expected non-network error to be non-retriable") +} + +type timeoutError struct{} + +func (timeoutError) Error() string { return "timeout" } +func (timeoutError) Timeout() bool { return true } +func (timeoutError) Temporary() bool { return true } + +func TestStartComponentOnce_Success(t *testing.T) { + server := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ComponentType: componentTypeBlockchain}) + })) + defer server.Close() + + client := &httpComponentClient{baseURL: server.URL, client: server.Client()} + resp, err := client.startComponentOnce(context.Background(), agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: json.RawMessage(`{"componentType":"blockchain"}`), + }) + require.NoError(t, err) + require.Equal(t, componentTypeBlockchain, resp.ComponentType) +} + +func TestStartComponentOnce_Non2xxWithAgentErrorCode(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ + ErrorCode: "deployment_failed", + Error: "bad payload", + }) + })) + defer server.Close() + + client := &httpComponentClient{baseURL: server.URL, client: server.Client()} + _, err := client.startComponentOnce(context.Background(), agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: json.RawMessage(`{"componentType":"blockchain"}`), + }) + require.Error(t, err) + require.Contains(t, err.Error(), "remote agent error (deployment_failed)") +} + +func TestStartComponentOnce_Non2xxWithoutAgentPayload(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{}) + })) + defer server.Close() + + client := &httpComponentClient{baseURL: server.URL, client: server.Client()} + _, err := client.startComponentOnce(context.Background(), agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: 
json.RawMessage(`{"componentType":"blockchain"}`), + }) + require.Error(t, err) + require.Contains(t, err.Error(), "start component request failed with status") +} + +func TestStartComponentOnce_InvalidJSONResponseFails(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("{not-json")) + })) + defer server.Close() + + client := &httpComponentClient{baseURL: server.URL, client: server.Client()} + _, err := client.startComponentOnce(context.Background(), agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: json.RawMessage(`{"componentType":"blockchain"}`), + }) + require.Error(t, err) + require.Contains(t, err.Error(), "failed to decode start component response") +} + +func TestStartComponentOnce_200WithAgentErrorFails(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ + ErrorCode: "deployment_failed", + Error: "start failed", + }) + })) + defer server.Close() + + client := &httpComponentClient{baseURL: server.URL, client: server.Client()} + _, err := client.startComponentOnce(context.Background(), agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: json.RawMessage(`{"componentType":"blockchain"}`), + }) + require.Error(t, err) + require.Contains(t, err.Error(), "remote agent error (deployment_failed)") +} diff --git a/system-tests/lib/cre/environment/remote_stop_test.go b/system-tests/lib/cre/environment/remote_stop_test.go new file mode 100644 index 00000000000..fbb449160c1 --- /dev/null +++ b/system-tests/lib/cre/environment/remote_stop_test.go @@ -0,0 +1,214 @@ +package environment + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/rs/zerolog" + 
"github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +type stubComponentClient struct { + resp *agent.StartComponentResponse + err error +} + +func (s *stubComponentClient) StartComponent(context.Context, agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { + if s.err != nil { + return nil, s.err + } + return s.resp, nil +} + +func TestCountRemoteStopTargets(t *testing.T) { + cfg := &config.Config{ + Blockchains: []*config.Blockchain{ + {Input: blockchain.Input{}, Placement: config.PlacementRemote}, + {Input: blockchain.Input{}, Placement: config.PlacementLocal}, + }, + NodeSets: []*cre.NodeSet{ + {Input: &ns.Input{Name: "remote-don"}, Placement: "remote"}, + {Input: &ns.Input{Name: "local-don"}, Placement: "local"}, + }, + JD: &config.JobDistributor{Placement: config.PlacementRemote}, + } + + require.Equal(t, 3, countRemoteStopTargets(cfg), "expected only remote-targeted components to be counted") +} + +func TestListRemoteCTFResources(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, "/v1/resources/ctf", r.URL.Path) + _, _ = w.Write([]byte(`{"containers":["c1","c2"],"volumes":["v1"]}`)) + })) + defer server.Close() + + containers, volumes, err := listRemoteCTFResources(context.Background(), server.URL) + require.NoError(t, err, "expected ctf resource listing to succeed") + require.Equal(t, []string{"c1", "c2"}, containers) + require.Equal(t, 
[]string{"v1"}, volumes) +} + +func TestListRemoteCTFResources_Non2xxFails(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte("upstream down")) + })) + defer server.Close() + + _, _, err := listRemoteCTFResources(context.Background(), server.URL) + require.Error(t, err, "expected non-2xx response to fail") + require.Contains(t, err.Error(), "ctf resource query failed", "expected status failure context") +} + +func TestStopRemoteComponents_SummaryAndResiduals(t *testing.T) { + server := newRemoteStopTestServer(t) + defer server.Close() + + t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + cfg := &config.Config{ + Blockchains: []*config.Blockchain{ + {Input: blockchain.Input{}, Placement: config.PlacementRemote}, + {Input: blockchain.Input{}, Placement: config.PlacementLocal}, + }, + NodeSets: []*cre.NodeSet{ + {Input: &ns.Input{Name: "capabilities"}, Placement: "remote"}, + {Input: &ns.Input{Name: "local-don"}, Placement: "local"}, + }, + JD: &config.JobDistributor{Placement: config.PlacementRemote}, + } + + summary, err := StopRemoteComponents(context.Background(), zerolog.Nop(), cfg) + require.NoError(t, err, "expected stop operation to succeed") + require.Equal(t, 3, summary.Requested, "expected remote blockchain + remote nodeset + remote jd") + require.Equal(t, 2, summary.Stopped, "expected blockchain and jd to be stopped") + require.Equal(t, 1, summary.Missing, "expected nodeset to be missing") + require.Equal(t, 0, summary.Failed, "expected no failed stop operations") + require.Equal(t, []string{"leftover-container"}, summary.ResidualContainers) + require.Equal(t, []string{"leftover-volume"}, summary.ResidualVolumes) + require.Empty(t, summary.ResidualQueryError, "expected residual query to succeed") +} + +func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing.T) { + 
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/v1/health": + w.WriteHeader(http.StatusOK) + case "/v1/components/start": + resp := agent.StartComponentResponse{ + ComponentType: componentTypeBlockchain, + Found: true, + Stopped: true, + } + require.NoError(t, json.NewEncoder(w).Encode(resp)) + case "/v1/resources/ctf": + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte("ctf listing down")) + default: + t.Fatalf("unexpected path %s", r.URL.Path) + } + })) + defer server.Close() + + t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + cfg := &config.Config{ + Blockchains: []*config.Blockchain{ + {Input: blockchain.Input{}, Placement: config.PlacementRemote}, + }, + } + + summary, err := StopRemoteComponents(context.Background(), zerolog.Nop(), cfg) + require.NoError(t, err, "stop should still succeed when residual listing fails") + require.Equal(t, 1, summary.Requested) + require.Equal(t, 1, summary.Stopped) + require.NotEmpty(t, summary.ResidualQueryError, "expected residual query failure to be surfaced") +} + +func TestStopRemoteComponent_UnexpectedComponentTypeFails(t *testing.T) { + client := &stubComponentClient{ + resp: &agent.StartComponentResponse{ + ComponentType: componentTypeJD, + }, + } + + _, err := stopRemoteComponent( + context.Background(), + zerolog.Nop(), + client, + agent.StartComponentPayload{ComponentType: componentTypeBlockchain}, + componentTypeBlockchain, + ) + require.Error(t, err, "expected mismatched component type to fail") + require.Contains(t, err.Error(), "unexpected component type") +} + +func TestStopRemoteComponent_ClientErrorIsWrapped(t *testing.T) { + client := &stubComponentClient{err: errors.New("network down")} + + _, err := stopRemoteComponent( + context.Background(), + zerolog.Nop(), + client, + agent.StartComponentPayload{ComponentType: componentTypeBlockchain}, + componentTypeBlockchain, + ) + 
require.Error(t, err, "expected client failure to be returned") + require.Contains(t, err.Error(), "failed to stop remote component type") +} + +func newRemoteStopTestServer(t *testing.T) *httptest.Server { + t.Helper() + + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/v1/health": + w.WriteHeader(http.StatusOK) + return + case "/v1/resources/ctf": + _, _ = w.Write([]byte(`{"containers":["leftover-container"],"volumes":["leftover-volume"]}`)) + return + case "/v1/components/start": + var envelope agent.StartComponentEnvelope + require.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) + require.Equal(t, agent.OperationStopComponent, envelope.Operation) + + var payload agent.StartComponentPayload + require.NoError(t, json.Unmarshal(envelope.Payload, &payload)) + + resp := agent.StartComponentResponse{ComponentType: payload.ComponentType} + switch payload.ComponentType { + case componentTypeBlockchain: + resp.Found = true + resp.Stopped = true + case componentTypeNodeSet: + resp.Found = false + resp.Stopped = false + case componentTypeJD: + resp.Found = true + resp.Stopped = true + default: + t.Fatalf("unexpected component type %q", payload.ComponentType) + } + require.NoError(t, json.NewEncoder(w).Encode(resp)) + return + default: + t.Fatalf("unexpected path %s", r.URL.Path) + } + })) +} diff --git a/system-tests/lib/cre/runtimecfg/access_mode_test.go b/system-tests/lib/cre/runtimecfg/access_mode_test.go new file mode 100644 index 00000000000..e317822c625 --- /dev/null +++ b/system-tests/lib/cre/runtimecfg/access_mode_test.go @@ -0,0 +1,78 @@ +package runtimecfg + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDirectHostIPUsesExplicitEnv(t *testing.T) { + t.Setenv(EnvEC2HostIP, "203.0.113.10") + t.Setenv(EnvEC2InstanceID, "") + + hostIP, err := DirectHostIP() + require.NoError(t, err) + require.Equal(t, "203.0.113.10", hostIP) +} + +func 
TestDirectHostIPRequiresInstanceWhenHostMissing(t *testing.T) { + t.Setenv(EnvEC2HostIP, "") + t.Setenv(EnvEC2InstanceID, "") + + _, err := DirectHostIP() + require.Error(t, err) + require.Contains(t, err.Error(), EnvEC2InstanceID) +} + +func TestLocalHostIPUsesExplicitEnv(t *testing.T) { + t.Setenv(EnvLocalHostIP, "192.168.1.11") + require.Equal(t, "192.168.1.11", LocalHostIP()) +} + +func TestResolveAWSCLIProfileSelectionOrder(t *testing.T) { + t.Setenv("AWS_ACCESS_KEY_ID", "key") + t.Setenv("AWS_SECRET_ACCESS_KEY", "secret") + t.Setenv(EnvAWSProfile, "profile-a") + profile, mode := ResolveAWSCLIProfileSelection() + require.Equal(t, "", profile) + require.Equal(t, "env-creds", mode) + + t.Setenv("AWS_ACCESS_KEY_ID", "") + t.Setenv("AWS_SECRET_ACCESS_KEY", "") + t.Setenv("AWS_WEB_IDENTITY_TOKEN_FILE", "/tmp/token") + t.Setenv("AWS_ROLE_ARN", "arn:aws:iam::123456789012:role/Role") + profile, mode = ResolveAWSCLIProfileSelection() + require.Equal(t, "", profile) + require.Equal(t, "web-identity", mode) + + t.Setenv("AWS_WEB_IDENTITY_TOKEN_FILE", "") + t.Setenv("AWS_ROLE_ARN", "") + t.Setenv(EnvAWSProfile, "profile-a") + t.Setenv("AWS_PROFILE", "profile-b") + t.Setenv("AWS_DEFAULT_PROFILE", "profile-c") + profile, mode = ResolveAWSCLIProfileSelection() + require.Equal(t, "profile-a", profile) + require.Equal(t, "profile:CRE_AWS_PROFILE", mode) + + t.Setenv(EnvAWSProfile, "") + profile, mode = ResolveAWSCLIProfileSelection() + require.Equal(t, "profile-b", profile) + require.Equal(t, "profile:AWS_PROFILE", mode) + + t.Setenv("AWS_PROFILE", "") + profile, mode = ResolveAWSCLIProfileSelection() + require.Equal(t, "profile-c", profile) + require.Equal(t, "profile:AWS_DEFAULT_PROFILE", mode) +} + +func TestAWSRegionResolution(t *testing.T) { + t.Setenv("AWS_REGION", "eu-central-1") + t.Setenv("AWS_DEFAULT_REGION", "us-east-1") + require.Equal(t, "eu-central-1", awsRegion()) + + t.Setenv("AWS_REGION", "") + require.Equal(t, "us-east-1", awsRegion()) + + 
t.Setenv("AWS_DEFAULT_REGION", "") + require.Equal(t, defaultEC2Region, awsRegion()) +} diff --git a/system-tests/lib/cre/workflow/deploy_artifacts_test.go b/system-tests/lib/cre/workflow/deploy_artifacts_test.go new file mode 100644 index 00000000000..07c5873dc0c --- /dev/null +++ b/system-tests/lib/cre/workflow/deploy_artifacts_test.go @@ -0,0 +1,71 @@ +package workflow + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDeployArtifactsRemoteValidation(t *testing.T) { + err := DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: ArtifactDeployModeRemote, + }) + require.Error(t, err) + require.Contains(t, err.Error(), "remote artifact deployer is required") + + err = DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: ArtifactDeployModeRemote, + RemoteDeployer: func(context.Context, string, string, []string) error { return nil }, + }) + require.Error(t, err) + require.Contains(t, err.Error(), "nodeset name is required") +} + +func TestDeployArtifactsRemoteCallsDeployer(t *testing.T) { + called := false + err := DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: ArtifactDeployModeRemote, + NodeSetName: "workflow", + Files: []string{"a.wasm"}, + RemoteDeployer: func(_ context.Context, nodeSetName, targetDir string, files []string) error { + called = true + require.Equal(t, "workflow", nodeSetName) + require.Equal(t, "/home/chainlink/workflows", targetDir) + require.Equal(t, []string{"a.wasm"}, files) + return nil + }, + ContainerTargetDir: "/home/chainlink/workflows", + }) + require.NoError(t, err) + require.True(t, called, "expected remote deployer to be called") +} + +func TestDeployArtifactsLocalValidation(t *testing.T) { + err := DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: ArtifactDeployModeLocal, + ContainerTargetDir: "/tmp", + ContainerNamePattern: "", + }) + require.Error(t, err) + require.Contains(t, err.Error(), "container name 
pattern is required") + + err = DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: "", + ContainerTargetDir: "/tmp", + }) + require.Error(t, err) + require.Contains(t, err.Error(), "container name pattern is required") +} + +func TestDeployArtifactsRemotePropagatesDeployerError(t *testing.T) { + err := DeployArtifacts(context.Background(), DeployArtifactsOptions{ + Mode: ArtifactDeployModeRemote, + NodeSetName: "workflow", + RemoteDeployer: func(context.Context, string, string, []string) error { + return errors.New("deploy failed") + }, + }) + require.EqualError(t, err, "deploy failed") +} From 58dca7c09bf8800e085149ed07c787149f6ca720 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 13:46:23 +0100 Subject: [PATCH 21/34] simplify and clean up --- .../environment/environment/remote_state.go | 48 +- .../environment/remote_state_test.go | 46 ++ system-tests/lib/cre/don/config/config.go | 37 +- .../lib/cre/environment/agent/deploy.go | 33 +- .../lib/cre/environment/agent/deploy_test.go | 60 +- .../lib/cre/environment/agent/server_test.go | 6 +- .../lib/cre/environment/artifacts_remote.go | 6 +- .../lib/cre/environment/blockchain_start.go | 128 ++-- .../cre/environment/blockchain_start_test.go | 54 +- .../environment/blockchains/blockchains.go | 75 +-- .../cre/environment/blockchains/evm/evm.go | 13 +- .../environment/blockchains/solana/solana.go | 32 +- .../cre/environment/blockchains/tron/tron.go | 27 +- system-tests/lib/cre/environment/dons.go | 594 +++++++----------- system-tests/lib/cre/environment/dons_test.go | 115 +--- .../lib/cre/environment/environment.go | 49 +- .../environment/environment_placement_test.go | 12 + system-tests/lib/cre/environment/jobs.go | 57 +- system-tests/lib/cre/environment/jobs_test.go | 16 +- .../environment/remote_component_client.go | 17 +- .../remote_component_client_test.go | 3 +- .../cre/environment/remote_component_start.go | 50 ++ .../lib/cre/environment/remote_stop.go | 10 +- 
system-tests/lib/cre/environment/state.go | 30 +- 24 files changed, 588 insertions(+), 930 deletions(-) create mode 100644 core/scripts/cre/environment/environment/remote_state_test.go create mode 100644 system-tests/lib/cre/environment/remote_component_start.go diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 1e0f83f9bf7..6a0f15c59c1 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -10,12 +10,15 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) const ( remoteStateDirname = "core/scripts/cre/environment/state_remote" remoteStateFilename = "remote_components.toml" remoteAgentFilename = "remote_agent.toml" + envEC2AgentURL = "CRE_EC2_AGENT_URL" + envEC2AgentPort = "CRE_EC2_AGENT_PORT" ) type remoteAgentState struct { @@ -70,6 +73,23 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) if cfg == nil { return fmt.Errorf("cannot store nil remote stop config") } + stopCfg := filteredRemoteStopConfig(cfg) + if err := stopCfg.Store(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { + return err + } + agentEnvelope := &remoteAgentStateEnvelope{Agent: captureRemoteAgentState()} + data, err := toml.Marshal(agentEnvelope) + if err != nil { + return err + } + path := remoteAgentFileAbsPath(relativePathToRepoRoot) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + return os.WriteFile(path, data, 0o600) +} + +func filteredRemoteStopConfig(cfg *envconfig.Config) *envconfig.Config { stopCfg := &envconfig.Config{ Blockchains: []*envconfig.Blockchain{}, NodeSets: []*cre.NodeSet{}, @@ -87,26 +107,16 @@ func storeRemoteStopState(relativePathToRepoRoot 
string, cfg *envconfig.Config) if cfg.JD != nil && cfg.JD.Placement == envconfig.PlacementRemote { stopCfg.JD = cfg.JD } - if err := stopCfg.Store(remoteStateFileAbsPath(relativePathToRepoRoot)); err != nil { - return err - } - agentEnvelope := &remoteAgentStateEnvelope{ - Agent: remoteAgentState{ - EC2URL: os.Getenv("CRE_EC2_AGENT_URL"), - EC2InstanceID: os.Getenv("CRE_EC2_INSTANCE_ID"), - EC2AgentPort: os.Getenv("CRE_EC2_AGENT_PORT"), - AWSProfile: firstNonEmpty(os.Getenv("CRE_AWS_PROFILE"), os.Getenv("AWS_PROFILE")), - }, - } - data, err := toml.Marshal(agentEnvelope) - if err != nil { - return err - } - path := remoteAgentFileAbsPath(relativePathToRepoRoot) - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return err + return stopCfg +} + +func captureRemoteAgentState() remoteAgentState { + return remoteAgentState{ + EC2URL: os.Getenv(envEC2AgentURL), + EC2InstanceID: os.Getenv(runtimecfg.EnvEC2InstanceID), + EC2AgentPort: os.Getenv(envEC2AgentPort), + AWSProfile: firstNonEmpty(os.Getenv(runtimecfg.EnvAWSProfile), os.Getenv("AWS_PROFILE")), } - return os.WriteFile(path, data, 0o600) } func firstNonEmpty(values ...string) string { diff --git a/core/scripts/cre/environment/environment/remote_state_test.go b/core/scripts/cre/environment/environment/remote_state_test.go new file mode 100644 index 00000000000..a0b293996e2 --- /dev/null +++ b/core/scripts/cre/environment/environment/remote_state_test.go @@ -0,0 +1,46 @@ +package environment + +import ( + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + "github.com/stretchr/testify/require" +) + +func TestFilteredRemoteStopConfigKeepsOnlyRemoteComponents(t *testing.T) { + cfg := &envconfig.Config{ + Blockchains: []*envconfig.Blockchain{ + {Placement: envconfig.PlacementLocal}, + {Placement: 
envconfig.PlacementRemote}, + }, + NodeSets: []*cre.NodeSet{ + {Placement: "local"}, + {Placement: "remote"}, + }, + JD: &envconfig.JobDistributor{Placement: envconfig.PlacementRemote}, + } + + filtered := filteredRemoteStopConfig(cfg) + require.Len(t, filtered.Blockchains, 1) + require.Equal(t, envconfig.PlacementRemote, filtered.Blockchains[0].Placement) + require.Len(t, filtered.NodeSets, 1) + require.Equal(t, "remote", filtered.NodeSets[0].Placement) + require.NotNil(t, filtered.JD) + require.Equal(t, envconfig.PlacementRemote, filtered.JD.Placement) +} + +func TestCaptureRemoteAgentStateReadsExpectedEnvVars(t *testing.T) { + t.Setenv(envEC2AgentURL, "http://203.0.113.10:8080") + t.Setenv(runtimecfg.EnvEC2InstanceID, "i-abc") + t.Setenv(envEC2AgentPort, "18080") + t.Setenv(runtimecfg.EnvAWSProfile, "cre-profile") + t.Setenv("AWS_PROFILE", "fallback-profile") + + state := captureRemoteAgentState() + require.Equal(t, "http://203.0.113.10:8080", state.EC2URL) + require.Equal(t, "i-abc", state.EC2InstanceID) + require.Equal(t, "18080", state.EC2AgentPort) + require.Equal(t, "cre-profile", state.AWSProfile) +} diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index ba3decf01b6..1e40366de9b 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -39,6 +39,7 @@ import ( crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creblockchains "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -50,7 +51,7 @@ func PrepareNodeTOMLs( topology *cre.Topology, creEnv *cre.Environment, nodeSets 
[]*cre.NodeSet, - blockchainPlacementBySelector map[uint64]string, + configuredBlockchains []*envconfig.Blockchain, capabilities []cre.InstallableCapability, // Deprecated, use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -77,6 +78,7 @@ func PrepareNodeTOMLs( for _, bc := range creEnv.Blockchains { chainPerSelector[bc.ChainSelector()] = bc } + blockchainPlacementBySelector := blockchainPlacementsBySelector(configuredBlockchains, creEnv.Blockchains) for i, donMetadata := range topology.DonsMetadata.List() { // make sure that either all or none of the node specs have config or secrets provided in the TOML config @@ -117,19 +119,19 @@ func PrepareNodeTOMLs( if configsFound == 0 { config, configErr := generateNodeTomlConfig( cre.GenerateConfigsInput{ - Datastore: creEnv.CldfEnvironment.DataStore, - ContractVersions: creEnv.ContractVersions, - DonMetadata: donMetadata, + Datastore: creEnv.CldfEnvironment.DataStore, + ContractVersions: creEnv.ContractVersions, + DonMetadata: donMetadata, Blockchains: chainPerSelector, BlockchainPlacementBySelector: blockchainPlacementBySelector, OCRBootstrapPlacement: ocrBootstrapPlacement, OCRBootstrapAnnouncePort: ocrBootstrapAnnouncePort, - Flags: donMetadata.Flags, - CapabilitiesPeeringData: capabilitiesPeeringData, - OCRPeeringData: ocrPeeringData, - RegistryChainSelector: creEnv.RegistryChainSelector, - Topology: topology, - Provider: creEnv.Provider, + Flags: donMetadata.Flags, + CapabilitiesPeeringData: capabilitiesPeeringData, + OCRPeeringData: ocrPeeringData, + RegistryChainSelector: creEnv.RegistryChainSelector, + Topology: topology, + Provider: creEnv.Provider, }, configFactoryFunctions, ) @@ -1036,6 +1038,21 @@ func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topolog return resolved.URL, nil } +func blockchainPlacementsBySelector(configured []*envconfig.Blockchain, deployed []creblockchains.Blockchain) 
map[uint64]string { + bySelector := make(map[uint64]string, len(deployed)) + for idx, blockchainCfg := range configured { + if blockchainCfg == nil { + continue + } + if idx >= len(deployed) || deployed[idx] == nil { + continue + } + selector := deployed[idx].ChainSelector() + bySelector[selector] = string(blockchainCfg.Placement) + } + return bySelector +} + func resolveNodePlacement(topology *cre.Topology, nodeUUID string) (connectivity.Placement, error) { if topology == nil { return "", fmt.Errorf("topology is nil") diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/agent/deploy.go index 743b6e53f8b..9cfeb5c6c4e 100644 --- a/system-tests/lib/cre/environment/agent/deploy.go +++ b/system-tests/lib/cre/environment/agent/deploy.go @@ -20,43 +20,12 @@ var ( ensureJDImagePresentFn = ensureJDImagePresent ) -type OutputDeployer interface { - DeployOutput(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) -} - func DeployBlockchainComponent( ctx context.Context, deployers map[blockchain.ChainFamily]blockchains.Deployer, input *blockchain.Input, ) (*blockchain.Output, error) { - if input == nil { - return nil, pkgerrors.New("blockchain input is nil") - } - - chainFamily, err := blockchain.TypeToFamily(input.Type) - if err != nil { - return nil, err - } - - deployer, ok := deployers[chainFamily] - if !ok { - return nil, fmt.Errorf("no deployer found for blockchain type %s", input.Type) - } - - if outputDeployer, ok := deployer.(OutputDeployer); ok { - deployedOutput, err := outputDeployer.DeployOutput(ctx, input) - if err != nil { - return nil, pkgerrors.Wrapf(err, "failed to deploy blockchain output of type %s", input.Type) - } - return deployedOutput, nil - } - - deployed, err := deployer.Deploy(ctx, input) - if err != nil { - return nil, pkgerrors.Wrapf(err, "failed to deploy blockchain of type %s", input.Type) - } - - return deployed.CtfOutput(), nil + return blockchains.StartChain(ctx, deployers, 
input) } func DeployJDComponent(ctx context.Context, input *jd.Input) (*jd.Output, error) { diff --git a/system-tests/lib/cre/environment/agent/deploy_test.go b/system-tests/lib/cre/environment/agent/deploy_test.go index 36d1599a19f..6429bf954a3 100644 --- a/system-tests/lib/cre/environment/agent/deploy_test.go +++ b/system-tests/lib/cre/environment/agent/deploy_test.go @@ -5,7 +5,6 @@ import ( "errors" "testing" - cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" @@ -13,47 +12,15 @@ import ( "github.com/stretchr/testify/require" ) -type fakeBlockchain struct { - out *blockchain.Output +type fakeStarterDeployer struct { + startCalls int } -func (f *fakeBlockchain) ChainSelector() uint64 { return 0 } -func (f *fakeBlockchain) ChainID() uint64 { return 0 } -func (f *fakeBlockchain) ChainFamily() string { return "" } -func (f *fakeBlockchain) IsFamily(string) bool { return false } -func (f *fakeBlockchain) Fund(context.Context, string, uint64) error { return nil } -func (f *fakeBlockchain) CtfOutput() *blockchain.Output { return f.out } -func (f *fakeBlockchain) ToCldfChain() (cldf_chain.BlockChain, error) { return nil, nil } - -type outputPreferringDeployer struct { - deployCalls int - deployOutputCalls int -} - -func (d *outputPreferringDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { - d.deployCalls++ - return &fakeBlockchain{out: &blockchain.Output{ChainID: "fallback"}}, nil -} - -func (d *outputPreferringDeployer) DeployOutput(context.Context, *blockchain.Input) (*blockchain.Output, error) { - d.deployOutputCalls++ +func (d *fakeStarterDeployer) Start(context.Context, *blockchain.Input) (*blockchain.Output, error) { + d.startCalls++ 
return &blockchain.Output{ChainID: "1337", Type: blockchain.TypeAnvil}, nil } -type fallbackOnlyDeployer struct { - deployCalls int -} - -func (d *fallbackOnlyDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { - d.deployCalls++ - return &fakeBlockchain{ - out: &blockchain.Output{ - ChainID: "2337", - Type: blockchain.TypeAnvil, - }, - }, nil -} - func TestBuildRemoteJDInputEnablesDNSIsolationOverride(t *testing.T) { original := &jd.Input{Image: "job-distributor:0.22.1", DisableDNSIsolation: false} @@ -76,8 +43,8 @@ func TestDeployBlockchainComponentNoDeployerFails(t *testing.T) { require.Contains(t, err.Error(), "no deployer found") } -func TestDeployBlockchainComponentPrefersOutputDeployer(t *testing.T) { - deployer := &outputPreferringDeployer{} +func TestDeployBlockchainComponentStartsBlockchain(t *testing.T) { + deployer := &fakeStarterDeployer{} output, err := DeployBlockchainComponent( context.Background(), map[blockchain.ChainFamily]blockchains.Deployer{blockchain.FamilyEVM: deployer}, @@ -85,20 +52,7 @@ func TestDeployBlockchainComponentPrefersOutputDeployer(t *testing.T) { ) require.NoError(t, err) require.Equal(t, "1337", output.ChainID) - require.Equal(t, 1, deployer.deployOutputCalls, "DeployOutput should be used when available") - require.Equal(t, 0, deployer.deployCalls, "Deploy fallback should not be called") -} - -func TestDeployBlockchainComponentFallsBackToDeploy(t *testing.T) { - deployer := &fallbackOnlyDeployer{} - output, err := DeployBlockchainComponent( - context.Background(), - map[blockchain.ChainFamily]blockchains.Deployer{blockchain.FamilyEVM: deployer}, - &blockchain.Input{Type: blockchain.TypeAnvil}, - ) - require.NoError(t, err) - require.Equal(t, "2337", output.ChainID) - require.Equal(t, 1, deployer.deployCalls, "Deploy should be called for non-output deployers") + require.Equal(t, 1, deployer.startCalls, "expected starter to be called once") } func TestDeployJDComponentNilInputFails(t *testing.T) 
{ diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/agent/server_test.go index 347b5204263..3ec3ea73d47 100644 --- a/system-tests/lib/cre/environment/agent/server_test.go +++ b/system-tests/lib/cre/environment/agent/server_test.go @@ -55,11 +55,7 @@ type fakeOutputDeployer struct { calls int } -func (f *fakeOutputDeployer) Deploy(context.Context, *blockchain.Input) (blockchains.Blockchain, error) { - return nil, nil -} - -func (f *fakeOutputDeployer) DeployOutput(context.Context, *blockchain.Input) (*blockchain.Output, error) { +func (f *fakeOutputDeployer) Start(context.Context, *blockchain.Input) (*blockchain.Output, error) { f.calls++ return &blockchain.Output{ Type: blockchain.TypeAnvil, diff --git a/system-tests/lib/cre/environment/artifacts_remote.go b/system-tests/lib/cre/environment/artifacts_remote.go index 1f343526649..14658d5bc96 100644 --- a/system-tests/lib/cre/environment/artifacts_remote.go +++ b/system-tests/lib/cre/environment/artifacts_remote.go @@ -32,10 +32,6 @@ func DeployArtifactsToRemoteNodeSet( if err != nil { return pkgerrors.Wrap(err, "failed to resolve remote runtime settings for artifact deploy") } - startClient, err := newRemoteComponentClient(remoteRuntime) - if err != nil { - return pkgerrors.Wrap(err, "failed to initialize remote component client for artifact deploy") - } payloadFiles := make([]agent.DeployArtifactsFile, 0, len(files)) for _, path := range files { @@ -64,7 +60,7 @@ func DeployArtifactsToRemoteNodeSet( return pkgerrors.Wrap(err, "failed to encode deploy artifacts payload") } - response, err := startClient.StartComponent(ctx, agent.StartComponentEnvelope{ + response, err := remoteRuntime.Client.StartComponent(ctx, agent.StartComponentEnvelope{ SchemaVersion: agent.SchemaVersionV1, Operation: agent.OperationDeployArtifacts, Payload: payloadBytes, diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go 
index a978bb71ec4..b54bb6dd62e 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -2,7 +2,6 @@ package environment import ( "context" - "encoding/json" "errors" "fmt" "net" @@ -16,22 +15,26 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/tron" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) -type startComponentEnvelope = agent.StartComponentEnvelope -type startComponentRequest = agent.StartComponentPayload - -func blockchainFromOutput(testLogger zerolog.Logger, output *blockchain.Output) (blockchains.Blockchain, error) { +func blockchainFromOutput(testLogger zerolog.Logger, input *blockchain.Input, output *blockchain.Output) (blockchains.Blockchain, error) { if output == nil { return nil, pkgerrors.New("blockchain output is nil") } - if output.Type != blockchain.TypeAnvil { - return nil, fmt.Errorf("remote blockchain reconstruction supports only %s in phase 2A, got %s", blockchain.TypeAnvil, output.Type) + switch output.Type { + case blockchain.TypeAnvil: + return evm.From(testLogger, output) + case blockchain.TypeTron: + return tron.From(testLogger, output) + case blockchain.TypeSolana: + return solana.From(input, output) + default: + return nil, fmt.Errorf("unsupported blockchain type for reconstruction: %s", output.Type) } - - return evm.FromOutput(testLogger, output) } func validateRemoteBlockchainInput(input *blockchain.Input) error { @@ -44,7 +47,7 @@ func validateRemoteBlockchainInput(input *blockchain.Input) error { return nil } -func 
startBlockchainsWithTargets( +func startBlockchains( ctx context.Context, testLogger zerolog.Logger, configuredBlockchains []*config.Blockchain, @@ -57,93 +60,48 @@ func startBlockchainsWithTargets( return nil, err } - localIdx := make([]int, 0, len(configuredBlockchains)) - localInputs := make([]*blockchain.Input, 0, len(configuredBlockchains)) - remoteIdx := make([]int, 0, len(configuredBlockchains)) - for idx, configuredBlockchain := range configuredBlockchains { - if configuredBlockchain.Placement == config.PlacementRemote { - remoteIdx = append(remoteIdx, idx) - continue - } - localIdx = append(localIdx, idx) - localInputs = append(localInputs, configuredBlockchain.InputRef()) - } - outputs := make([]blockchains.Blockchain, len(configuredBlockchains)) + for idx, configured := range configuredBlockchains { + input := blockchainInputs[idx] + var deployedOutput *blockchain.Output - if len(localInputs) > 0 { - for i, idx := range localIdx { - deployedOutput, err := agent.DeployBlockchainComponent(ctx, deployers, localInputs[i]) - if err != nil { - return nil, err - } - reconstructedBlockchain, err := blockchainFromOutput(testLogger, deployedOutput) - if err != nil { - return nil, err - } - outputs[idx] = reconstructedBlockchain - } - } - - if len(remoteIdx) > 0 { - if remoteRuntime == nil { - return nil, errors.New("remote runtime is required when starting remote blockchains") - } - startClient, err := newRemoteComponentClient(remoteRuntime) - if err != nil { - return nil, err - } - - for _, idx := range remoteIdx { - input := blockchainInputs[idx] - configured := configuredBlockchains[idx] + if configured.Placement == config.PlacementRemote { if err := validateRemoteBlockchainInput(input); err != nil { return nil, err } - payload := agent.StartComponentPayload{ ComponentType: componentTypeBlockchain, Blockchain: input, ReusePolicy: string(configured.RemoteStartPolicy), } - payloadBytes, err := json.Marshal(payload) - if err != nil { - return nil, 
pkgerrors.Wrap(err, "failed to encode blockchain payload") - } - - response, err := startClient.StartComponent(ctx, agent.StartComponentEnvelope{ - SchemaVersion: agent.SchemaVersionV1, - Operation: agent.OperationStartComponent, - Payload: payloadBytes, - }) + deployedOutput, err = startRemoteComponent[blockchain.Output]( + ctx, + testLogger, + remoteRuntime.Client, + payload, + componentTypeBlockchain, + ) if err != nil { return nil, err } - if response.ComponentType != componentTypeBlockchain { - return nil, fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) - } - for _, logLine := range response.AgentLogs { - pretty := prettifyAgentLogLine(logLine) - if pretty == "" { - continue - } - testLogger.Info().Msgf("[agent] %s", pretty) - } - blockchainOutput, err := agent.DecodeFromTransport[blockchain.Output](response.Output) - if err != nil { - return nil, pkgerrors.Wrap(err, "failed to decode blockchain transport payload") + if rewriteInternalForLocalNodes { + // direct mode keeps internal URLs unchanged } - - if err := rewriteRemoteBlockchainOutputForLocalAccess(blockchainOutput, remoteRuntime.EC2HostIP, rewriteInternalForLocalNodes); err != nil { + if err := rewriteRemoteBlockchainOutputForDirectAccess(deployedOutput, remoteRuntime.EC2HostIP); err != nil { return nil, err } - - reconstructedBlockchain, err := blockchainFromOutput(testLogger, blockchainOutput) + } else { + deployedOutput, err = blockchains.StartChain(ctx, deployers, input) if err != nil { return nil, err } - outputs[idx] = reconstructedBlockchain } + + reconstructedBlockchain, err := blockchainFromOutput(testLogger, input, deployedOutput) + if err != nil { + return nil, err + } + outputs[idx] = reconstructedBlockchain } cldfBlockchains := make([]cldf_chain.BlockChain, 0, len(outputs)) @@ -164,18 +122,6 @@ func startBlockchainsWithTargets( }, nil } -func rewriteRemoteBlockchainOutputForLocalAccess( - output *blockchain.Output, - ec2HostIP string, - 
rewriteInternalForLocalNodes bool, -) error { - _ = rewriteInternalForLocalNodes // direct mode keeps internal URLs unchanged - if output == nil { - return nil - } - return rewriteRemoteBlockchainOutputForDirectAccess(output, ec2HostIP) -} - func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, ec2HostIP string) error { if output == nil { return nil @@ -214,7 +160,3 @@ func rewriteURLHost(rawURL, host string) (string, error) { parsed.Host = host return parsed.String(), nil } - -func remoteAgentError(code, message string) error { - return fmt.Errorf("remote agent error (%s): %s", code, message) -} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index be94fb603b6..fb4a287814b 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -4,9 +4,9 @@ import ( "testing" "github.com/rs/zerolog" - "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + "github.com/stretchr/testify/require" ) func TestValidateRemoteBlockchainInput(t *testing.T) { @@ -76,45 +76,33 @@ func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { require.Error(t, err, "expected runtime resolution without EC2 inputs to fail") } -func TestRewriteRemoteBlockchainOutputForLocalAccess_DirectMode(t *testing.T) { +func TestRewriteRemoteBlockchainOutputForDirectAccess(t *testing.T) { t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - tests := []struct { - name string - rewriteInternalForLocalNodes bool - }{ - {name: "remote only path keeps internal URLs", rewriteInternalForLocalNodes: false}, - {name: "mixed path still keeps internal URLs in direct mode", rewriteInternalForLocalNodes: true}, + out := &blockchain.Output{ + Nodes: []*blockchain.Node{ + { + ExternalHTTPUrl: 
"http://anvil-1337:8545", + ExternalWSUrl: "ws://anvil-1337:8546", + InternalHTTPUrl: "http://anvil-1337:8545", + InternalWSUrl: "ws://anvil-1337:8546", + }, + }, } + err := rewriteRemoteBlockchainOutputForDirectAccess(out, "203.0.113.10") + require.NoError(t, err, "expected rewrite helper to succeed") - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - out := &blockchain.Output{ - Nodes: []*blockchain.Node{ - { - ExternalHTTPUrl: "http://anvil-1337:8545", - ExternalWSUrl: "ws://anvil-1337:8546", - InternalHTTPUrl: "http://anvil-1337:8545", - InternalWSUrl: "ws://anvil-1337:8546", - }, - }, - } - err := rewriteRemoteBlockchainOutputForLocalAccess(out, "203.0.113.10", tt.rewriteInternalForLocalNodes) - require.NoError(t, err, "expected rewrite helper to succeed") - - require.Equal(t, "http://203.0.113.10:8545", out.Nodes[0].ExternalHTTPUrl, "unexpected rewritten http url") - require.Equal(t, "ws://203.0.113.10:8546", out.Nodes[0].ExternalWSUrl, "unexpected rewritten ws url") - require.Equal(t, "http://anvil-1337:8545", out.Nodes[0].InternalHTTPUrl, "internal http url should remain unchanged in direct mode") - require.Equal(t, "ws://anvil-1337:8546", out.Nodes[0].InternalWSUrl, "internal ws url should remain unchanged in direct mode") - }) - } + require.Equal(t, "http://203.0.113.10:8545", out.Nodes[0].ExternalHTTPUrl, "unexpected rewritten http url") + require.Equal(t, "ws://203.0.113.10:8546", out.Nodes[0].ExternalWSUrl, "unexpected rewritten ws url") + require.Equal(t, "http://anvil-1337:8545", out.Nodes[0].InternalHTTPUrl, "internal http url should remain unchanged in direct mode") + require.Equal(t, "ws://anvil-1337:8546", out.Nodes[0].InternalWSUrl, "internal ws url should remain unchanged in direct mode") } -func TestRewriteRemoteBlockchainOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { - err := rewriteRemoteBlockchainOutputForLocalAccess(nil, "203.0.113.10", false) +func TestRewriteRemoteBlockchainOutputForDirectAccess_NilOutputNoop(t 
*testing.T) { + err := rewriteRemoteBlockchainOutputForDirectAccess(nil, "203.0.113.10") require.NoError(t, err, "expected nil output rewrite to be a no-op") } -func TestRewriteRemoteBlockchainOutputForLocalAccess_InvalidExternalURL(t *testing.T) { +func TestRewriteRemoteBlockchainOutputForDirectAccess_InvalidExternalURL(t *testing.T) { out := &blockchain.Output{ Nodes: []*blockchain.Node{ { @@ -124,7 +112,7 @@ func TestRewriteRemoteBlockchainOutputForLocalAccess_InvalidExternalURL(t *testi }, } - err := rewriteRemoteBlockchainOutputForLocalAccess(out, "203.0.113.10", false) + err := rewriteRemoteBlockchainOutputForDirectAccess(out, "203.0.113.10") require.Error(t, err, "expected invalid external URL to fail rewrite") require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") } diff --git a/system-tests/lib/cre/environment/blockchains/blockchains.go b/system-tests/lib/cre/environment/blockchains/blockchains.go index 68053603078..069c66f5fa6 100644 --- a/system-tests/lib/cre/environment/blockchains/blockchains.go +++ b/system-tests/lib/cre/environment/blockchains/blockchains.go @@ -5,11 +5,7 @@ import ( "fmt" pkgerrors "github.com/pkg/errors" - "github.com/rs/zerolog" - - "github.com/smartcontractkit/chainlink-common/pkg/logger" cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" - "github.com/smartcontractkit/chainlink/system-tests/lib/infra" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" ) @@ -27,7 +23,32 @@ type Blockchain interface { } type Deployer interface { - Deploy(ctx context.Context, input *blockchain.Input) (Blockchain, error) + Start(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) +} + +func StartChain( + ctx context.Context, + deployers map[blockchain.ChainFamily]Deployer, + input *blockchain.Input, +) (*blockchain.Output, error) { + if input == nil { + return nil, pkgerrors.New("blockchain input is nil") + } + + chainFamily, 
err := blockchain.TypeToFamily(input.Type) + if err != nil { + return nil, err + } + + deployer, ok := deployers[chainFamily] + if !ok { + return nil, fmt.Errorf("no deployer found for blockchain type %s", input.Type) + } + deployed, err := deployer.Start(ctx, input) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to deploy blockchain of type %s", input.Type) + } + return deployed, nil } type DeployedBlockchains struct { @@ -47,47 +68,3 @@ func ValidateKubernetesBlockchainOutput(input *blockchain.Input) error { } return nil } - -func Start( - ctx context.Context, - testLogger zerolog.Logger, - commonLogger logger.Logger, - inputs []*blockchain.Input, - deployers map[blockchain.ChainFamily]Deployer, -) (*DeployedBlockchains, error) { - outputs := make([]Blockchain, 0, len(inputs)) - - for _, input := range inputs { - chainFamily, chErr := blockchain.TypeToFamily(input.Type) - if chErr != nil { - return nil, chErr - } - - deployer, ok := deployers[chainFamily] - if !ok { - infra.PrintFailedContainerLogs(testLogger, 30) - return nil, fmt.Errorf("no deployer found for blockchain type %s", input.Type) - } - - deployedBlockchain, deployErr := deployer.Deploy(ctx, input) - if deployErr != nil { - return nil, pkgerrors.Wrapf(deployErr, "failed to deploy blockchain of type %s", input.Type) - } - - outputs = append(outputs, deployedBlockchain) - } - - cldfBlockchains := make([]cldf_chain.BlockChain, 0, len(outputs)) - for _, db := range outputs { - chain, chainErr := db.ToCldfChain() - if chainErr != nil { - return nil, pkgerrors.Wrap(chainErr, "failed to create cldf chain from blockchain") - } - cldfBlockchains = append(cldfBlockchains, chain) - } - - return &DeployedBlockchains{ - Outputs: outputs, - CldfBlockChains: cldf_chain.NewBlockChainsFromSlice(cldfBlockchains), - }, nil -} diff --git a/system-tests/lib/cre/environment/blockchains/evm/evm.go b/system-tests/lib/cre/environment/blockchains/evm/evm.go index 3820225ff48..8f4cf79d3c8 100644 --- 
a/system-tests/lib/cre/environment/blockchains/evm/evm.go +++ b/system-tests/lib/cre/environment/blockchains/evm/evm.go @@ -134,16 +134,7 @@ func (e *Blockchain) ToCldfChain() (cldf_chain.BlockChain, error) { return provider, nil } -func (e *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockchains.Blockchain, error) { - bcOut, err := e.DeployOutput(ctx, input) - if err != nil { - return nil, err - } - - return FromOutput(e.testLogger, bcOut) -} - -func (e *Deployer) DeployOutput(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) { +func (e *Deployer) Start(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) { var bcOut *blockchain.Output var err error @@ -173,7 +164,7 @@ func (e *Deployer) DeployOutput(ctx context.Context, input *blockchain.Input) (* return bcOut, nil } -func FromOutput(testLogger zerolog.Logger, out *blockchain.Output) (*Blockchain, error) { +func From(testLogger zerolog.Logger, out *blockchain.Output) (*Blockchain, error) { if out == nil { return nil, pkgerrors.New("blockchain output is nil") } diff --git a/system-tests/lib/cre/environment/blockchains/solana/solana.go b/system-tests/lib/cre/environment/blockchains/solana/solana.go index 6708d8aaa8c..46ea5a90e92 100644 --- a/system-tests/lib/cre/environment/blockchains/solana/solana.go +++ b/system-tests/lib/cre/environment/blockchains/solana/solana.go @@ -124,7 +124,7 @@ func (s *Blockchain) ToCldfChain() (cldf_chain.BlockChain, error) { }, nil } -func (s *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockchains.Blockchain, error) { +func (s *Deployer) Start(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) { var bcOut *blockchain.Output var err error @@ -149,9 +149,17 @@ func (s *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockch } } - sel, ok := chainselectors.SolanaChainIdToChainSelector()[input.ChainID] + return bcOut, nil +} + +func From(input *blockchain.Input, out 
*blockchain.Output) (*Blockchain, error) { + if out == nil { + return nil, pkgerrors.New("blockchain output is nil") + } + chainID := out.ChainID + sel, ok := chainselectors.SolanaChainIdToChainSelector()[chainID] if !ok { - return nil, fmt.Errorf("selector not found for solana chainID '%s'", input.ChainID) + return nil, fmt.Errorf("selector not found for solana chainID '%s'", chainID) } envp := os.Getenv("SOLANA_PRIVATE_KEY") @@ -160,19 +168,25 @@ func (s *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockch return nil, errors.New("failed to decode private key for solana") } - if err := cldf_solana_provider.WritePrivateKeyToPath(filepath.Join(input.ContractsDir, "deploy-keypair.json"), pk); err != nil { + contractsDir := "" + if input != nil { + contractsDir = input.ContractsDir + } + if strings.TrimSpace(contractsDir) == "" { + return nil, errors.New("solana contracts dir is required for reconstruction") + } + if err := cldf_solana_provider.WritePrivateKeyToPath(filepath.Join(contractsDir, "deploy-keypair.json"), pk); err != nil { return nil, pkgerrors.Wrap(err, "failed to save private key for solana") } - solClient := solrpc.New(bcOut.Nodes[0].ExternalHTTPUrl) - + solClient := solrpc.New(out.Nodes[0].ExternalHTTPUrl) return &Blockchain{ SolClient: solClient, - SolanaChainID: input.ChainID, + SolanaChainID: chainID, chainSelector: sel, PrivateKey: pk, - ArtifactsDir: input.ContractsDir, - ctfOutput: bcOut, + ArtifactsDir: contractsDir, + ctfOutput: out, }, nil } diff --git a/system-tests/lib/cre/environment/blockchains/tron/tron.go b/system-tests/lib/cre/environment/blockchains/tron/tron.go index 4cd852df5e2..6b936396b89 100644 --- a/system-tests/lib/cre/environment/blockchains/tron/tron.go +++ b/system-tests/lib/cre/environment/blockchains/tron/tron.go @@ -143,7 +143,7 @@ func (t *Blockchain) lazyInitTronChain() error { return nil } -func (t *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockchains.Blockchain, error) { +func 
(t *Deployer) Start(ctx context.Context, input *blockchain.Input) (*blockchain.Output, error) { var bcOut *blockchain.Output var err error @@ -167,28 +167,35 @@ func (t *Deployer) Deploy(ctx context.Context, input *blockchain.Input) (blockch } } - chainID, err := strconv.ParseUint(bcOut.ChainID, 10, 64) + return bcOut, nil +} + +func From(testLogger zerolog.Logger, out *blockchain.Output) (*Blockchain, error) { + if out == nil { + return nil, pkgerrors.New("blockchain output is nil") + } + chainID, err := strconv.ParseUint(out.ChainID, 10, 64) if err != nil { - return nil, pkgerrors.Wrapf(err, "failed to parse chain id %s", bcOut.ChainID) + return nil, pkgerrors.Wrapf(err, "failed to parse chain id %s", out.ChainID) } selector, err := chainselectors.SelectorFromChainId(chainID) if err != nil { - return nil, pkgerrors.Wrapf(err, "failed to get chain selector for chain id %s", bcOut.ChainID) + return nil, pkgerrors.Wrapf(err, "failed to get chain selector for chain id %s", out.ChainID) } // if jsonrpc is not present, add it - if !strings.HasSuffix(bcOut.Nodes[0].ExternalHTTPUrl, "/jsonrpc") { - bcOut.Nodes[0].ExternalHTTPUrl += "/jsonrpc" + if !strings.HasSuffix(out.Nodes[0].ExternalHTTPUrl, "/jsonrpc") { + out.Nodes[0].ExternalHTTPUrl += "/jsonrpc" } - if !strings.HasSuffix(bcOut.Nodes[0].InternalHTTPUrl, "/jsonrpc") { - bcOut.Nodes[0].InternalHTTPUrl += "/jsonrpc" + if !strings.HasSuffix(out.Nodes[0].InternalHTTPUrl, "/jsonrpc") { + out.Nodes[0].InternalHTTPUrl += "/jsonrpc" } return &Blockchain{ - testLogger: t.testLogger, + testLogger: testLogger, chainSelector: selector, chainID: chainID, - ctfOutput: bcOut, + ctfOutput: out, DeployerPrivateKey: blockchain.TRONAccounts.PrivateKeys[0], }, nil } diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index fae986a0370..6908e63b046 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -2,13 +2,9 @@ package environment import 
( "context" - "encoding/json" "errors" "fmt" - "net/url" - "strconv" "strings" - "sync" "time" pkgerrors "github.com/pkg/errors" @@ -27,7 +23,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -66,24 +61,70 @@ func StartDONs( nodeSets []*cre.NodeSet, remoteRuntime *resolvedRemoteRuntime, ) (*StartedDONs, error) { - if infraInput.IsKubernetes() { - // For Kubernetes, DONs are already running in the cluster, generate service URLs - lggr.Info().Msg("Generating Kubernetes service URLs for DONs (already running in cluster)") - for idx, nodeSet := range nodeSets { - donMetadata := topology.DonsMetadata.List()[idx] - - // Extract bootstrap flags for each node - nodeMetadataRoles := make([]bool, len(donMetadata.NodesMetadata)) - for i, nodeMeta := range donMetadata.NodesMetadata { - nodeMetadataRoles[i] = nodeMeta.HasRole(cre.BootstrapNode) - } + if err := verifyRemoteToLocalBootstrapReachability(ctx, lggr, topology); err != nil { + return nil, pkgerrors.Wrap(err, "bootstrap reachability sanity check failed") + } + + switch { + case infraInput.IsKubernetes(): + return startDONsKubernetes(ctx, lggr, topology, infraInput, nodeSets) + default: + return startDONsContainerized( + ctx, + lggr, + topology, + infraInput, + registryChainBlockchainOutput, + capabilityConfigs, + copyCapabilityBinaries, + nodeSets, + remoteRuntime, + ) + } +} - creds := infra.GetNodeCredentials(&infraInput) - nodeSet.Out = infra.GenerateKubernetesNodeSetOutput(&infraInput, nodeSet.Name, nodeSet.Nodes, nodeMetadataRoles, creds, lggr) +func startDONsKubernetes( + ctx 
context.Context, + lggr zerolog.Logger, + topology *cre.Topology, + infraInput infra.Provider, + nodeSets []*cre.NodeSet, +) (*StartedDONs, error) { + lggr.Info().Msg("Generating Kubernetes service URLs for DONs (already running in cluster)") + for idx, nodeSet := range nodeSets { + donMetadata := topology.DonsMetadata.List()[idx] + + // Extract bootstrap flags for each node. + nodeMetadataRoles := make([]bool, len(donMetadata.NodesMetadata)) + for i, nodeMeta := range donMetadata.NodesMetadata { + nodeMetadataRoles[i] = nodeMeta.HasRole(cre.BootstrapNode) } + + creds := infra.GetNodeCredentials(&infraInput) + nodeSet.Out = infra.GenerateKubernetesNodeSetOutput(&infraInput, nodeSet.Name, nodeSet.Nodes, nodeMetadataRoles, creds, lggr) } + if err := applyNodeSetEnvVars(topology, nodeSets); err != nil { + return nil, err + } + + return buildDONsConcurrently(ctx, lggr, false, nodeSets, func(configuredIndex int, configuredNodeSet *cre.NodeSet) (*StartedDON, error) { + lggr.Info().Msgf("Kubernetes mode: using existing DON named %s", configuredNodeSet.Name) + return buildStartedDON(ctx, topology, configuredIndex, configuredNodeSet, configuredNodeSet.Out) + }) +} - // Skip binary operations for Kubernetes (binaries are in the cluster images) and for remote DONs +func startDONsContainerized( + ctx context.Context, + lggr zerolog.Logger, + topology *cre.Topology, + infraInput infra.Provider, + registryChainBlockchainOutput *blockchain.Output, + capabilityConfigs cre.CapabilityConfigs, + copyCapabilityBinaries bool, + nodeSets []*cre.NodeSet, + remoteRuntime *resolvedRemoteRuntime, +) (*StartedDONs, error) { + // Skip binary operations for remote DONs. 
if infraInput.IsDocker() { for donIdx, donMetadata := range topology.DonsMetadata.List() { if !copyCapabilityBinaries { @@ -105,7 +146,6 @@ func StartDONs( return nil, pkgerrors.Wrap(executableErr, "failed to make binaries executable") } - var err error ns, err := crecapabilities.AppendBinariesPathsNodeSpec(nodeSets[donIdx], donMetadata, customBinariesPaths) if err != nil { return nil, pkgerrors.Wrapf(err, "failed to append binaries paths to node spec for DON %d", donMetadata.ID) @@ -113,7 +153,24 @@ func StartDONs( nodeSets[donIdx] = ns } } + if err := applyNodeSetEnvVars(topology, nodeSets); err != nil { + return nil, err + } + + return buildDONsConcurrently(ctx, lggr, true, nodeSets, func(configuredIndex int, configuredNodeSet *cre.NodeSet) (*StartedDON, error) { + return startDON( + ctx, + lggr, + topology, + configuredIndex, + configuredNodeSet, + registryChainBlockchainOutput, + remoteRuntime, + ) + }) +} +func applyNodeSetEnvVars(topology *cre.Topology, nodeSets []*cre.NodeSet) error { // Add env vars, which were provided programmatically, to the node specs // or fail, if node specs already had some env vars set in the TOML config for donIdx, donMetadata := range topology.DonsMetadata.List() { @@ -128,134 +185,162 @@ func StartDONs( } if hasEnvVarsInTomlConfig && len(nodeSets[donIdx].EnvVars) > 0 { - return nil, fmt.Errorf("extra env vars for Chainlink Nodes are provided in the TOML config for the %s DON, but you tried to provide them programatically. Please set them only in one place", donMetadata.Name) + return fmt.Errorf("extra env vars for Chainlink Nodes are provided in the TOML config for the %s DON, but you tried to provide them programatically. 
Please set them only in one place", donMetadata.Name) } } + return nil +} +func buildDONsConcurrently( + ctx context.Context, + lggr zerolog.Logger, + printFailedContainerLogs bool, + nodeSets []*cre.NodeSet, + startFn func(configuredIndex int, configuredNodeSet *cre.NodeSet) (*StartedDON, error), +) (*StartedDONs, error) { errGroup, _ := errgroup.WithContext(ctx) - var resultMap sync.Map - var startClient componentClient - if hasRemoteNodeSets(nodeSets) { - if remoteRuntime == nil { - return nil, errors.New("remote runtime is required when starting remote nodesets") - } - client, clientErr := newRemoteComponentClient(remoteRuntime) - if clientErr != nil { - return nil, clientErr - } - startClient = client - } + startedDONs := make(StartedDONs, len(nodeSets)) for idx, nodeSet := range nodeSets { + configuredIndex := idx + configuredNodeSet := nodeSet errGroup.Go(func() error { - startTime := time.Now() - lggr.Info().Msgf("Starting DON named %s", nodeSet.Name) - - var nodeset *ns.Output - var nodesetErr error - - // If output is already set (Kubernetes or cached), use it - if nodeSet.Out != nil { - lggr.Info().Msgf("Using pre-configured node URLs for DON %s", nodeSet.Name) - nodeset = nodeSet.Out - } else if strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { - registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) - if err != nil { - return pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") - } - remoteInput, err := buildRemoteNodeSetInput(nodeSet) - if err != nil { - return err - } - payload := startComponentRequest{ - ComponentType: componentTypeNodeSet, - NodeSet: remoteInput, - RegistryBlockchain: registryChainPayload, - ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), - } - payloadBytes, err := json.Marshal(payload) - if err != nil { - return pkgerrors.Wrap(err, "failed to encode nodeset payload") - } - response, err := startClient.StartComponent(ctx, 
startComponentEnvelope{ - SchemaVersion: agent.SchemaVersionV1, - Operation: agent.OperationStartComponent, - Payload: payloadBytes, - }) - if err != nil { - return err - } - if response.ComponentType != componentTypeNodeSet { - return fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) - } - for _, logLine := range response.AgentLogs { - pretty := prettifyAgentLogLine(logLine) - if pretty == "" { - continue - } - lggr.Info().Msgf("[agent] %s", pretty) - } - nodeset, err = agent.DecodeFromTransport[ns.Output](response.Output) - if err != nil { - return pkgerrors.Wrap(err, "failed to decode nodeset transport payload") - } - if err := rewriteRemoteNodeSetOutputForLocalAccess(topology, idx, nodeSet, nodeset, remoteRuntime.EC2HostIP); err != nil { - return err - } - } else { - // For Docker, start the nodes - nodeSet.Input.NodeSpecs = nodeSet.ExtractCTFInputs() - nodeset, nodesetErr = ns.NewSharedDBNodeSetWithContext(ctx, nodeSet.Input, registryChainBlockchainOutput) - if nodesetErr != nil { - return pkgerrors.Wrapf(nodesetErr, "failed to start nodeSet named %s", nodeSet.Name) - } - } - - // For Kubernetes, we still need to create clients to register nodes with JD - don, donErr := cre.NewDON(ctx, topology.DonsMetadata.List()[idx], nodeset.CLNodes) - if donErr != nil { - return pkgerrors.Wrapf(donErr, "failed to create DON from node set named %s", nodeSet.Name) + startedDON, startErr := startFn(configuredIndex, configuredNodeSet) + if startErr != nil { + return startErr } - - resultMap.Store(idx, &StartedDON{ - NodeSetOutput: &cre.NodeSetOutput{ - Output: nodeset, - NodeSetName: nodeSet.Name, - Capabilities: nodeSet.Capabilities, - }, - DON: don, - }) - - lggr.Info().Msgf("DON %s started in %.2f seconds", nodeSet.Name, time.Since(startTime).Seconds()) - + startedDONs[configuredIndex] = startedDON return nil }) } if err := errGroup.Wait(); err != nil { - if !infraInput.IsKubernetes() { + if printFailedContainerLogs { 
infra.PrintFailedContainerLogs(lggr, 30) } return nil, err } - startedDONs := make(StartedDONs, len(nodeSets)) - resultMap.Range(func(key, value any) bool { - // key is index in the original slice - startedDONs[key.(int)] = value.(*StartedDON) - return true - }) - return &startedDONs, nil } -func hasRemoteNodeSets(nodeSets []*cre.NodeSet) bool { - for _, nodeSet := range nodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { - return true +func startDON( + ctx context.Context, + lggr zerolog.Logger, + topology *cre.Topology, + configuredIndex int, + nodeSet *cre.NodeSet, + registryChainBlockchainOutput *blockchain.Output, + remoteRuntime *resolvedRemoteRuntime, +) (*StartedDON, error) { + if nodeSet == nil { + return nil, errors.New("nodeSet is nil") + } + startTime := time.Now() + lggr.Info().Msgf("Starting DON named %s", nodeSet.Name) + + nodeset, err := startNodeSet(ctx, lggr, topology, configuredIndex, nodeSet, registryChainBlockchainOutput, remoteRuntime) + if err != nil { + return nil, err + } + + startedDON, buildErr := buildStartedDON(ctx, topology, configuredIndex, nodeSet, nodeset) + if buildErr != nil { + return nil, buildErr + } + + lggr.Info().Msgf("DON %s started in %.2f seconds", nodeSet.Name, time.Since(startTime).Seconds()) + return startedDON, nil +} + +func buildStartedDON( + ctx context.Context, + topology *cre.Topology, + configuredIndex int, + nodeSet *cre.NodeSet, + nodeset *ns.Output, +) (*StartedDON, error) { + if nodeSet == nil { + return nil, errors.New("nodeSet is nil") + } + if nodeset == nil { + return nil, fmt.Errorf("nodeSet output is nil for DON %s", nodeSet.Name) + } + + donsMetadata := topology.DonsMetadata.List() + if configuredIndex < 0 || configuredIndex >= len(donsMetadata) { + return nil, fmt.Errorf("configured index %d out of bounds for dons metadata", configuredIndex) + } + don, donErr := cre.NewDON(ctx, donsMetadata[configuredIndex], nodeset.CLNodes) + if donErr != nil { 
+ return nil, pkgerrors.Wrapf(donErr, "failed to create DON from node set named %s", nodeSet.Name) + } + + return &StartedDON{ + NodeSetOutput: &cre.NodeSetOutput{ + Output: nodeset, + NodeSetName: nodeSet.Name, + Capabilities: nodeSet.Capabilities, + }, + DON: don, + }, nil +} +func startNodeSet( + ctx context.Context, + lggr zerolog.Logger, + topology *cre.Topology, + configuredIndex int, + nodeSet *cre.NodeSet, + registryChainBlockchainOutput *blockchain.Output, + remoteRuntime *resolvedRemoteRuntime, +) (*ns.Output, error) { + // If output is already set (Kubernetes or cached), use it. + if nodeSet.Out != nil { + lggr.Info().Msgf("Using pre-configured node URLs for DON %s", nodeSet.Name) + return nodeSet.Out, nil + } + + if strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { + if remoteRuntime == nil { + return nil, errors.New("remote runtime is required for remote nodeset placement") + } + registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") + } + remoteInput, err := buildRemoteNodeSetInput(nodeSet) + if err != nil { + return nil, err + } + payload := agent.StartComponentPayload{ + ComponentType: componentTypeNodeSet, + NodeSet: remoteInput, + RegistryBlockchain: registryChainPayload, + ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), + } + nodeset, err := startRemoteComponent[ns.Output]( + ctx, + lggr, + remoteRuntime.Client, + payload, + componentTypeNodeSet, + ) + if err != nil { + return nil, err + } + if err := rewriteRemoteNodeSetOutputForLocalAccess(topology, configuredIndex, nodeSet, nodeset, remoteRuntime.EC2HostIP); err != nil { + return nil, err } + return nodeset, nil + } + + // For Docker, start the nodes. 
+ nodeSet.Input.NodeSpecs = nodeSet.ExtractCTFInputs() + nodeset, err := ns.NewSharedDBNodeSetWithContext(ctx, nodeSet.Input, registryChainBlockchainOutput) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to start nodeSet named %s", nodeSet.Name) } - return false + return nodeset, nil } func nodeSetRemoteStartPolicy(nodeSet *cre.NodeSet) string { @@ -333,209 +418,6 @@ func rewriteNodeSetForDirectAccess(output *ns.Output, ec2HostIP string) error { return nil } -const nodeSetDBEndpointName = "nodeset-db" - -func describeNodeSetEndpoints(componentID string, nodeSet *cre.NodeSet, output *ns.Output) ([]tunnel.EndpointRef, error) { - sizeHint := 1 - if output != nil { - sizeHint += len(output.CLNodes) - } - if nodeSet != nil { - for _, spec := range nodeSet.NodeSpecs { - if spec == nil || spec.Node == nil { - continue - } - sizeHint += len(spec.Node.CustomPorts) - } - } - refs := make([]tunnel.EndpointRef, 0, sizeHint) - if output != nil { - for idx := range output.CLNodes { - endpointName := fmt.Sprintf("node-%d-api", idx) - rawURL := output.CLNodes[idx].Node.ExternalURL - ref, err := nodeSetEndpointFromURL(componentID, endpointName, rawURL) - if err != nil { - return nil, err - } - if ref != nil { - refs = append(refs, *ref) - } - } - } - if nodeSet != nil { - for nodeIdx, spec := range nodeSet.NodeSpecs { - customRefs, err := nodeSetCustomPortEndpointRefs(componentID, nodeIdx, spec) - if err != nil { - return nil, err - } - refs = append(refs, customRefs...) 
- } - } - dbRef, err := nodeSetDBEndpointRef(componentID, nodeSet) - if err != nil { - return nil, err - } - if dbRef != nil { - refs = append(refs, *dbRef) - } - return refs, nil -} - -func nodeSetDBEndpointRef(componentID string, nodeSet *cre.NodeSet) (*tunnel.EndpointRef, error) { - if nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0 { - return nil, nil - } - if nodeSet.DbInput.Port < 0 || nodeSet.DbInput.Port > 65535 { - return nil, fmt.Errorf("nodeset db port %d is invalid", nodeSet.DbInput.Port) - } - return &tunnel.EndpointRef{ - ComponentID: componentID, - EndpointName: nodeSetDBEndpointName, - Scheme: "tcp", - Host: "127.0.0.1", - Port: nodeSet.DbInput.Port, - OriginalURL: fmt.Sprintf("tcp://127.0.0.1:%d", nodeSet.DbInput.Port), - }, nil -} - -func rewriteNodeSetWithBindings(output *ns.Output, nodeSet *cre.NodeSet, bindings []tunnel.TunnelBinding) error { - byName := make(map[string]tunnel.TunnelBinding, len(bindings)) - for _, binding := range bindings { - byName[binding.EndpointName] = binding - } - if output != nil { - for idx := range output.CLNodes { - endpointName := fmt.Sprintf("node-%d-api", idx) - rawURL := output.CLNodes[idx].Node.ExternalURL - if rawURL == "" { - continue - } - binding, ok := byName[endpointName] - if !ok { - return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", endpointName) - } - output.CLNodes[idx].Node.ExternalURL = binding.LocalURL - } - } - if nodeSet != nil && nodeSet.DbInput != nil && nodeSet.DbInput.Port != 0 { - binding, ok := byName[nodeSetDBEndpointName] - if !ok { - return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", nodeSetDBEndpointName) - } - nodeSet.DbInput.Port = binding.LocalPort - } - if nodeSet != nil { - for nodeIdx, spec := range nodeSet.NodeSpecs { - if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { - continue - } - for portIdx, mapping := range spec.Input.Node.CustomPorts { - _, containerPort, err := 
parseCustomPortMapping(mapping) - if err != nil { - return fmt.Errorf("invalid custom_ports entry %q for node %d: %w", mapping, nodeIdx, err) - } - binding, ok := byName[nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort)] - if !ok { - return fmt.Errorf("missing tunnel binding for nodeset endpoint %s", nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort)) - } - spec.Input.Node.CustomPorts[portIdx] = rewriteCustomPortMappingHostPort(mapping, binding.LocalPort) - } - } - } - return nil -} - -func nodeSetCustomPortEndpointRefs(componentID string, nodeIdx int, spec *cre.NodeSpecWithRole) ([]tunnel.EndpointRef, error) { - if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { - return nil, nil - } - refs := make([]tunnel.EndpointRef, 0, len(spec.Input.Node.CustomPorts)) - for portIdx, mapping := range spec.Input.Node.CustomPorts { - hostPort, containerPort, err := parseCustomPortMapping(mapping) - if err != nil { - return nil, fmt.Errorf("invalid custom_ports entry %q for node %d: %w", mapping, nodeIdx, err) - } - refs = append(refs, tunnel.EndpointRef{ - ComponentID: componentID, - EndpointName: nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort), - Scheme: "tcp", - Host: "127.0.0.1", - Port: hostPort, - OriginalURL: fmt.Sprintf("tcp://127.0.0.1:%d", hostPort), - }) - } - return refs, nil -} - -func nodeSetCustomPortEndpointName(nodeIdx, portIdx, containerPort int) string { - return fmt.Sprintf("node-%d-custom-%d-%d", nodeIdx, portIdx, containerPort) -} - -func parseCustomPortMapping(mapping string) (hostPort int, containerPort int, err error) { - parts := strings.Split(strings.TrimSpace(mapping), ":") - if len(parts) < 2 { - return 0, 0, fmt.Errorf("expected hostPort:containerPort, got %q", mapping) - } - hostPortRaw := parts[len(parts)-2] - containerPortRaw := parts[len(parts)-1] - hostPort, err = strconv.Atoi(hostPortRaw) - if err != nil || hostPort <= 0 || hostPort > 65535 { - return 
0, 0, fmt.Errorf("invalid host port %q", hostPortRaw) - } - containerPort, err = strconv.Atoi(containerPortRaw) - if err != nil || containerPort <= 0 || containerPort > 65535 { - return 0, 0, fmt.Errorf("invalid container port %q", containerPortRaw) - } - return hostPort, containerPort, nil -} - -func rewriteCustomPortMappingHostPort(mapping string, newHostPort int) string { - parts := strings.Split(strings.TrimSpace(mapping), ":") - if len(parts) < 2 { - return mapping - } - parts[len(parts)-2] = strconv.Itoa(newHostPort) - return strings.Join(parts, ":") -} - -func rewriteGatewayIncomingForNodeSetBindings( - topology *cre.Topology, - configuredIndex int, - nodeSet *cre.NodeSet, - bindings []tunnel.TunnelBinding, -) { - if topology == nil || topology.GatewayConnectors == nil || len(topology.GatewayConnectors.Configurations) == 0 || nodeSet == nil { - return - } - if configuredIndex < 0 || configuredIndex >= len(topology.DonsMetadata.List()) { - return - } - donMeta := topology.DonsMetadata.List()[configuredIndex] - gatewayNode, hasGateway := donMeta.Gateway() - if !hasGateway { - return - } - if gatewayNode.Index < 0 || gatewayNode.Index >= len(nodeSet.NodeSpecs) { - return - } - spec := nodeSet.NodeSpecs[gatewayNode.Index] - if spec == nil || spec.Input == nil || spec.Input.Node == nil || len(spec.Input.Node.CustomPorts) == 0 { - return - } - - for _, cfg := range topology.GatewayConnectors.Configurations { - if cfg == nil || cfg.GatewayConfiguration == nil || cfg.NodeUUID != gatewayNode.UUID { - continue - } - // Test process reaches gateway via local port (direct for local runs, tunneled for remote runs). - cfg.Incoming.Host = "127.0.0.1" - // Resolve tunnel by gateway container port (e.g. 5002), not by possibly stale host-side custom port. 
- if localPort, ok := gatewayLocalPortFromBindings(gatewayNode.Index, cfg.Incoming.ExternalPort, bindings); ok { - cfg.Incoming.ExternalPort = localPort - } - } -} - func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredIndex int, ec2HostIP string) { if topology == nil || topology.GatewayConnectors == nil || len(topology.GatewayConnectors.Configurations) == 0 { return @@ -556,62 +438,6 @@ func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredInd } } -func gatewayLocalPortFromBindings(gatewayNodeIndex, gatewayContainerPort int, bindings []tunnel.TunnelBinding) (int, bool) { - for _, binding := range bindings { - if !strings.HasPrefix(binding.EndpointName, fmt.Sprintf("node-%d-custom-", gatewayNodeIndex)) { - continue - } - if strings.HasSuffix(binding.EndpointName, fmt.Sprintf("-%d", gatewayContainerPort)) { - return binding.LocalPort, true - } - } - return 0, false -} - -func nodeSetEndpointFromURL(componentID, endpointName, rawURL string) (*tunnel.EndpointRef, error) { - if strings.TrimSpace(rawURL) == "" { - return nil, nil - } - parsed, err := url.Parse(rawURL) - if err != nil { - return nil, fmt.Errorf("failed to parse endpoint url %q: %w", rawURL, err) - } - host := parsed.Hostname() - if host == "" { - return nil, fmt.Errorf("endpoint url %q has empty hostname", rawURL) - } - port, err := nodeSetResolveURLPort(parsed) - if err != nil { - return nil, err - } - return &tunnel.EndpointRef{ - ComponentID: componentID, - EndpointName: endpointName, - Scheme: parsed.Scheme, - Host: host, - Port: port, - OriginalURL: rawURL, - }, nil -} - -func nodeSetResolveURLPort(parsed *url.URL) (int, error) { - if parsed.Port() != "" { - port, err := strconv.Atoi(parsed.Port()) - if err != nil || port <= 0 || port > 65535 { - return 0, fmt.Errorf("url %q has invalid port %q", parsed.String(), parsed.Port()) - } - return port, nil - } - switch parsed.Scheme { - case "http", "ws": - return 80, nil - case "https", "wss": - return 443, 
nil - default: - return 0, fmt.Errorf("url %q has unsupported scheme %q without explicit port", parsed.String(), parsed.Scheme) - } -} - func FundNodes(ctx context.Context, testLogger zerolog.Logger, dons *cre.Dons, blockchains []blockchains.Blockchain, fundingAmountPerChainFamily map[string]uint64) error { for _, don := range dons.List() { testLogger.Info().Msgf("Funding nodes for DON %s", don.Name) diff --git a/system-tests/lib/cre/environment/dons_test.go b/system-tests/lib/cre/environment/dons_test.go index 37e5ec68f83..f460f924219 100644 --- a/system-tests/lib/cre/environment/dons_test.go +++ b/system-tests/lib/cre/environment/dons_test.go @@ -1,16 +1,13 @@ package environment import ( - "net/url" - "strings" "testing" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" - "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" + "github.com/stretchr/testify/require" ) func TestBuildRemoteNodeSetInputRequiresImageOrBuildFields(t *testing.T) { @@ -96,116 +93,6 @@ func TestRewriteRemoteNodeSetOutputForLocalAccess_InvalidNodeExternalURLFails(t require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") } -func TestParseCustomPortMapping(t *testing.T) { - t.Run("valid mapping", func(t *testing.T) { - hostPort, containerPort, err := parseCustomPortMapping("127.0.0.1:18080:8080") - require.NoError(t, err, "expected valid mapping to parse") - require.Equal(t, 18080, hostPort) - require.Equal(t, 8080, containerPort) - }) - - t.Run("missing separator", func(t *testing.T) { - _, _, err := parseCustomPortMapping("8080") - require.Error(t, err, "expected malformed mapping to fail") - require.Contains(t, err.Error(), "expected 
hostPort:containerPort") - }) - - t.Run("invalid host port", func(t *testing.T) { - _, _, err := parseCustomPortMapping("bad:8080") - require.Error(t, err, "expected invalid host port to fail") - require.Contains(t, err.Error(), "invalid host port") - }) - - t.Run("invalid container port", func(t *testing.T) { - _, _, err := parseCustomPortMapping("18080:bad") - require.Error(t, err, "expected invalid container port to fail") - require.Contains(t, err.Error(), "invalid container port") - }) -} - -func TestNodeSetResolveURLPort(t *testing.T) { - tests := []struct { - name string - rawURL string - wantPort int - wantError string - }{ - {name: "explicit port", rawURL: "http://node:1234", wantPort: 1234}, - {name: "http default", rawURL: "http://node", wantPort: 80}, - {name: "https default", rawURL: "https://node", wantPort: 443}, - {name: "unsupported scheme without port", rawURL: "tcp://node", wantError: "unsupported scheme"}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - parsed, err := url.Parse(tt.rawURL) - require.NoError(t, err, "test setup should parse URL") - port, err := nodeSetResolveURLPort(parsed) - if tt.wantError != "" { - require.Error(t, err, "expected port resolution failure") - require.Contains(t, err.Error(), tt.wantError) - return - } - require.NoError(t, err, "expected port resolution success") - require.Equal(t, tt.wantPort, port) - }) - } -} - -func TestNodeSetEndpointFromURL(t *testing.T) { - ref, err := nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", "http://node-0:8081") - require.NoError(t, err, "expected endpoint ref to parse") - require.Equal(t, "nodeset:workflow", ref.ComponentID) - require.Equal(t, "node-0-api", ref.EndpointName) - require.Equal(t, "http", ref.Scheme) - require.Equal(t, "node-0", ref.Host) - require.Equal(t, 8081, ref.Port) - - ref, err = nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", " ") - require.NoError(t, err, "expected blank URL to be ignored") - require.Nil(t, ref, 
"expected nil endpoint for blank URL") - - _, err = nodeSetEndpointFromURL("nodeset:workflow", "node-0-api", "http://") - require.Error(t, err, "expected empty hostname to fail") - require.Contains(t, err.Error(), "empty hostname") -} - -func TestGatewayLocalPortFromBindings(t *testing.T) { - bindings := []tunnel.TunnelBinding{ - {EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-custom-0-5002"}, LocalPort: 22002}, - {EndpointRef: tunnel.EndpointRef{EndpointName: "node-1-custom-0-5002"}, LocalPort: 22012}, - } - - port, ok := gatewayLocalPortFromBindings(0, 5002, bindings) - require.True(t, ok, "expected matching binding") - require.Equal(t, 22002, port) - - _, ok = gatewayLocalPortFromBindings(0, 6000, bindings) - require.False(t, ok, "expected non-matching container port to return false") -} - -func TestRewriteGatewayIncomingForNodeSetBindings(t *testing.T) { - topology, nodeSet := mustBuildRemoteGatewayTopology(t) - nodeSet.NodeSpecs[0].Input.Node.CustomPorts = []string{"18080:5002"} - - bindings := []tunnel.TunnelBinding{ - {EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-custom-0-5002"}, LocalPort: 22002}, - } - - rewriteGatewayIncomingForNodeSetBindings(topology, 0, nodeSet, bindings) - cfg := topology.GatewayConnectors.Configurations[0].GatewayConfiguration - require.Equal(t, "127.0.0.1", cfg.Incoming.Host, "incoming host should be local during binding mode") - require.Equal(t, 22002, cfg.Incoming.ExternalPort, "incoming external port should be rewritten from binding") -} - -func TestRewriteCustomPortMappingHostPort(t *testing.T) { - rewritten := rewriteCustomPortMappingHostPort("127.0.0.1:18080:8080", 22080) - require.Equal(t, "127.0.0.1:22080:8080", rewritten) - - unchanged := rewriteCustomPortMappingHostPort("bad", 22080) - require.True(t, strings.EqualFold("bad", unchanged), "malformed mapping should remain unchanged") -} - func mustBuildRemoteGatewayTopology(t *testing.T) (*cre.Topology, *cre.NodeSet) { t.Helper() diff --git 
a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index eeaf12b13bc..7fc6e3d659d 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -55,8 +55,8 @@ type SetupOutput struct { NodeOutput []*cre.NodeSetOutput S3ProviderOutput *s3provider.Output GatewayConnectors *cre.GatewayConnectors - closeOnce sync.Once - closeErr error + closeOnce sync.Once + closeErr error } func (s *SetupOutput) Close(ctx context.Context) error { @@ -158,19 +158,17 @@ func SetupTestEnvironment( if s3Err != nil { return nil, pkgerrors.Wrap(s3Err, "failed to start S3 provider") } - var remoteRuntime *resolvedRemoteRuntime - if hasRemoteComponents(input.Blockchains, input.JdInput, input.NodeSets) { - remoteRuntime, err = resolveRemoteRuntime(testLogger) - if err != nil { - return nil, pkgerrors.Wrap(err, "failed to resolve remote runtime settings") - } + + remoteRuntime, err := resolveRemoteRuntimeForSetup(testLogger, input.Blockchains, input.JdInput, input.NodeSets) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to resolve remote runtime settings") } testLogger.Info().Msg("using persistent relay supervisor for mixed component relays") fmt.Print(libformat.PurpleText("%s", input.StageGen.Wrap("Starting %d blockchain(s)", len(input.Blockchains)))) - deployedBlockchains, startErr := startBlockchainsWithTargets( + deployedBlockchains, startErr := startBlockchains( ctx, testLogger, input.Blockchains, @@ -215,14 +213,13 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } - blockchainPlacementBySelector := blockchainPlacementsBySelector(input.Blockchains, deployedBlockchains.Outputs) updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, topology, creEnvironment, input.NodeSets, - blockchainPlacementBySelector, + input.Blockchains, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -269,9 +266,6 @@ func 
SetupTestEnvironment( return nil, pkgerrors.Wrap(err, "failed to execute pre-DON startup hook") } } - if err := verifyRemoteToLocalBootstrapReachability(ctx, testLogger, topology); err != nil { - return nil, pkgerrors.Wrap(err, "bootstrap reachability sanity check failed") - } startedDONs, donStartErr := StartDONs(ctx, testLogger, topology, input.Provider, deployedBlockchains.RegistryChain().CtfOutput(), input.CapabilityConfigs, input.CopyCapabilityBinaries, updatedNodeSets, remoteRuntime) if donStartErr != nil { @@ -489,21 +483,6 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, input.JdInput.Out = jdOutput } -func blockchainPlacementsBySelector(configured []*config.Blockchain, deployed []blockchains.Blockchain) map[uint64]string { - bySelector := make(map[uint64]string, len(deployed)) - for idx, blockchainCfg := range configured { - if blockchainCfg == nil { - continue - } - if idx >= len(deployed) || deployed[idx] == nil { - continue - } - selector := deployed[idx].ChainSelector() - bySelector[selector] = string(blockchainCfg.Placement) - } - return bySelector -} - func hasRemoteComponents(blockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet) bool { for _, configuredBlockchain := range blockchains { if configuredBlockchain != nil && configuredBlockchain.Placement == config.PlacementRemote { @@ -521,6 +500,18 @@ func hasRemoteComponents(blockchains []*config.Blockchain, jdInput *config.JobDi return false } +func resolveRemoteRuntimeForSetup( + testLogger zerolog.Logger, + blockchains []*config.Blockchain, + jdInput *config.JobDistributor, + nodeSets []*cre.NodeSet, +) (*resolvedRemoteRuntime, error) { + if !hasRemoteComponents(blockchains, jdInput, nodeSets) { + return nil, nil + } + return resolveRemoteRuntime(testLogger) +} + type nodeSetPlacementSummary struct { HasLocalTargets bool HasRemoteTargets bool diff --git a/system-tests/lib/cre/environment/environment_placement_test.go 
b/system-tests/lib/cre/environment/environment_placement_test.go index 0bdd9f111e8..290f6b1884b 100644 --- a/system-tests/lib/cre/environment/environment_placement_test.go +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -3,6 +3,7 @@ package environment import ( "testing" + "github.com/rs/zerolog" "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" @@ -64,3 +65,14 @@ func TestHasRemoteComponents(t *testing.T) { }) } } + +func TestResolveRemoteRuntimeForSetupSkipsResolutionWhenNoRemoteComponents(t *testing.T) { + runtime, err := resolveRemoteRuntimeForSetup( + zerolog.Nop(), + []*config.Blockchain{{Placement: config.PlacementLocal}}, + &config.JobDistributor{Placement: config.PlacementLocal}, + []*cre.NodeSet{{Placement: "local"}}, + ) + require.NoError(t, err) + require.Nil(t, runtime, "expected nil runtime when no remote components are configured") +} diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index e8a1a2d933b..e16d2ba69e7 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -2,7 +2,6 @@ package environment import ( "context" - "encoding/json" "errors" "fmt" "net" @@ -75,59 +74,37 @@ func StartJD( var jdOutput *jd.Output var jdErr error - if jdConfig.Placement == config.PlacementRemote { + switch { + case jdConfig.Placement == config.PlacementRemote: if remoteRuntime == nil { return nil, errors.New("remote runtime is required when starting remote jd") } - startClient, err := newRemoteComponentClient(remoteRuntime) - if err != nil { - return nil, err - } payload := agent.StartComponentPayload{ ComponentType: componentTypeJD, JD: jdConfig.InputRef(), ReusePolicy: string(jdConfig.RemoteStartPolicy), } - payloadBytes, err := json.Marshal(payload) - if err != nil { - return nil, pkgerrors.Wrap(err, "failed to encode jd payload") - } - response, err := 
startClient.StartComponent(ctx, agent.StartComponentEnvelope{ - SchemaVersion: agent.SchemaVersionV1, - Operation: agent.OperationStartComponent, - Payload: payloadBytes, - }) - if err != nil { - return nil, err - } - if response.ComponentType != componentTypeJD { - return nil, fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) - } - for _, logLine := range response.AgentLogs { - pretty := prettifyAgentLogLine(logLine) - if pretty == "" { - continue - } - lggr.Info().Msgf("[agent] %s", pretty) - } - jdOutput, err = agent.DecodeFromTransport[jd.Output](response.Output) - if err != nil { - return nil, pkgerrors.Wrap(err, "failed to decode jd transport payload") + jdOutput, jdErr = startRemoteComponent[jd.Output]( + ctx, + lggr, + remoteRuntime.Client, + payload, + componentTypeJD, + ) + if jdErr != nil { + return nil, jdErr } - if err := rewriteRemoteJDOutputForLocalAccess(jdOutput, remoteRuntime.EC2HostIP); err != nil { + if err := rewriteJDForDirectAccess(jdOutput, remoteRuntime.EC2HostIP); err != nil { return nil, err } - } else if infraInput.IsKubernetes() { + case infraInput.IsKubernetes(): // For Kubernetes, JD is already running in the cluster, generate service URLs lggr.Info().Msg("Generating Kubernetes service URLs for Job Distributor (already running in cluster)") jdOutput, jdErr = infra.GenerateKubernetesJDOutput(&infraInput, lggr) if jdErr != nil { return nil, pkgerrors.Wrap(jdErr, "failed to generate Kubernetes JD output") } - } - - // Only start JD container for Docker provider - if jdOutput == nil { + default: jdOutput, jdErr = jd.NewWithContext(ctx, jdConfig.InputRef()) if jdErr != nil { jdErr = fmt.Errorf("failed to start JD container for image %s: %w", jdConfig.Image, jdErr) @@ -171,14 +148,10 @@ func StartJD( }, nil } -func rewriteRemoteJDOutputForLocalAccess(output *jd.Output, ec2HostIP string) error { +func rewriteJDForDirectAccess(output *jd.Output, ec2HostIP string) error { if output == nil { return nil } - 
return rewriteJDForDirectAccess(output, ec2HostIP) -} - -func rewriteJDForDirectAccess(output *jd.Output, ec2HostIP string) error { if output.ExternalGRPCUrl != "" { rewritten, err := rewriteAddressHost(output.ExternalGRPCUrl, ec2HostIP) if err != nil { diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go index 47f858fd469..99c13aa3064 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -3,13 +3,13 @@ package environment import ( "testing" - "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + "github.com/stretchr/testify/require" ) -func TestRewriteRemoteJDOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { +func TestRewriteJDForDirectAccess_NilOutputNoop(t *testing.T) { var output *jd.Output - err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") + err := rewriteJDForDirectAccess(output, "10.20.30.40") require.NoError(t, err, "expected nil output rewrite to be a no-op") } @@ -27,26 +27,26 @@ func TestRewriteJDForDirectAccessRewritesExternalEndpoints(t *testing.T) { require.Equal(t, "job-distributor:8080", output.InternalWSRPCUrl, "internal wsrpc url should remain unchanged") } -func TestRewriteRemoteJDOutputForLocalAccess_MixedFallsBackToInternalWSRPCSource(t *testing.T) { +func TestRewriteJDForDirectAccess_MixedFallsBackToInternalWSRPCSource(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1:14231", ExternalWSRPCUrl: "", InternalWSRPCUrl: "job-distributor:8080", } - err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") - require.NoError(t, err, "rewriteRemoteJDOutputForLocalAccess should succeed") + err := rewriteJDForDirectAccess(output, "10.20.30.40") + require.NoError(t, err, "rewriteJDForDirectAccess should succeed") require.Equal(t, "10.20.30.40:8080", output.ExternalWSRPCUrl, "external wsrpc url should be derived from internal source") 
require.Equal(t, "job-distributor:8080", output.InternalWSRPCUrl, "internal wsrpc url should remain unchanged") } -func TestRewriteRemoteJDOutputForLocalAccess_InvalidAddressFails(t *testing.T) { +func TestRewriteJDForDirectAccess_InvalidAddressFails(t *testing.T) { output := &jd.Output{ ExternalGRPCUrl: "127.0.0.1", ExternalWSRPCUrl: "127.0.0.1:9080", } - err := rewriteRemoteJDOutputForLocalAccess(output, "10.20.30.40") + err := rewriteJDForDirectAccess(output, "10.20.30.40") require.Error(t, err, "expected invalid host:port to fail rewrite") require.Contains(t, err.Error(), "failed to parse host:port", "expected parse failure context") } diff --git a/system-tests/lib/cre/environment/remote_component_client.go b/system-tests/lib/cre/environment/remote_component_client.go index b94513d4a39..5aa5003eccb 100644 --- a/system-tests/lib/cre/environment/remote_component_client.go +++ b/system-tests/lib/cre/environment/remote_component_client.go @@ -46,6 +46,7 @@ type httpComponentClient struct { type resolvedRemoteRuntime struct { AgentBaseURL string EC2HostIP string + Client componentClient } func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { @@ -69,15 +70,23 @@ func resolveRemoteRuntime(testLogger zerolog.Logger) (*resolvedRemoteRuntime, er if err != nil { return nil, err } + client := newEC2HTTPComponentClient(baseURL) return &resolvedRemoteRuntime{ AgentBaseURL: baseURL, EC2HostIP: ec2HostIP, + Client: client, }, nil } func newRemoteComponentClient(runtime *resolvedRemoteRuntime) (componentClient, error) { - if runtime == nil || strings.TrimSpace(runtime.AgentBaseURL) == "" { - return nil, errors.New("resolved runtime is nil or missing agent base url") + if runtime == nil { + return nil, errors.New("resolved runtime is nil") + } + if runtime.Client != nil { + return runtime.Client, nil + } + if strings.TrimSpace(runtime.AgentBaseURL) == "" { + return nil, errors.New("resolved runtime is missing agent base url") } return 
newEC2HTTPComponentClient(runtime.AgentBaseURL), nil } @@ -210,6 +219,10 @@ func isRetriableNetworkError(err error) bool { return errors.As(err, &netErr) } +func remoteAgentError(code, message string) error { + return fmt.Errorf("remote agent error (%s): %s", code, message) +} + func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { if configured := strings.TrimSpace(os.Getenv(envEC2AgentURL)); configured != "" { return configured, nil diff --git a/system-tests/lib/cre/environment/remote_component_client_test.go b/system-tests/lib/cre/environment/remote_component_client_test.go index c2d33aa8766..8cc1c4f53d7 100644 --- a/system-tests/lib/cre/environment/remote_component_client_test.go +++ b/system-tests/lib/cre/environment/remote_component_client_test.go @@ -10,9 +10,9 @@ import ( "testing" "github.com/rs/zerolog" - "github.com/stretchr/testify/require" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" + "github.com/stretchr/testify/require" ) func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { @@ -24,6 +24,7 @@ func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { require.NoError(t, err, "expected runtime resolution to succeed") require.Equal(t, "http://198.51.100.20:19090", runtime.AgentBaseURL, "unexpected agent base url") require.Equal(t, "198.51.100.20", runtime.EC2HostIP, "unexpected ec2 host ip") + require.NotNil(t, runtime.Client, "expected resolved runtime to include component client") } func TestResolveRemoteRuntimeRequiresHostResolution(t *testing.T) { diff --git a/system-tests/lib/cre/environment/remote_component_start.go b/system-tests/lib/cre/environment/remote_component_start.go new file mode 100644 index 00000000000..bbd474e7d6f --- /dev/null +++ b/system-tests/lib/cre/environment/remote_component_start.go @@ -0,0 +1,50 @@ +package environment + +import ( + "context" + "encoding/json" + "fmt" + + pkgerrors 
"github.com/pkg/errors" + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" +) + +func startRemoteComponent[T any]( + ctx context.Context, + lggr zerolog.Logger, + client componentClient, + payload agent.StartComponentPayload, + expectedComponentType string, +) (*T, error) { + payloadBytes, err := json.Marshal(payload) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to encode %s payload", expectedComponentType) + } + + response, err := client.StartComponent(ctx, agent.StartComponentEnvelope{ + SchemaVersion: agent.SchemaVersionV1, + Operation: agent.OperationStartComponent, + Payload: payloadBytes, + }) + if err != nil { + return nil, err + } + if response.ComponentType != expectedComponentType { + return nil, fmt.Errorf("unexpected component type in start response: %s", response.ComponentType) + } + for _, logLine := range response.AgentLogs { + pretty := prettifyAgentLogLine(logLine) + if pretty == "" { + continue + } + lggr.Info().Msgf("[agent] %s", pretty) + } + + output, err := agent.DecodeFromTransport[T](response.Output) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to decode %s transport payload", expectedComponentType) + } + return output, nil +} diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remote_stop.go index 157e1dd54cc..d5bc9ec556a 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remote_stop.go @@ -44,10 +44,6 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
if err != nil { return summary, pkgerrors.Wrap(err, "failed to resolve remote runtime settings for stop") } - startClient, err := newRemoteComponentClient(remoteRuntime) - if err != nil { - return summary, pkgerrors.Wrap(err, "failed to initialize remote component client for stop") - } var joined error for _, configuredBlockchain := range cfg.Blockchains { @@ -59,7 +55,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. Blockchain: configuredBlockchain.InputRef(), ReusePolicy: string(configuredBlockchain.RemoteStartPolicy), } - result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeBlockchain) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeBlockchain) if err != nil { summary.Failed++ joined = errors.Join(joined, err) @@ -81,7 +77,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. NodeSet: &simple_node_set.Input{Name: nodeSet.Name}, ReusePolicy: nodeSet.RemoteStartPolicy, } - result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeNodeSet) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeNodeSet) if err != nil { summary.Failed++ joined = errors.Join(joined, err) @@ -100,7 +96,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
JD: cfg.JD.InputRef(), ReusePolicy: string(cfg.JD.RemoteStartPolicy), } - result, err := stopRemoteComponent(ctx, lggr, startClient, payload, componentTypeJD) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeJD) if err != nil { summary.Failed++ joined = errors.Join(joined, err) diff --git a/system-tests/lib/cre/environment/state.go b/system-tests/lib/cre/environment/state.go index e92032b2565..b511281955e 100644 --- a/system-tests/lib/cre/environment/state.go +++ b/system-tests/lib/cre/environment/state.go @@ -42,15 +42,17 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp if effErr != nil { return nil, nil, errors.Wrap(effErr, "failed to resolve cached blockchain inputs") } - deployedBlockchains, startErr := blockchains.Start( - ctx, - framework.L, - cldLogger, - effectiveBlockchains, - blockchainDeployers, - ) - if startErr != nil { - return nil, nil, errors.Wrap(startErr, "failed to start blockchains") + blockchainClients := make([]blockchains.Blockchain, 0, len(effectiveBlockchains)) + for _, input := range effectiveBlockchains { + started, err := blockchains.StartChain(ctx, blockchainDeployers, input) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to start blockchains") + } + reconstructed, err := blockchainFromOutput(framework.L, input, started) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to reconstruct blockchain from started data") + } + blockchainClients = append(blockchainClients, reconstructed) } datastore := datastore.NewMemoryDataStore() @@ -91,8 +93,8 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp donsSlice = append(donsSlice, startedDON) } - cldfBlockchains := make([]cldf_chain.BlockChain, 0, len(deployedBlockchains.Outputs)) - for _, db := range deployedBlockchains.Outputs { + cldfBlockchains := make([]cldf_chain.BlockChain, 0, len(blockchainClients)) + for _, db := range blockchainClients { chain, 
chainErr := db.ToCldfChain() if chainErr != nil { return nil, nil, errors.Wrap(chainErr, "failed to create cldf chain from blockchain") @@ -116,7 +118,7 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp dons := cre.NewDons(donsSlice, topology.GatewayConnectors) linkDonsToJDInput := &cre.LinkDonsToJDInput{ - Blockchains: deployedBlockchains.Outputs, + Blockchains: blockchainClients, CldfEnvironment: cldEnv, Topology: topology, Dons: dons, @@ -133,8 +135,8 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp return &cre.Environment{ CldfEnvironment: cldEnv, - Blockchains: deployedBlockchains.Outputs, - RegistryChainSelector: deployedBlockchains.Outputs[0].ChainSelector(), + Blockchains: blockchainClients, + RegistryChainSelector: blockchainClients[0].ChainSelector(), Provider: *cachedInput.Infra, ContractVersions: contractVersions.ContractVersions(), }, dons, nil From a4a023d6f1d0f8b52e8e048453bf4a599eaaa072 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 14:24:22 +0100 Subject: [PATCH 22/34] move everything remote execution-related to a dedicated package --- .../environment/environment/environment.go | 3 +- .../cre/environment/environment/workflow.go | 3 +- .../lib/cre/environment/blockchain_start.go | 11 +++-- .../cre/environment/blockchain_start_test.go | 49 +++++++++---------- system-tests/lib/cre/environment/dons.go | 17 ++++--- .../lib/cre/environment/environment.go | 19 ++----- system-tests/lib/cre/environment/jobs.go | 11 +++-- .../agent/cmd/local-agent/main.go | 2 +- .../{ => remoteexec}/agent/deploy.go | 0 .../{ => remoteexec}/agent/deploy_test.go | 0 .../{ => remoteexec}/agent/relay.go | 0 .../{ => remoteexec}/agent/relay_test.go | 0 .../{ => remoteexec}/agent/server.go | 0 .../agent/server_handlers_test.go | 0 .../{ => remoteexec}/agent/server_test.go | 0 .../{ => remoteexec}/agent/transport.go | 0 .../{ => remoteexec}/agent/transport_test.go | 0 
.../client}/agent_log_format.go | 2 +- .../client}/artifacts_remote.go | 6 +-- .../client}/artifacts_remote_test.go | 12 ++--- .../client}/remote_component_client.go | 46 ++++++++--------- .../client}/remote_component_client_test.go | 26 +++++----- .../client}/remote_component_start.go | 8 +-- .../{ => remoteexec/client}/remote_stop.go | 20 ++++---- .../client}/remote_stop_test.go | 26 +++++----- .../lib/cre/environment/setup_output_test.go | 17 ------- .../tests/smoke/cre/v2_grpc_source_test.go | 4 +- system-tests/tests/test-helpers/t_helpers.go | 4 +- 28 files changed, 129 insertions(+), 157 deletions(-) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/cmd/local-agent/main.go (97%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/deploy.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/deploy_test.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/relay.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/relay_test.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/server.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/server_handlers_test.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/server_test.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/transport.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec}/agent/transport_test.go (100%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/agent_log_format.go (96%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/artifacts_remote.go (95%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/artifacts_remote_test.go (94%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/remote_component_client.go (84%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/remote_component_client_test.go (90%) rename system-tests/lib/cre/environment/{ => 
remoteexec/client}/remote_component_start.go (92%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/remote_stop.go (94%) rename system-tests/lib/cre/environment/{ => remoteexec/client}/remote_stop_test.go (93%) delete mode 100644 system-tests/lib/cre/environment/setup_output_test.go diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index bd4adf3e6e3..3c272c3a469 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -44,6 +44,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" blockchains_sets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" feature_set "github.com/smartcontractkit/chainlink/system-tests/lib/cre/features/sets" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" @@ -763,7 +764,7 @@ func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targe applyRemoteAgentEnvFallback(framework.L, agentState) } - summary, stopRemoteErr := creenv.StopRemoteComponents(ctx, framework.L, targets) + summary, stopRemoteErr := remoteclient.StopRemoteComponents(ctx, framework.L, targets) framework.L.Info(). Int("requested", summary.Requested). Int("stopped", summary.Stopped). 
diff --git a/core/scripts/cre/environment/environment/workflow.go b/core/scripts/cre/environment/environment/workflow.go index 6d15686aaa8..0e0a9239bd3 100644 --- a/core/scripts/cre/environment/environment/workflow.go +++ b/core/scripts/cre/environment/environment/workflow.go @@ -26,6 +26,7 @@ import ( keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" ) @@ -406,7 +407,7 @@ func deployWorkflow( ContainerTargetDir: containerTargetDirFlag, Files: files, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return environment.DeployArtifactsToRemoteNodeSet(ctx, framework.L, nodeSetName, containerTargetDir, files) + return remoteclient.DeployArtifactsToRemoteNodeSet(ctx, framework.L, nodeSetName, containerTargetDir, files) }, }, ) diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index b54bb6dd62e..315e9c51f54 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -12,12 +12,13 @@ import ( cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/tron" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" ) func blockchainFromOutput(testLogger zerolog.Logger, input *blockchain.Input, output *blockchain.Output) (blockchains.Blockchain, error) { @@ -52,7 +53,7 @@ func startBlockchains( testLogger zerolog.Logger, configuredBlockchains []*config.Blockchain, deployers map[blockchain.ChainFamily]blockchains.Deployer, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, rewriteInternalForLocalNodes bool, ) (*blockchains.DeployedBlockchains, error) { blockchainInputs, err := config.ResolveBlockchainInputs(configuredBlockchains) @@ -70,16 +71,16 @@ func startBlockchains( return nil, err } payload := agent.StartComponentPayload{ - ComponentType: componentTypeBlockchain, + ComponentType: remoteclient.ComponentTypeBlockchain, Blockchain: input, ReusePolicy: string(configured.RemoteStartPolicy), } - deployedOutput, err = startRemoteComponent[blockchain.Output]( + deployedOutput, err = remoteclient.StartRemoteComponent[blockchain.Output]( ctx, testLogger, remoteRuntime.Client, payload, - componentTypeBlockchain, + remoteclient.ComponentTypeBlockchain, ) if err != nil { return nil, err diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index fb4a287814b..e581eaae491 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -5,6 +5,7 @@ import ( "github.com/rs/zerolog" 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/stretchr/testify/require" ) @@ -21,58 +22,54 @@ func TestValidateRemoteBlockchainInput(t *testing.T) { } func TestNewRemoteComponentClientPrefersEC2(t *testing.T) { - t.Setenv(envEC2AgentURL, "") + t.Setenv(remoteclient.EnvEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(envEC2AgentPort, "18080") + t.Setenv(remoteclient.EnvEC2AgentPort, "18080") - runtime, err := resolveRemoteRuntime(zerolog.Nop()) + runtime, err := remoteclient.ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected remote runtime to resolve") - client, err := newRemoteComponentClient(runtime) + client, err := remoteclient.NewComponentClient(runtime) require.NoError(t, err, "expected ec2-first client to be created") - - httpClient, ok := client.(*httpComponentClient) - require.True(t, ok, "expected httpComponentClient, got %T", client) - require.True(t, httpClient.checkHealth, "expected ec2 client to enable health checks") - require.Equal(t, 3, httpClient.maxAttempts, "expected ec2 client retries to be enabled") - require.Equal(t, "http://203.0.113.10:18080", httpClient.baseURL, "unexpected ec2 base url") + require.NotNil(t, client, "expected component client to be created") + require.Equal(t, "http://203.0.113.10:18080", runtime.AgentBaseURL, "unexpected ec2 base url") } func TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testing.T) { - t.Setenv(envEC2AgentURL, "") + t.Setenv(remoteclient.EnvEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "") t.Setenv(runtimecfg.EnvEC2InstanceID, "") - t.Setenv(envEC2AgentPort, "") + t.Setenv(remoteclient.EnvEC2AgentPort, "") - _, err := resolveEC2AgentBaseURL(zerolog.Nop()) - require.Error(t, err, "expected missing direct 
host resolution inputs to fail when %s is not set", envEC2AgentURL) + _, err := remoteclient.ResolveRuntime(zerolog.Nop()) + require.Error(t, err, "expected missing direct host resolution inputs to fail when %s is not set", remoteclient.EnvEC2AgentURL) } func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { - t.Setenv(envEC2AgentURL, "") + t.Setenv(remoteclient.EnvEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(envEC2AgentPort, "not-a-port") + t.Setenv(remoteclient.EnvEC2AgentPort, "not-a-port") - _, err := resolveEC2AgentBaseURL(zerolog.Nop()) - require.Error(t, err, "expected invalid %s to fail", envEC2AgentPort) - require.Contains(t, err.Error(), envEC2AgentPort, "expected error to mention %s", envEC2AgentPort) + _, err := remoteclient.ResolveRuntime(zerolog.Nop()) + require.Error(t, err, "expected invalid %s to fail", remoteclient.EnvEC2AgentPort) + require.Contains(t, err.Error(), remoteclient.EnvEC2AgentPort, "expected error to mention %s", remoteclient.EnvEC2AgentPort) } func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { - t.Setenv(envEC2AgentURL, "") + t.Setenv(remoteclient.EnvEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(envEC2AgentPort, "18080") + t.Setenv(remoteclient.EnvEC2AgentPort, "18080") - baseURL, err := resolveEC2AgentBaseURL(zerolog.Nop()) + runtime, err := remoteclient.ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected direct mode url resolution to succeed") - require.Equal(t, "http://203.0.113.10:18080", baseURL, "unexpected direct mode base url") + require.Equal(t, "http://203.0.113.10:18080", runtime.AgentBaseURL, "unexpected direct mode base url") } func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { - t.Setenv(envEC2AgentURL, "") + t.Setenv(remoteclient.EnvEC2AgentURL, "") t.Setenv(runtimecfg.EnvEC2HostIP, "") t.Setenv(runtimecfg.EnvEC2InstanceID, "") - _, err := resolveRemoteRuntime(zerolog.Nop()) + _, err := 
remoteclient.ResolveRuntime(zerolog.Nop()) require.Error(t, err, "expected runtime resolution without EC2 inputs to fail") } @@ -118,7 +115,7 @@ func TestRewriteRemoteBlockchainOutputForDirectAccess_InvalidExternalURL(t *test } func TestRemoteAgentErrorFormatting(t *testing.T) { - err := remoteAgentError("deployment_failed", "failed to deploy blockchain output") + err := remoteclient.RemoteAgentError("deployment_failed", "failed to deploy blockchain output") want := "remote agent error (deployment_failed): failed to deploy blockchain output" require.EqualError(t, err, want, "unexpected remote agent error formatting") } diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 6908e63b046..fab1f71a105 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -19,10 +19,11 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre" crecapabilities "github.com/smartcontractkit/chainlink/system-tests/lib/cre/capabilities" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/solana" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -59,7 +60,7 @@ func StartDONs( capabilityConfigs cre.CapabilityConfigs, copyCapabilityBinaries bool, nodeSets []*cre.NodeSet, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, ) (*StartedDONs, error) { if err := verifyRemoteToLocalBootstrapReachability(ctx, lggr, 
topology); err != nil { return nil, pkgerrors.Wrap(err, "bootstrap reachability sanity check failed") @@ -122,7 +123,7 @@ func startDONsContainerized( capabilityConfigs cre.CapabilityConfigs, copyCapabilityBinaries bool, nodeSets []*cre.NodeSet, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, ) (*StartedDONs, error) { // Skip binary operations for remote DONs. if infraInput.IsDocker() { @@ -231,7 +232,7 @@ func startDON( configuredIndex int, nodeSet *cre.NodeSet, registryChainBlockchainOutput *blockchain.Output, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, ) (*StartedDON, error) { if nodeSet == nil { return nil, errors.New("nodeSet is nil") @@ -292,7 +293,7 @@ func startNodeSet( configuredIndex int, nodeSet *cre.NodeSet, registryChainBlockchainOutput *blockchain.Output, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, ) (*ns.Output, error) { // If output is already set (Kubernetes or cached), use it. 
if nodeSet.Out != nil { @@ -313,17 +314,17 @@ func startNodeSet( return nil, err } payload := agent.StartComponentPayload{ - ComponentType: componentTypeNodeSet, + ComponentType: remoteclient.ComponentTypeNodeSet, NodeSet: remoteInput, RegistryBlockchain: registryChainPayload, ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), } - nodeset, err := startRemoteComponent[ns.Output]( + nodeset, err := remoteclient.StartRemoteComponent[ns.Output]( ctx, lggr, remoteRuntime.Client, payload, - componentTypeNodeSet, + remoteclient.ComponentTypeNodeSet, ) if err != nil { return nil, err diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 7fc6e3d659d..8ff4ce131e5 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -9,7 +9,6 @@ import ( "os" "strconv" "strings" - "sync" "time" "github.com/Masterminds/semver/v3" @@ -40,6 +39,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/sharding" @@ -55,19 +55,6 @@ type SetupOutput struct { NodeOutput []*cre.NodeSetOutput S3ProviderOutput *s3provider.Output GatewayConnectors *cre.GatewayConnectors - closeOnce sync.Once - closeErr error -} - -func (s *SetupOutput) Close(ctx context.Context) error { - if s == nil { - return nil - } - s.closeOnce.Do(func() { - s.closeErr = nil - }) - - return s.closeErr } type SetupInput struct { @@ -505,11 +492,11 @@ func resolveRemoteRuntimeForSetup( 
blockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet, -) (*resolvedRemoteRuntime, error) { +) (*remoteclient.Runtime, error) { if !hasRemoteComponents(blockchains, jdInput, nodeSets) { return nil, nil } - return resolveRemoteRuntime(testLogger) + return remoteclient.ResolveRuntime(testLogger) } type nodeSetPlacementSummary struct { diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index e16d2ba69e7..8e0cb06a3ed 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -19,8 +19,9 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -63,7 +64,7 @@ func StartJD( lggr zerolog.Logger, jdConfig *config.JobDistributor, infraInput infra.Provider, - remoteRuntime *resolvedRemoteRuntime, + remoteRuntime *remoteclient.Runtime, ) (*StartedJD, error) { startTime := time.Now() lggr.Info().Msg("Starting Job Distributor") @@ -80,16 +81,16 @@ func StartJD( return nil, errors.New("remote runtime is required when starting remote jd") } payload := agent.StartComponentPayload{ - ComponentType: componentTypeJD, + ComponentType: remoteclient.ComponentTypeJD, JD: jdConfig.InputRef(), ReusePolicy: string(jdConfig.RemoteStartPolicy), } - jdOutput, jdErr = startRemoteComponent[jd.Output]( + jdOutput, jdErr = remoteclient.StartRemoteComponent[jd.Output]( ctx, lggr, remoteRuntime.Client, payload, - componentTypeJD, + remoteclient.ComponentTypeJD, ) if jdErr != nil { return nil, jdErr diff --git 
a/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go b/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go similarity index 97% rename from system-tests/lib/cre/environment/agent/cmd/local-agent/main.go rename to system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go index 55c4ac6c655..d9ccf114d9b 100644 --- a/system-tests/lib/cre/environment/agent/cmd/local-agent/main.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go @@ -10,7 +10,7 @@ import ( "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" blockchainsets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" diff --git a/system-tests/lib/cre/environment/agent/deploy.go b/system-tests/lib/cre/environment/remoteexec/agent/deploy.go similarity index 100% rename from system-tests/lib/cre/environment/agent/deploy.go rename to system-tests/lib/cre/environment/remoteexec/agent/deploy.go diff --git a/system-tests/lib/cre/environment/agent/deploy_test.go b/system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go similarity index 100% rename from system-tests/lib/cre/environment/agent/deploy_test.go rename to system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go diff --git a/system-tests/lib/cre/environment/agent/relay.go b/system-tests/lib/cre/environment/remoteexec/agent/relay.go similarity index 100% rename from system-tests/lib/cre/environment/agent/relay.go rename to system-tests/lib/cre/environment/remoteexec/agent/relay.go diff --git a/system-tests/lib/cre/environment/agent/relay_test.go b/system-tests/lib/cre/environment/remoteexec/agent/relay_test.go similarity index 100% rename from 
system-tests/lib/cre/environment/agent/relay_test.go rename to system-tests/lib/cre/environment/remoteexec/agent/relay_test.go diff --git a/system-tests/lib/cre/environment/agent/server.go b/system-tests/lib/cre/environment/remoteexec/agent/server.go similarity index 100% rename from system-tests/lib/cre/environment/agent/server.go rename to system-tests/lib/cre/environment/remoteexec/agent/server.go diff --git a/system-tests/lib/cre/environment/agent/server_handlers_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go similarity index 100% rename from system-tests/lib/cre/environment/agent/server_handlers_test.go rename to system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go diff --git a/system-tests/lib/cre/environment/agent/server_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_test.go similarity index 100% rename from system-tests/lib/cre/environment/agent/server_test.go rename to system-tests/lib/cre/environment/remoteexec/agent/server_test.go diff --git a/system-tests/lib/cre/environment/agent/transport.go b/system-tests/lib/cre/environment/remoteexec/agent/transport.go similarity index 100% rename from system-tests/lib/cre/environment/agent/transport.go rename to system-tests/lib/cre/environment/remoteexec/agent/transport.go diff --git a/system-tests/lib/cre/environment/agent/transport_test.go b/system-tests/lib/cre/environment/remoteexec/agent/transport_test.go similarity index 100% rename from system-tests/lib/cre/environment/agent/transport_test.go rename to system-tests/lib/cre/environment/remoteexec/agent/transport_test.go diff --git a/system-tests/lib/cre/environment/agent_log_format.go b/system-tests/lib/cre/environment/remoteexec/client/agent_log_format.go similarity index 96% rename from system-tests/lib/cre/environment/agent_log_format.go rename to system-tests/lib/cre/environment/remoteexec/client/agent_log_format.go index f95bf8df852..9300265ea99 100644 --- 
a/system-tests/lib/cre/environment/agent_log_format.go +++ b/system-tests/lib/cre/environment/remoteexec/client/agent_log_format.go @@ -1,4 +1,4 @@ -package environment +package client import ( "encoding/json" diff --git a/system-tests/lib/cre/environment/artifacts_remote.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go similarity index 95% rename from system-tests/lib/cre/environment/artifacts_remote.go rename to system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go index 14658d5bc96..8ca91eae018 100644 --- a/system-tests/lib/cre/environment/artifacts_remote.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -11,7 +11,7 @@ import ( pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) func DeployArtifactsToRemoteNodeSet( @@ -28,7 +28,7 @@ func DeployArtifactsToRemoteNodeSet( return fmt.Errorf("container target dir is required") } - remoteRuntime, err := resolveRemoteRuntime(lggr) + remoteRuntime, err := ResolveRuntime(lggr) if err != nil { return pkgerrors.Wrap(err, "failed to resolve remote runtime settings for artifact deploy") } diff --git a/system-tests/lib/cre/environment/artifacts_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go similarity index 94% rename from system-tests/lib/cre/environment/artifacts_remote_test.go rename to system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go index 7b1dfb2c9be..5041a47b746 100644 --- a/system-tests/lib/cre/environment/artifacts_remote_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -10,10 +10,10 @@ import ( 
"path/filepath" "testing" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/stretchr/testify/require" - "github.com/rs/zerolog" ) func TestDeployArtifactsToRemoteNodeSetValidation(t *testing.T) { @@ -36,7 +36,7 @@ func TestDeployArtifactsToRemoteNodeSetNoFilesFails(t *testing.T) { })) defer server.Close() - t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(EnvEC2AgentURL, server.URL) t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{"", ""}) @@ -69,7 +69,7 @@ func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { require.Equal(t, "artifact-content", string(raw)) _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ - ComponentType: componentTypeNodeSet, + ComponentType: ComponentTypeNodeSet, AgentLogs: []string{"artifact deployed"}, }) default: @@ -78,7 +78,7 @@ func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { })) defer server.Close() - t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(EnvEC2AgentURL, server.URL) t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{artifactPath}) diff --git a/system-tests/lib/cre/environment/remote_component_client.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go similarity index 84% rename from system-tests/lib/cre/environment/remote_component_client.go rename to system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go index 5aa5003eccb..0aa6b00fda2 100644 --- a/system-tests/lib/cre/environment/remote_component_client.go +++ 
b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go @@ -1,4 +1,4 @@ -package environment +package client import ( "bytes" @@ -18,20 +18,20 @@ import ( pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) const ( - componentTypeBlockchain = "blockchain" - componentTypeJD = "jd" - componentTypeNodeSet = "nodeset" - envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2AgentPort = "CRE_EC2_AGENT_PORT" + ComponentTypeBlockchain = "blockchain" + ComponentTypeJD = "jd" + ComponentTypeNodeSet = "nodeset" + EnvEC2AgentURL = "CRE_EC2_AGENT_URL" + EnvEC2AgentPort = "CRE_EC2_AGENT_PORT" defaultEC2AgentPort = 8080 ) -type componentClient interface { +type ComponentClient interface { StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) } @@ -43,10 +43,10 @@ type httpComponentClient struct { checkHealth bool } -type resolvedRemoteRuntime struct { +type Runtime struct { AgentBaseURL string EC2HostIP string - Client componentClient + Client ComponentClient } func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { @@ -61,7 +61,7 @@ func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { } } -func resolveRemoteRuntime(testLogger zerolog.Logger) (*resolvedRemoteRuntime, error) { +func ResolveRuntime(testLogger zerolog.Logger) (*Runtime, error) { baseURL, err := resolveEC2AgentBaseURL(testLogger) if err != nil { return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) @@ -71,14 +71,14 @@ func resolveRemoteRuntime(testLogger zerolog.Logger) (*resolvedRemoteRuntime, er return nil, err } client := newEC2HTTPComponentClient(baseURL) - return &resolvedRemoteRuntime{ + return &Runtime{ AgentBaseURL: baseURL, EC2HostIP: 
ec2HostIP, Client: client, }, nil } -func newRemoteComponentClient(runtime *resolvedRemoteRuntime) (componentClient, error) { +func NewComponentClient(runtime *Runtime) (ComponentClient, error) { if runtime == nil { return nil, errors.New("resolved runtime is nil") } @@ -153,9 +153,9 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope a if resp.StatusCode < 200 || resp.StatusCode >= 300 { if startResp.Error != "" { if startResp.ErrorCode != "" { - err = remoteAgentError(startResp.ErrorCode, startResp.Error) + err = RemoteAgentError(startResp.ErrorCode, startResp.Error) } else { - err = remoteAgentError("remote_agent_error", startResp.Error) + err = RemoteAgentError("remote_agent_error", startResp.Error) } } else { err = fmt.Errorf("start component request failed with status %s: %s", resp.Status, string(respBody)) @@ -168,9 +168,9 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope a } if startResp.Error != "" { if startResp.ErrorCode != "" { - return nil, retry.Unrecoverable(remoteAgentError(startResp.ErrorCode, startResp.Error)) + return nil, retry.Unrecoverable(RemoteAgentError(startResp.ErrorCode, startResp.Error)) } - return nil, retry.Unrecoverable(remoteAgentError("remote_agent_error", startResp.Error)) + return nil, retry.Unrecoverable(RemoteAgentError("remote_agent_error", startResp.Error)) } return &startResp, nil @@ -205,8 +205,8 @@ func describeEC2AgentHealthFailure(baseURL string) string { return fmt.Sprintf( "failed EC2 CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", baseURL, - envEC2AgentPort, - envEC2AgentURL, + EnvEC2AgentPort, + EnvEC2AgentURL, ) } @@ -219,12 +219,12 @@ func isRetriableNetworkError(err error) bool { return errors.As(err, &netErr) } -func remoteAgentError(code, message string) error { +func RemoteAgentError(code, message string) error { return fmt.Errorf("remote agent error (%s): %s", code, 
message) } func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { - if configured := strings.TrimSpace(os.Getenv(envEC2AgentURL)); configured != "" { + if configured := strings.TrimSpace(os.Getenv(EnvEC2AgentURL)); configured != "" { return configured, nil } remotePort, err := resolveEC2AgentPort() @@ -241,10 +241,10 @@ func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { func resolveEC2AgentPort() (int, error) { remotePort := defaultEC2AgentPort - if configuredPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); configuredPort != "" { + if configuredPort := strings.TrimSpace(os.Getenv(EnvEC2AgentPort)); configuredPort != "" { parsedPort, err := strconv.Atoi(configuredPort) if err != nil || parsedPort <= 0 || parsedPort > 65535 { - return 0, fmt.Errorf("invalid %s: %q", envEC2AgentPort, configuredPort) + return 0, fmt.Errorf("invalid %s: %q", EnvEC2AgentPort, configuredPort) } remotePort = parsedPort } diff --git a/system-tests/lib/cre/environment/remote_component_client_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go similarity index 90% rename from system-tests/lib/cre/environment/remote_component_client_test.go rename to system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go index 8cc1c4f53d7..9b06aa0575f 100644 --- a/system-tests/lib/cre/environment/remote_component_client_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -10,17 +10,17 @@ import ( "testing" "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/stretchr/testify/require" ) func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { - 
t.Setenv(envEC2AgentURL, "http://198.51.100.20:19090") + t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") t.Setenv(runtimecfg.EnvEC2HostIP, "198.51.100.20") - t.Setenv(envEC2AgentPort, "19090") + t.Setenv(EnvEC2AgentPort, "19090") - runtime, err := resolveRemoteRuntime(zerolog.Nop()) + runtime, err := ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected runtime resolution to succeed") require.Equal(t, "http://198.51.100.20:19090", runtime.AgentBaseURL, "unexpected agent base url") require.Equal(t, "198.51.100.20", runtime.EC2HostIP, "unexpected ec2 host ip") @@ -28,27 +28,27 @@ func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { } func TestResolveRemoteRuntimeRequiresHostResolution(t *testing.T) { - t.Setenv(envEC2AgentURL, "http://198.51.100.20:19090") + t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") t.Setenv(runtimecfg.EnvEC2HostIP, "") t.Setenv(runtimecfg.EnvEC2InstanceID, "") - _, err := resolveRemoteRuntime(zerolog.Nop()) + _, err := ResolveRuntime(zerolog.Nop()) require.Error(t, err, "expected runtime resolution without EC2 host inputs to fail") } func TestNewRemoteComponentClientRequiresResolvedRuntime(t *testing.T) { - _, err := newRemoteComponentClient(nil) + _, err := NewComponentClient(nil) require.Error(t, err, "expected nil runtime to fail") - _, err = newRemoteComponentClient(&resolvedRemoteRuntime{}) + _, err = NewComponentClient(&Runtime{}) require.Error(t, err, "expected missing agent base URL to fail") } func TestDescribeEC2AgentHealthFailureMentionsResolutionHints(t *testing.T) { msg := describeEC2AgentHealthFailure("http://203.0.113.10:8080") require.Contains(t, msg, "/v1/health") - require.Contains(t, msg, envEC2AgentPort) - require.Contains(t, msg, envEC2AgentURL) + require.Contains(t, msg, EnvEC2AgentPort) + require.Contains(t, msg, EnvEC2AgentURL) } func TestIsRetriableStatus(t *testing.T) { @@ -72,7 +72,7 @@ func (timeoutError) Temporary() bool { return true } func TestStartComponentOnce_Success(t 
*testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ComponentType: componentTypeBlockchain}) + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ComponentType: ComponentTypeBlockchain}) })) defer server.Close() @@ -83,7 +83,7 @@ func TestStartComponentOnce_Success(t *testing.T) { Payload: json.RawMessage(`{"componentType":"blockchain"}`), }) require.NoError(t, err) - require.Equal(t, componentTypeBlockchain, resp.ComponentType) + require.Equal(t, ComponentTypeBlockchain, resp.ComponentType) } func TestStartComponentOnce_Non2xxWithAgentErrorCode(t *testing.T) { diff --git a/system-tests/lib/cre/environment/remote_component_start.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_start.go similarity index 92% rename from system-tests/lib/cre/environment/remote_component_start.go rename to system-tests/lib/cre/environment/remoteexec/client/remote_component_start.go index bbd474e7d6f..26153b4e128 100644 --- a/system-tests/lib/cre/environment/remote_component_start.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_start.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -8,13 +8,13 @@ import ( pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) -func startRemoteComponent[T any]( +func StartRemoteComponent[T any]( ctx context.Context, lggr zerolog.Logger, - client componentClient, + client ComponentClient, payload agent.StartComponentPayload, expectedComponentType string, ) (*T, error) { diff --git a/system-tests/lib/cre/environment/remote_stop.go b/system-tests/lib/cre/environment/remoteexec/client/remote_stop.go similarity index 94% rename from system-tests/lib/cre/environment/remote_stop.go 
rename to system-tests/lib/cre/environment/remoteexec/client/remote_stop.go index d5bc9ec556a..d118a7cbfa8 100644 --- a/system-tests/lib/cre/environment/remote_stop.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_stop.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -13,8 +13,8 @@ import ( "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) type RemoteStopSummary struct { @@ -40,7 +40,7 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. return summary, nil } - remoteRuntime, err := resolveRemoteRuntime(lggr) + remoteRuntime, err := ResolveRuntime(lggr) if err != nil { return summary, pkgerrors.Wrap(err, "failed to resolve remote runtime settings for stop") } @@ -51,11 +51,11 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. continue } payload := agent.StartComponentPayload{ - ComponentType: componentTypeBlockchain, + ComponentType: ComponentTypeBlockchain, Blockchain: configuredBlockchain.InputRef(), ReusePolicy: string(configuredBlockchain.RemoteStartPolicy), } - result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeBlockchain) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, ComponentTypeBlockchain) if err != nil { summary.Failed++ joined = errors.Join(joined, err) @@ -73,11 +73,11 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. 
continue } payload := agent.StartComponentPayload{ - ComponentType: componentTypeNodeSet, + ComponentType: ComponentTypeNodeSet, NodeSet: &simple_node_set.Input{Name: nodeSet.Name}, ReusePolicy: nodeSet.RemoteStartPolicy, } - result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeNodeSet) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, ComponentTypeNodeSet) if err != nil { summary.Failed++ joined = errors.Join(joined, err) @@ -92,11 +92,11 @@ func StopRemoteComponents(ctx context.Context, lggr zerolog.Logger, cfg *config. if cfg.JD != nil && cfg.JD.Placement == config.PlacementRemote { payload := agent.StartComponentPayload{ - ComponentType: componentTypeJD, + ComponentType: ComponentTypeJD, JD: cfg.JD.InputRef(), ReusePolicy: string(cfg.JD.RemoteStartPolicy), } - result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, componentTypeJD) + result, err := stopRemoteComponent(ctx, lggr, remoteRuntime.Client, payload, ComponentTypeJD) if err != nil { summary.Failed++ joined = errors.Join(joined, err) @@ -144,7 +144,7 @@ func countRemoteStopTargets(cfg *config.Config) int { func stopRemoteComponent( ctx context.Context, lggr zerolog.Logger, - client componentClient, + client ComponentClient, payload agent.StartComponentPayload, expectedType string, ) (*agent.StartComponentResponse, error) { diff --git a/system-tests/lib/cre/environment/remote_stop_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go similarity index 93% rename from system-tests/lib/cre/environment/remote_stop_test.go rename to system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go index fbb449160c1..69504c97aab 100644 --- a/system-tests/lib/cre/environment/remote_stop_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go @@ -1,4 +1,4 @@ -package environment +package client import ( "context" @@ -14,8 +14,8 @@ import ( 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) @@ -77,7 +77,7 @@ func TestStopRemoteComponents_SummaryAndResiduals(t *testing.T) { server := newRemoteStopTestServer(t) defer server.Close() - t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(EnvEC2AgentURL, server.URL) t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") cfg := &config.Config{ @@ -110,7 +110,7 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing w.WriteHeader(http.StatusOK) case "/v1/components/start": resp := agent.StartComponentResponse{ - ComponentType: componentTypeBlockchain, + ComponentType: ComponentTypeBlockchain, Found: true, Stopped: true, } @@ -124,7 +124,7 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing })) defer server.Close() - t.Setenv(envEC2AgentURL, server.URL) + t.Setenv(EnvEC2AgentURL, server.URL) t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") cfg := &config.Config{ @@ -143,7 +143,7 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing func TestStopRemoteComponent_UnexpectedComponentTypeFails(t *testing.T) { client := &stubComponentClient{ resp: &agent.StartComponentResponse{ - ComponentType: componentTypeJD, + ComponentType: ComponentTypeJD, }, } @@ -151,8 +151,8 @@ func TestStopRemoteComponent_UnexpectedComponentTypeFails(t *testing.T) { context.Background(), zerolog.Nop(), client, - agent.StartComponentPayload{ComponentType: componentTypeBlockchain}, - 
componentTypeBlockchain, + agent.StartComponentPayload{ComponentType: ComponentTypeBlockchain}, + ComponentTypeBlockchain, ) require.Error(t, err, "expected mismatched component type to fail") require.Contains(t, err.Error(), "unexpected component type") @@ -165,8 +165,8 @@ func TestStopRemoteComponent_ClientErrorIsWrapped(t *testing.T) { context.Background(), zerolog.Nop(), client, - agent.StartComponentPayload{ComponentType: componentTypeBlockchain}, - componentTypeBlockchain, + agent.StartComponentPayload{ComponentType: ComponentTypeBlockchain}, + ComponentTypeBlockchain, ) require.Error(t, err, "expected client failure to be returned") require.Contains(t, err.Error(), "failed to stop remote component type") @@ -193,13 +193,13 @@ func newRemoteStopTestServer(t *testing.T) *httptest.Server { resp := agent.StartComponentResponse{ComponentType: payload.ComponentType} switch payload.ComponentType { - case componentTypeBlockchain: + case ComponentTypeBlockchain: resp.Found = true resp.Stopped = true - case componentTypeNodeSet: + case ComponentTypeNodeSet: resp.Found = false resp.Stopped = false - case componentTypeJD: + case ComponentTypeJD: resp.Found = true resp.Stopped = true default: diff --git a/system-tests/lib/cre/environment/setup_output_test.go b/system-tests/lib/cre/environment/setup_output_test.go deleted file mode 100644 index a8a77b7c42b..00000000000 --- a/system-tests/lib/cre/environment/setup_output_test.go +++ /dev/null @@ -1,17 +0,0 @@ -package environment - -import ( - "context" - "testing" -) - -func TestSetupOutputCloseIsIdempotent(t *testing.T) { - out := &SetupOutput{} - - if err := out.Close(context.Background()); err != nil { - t.Fatalf("expected first close to succeed: %v", err) - } - if err := out.Close(context.Background()); err != nil { - t.Fatalf("expected second close to succeed: %v", err) - } -} diff --git a/system-tests/tests/smoke/cre/v2_grpc_source_test.go b/system-tests/tests/smoke/cre/v2_grpc_source_test.go index 
df06fc46b98..a1c6fe681e8 100644 --- a/system-tests/tests/smoke/cre/v2_grpc_source_test.go +++ b/system-tests/tests/smoke/cre/v2_grpc_source_test.go @@ -24,8 +24,8 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/workflows/privateregistry" crontypes "github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v2/cron/types" - creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" grpcsourcemock "github.com/smartcontractkit/chainlink/system-tests/lib/cre/grpc_source_mock" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" @@ -611,7 +611,7 @@ func compileAndCopyWorkflow(t *testing.T, testEnv *ttypes.TestEnvironment, workf ContainerTargetDir: containerTargetDir, Files: []string{compressedWasmPath, configFilePath}, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) + return remoteclient.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) }, }, ) diff --git a/system-tests/tests/test-helpers/t_helpers.go b/system-tests/tests/test-helpers/t_helpers.go index 5c9c15caa3f..529e24020a4 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -59,10 +59,10 @@ import ( keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" - creenv 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" creworkflow "github.com/smartcontractkit/chainlink/system-tests/lib/cre/workflow" crecrypto "github.com/smartcontractkit/chainlink/system-tests/lib/crypto" @@ -350,7 +350,7 @@ func createWorkflowArtifacts[T WorkflowConfig](t *testing.T, testLogger zerolog. ContainerTargetDir: creworkflow.DefaultWorkflowTargetDir, Files: []string{compressedWorkflowWasmPath, workflowConfigFilePath}, RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return creenv.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) + return remoteclient.DeployArtifactsToRemoteNodeSet(ctx, testLogger, nodeSetName, containerTargetDir, files) }, }, ) From b61e1bb702ef2e50b897ef46506c78f354786ef6 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 15:13:29 +0100 Subject: [PATCH 23/34] update docs + add placement visualisation on startup --- .gitignore | 2 +- core/scripts/cre/environment/README.md | 38 ++++- .../docs/ARCHITECTURE_REMOTEEXEC.md | 44 ++++++ .../environment/topologyviz/topologyviz.go | 137 +++++++++++++++++- .../topologyviz/topologyviz_test.go | 39 +++++ 5 files changed, 254 insertions(+), 6 deletions(-) create mode 100644 core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md diff --git a/.gitignore b/.gitignore index 24ff4ae2e51..957b9ef0b3c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,4 @@ core/scripts/cre/environment/binaries/* *.br.b64 # TODO remove later 
-system-tests/lib/cre/environment/agent/cre_agent \ No newline at end of file +system-tests/lib/cre/environment/remoteexec/agent/cre_agent \ No newline at end of file diff --git a/core/scripts/cre/environment/README.md b/core/scripts/cre/environment/README.md index 3d66a0339d5..9cbe2153af6 100644 --- a/core/scripts/cre/environment/README.md +++ b/core/scripts/cre/environment/README.md @@ -277,11 +277,46 @@ For more details on the URL resolution process and how workflow artifacts are ha # while in core/scripts/cre/environment go run . env stop +# stop remote components only +go run . env stop-remote + +# stop remote first, then local resources and local services +go run . env stop-all + # or... if you have the CTF binary ctf d rm ``` --- +## Hybrid Remote Execution Quick Reference + +Remote execution uses a single direct mode with an EC2-hosted (or equivalent) CRE agent API. + +Environment variable precedence for agent resolution: + +1. `CRE_EC2_AGENT_URL` (explicit override, if set) +2. `CRE_EC2_INSTANCE_ID` + `CRE_EC2_AGENT_PORT` + AWS profile/credentials resolution +3. `CRE_EC2_AGENT_PORT` defaults to `8080` when omitted + +Stop command semantics: + +- `env stop`: local resources only; does not stop remote components. +- `env stop-remote`: remote resources only through the remote agent. +- `env stop-all`: remote stop followed by local stop. + +If `env stop` warns about remote components still running, run `env stop-remote`. + +Architecture ownership and boundaries are documented in: +- [`docs/ARCHITECTURE_REMOTEEXEC.md`](./docs/ARCHITECTURE_REMOTEEXEC.md) + +Mixed-mode verification checklist: + +1. Start with a mixed config (`local` + `remote` placements). +2. Confirm startup output includes `Runtime Placement Matrix`. +3. Deploy a workflow/artifact and verify remote delivery path succeeds. +4. Run `env stop-remote` and verify remote stop summary reports requested/stopped counts. +5. Run `env stop-all` and verify no local containers/state remain. 
+ ## Restarting the environment If you are using Blockscout and you restart the environment **you need to restart the block explorer** if you want to see current block history. If you don't you will see stale state of the previous environment. To restart execute: @@ -734,7 +769,8 @@ Remember that the CRE CLI version needs to match your CPU architecture and opera # regenerate topology docs go run . topology generate ``` - - `env start` now prints a compact topology summary with a capability matrix. + - `env start` prints a compact topology summary with a capability matrix. + - A runtime placement matrix (what runs local vs remote) is shown only when at least one component is configured with `placement = "remote"`. 2. **Download or Build Capability Binaries** - Some capabilities like `cron`, `log-event-trigger`, or `read-contract` are not embedded in all Chainlink images. - If your use case requires them, you should build them manually by: diff --git a/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md b/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md new file mode 100644 index 00000000000..56eacdb536c --- /dev/null +++ b/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md @@ -0,0 +1,44 @@ +# CRE Remote Execution Architecture + +## Goal +Keep responsibilities co-located so contributors can reason about hybrid local/remote execution without hopping across unrelated packages. + +## Ownership Boundaries + +- `system-tests/lib/cre/environment` + - High-level environment orchestration. + - Decides **what** to start and in which order (blockchains, JD, DONs, linking, funding). + - Consumes remote execution APIs; does not own transport/protocol details. + +- `system-tests/lib/cre/environment/remoteexec/client` + - Remote control-plane client logic. + - Owns runtime resolution, agent HTTP/retry behavior, start/stop/deploy envelopes, remote stop summary, and agent log normalization. 
+ - Exposes reusable helpers for orchestrators and workflow artifact deployment call sites. + +- `system-tests/lib/cre/environment/remoteexec/agent` + - Remote data-plane/agent runtime. + - Owns server handlers, deployment execution, relay lifecycle, and transport contracts used by the agent API. + +## Runtime Flow (Hybrid) + +1. CLI loads config and builds topology summary. +2. `environment` resolves whether remote components exist. +3. If remote components are present, `remoteexec/client` resolves runtime and performs remote operations. +4. Local components are started directly by `environment` + CTF components. +5. Stop commands route: + - `env stop`: local only. + - `env stop-remote`: remote only via `remoteexec/client`. + - `env stop-all`: remote then local. + +## Invariants + +- Remote HTTP protocol details remain in `remoteexec/client` and `remoteexec/agent`. +- `environment` should not re-introduce ad-hoc remote transport code. +- Placement (`local` vs `remote`) remains the single selector for execution target behavior. +- Remote placement visualization is shown only when at least one component is remote. + +## Maintenance Guidance + +- When changing agent payloads or operations, update both `remoteexec/agent` and `remoteexec/client` in the same PR. +- When changing orchestration order or placement rules, prefer tests in `system-tests/lib/cre/environment`. +- Keep runbook commands and env var precedence synchronized with code changes in `core/scripts/cre/environment/environment`. 
diff --git a/core/scripts/cre/environment/topologyviz/topologyviz.go b/core/scripts/cre/environment/topologyviz/topologyviz.go index 9793d2152aa..15511fc7f9e 100644 --- a/core/scripts/cre/environment/topologyviz/topologyviz.go +++ b/core/scripts/cre/environment/topologyviz/topologyviz.go @@ -40,10 +40,22 @@ type DONSummary struct { } type TopologySummary struct { - ConfigRef string `json:"config_ref"` - Topology string `json:"topology"` - InfraType string `json:"infra_type"` - DONs []DONSummary `json:"dons"` + ConfigRef string `json:"config_ref"` + Topology string `json:"topology"` + InfraType string `json:"infra_type"` + DONs []DONSummary `json:"dons"` + Placement *PlacementSummary `json:"placement,omitempty"` +} + +type PlacementSummary struct { + HasRemote bool `json:"has_remote"` + Rows []PlacementRow `json:"rows,omitempty"` +} + +type PlacementRow struct { + Component string `json:"component"` + Local bool `json:"local"` + Remote bool `json:"remote"` } type Artifacts struct { @@ -81,6 +93,7 @@ func BuildSummary(cfg *envconfig.Config, configRef string) (*TopologySummary, er Topology: topologyClass, InfraType: infraType, DONs: dons, + Placement: buildPlacementSummary(cfg), }, nil } @@ -119,6 +132,10 @@ func RenderASCII(summary *TopologySummary) string { b.WriteString(RenderASCIIDONTable(summary)) b.WriteString("\n") b.WriteString(RenderASCIICapabilityMatrix(summary)) + if summary.Placement != nil && summary.Placement.HasRemote { + b.WriteString("\n") + b.WriteString(RenderASCIIPlacementMatrix(summary.Placement)) + } return b.String() } @@ -127,6 +144,42 @@ func RenderASCIIStartSummary(summary *TopologySummary) string { var b strings.Builder b.WriteString(fmt.Sprintf("Topology: %s (%s)\n", summary.ConfigRef, summary.Topology)) b.WriteString(RenderASCIICapabilityMatrix(summary)) + if summary.Placement != nil && summary.Placement.HasRemote { + b.WriteString("\n") + b.WriteString(RenderASCIIPlacementMatrix(summary.Placement)) + } + return b.String() +} + +func 
RenderASCIIPlacementMatrix(summary *PlacementSummary) string { + if summary == nil || !summary.HasRemote || len(summary.Rows) == 0 { + return "" + } + + headers := []string{"Component", "local", "remote"} + widths := []int{len(headers[0]), len(headers[1]), len(headers[2])} + for _, row := range summary.Rows { + if len(row.Component) > widths[0] { + widths[0] = len(row.Component) + } + } + + var b strings.Builder + b.WriteString("Runtime Placement Matrix\n") + b.WriteString(buildBorder(widths)) + b.WriteString(buildRow(headers, widths)) + b.WriteString(buildBorder(widths)) + for _, row := range summary.Rows { + values := []string{row.Component, "-", "-"} + if row.Local { + values[1] = "x" + } + if row.Remote { + values[2] = "x" + } + b.WriteString(buildRow(values, widths)) + } + b.WriteString(buildBorder(widths)) return b.String() } @@ -253,6 +306,25 @@ func RenderMarkdown(summary *TopologySummary) string { } b.WriteString("\n") + if summary.Placement != nil && summary.Placement.HasRemote { + b.WriteString("## Runtime Placement Matrix\n\n") + b.WriteString("Only shown when at least one component is configured as `remote`.\n\n") + b.WriteString("| Component | local | remote |\n") + b.WriteString("|---|---:|---:|\n") + for _, row := range summary.Placement.Rows { + local := "-" + remote := "-" + if row.Local { + local = "x" + } + if row.Remote { + remote = "x" + } + b.WriteString(fmt.Sprintf("| `%s` | `%s` | `%s` |\n", row.Component, local, remote)) + } + b.WriteString("\n") + } + b.WriteString("## DONs\n\n") for _, don := range summary.DONs { b.WriteString(fmt.Sprintf("### `%s`\n\n", don.Name)) @@ -378,6 +450,63 @@ func buildCapabilityMatrix(dons []DONSummary) []capabilityMatrixRow { return rows } +func buildPlacementSummary(cfg *envconfig.Config) *PlacementSummary { + if cfg == nil { + return &PlacementSummary{} + } + rows := make([]PlacementRow, 0) + hasRemote := false + + for _, bc := range cfg.Blockchains { + if bc == nil { + continue + } + component := 
fmt.Sprintf("blockchain:%s:%s", bc.Type, bc.ChainID) + row := PlacementRow{Component: component, Local: bc.Placement == envconfig.PlacementLocal, Remote: bc.Placement == envconfig.PlacementRemote} + if row.Remote { + hasRemote = true + } + rows = append(rows, row) + } + + if cfg.JD != nil { + row := PlacementRow{ + Component: "jd", + Local: cfg.JD.Placement == envconfig.PlacementLocal, + Remote: cfg.JD.Placement == envconfig.PlacementRemote, + } + if row.Remote { + hasRemote = true + } + rows = append(rows, row) + } + + for _, nodeSet := range cfg.NodeSets { + if nodeSet == nil { + continue + } + isRemote := strings.EqualFold(strings.TrimSpace(nodeSet.Placement), string(envconfig.PlacementRemote)) + row := PlacementRow{ + Component: "nodeset:" + nodeSet.Name, + Local: !isRemote, + Remote: isRemote, + } + if row.Remote { + hasRemote = true + } + rows = append(rows, row) + } + + sort.Slice(rows, func(i, j int) bool { + return rows[i].Component < rows[j].Component + }) + + return &PlacementSummary{ + HasRemote: hasRemote, + Rows: rows, + } +} + func buildBorder(widths []int) string { var b strings.Builder b.WriteString("+") diff --git a/core/scripts/cre/environment/topologyviz/topologyviz_test.go b/core/scripts/cre/environment/topologyviz/topologyviz_test.go index 957ff4df39a..bee4f9b9b99 100644 --- a/core/scripts/cre/environment/topologyviz/topologyviz_test.go +++ b/core/scripts/cre/environment/topologyviz/topologyviz_test.go @@ -7,7 +7,11 @@ import ( "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) func TestClassifyTopology_UsesDONTypesAndShardIndex(t 
*testing.T) { @@ -118,6 +122,41 @@ func TestRenderASCII_IncludesDONHeadersAndNoHint(t *testing.T) { require.Contains(t, rendered, "Attributes") } +func TestBuildSummary_PlacementMatrixShownOnlyForRemoteComponents(t *testing.T) { + t.Parallel() + + localCfg := &envconfig.Config{ + Blockchains: []*envconfig.Blockchain{ + {Input: blockchain.Input{Type: blockchain.TypeAnvil, ChainID: "1337"}, Placement: envconfig.PlacementLocal}, + }, + JD: &envconfig.JobDistributor{Input: jd.Input{}, Placement: envconfig.PlacementLocal}, + NodeSets: []*cre.NodeSet{{Input: &ns.Input{Name: "workflow"}, Placement: "local"}}, + } + localSummary, err := BuildSummary(localCfg, "configs/local.toml") + require.NoError(t, err) + require.NotNil(t, localSummary.Placement) + require.False(t, localSummary.Placement.HasRemote) + require.NotContains(t, RenderASCIIStartSummary(localSummary), "Runtime Placement Matrix") + + mixedCfg := &envconfig.Config{ + Blockchains: []*envconfig.Blockchain{ + {Input: blockchain.Input{Type: blockchain.TypeAnvil, ChainID: "1337"}, Placement: envconfig.PlacementRemote}, + {Input: blockchain.Input{Type: blockchain.TypeAnvil, ChainID: "2337"}, Placement: envconfig.PlacementLocal}, + }, + JD: &envconfig.JobDistributor{Input: jd.Input{}, Placement: envconfig.PlacementRemote}, + NodeSets: []*cre.NodeSet{{Input: &ns.Input{Name: "workflow"}, Placement: "local"}, {Input: &ns.Input{Name: "capabilities"}, Placement: "remote"}}, + } + mixedSummary, err := BuildSummary(mixedCfg, "configs/mixed.toml") + require.NoError(t, err) + require.NotNil(t, mixedSummary.Placement) + require.True(t, mixedSummary.Placement.HasRemote) + + rendered := RenderASCIIStartSummary(mixedSummary) + require.Contains(t, rendered, "Runtime Placement Matrix") + require.Contains(t, rendered, "nodeset:capabilities") + require.Contains(t, rendered, "jd") +} + func TestRenderMarkdown_DropsInferredUsageSections(t *testing.T) { t.Parallel() From 5ce788605c9529281c71742ed3f0ab389778a8ec Mon Sep 17 00:00:00 2001 
From: Bartek Tofel Date: Wed, 25 Feb 2026 15:42:58 +0100 Subject: [PATCH 24/34] add debug endpoints and command for the agent --- .../cre/environment/environment/debug.go | 111 ++++++ .../environment/environment/environment.go | 1 + .../environment/remoteexec/agent/server.go | 328 +++++++++++++++++- .../remoteexec/agent/server_handlers_test.go | 83 +++++ .../remoteexec/client/agent_introspection.go | 108 ++++++ .../client/agent_introspection_test.go | 81 +++++ .../client/artifacts_remote_test.go | 6 + .../remoteexec/client/compatibility.go | 67 ++++ .../remoteexec/client/compatibility_test.go | 50 +++ .../client/remote_component_client.go | 21 +- .../remoteexec/client/remote_stop_test.go | 5 + .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 2 +- 12 files changed, 854 insertions(+), 9 deletions(-) create mode 100644 core/scripts/cre/environment/environment/debug.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/compatibility.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go diff --git a/core/scripts/cre/environment/environment/debug.go b/core/scripts/cre/environment/environment/debug.go new file mode 100644 index 00000000000..3afb139cbf8 --- /dev/null +++ b/core/scripts/cre/environment/environment/debug.go @@ -0,0 +1,111 @@ +package environment + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strings" + + "github.com/pkg/errors" + "github.com/spf13/cobra" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" +) + +func debugCmds() *cobra.Command { + cmd := &cobra.Command{ + Use: "debug", + Short: "Debug helpers for remote execution", + Long: "Debug helpers for querying remote 
agent state and logs.", + PersistentPreRun: globalPreRunFunc, + } + + cmd.AddCommand(debugStatusCmd()) + cmd.AddCommand(debugLocksCmd()) + cmd.AddCommand(debugLogsCmd()) + return cmd +} + +func debugStatusCmd() *cobra.Command { + return &cobra.Command{ + Use: "status", + Short: "Get remote agent status snapshot", + RunE: func(cmd *cobra.Command, _ []string) error { + return withResolvedRemoteRuntime(cmd.Context(), func(ctx context.Context, runtime *remoteclient.Runtime) error { + status, err := remoteclient.GetAgentStatus(ctx, runtime) + if err != nil { + return err + } + return printDebugJSON(status) + }) + }, + } +} + +func debugLocksCmd() *cobra.Command { + return &cobra.Command{ + Use: "locks", + Short: "Get remote agent lock/in-flight snapshot", + RunE: func(cmd *cobra.Command, _ []string) error { + return withResolvedRemoteRuntime(cmd.Context(), func(ctx context.Context, runtime *remoteclient.Runtime) error { + locks, err := remoteclient.GetAgentLocks(ctx, runtime) + if err != nil { + return err + } + return printDebugJSON(locks) + }) + }, + } +} + +func debugLogsCmd() *cobra.Command { + var ( + componentKey string + limit int + ) + cmd := &cobra.Command{ + Use: "logs", + Short: "Get bounded agent logs for one component key", + RunE: func(cmd *cobra.Command, _ []string) error { + componentKey = strings.TrimSpace(componentKey) + if componentKey == "" { + return errors.New("component key is required") + } + return withResolvedRemoteRuntime(cmd.Context(), func(ctx context.Context, runtime *remoteclient.Runtime) error { + logs, err := remoteclient.GetComponentLogs(ctx, runtime, componentKey, limit) + if err != nil { + return err + } + return printDebugJSON(logs) + }) + }, + } + cmd.Flags().StringVar(&componentKey, "component-key", "", "Remote component cache key (for example: nodeset:workflow)") + cmd.Flags().IntVar(&limit, "limit", 200, "Number of log lines to return") + _ = cmd.MarkFlagRequired("component-key") + return cmd +} + +func 
withResolvedRemoteRuntime(ctx context.Context, fn func(context.Context, *remoteclient.Runtime) error) error { + if state, err := loadRemoteAgentState(relativePathToRepoRoot); err == nil && state != nil { + applyRemoteAgentEnvFallback(framework.L, state) + } + runtime, err := remoteclient.ResolveRuntime(framework.L) + if err != nil { + return errors.Wrap(err, "failed to resolve remote runtime (set CRE_EC2_AGENT_URL or CRE_EC2_INSTANCE_ID/AWS profile)") + } + return fn(ctx, runtime) +} + +func printDebugJSON(value any) error { + payload, err := json.MarshalIndent(value, "", " ") + if err != nil { + return fmt.Errorf("failed to encode debug output: %w", err) + } + if _, err := fmt.Fprintln(os.Stdout, string(payload)); err != nil { + return fmt.Errorf("failed to print debug output: %w", err) + } + return nil +} diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 3c272c3a469..6c7d8775536 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -87,6 +87,7 @@ func init() { EnvironmentCmd.AddCommand(stopCmd()) EnvironmentCmd.AddCommand(stopAllCmd()) EnvironmentCmd.AddCommand(stopRemoteCmd()) + EnvironmentCmd.AddCommand(debugCmds()) EnvironmentCmd.AddCommand(relaySupervisorCmd()) EnvironmentCmd.AddCommand(workflowCmds()) EnvironmentCmd.AddCommand(beholderCmds()) diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server.go b/system-tests/lib/cre/environment/remoteexec/agent/server.go index 5f9f9401c2c..98db0e43a88 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "slices" + "strconv" "strings" "sync" "time" @@ -59,6 +60,20 @@ const ( RemoteStartPolicyReuseIdentical = "reuse_if_identical" EnvKeepFailedContainers = "CRE_AGENT_KEEP_FAILED_CONTAINERS" + + 
defaultComponentLogsLimit = 200 + maxComponentLogsLimit = 1000 + componentLogsRingSize = 2000 + inFlightOperationScopeLifecycle = "lifecycle" + inFlightOperationScopeGeneral = "general" + protocolVersion = "1.0.0" + capabilityComponentLogs = "componentLogs" + capabilityLocks = "locks" + capabilityDeployArtifacts = "deployArtifacts" + capabilityStartComponent = "startComponent" + capabilityRelay = "relay" + capabilityListCTFResources = "listCTFResources" + agentVersion = "dev" ) var frameworkLogCaptureMu sync.Mutex @@ -104,15 +119,68 @@ type CTFResourcesResponse struct { Volumes []string `json:"volumes,omitempty"` } +type AgentStatusResponse struct { + AgentVersion string `json:"agentVersion,omitempty"` + ProtocolVersion string `json:"protocolVersion,omitempty"` + SupportedSchemas []string `json:"supportedSchemas,omitempty"` + Capabilities []string `json:"capabilities,omitempty"` + UptimeSeconds int64 `json:"uptimeSeconds"` + RuntimeComponents []string `json:"runtimeComponents,omitempty"` + CachedComponents []string `json:"cachedComponents,omitempty"` + Relays []RelayInfo `json:"relays,omitempty"` + ComponentLogKeys []string `json:"componentLogKeys,omitempty"` + InFlight []InFlightOperation `json:"inFlight,omitempty"` +} + +type RelayInfo struct { + ID string `json:"id"` + Name string `json:"name"` + RequestedPort int `json:"requestedPort"` + BoundPort int `json:"boundPort"` +} + +type AgentLocksResponse struct { + LifecycleBusy bool `json:"lifecycleBusy"` + CacheEntries int `json:"cacheEntries"` + RuntimeEntries int `json:"runtimeEntries"` + RelayCount int `json:"relayCount"` + ComponentLogKeys int `json:"componentLogKeys"` + InFlight []InFlightOperation `json:"inFlight,omitempty"` +} + +type InFlightOperation struct { + ID string `json:"id"` + Scope string `json:"scope"` + StartedAt string `json:"startedAt"` + DurationMs int64 `json:"durationMs"` +} + +type ComponentLogsResponse struct { + ComponentKey string `json:"componentKey"` + TotalLines int 
`json:"totalLines"` + Lines []string `json:"lines,omitempty"` +} + +type inFlightOperation struct { + ID string + Scope string + StartedAt time.Time +} + type Server struct { lggr zerolog.Logger deployers map[blockchain.ChainFamily]blockchains.Deployer + startedAt time.Time lifecycleMu sync.Mutex cacheMu sync.Mutex cache map[string]cachedStart runtime map[string]runtimeState relayMu sync.Mutex relays map[string]*relayRegistration + logsMu sync.Mutex + componentLogs map[string][]string + opsMu sync.Mutex + inFlight map[string]inFlightOperation } type cachedStart struct { @@ -127,11 +195,14 @@ type runtimeState struct { func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockchains.Deployer) *Server { return &Server{ - lggr: lggr, - deployers: deployers, - cache: make(map[string]cachedStart), - runtime: make(map[string]runtimeState), - relays: make(map[string]*relayRegistration), + lggr: lggr, + deployers: deployers, + startedAt: time.Now(), + cache: make(map[string]cachedStart), + runtime: make(map[string]runtimeState), + relays: make(map[string]*relayRegistration), + componentLogs: make(map[string][]string), + inFlight: make(map[string]inFlightOperation), } } @@ -143,6 +214,9 @@ func (s *Server) Handler() http.Handler { mux.HandleFunc("/v1/relay/close", s.closeRelay) mux.HandleFunc("/v1/relay/connect", s.connectRelay) mux.HandleFunc("/v1/resources/ctf", s.listCTFResources) + mux.HandleFunc("/v1/status", s.status) + mux.HandleFunc("/v1/locks", s.locks) + mux.HandleFunc("/v1/components/logs", s.componentLogsHandler) return mux } @@ -205,6 +279,81 @@ func (s *Server) listCTFResources(w http.ResponseWriter, r *http.Request) { }) } +func (s *Server) status(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + runtimeKeys := s.runtimeKeys() + cacheKeys := s.cacheKeys() + relayInfos := s.relayInfos() + 
componentLogKeys := s.componentLogKeys() + inFlight, _ := s.inFlightSnapshot() + + s.respondJSONAny(w, http.StatusOK, AgentStatusResponse{ + AgentVersion: agentVersion, + ProtocolVersion: protocolVersion, + SupportedSchemas: []string{SchemaVersionV1}, + Capabilities: []string{capabilityStartComponent, capabilityDeployArtifacts, capabilityRelay, capabilityListCTFResources, capabilityLocks, capabilityComponentLogs}, + UptimeSeconds: int64(time.Since(s.startedAt).Seconds()), + RuntimeComponents: runtimeKeys, + CachedComponents: cacheKeys, + Relays: relayInfos, + ComponentLogKeys: componentLogKeys, + InFlight: inFlight, + }) +} + +func (s *Server) locks(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + inFlight, lifecycleBusy := s.inFlightSnapshot() + s.respondJSONAny(w, http.StatusOK, AgentLocksResponse{ + LifecycleBusy: lifecycleBusy, + CacheEntries: s.cacheSize(), + RuntimeEntries: s.runtimeSize(), + RelayCount: s.relayCount(), + ComponentLogKeys: len(s.componentLogKeys()), + InFlight: inFlight, + }) +} + +func (s *Server) componentLogsHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + componentKey := strings.TrimSpace(r.URL.Query().Get("componentKey")) + if componentKey == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "componentKey query parameter is required", nil) + return + } + limit := defaultComponentLogsLimit + if rawLimit := strings.TrimSpace(r.URL.Query().Get("limit")); rawLimit != "" { + parsed, err := strconv.Atoi(rawLimit) + if err != nil || parsed <= 0 { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, "limit query parameter must be a positive integer", nil) + return + } + if parsed > maxComponentLogsLimit { + 
parsed = maxComponentLogsLimit + } + limit = parsed + } + + lines, total := s.getComponentLogs(componentKey, limit) + s.respondJSONAny(w, http.StatusOK, ComponentLogsResponse{ + ComponentKey: componentKey, + TotalLines: total, + Lines: lines, + }) +} + func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) @@ -253,6 +402,8 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { // Keep this stderr write explicit so startup behavior is visible when agent runs as a subprocess. requestLog := fmt.Sprintf("[cre-agent] starting component type=%s key=%s", payload.ComponentType, componentKey) _, _ = fmt.Fprintln(os.Stderr, requestLog) + s.beginInFlight(fmt.Sprintf("start:%s", componentKey), inFlightOperationScopeLifecycle) + defer s.endInFlight(fmt.Sprintf("start:%s", componentKey)) preStartLogs := make([]string, 0, 2) s.lifecycleMu.Lock() defer s.lifecycleMu.Unlock() @@ -277,6 +428,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { Output: cached.Output, AgentLogs: []string{requestLog, reuseLog}, }) + s.appendComponentLogs(componentKey, []string{requestLog, reuseLog}) return } } @@ -331,6 +483,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { agentLogs = append(agentLogs, fmt.Sprintf("[cre-agent] preserving %d tracked container(s) after failed startup because %s is enabled", len(trackedContainers), EnvKeepFailedContainers)) } s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, startErr.Error(), agentLogs) + s.appendComponentLogs(componentKey, agentLogs) return } @@ -345,6 +498,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { } if encErr != nil { s.respondError(w, http.StatusInternalServerError, ErrCodeTransportEncodeFailed, encErr.Error(), agentLogs) + s.appendComponentLogs(componentKey, agentLogs) 
return } if shouldReuseRemoteStart(payload.ComponentType, payload.ReusePolicy) { @@ -359,9 +513,13 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { Output: output, AgentLogs: agentLogs, }) + s.appendComponentLogs(componentKey, agentLogs) } func (s *Server) deployArtifacts(w http.ResponseWriter, r *http.Request, rawPayload json.RawMessage) { + s.beginInFlight("deploy-artifacts", inFlightOperationScopeGeneral) + defer s.endInFlight("deploy-artifacts") + var payload DeployArtifactsPayload if err := json.Unmarshal(rawPayload, &payload); err != nil { s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, fmt.Sprintf("invalid payload: %v", err), nil) @@ -427,9 +585,15 @@ func (s *Server) deployArtifacts(w http.ResponseWriter, r *http.Request, rawPayl fmt.Sprintf("[cre-agent] copied %d artifact(s) to %d container(s) for nodeset %s", len(filePaths), len(containerNames), payload.NodeSetName), }, }) + s.appendComponentLogs(fmt.Sprintf("%s:%s", ComponentTypeNodeSet, payload.NodeSetName), []string{ + fmt.Sprintf("[cre-agent] copied %d artifact(s) to %d container(s) for nodeset %s", len(filePaths), len(containerNames), payload.NodeSetName), + }) } func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, componentType, componentKey string) { + s.beginInFlight(fmt.Sprintf("stop:%s", componentKey), inFlightOperationScopeLifecycle) + defer s.endInFlight(fmt.Sprintf("stop:%s", componentKey)) + s.lifecycleMu.Lock() defer s.lifecycleMu.Unlock() @@ -449,6 +613,7 @@ func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, comp Stopped: false, AgentLogs: []string{requestLog, "[cre-agent] nothing to stop (component not found)"}, }) + s.appendComponentLogs(componentKey, []string{requestLog, "[cre-agent] nothing to stop (component not found)"}) return } s.deleteCachedStart(componentKey) @@ -458,6 +623,7 @@ func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, comp Stopped: true, AgentLogs: 
[]string{requestLog, "[cre-agent] stopped existing component"}, }) + s.appendComponentLogs(componentKey, []string{requestLog, "[cre-agent] stopped existing component"}) } func (s *Server) respondJSON(w http.ResponseWriter, code int, body StartComponentResponse) { @@ -545,6 +711,158 @@ func (s *Server) takeRuntime(componentKey string) (runtimeState, bool) { return state, ok } +func (s *Server) beginInFlight(id, scope string) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + s.inFlight[id] = inFlightOperation{ + ID: id, + Scope: scope, + StartedAt: time.Now(), + } +} + +func (s *Server) endInFlight(id string) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + delete(s.inFlight, id) +} + +func (s *Server) inFlightSnapshot() ([]InFlightOperation, bool) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + + out := make([]InFlightOperation, 0, len(s.inFlight)) + lifecycleBusy := false + for _, op := range s.inFlight { + if op.Scope == inFlightOperationScopeLifecycle { + lifecycleBusy = true + } + out = append(out, InFlightOperation{ + ID: op.ID, + Scope: op.Scope, + StartedAt: op.StartedAt.Format(time.RFC3339Nano), + DurationMs: int64(time.Since(op.StartedAt) / time.Millisecond), + }) + } + slices.SortFunc(out, func(a, b InFlightOperation) int { + return strings.Compare(a.ID, b.ID) + }) + return out, lifecycleBusy +} + +func (s *Server) appendComponentLogs(componentKey string, lines []string) { + if strings.TrimSpace(componentKey) == "" || len(lines) == 0 { + return + } + filtered := make([]string, 0, len(lines)) + for _, line := range lines { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + continue + } + filtered = append(filtered, trimmed) + } + if len(filtered) == 0 { + return + } + + s.logsMu.Lock() + defer s.logsMu.Unlock() + existing := append(s.componentLogs[componentKey], filtered...) 
+ if len(existing) > componentLogsRingSize { + existing = existing[len(existing)-componentLogsRingSize:] + } + s.componentLogs[componentKey] = existing +} + +func (s *Server) getComponentLogs(componentKey string, limit int) ([]string, int) { + s.logsMu.Lock() + defer s.logsMu.Unlock() + lines := s.componentLogs[componentKey] + total := len(lines) + if total == 0 { + return []string{}, 0 + } + if limit <= 0 || limit > total { + limit = total + } + out := append([]string{}, lines[total-limit:]...) + return out, total +} + +func (s *Server) componentLogKeys() []string { + s.logsMu.Lock() + defer s.logsMu.Unlock() + keys := make([]string, 0, len(s.componentLogs)) + for k := range s.componentLogs { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} + +func (s *Server) cacheKeys() []string { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + keys := make([]string, 0, len(s.cache)) + for k := range s.cache { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} + +func (s *Server) runtimeKeys() []string { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + keys := make([]string, 0, len(s.runtime)) + for k := range s.runtime { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} + +func (s *Server) cacheSize() int { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + return len(s.cache) +} + +func (s *Server) runtimeSize() int { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + return len(s.runtime) +} + +func (s *Server) relayInfos() []RelayInfo { + s.relayMu.Lock() + defer s.relayMu.Unlock() + + out := make([]RelayInfo, 0, len(s.relays)) + for _, relay := range s.relays { + if relay == nil { + continue + } + out = append(out, RelayInfo{ + ID: relay.ID, + Name: relay.Name, + RequestedPort: relay.RequestedPort, + BoundPort: listenerPort(relay.Listener), + }) + } + slices.SortFunc(out, func(a, b RelayInfo) int { + return strings.Compare(a.ID, b.ID) + }) + return out +} + +func (s *Server) relayCount() int { + s.relayMu.Lock() + defer 
s.relayMu.Unlock() + return len(s.relays) +} + func shouldReuseRemoteStart(componentType, policy string) bool { if componentType == ComponentTypeJD { return false diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go index 9aec0574b68..4737890294a 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go @@ -7,6 +7,7 @@ import ( "net/http" "net/http/httptest" "testing" + "time" "github.com/rs/zerolog" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" @@ -114,3 +115,85 @@ func TestComponentCacheKeyVariants(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "unsupported component type") } + +func TestStatusEndpointReturnsAgentState(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + server.cacheSuccessfulStart("blockchain:anvil:1337", "hash-a", map[string]any{"ok": true}) + server.storeRuntime("nodeset:workflow", runtimeState{ComponentType: ComponentTypeNodeSet}) + server.appendComponentLogs("nodeset:workflow", []string{"line-a"}) + server.beginInFlight("start:nodeset:workflow", inFlightOperationScopeLifecycle) + defer server.endInFlight("start:nodeset:workflow") + + openReq := httptest.NewRequest(http.MethodPost, "/v1/relay/open", bytes.NewReader([]byte(`{"name":"workflow-ocr-0","requestedPort":0}`))) + openReq.Header.Set("Content-Type", "application/json") + openRR := httptest.NewRecorder() + server.Handler().ServeHTTP(openRR, openReq) + require.Equal(t, http.StatusOK, openRR.Code) + + req := httptest.NewRequest(http.MethodGet, "/v1/status", nil) + rr := httptest.NewRecorder() + server.Handler().ServeHTTP(rr, req) + + require.Equal(t, http.StatusOK, rr.Code) + var resp AgentStatusResponse + require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &resp)) + require.GreaterOrEqual(t, 
resp.UptimeSeconds, int64(0)) + require.Contains(t, resp.CachedComponents, "blockchain:anvil:1337") + require.Contains(t, resp.RuntimeComponents, "nodeset:workflow") + require.Contains(t, resp.ComponentLogKeys, "nodeset:workflow") + require.Len(t, resp.Relays, 1) + require.Equal(t, "workflow-ocr-0", resp.Relays[0].Name) + require.Greater(t, resp.Relays[0].BoundPort, 0) + require.Len(t, resp.InFlight, 1) +} + +func TestLocksEndpointShowsLifecycleBusy(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + server.cacheSuccessfulStart("blockchain:anvil:1337", "hash-a", map[string]any{"ok": true}) + server.storeRuntime("nodeset:workflow", runtimeState{ComponentType: ComponentTypeNodeSet}) + server.appendComponentLogs("nodeset:workflow", []string{"line-a"}) + server.beginInFlight("start:nodeset:workflow", inFlightOperationScopeLifecycle) + defer server.endInFlight("start:nodeset:workflow") + + req := httptest.NewRequest(http.MethodGet, "/v1/locks", nil) + rr := httptest.NewRecorder() + server.Handler().ServeHTTP(rr, req) + + require.Equal(t, http.StatusOK, rr.Code) + var resp AgentLocksResponse + require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &resp)) + require.True(t, resp.LifecycleBusy) + require.Equal(t, 1, resp.CacheEntries) + require.Equal(t, 1, resp.RuntimeEntries) + require.Equal(t, 1, resp.ComponentLogKeys) + require.Len(t, resp.InFlight, 1) +} + +func TestComponentLogsEndpointValidationAndLimit(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + server.appendComponentLogs("nodeset:workflow", []string{"line-a", "line-b", "line-c"}) + time.Sleep(1 * time.Millisecond) + + reqMissingKey := httptest.NewRequest(http.MethodGet, "/v1/components/logs", nil) + rrMissingKey := httptest.NewRecorder() + server.Handler().ServeHTTP(rrMissingKey, reqMissingKey) + require.Equal(t, http.StatusBadRequest, rrMissingKey.Code) + require.Contains(t, rrMissingKey.Body.String(), "componentKey query parameter is required") + + reqInvalidLimit := 
httptest.NewRequest(http.MethodGet, "/v1/components/logs?componentKey=nodeset:workflow&limit=abc", nil) + rrInvalidLimit := httptest.NewRecorder() + server.Handler().ServeHTTP(rrInvalidLimit, reqInvalidLimit) + require.Equal(t, http.StatusBadRequest, rrInvalidLimit.Code) + require.Contains(t, rrInvalidLimit.Body.String(), "limit query parameter must be a positive integer") + + req := httptest.NewRequest(http.MethodGet, "/v1/components/logs?componentKey=nodeset:workflow&limit=2", nil) + rr := httptest.NewRecorder() + server.Handler().ServeHTTP(rr, req) + require.Equal(t, http.StatusOK, rr.Code) + + var resp ComponentLogsResponse + require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &resp)) + require.Equal(t, "nodeset:workflow", resp.ComponentKey) + require.Equal(t, 3, resp.TotalLines) + require.Equal(t, []string{"line-b", "line-c"}, resp.Lines) +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go new file mode 100644 index 00000000000..31e7a4c2429 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go @@ -0,0 +1,108 @@ +package client + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "strings" + "time" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" +) + +const agentIntrospectionTimeout = 30 * time.Second + +func GetAgentStatus(ctx context.Context, runtime *Runtime) (*agent.AgentStatusResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + var response agent.AgentStatusResponse + if err := getAgentJSON(ctx, baseURL+"/v1/status", &response); err != nil { + return nil, err + } + return &response, nil +} + +func GetAgentLocks(ctx context.Context, runtime *Runtime) (*agent.AgentLocksResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + 
var response agent.AgentLocksResponse + if err := getAgentJSON(ctx, baseURL+"/v1/locks", &response); err != nil { + return nil, err + } + return &response, nil +} + +func GetComponentLogs(ctx context.Context, runtime *Runtime, componentKey string, limit int) (*agent.ComponentLogsResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + componentKey = strings.TrimSpace(componentKey) + if componentKey == "" { + return nil, fmt.Errorf("componentKey is required") + } + + q := url.Values{} + q.Set("componentKey", componentKey) + if limit > 0 { + q.Set("limit", strconv.Itoa(limit)) + } + + var response agent.ComponentLogsResponse + endpoint := baseURL + "/v1/components/logs?" + q.Encode() + if err := getAgentJSON(ctx, endpoint, &response); err != nil { + return nil, err + } + return &response, nil +} + +func runtimeBaseURL(runtime *Runtime) (string, error) { + if runtime == nil { + return "", fmt.Errorf("runtime is nil") + } + baseURL := strings.TrimSpace(runtime.AgentBaseURL) + if baseURL == "" { + return "", fmt.Errorf("runtime is missing agent base url") + } + return baseURL, nil +} + +func getAgentJSON(ctx context.Context, endpoint string, target any) error { + httpClient := &http.Client{Timeout: agentIntrospectionTimeout} + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return fmt.Errorf("failed to build agent request: %w", err) + } + resp, err := httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to call agent endpoint %s: %w", endpoint, err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read agent response from %s: %w", endpoint, err) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + var agentErr agent.StartComponentResponse + if len(body) > 0 && json.Unmarshal(body, &agentErr) == nil && strings.TrimSpace(agentErr.Error) != "" { + if agentErr.ErrorCode != "" { + return 
RemoteAgentError(agentErr.ErrorCode, agentErr.Error) + } + return RemoteAgentError("remote_agent_error", agentErr.Error) + } + return fmt.Errorf("agent endpoint %s returned %s: %s", endpoint, resp.Status, strings.TrimSpace(string(body))) + } + if err := json.Unmarshal(body, target); err != nil { + return fmt.Errorf("failed to decode agent response from %s: %w", endpoint, err) + } + return nil +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go new file mode 100644 index 00000000000..26a7f1cf314 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go @@ -0,0 +1,81 @@ +package client + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/stretchr/testify/require" +) + +func TestGetAgentStatusSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{UptimeSeconds: 7}) + })) + defer server.Close() + + resp, err := GetAgentStatus(context.Background(), &Runtime{AgentBaseURL: server.URL}) + require.NoError(t, err) + require.Equal(t, int64(7), resp.UptimeSeconds) +} + +func TestGetAgentLocksSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(agent.AgentLocksResponse{LifecycleBusy: true, RelayCount: 2}) + })) + defer server.Close() + + resp, err := GetAgentLocks(context.Background(), &Runtime{AgentBaseURL: server.URL}) + require.NoError(t, err) + require.True(t, resp.LifecycleBusy) + require.Equal(t, 2, resp.RelayCount) +} + +func TestGetComponentLogsSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r 
*http.Request) { + require.Equal(t, "nodeset:workflow", r.URL.Query().Get("componentKey")) + require.Equal(t, "5", r.URL.Query().Get("limit")) + _ = json.NewEncoder(w).Encode(agent.ComponentLogsResponse{ + ComponentKey: "nodeset:workflow", + TotalLines: 8, + Lines: []string{"a", "b"}, + }) + })) + defer server.Close() + + resp, err := GetComponentLogs(context.Background(), &Runtime{AgentBaseURL: server.URL}, "nodeset:workflow", 5) + require.NoError(t, err) + require.Equal(t, "nodeset:workflow", resp.ComponentKey) + require.Equal(t, 8, resp.TotalLines) + require.Equal(t, []string{"a", "b"}, resp.Lines) +} + +func TestGetComponentLogsRequiresComponentKey(t *testing.T) { + _, err := GetComponentLogs(context.Background(), &Runtime{AgentBaseURL: "http://127.0.0.1:1"}, "", 10) + require.Error(t, err) + require.Contains(t, err.Error(), "componentKey is required") +} + +func TestGetAgentStatusPropagatesAgentError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ + ErrorCode: "invalid_payload", + Error: "bad request", + }) + })) + defer server.Close() + + _, err := GetAgentStatus(context.Background(), &Runtime{AgentBaseURL: server.URL}) + require.Error(t, err) + require.Contains(t, err.Error(), "remote agent error (invalid_payload): bad request") +} + +func TestGetAgentStatusRequiresRuntime(t *testing.T) { + _, err := GetAgentStatus(context.Background(), nil) + require.Error(t, err) + require.Contains(t, err.Error(), "runtime is nil") +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go index 5041a47b746..1a3ec6b6902 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go @@ -32,6 +32,10 
@@ func TestDeployArtifactsToRemoteNodeSetNoFilesFails(t *testing.T) { w.WriteHeader(http.StatusOK) return } + if r.URL.Path == "/v1/status" { + _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"}) + return + } t.Fatalf("unexpected path %s", r.URL.Path) })) defer server.Close() @@ -53,6 +57,8 @@ func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { switch r.URL.Path { case "/v1/health": w.WriteHeader(http.StatusOK) + case "/v1/status": + _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"}) case "/v1/components/start": var envelope agent.StartComponentEnvelope require.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) diff --git a/system-tests/lib/cre/environment/remoteexec/client/compatibility.go b/system-tests/lib/cre/environment/remoteexec/client/compatibility.go new file mode 100644 index 00000000000..652adbcd72f --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/compatibility.go @@ -0,0 +1,67 @@ +package client + +import ( + "context" + "fmt" + "slices" + "strconv" + "strings" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" +) + +const clientProtocolVersion = "1.0.0" + +func CheckCompatibility(ctx context.Context, runtime *Runtime, requiredCapabilities []string) error { + status, err := GetAgentStatus(ctx, runtime) + if err != nil { + return err + } + return checkCompatibilityStatus(status, requiredCapabilities) +} + +func checkCompatibilityStatus(status *agent.AgentStatusResponse, requiredCapabilities []string) error { + if status == nil { + return fmt.Errorf("agent status is nil") + } + + if strings.TrimSpace(status.ProtocolVersion) != "" { + clientMajor, err := semverMajor(clientProtocolVersion) + if err != nil { + return err + } + agentMajor, err := semverMajor(status.ProtocolVersion) + if err != nil { + return fmt.Errorf("invalid agent protocolVersion %q: %w", status.ProtocolVersion, err) + } + if clientMajor != 
agentMajor { + return fmt.Errorf("incompatible protocol major versions: client=%s agent=%s", clientProtocolVersion, status.ProtocolVersion) + } + } + + if len(requiredCapabilities) == 0 || len(status.Capabilities) == 0 { + return nil + } + for _, required := range requiredCapabilities { + normalized := strings.TrimSpace(required) + if normalized == "" { + continue + } + if !slices.Contains(status.Capabilities, normalized) { + return fmt.Errorf("agent does not support required capability %q", normalized) + } + } + return nil +} + +func semverMajor(version string) (int, error) { + parts := strings.Split(strings.TrimSpace(version), ".") + if len(parts) == 0 || strings.TrimSpace(parts[0]) == "" { + return 0, fmt.Errorf("invalid semver: %q", version) + } + major, err := strconv.Atoi(parts[0]) + if err != nil { + return 0, fmt.Errorf("invalid semver major in %q: %w", version, err) + } + return major, nil +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go b/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go new file mode 100644 index 00000000000..f1890414aba --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go @@ -0,0 +1,50 @@ +package client + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/stretchr/testify/require" +) + +func TestCheckCompatibilityStatusAcceptsSameMajor(t *testing.T) { + err := checkCompatibilityStatus(&agent.AgentStatusResponse{ + ProtocolVersion: "1.3.0", + Capabilities: []string{"locks", "componentLogs"}, + }, []string{"locks"}) + require.NoError(t, err) +} + +func TestCheckCompatibilityStatusRejectsDifferentMajor(t *testing.T) { + err := checkCompatibilityStatus(&agent.AgentStatusResponse{ + ProtocolVersion: "2.0.0", + }, nil) + require.Error(t, err) + require.Contains(t, err.Error(), "incompatible protocol 
major versions") +} + +func TestCheckCompatibilityStatusRejectsMissingRequiredCapability(t *testing.T) { + err := checkCompatibilityStatus(&agent.AgentStatusResponse{ + ProtocolVersion: "1.0.0", + Capabilities: []string{"locks"}, + }, []string{"componentLogs"}) + require.Error(t, err) + require.Contains(t, err.Error(), `required capability "componentLogs"`) +} + +func TestCheckCompatibilityCallsStatusEndpoint(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ + ProtocolVersion: "1.0.0", + Capabilities: []string{"locks", "componentLogs"}, + }) + })) + defer server.Close() + + err := CheckCompatibility(context.Background(), &Runtime{AgentBaseURL: server.URL}, []string{"locks"}) + require.NoError(t, err) +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go index 0aa6b00fda2..ae982990a35 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go @@ -28,7 +28,7 @@ const ( ComponentTypeNodeSet = "nodeset" EnvEC2AgentURL = "CRE_EC2_AGENT_URL" EnvEC2AgentPort = "CRE_EC2_AGENT_PORT" - defaultEC2AgentPort = 8080 + defaultEC2AgentPort = 18080 ) type ComponentClient interface { @@ -71,11 +71,26 @@ func ResolveRuntime(testLogger zerolog.Logger) (*Runtime, error) { return nil, err } client := newEC2HTTPComponentClient(baseURL) - return &Runtime{ + runtime := &Runtime{ AgentBaseURL: baseURL, EC2HostIP: ec2HostIP, Client: client, - }, nil + } + + // Best-effort compatibility check: fail on definitive protocol incompatibility, + // but do not fail runtime resolution if status endpoint is temporarily unavailable. 
+ compatCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + status, statusErr := GetAgentStatus(compatCtx, runtime) + if statusErr != nil { + testLogger.Warn().Err(statusErr).Msg("skipping remote agent compatibility check (status unavailable)") + return runtime, nil + } + if compatErr := checkCompatibilityStatus(status, nil); compatErr != nil { + return nil, compatErr + } + + return runtime, nil } func NewComponentClient(runtime *Runtime) (ComponentClient, error) { diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go index 69504c97aab..9ee7198e883 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go @@ -108,6 +108,8 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing switch r.URL.Path { case "/v1/health": w.WriteHeader(http.StatusOK) + case "/v1/status": + require.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) case "/v1/components/start": resp := agent.StartComponentResponse{ ComponentType: ComponentTypeBlockchain, @@ -180,6 +182,9 @@ func newRemoteStopTestServer(t *testing.T) *httptest.Server { case "/v1/health": w.WriteHeader(http.StatusOK) return + case "/v1/status": + require.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) + return case "/v1/resources/ctf": _, _ = w.Write([]byte(`{"containers":["leftover-container"],"volumes":["leftover-volume"]}`)) return diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index 89c1174b149..64ec4fb25ce 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -12,7 +12,7 @@ This runbook covers the EC2-based remote mode for 
CRE where components can run e ## Core Environment Variables - `CRE_EC2_INSTANCE_ID=` (used by direct mode auto IP lookup) -- `CRE_EC2_AGENT_PORT=` (defaults to `8080`) +- `CRE_EC2_AGENT_PORT=` (defaults to `18080`) - `CRE_EC2_AGENT_URL=` (optional explicit override) - `CRE_EC2_HOST_IP=` (optional in direct mode; if missing, resolved from AWS CLI using instance ID) - `CRE_AWS_PROFILE=` (optional AWS auth profile) From 10a96d508a69f62a76a3886e2df52186640000dd Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 17:31:00 +0100 Subject: [PATCH 25/34] fix incoming gateway url resolution --- .../lib/cre/contracts/keystone_test.go | 8 ++-- system-tests/lib/cre/don/config/config.go | 28 +++++++---- .../lib/cre/don/config/config_test.go | 9 +++- system-tests/lib/cre/environment/dons.go | 28 +++++++---- system-tests/lib/cre/environment/dons_test.go | 28 +++++++---- .../lib/cre/environment/environment.go | 33 ++++++++++++- .../client/remote_component_client.go | 33 ++++++++++--- .../client/remote_component_client_test.go | 24 ++++++++++ system-tests/lib/cre/environment/state.go | 38 +++++++++++++++ .../lib/cre/environment/state_test.go | 47 +++++++++++++++++++ system-tests/lib/cre/types.go | 1 + 11 files changed, 240 insertions(+), 37 deletions(-) create mode 100644 system-tests/lib/cre/environment/state_test.go diff --git a/system-tests/lib/cre/contracts/keystone_test.go b/system-tests/lib/cre/contracts/keystone_test.go index 94ebc4cdbae..f72527626e0 100644 --- a/system-tests/lib/cre/contracts/keystone_test.go +++ b/system-tests/lib/cre/contracts/keystone_test.go @@ -234,7 +234,7 @@ func TestGenerateAdminAddresses(t *testing.T) { count := 10 addresses, err := generateAdminAddresses(count) require.NoError(t, err, "Expected no error, but got: %v", err) - require.Len(t, len(addresses), count, "Expected slice of length %d, but got %d", count, len(addresses)) + require.Len(t, addresses, count, "Expected slice of length %d, but got %d", count, len(addresses)) // 
Check for uniqueness and validity addressMap := make(map[common.Address]bool) @@ -243,7 +243,7 @@ func TestGenerateAdminAddresses(t *testing.T) { require.NotEqual(t, 0, addr.Cmp(common.HexToAddress("0x0000000000000000000000000000000000000000")), "Generated a zero address, which should be avoided") addressMap[addr] = true } - require.Len(t, len(addressMap), count, "Expected slice of length %d, but got %d", count, len(addressMap)) + require.Len(t, addressMap, count, "Expected slice of length %d, but got %d", count, len(addressMap)) }) // Test Case 2: Smallest Valid Input @@ -251,7 +251,7 @@ func TestGenerateAdminAddresses(t *testing.T) { count := 1 addresses, err := generateAdminAddresses(count) require.NoError(t, err, "Expected no error, but got: %v", err) - require.Len(t, len(addresses), count, "Expected slice of length %d, but got %d", count, len(addresses)) + require.Len(t, addresses, count, "Expected slice of length %d, but got %d", count, len(addresses)) }) // Test Case 3: Invalid Input (Zero and Negative Count) @@ -272,7 +272,7 @@ func TestGenerateAdminAddresses(t *testing.T) { count := 65536 addresses, err := generateAdminAddresses(count) require.NoError(t, err, "Expected no error, but got: %v", err) - require.Len(t, len(addresses), count, "Expected slice of length %d, but got %d", count, len(addresses)) + require.Len(t, addresses, count, "Expected slice of length %d, but got %d", count, len(addresses)) for _, addr := range addresses { require.True(t, common.IsHexAddress(addr.String()), "invalid address: %s", addr) diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 1e40366de9b..30b3a3617f9 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -52,6 +52,7 @@ func PrepareNodeTOMLs( creEnv *cre.Environment, nodeSets []*cre.NodeSet, configuredBlockchains []*envconfig.Blockchain, + remoteHostIP string, capabilities []cre.InstallableCapability, // Deprecated, 
use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -124,6 +125,7 @@ func PrepareNodeTOMLs( DonMetadata: donMetadata, Blockchains: chainPerSelector, BlockchainPlacementBySelector: blockchainPlacementBySelector, + RemoteHostIP: remoteHostIP, OCRBootstrapPlacement: ocrBootstrapPlacement, OCRBootstrapAnnouncePort: ocrBootstrapAnnouncePort, Flags: donMetadata.Flags, @@ -428,6 +430,7 @@ func addWorkerNodeConfig( ocrPeeringData.OCRBootstraperHost, ocrPeeringData.Port, ocrBootstrapAnnouncePort, + commonInputs.remoteHostIP, ) if bootstrapAddressErr != nil { return existingConfig, errors.Wrap(bootstrapAddressErr, "failed to resolve OCR bootstrapper address") @@ -557,7 +560,7 @@ func addWorkerNodeConfig( gateways := []coretoml.ConnectorGateway{} if topology != nil && len(topology.GatewayConnectors.Configurations) > 0 { for _, gateway := range topology.GatewayConnectors.Configurations { - connectorURL, urlErr := resolveGatewayConnectorURL(donMetadata.MustNodeSet().Placement, topology, gateway) + connectorURL, urlErr := resolveGatewayConnectorURL(donMetadata.MustNodeSet().Placement, topology, gateway, commonInputs.remoteHostIP) if urlErr != nil { return existingConfig, errors.Wrap(urlErr, "failed to resolve gateway connector url") } @@ -663,6 +666,7 @@ type versionedAddress struct { type commonInputs struct { registryChainID uint64 registryChainSelector uint64 + remoteHostIP string workflowRegistry versionedAddress capabilityRegistry versionedAddress @@ -704,6 +708,7 @@ func gatherCommonInputs(input cre.GenerateConfigsInput) (*commonInputs, error) { address: capabilitiesRegistryAddress, version: input.ContractVersions[keystone_changeset.CapabilitiesRegistry.String()], }, + remoteHostIP: input.RemoteHostIP, provider: input.Provider, }, nil } @@ -983,7 +988,7 @@ func resolveBootstrapAnnouncePort(topology *cre.Topology, bootstrapNodeUUID stri return 0, fmt.Errorf("failed to resolve 
bootstrap announce port for node UUID %s", bootstrapNodeUUID) } -func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost string, internalPort, externalPort int) (string, error) { +func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost string, internalPort, externalPort int, remoteHostIP string) (string, error) { caller, err := connectivity.PlacementFromTarget(callerPlacement) if err != nil { return "", err @@ -997,16 +1002,20 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot if !runtimecfg.IsDirectMode() { return "", fmt.Errorf("mixed DON bootstrap resolution requires direct mode") } - hostIP, err := runtimecfg.DirectHostIP() - if err != nil { - return "", err + hostIP := strings.TrimSpace(remoteHostIP) + if hostIP == "" { + var err error + hostIP, err = runtimecfg.DirectHostIP() + if err != nil { + return "", err + } } return net.JoinHostPort(hostIP, strconv.Itoa(externalPort)), nil } return cre.ResolveBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost, internalPort) } -func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topology, gateway *cre.DonGatewayConfiguration) (string, error) { +func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topology, gateway *cre.DonGatewayConfiguration, remoteHostIP string) (string, error) { if gateway == nil || gateway.GatewayConfiguration == nil { return "", fmt.Errorf("gateway configuration is nil") } @@ -1021,7 +1030,7 @@ func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topolog internalURL := fmt.Sprintf("ws://%s:%d%s", gateway.Outgoing.Host, gateway.Outgoing.Port, gateway.Outgoing.Path) - externalHost, err := gatewayExternalHost(targetPlacement) + externalHost, err := gatewayExternalHost(targetPlacement, remoteHostIP) if err != nil { return "", err } @@ -1078,12 +1087,15 @@ func resolveNodePlacement(topology *cre.Topology, nodeUUID string) 
(connectivity return "", fmt.Errorf("failed to resolve placement for node uuid %s", trimmedUUID) } -func gatewayExternalHost(targetPlacement connectivity.Placement) (string, error) { +func gatewayExternalHost(targetPlacement connectivity.Placement, remoteHostIP string) (string, error) { switch targetPlacement { case connectivity.PlacementRemote: if !runtimecfg.IsDirectMode() { return "", fmt.Errorf("gateway connector resolution for remote targets requires direct mode") } + if hostIP := strings.TrimSpace(remoteHostIP); hostIP != "" { + return hostIP, nil + } return runtimecfg.DirectHostIP() case connectivity.PlacementLocal: return strings.TrimPrefix(framework.HostDockerInternal(), "http://"), nil diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go index 47189efd86a..2ace1b8c9bc 100644 --- a/system-tests/lib/cre/don/config/config_test.go +++ b/system-tests/lib/cre/don/config/config_test.go @@ -52,13 +52,20 @@ func TestResolveGatewayConnectorURL_PlacementMatrix(t *testing.T) { t.Run(tt.name, func(t *testing.T) { topology, gateway := mustBuildGatewayTopology(t, tt.targetPlacement) - gotURL, err := resolveGatewayConnectorURL(tt.callerPlacement, topology, gateway) + gotURL, err := resolveGatewayConnectorURL(tt.callerPlacement, topology, gateway, "") require.NoError(t, err, "resolveGatewayConnectorURL should not fail") require.Equal(t, tt.wantURL, gotURL, "unexpected gateway connector URL") }) } } +func TestResolveGatewayConnectorURL_RemoteHostOverride(t *testing.T) { + topology, gateway := mustBuildGatewayTopology(t, "remote") + gotURL, err := resolveGatewayConnectorURL("local", topology, gateway, "203.0.113.22") + require.NoError(t, err, "resolveGatewayConnectorURL should use explicit remote host override") + require.Equal(t, "ws://203.0.113.22:5003/node", gotURL, "unexpected gateway connector URL") +} + func mustBuildGatewayTopology(t *testing.T, targetPlacement string) (*cre.Topology, 
*cre.DonGatewayConfiguration) { t.Helper() diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index fab1f71a105..1f117c14a69 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -125,6 +125,10 @@ func startDONsContainerized( nodeSets []*cre.NodeSet, remoteRuntime *remoteclient.Runtime, ) (*StartedDONs, error) { + if remoteRuntime != nil { + normalizeForExecution(topology, nodeSets, remoteRuntime.EC2HostIP) + } + // Skip binary operations for remote DONs. if infraInput.IsDocker() { for donIdx, donMetadata := range topology.DonsMetadata.List() { @@ -329,7 +333,7 @@ func startNodeSet( if err != nil { return nil, err } - if err := rewriteRemoteNodeSetOutputForLocalAccess(topology, configuredIndex, nodeSet, nodeset, remoteRuntime.EC2HostIP); err != nil { + if err := rewriteRemoteNodeSetOutputForLocalAccess(nodeset, remoteRuntime.EC2HostIP); err != nil { return nil, err } return nodeset, nil @@ -390,15 +394,11 @@ func validateRemoteNodeSetNodeSpecs(nodeSetName string, specs []*clnode.Input) e return nil } -func rewriteRemoteNodeSetOutputForLocalAccess(topology *cre.Topology, configuredIndex int, nodeSet *cre.NodeSet, output *ns.Output, ec2HostIP string) error { - if output == nil && (nodeSet == nil || nodeSet.DbInput == nil || nodeSet.DbInput.Port == 0) { +func rewriteRemoteNodeSetOutputForLocalAccess(output *ns.Output, ec2HostIP string) error { + if output == nil { return nil } - if err := rewriteNodeSetForDirectAccess(output, ec2HostIP); err != nil { - return err - } - rewriteGatewayIncomingForDirectAccess(topology, configuredIndex, ec2HostIP) - return nil + return rewriteNodeSetForDirectAccess(output, ec2HostIP) } func rewriteNodeSetForDirectAccess(output *ns.Output, ec2HostIP string) error { @@ -439,6 +439,18 @@ func rewriteGatewayIncomingForDirectAccess(topology *cre.Topology, configuredInd } } +func normalizeForExecution(topology *cre.Topology, nodeSets []*cre.NodeSet, 
ec2HostIP string) { + if topology == nil || len(nodeSets) == 0 || strings.TrimSpace(ec2HostIP) == "" { + return + } + for idx, nodeSet := range nodeSets { + if nodeSet == nil || strings.TrimSpace(nodeSet.Placement) != string(config.PlacementRemote) { + continue + } + rewriteGatewayIncomingForDirectAccess(topology, idx, ec2HostIP) + } +} + func FundNodes(ctx context.Context, testLogger zerolog.Logger, dons *cre.Dons, blockchains []blockchains.Blockchain, fundingAmountPerChainFamily map[string]uint64) error { for _, don := range dons.List() { testLogger.Info().Msgf("Funding nodes for DON %s", don.Name) diff --git a/system-tests/lib/cre/environment/dons_test.go b/system-tests/lib/cre/environment/dons_test.go index f460f924219..dc3f95de839 100644 --- a/system-tests/lib/cre/environment/dons_test.go +++ b/system-tests/lib/cre/environment/dons_test.go @@ -55,16 +55,13 @@ func TestBuildRemoteNodeSetInputRejectsImageAndBuildFieldsTogether(t *testing.T) } func TestRewriteRemoteNodeSetOutputForLocalAccess_LocalOnlyNoop(t *testing.T) { - err := rewriteRemoteNodeSetOutputForLocalAccess(nil, 0, nil, nil, "203.0.113.10") + err := rewriteRemoteNodeSetOutputForLocalAccess(nil, "203.0.113.10") require.NoError(t, err, "expected local-only no-op rewrite to succeed") } -func TestRewriteRemoteNodeSetOutputForLocalAccess_RemoteRewritesGatewayIncomingHost(t *testing.T) { +func TestNormalizeForExecution_RemoteRewritesGatewayIncomingHost(t *testing.T) { topology, nodeSet := mustBuildRemoteGatewayTopology(t) - output := &simple_node_set.Output{} - - err := rewriteRemoteNodeSetOutputForLocalAccess(topology, 0, nodeSet, output, "203.0.113.10") - require.NoError(t, err, "expected remote rewrite to succeed") + normalizeForExecution(topology, []*cre.NodeSet{nodeSet}, "203.0.113.10") require.NotNil(t, topology.GatewayConnectors) require.Len(t, topology.GatewayConnectors.Configurations, 1) @@ -76,8 +73,23 @@ func TestRewriteRemoteNodeSetOutputForLocalAccess_RemoteRewritesGatewayIncomingH ) } +func 
TestRewriteRemoteNodeSetOutputForLocalAccess_RemoteRewritesNodeExternalURL(t *testing.T) { + output := &simple_node_set.Output{ + CLNodes: []*clnode.Output{ + { + Node: &clnode.NodeOut{ + ExternalURL: "http://127.0.0.1:6688", + }, + }, + }, + } + + err := rewriteRemoteNodeSetOutputForLocalAccess(output, "203.0.113.10") + require.NoError(t, err, "expected remote rewrite to succeed") + require.Equal(t, "http://203.0.113.10:6688", output.CLNodes[0].Node.ExternalURL) +} + func TestRewriteRemoteNodeSetOutputForLocalAccess_InvalidNodeExternalURLFails(t *testing.T) { - topology, nodeSet := mustBuildRemoteGatewayTopology(t) output := &simple_node_set.Output{ CLNodes: []*clnode.Output{ { @@ -88,7 +100,7 @@ func TestRewriteRemoteNodeSetOutputForLocalAccess_InvalidNodeExternalURLFails(t }, } - err := rewriteRemoteNodeSetOutputForLocalAccess(topology, 0, nodeSet, output, "203.0.113.10") + err := rewriteRemoteNodeSetOutputForLocalAccess(output, "203.0.113.10") require.Error(t, err, "expected invalid node external URL to fail rewrite") require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") } diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 8ff4ce131e5..4a2a3a8deab 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -207,6 +207,7 @@ func SetupTestEnvironment( creEnvironment, input.NodeSets, input.Blockchains, + remoteHostIP(remoteRuntime), input.Capabilities, input.ConfigFactoryFunctions, ) @@ -454,6 +455,13 @@ func SetupTestEnvironment( }, nil } +func remoteHostIP(runtime *remoteclient.Runtime) string { + if runtime == nil { + return "" + } + return runtime.EC2HostIP +} + func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, blockchains []blockchains.Blockchain, jdOutput *jd.Output) { // append the nodeset output, so that later it can be stored in the cached output, so that we can use 
the environment again without running setup for idx, nsOut := range nodeSetOutput { @@ -496,7 +504,30 @@ func resolveRemoteRuntimeForSetup( if !hasRemoteComponents(blockchains, jdInput, nodeSets) { return nil, nil } - return remoteclient.ResolveRuntime(testLogger) + runtimeInput, err := resolveRemoteRuntimeInput() + if err != nil { + return nil, err + } + return remoteclient.ResolveRuntimeWithInput(testLogger, runtimeInput) +} + +func resolveRemoteRuntimeInput() (remoteclient.RuntimeInput, error) { + input := remoteclient.RuntimeInput{ + AgentBaseURL: strings.TrimSpace(os.Getenv(remoteclient.EnvEC2AgentURL)), + } + if configuredPort := strings.TrimSpace(os.Getenv(remoteclient.EnvEC2AgentPort)); configuredPort != "" { + parsedPort, err := strconv.Atoi(configuredPort) + if err != nil || parsedPort <= 0 || parsedPort > 65535 { + return remoteclient.RuntimeInput{}, fmt.Errorf("invalid %s: %q", remoteclient.EnvEC2AgentPort, configuredPort) + } + input.AgentPort = parsedPort + } + ec2HostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return remoteclient.RuntimeInput{}, err + } + input.EC2HostIP = ec2HostIP + return input, nil } type nodeSetPlacementSummary struct { diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go index ae982990a35..248182bdc9a 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go @@ -49,6 +49,12 @@ type Runtime struct { Client ComponentClient } +type RuntimeInput struct { + AgentBaseURL string + EC2HostIP string + AgentPort int +} + func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { return &httpComponentClient{ baseURL: baseURL, @@ -62,11 +68,15 @@ func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { } func ResolveRuntime(testLogger zerolog.Logger) (*Runtime, error) { - 
baseURL, err := resolveEC2AgentBaseURL(testLogger) + return ResolveRuntimeWithInput(testLogger, RuntimeInput{}) +} + +func ResolveRuntimeWithInput(testLogger zerolog.Logger, input RuntimeInput) (*Runtime, error) { + baseURL, err := resolveEC2AgentBaseURL(testLogger, input) if err != nil { return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) } - ec2HostIP, err := resolveEC2HostIP() + ec2HostIP, err := resolveEC2HostIP(input) if err != nil { return nil, err } @@ -238,15 +248,18 @@ func RemoteAgentError(code, message string) error { return fmt.Errorf("remote agent error (%s): %s", code, message) } -func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { +func resolveEC2AgentBaseURL(testLogger zerolog.Logger, input RuntimeInput) (string, error) { + if configured := strings.TrimSpace(input.AgentBaseURL); configured != "" { + return configured, nil + } if configured := strings.TrimSpace(os.Getenv(EnvEC2AgentURL)); configured != "" { return configured, nil } - remotePort, err := resolveEC2AgentPort() + remotePort, err := resolveEC2AgentPort(input) if err != nil { return "", err } - ec2HostIP, err := resolveEC2HostIP() + ec2HostIP, err := resolveEC2HostIP(input) if err != nil { return "", err } @@ -254,7 +267,10 @@ func resolveEC2AgentBaseURL(testLogger zerolog.Logger) (string, error) { return fmt.Sprintf("http://%s:%d", ec2HostIP, remotePort), nil } -func resolveEC2AgentPort() (int, error) { +func resolveEC2AgentPort(input RuntimeInput) (int, error) { + if input.AgentPort > 0 { + return input.AgentPort, nil + } remotePort := defaultEC2AgentPort if configuredPort := strings.TrimSpace(os.Getenv(EnvEC2AgentPort)); configuredPort != "" { parsedPort, err := strconv.Atoi(configuredPort) @@ -266,6 +282,9 @@ func resolveEC2AgentPort() (int, error) { return remotePort, nil } -func resolveEC2HostIP() (string, error) { +func resolveEC2HostIP(input RuntimeInput) (string, error) { + if configured := strings.TrimSpace(input.EC2HostIP); configured != 
"" { + return configured, nil + } return runtimecfg.DirectHostIP() } diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go index 9b06aa0575f..99912c30a67 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go @@ -27,6 +27,30 @@ func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { require.NotNil(t, runtime.Client, "expected resolved runtime to include component client") } +func TestResolveRemoteRuntimeWithInputOverridesEnv(t *testing.T) { + t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") + t.Setenv(runtimecfg.EnvEC2HostIP, "198.51.100.20") + t.Setenv(EnvEC2AgentPort, "19090") + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/v1/status", r.URL.Path) + _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ + ProtocolVersion: "1.0", + Capabilities: []string{"component_logs", "locks", "deploy_artifacts", "start_component", "relay", "list_ctf_resources"}, + }) + })) + defer server.Close() + + runtime, err := ResolveRuntimeWithInput(zerolog.Nop(), RuntimeInput{ + AgentBaseURL: server.URL, + EC2HostIP: "203.0.113.22", + AgentPort: 18081, + }) + require.NoError(t, err) + require.Equal(t, server.URL, runtime.AgentBaseURL) + require.Equal(t, "203.0.113.22", runtime.EC2HostIP) +} + func TestResolveRemoteRuntimeRequiresHostResolution(t *testing.T) { t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") t.Setenv(runtimecfg.EnvEC2HostIP, "") diff --git a/system-tests/lib/cre/environment/state.go b/system-tests/lib/cre/environment/state.go index b511281955e..e281fac1fcc 100644 --- a/system-tests/lib/cre/environment/state.go +++ b/system-tests/lib/cre/environment/state.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "strings" 
"github.com/gagliardetto/solana-go" "github.com/pkg/errors" @@ -21,6 +22,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" blockchain_sets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) // BuildFromSavedState rebuilds the CLDF environment and per‑chain clients from @@ -84,6 +86,9 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp if tErr != nil { return nil, nil, errors.Wrap(tErr, "failed to recreate topology from artifact") } + if rewriteErr := rewriteReconstructedGatewayIncomingHosts(cachedInput, topology); rewriteErr != nil { + return nil, nil, rewriteErr + } for idx, don := range cachedInput.NodeSets { startedDON, donErr := cre.NewDON(ctx, topology.DonsMetadata.List()[idx], cachedInput.NodeSets[idx].Out.CLNodes) @@ -142,6 +147,39 @@ func BuildFromSavedState(ctx context.Context, cldLogger logger.Logger, cachedInp }, dons, nil } +func rewriteReconstructedGatewayIncomingHosts(cachedInput *envconfig.Config, topology *cre.Topology) error { + if cachedInput == nil || topology == nil || topology.GatewayConnectors == nil || len(topology.GatewayConnectors.Configurations) == 0 { + return nil + } + + donsMetadata := topology.DonsMetadata.List() + hasRemoteGatewayNodeSet := false + for idx, nodeSet := range cachedInput.NodeSets { + if nodeSet == nil || !strings.EqualFold(strings.TrimSpace(nodeSet.Placement), string(envconfig.PlacementRemote)) { + continue + } + if idx >= len(donsMetadata) || donsMetadata[idx] == nil { + continue + } + if _, hasGateway := donsMetadata[idx].Gateway(); !hasGateway { + continue + } + hasRemoteGatewayNodeSet = true + break + } + if !hasRemoteGatewayNodeSet { + return nil + } + + ec2HostIP, err := runtimecfg.DirectHostIP() + if err != nil { + return 
errors.Wrap(err, "failed to resolve EC2 host IP for reconstructed gateway incoming host rewrite") + } + normalizeForExecution(topology, cachedInput.NodeSets, ec2HostIP) + + return nil +} + func SetDefaultPrivateKeyIfEmpty(defaultPrivateKey string) error { if os.Getenv("PRIVATE_KEY") == "" { setErr := os.Setenv("PRIVATE_KEY", defaultPrivateKey) diff --git a/system-tests/lib/cre/environment/state_test.go b/system-tests/lib/cre/environment/state_test.go new file mode 100644 index 00000000000..93464ed41d8 --- /dev/null +++ b/system-tests/lib/cre/environment/state_test.go @@ -0,0 +1,47 @@ +package environment + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" +) + +func TestRewriteReconstructedGatewayIncomingHosts_RemoteGatewayUsesEC2IP(t *testing.T) { + topology, nodeSet := mustBuildRemoteGatewayTopology(t) + cfg := &config.Config{ + NodeSets: []*cre.NodeSet{nodeSet}, + } + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + err := rewriteReconstructedGatewayIncomingHosts(cfg, topology) + require.NoError(t, err, "expected remote gateway incoming rewrite to succeed") + require.Equal( + t, + "203.0.113.10", + topology.GatewayConnectors.Configurations[0].Incoming.Host, + "expected reconstructed remote gateway incoming host to use EC2 IP", + ) +} + +func TestRewriteReconstructedGatewayIncomingHosts_LocalGatewayNoop(t *testing.T) { + topology, nodeSet := mustBuildRemoteGatewayTopology(t) + nodeSet.Placement = string(config.PlacementLocal) + cfg := &config.Config{ + NodeSets: []*cre.NodeSet{nodeSet}, + } + t.Setenv(runtimecfg.EnvEC2HostIP, "") + t.Setenv(runtimecfg.EnvEC2InstanceID, "") + + err := rewriteReconstructedGatewayIncomingHosts(cfg, topology) + require.NoError(t, err, "expected local gateway reconstruction rewrite to be a no-op") + 
require.Equal( + t, + "bootstrap-gateway-node0", + topology.GatewayConnectors.Configurations[0].Incoming.Host, + "expected local gateway incoming host to remain unchanged", + ) +} diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index a0dbf35751f..064a58ff5dd 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -457,6 +457,7 @@ type GenerateConfigsInput struct { DonMetadata *DonMetadata Blockchains map[uint64]blockchains.Blockchain BlockchainPlacementBySelector map[uint64]string + RemoteHostIP string OCRBootstrapPlacement string OCRBootstrapAnnouncePort int RegistryChainSelector uint64 From 79cbb2245909e4d2f107abdfe6d8f1a5f26fc0a3 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Wed, 25 Feb 2026 18:04:18 +0100 Subject: [PATCH 26/34] more clean up and cohesion increase --- .../lib/cre/connectivity/chooser_test.go | 61 ++-- .../lib/cre/don/config/config_test.go | 71 +++++ .../adapters/blockchain_adapter.go | 127 -------- .../adapters/blockchain_adapter_test.go | 50 ---- .../environment/adapters/tunnel_adapter.go | 10 - .../lib/cre/environment/address_rewrite.go | 44 +++ .../lib/cre/environment/blockchain_start.go | 54 ++-- .../cre/environment/blockchain_start_test.go | 2 +- system-tests/lib/cre/environment/dons.go | 46 ++- system-tests/lib/cre/environment/dons_test.go | 2 +- .../lib/cre/environment/environment.go | 92 +----- .../environment/environment_placement_test.go | 23 +- .../lib/cre/environment/execution_plan.go | 97 +++++++ system-tests/lib/cre/environment/jobs.go | 54 +--- .../environment/remoteexec/agent/server.go | 270 ------------------ .../remoteexec/agent/server_component_logs.go | 57 ++++ .../remoteexec/agent/server_state.go | 150 ++++++++++ .../agent/server_status_handlers.go | 83 ++++++ .../remote_component_descriptor_start.go | 47 +++ .../lib/cre/environment/state_test.go | 21 ++ .../cre/environment/tunnel/component_id.go | 27 -- .../environment/tunnel/component_id_test.go | 17 -- 
.../lib/cre/environment/tunnel/manager.go | 115 -------- .../cre/environment/tunnel/manager_test.go | 85 ------ .../cre/environment/tunnel/noop_manager.go | 26 -- .../cre/environment/tunnel/provider_ssm.go | 226 --------------- .../lib/cre/environment/tunnel/tunnel.go | 32 --- .../tests/smoke/cre/cre_suite_test.go | 2 +- 28 files changed, 688 insertions(+), 1203 deletions(-) delete mode 100644 system-tests/lib/cre/environment/adapters/blockchain_adapter.go delete mode 100644 system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go delete mode 100644 system-tests/lib/cre/environment/adapters/tunnel_adapter.go create mode 100644 system-tests/lib/cre/environment/address_rewrite.go create mode 100644 system-tests/lib/cre/environment/execution_plan.go create mode 100644 system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go create mode 100644 system-tests/lib/cre/environment/remoteexec/agent/server_state.go create mode 100644 system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go delete mode 100644 system-tests/lib/cre/environment/tunnel/component_id.go delete mode 100644 system-tests/lib/cre/environment/tunnel/component_id_test.go delete mode 100644 system-tests/lib/cre/environment/tunnel/manager.go delete mode 100644 system-tests/lib/cre/environment/tunnel/manager_test.go delete mode 100644 system-tests/lib/cre/environment/tunnel/noop_manager.go delete mode 100644 system-tests/lib/cre/environment/tunnel/provider_ssm.go delete mode 100644 system-tests/lib/cre/environment/tunnel/tunnel.go diff --git a/system-tests/lib/cre/connectivity/chooser_test.go b/system-tests/lib/cre/connectivity/chooser_test.go index 632c5b14bfb..cd451ec7c24 100644 --- a/system-tests/lib/cre/connectivity/chooser_test.go +++ b/system-tests/lib/cre/connectivity/chooser_test.go @@ -4,6 +4,8 @@ import ( "context" "errors" "testing" + + 
"github.com/stretchr/testify/require" ) func TestResolveSamePlacementUsesInternal(t *testing.T) { @@ -12,15 +14,10 @@ func TestResolveSamePlacementUsesInternal(t *testing.T) { Internal: "http://anvil:8545", External: "http://10.0.0.1:8545", }) - if err != nil { - t.Fatalf("expected resolve to succeed: %v", err) - } - if r.URL != "http://anvil:8545" || r.SelectedKind != "internal" { - t.Fatalf("unexpected resolution: %+v", r) - } - if r.RequiresBridge { - t.Fatalf("did not expect bridge requirement for same placement") - } + require.NoError(t, err, "expected resolve to succeed") + require.Equal(t, "http://anvil:8545", r.URL, "unexpected URL resolution") + require.Equal(t, "internal", r.SelectedKind, "unexpected endpoint kind") + require.False(t, r.RequiresBridge, "did not expect bridge requirement for same placement") } func TestResolveRemoteToLocalRequiresBridge(t *testing.T) { @@ -29,12 +26,21 @@ func TestResolveRemoteToLocalRequiresBridge(t *testing.T) { Internal: "jd:14231", External: "127.0.0.1:14231", }) - if err != nil { - t.Fatalf("expected resolve to succeed: %v", err) - } - if !r.RequiresBridge || r.BridgePort != 14231 { - t.Fatalf("expected bridge requirement with port 14231, got %+v", r) - } + require.NoError(t, err, "expected resolve to succeed") + require.True(t, r.RequiresBridge, "expected bridge requirement for remote caller to local target") + require.Equal(t, 14231, r.BridgePort, "unexpected bridge port") +} + +func TestResolveCrossPlacementLocalToRemoteUsesExternalWithoutBridge(t *testing.T) { + r, err := Resolve(PlacementLocal, PlacementRemote, EndpointPair{ + Name: "gateway", + Internal: "ws://gateway-node:5003/node", + External: "ws://203.0.113.10:5003/node", + }) + require.NoError(t, err, "expected cross-placement resolve to succeed") + require.Equal(t, "external", r.SelectedKind, "expected external URL for cross-placement") + require.Equal(t, "ws://203.0.113.10:5003/node", r.URL, "unexpected cross-placement URL") + require.False(t, 
r.RequiresBridge, "local caller to remote target should not require bridge") } func TestResolveAndEnsureReachableCallsEnsurer(t *testing.T) { @@ -45,20 +51,13 @@ func TestResolveAndEnsureReachableCallsEnsurer(t *testing.T) { External: "127.0.0.1:14231", }, func(_ context.Context, endpoint EndpointPair, port int) error { called = true - if endpoint.Name != "jd-grpc" || port != 14231 { - t.Fatalf("unexpected bridge args: endpoint=%s port=%d", endpoint.Name, port) - } + require.Equal(t, "jd-grpc", endpoint.Name, "unexpected endpoint name in bridge callback") + require.Equal(t, 14231, port, "unexpected port in bridge callback") return nil }) - if err != nil { - t.Fatalf("expected resolve+ensure to succeed: %v", err) - } - if !called { - t.Fatalf("expected bridge ensurer to be called") - } - if r.URL != "127.0.0.1:14231" { - t.Fatalf("unexpected resolution URL: %s", r.URL) - } + require.NoError(t, err, "expected resolve+ensure to succeed") + require.True(t, called, "expected bridge ensurer to be called") + require.Equal(t, "127.0.0.1:14231", r.URL, "unexpected resolution URL") } func TestResolveAndEnsureReachableFailsWithoutEnsurer(t *testing.T) { @@ -67,9 +66,7 @@ func TestResolveAndEnsureReachableFailsWithoutEnsurer(t *testing.T) { Internal: "jd:14231", External: "127.0.0.1:14231", }, nil) - if err == nil { - t.Fatalf("expected missing bridge ensurer to fail") - } + require.Error(t, err, "expected missing bridge ensurer to fail") } func TestResolveAndEnsureReachablePropagatesEnsurerError(t *testing.T) { @@ -80,7 +77,5 @@ func TestResolveAndEnsureReachablePropagatesEnsurerError(t *testing.T) { }, func(_ context.Context, _ EndpointPair, _ int) error { return errors.New("boom") }) - if err == nil { - t.Fatalf("expected ensurer error to be returned") - } + require.Error(t, err, "expected ensurer error to be returned") } diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go index 2ace1b8c9bc..9e149f684f3 100644 --- 
a/system-tests/lib/cre/don/config/config_test.go +++ b/system-tests/lib/cre/don/config/config_test.go @@ -66,6 +66,77 @@ func TestResolveGatewayConnectorURL_RemoteHostOverride(t *testing.T) { require.Equal(t, "ws://203.0.113.22:5003/node", gotURL, "unexpected gateway connector URL") } +func TestResolveNodeFacingBootstrapAddress_PlacementMatrix(t *testing.T) { + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + + tests := []struct { + name string + callerPlacement string + bootstrapPlacement string + bootstrapHost string + internalPort int + externalPort int + remoteHostIP string + want string + }{ + { + name: "local caller local bootstrap uses internal host", + callerPlacement: "local", + bootstrapPlacement: "local", + bootstrapHost: "bootstrap-node", + internalPort: 5001, + externalPort: 15001, + remoteHostIP: "203.0.113.10", + want: "bootstrap-node:5001", + }, + { + name: "local caller remote bootstrap uses external host override", + callerPlacement: "local", + bootstrapPlacement: "remote", + bootstrapHost: "bootstrap-node", + internalPort: 5001, + externalPort: 15001, + remoteHostIP: "203.0.113.10", + want: "203.0.113.10:15001", + }, + { + name: "remote caller local bootstrap uses docker host external", + callerPlacement: "remote", + bootstrapPlacement: "local", + bootstrapHost: "bootstrap-node", + internalPort: 5001, + externalPort: 15001, + remoteHostIP: "203.0.113.10", + want: strings.TrimPrefix(framework.HostDockerInternal(), "http://") + ":5001", + }, + { + name: "remote caller remote bootstrap uses internal host", + callerPlacement: "remote", + bootstrapPlacement: "remote", + bootstrapHost: "bootstrap-node", + internalPort: 5001, + externalPort: 15001, + remoteHostIP: "203.0.113.10", + want: "bootstrap-node:5001", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := resolveNodeFacingBootstrapAddress( + tt.callerPlacement, + tt.bootstrapPlacement, + tt.bootstrapHost, + tt.internalPort, + tt.externalPort, + 
tt.remoteHostIP, + ) + require.NoError(t, err, "resolveNodeFacingBootstrapAddress should not fail") + require.Equal(t, tt.want, got, "unexpected resolved bootstrap address") + }) + } +} + func mustBuildGatewayTopology(t *testing.T, targetPlacement string) (*cre.Topology, *cre.DonGatewayConfiguration) { t.Helper() diff --git a/system-tests/lib/cre/environment/adapters/blockchain_adapter.go b/system-tests/lib/cre/environment/adapters/blockchain_adapter.go deleted file mode 100644 index 0298203bc3e..00000000000 --- a/system-tests/lib/cre/environment/adapters/blockchain_adapter.go +++ /dev/null @@ -1,127 +0,0 @@ -package adapters - -import ( - "fmt" - "net/url" - "strconv" - - "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" -) - -type BlockchainAdapter struct{} - -func NewBlockchainAdapter() *BlockchainAdapter { - return &BlockchainAdapter{} -} - -func (a *BlockchainAdapter) DescribeEndpoints(componentID string, output *blockchain.Output) ([]tunnel.EndpointRef, error) { - if output == nil { - return nil, fmt.Errorf("blockchain output is nil") - } - - refs := make([]tunnel.EndpointRef, 0, len(output.Nodes)*2) - for idx := range output.Nodes { - node := output.Nodes[idx] - - httpRef, err := endpointFromURL(componentID, fmt.Sprintf("node-%d-http", idx), node.ExternalHTTPUrl) - if err != nil { - return nil, err - } - if httpRef != nil { - refs = append(refs, *httpRef) - } - - wsRef, err := endpointFromURL(componentID, fmt.Sprintf("node-%d-ws", idx), node.ExternalWSUrl) - if err != nil { - return nil, err - } - if wsRef != nil { - refs = append(refs, *wsRef) - } - } - - return refs, nil -} - -func (a *BlockchainAdapter) RewriteWithBindings(output *blockchain.Output, bindings []tunnel.TunnelBinding) error { - if output == nil { - return fmt.Errorf("blockchain output is nil") - } - - byName := make(map[string]tunnel.TunnelBinding, len(bindings)) - for _, 
b := range bindings { - byName[b.EndpointName] = b - } - - for idx := range output.Nodes { - httpKey := fmt.Sprintf("node-%d-http", idx) - if output.Nodes[idx].ExternalHTTPUrl != "" { - b, ok := byName[httpKey] - if !ok { - return fmt.Errorf("missing tunnel binding for %s", httpKey) - } - output.Nodes[idx].ExternalHTTPUrl = b.LocalURL - } - - wsKey := fmt.Sprintf("node-%d-ws", idx) - if output.Nodes[idx].ExternalWSUrl != "" { - b, ok := byName[wsKey] - if !ok { - return fmt.Errorf("missing tunnel binding for %s", wsKey) - } - output.Nodes[idx].ExternalWSUrl = b.LocalURL - } - } - - return nil -} - -func endpointFromURL(componentID, endpointName, rawURL string) (*tunnel.EndpointRef, error) { - if rawURL == "" { - return nil, nil - } - - parsed, err := url.Parse(rawURL) - if err != nil { - return nil, fmt.Errorf("failed to parse endpoint url %q: %w", rawURL, err) - } - - host := parsed.Hostname() - if host == "" { - return nil, fmt.Errorf("endpoint url %q has empty hostname", rawURL) - } - - port, err := resolvePort(parsed) - if err != nil { - return nil, err - } - - return &tunnel.EndpointRef{ - ComponentID: componentID, - EndpointName: endpointName, - Scheme: parsed.Scheme, - Host: host, - Port: port, - OriginalURL: rawURL, - }, nil -} - -func resolvePort(parsed *url.URL) (int, error) { - if parsed.Port() != "" { - port, err := strconv.Atoi(parsed.Port()) - if err != nil || port <= 0 || port > 65535 { - return 0, fmt.Errorf("url %q has invalid port %q", parsed.String(), parsed.Port()) - } - return port, nil - } - - switch parsed.Scheme { - case "http", "ws": - return 80, nil - case "https", "wss": - return 443, nil - default: - return 0, fmt.Errorf("url %q has unsupported scheme %q without explicit port", parsed.String(), parsed.Scheme) - } -} diff --git a/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go b/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go deleted file mode 100644 index c7dc743661c..00000000000 --- 
a/system-tests/lib/cre/environment/adapters/blockchain_adapter_test.go +++ /dev/null @@ -1,50 +0,0 @@ -package adapters - -import ( - "testing" - - "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" -) - -func TestBlockchainAdapterDescribeAndRewrite(t *testing.T) { - adapter := NewBlockchainAdapter() - out := &blockchain.Output{ - Nodes: []*blockchain.Node{ - { - ExternalHTTPUrl: "http://10.0.0.10:8545", - ExternalWSUrl: "ws://10.0.0.10:8546", - }, - }, - } - - refs, err := adapter.DescribeEndpoints("blockchain:0:anvil", out) - if err != nil { - t.Fatalf("expected describe to succeed: %v", err) - } - if len(refs) != 2 { - t.Fatalf("expected two endpoint refs, got %d", len(refs)) - } - - bindings := []tunnel.TunnelBinding{ - { - EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-http"}, - LocalURL: "http://127.0.0.1:18080", - }, - { - EndpointRef: tunnel.EndpointRef{EndpointName: "node-0-ws"}, - LocalURL: "ws://127.0.0.1:18081", - }, - } - - if err := adapter.RewriteWithBindings(out, bindings); err != nil { - t.Fatalf("expected rewrite to succeed: %v", err) - } - - if out.Nodes[0].ExternalHTTPUrl != "http://127.0.0.1:18080" { - t.Fatalf("unexpected rewritten http url: %s", out.Nodes[0].ExternalHTTPUrl) - } - if out.Nodes[0].ExternalWSUrl != "ws://127.0.0.1:18081" { - t.Fatalf("unexpected rewritten ws url: %s", out.Nodes[0].ExternalWSUrl) - } -} diff --git a/system-tests/lib/cre/environment/adapters/tunnel_adapter.go b/system-tests/lib/cre/environment/adapters/tunnel_adapter.go deleted file mode 100644 index 9d838831b30..00000000000 --- a/system-tests/lib/cre/environment/adapters/tunnel_adapter.go +++ /dev/null @@ -1,10 +0,0 @@ -package adapters - -import ( - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/tunnel" -) - -type TunnelAdapter[T any] interface { - DescribeEndpoints(componentID string, output *T) 
([]tunnel.EndpointRef, error) - RewriteWithBindings(output *T, bindings []tunnel.TunnelBinding) error -} diff --git a/system-tests/lib/cre/environment/address_rewrite.go b/system-tests/lib/cre/environment/address_rewrite.go new file mode 100644 index 00000000000..7c5a281342d --- /dev/null +++ b/system-tests/lib/cre/environment/address_rewrite.go @@ -0,0 +1,44 @@ +package environment + +import ( + "fmt" + "net" + "net/url" + "strings" +) + +func rewriteAddressHost(rawAddress, host string) (string, error) { + return rewriteAddressHostWithPolicy(rawAddress, host, true) +} + +func rewriteURLHost(rawURL, host string) (string, error) { + return rewriteAddressHostWithPolicy(rawURL, host, false) +} + +func rewriteAddressHostWithPolicy(rawAddress, host string, requireExplicitPort bool) (string, error) { + trimmed := strings.TrimSpace(rawAddress) + if trimmed == "" { + return "", nil + } + if strings.Contains(trimmed, "://") { + parsed, err := url.Parse(trimmed) + if err != nil { + return "", fmt.Errorf("failed to parse address %q: %w", rawAddress, err) + } + port := parsed.Port() + if port == "" { + if requireExplicitPort { + return "", fmt.Errorf("address %q must include a port", rawAddress) + } + parsed.Host = host + return parsed.String(), nil + } + parsed.Host = net.JoinHostPort(host, port) + return parsed.String(), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil { + return "", fmt.Errorf("failed to parse host:port %q: %w", rawAddress, err) + } + return net.JoinHostPort(host, port), nil +} diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index 315e9c51f54..a97aad182a1 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -4,8 +4,6 @@ import ( "context" "errors" "fmt" - "net" - "net/url" pkgerrors "github.com/pkg/errors" "github.com/rs/zerolog" @@ -67,30 +65,33 @@ func startBlockchains( var deployedOutput 
*blockchain.Output if configured.Placement == config.PlacementRemote { - if err := validateRemoteBlockchainInput(input); err != nil { - return nil, err - } - payload := agent.StartComponentPayload{ - ComponentType: remoteclient.ComponentTypeBlockchain, - Blockchain: input, - ReusePolicy: string(configured.RemoteStartPolicy), - } - deployedOutput, err = remoteclient.StartRemoteComponent[blockchain.Output]( + deployedOutput, err = remoteclient.StartWithRuntimeDescriptor( ctx, testLogger, - remoteRuntime.Client, - payload, - remoteclient.ComponentTypeBlockchain, + remoteRuntime, + remoteclient.StartDescriptor[blockchain.Output]{ + ComponentType: remoteclient.ComponentTypeBlockchain, + BuildPayload: func() (agent.StartComponentPayload, error) { + if err := validateRemoteBlockchainInput(input); err != nil { + return agent.StartComponentPayload{}, err + } + return agent.StartComponentPayload{ + ComponentType: remoteclient.ComponentTypeBlockchain, + Blockchain: input, + ReusePolicy: string(configured.RemoteStartPolicy), + }, nil + }, + Rewrite: func(output *blockchain.Output, ec2HostIP string) error { + if rewriteInternalForLocalNodes { + // direct mode keeps internal URLs unchanged + } + return rewriteRemoteBlockchainOutputForDirectAccess(output, ec2HostIP) + }, + }, ) if err != nil { return nil, err } - if rewriteInternalForLocalNodes { - // direct mode keeps internal URLs unchanged - } - if err := rewriteRemoteBlockchainOutputForDirectAccess(deployedOutput, remoteRuntime.EC2HostIP); err != nil { - return nil, err - } } else { deployedOutput, err = blockchains.StartChain(ctx, deployers, input) if err != nil { @@ -148,16 +149,3 @@ func rewriteRemoteBlockchainOutputForDirectAccess(output *blockchain.Output, ec2 } return nil } - -func rewriteURLHost(rawURL, host string) (string, error) { - parsed, err := url.Parse(rawURL) - if err != nil { - return "", fmt.Errorf("failed to parse url %q: %w", rawURL, err) - } - if parsed.Port() != "" { - parsed.Host = 
net.JoinHostPort(host, parsed.Port()) - return parsed.String(), nil - } - parsed.Host = host - return parsed.String(), nil -} diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index e581eaae491..faefe6f9f44 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -111,7 +111,7 @@ func TestRewriteRemoteBlockchainOutputForDirectAccess_InvalidExternalURL(t *test err := rewriteRemoteBlockchainOutputForDirectAccess(out, "203.0.113.10") require.Error(t, err, "expected invalid external URL to fail rewrite") - require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") + require.Contains(t, err.Error(), "failed to parse address", "expected parse failure context") } func TestRemoteAgentErrorFormatting(t *testing.T) { diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index 1f117c14a69..ac7e6bf9af5 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -306,36 +306,34 @@ func startNodeSet( } if strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { - if remoteRuntime == nil { - return nil, errors.New("remote runtime is required for remote nodeset placement") - } - registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) - if err != nil { - return nil, pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") - } - remoteInput, err := buildRemoteNodeSetInput(nodeSet) - if err != nil { - return nil, err - } - payload := agent.StartComponentPayload{ - ComponentType: remoteclient.ComponentTypeNodeSet, - NodeSet: remoteInput, - RegistryBlockchain: registryChainPayload, - ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), - } - nodeset, err := remoteclient.StartRemoteComponent[ns.Output]( + nodeset, err := 
remoteclient.StartWithRuntimeDescriptor( ctx, lggr, - remoteRuntime.Client, - payload, - remoteclient.ComponentTypeNodeSet, + remoteRuntime, + remoteclient.StartDescriptor[ns.Output]{ + ComponentType: remoteclient.ComponentTypeNodeSet, + BuildPayload: func() (agent.StartComponentPayload, error) { + registryChainPayload, err := agent.EncodeForTransport(registryChainBlockchainOutput) + if err != nil { + return agent.StartComponentPayload{}, pkgerrors.Wrap(err, "failed to encode registry blockchain payload for remote nodeset start") + } + remoteInput, err := buildRemoteNodeSetInput(nodeSet) + if err != nil { + return agent.StartComponentPayload{}, err + } + return agent.StartComponentPayload{ + ComponentType: remoteclient.ComponentTypeNodeSet, + NodeSet: remoteInput, + RegistryBlockchain: registryChainPayload, + ReusePolicy: nodeSetRemoteStartPolicy(nodeSet), + }, nil + }, + Rewrite: rewriteRemoteNodeSetOutputForLocalAccess, + }, ) if err != nil { return nil, err } - if err := rewriteRemoteNodeSetOutputForLocalAccess(nodeset, remoteRuntime.EC2HostIP); err != nil { - return nil, err - } return nodeset, nil } diff --git a/system-tests/lib/cre/environment/dons_test.go b/system-tests/lib/cre/environment/dons_test.go index dc3f95de839..4b3daa42123 100644 --- a/system-tests/lib/cre/environment/dons_test.go +++ b/system-tests/lib/cre/environment/dons_test.go @@ -102,7 +102,7 @@ func TestRewriteRemoteNodeSetOutputForLocalAccess_InvalidNodeExternalURLFails(t err := rewriteRemoteNodeSetOutputForLocalAccess(output, "203.0.113.10") require.Error(t, err, "expected invalid node external URL to fail rewrite") - require.Contains(t, err.Error(), "failed to parse url", "expected parse failure context") + require.Contains(t, err.Error(), "failed to parse address", "expected parse failure context") } func mustBuildRemoteGatewayTopology(t *testing.T) (*cre.Topology, *cre.NodeSet) { diff --git a/system-tests/lib/cre/environment/environment.go 
b/system-tests/lib/cre/environment/environment.go index 4a2a3a8deab..833921fef54 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -133,11 +133,8 @@ func SetupTestEnvironment( if err := input.Validate(); err != nil { return nil, pkgerrors.Wrap(err, "input validation failed") } - nodeSetPlacement, err := summarizeNodeSetPlacement(input.NodeSets) + execPlan, err := buildExecutionPlan(input.Blockchains, input.JdInput, input.NodeSets) if err != nil { - return nil, pkgerrors.Wrap(err, "nodeset placement validation failed") - } - if err := validateUnsupportedPlacements(input.Blockchains, nodeSetPlacement); err != nil { return nil, pkgerrors.Wrap(err, "invalid component placement") } @@ -146,7 +143,7 @@ func SetupTestEnvironment( return nil, pkgerrors.Wrap(s3Err, "failed to start S3 provider") } - remoteRuntime, err := resolveRemoteRuntimeForSetup(testLogger, input.Blockchains, input.JdInput, input.NodeSets) + remoteRuntime, err := resolveRemoteRuntimeForSetup(testLogger, execPlan) if err != nil { return nil, pkgerrors.Wrap(err, "failed to resolve remote runtime settings") } @@ -161,7 +158,7 @@ func SetupTestEnvironment( input.Blockchains, input.BlockchainDeployers, remoteRuntime, - nodeSetPlacement.HasLocalTargets, + execPlan.NodeSetPlacement.HasLocalTargets, ) if startErr != nil { return nil, pkgerrors.Wrap(startErr, "failed to start blockchains") @@ -200,6 +197,10 @@ func SetupTestEnvironment( if tErr != nil { return nil, pkgerrors.Wrap(tErr, "failed to create topology") } + remoteHostIP := "" + if remoteRuntime != nil { + remoteHostIP = remoteRuntime.EC2HostIP + } updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( ctx, @@ -207,7 +208,7 @@ func SetupTestEnvironment( creEnvironment, input.NodeSets, input.Blockchains, - remoteHostIP(remoteRuntime), + remoteHostIP, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -455,13 +456,6 @@ func SetupTestEnvironment( }, nil } -func remoteHostIP(runtime 
*remoteclient.Runtime) string { - if runtime == nil { - return "" - } - return runtime.EC2HostIP -} - func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, blockchains []blockchains.Blockchain, jdOutput *jd.Output) { // append the nodeset output, so that later it can be stored in the cached output, so that we can use the environment again without running setup for idx, nsOut := range nodeSetOutput { @@ -478,30 +472,11 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, input.JdInput.Out = jdOutput } -func hasRemoteComponents(blockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet) bool { - for _, configuredBlockchain := range blockchains { - if configuredBlockchain != nil && configuredBlockchain.Placement == config.PlacementRemote { - return true - } - } - if jdInput != nil && jdInput.Placement == config.PlacementRemote { - return true - } - for _, nodeSet := range nodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { - return true - } - } - return false -} - func resolveRemoteRuntimeForSetup( testLogger zerolog.Logger, - blockchains []*config.Blockchain, - jdInput *config.JobDistributor, - nodeSets []*cre.NodeSet, + execPlan *executionPlan, ) (*remoteclient.Runtime, error) { - if !hasRemoteComponents(blockchains, jdInput, nodeSets) { + if execPlan == nil || !execPlan.HasRemoteComponents { return nil, nil } runtimeInput, err := resolveRemoteRuntimeInput() @@ -530,53 +505,6 @@ func resolveRemoteRuntimeInput() (remoteclient.RuntimeInput, error) { return input, nil } -type nodeSetPlacementSummary struct { - HasLocalTargets bool - HasRemoteTargets bool -} - -func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummary, error) { - summary := &nodeSetPlacementSummary{} - for _, nodeSet := range nodeSets { - if nodeSet == nil { - continue - } - configPlacement := strings.TrimSpace(nodeSet.Placement) - if 
configPlacement == "" || configPlacement == string(config.PlacementLocal) { - summary.HasLocalTargets = true - continue - } - if configPlacement == string(config.PlacementRemote) { - summary.HasRemoteTargets = true - continue - } - return nil, fmt.Errorf("invalid nodeset placement: %s", nodeSet.Placement) - } - - return summary, nil -} - -func validateUnsupportedPlacements( - configuredBlockchains []*config.Blockchain, - nodeSetPlacement *nodeSetPlacementSummary, -) error { - if nodeSetPlacement == nil || !nodeSetPlacement.HasRemoteTargets { - return nil - } - for _, bc := range configuredBlockchains { - if bc == nil { - continue - } - if bc.Placement == config.PlacementLocal { - return errors.New( - "remote nodesets with local blockchains are not supported in this PoC. " + - "Set all blockchains to placement=remote, or run nodesets with placement=local so nodes stay colocated with local blockchains", - ) - } - } - return nil -} - func verifyRemoteToLocalBootstrapReachability(ctx context.Context, lggr zerolog.Logger, topology *cre.Topology) error { if topology == nil { return nil diff --git a/system-tests/lib/cre/environment/environment_placement_test.go b/system-tests/lib/cre/environment/environment_placement_test.go index 290f6b1884b..fd7fa901a51 100644 --- a/system-tests/lib/cre/environment/environment_placement_test.go +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -67,12 +67,31 @@ func TestHasRemoteComponents(t *testing.T) { } func TestResolveRemoteRuntimeForSetupSkipsResolutionWhenNoRemoteComponents(t *testing.T) { - runtime, err := resolveRemoteRuntimeForSetup( - zerolog.Nop(), + execPlan, planErr := buildExecutionPlan( []*config.Blockchain{{Placement: config.PlacementLocal}}, &config.JobDistributor{Placement: config.PlacementLocal}, []*cre.NodeSet{{Placement: "local"}}, ) + require.NoError(t, planErr) + + runtime, err := resolveRemoteRuntimeForSetup( + zerolog.Nop(), + execPlan, + ) require.NoError(t, err) require.Nil(t, runtime, 
"expected nil runtime when no remote components are configured") } + +func TestBuildExecutionPlanIncludesPlacementAndRemoteFlags(t *testing.T) { + execPlan, err := buildExecutionPlan( + []*config.Blockchain{{Placement: config.PlacementRemote}}, + &config.JobDistributor{Placement: config.PlacementLocal}, + []*cre.NodeSet{{Placement: "local"}, {Placement: "remote"}}, + ) + require.NoError(t, err, "expected execution plan build to succeed") + require.NotNil(t, execPlan, "expected non-nil execution plan") + require.NotNil(t, execPlan.NodeSetPlacement, "expected nodeset placement summary") + require.True(t, execPlan.NodeSetPlacement.HasLocalTargets, "expected local nodeset placement") + require.True(t, execPlan.NodeSetPlacement.HasRemoteTargets, "expected remote nodeset placement") + require.True(t, execPlan.HasRemoteComponents, "expected remote components flag") +} diff --git a/system-tests/lib/cre/environment/execution_plan.go b/system-tests/lib/cre/environment/execution_plan.go new file mode 100644 index 00000000000..5d242e14763 --- /dev/null +++ b/system-tests/lib/cre/environment/execution_plan.go @@ -0,0 +1,97 @@ +package environment + +import ( + "fmt" + "strings" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" +) + +type executionPlan struct { + NodeSetPlacement *nodeSetPlacementSummary + HasRemoteComponents bool +} + +type nodeSetPlacementSummary struct { + HasLocalTargets bool + HasRemoteTargets bool +} + +func buildExecutionPlan( + configuredBlockchains []*config.Blockchain, + jdInput *config.JobDistributor, + nodeSets []*cre.NodeSet, +) (*executionPlan, error) { + nodeSetPlacement, err := summarizeNodeSetPlacement(nodeSets) + if err != nil { + return nil, err + } + if err := validateUnsupportedPlacements(configuredBlockchains, nodeSetPlacement); err != nil { + return nil, err + } + + return &executionPlan{ + NodeSetPlacement: nodeSetPlacement, + 
HasRemoteComponents: hasRemoteComponents(configuredBlockchains, jdInput, nodeSets), + }, nil +} + +func hasRemoteComponents(blockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet) bool { + for _, configuredBlockchain := range blockchains { + if configuredBlockchain != nil && configuredBlockchain.Placement == config.PlacementRemote { + return true + } + } + if jdInput != nil && jdInput.Placement == config.PlacementRemote { + return true + } + for _, nodeSet := range nodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(config.PlacementRemote) { + return true + } + } + return false +} + +func summarizeNodeSetPlacement(nodeSets []*cre.NodeSet) (*nodeSetPlacementSummary, error) { + summary := &nodeSetPlacementSummary{} + for _, nodeSet := range nodeSets { + if nodeSet == nil { + continue + } + configPlacement := strings.TrimSpace(nodeSet.Placement) + if configPlacement == "" || configPlacement == string(config.PlacementLocal) { + summary.HasLocalTargets = true + continue + } + if configPlacement == string(config.PlacementRemote) { + summary.HasRemoteTargets = true + continue + } + return nil, fmt.Errorf("invalid nodeset placement: %s", nodeSet.Placement) + } + + return summary, nil +} + +func validateUnsupportedPlacements( + configuredBlockchains []*config.Blockchain, + nodeSetPlacement *nodeSetPlacementSummary, +) error { + if nodeSetPlacement == nil || !nodeSetPlacement.HasRemoteTargets { + return nil + } + for _, bc := range configuredBlockchains { + if bc == nil { + continue + } + if bc.Placement == config.PlacementLocal { + return fmt.Errorf( + "remote nodesets with local blockchains are not supported in this PoC. 
" + + "Set all blockchains to placement=remote, or run nodesets with placement=local so nodes stay colocated with local blockchains", + ) + } + } + return nil +} diff --git a/system-tests/lib/cre/environment/jobs.go b/system-tests/lib/cre/environment/jobs.go index 8e0cb06a3ed..32aec3f9626 100644 --- a/system-tests/lib/cre/environment/jobs.go +++ b/system-tests/lib/cre/environment/jobs.go @@ -4,8 +4,6 @@ import ( "context" "errors" "fmt" - "net" - "net/url" "strings" "time" @@ -77,27 +75,25 @@ func StartJD( switch { case jdConfig.Placement == config.PlacementRemote: - if remoteRuntime == nil { - return nil, errors.New("remote runtime is required when starting remote jd") - } - payload := agent.StartComponentPayload{ - ComponentType: remoteclient.ComponentTypeJD, - JD: jdConfig.InputRef(), - ReusePolicy: string(jdConfig.RemoteStartPolicy), - } - jdOutput, jdErr = remoteclient.StartRemoteComponent[jd.Output]( + jdOutput, jdErr = remoteclient.StartWithRuntimeDescriptor( ctx, lggr, - remoteRuntime.Client, - payload, - remoteclient.ComponentTypeJD, + remoteRuntime, + remoteclient.StartDescriptor[jd.Output]{ + ComponentType: remoteclient.ComponentTypeJD, + BuildPayload: func() (agent.StartComponentPayload, error) { + return agent.StartComponentPayload{ + ComponentType: remoteclient.ComponentTypeJD, + JD: jdConfig.InputRef(), + ReusePolicy: string(jdConfig.RemoteStartPolicy), + }, nil + }, + Rewrite: rewriteJDForDirectAccess, + }, ) if jdErr != nil { return nil, jdErr } - if err := rewriteJDForDirectAccess(jdOutput, remoteRuntime.EC2HostIP); err != nil { - return nil, err - } case infraInput.IsKubernetes(): // For Kubernetes, JD is already running in the cluster, generate service URLs lggr.Info().Msg("Generating Kubernetes service URLs for Job Distributor (already running in cluster)") @@ -174,27 +170,3 @@ func rewriteJDForDirectAccess(output *jd.Output, ec2HostIP string) error { } return nil } - -func rewriteAddressHost(rawAddress, host string) (string, error) { - trimmed 
:= strings.TrimSpace(rawAddress) - if trimmed == "" { - return "", nil - } - if strings.Contains(trimmed, "://") { - parsed, err := url.Parse(trimmed) - if err != nil { - return "", fmt.Errorf("failed to parse address %q: %w", rawAddress, err) - } - port := parsed.Port() - if port == "" { - return "", fmt.Errorf("address %q must include a port", rawAddress) - } - parsed.Host = net.JoinHostPort(host, port) - return parsed.String(), nil - } - _, port, err := net.SplitHostPort(trimmed) - if err != nil { - return "", fmt.Errorf("failed to parse host:port %q: %w", rawAddress, err) - } - return net.JoinHostPort(host, port), nil -} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server.go b/system-tests/lib/cre/environment/remoteexec/agent/server.go index 98db0e43a88..f50534f88d7 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server.go @@ -14,7 +14,6 @@ import ( "os" "path/filepath" "slices" - "strconv" "strings" "sync" "time" @@ -279,81 +278,6 @@ func (s *Server) listCTFResources(w http.ResponseWriter, r *http.Request) { }) } -func (s *Server) status(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) - return - } - - runtimeKeys := s.runtimeKeys() - cacheKeys := s.cacheKeys() - relayInfos := s.relayInfos() - componentLogKeys := s.componentLogKeys() - inFlight, _ := s.inFlightSnapshot() - - s.respondJSONAny(w, http.StatusOK, AgentStatusResponse{ - AgentVersion: agentVersion, - ProtocolVersion: protocolVersion, - SupportedSchemas: []string{SchemaVersionV1}, - Capabilities: []string{capabilityStartComponent, capabilityDeployArtifacts, capabilityRelay, capabilityListCTFResources, capabilityLocks, capabilityComponentLogs}, - UptimeSeconds: int64(time.Since(s.startedAt).Seconds()), - RuntimeComponents: runtimeKeys, - CachedComponents: cacheKeys, - Relays: 
relayInfos, - ComponentLogKeys: componentLogKeys, - InFlight: inFlight, - }) -} - -func (s *Server) locks(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) - return - } - - inFlight, lifecycleBusy := s.inFlightSnapshot() - s.respondJSONAny(w, http.StatusOK, AgentLocksResponse{ - LifecycleBusy: lifecycleBusy, - CacheEntries: s.cacheSize(), - RuntimeEntries: s.runtimeSize(), - RelayCount: s.relayCount(), - ComponentLogKeys: len(s.componentLogKeys()), - InFlight: inFlight, - }) -} - -func (s *Server) componentLogsHandler(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) - return - } - - componentKey := strings.TrimSpace(r.URL.Query().Get("componentKey")) - if componentKey == "" { - s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "componentKey query parameter is required", nil) - return - } - limit := defaultComponentLogsLimit - if rawLimit := strings.TrimSpace(r.URL.Query().Get("limit")); rawLimit != "" { - parsed, err := strconv.Atoi(rawLimit) - if err != nil || parsed <= 0 { - s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, "limit query parameter must be a positive integer", nil) - return - } - if parsed > maxComponentLogsLimit { - parsed = maxComponentLogsLimit - } - limit = parsed - } - - lines, total := s.getComponentLogs(componentKey, limit) - s.respondJSONAny(w, http.StatusOK, ComponentLogsResponse{ - ComponentKey: componentKey, - TotalLines: total, - Lines: lines, - }) -} - func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) @@ -669,200 +593,6 @@ func captureFrameworkLogs(fn func() error) ([]string, error) { return logs, 
err } -func (s *Server) lookupCachedStart(componentKey, payloadHash string) (*cachedStart, bool) { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - - start, ok := s.cache[componentKey] - if !ok || start.PayloadHash != payloadHash { - return nil, false - } - return &start, true -} - -func (s *Server) cacheSuccessfulStart(componentKey, payloadHash string, output map[string]any) { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - s.cache[componentKey] = cachedStart{ - PayloadHash: payloadHash, - Output: output, - } -} - -func (s *Server) deleteCachedStart(componentKey string) { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - delete(s.cache, componentKey) -} - -func (s *Server) storeRuntime(componentKey string, state runtimeState) { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - s.runtime[componentKey] = state -} - -func (s *Server) takeRuntime(componentKey string) (runtimeState, bool) { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - state, ok := s.runtime[componentKey] - if ok { - delete(s.runtime, componentKey) - } - return state, ok -} - -func (s *Server) beginInFlight(id, scope string) { - s.opsMu.Lock() - defer s.opsMu.Unlock() - s.inFlight[id] = inFlightOperation{ - ID: id, - Scope: scope, - StartedAt: time.Now(), - } -} - -func (s *Server) endInFlight(id string) { - s.opsMu.Lock() - defer s.opsMu.Unlock() - delete(s.inFlight, id) -} - -func (s *Server) inFlightSnapshot() ([]InFlightOperation, bool) { - s.opsMu.Lock() - defer s.opsMu.Unlock() - - out := make([]InFlightOperation, 0, len(s.inFlight)) - lifecycleBusy := false - for _, op := range s.inFlight { - if op.Scope == inFlightOperationScopeLifecycle { - lifecycleBusy = true - } - out = append(out, InFlightOperation{ - ID: op.ID, - Scope: op.Scope, - StartedAt: op.StartedAt.Format(time.RFC3339Nano), - DurationMs: int64(time.Since(op.StartedAt) / time.Millisecond), - }) - } - slices.SortFunc(out, func(a, b InFlightOperation) int { - return strings.Compare(a.ID, b.ID) - }) - return out, lifecycleBusy -} - -func 
(s *Server) appendComponentLogs(componentKey string, lines []string) { - if strings.TrimSpace(componentKey) == "" || len(lines) == 0 { - return - } - filtered := make([]string, 0, len(lines)) - for _, line := range lines { - trimmed := strings.TrimSpace(line) - if trimmed == "" { - continue - } - filtered = append(filtered, trimmed) - } - if len(filtered) == 0 { - return - } - - s.logsMu.Lock() - defer s.logsMu.Unlock() - existing := append(s.componentLogs[componentKey], filtered...) - if len(existing) > componentLogsRingSize { - existing = existing[len(existing)-componentLogsRingSize:] - } - s.componentLogs[componentKey] = existing -} - -func (s *Server) getComponentLogs(componentKey string, limit int) ([]string, int) { - s.logsMu.Lock() - defer s.logsMu.Unlock() - lines := s.componentLogs[componentKey] - total := len(lines) - if total == 0 { - return []string{}, 0 - } - if limit <= 0 || limit > total { - limit = total - } - out := append([]string{}, lines[total-limit:]...) - return out, total -} - -func (s *Server) componentLogKeys() []string { - s.logsMu.Lock() - defer s.logsMu.Unlock() - keys := make([]string, 0, len(s.componentLogs)) - for k := range s.componentLogs { - keys = append(keys, k) - } - slices.Sort(keys) - return keys -} - -func (s *Server) cacheKeys() []string { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - keys := make([]string, 0, len(s.cache)) - for k := range s.cache { - keys = append(keys, k) - } - slices.Sort(keys) - return keys -} - -func (s *Server) runtimeKeys() []string { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - keys := make([]string, 0, len(s.runtime)) - for k := range s.runtime { - keys = append(keys, k) - } - slices.Sort(keys) - return keys -} - -func (s *Server) cacheSize() int { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - return len(s.cache) -} - -func (s *Server) runtimeSize() int { - s.cacheMu.Lock() - defer s.cacheMu.Unlock() - return len(s.runtime) -} - -func (s *Server) relayInfos() []RelayInfo { - s.relayMu.Lock() 
- defer s.relayMu.Unlock() - - out := make([]RelayInfo, 0, len(s.relays)) - for _, relay := range s.relays { - if relay == nil { - continue - } - out = append(out, RelayInfo{ - ID: relay.ID, - Name: relay.Name, - RequestedPort: relay.RequestedPort, - BoundPort: listenerPort(relay.Listener), - }) - } - slices.SortFunc(out, func(a, b RelayInfo) int { - return strings.Compare(a.ID, b.ID) - }) - return out -} - -func (s *Server) relayCount() int { - s.relayMu.Lock() - defer s.relayMu.Unlock() - return len(s.relays) -} - func shouldReuseRemoteStart(componentType, policy string) bool { if componentType == ComponentTypeJD { return false diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go b/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go new file mode 100644 index 00000000000..8524b2d70e3 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go @@ -0,0 +1,57 @@ +package agent + +import ( + "slices" + "strings" +) + +func (s *Server) appendComponentLogs(componentKey string, lines []string) { + if strings.TrimSpace(componentKey) == "" || len(lines) == 0 { + return + } + filtered := make([]string, 0, len(lines)) + for _, line := range lines { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + continue + } + filtered = append(filtered, trimmed) + } + if len(filtered) == 0 { + return + } + + s.logsMu.Lock() + defer s.logsMu.Unlock() + existing := append(s.componentLogs[componentKey], filtered...) 
+ if len(existing) > componentLogsRingSize { + existing = existing[len(existing)-componentLogsRingSize:] + } + s.componentLogs[componentKey] = existing +} + +func (s *Server) getComponentLogs(componentKey string, limit int) ([]string, int) { + s.logsMu.Lock() + defer s.logsMu.Unlock() + lines := s.componentLogs[componentKey] + total := len(lines) + if total == 0 { + return []string{}, 0 + } + if limit <= 0 || limit > total { + limit = total + } + out := append([]string{}, lines[total-limit:]...) + return out, total +} + +func (s *Server) componentLogKeys() []string { + s.logsMu.Lock() + defer s.logsMu.Unlock() + keys := make([]string, 0, len(s.componentLogs)) + for k := range s.componentLogs { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_state.go b/system-tests/lib/cre/environment/remoteexec/agent/server_state.go new file mode 100644 index 00000000000..53f9ac8e53b --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_state.go @@ -0,0 +1,150 @@ +package agent + +import ( + "slices" + "strings" + "time" +) + +func (s *Server) lookupCachedStart(componentKey, payloadHash string) (*cachedStart, bool) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + + start, ok := s.cache[componentKey] + if !ok || start.PayloadHash != payloadHash { + return nil, false + } + return &start, true +} + +func (s *Server) cacheSuccessfulStart(componentKey, payloadHash string, output map[string]any) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + s.cache[componentKey] = cachedStart{ + PayloadHash: payloadHash, + Output: output, + } +} + +func (s *Server) deleteCachedStart(componentKey string) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + delete(s.cache, componentKey) +} + +func (s *Server) storeRuntime(componentKey string, state runtimeState) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + s.runtime[componentKey] = state +} + +func (s *Server) takeRuntime(componentKey 
string) (runtimeState, bool) { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + state, ok := s.runtime[componentKey] + if ok { + delete(s.runtime, componentKey) + } + return state, ok +} + +func (s *Server) beginInFlight(id, scope string) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + s.inFlight[id] = inFlightOperation{ + ID: id, + Scope: scope, + StartedAt: time.Now(), + } +} + +func (s *Server) endInFlight(id string) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + delete(s.inFlight, id) +} + +func (s *Server) inFlightSnapshot() ([]InFlightOperation, bool) { + s.opsMu.Lock() + defer s.opsMu.Unlock() + + out := make([]InFlightOperation, 0, len(s.inFlight)) + lifecycleBusy := false + for _, op := range s.inFlight { + if op.Scope == inFlightOperationScopeLifecycle { + lifecycleBusy = true + } + out = append(out, InFlightOperation{ + ID: op.ID, + Scope: op.Scope, + StartedAt: op.StartedAt.Format(time.RFC3339Nano), + DurationMs: int64(time.Since(op.StartedAt) / time.Millisecond), + }) + } + slices.SortFunc(out, func(a, b InFlightOperation) int { + return strings.Compare(a.ID, b.ID) + }) + return out, lifecycleBusy +} + +func (s *Server) cacheKeys() []string { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + keys := make([]string, 0, len(s.cache)) + for k := range s.cache { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} + +func (s *Server) runtimeKeys() []string { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + keys := make([]string, 0, len(s.runtime)) + for k := range s.runtime { + keys = append(keys, k) + } + slices.Sort(keys) + return keys +} + +func (s *Server) cacheSize() int { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + return len(s.cache) +} + +func (s *Server) runtimeSize() int { + s.cacheMu.Lock() + defer s.cacheMu.Unlock() + return len(s.runtime) +} + +func (s *Server) relayInfos() []RelayInfo { + s.relayMu.Lock() + defer s.relayMu.Unlock() + + out := make([]RelayInfo, 0, len(s.relays)) + for _, relay := range s.relays { + if relay == nil { + 
continue + } + out = append(out, RelayInfo{ + ID: relay.ID, + Name: relay.Name, + RequestedPort: relay.RequestedPort, + BoundPort: listenerPort(relay.Listener), + }) + } + slices.SortFunc(out, func(a, b RelayInfo) int { + return strings.Compare(a.ID, b.ID) + }) + return out +} + +func (s *Server) relayCount() int { + s.relayMu.Lock() + defer s.relayMu.Unlock() + return len(s.relays) +} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go b/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go new file mode 100644 index 00000000000..1d434931e69 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go @@ -0,0 +1,83 @@ +package agent + +import ( + "net/http" + "strconv" + "strings" + "time" +) + +func (s *Server) status(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + runtimeKeys := s.runtimeKeys() + cacheKeys := s.cacheKeys() + relayInfos := s.relayInfos() + componentLogKeys := s.componentLogKeys() + inFlight, _ := s.inFlightSnapshot() + + s.respondJSONAny(w, http.StatusOK, AgentStatusResponse{ + AgentVersion: agentVersion, + ProtocolVersion: protocolVersion, + SupportedSchemas: []string{SchemaVersionV1}, + Capabilities: []string{capabilityStartComponent, capabilityDeployArtifacts, capabilityRelay, capabilityListCTFResources, capabilityLocks, capabilityComponentLogs}, + UptimeSeconds: int64(time.Since(s.startedAt).Seconds()), + RuntimeComponents: runtimeKeys, + CachedComponents: cacheKeys, + Relays: relayInfos, + ComponentLogKeys: componentLogKeys, + InFlight: inFlight, + }) +} + +func (s *Server) locks(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + inFlight, lifecycleBusy := 
s.inFlightSnapshot() + s.respondJSONAny(w, http.StatusOK, AgentLocksResponse{ + LifecycleBusy: lifecycleBusy, + CacheEntries: s.cacheSize(), + RuntimeEntries: s.runtimeSize(), + RelayCount: s.relayCount(), + ComponentLogKeys: len(s.componentLogKeys()), + InFlight: inFlight, + }) +} + +func (s *Server) componentLogsHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + componentKey := strings.TrimSpace(r.URL.Query().Get("componentKey")) + if componentKey == "" { + s.respondError(w, http.StatusBadRequest, ErrCodeMissingComponentInput, "componentKey query parameter is required", nil) + return + } + limit := defaultComponentLogsLimit + if rawLimit := strings.TrimSpace(r.URL.Query().Get("limit")); rawLimit != "" { + parsed, err := strconv.Atoi(rawLimit) + if err != nil || parsed <= 0 { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, "limit query parameter must be a positive integer", nil) + return + } + if parsed > maxComponentLogsLimit { + parsed = maxComponentLogsLimit + } + limit = parsed + } + + lines, total := s.getComponentLogs(componentKey, limit) + s.respondJSONAny(w, http.StatusOK, ComponentLogsResponse{ + ComponentKey: componentKey, + TotalLines: total, + Lines: lines, + }) +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go new file mode 100644 index 00000000000..86bf0fe0a23 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go @@ -0,0 +1,47 @@ +package client + +import ( + "context" + "errors" + + "github.com/rs/zerolog" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" +) + +type StartDescriptor[T any] struct { + ComponentType string + BuildPayload func() 
(agent.StartComponentPayload, error) + Rewrite func(output *T, ec2HostIP string) error +} + +func StartWithRuntimeDescriptor[T any]( + ctx context.Context, + lggr zerolog.Logger, + runtime *Runtime, + descriptor StartDescriptor[T], +) (*T, error) { + if runtime == nil { + return nil, errors.New("remote runtime is required for remote component placement") + } + payload, err := descriptor.BuildPayload() + if err != nil { + return nil, err + } + output, err := StartRemoteComponent[T]( + ctx, + lggr, + runtime.Client, + payload, + descriptor.ComponentType, + ) + if err != nil { + return nil, err + } + if descriptor.Rewrite != nil { + if err := descriptor.Rewrite(output, runtime.EC2HostIP); err != nil { + return nil, err + } + } + return output, nil +} diff --git a/system-tests/lib/cre/environment/state_test.go b/system-tests/lib/cre/environment/state_test.go index 93464ed41d8..9032aa7ae6e 100644 --- a/system-tests/lib/cre/environment/state_test.go +++ b/system-tests/lib/cre/environment/state_test.go @@ -45,3 +45,24 @@ func TestRewriteReconstructedGatewayIncomingHosts_LocalGatewayNoop(t *testing.T) "expected local gateway incoming host to remain unchanged", ) } + +func TestRewriteReconstructedGatewayIncomingHosts_RewritesOnlyRemoteNodeSets(t *testing.T) { + remoteTopology, remoteNodeSet := mustBuildRemoteGatewayTopology(t) + localTopology, localNodeSet := mustBuildRemoteGatewayTopology(t) + localNodeSet.Placement = string(config.PlacementLocal) + // Preserve the remote topology gateway config and append a local-only gateway config. 
+ remoteTopology.GatewayConnectors.Configurations = append( + remoteTopology.GatewayConnectors.Configurations, + localTopology.GatewayConnectors.Configurations[0], + ) + + cfg := &config.Config{ + NodeSets: []*cre.NodeSet{remoteNodeSet, localNodeSet}, + } + t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.77") + + err := rewriteReconstructedGatewayIncomingHosts(cfg, remoteTopology) + require.NoError(t, err, "expected mixed reconstruction rewrite to succeed") + require.Equal(t, "203.0.113.77", remoteTopology.GatewayConnectors.Configurations[0].Incoming.Host, "expected remote gateway incoming host rewrite") + require.Equal(t, "bootstrap-gateway-node0", remoteTopology.GatewayConnectors.Configurations[1].Incoming.Host, "expected local gateway incoming host to remain unchanged") +} diff --git a/system-tests/lib/cre/environment/tunnel/component_id.go b/system-tests/lib/cre/environment/tunnel/component_id.go deleted file mode 100644 index 2c2a1c1ce68..00000000000 --- a/system-tests/lib/cre/environment/tunnel/component_id.go +++ /dev/null @@ -1,27 +0,0 @@ -package tunnel - -import ( - "fmt" - "strings" -) - -type ComponentKind string - -const ( - KindBlockchain ComponentKind = "blockchain" - KindNodeSet ComponentKind = "nodeset" - KindJD ComponentKind = "jd" -) - -func CanonicalComponentID(kind ComponentKind, index int, name string) string { - if name == "" { - return fmt.Sprintf("%s:%d", kind, index) - } - - normalized := strings.ToLower(strings.TrimSpace(name)) - if normalized == "" { - return fmt.Sprintf("%s:%d", kind, index) - } - - return fmt.Sprintf("%s:%d:%s", kind, index, normalized) -} diff --git a/system-tests/lib/cre/environment/tunnel/component_id_test.go b/system-tests/lib/cre/environment/tunnel/component_id_test.go deleted file mode 100644 index e3ac740839f..00000000000 --- a/system-tests/lib/cre/environment/tunnel/component_id_test.go +++ /dev/null @@ -1,17 +0,0 @@ -package tunnel - -import "testing" - -func TestCanonicalComponentID(t *testing.T) { - if got := 
CanonicalComponentID(KindBlockchain, 0, "Anvil-Main"); got != "blockchain:0:anvil-main" { - t.Fatalf("unexpected canonical id: %s", got) - } - - if got := CanonicalComponentID(KindJD, 2, ""); got != "jd:2" { - t.Fatalf("unexpected canonical id for empty name: %s", got) - } - - if got := CanonicalComponentID(KindNodeSet, 1, " "); got != "nodeset:1" { - t.Fatalf("unexpected canonical id for whitespace name: %s", got) - } -} diff --git a/system-tests/lib/cre/environment/tunnel/manager.go b/system-tests/lib/cre/environment/tunnel/manager.go deleted file mode 100644 index 4fd01ac363e..00000000000 --- a/system-tests/lib/cre/environment/tunnel/manager.go +++ /dev/null @@ -1,115 +0,0 @@ -package tunnel - -import ( - "context" - "errors" - "fmt" - "sync" -) - -type manager struct { - provider Provider - - mu sync.Mutex - bindings map[string]TunnelBinding -} - -func NewManager(provider Provider) Manager { - return &manager{ - provider: provider, - bindings: make(map[string]TunnelBinding), - } -} - -func (m *manager) Start(ctx context.Context, refs []EndpointRef) ([]TunnelBinding, error) { - m.mu.Lock() - defer m.mu.Unlock() - - started := make([]TunnelBinding, 0, len(refs)) - newlyOpened := make([]TunnelBinding, 0, len(refs)) - - for _, ref := range refs { - key := endpointKey(ref.ComponentID, ref.EndpointName) - if existing, ok := m.bindings[key]; ok { - started = append(started, existing) - continue - } - - if err := validateEndpointRef(ref); err != nil { - _ = m.closeMany(ctx, newlyOpened) - return nil, err - } - - binding, err := m.provider.Open(ctx, ref) - if err != nil { - _ = m.closeMany(ctx, newlyOpened) - return nil, fmt.Errorf("failed to open tunnel via %s for %s/%s: %w", m.provider.Name(), ref.ComponentID, ref.EndpointName, err) - } - - m.bindings[key] = binding - started = append(started, binding) - newlyOpened = append(newlyOpened, binding) - } - - return started, nil -} - -func (m *manager) Stop(ctx context.Context) error { - m.mu.Lock() - defer m.mu.Unlock() - 
- bindings := make([]TunnelBinding, 0, len(m.bindings)) - for _, b := range m.bindings { - bindings = append(bindings, b) - } - clear(m.bindings) - - return m.closeMany(ctx, bindings) -} - -func (m *manager) IsStarted() bool { - m.mu.Lock() - defer m.mu.Unlock() - return len(m.bindings) > 0 -} - -func (m *manager) Snapshot() []TunnelBinding { - m.mu.Lock() - defer m.mu.Unlock() - - out := make([]TunnelBinding, 0, len(m.bindings)) - for _, b := range m.bindings { - out = append(out, b) - } - return out -} - -func (m *manager) closeMany(ctx context.Context, bindings []TunnelBinding) error { - var joined error - for _, b := range bindings { - if err := m.provider.Close(ctx, b); err != nil { - joined = errors.Join(joined, err) - } - } - return joined -} - -func validateEndpointRef(ref EndpointRef) error { - if ref.ComponentID == "" { - return errors.New("endpoint componentID is required") - } - if ref.EndpointName == "" { - return errors.New("endpoint endpointName is required") - } - if ref.Host == "" { - return errors.New("endpoint host is required") - } - if ref.Port <= 0 || ref.Port > 65535 { - return fmt.Errorf("endpoint port %d is invalid", ref.Port) - } - return nil -} - -func endpointKey(componentID, endpointName string) string { - return componentID + ":" + endpointName -} diff --git a/system-tests/lib/cre/environment/tunnel/manager_test.go b/system-tests/lib/cre/environment/tunnel/manager_test.go deleted file mode 100644 index 85700d5ba9d..00000000000 --- a/system-tests/lib/cre/environment/tunnel/manager_test.go +++ /dev/null @@ -1,85 +0,0 @@ -package tunnel - -import ( - "context" - "testing" -) - -type fakeProvider struct { - openCount int - closeCount int -} - -func (f *fakeProvider) Open(_ context.Context, ref EndpointRef) (TunnelBinding, error) { - f.openCount++ - return TunnelBinding{ - EndpointRef: ref, - LocalPort: 10000 + f.openCount, - LocalURL: "http://127.0.0.1:10000", - }, nil -} - -func (f *fakeProvider) Close(_ context.Context, _ TunnelBinding) 
error { - f.closeCount++ - return nil -} - -func (f *fakeProvider) Name() string { return "fake" } - -func TestManagerStartDedupsAndStops(t *testing.T) { - provider := &fakeProvider{} - mgr := NewManager(provider) - - refs := []EndpointRef{ - { - ComponentID: "blockchain:0:anvil", - EndpointName: "node-0-http", - Scheme: "http", - Host: "127.0.0.1", - Port: 8545, - }, - { - ComponentID: "blockchain:0:anvil", - EndpointName: "node-0-ws", - Scheme: "ws", - Host: "127.0.0.1", - Port: 8546, - }, - } - - started, err := mgr.Start(context.Background(), refs) - if err != nil { - t.Fatalf("expected start to succeed: %v", err) - } - if len(started) != 2 { - t.Fatalf("expected 2 bindings, got %d", len(started)) - } - if provider.openCount != 2 { - t.Fatalf("expected 2 opens, got %d", provider.openCount) - } - - startedAgain, err := mgr.Start(context.Background(), refs) - if err != nil { - t.Fatalf("expected dedup start to succeed: %v", err) - } - if len(startedAgain) != 2 { - t.Fatalf("expected 2 dedup bindings, got %d", len(startedAgain)) - } - if provider.openCount != 2 { - t.Fatalf("expected no extra open calls after dedup, got %d", provider.openCount) - } - - if !mgr.IsStarted() { - t.Fatalf("expected manager to report started") - } - - if err := mgr.Stop(context.Background()); err != nil { - t.Fatalf("expected idempotent stop to succeed: %v", err) - } - if provider.closeCount != 2 { - t.Fatalf("expected 2 closes from stop, got %d", provider.closeCount) - } - if mgr.IsStarted() { - t.Fatalf("expected manager to report no active tunnels after stop") - } -} diff --git a/system-tests/lib/cre/environment/tunnel/noop_manager.go b/system-tests/lib/cre/environment/tunnel/noop_manager.go deleted file mode 100644 index 91829c8c040..00000000000 --- a/system-tests/lib/cre/environment/tunnel/noop_manager.go +++ /dev/null @@ -1,26 +0,0 @@ -package tunnel - -import "context" - -type noopManager struct{} - -func NewNoopManager() Manager { - return &noopManager{} -} - -func (n 
*noopManager) Start(_ context.Context, refs []EndpointRef) ([]TunnelBinding, error) { - bindings := make([]TunnelBinding, 0, len(refs)) - for _, ref := range refs { - bindings = append(bindings, TunnelBinding{ - EndpointRef: ref, - LocalURL: ref.OriginalURL, - }) - } - return bindings, nil -} - -func (n *noopManager) Stop(_ context.Context) error { return nil } - -func (n *noopManager) IsStarted() bool { return false } - -func (n *noopManager) Snapshot() []TunnelBinding { return []TunnelBinding{} } diff --git a/system-tests/lib/cre/environment/tunnel/provider_ssm.go b/system-tests/lib/cre/environment/tunnel/provider_ssm.go deleted file mode 100644 index 6300f49c58d..00000000000 --- a/system-tests/lib/cre/environment/tunnel/provider_ssm.go +++ /dev/null @@ -1,226 +0,0 @@ -package tunnel - -import ( - "context" - "errors" - "fmt" - "net" - "os" - "os/exec" - "strings" - "sync" - "syscall" - "time" - - "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" -) - -type SSMProvider struct { - instanceID string - region string - logger zerolog.Logger - - mu sync.Mutex - sessions map[int]*exec.Cmd -} - -func NewSSMProvider(instanceID, region string, logger zerolog.Logger) Provider { - return &SSMProvider{ - instanceID: instanceID, - region: region, - logger: logger, - sessions: make(map[int]*exec.Cmd), - } -} - -func (p *SSMProvider) Name() string { - return "ssm" -} - -func (p *SSMProvider) Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) { - profile, authMode := runtimecfg.ResolveAWSCLIProfileSelection() - if err := validateAWSSession(ctx, p.region, profile, authMode); err != nil { - return TunnelBinding{}, err - } - - localPort, err := reserveLocalPort() - if err != nil { - return TunnelBinding{}, fmt.Errorf("failed to reserve local port: %w", err) - } - - args := []string{ - "ssm", - "start-session", - "--region", p.region, - "--target", p.instanceID, - "--document-name", "AWS-StartPortForwardingSession", - 
"--parameters", fmt.Sprintf("portNumber=%d,localPortNumber=%d", ref.Port, localPort), - } - if profile != "" { - args = append(args, "--profile", profile) - } - cmd := exec.Command("aws", args...) - // Start in a dedicated process group so cleanup can kill aws + session-manager-plugin together. - cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} - if p.logger.GetLevel() <= zerolog.DebugLevel { - cmd.Stdout = os.Stderr - cmd.Stderr = os.Stderr - p.logger.Debug(). - Strs("cmd", cmd.Args). - Msg("Starting SSM endpoint tunnel command") - } - - p.logger.Info(). - Str("componentID", ref.ComponentID). - Str("endpointName", ref.EndpointName). - Str("awsAuthMode", authMode). - Str("awsProfile", profile). - Int("remotePort", ref.Port). - Int("localPort", localPort). - Msg("Opening SSM endpoint tunnel") - - if err := cmd.Start(); err != nil { - return TunnelBinding{}, fmt.Errorf("failed to start aws ssm session: %w", err) - } - if err := waitForLocalPortReady(ctx, localPort, 12*time.Second); err != nil { - terminateProcessGroup(cmd) - return TunnelBinding{}, fmt.Errorf("ssm local tunnel on port %d did not become ready: %w", localPort, err) - } - - p.mu.Lock() - p.sessions[localPort] = cmd - p.mu.Unlock() - - go func() { - _ = cmd.Wait() - }() - - return TunnelBinding{ - EndpointRef: ref, - LocalPort: localPort, - LocalURL: localURLFromScheme(ref.Scheme, localPort), - PID: cmd.Process.Pid, - }, nil -} - -func validateAWSSession(ctx context.Context, region, profile, authMode string) error { - if ctx == nil { - ctx = context.Background() - } - preflightCtx, cancel := context.WithTimeout(ctx, 8*time.Second) - defer cancel() - - args := []string{"sts", "get-caller-identity", "--region", region} - if profile != "" { - args = append(args, "--profile", profile) - } - out, err := exec.CommandContext(preflightCtx, "aws", args...).CombinedOutput() - if err == nil { - return nil - } - - loginHint := "Verify AWS credentials are configured and valid." 
- if profile != "" { - loginHint = fmt.Sprintf("Run `aws sso login --profile %s` (or configure profile credentials) and retry.", profile) - } - trimmed := strings.TrimSpace(string(out)) - if trimmed == "" { - return fmt.Errorf("aws authentication check failed for SSM tunnel (mode=%s): %w. %s", authMode, err, loginHint) - } - return fmt.Errorf("aws authentication check failed for SSM tunnel (mode=%s): %w: %s. %s", authMode, err, trimmed, loginHint) -} - -func (p *SSMProvider) Close(_ context.Context, binding TunnelBinding) error { - p.mu.Lock() - cmd, ok := p.sessions[binding.LocalPort] - if ok { - delete(p.sessions, binding.LocalPort) - } - p.mu.Unlock() - - if !ok || cmd == nil || cmd.Process == nil { - return nil - } - - if err := terminateProcessGroup(cmd); err != nil { - return fmt.Errorf("failed to kill ssm session on local port %d: %w", binding.LocalPort, err) - } - p.logger.Info(). - Str("componentID", binding.ComponentID). - Str("endpointName", binding.EndpointName). - Int("localPort", binding.LocalPort). - Msg("Closed SSM endpoint tunnel") - return nil -} - -func reserveLocalPort() (int, error) { - l, err := net.Listen("tcp", "127.0.0.1:0") - if err != nil { - return 0, err - } - defer l.Close() - - tcpAddr, ok := l.Addr().(*net.TCPAddr) - if !ok { - return 0, fmt.Errorf("listener addr %T is not tcp", l.Addr()) - } - return tcpAddr.Port, nil -} - -func localURLFromScheme(scheme string, port int) string { - switch scheme { - case "ws": - return fmt.Sprintf("ws://127.0.0.1:%d", port) - case "wss": - return fmt.Sprintf("wss://127.0.0.1:%d", port) - case "https": - return fmt.Sprintf("https://127.0.0.1:%d", port) - default: - return fmt.Sprintf("http://127.0.0.1:%d", port) - } -} - -func terminateProcessGroup(cmd *exec.Cmd) error { - if cmd == nil || cmd.Process == nil { - return nil - } - - // Negative PID targets the process group when Setpgid=true. 
- if err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); err != nil { - // Fall back to killing parent process only. - if killErr := cmd.Process.Kill(); killErr != nil { - return killErr - } - } - return nil -} - -func waitForLocalPortReady(ctx context.Context, port int, timeout time.Duration) error { - deadline := time.Now().Add(timeout) - address := fmt.Sprintf("127.0.0.1:%d", port) - var lastErr error - - for time.Now().Before(deadline) { - if ctx != nil { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - } - - conn, err := net.DialTimeout("tcp", address, 300*time.Millisecond) - if err == nil { - _ = conn.Close() - return nil - } - lastErr = err - time.Sleep(200 * time.Millisecond) - } - - if lastErr == nil { - lastErr = errors.New("unknown readiness failure") - } - return lastErr -} diff --git a/system-tests/lib/cre/environment/tunnel/tunnel.go b/system-tests/lib/cre/environment/tunnel/tunnel.go deleted file mode 100644 index cb622b28fd8..00000000000 --- a/system-tests/lib/cre/environment/tunnel/tunnel.go +++ /dev/null @@ -1,32 +0,0 @@ -package tunnel - -import "context" - -type EndpointRef struct { - ComponentID string - EndpointName string - Scheme string - Host string - Port int - OriginalURL string -} - -type TunnelBinding struct { - EndpointRef - LocalPort int - LocalURL string - PID int -} - -type Manager interface { - Start(ctx context.Context, refs []EndpointRef) ([]TunnelBinding, error) - Stop(ctx context.Context) error - IsStarted() bool - Snapshot() []TunnelBinding -} - -type Provider interface { - Open(ctx context.Context, ref EndpointRef) (TunnelBinding, error) - Close(ctx context.Context, binding TunnelBinding) error - Name() string -} diff --git a/system-tests/tests/smoke/cre/cre_suite_test.go b/system-tests/tests/smoke/cre/cre_suite_test.go index 3c9f1e33790..9d00cd1aa55 100644 --- a/system-tests/tests/smoke/cre/cre_suite_test.go +++ b/system-tests/tests/smoke/cre/cre_suite_test.go @@ -123,7 +123,7 @@ func 
Test_CRE_V2_Suite(t *testing.T) { ExecuteDonTimeTest(t, testEnv) }) t.Run("[v2] Consensus - "+topology, func(t *testing.T) { - t.Skip() + // t.Skip() testEnv := t_helpers.SetupTestEnvironmentWithConfig(t, t_helpers.GetDefaultTestConfig(t), v2RegistriesFlags...) ExecuteConsensusTest(t, testEnv) From 130c895f4fe460eb5c6de2c2c260994dd6f98b25 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Thu, 26 Feb 2026 09:24:03 +0100 Subject: [PATCH 27/34] decouple remote agent from AWS --- core/scripts/cre/environment/README.md | 7 +- .../cre/environment/environment/debug.go | 2 +- .../environment/environment/environment.go | 8 +- .../environment/relay_supervisor.go | 16 ++-- .../environment/environment/remote_state.go | 20 ++--- .../environment/remote_state_test.go | 15 ++-- system-tests/lib/cre/bootstrap_peer_test.go | 18 ++--- .../lib/cre/don/config/config_test.go | 4 +- .../cre/environment/blockchain_start_test.go | 56 ++++++------- system-tests/lib/cre/environment/dons.go | 2 +- .../lib/cre/environment/environment.go | 14 ++-- .../client/artifacts_remote_test.go | 8 +- .../client/remote_component_client.go | 78 ++++++++++++------- .../client/remote_component_client_test.go | 39 +++++----- .../remote_component_descriptor_start.go | 2 +- .../remoteexec/client/remote_stop_test.go | 8 +- .../lib/cre/environment/state_test.go | 8 +- .../lib/cre/runtimecfg/access_mode.go | 14 ++-- .../lib/cre/runtimecfg/access_mode_test.go | 17 ++-- .../tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md | 26 +++---- .../test-helpers/fixture_relay_helpers.go | 10 +-- 21 files changed, 188 insertions(+), 184 deletions(-) diff --git a/core/scripts/cre/environment/README.md b/core/scripts/cre/environment/README.md index 9cbe2153af6..4e11318fb2a 100644 --- a/core/scripts/cre/environment/README.md +++ b/core/scripts/cre/environment/README.md @@ -294,9 +294,10 @@ Remote execution uses a single direct mode with an EC2-hosted (or equivalent) CR Environment variable precedence for agent resolution: -1. 
`CRE_EC2_AGENT_URL` (explicit override, if set) -2. `CRE_EC2_INSTANCE_ID` + `CRE_EC2_AGENT_PORT` + AWS profile/credentials resolution -3. `CRE_EC2_AGENT_PORT` defaults to `8080` when omitted +1. `CRE_REMOTE_AGENT_URL` (explicit override, if set) +2. `CRE_REMOTE_HOST_IP` + `CRE_REMOTE_AGENT_PORT` +3. `CRE_REMOTE_AGENT_EC2_INSTANCE_ID` + `CRE_REMOTE_AGENT_PORT` + AWS profile/credentials resolution +4. `CRE_REMOTE_AGENT_PORT` defaults to `18080` when omitted Stop command semantics: diff --git a/core/scripts/cre/environment/environment/debug.go b/core/scripts/cre/environment/environment/debug.go index 3afb139cbf8..f6df1128ee2 100644 --- a/core/scripts/cre/environment/environment/debug.go +++ b/core/scripts/cre/environment/environment/debug.go @@ -94,7 +94,7 @@ func withResolvedRemoteRuntime(ctx context.Context, fn func(context.Context, *re } runtime, err := remoteclient.ResolveRuntime(framework.L) if err != nil { - return errors.Wrap(err, "failed to resolve remote runtime (set CRE_EC2_AGENT_URL or CRE_EC2_INSTANCE_ID/AWS profile)") + return errors.Wrap(err, "failed to resolve remote runtime (set CRE_REMOTE_AGENT_URL or CRE_REMOTE_AGENT_EC2_INSTANCE_ID/AWS profile)") } return fn(ctx, runtime) } diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 6c7d8775536..a16b196491d 100644 --- a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -922,10 +922,10 @@ func applyRemoteAgentEnvFallback(logger zerolog.Logger, agentState *remoteAgentS } } - setIfEmpty("CRE_EC2_AGENT_URL", agentState.EC2URL) - setIfEmpty("CRE_EC2_INSTANCE_ID", agentState.EC2InstanceID) - setIfEmpty("CRE_EC2_AGENT_PORT", agentState.EC2AgentPort) - setIfEmpty("CRE_AWS_PROFILE", agentState.AWSProfile) + setIfEmpty("CRE_REMOTE_AGENT_URL", agentState.RemoteAgentURL) + setIfEmpty("CRE_REMOTE_AGENT_EC2_INSTANCE_ID", agentState.RemoteAgentEC2InstanceID) + 
setIfEmpty("CRE_REMOTE_AGENT_PORT", agentState.RemoteAgentPort) + setIfEmpty("AWS_PROFILE", agentState.AWSProfile) } func StartCLIEnvironment( diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 3a31fd525c5..41548fe89dc 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -40,7 +40,7 @@ const ( relaySupervisorStateFilename = "relay_supervisor.toml" relaySupervisorLogFilename = "relay_supervisor.log" relaySupervisorLockFilename = "relay_supervisor.lock" - defaultEC2AgentPort = 18080 + defaultRemoteAgentPort = 18080 defaultRelayWorkerPoolSize = 16 envRelaySupervisorLockPath = "CRE_RELAY_SUPERVISOR_LOCK_PATH" @@ -808,28 +808,28 @@ func (h *relayHandle) setRelayID(relayID string) { } func resolveAgentBaseURLForRelay() (string, error) { - if v := strings.TrimSpace(os.Getenv("CRE_EC2_AGENT_URL")); v != "" { + if v := strings.TrimSpace(os.Getenv("CRE_REMOTE_AGENT_URL")); v != "" { return v, nil } hostIP, err := runtimecfg.DirectHostIP() if err == nil { - port, err := resolveEC2AgentPortForRelay() + port, err := resolveRemoteAgentPortForRelay() if err != nil { return "", err } return fmt.Sprintf("http://%s:%d", hostIP, port), nil } - return "", fmt.Errorf("cannot resolve agent base URL for relay; set CRE_EC2_AGENT_URL or provide EC2 discovery envs: %w", err) + return "", fmt.Errorf("cannot resolve agent base URL for relay; set CRE_REMOTE_AGENT_URL or provide EC2 discovery envs: %w", err) } -func resolveEC2AgentPortForRelay() (int, error) { - raw := strings.TrimSpace(os.Getenv("CRE_EC2_AGENT_PORT")) +func resolveRemoteAgentPortForRelay() (int, error) { + raw := strings.TrimSpace(os.Getenv("CRE_REMOTE_AGENT_PORT")) if raw == "" { - return defaultEC2AgentPort, nil + return defaultRemoteAgentPort, nil } port, err := strconv.Atoi(raw) if err != nil || port <= 0 || port > 65535 { - return 0, 
fmt.Errorf("invalid CRE_EC2_AGENT_PORT: %q", raw) + return 0, fmt.Errorf("invalid CRE_REMOTE_AGENT_PORT: %q", raw) } return port, nil } diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 6a0f15c59c1..7709449b837 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -17,15 +17,15 @@ const ( remoteStateDirname = "core/scripts/cre/environment/state_remote" remoteStateFilename = "remote_components.toml" remoteAgentFilename = "remote_agent.toml" - envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2AgentPort = "CRE_EC2_AGENT_PORT" + envRemoteAgentURL = "CRE_REMOTE_AGENT_URL" + envRemoteAgentPort = "CRE_REMOTE_AGENT_PORT" ) type remoteAgentState struct { - EC2URL string `toml:"ec2_url,omitempty"` - EC2InstanceID string `toml:"ec2_instance_id,omitempty"` - EC2AgentPort string `toml:"ec2_agent_port,omitempty"` - AWSProfile string `toml:"aws_profile,omitempty"` + RemoteAgentURL string `toml:"remote_agent_url,omitempty"` + RemoteAgentEC2InstanceID string `toml:"remote_agent_ec2_instance_id,omitempty"` + RemoteAgentPort string `toml:"remote_agent_port,omitempty"` + AWSProfile string `toml:"aws_profile,omitempty"` } type remoteAgentStateEnvelope struct { @@ -112,10 +112,10 @@ func filteredRemoteStopConfig(cfg *envconfig.Config) *envconfig.Config { func captureRemoteAgentState() remoteAgentState { return remoteAgentState{ - EC2URL: os.Getenv(envEC2AgentURL), - EC2InstanceID: os.Getenv(runtimecfg.EnvEC2InstanceID), - EC2AgentPort: os.Getenv(envEC2AgentPort), - AWSProfile: firstNonEmpty(os.Getenv(runtimecfg.EnvAWSProfile), os.Getenv("AWS_PROFILE")), + RemoteAgentURL: os.Getenv(envRemoteAgentURL), + RemoteAgentEC2InstanceID: os.Getenv(runtimecfg.EnvRemoteAgentEC2InstanceID), + RemoteAgentPort: os.Getenv(envRemoteAgentPort), + AWSProfile: strings.TrimSpace(os.Getenv("AWS_PROFILE")), } } diff --git 
a/core/scripts/cre/environment/environment/remote_state_test.go b/core/scripts/cre/environment/environment/remote_state_test.go index a0b293996e2..edebd584a7f 100644 --- a/core/scripts/cre/environment/environment/remote_state_test.go +++ b/core/scripts/cre/environment/environment/remote_state_test.go @@ -32,15 +32,14 @@ func TestFilteredRemoteStopConfigKeepsOnlyRemoteComponents(t *testing.T) { } func TestCaptureRemoteAgentStateReadsExpectedEnvVars(t *testing.T) { - t.Setenv(envEC2AgentURL, "http://203.0.113.10:8080") - t.Setenv(runtimecfg.EnvEC2InstanceID, "i-abc") - t.Setenv(envEC2AgentPort, "18080") - t.Setenv(runtimecfg.EnvAWSProfile, "cre-profile") + t.Setenv(envRemoteAgentURL, "http://203.0.113.10:8080") + t.Setenv(runtimecfg.EnvRemoteAgentEC2InstanceID, "i-abc") + t.Setenv(envRemoteAgentPort, "18080") t.Setenv("AWS_PROFILE", "fallback-profile") state := captureRemoteAgentState() - require.Equal(t, "http://203.0.113.10:8080", state.EC2URL) - require.Equal(t, "i-abc", state.EC2InstanceID) - require.Equal(t, "18080", state.EC2AgentPort) - require.Equal(t, "cre-profile", state.AWSProfile) + require.Equal(t, "http://203.0.113.10:8080", state.RemoteAgentURL) + require.Equal(t, "i-abc", state.RemoteAgentEC2InstanceID) + require.Equal(t, "18080", state.RemoteAgentPort) + require.Equal(t, "fallback-profile", state.AWSProfile) } diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index 601c63099c4..6b0a6d29fab 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -15,13 +15,13 @@ func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { } func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { - prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) + prevIP, hadIP := os.LookupEnv(runtimecfg.EnvRemoteHostIP) prevLocalIP, hadLocalIP := os.LookupEnv(runtimecfg.EnvLocalHostIP) t.Cleanup(func() { if hadIP { - _ = 
os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) + _ = os.Setenv(runtimecfg.EnvRemoteHostIP, prevIP) } else { - _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) + _ = os.Unsetenv(runtimecfg.EnvRemoteHostIP) } if hadLocalIP { _ = os.Setenv(runtimecfg.EnvLocalHostIP, prevLocalIP) @@ -29,7 +29,7 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { _ = os.Unsetenv(runtimecfg.EnvLocalHostIP) } }) - _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") + _ = os.Setenv(runtimecfg.EnvRemoteHostIP, "10.1.2.3") _ = os.Setenv(runtimecfg.EnvLocalHostIP, "192.168.1.10") addresses, err := ResolveP2PAnnounceAddresses("local", true, 15002) @@ -40,15 +40,15 @@ func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { } func TestResolveP2PAnnounceAddresses_Remote_AddsDirectHostIP(t *testing.T) { - prevIP, hadIP := os.LookupEnv(runtimecfg.EnvEC2HostIP) + prevIP, hadIP := os.LookupEnv(runtimecfg.EnvRemoteHostIP) t.Cleanup(func() { if hadIP { - _ = os.Setenv(runtimecfg.EnvEC2HostIP, prevIP) + _ = os.Setenv(runtimecfg.EnvRemoteHostIP, prevIP) } else { - _ = os.Unsetenv(runtimecfg.EnvEC2HostIP) + _ = os.Unsetenv(runtimecfg.EnvRemoteHostIP) } }) - _ = os.Setenv(runtimecfg.EnvEC2HostIP, "10.1.2.3") + _ = os.Setenv(runtimecfg.EnvRemoteHostIP, "10.1.2.3") addresses, err := ResolveP2PAnnounceAddresses("remote", true, 16001) require.NoError(t, err, "ResolveP2PAnnounceAddresses should not fail") @@ -63,7 +63,7 @@ func TestResolveBootstrapPeerURL_RemoteCallerToLocalBootstrap_UsesBridgedHost(t } func TestResolveBootstrapAddress_Matrix(t *testing.T) { - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") tests := []struct { name string diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go index 9e149f684f3..f51e4f26a7f 100644 --- a/system-tests/lib/cre/don/config/config_test.go +++ b/system-tests/lib/cre/don/config/config_test.go @@ -14,7 +14,7 @@ import ( 
) func TestResolveGatewayConnectorURL_PlacementMatrix(t *testing.T) { - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") tests := []struct { name string @@ -67,7 +67,7 @@ func TestResolveGatewayConnectorURL_RemoteHostOverride(t *testing.T) { } func TestResolveNodeFacingBootstrapAddress_PlacementMatrix(t *testing.T) { - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") tests := []struct { name string diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index faefe6f9f44..69e41d6e0f6 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -21,60 +21,60 @@ func TestValidateRemoteBlockchainInput(t *testing.T) { require.NoError(t, err, "expected anvil input to pass validation") } -func TestNewRemoteComponentClientPrefersEC2(t *testing.T) { - t.Setenv(remoteclient.EnvEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(remoteclient.EnvEC2AgentPort, "18080") +func TestNewRemoteComponentClientPrefersResolvedRuntime(t *testing.T) { + t.Setenv(remoteclient.EnvRemoteAgentURL, "") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") + t.Setenv(remoteclient.EnvRemoteAgentPort, "18080") runtime, err := remoteclient.ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected remote runtime to resolve") client, err := remoteclient.NewComponentClient(runtime) - require.NoError(t, err, "expected ec2-first client to be created") + require.NoError(t, err, "expected runtime-backed client to be created") require.NotNil(t, client, "expected component client to be created") - require.Equal(t, "http://203.0.113.10:18080", runtime.AgentBaseURL, "unexpected ec2 base url") + require.Equal(t, "http://203.0.113.10:18080", runtime.AgentBaseURL, "unexpected remote base url") } -func 
TestResolveEC2AgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testing.T) { - t.Setenv(remoteclient.EnvEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "") - t.Setenv(runtimecfg.EnvEC2InstanceID, "") - t.Setenv(remoteclient.EnvEC2AgentPort, "") +func TestResolveRemoteAgentBaseURLRequiresHostOrInstanceInfoWhenURLMissing(t *testing.T) { + t.Setenv(remoteclient.EnvRemoteAgentURL, "") + t.Setenv(runtimecfg.EnvRemoteHostIP, "") + t.Setenv(runtimecfg.EnvRemoteAgentEC2InstanceID, "") + t.Setenv(remoteclient.EnvRemoteAgentPort, "") _, err := remoteclient.ResolveRuntime(zerolog.Nop()) - require.Error(t, err, "expected missing direct host resolution inputs to fail when %s is not set", remoteclient.EnvEC2AgentURL) + require.Error(t, err, "expected missing direct host resolution inputs to fail when %s is not set", remoteclient.EnvRemoteAgentURL) } -func TestResolveEC2AgentBaseURLRejectsInvalidPort(t *testing.T) { - t.Setenv(remoteclient.EnvEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(remoteclient.EnvEC2AgentPort, "not-a-port") +func TestResolveRemoteAgentBaseURLRejectsInvalidPort(t *testing.T) { + t.Setenv(remoteclient.EnvRemoteAgentURL, "") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") + t.Setenv(remoteclient.EnvRemoteAgentPort, "not-a-port") _, err := remoteclient.ResolveRuntime(zerolog.Nop()) - require.Error(t, err, "expected invalid %s to fail", remoteclient.EnvEC2AgentPort) - require.Contains(t, err.Error(), remoteclient.EnvEC2AgentPort, "expected error to mention %s", remoteclient.EnvEC2AgentPort) + require.Error(t, err, "expected invalid %s to fail", remoteclient.EnvRemoteAgentPort) + require.Contains(t, err.Error(), remoteclient.EnvRemoteAgentPort, "expected error to mention %s", remoteclient.EnvRemoteAgentPort) } -func TestResolveEC2AgentBaseURLDirectMode(t *testing.T) { - t.Setenv(remoteclient.EnvEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") - t.Setenv(remoteclient.EnvEC2AgentPort, "18080") 
+func TestResolveRemoteAgentBaseURLDirectMode(t *testing.T) { + t.Setenv(remoteclient.EnvRemoteAgentURL, "") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") + t.Setenv(remoteclient.EnvRemoteAgentPort, "18080") runtime, err := remoteclient.ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected direct mode url resolution to succeed") require.Equal(t, "http://203.0.113.10:18080", runtime.AgentBaseURL, "unexpected direct mode base url") } -func TestResolveRemoteRuntimeRequiresEC2Resolution(t *testing.T) { - t.Setenv(remoteclient.EnvEC2AgentURL, "") - t.Setenv(runtimecfg.EnvEC2HostIP, "") - t.Setenv(runtimecfg.EnvEC2InstanceID, "") +func TestResolveRemoteRuntimeRequiresEC2DiscoveryInputsWhenNoURLOrHost(t *testing.T) { + t.Setenv(remoteclient.EnvRemoteAgentURL, "") + t.Setenv(runtimecfg.EnvRemoteHostIP, "") + t.Setenv(runtimecfg.EnvRemoteAgentEC2InstanceID, "") _, err := remoteclient.ResolveRuntime(zerolog.Nop()) - require.Error(t, err, "expected runtime resolution without EC2 inputs to fail") + require.Error(t, err, "expected runtime resolution without URL/host/EC2 discovery inputs to fail") } func TestRewriteRemoteBlockchainOutputForDirectAccess(t *testing.T) { - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") out := &blockchain.Output{ Nodes: []*blockchain.Node{ { diff --git a/system-tests/lib/cre/environment/dons.go b/system-tests/lib/cre/environment/dons.go index ac7e6bf9af5..f4048a11a70 100644 --- a/system-tests/lib/cre/environment/dons.go +++ b/system-tests/lib/cre/environment/dons.go @@ -126,7 +126,7 @@ func startDONsContainerized( remoteRuntime *remoteclient.Runtime, ) (*StartedDONs, error) { if remoteRuntime != nil { - normalizeForExecution(topology, nodeSets, remoteRuntime.EC2HostIP) + normalizeForExecution(topology, nodeSets, remoteRuntime.RemoteHostIP) } // Skip binary operations for remote DONs. 
diff --git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index 833921fef54..ac33422baa9 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -199,7 +199,7 @@ func SetupTestEnvironment( } remoteHostIP := "" if remoteRuntime != nil { - remoteHostIP = remoteRuntime.EC2HostIP + remoteHostIP = remoteRuntime.RemoteHostIP } updatedNodeSets, topoErr := donconfig.PrepareNodeTOMLs( @@ -488,20 +488,16 @@ func resolveRemoteRuntimeForSetup( func resolveRemoteRuntimeInput() (remoteclient.RuntimeInput, error) { input := remoteclient.RuntimeInput{ - AgentBaseURL: strings.TrimSpace(os.Getenv(remoteclient.EnvEC2AgentURL)), + AgentBaseURL: strings.TrimSpace(os.Getenv(remoteclient.EnvRemoteAgentURL)), + RemoteHostIP: strings.TrimSpace(os.Getenv(runtimecfg.EnvRemoteHostIP)), } - if configuredPort := strings.TrimSpace(os.Getenv(remoteclient.EnvEC2AgentPort)); configuredPort != "" { + if configuredPort := strings.TrimSpace(os.Getenv(remoteclient.EnvRemoteAgentPort)); configuredPort != "" { parsedPort, err := strconv.Atoi(configuredPort) if err != nil || parsedPort <= 0 || parsedPort > 65535 { - return remoteclient.RuntimeInput{}, fmt.Errorf("invalid %s: %q", remoteclient.EnvEC2AgentPort, configuredPort) + return remoteclient.RuntimeInput{}, fmt.Errorf("invalid %s: %q", remoteclient.EnvRemoteAgentPort, configuredPort) } input.AgentPort = parsedPort } - ec2HostIP, err := runtimecfg.DirectHostIP() - if err != nil { - return remoteclient.RuntimeInput{}, err - } - input.EC2HostIP = ec2HostIP return input, nil } diff --git a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go index 1a3ec6b6902..c9c052160d8 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go @@ 
-40,8 +40,8 @@ func TestDeployArtifactsToRemoteNodeSetNoFilesFails(t *testing.T) { })) defer server.Close() - t.Setenv(EnvEC2AgentURL, server.URL) - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(EnvRemoteAgentURL, server.URL) + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{"", ""}) require.Error(t, err) @@ -84,8 +84,8 @@ func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { })) defer server.Close() - t.Setenv(EnvEC2AgentURL, server.URL) - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(EnvRemoteAgentURL, server.URL) + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") err := DeployArtifactsToRemoteNodeSet(context.Background(), zerolog.Nop(), "workflow", "/home/chainlink/workflows", []string{artifactPath}) require.NoError(t, err) diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go index 248182bdc9a..8ddc462046f 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go @@ -9,6 +9,7 @@ import ( "io" "net" "net/http" + "net/url" "os" "strconv" "strings" @@ -26,9 +27,9 @@ const ( ComponentTypeBlockchain = "blockchain" ComponentTypeJD = "jd" ComponentTypeNodeSet = "nodeset" - EnvEC2AgentURL = "CRE_EC2_AGENT_URL" - EnvEC2AgentPort = "CRE_EC2_AGENT_PORT" - defaultEC2AgentPort = 18080 + EnvRemoteAgentURL = "CRE_REMOTE_AGENT_URL" + EnvRemoteAgentPort = "CRE_REMOTE_AGENT_PORT" + defaultRemoteAgentPort = 18080 ) type ComponentClient interface { @@ -45,17 +46,17 @@ type httpComponentClient struct { type Runtime struct { AgentBaseURL string - EC2HostIP string + RemoteHostIP string Client ComponentClient } type RuntimeInput struct { AgentBaseURL string - EC2HostIP string + 
RemoteHostIP string AgentPort int } -func newEC2HTTPComponentClient(baseURL string) *httpComponentClient { +func newRemoteHTTPComponentClient(baseURL string) *httpComponentClient { return &httpComponentClient{ baseURL: baseURL, client: &http.Client{ @@ -72,18 +73,18 @@ func ResolveRuntime(testLogger zerolog.Logger) (*Runtime, error) { } func ResolveRuntimeWithInput(testLogger zerolog.Logger, input RuntimeInput) (*Runtime, error) { - baseURL, err := resolveEC2AgentBaseURL(testLogger, input) + baseURL, err := resolveRemoteAgentBaseURL(testLogger, input) if err != nil { - return nil, fmt.Errorf("failed to resolve EC2 agent base URL: %w", err) + return nil, fmt.Errorf("failed to resolve remote agent base URL: %w", err) } - ec2HostIP, err := resolveEC2HostIP(input) + remoteHostIP, err := resolveRemoteHostIP(input, baseURL) if err != nil { return nil, err } - client := newEC2HTTPComponentClient(baseURL) + client := newRemoteHTTPComponentClient(baseURL) runtime := &Runtime{ AgentBaseURL: baseURL, - EC2HostIP: ec2HostIP, + RemoteHostIP: remoteHostIP, Client: client, } @@ -113,7 +114,7 @@ func NewComponentClient(runtime *Runtime) (ComponentClient, error) { if strings.TrimSpace(runtime.AgentBaseURL) == "" { return nil, errors.New("resolved runtime is missing agent base url") } - return newEC2HTTPComponentClient(runtime.AgentBaseURL), nil + return newRemoteHTTPComponentClient(runtime.AgentBaseURL), nil } func (c *httpComponentClient) StartComponent(ctx context.Context, envelope agent.StartComponentEnvelope) (*agent.StartComponentResponse, error) { @@ -217,7 +218,7 @@ func (c *httpComponentClient) waitForHealth(ctx context.Context) error { if resp.StatusCode == http.StatusOK { return nil } - return fmt.Errorf("%s: status %s", describeEC2AgentHealthFailure(c.baseURL), resp.Status) + return fmt.Errorf("%s: status %s", describeRemoteAgentHealthFailure(c.baseURL), resp.Status) }, retry.Attempts(uint(c.maxAttempts)), retry.Delay(c.retryDelay), @@ -226,12 +227,12 @@ func (c 
*httpComponentClient) waitForHealth(ctx context.Context) error { ) } -func describeEC2AgentHealthFailure(baseURL string) string { +func describeRemoteAgentHealthFailure(baseURL string) string { return fmt.Sprintf( - "failed EC2 CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", + "failed remote CRE agent health check (%s/v1/health); verify the agent process is running and %s matches its listen port (or set %s explicitly)", baseURL, - EnvEC2AgentPort, - EnvEC2AgentURL, + EnvRemoteAgentPort, + EnvRemoteAgentURL, ) } @@ -248,43 +249,62 @@ func RemoteAgentError(code, message string) error { return fmt.Errorf("remote agent error (%s): %s", code, message) } -func resolveEC2AgentBaseURL(testLogger zerolog.Logger, input RuntimeInput) (string, error) { +func resolveRemoteAgentBaseURL(testLogger zerolog.Logger, input RuntimeInput) (string, error) { if configured := strings.TrimSpace(input.AgentBaseURL); configured != "" { return configured, nil } - if configured := strings.TrimSpace(os.Getenv(EnvEC2AgentURL)); configured != "" { + if configured := strings.TrimSpace(os.Getenv(EnvRemoteAgentURL)); configured != "" { return configured, nil } - remotePort, err := resolveEC2AgentPort(input) + remotePort, err := resolveRemoteAgentPort(input) if err != nil { return "", err } - ec2HostIP, err := resolveEC2HostIP(input) + remoteHostIP, err := resolveRemoteHostIP(input, "") if err != nil { return "", err } - testLogger.Debug().Str("ec2HostIP", ec2HostIP).Int("port", remotePort).Msg("resolved EC2 CRE agent base URL") - return fmt.Sprintf("http://%s:%d", ec2HostIP, remotePort), nil + testLogger.Debug().Str("remoteHostIP", remoteHostIP).Int("port", remotePort).Msg("resolved remote CRE agent base URL") + return fmt.Sprintf("http://%s:%d", remoteHostIP, remotePort), nil } -func resolveEC2AgentPort(input RuntimeInput) (int, error) { +func resolveRemoteAgentPort(input RuntimeInput) (int, error) { if 
input.AgentPort > 0 { return input.AgentPort, nil } - remotePort := defaultEC2AgentPort - if configuredPort := strings.TrimSpace(os.Getenv(EnvEC2AgentPort)); configuredPort != "" { + remotePort := defaultRemoteAgentPort + if configuredPort := strings.TrimSpace(os.Getenv(EnvRemoteAgentPort)); configuredPort != "" { parsedPort, err := strconv.Atoi(configuredPort) if err != nil || parsedPort <= 0 || parsedPort > 65535 { - return 0, fmt.Errorf("invalid %s: %q", EnvEC2AgentPort, configuredPort) + return 0, fmt.Errorf("invalid %s: %q", EnvRemoteAgentPort, configuredPort) } remotePort = parsedPort } return remotePort, nil } -func resolveEC2HostIP(input RuntimeInput) (string, error) { - if configured := strings.TrimSpace(input.EC2HostIP); configured != "" { +func resolveRemoteHostIP(input RuntimeInput, baseURL string) (string, error) { + if configured := strings.TrimSpace(input.RemoteHostIP); configured != "" { return configured, nil } + if host, ok := hostFromBaseURL(baseURL); ok { + return host, nil + } return runtimecfg.DirectHostIP() } + +func hostFromBaseURL(baseURL string) (string, bool) { + trimmed := strings.TrimSpace(baseURL) + if trimmed == "" { + return "", false + } + parsed, err := url.Parse(trimmed) + if err != nil { + return "", false + } + host := strings.TrimSpace(parsed.Hostname()) + if host == "" { + return "", false + } + return host, true +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go index 99912c30a67..a5716eeefa3 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go @@ -16,21 +16,21 @@ import ( ) func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { - t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") - t.Setenv(runtimecfg.EnvEC2HostIP, "198.51.100.20") - 
t.Setenv(EnvEC2AgentPort, "19090") + t.Setenv(EnvRemoteAgentURL, "http://198.51.100.20:19090") + t.Setenv(runtimecfg.EnvRemoteHostIP, "198.51.100.20") + t.Setenv(EnvRemoteAgentPort, "19090") runtime, err := ResolveRuntime(zerolog.Nop()) require.NoError(t, err, "expected runtime resolution to succeed") require.Equal(t, "http://198.51.100.20:19090", runtime.AgentBaseURL, "unexpected agent base url") - require.Equal(t, "198.51.100.20", runtime.EC2HostIP, "unexpected ec2 host ip") + require.Equal(t, "198.51.100.20", runtime.RemoteHostIP, "unexpected remote host ip") require.NotNil(t, runtime.Client, "expected resolved runtime to include component client") } func TestResolveRemoteRuntimeWithInputOverridesEnv(t *testing.T) { - t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") - t.Setenv(runtimecfg.EnvEC2HostIP, "198.51.100.20") - t.Setenv(EnvEC2AgentPort, "19090") + t.Setenv(EnvRemoteAgentURL, "http://198.51.100.20:19090") + t.Setenv(runtimecfg.EnvRemoteHostIP, "198.51.100.20") + t.Setenv(EnvRemoteAgentPort, "19090") server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { require.Equal(t, "/v1/status", r.URL.Path) @@ -43,21 +43,22 @@ func TestResolveRemoteRuntimeWithInputOverridesEnv(t *testing.T) { runtime, err := ResolveRuntimeWithInput(zerolog.Nop(), RuntimeInput{ AgentBaseURL: server.URL, - EC2HostIP: "203.0.113.22", + RemoteHostIP: "203.0.113.22", AgentPort: 18081, }) require.NoError(t, err) require.Equal(t, server.URL, runtime.AgentBaseURL) - require.Equal(t, "203.0.113.22", runtime.EC2HostIP) + require.Equal(t, "203.0.113.22", runtime.RemoteHostIP) } -func TestResolveRemoteRuntimeRequiresHostResolution(t *testing.T) { - t.Setenv(EnvEC2AgentURL, "http://198.51.100.20:19090") - t.Setenv(runtimecfg.EnvEC2HostIP, "") - t.Setenv(runtimecfg.EnvEC2InstanceID, "") +func TestResolveRemoteRuntimeDerivesHostFromAgentURLWithoutAWSInputs(t *testing.T) { + t.Setenv(EnvRemoteAgentURL, "http://198.51.100.20:19090") + 
t.Setenv(runtimecfg.EnvRemoteHostIP, "") + t.Setenv(runtimecfg.EnvRemoteAgentEC2InstanceID, "") - _, err := ResolveRuntime(zerolog.Nop()) - require.Error(t, err, "expected runtime resolution without EC2 host inputs to fail") + runtime, err := ResolveRuntime(zerolog.Nop()) + require.NoError(t, err, "expected runtime resolution to derive host from explicit remote agent url") + require.Equal(t, "198.51.100.20", runtime.RemoteHostIP, "expected host parsed from agent base URL") } func TestNewRemoteComponentClientRequiresResolvedRuntime(t *testing.T) { @@ -68,11 +69,11 @@ func TestNewRemoteComponentClientRequiresResolvedRuntime(t *testing.T) { require.Error(t, err, "expected missing agent base URL to fail") } -func TestDescribeEC2AgentHealthFailureMentionsResolutionHints(t *testing.T) { - msg := describeEC2AgentHealthFailure("http://203.0.113.10:8080") +func TestDescribeRemoteAgentHealthFailureMentionsResolutionHints(t *testing.T) { + msg := describeRemoteAgentHealthFailure("http://203.0.113.10:8080") require.Contains(t, msg, "/v1/health") - require.Contains(t, msg, EnvEC2AgentPort) - require.Contains(t, msg, EnvEC2AgentURL) + require.Contains(t, msg, EnvRemoteAgentPort) + require.Contains(t, msg, EnvRemoteAgentURL) } func TestIsRetriableStatus(t *testing.T) { diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go index 86bf0fe0a23..651326217cf 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_descriptor_start.go @@ -39,7 +39,7 @@ func StartWithRuntimeDescriptor[T any]( return nil, err } if descriptor.Rewrite != nil { - if err := descriptor.Rewrite(output, runtime.EC2HostIP); err != nil { + if err := descriptor.Rewrite(output, runtime.RemoteHostIP); err != nil { return nil, err } } diff --git 
a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go index 9ee7198e883..db0b2603c38 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go @@ -77,8 +77,8 @@ func TestStopRemoteComponents_SummaryAndResiduals(t *testing.T) { server := newRemoteStopTestServer(t) defer server.Close() - t.Setenv(EnvEC2AgentURL, server.URL) - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(EnvRemoteAgentURL, server.URL) + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") cfg := &config.Config{ Blockchains: []*config.Blockchain{ @@ -126,8 +126,8 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing })) defer server.Close() - t.Setenv(EnvEC2AgentURL, server.URL) - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(EnvRemoteAgentURL, server.URL) + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") cfg := &config.Config{ Blockchains: []*config.Blockchain{ diff --git a/system-tests/lib/cre/environment/state_test.go b/system-tests/lib/cre/environment/state_test.go index 9032aa7ae6e..ce800f36f27 100644 --- a/system-tests/lib/cre/environment/state_test.go +++ b/system-tests/lib/cre/environment/state_test.go @@ -15,7 +15,7 @@ func TestRewriteReconstructedGatewayIncomingHosts_RemoteGatewayUsesEC2IP(t *test cfg := &config.Config{ NodeSets: []*cre.NodeSet{nodeSet}, } - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.10") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.10") err := rewriteReconstructedGatewayIncomingHosts(cfg, topology) require.NoError(t, err, "expected remote gateway incoming rewrite to succeed") @@ -33,8 +33,8 @@ func TestRewriteReconstructedGatewayIncomingHosts_LocalGatewayNoop(t *testing.T) cfg := &config.Config{ NodeSets: []*cre.NodeSet{nodeSet}, } - t.Setenv(runtimecfg.EnvEC2HostIP, "") - t.Setenv(runtimecfg.EnvEC2InstanceID, 
"") + t.Setenv(runtimecfg.EnvRemoteHostIP, "") + t.Setenv(runtimecfg.EnvRemoteAgentEC2InstanceID, "") err := rewriteReconstructedGatewayIncomingHosts(cfg, topology) require.NoError(t, err, "expected local gateway reconstruction rewrite to be a no-op") @@ -59,7 +59,7 @@ func TestRewriteReconstructedGatewayIncomingHosts_RewritesOnlyRemoteNodeSets(t * cfg := &config.Config{ NodeSets: []*cre.NodeSet{remoteNodeSet, localNodeSet}, } - t.Setenv(runtimecfg.EnvEC2HostIP, "203.0.113.77") + t.Setenv(runtimecfg.EnvRemoteHostIP, "203.0.113.77") err := rewriteReconstructedGatewayIncomingHosts(cfg, remoteTopology) require.NoError(t, err, "expected mixed reconstruction rewrite to succeed") diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go index 79eafe0902a..db147049d9f 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode.go +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -14,10 +14,9 @@ import ( ) const ( - EnvEC2HostIP = "CRE_EC2_HOST_IP" + EnvRemoteHostIP = "CRE_REMOTE_HOST_IP" EnvLocalHostIP = "CRE_LOCAL_HOST_IP" - EnvEC2InstanceID = "CRE_EC2_INSTANCE_ID" - EnvAWSProfile = "CRE_AWS_PROFILE" + EnvRemoteAgentEC2InstanceID = "CRE_REMOTE_AGENT_EC2_INSTANCE_ID" defaultEC2Region = "us-west-2" ) @@ -28,14 +27,14 @@ func IsDirectMode() bool { } func DirectHostIP() (string, error) { - hostIP := strings.TrimSpace(os.Getenv(EnvEC2HostIP)) + hostIP := strings.TrimSpace(os.Getenv(EnvRemoteHostIP)) if hostIP != "" { return hostIP, nil } - instanceID := strings.TrimSpace(os.Getenv(EnvEC2InstanceID)) + instanceID := strings.TrimSpace(os.Getenv(EnvRemoteAgentEC2InstanceID)) if instanceID == "" { - return "", fmt.Errorf("%s must be set (or set %s explicitly)", EnvEC2InstanceID, EnvEC2HostIP) + return "", fmt.Errorf("%s must be set (or set %s explicitly)", EnvRemoteAgentEC2InstanceID, EnvRemoteHostIP) } return discoverEC2HostIP(instanceID) } @@ -102,9 +101,6 @@ func ResolveAWSCLIProfileSelection() (string, string) { if 
hasWebIdentityCreds() { return "", "web-identity" } - if profile := strings.TrimSpace(os.Getenv(EnvAWSProfile)); profile != "" { - return profile, "profile:CRE_AWS_PROFILE" - } if profile := strings.TrimSpace(os.Getenv("AWS_PROFILE")); profile != "" { return profile, "profile:AWS_PROFILE" } diff --git a/system-tests/lib/cre/runtimecfg/access_mode_test.go b/system-tests/lib/cre/runtimecfg/access_mode_test.go index e317822c625..da32ff4eb36 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode_test.go +++ b/system-tests/lib/cre/runtimecfg/access_mode_test.go @@ -7,8 +7,8 @@ import ( ) func TestDirectHostIPUsesExplicitEnv(t *testing.T) { - t.Setenv(EnvEC2HostIP, "203.0.113.10") - t.Setenv(EnvEC2InstanceID, "") + t.Setenv(EnvRemoteHostIP, "203.0.113.10") + t.Setenv(EnvRemoteAgentEC2InstanceID, "") hostIP, err := DirectHostIP() require.NoError(t, err) @@ -16,12 +16,12 @@ func TestDirectHostIPUsesExplicitEnv(t *testing.T) { } func TestDirectHostIPRequiresInstanceWhenHostMissing(t *testing.T) { - t.Setenv(EnvEC2HostIP, "") - t.Setenv(EnvEC2InstanceID, "") + t.Setenv(EnvRemoteHostIP, "") + t.Setenv(EnvRemoteAgentEC2InstanceID, "") _, err := DirectHostIP() require.Error(t, err) - require.Contains(t, err.Error(), EnvEC2InstanceID) + require.Contains(t, err.Error(), EnvRemoteAgentEC2InstanceID) } func TestLocalHostIPUsesExplicitEnv(t *testing.T) { @@ -32,7 +32,6 @@ func TestLocalHostIPUsesExplicitEnv(t *testing.T) { func TestResolveAWSCLIProfileSelectionOrder(t *testing.T) { t.Setenv("AWS_ACCESS_KEY_ID", "key") t.Setenv("AWS_SECRET_ACCESS_KEY", "secret") - t.Setenv(EnvAWSProfile, "profile-a") profile, mode := ResolveAWSCLIProfileSelection() require.Equal(t, "", profile) require.Equal(t, "env-creds", mode) @@ -47,15 +46,9 @@ func TestResolveAWSCLIProfileSelectionOrder(t *testing.T) { t.Setenv("AWS_WEB_IDENTITY_TOKEN_FILE", "") t.Setenv("AWS_ROLE_ARN", "") - t.Setenv(EnvAWSProfile, "profile-a") t.Setenv("AWS_PROFILE", "profile-b") t.Setenv("AWS_DEFAULT_PROFILE", "profile-c") 
profile, mode = ResolveAWSCLIProfileSelection() - require.Equal(t, "profile-a", profile) - require.Equal(t, "profile:CRE_AWS_PROFILE", mode) - - t.Setenv(EnvAWSProfile, "") - profile, mode = ResolveAWSCLIProfileSelection() require.Equal(t, "profile-b", profile) require.Equal(t, "profile:AWS_PROFILE", mode) diff --git a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md index 64ec4fb25ce..4f7bf850c65 100644 --- a/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md +++ b/system-tests/tests/smoke/cre/REMOTE_HYBRID_RUNBOOK.md @@ -11,17 +11,16 @@ This runbook covers the EC2-based remote mode for CRE where components can run e ## Core Environment Variables -- `CRE_EC2_INSTANCE_ID=` (used by direct mode auto IP lookup) -- `CRE_EC2_AGENT_PORT=` (defaults to `18080`) -- `CRE_EC2_AGENT_URL=` (optional explicit override) -- `CRE_EC2_HOST_IP=` (optional in direct mode; if missing, resolved from AWS CLI using instance ID) -- `CRE_AWS_PROFILE=` (optional AWS auth profile) +- `CRE_REMOTE_AGENT_EC2_INSTANCE_ID=` (used by direct mode auto IP lookup) +- `CRE_REMOTE_AGENT_PORT=` (defaults to `18080`) +- `CRE_REMOTE_AGENT_URL=` (optional explicit override) +- `CRE_REMOTE_HOST_IP=` (optional in direct mode; if missing, resolved from AWS CLI using instance ID) ## Direct Mode Defaults and IP Resolution - Host IP resolution is: - 1. `CRE_EC2_HOST_IP` if set. - 2. Otherwise, resolve from AWS CLI using `CRE_EC2_INSTANCE_ID`: + 1. `CRE_REMOTE_HOST_IP` if set. + 2. Otherwise, resolve from AWS CLI using `CRE_REMOTE_AGENT_EC2_INSTANCE_ID`: - `aws ec2 describe-instances --instance-ids --query ...` - prefers private IP; falls back to public IP if needed. - Region defaults to `us-west-2` unless AWS env region overrides are present. @@ -33,10 +32,9 @@ For direct-mode auto IP lookup, AWS CLI auth selection follows: 1. Static env credentials (`AWS_ACCESS_KEY_ID` + `AWS_SECRET_ACCESS_KEY`) 2. 
Web identity (`AWS_WEB_IDENTITY_TOKEN_FILE` + `AWS_ROLE_ARN`) -3. `CRE_AWS_PROFILE` -4. `AWS_PROFILE` -5. `AWS_DEFAULT_PROFILE` -6. AWS CLI default credential chain/profile +3. `AWS_PROFILE` +4. `AWS_DEFAULT_PROFILE` +5. AWS CLI default credential chain/profile ## Agent Startup @@ -65,7 +63,7 @@ For direct-mode auto IP lookup, AWS CLI auth selection follows: - If startup fails on bootstrap reachability: - ensure relay supervisor was started, - ensure EC2 agent is reachable and has relay open for `5001`, - - verify direct mode host IP resolution (`CRE_EC2_HOST_IP` or `CRE_EC2_INSTANCE_ID` + AWS CLI auth). + - verify direct mode host IP resolution (`CRE_REMOTE_HOST_IP` or `CRE_REMOTE_AGENT_EC2_INSTANCE_ID` + AWS CLI auth). ## Bridge and Fixture Relay @@ -81,8 +79,8 @@ For direct-mode auto IP lookup, AWS CLI auth selection follows: ## Fast Triage Checklist -- Agent unreachable: verify `CRE_EC2_AGENT_URL` (if set), or `CRE_EC2_INSTANCE_ID`/AWS credentials + `CRE_EC2_AGENT_PORT`. -- Direct mode cannot resolve EC2 IP: ensure `CRE_EC2_INSTANCE_ID` is set and AWS CLI credentials are valid, or set `CRE_EC2_HOST_IP` explicitly. +- Agent unreachable: verify `CRE_REMOTE_AGENT_URL` (if set), or `CRE_REMOTE_AGENT_EC2_INSTANCE_ID`/AWS credentials + `CRE_REMOTE_AGENT_PORT`. +- Direct mode cannot resolve EC2 IP: ensure `CRE_REMOTE_AGENT_EC2_INSTANCE_ID` is set and AWS CLI credentials are valid, or set `CRE_REMOTE_HOST_IP` explicitly. - `invalid jd placement`: use `placement=local` or `placement=remote` (only supported values). - Remote nodes hitting local-only fixtures: ensure fixture relay helper is active. - Mixed remote->local gateway from NodeSets is supported when bridge plumbing is present. 
diff --git a/system-tests/tests/test-helpers/fixture_relay_helpers.go b/system-tests/tests/test-helpers/fixture_relay_helpers.go index 09e1ac04603..54f504e414c 100644 --- a/system-tests/tests/test-helpers/fixture_relay_helpers.go +++ b/system-tests/tests/test-helpers/fixture_relay_helpers.go @@ -26,8 +26,8 @@ import ( ) const ( - envEC2AgentURL = "CRE_EC2_AGENT_URL" - envEC2AgentPort = "CRE_EC2_AGENT_PORT" + envRemoteAgentURL = "CRE_REMOTE_AGENT_URL" + envRemoteAgentPort = "CRE_REMOTE_AGENT_PORT" ) type relayOpenResponse struct { @@ -127,7 +127,7 @@ func hasRemoteNodeSets(cfg *envconfig.Config) bool { } func resolveAgentBaseURLForRelay() (string, error) { - if v := strings.TrimSpace(os.Getenv(envEC2AgentURL)); v != "" { + if v := strings.TrimSpace(os.Getenv(envRemoteAgentURL)); v != "" { return v, nil } hostIP, err := runtimecfg.DirectHostIP() @@ -135,10 +135,10 @@ func resolveAgentBaseURLForRelay() (string, error) { return "", err } port := 8080 - if rawPort := strings.TrimSpace(os.Getenv(envEC2AgentPort)); rawPort != "" { + if rawPort := strings.TrimSpace(os.Getenv(envRemoteAgentPort)); rawPort != "" { parsed, err := strconv.Atoi(rawPort) if err != nil || parsed <= 0 || parsed > 65535 { - return "", fmt.Errorf("invalid %s: %q", envEC2AgentPort, rawPort) + return "", fmt.Errorf("invalid %s: %q", envRemoteAgentPort, rawPort) } port = parsed } From bb1449a52f7ed0996e57d8252542e69c6965842e Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Thu, 26 Feb 2026 09:56:52 +0100 Subject: [PATCH 28/34] update CTF dep --- core/scripts/go.mod | 4 +--- core/scripts/go.sum | 2 ++ system-tests/lib/go.mod | 4 +--- system-tests/lib/go.sum | 2 ++ system-tests/tests/go.mod | 4 +--- system-tests/tests/go.sum | 2 ++ 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/scripts/go.mod b/core/scripts/go.mod index 6bdf40031cf..2d3683c6b2d 100644 --- a/core/scripts/go.mod +++ b/core/scripts/go.mod @@ -9,8 +9,6 @@ replace github.com/smartcontractkit/chainlink/deployment => 
../../deployment replace github.com/smartcontractkit/chainlink/system-tests/lib => ../../system-tests/lib -replace github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework - replace github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v1/proof-of-reserve/cron-based => ./cre/environment/examples/workflows/v1/proof-of-reserve/cron-based replace github.com/smartcontractkit/chainlink/core/scripts/cre/environment/examples/workflows/v1/proof-of-reserve/web-trigger-based => ./cre/environment/examples/workflows/v1/proof-of-reserve/web-trigger-based @@ -57,7 +55,7 @@ require ( github.com/smartcontractkit/chainlink-evm/gethwrappers v0.0.0-20251222115927-36a18321243c github.com/smartcontractkit/chainlink-protos/cre/go v0.0.0-20260217043601-5cc966896c4f github.com/smartcontractkit/chainlink-protos/job-distributor v0.17.0 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.1-0.20260212100725-fbd6b3bca4d1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.20 github.com/smartcontractkit/chainlink-testing-framework/lib v1.54.5 github.com/smartcontractkit/chainlink-testing-framework/seth v1.51.3 diff --git a/core/scripts/go.sum b/core/scripts/go.sum index 9950e297883..88799f71b56 100644 --- a/core/scripts/go.sum +++ b/core/scripts/go.sum @@ -1673,6 +1673,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03/go.mod h1:U3XStbEnbx/+L22n1/8aOIdgcGVxtsZB7p59xJGngAs= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f h1:UvTDQeTi19fQw/GUpDBC9uDz2UGQoi1h+YLfCcAUwl0= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f/go.mod 
h1:IfeW6t5Yc5293H5ixuooAft+wYBMSFQWKjbBTwYiKr4= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b h1:PKKiGszU9zRF4aedl2HGGWhcq9DVdK4VRq1vfVB71nc= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b/go.mod h1:43xdIQuqw/gzfazsqJkBrGdF25TIJDiY/Ak/YrWFTmU= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.20 h1:8D2DUnn7mLUZOLhPDGGFKKvBrgU6LQd00tq2VOprvfI= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.20/go.mod h1:98jNYBOPuKWJw9a8x0LgQuudp5enrHhQQP5Hq0YwRB8= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index b913843bdeb..caa8c891a30 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -11,8 +11,6 @@ replace github.com/smartcontractkit/chainlink/v2 => ../../ replace github.com/smartcontractkit/chainlink/deployment => ../../deployment -replace github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework - require ( dario.cat/mergo v1.0.2 github.com/Masterminds/semver/v3 v3.4.0 @@ -46,7 +44,7 @@ require ( github.com/smartcontractkit/chainlink-protos/job-distributor v0.17.0 github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260217043601-5cc966896c4f github.com/smartcontractkit/chainlink-solana v1.1.2-0.20260223222711-2fa6b0e07db0 - github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.1-0.20260212100725-fbd6b3bca4d1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.15 
github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 github.com/smartcontractkit/chainlink-testing-framework/lib v1.54.5 diff --git a/system-tests/lib/go.sum b/system-tests/lib/go.sum index 0b39c1d9c9f..be0f9c5ad79 100644 --- a/system-tests/lib/go.sum +++ b/system-tests/lib/go.sum @@ -1638,6 +1638,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03/go.mod h1:U3XStbEnbx/+L22n1/8aOIdgcGVxtsZB7p59xJGngAs= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f h1:UvTDQeTi19fQw/GUpDBC9uDz2UGQoi1h+YLfCcAUwl0= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f/go.mod h1:IfeW6t5Yc5293H5ixuooAft+wYBMSFQWKjbBTwYiKr4= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b h1:PKKiGszU9zRF4aedl2HGGWhcq9DVdK4VRq1vfVB71nc= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b/go.mod h1:43xdIQuqw/gzfazsqJkBrGdF25TIJDiY/Ak/YrWFTmU= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.15 h1:usf6YCNmSO8R1/rU28wUfIdp7zXlqGGOAttXW5mgkXU= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.15/go.mod h1:YqrpawYGRkT/jcvXcmaZeZPOtu0erIenrHl5Mb8+U/c= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= diff --git a/system-tests/tests/go.mod b/system-tests/tests/go.mod index 091bf8ac227..08546864eae 100644 --- a/system-tests/tests/go.mod +++ b/system-tests/tests/go.mod @@ -39,8 +39,6 @@ replace github.com/smartcontractkit/chainlink/system-tests/tests/regression/cre/ replace github.com/smartcontractkit/chainlink/system-tests/tests/smoke/cre/solana/solwrite => ./smoke/cre/solana/solwrite -replace 
github.com/smartcontractkit/chainlink-testing-framework/framework => /Users/bartektofel/Desktop/repos/chainlink-testing-framework/framework - require ( github.com/Masterminds/semver/v3 v3.4.0 github.com/avast/retry-go/v4 v4.6.1 @@ -65,7 +63,7 @@ require ( github.com/smartcontractkit/chainlink-protos/job-distributor v0.17.0 github.com/smartcontractkit/chainlink-protos/ring/go v0.0.0-20260128151123-605e9540b706 github.com/smartcontractkit/chainlink-protos/workflows/go v0.0.0-20260217043601-5cc966896c4f - github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.1-0.20260212100725-fbd6b3bca4d1 + github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 github.com/smartcontractkit/chainlink-testing-framework/havoc v1.50.7 github.com/smartcontractkit/chainlink-testing-framework/lib v1.54.5 diff --git a/system-tests/tests/go.sum b/system-tests/tests/go.sum index 19941295a49..00f13c7157e 100644 --- a/system-tests/tests/go.sum +++ b/system-tests/tests/go.sum @@ -1846,6 +1846,8 @@ github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03 h1: github.com/smartcontractkit/chainlink-sui v0.0.0-20260223231841-af91ea434e03/go.mod h1:U3XStbEnbx/+L22n1/8aOIdgcGVxtsZB7p59xJGngAs= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f h1:UvTDQeTi19fQw/GUpDBC9uDz2UGQoi1h+YLfCcAUwl0= github.com/smartcontractkit/chainlink-sui/deployment v0.0.0-20260217210647-11c42009ec1f/go.mod h1:IfeW6t5Yc5293H5ixuooAft+wYBMSFQWKjbBTwYiKr4= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b h1:PKKiGszU9zRF4aedl2HGGWhcq9DVdK4VRq1vfVB71nc= +github.com/smartcontractkit/chainlink-testing-framework/framework v0.14.8-0.20260225150758-2a5936b5130b/go.mod h1:43xdIQuqw/gzfazsqJkBrGdF25TIJDiY/Ak/YrWFTmU= 
github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.18 h1:1ng+p/+85zcVLHB050PiWUAjOcxyd4KjwkUlJy34rgE= github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose v0.1.18/go.mod h1:2+OrSz56pdgtY0Oc20nCS9LH/bEksFDBQjoR82De5PI= github.com/smartcontractkit/chainlink-testing-framework/framework/components/fake v0.10.0 h1:PWAMYu0WaAMBfbpxCpFJGRIDHmcgmYin6a+UQC0OdtY= From fd799a184052b758afb4108f5aa4fddaf9bda18f Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Thu, 26 Feb 2026 12:03:48 +0100 Subject: [PATCH 29/34] WIP of standalone chip ingress sink --- .../cre/environment/environment/beholder.go | 1 + .../environment/environment/beholder_sink.go | 596 ++++++++++++++++++ .../environment/environment/remote_state.go | 26 +- .../environment/remoteexec/agent/server.go | 174 +++-- .../remoteexec/agent/server_chip_sink.go | 366 +++++++++++ .../remoteexec/agent/server_chip_sink_test.go | 75 +++ .../remoteexec/agent/server_handlers_test.go | 38 ++ .../agent/server_status_handlers.go | 4 +- .../remoteexec/chipsink/event_decode.go | 112 ++++ .../environment/remoteexec/chipsink/server.go | 129 ++++ .../remoteexec/client/chip_sink_remote.go | 112 ++++ .../client/chip_sink_remote_test.go | 83 +++ .../test-helpers/chip_testsink_helpers.go | 14 +- 13 files changed, 1661 insertions(+), 69 deletions(-) create mode 100644 core/scripts/cre/environment/environment/beholder_sink.go create mode 100644 system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go create mode 100644 system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink_test.go create mode 100644 system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go create mode 100644 system-tests/lib/cre/environment/remoteexec/chipsink/server.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote.go create mode 100644 system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go diff 
--git a/core/scripts/cre/environment/environment/beholder.go b/core/scripts/cre/environment/environment/beholder.go index 826498bcf80..32f3a74121d 100644 --- a/core/scripts/cre/environment/environment/beholder.go +++ b/core/scripts/cre/environment/environment/beholder.go @@ -246,6 +246,7 @@ func beholderCmds() *cobra.Command { cmd.AddCommand(startBeholderCmd()) cmd.AddCommand(stopBeholderCmd) + cmd.AddCommand(beholderSinkCmd()) cmd.AddCommand(createKafkaTopicsCmd()) cmd.AddCommand(fetchAndRegisterProtosCmd()) diff --git a/core/scripts/cre/environment/environment/beholder_sink.go b/core/scripts/cre/environment/environment/beholder_sink.go new file mode 100644 index 00000000000..4161f59bdba --- /dev/null +++ b/core/scripts/cre/environment/environment/beholder_sink.go @@ -0,0 +1,596 @@ +package environment + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "net" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + "github.com/pelletier/go-toml/v2" + "github.com/pkg/errors" + chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" + "github.com/spf13/cobra" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/chipsink" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" +) + +const ( + chipSinkStateFilename = "chip_testsink.toml" + chipSinkLogFilename = "chip_testsink.log" + chipSinkEventsFilename = "chip_testsink_events.ndjson" + defaultLocalSinkListen = "127.0.0.1:50051" +) + +type chipSinkLocalState struct { + Version int `toml:"version"` + PID int `toml:"pid"` + GRPCListen string 
`toml:"grpc_listen"` + UpstreamEndpoint string `toml:"upstream_endpoint,omitempty"` + EventLogPath string `toml:"event_log_path,omitempty"` + StartedAt string `toml:"started_at,omitempty"` +} + +func beholderSinkCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "sink", + Short: "Manage chip test sink lifecycle", + } + cmd.AddCommand(startBeholderSinkCmd()) + cmd.AddCommand(stopBeholderSinkCmd()) + cmd.AddCommand(statusBeholderSinkCmd()) + cmd.AddCommand(eventsBeholderSinkCmd()) + cmd.AddCommand(runLocalBeholderSinkCmd()) + return cmd +} + +func startBeholderSinkCmd() *cobra.Command { + var placement, grpcListen, upstream string + cmd := &cobra.Command{ + Use: "start", + Short: "Start chip test sink (local or remote)", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, _ []string) error { + switch normalizePlacement(placement) { + case "local": + return startLocalChipSink(grpcListen, upstream) + case "remote": + return startRemoteChipSink(cmd.Context(), grpcListen, upstream) + default: + return fmt.Errorf("invalid placement %q (expected local or remote)", placement) + } + }, + } + cmd.Flags().StringVar(&placement, "placement", "local", "Sink placement: local or remote") + cmd.Flags().StringVar(&grpcListen, "grpc-listen", defaultLocalSinkListen, "Sink gRPC listen address") + cmd.Flags().StringVar(&upstream, "upstream-endpoint", "", "Optional upstream Chip Ingress endpoint") + return cmd +} + +func stopBeholderSinkCmd() *cobra.Command { + var placement string + cmd := &cobra.Command{ + Use: "stop", + Short: "Stop chip test sink (local or remote)", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, _ []string) error { + switch normalizePlacement(placement) { + case "local": + return stopLocalChipSink() + case "remote": + return stopRemoteChipSink(cmd.Context()) + default: + return fmt.Errorf("invalid placement %q (expected local or remote)", placement) + } + }, + } + cmd.Flags().StringVar(&placement, "placement", "local", 
"Sink placement: local or remote") + return cmd +} + +func statusBeholderSinkCmd() *cobra.Command { + var placement string + cmd := &cobra.Command{ + Use: "status", + Short: "Show chip test sink status (local or remote)", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, _ []string) error { + switch normalizePlacement(placement) { + case "local": + return statusLocalChipSink() + case "remote": + return statusRemoteChipSink(cmd.Context()) + default: + return fmt.Errorf("invalid placement %q (expected local or remote)", placement) + } + }, + } + cmd.Flags().StringVar(&placement, "placement", "local", "Sink placement: local or remote") + return cmd +} + +func eventsBeholderSinkCmd() *cobra.Command { + var ( + placement string + limit int + sinceRaw string + ) + cmd := &cobra.Command{ + Use: "events", + Short: "Read chip test sink events (local or remote)", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, _ []string) error { + var since time.Time + if strings.TrimSpace(sinceRaw) != "" { + parsed, err := time.Parse(time.RFC3339Nano, strings.TrimSpace(sinceRaw)) + if err != nil { + return fmt.Errorf("invalid --since value %q (expected RFC3339Nano)", sinceRaw) + } + since = parsed + } + switch normalizePlacement(placement) { + case "local": + return readLocalChipSinkEvents(since, limit) + case "remote": + return readRemoteChipSinkEvents(cmd.Context(), since, limit) + default: + return fmt.Errorf("invalid placement %q (expected local or remote)", placement) + } + }, + } + cmd.Flags().StringVar(&placement, "placement", "local", "Sink placement: local or remote") + cmd.Flags().IntVar(&limit, "limit", 200, "Max number of events to return") + cmd.Flags().StringVar(&sinceRaw, "since", "", "Filter events after RFC3339Nano timestamp") + return cmd +} + +func runLocalBeholderSinkCmd() *cobra.Command { + var grpcListen, upstream, eventsFile string + cmd := &cobra.Command{ + Use: "run-local", + Short: "Run local chip test sink server", + 
Hidden: true, + RunE: func(cmd *cobra.Command, _ []string) error { + if strings.TrimSpace(eventsFile) == "" { + return errors.New("events-file is required") + } + normalizedListen, err := normalizeLocalSinkListenAddress(grpcListen) + if err != nil { + return err + } + started := make(chan string, 1) + var eventsMu sync.Mutex + sinkServer, err := chipsink.NewServer(chipsink.Config{ + GRPCListen: normalizedListen, + UpstreamEndpoint: strings.TrimSpace(upstream), + Started: started, + PublishFn: func(_ context.Context, event *pb.CloudEvent) (*chippb.PublishResponse, error) { + if err := appendLocalChipSinkEvent(eventsFile, &eventsMu, event); err != nil { + framework.L.Warn().Err(err).Str("eventsFile", eventsFile).Msg("failed to append local chip sink event") + } + return &chippb.PublishResponse{}, nil + }, + }) + if err != nil { + return err + } + errCh := make(chan error, 1) + go func() { + errCh <- sinkServer.Run() + }() + + select { + case addr := <-started: + framework.L.Info().Str("grpcListen", addr).Msg("local chip test sink started") + fmt.Printf("local chip sink started: grpcListen=%s eventsFile=%s\n", addr, eventsFile) + case err := <-errCh: + return err + case <-time.After(10 * time.Second): + sinkServer.Shutdown(context.Background()) + return errors.New("timed out waiting for local chip test sink to start") + } + + sigCtx, stop := signal.NotifyContext(cmd.Context(), os.Interrupt, syscall.SIGTERM) + defer stop() + select { + case <-sigCtx.Done(): + sinkServer.Shutdown(context.Background()) + fmt.Printf("local chip sink stopped: eventsFile=%s\n", eventsFile) + return nil + case err := <-errCh: + return err + } + }, + } + cmd.Flags().StringVar(&grpcListen, "grpc-listen", defaultLocalSinkListen, "Sink gRPC listen address") + cmd.Flags().StringVar(&upstream, "upstream-endpoint", "", "Optional upstream Chip Ingress endpoint") + cmd.Flags().StringVar(&eventsFile, "events-file", "", "Path to NDJSON file with captured sink events") + return cmd +} + +func 
startLocalChipSink(grpcListen, upstream string) error { + normalizedListen, err := normalizeLocalSinkListenAddress(grpcListen) + if err != nil { + return err + } + existing, err := loadChipSinkLocalState() + if err == nil && existing.PID > 0 && processExists(existing.PID) { + framework.L.Info().Int("pid", existing.PID).Str("grpcListen", existing.GRPCListen).Str("eventsFile", existing.EventLogPath).Msg("local chip test sink already running") + fmt.Printf("local chip sink already running: pid=%d grpcListen=%s eventsFile=%s\n", existing.PID, existing.GRPCListen, existing.EventLogPath) + return nil + } + + executablePath, err := os.Executable() + if err != nil { + return errors.Wrap(err, "resolve executable path for local chip sink") + } + statePath := chipSinkStatePath() + if err := os.MkdirAll(filepath.Dir(statePath), 0o755); err != nil { + return errors.Wrap(err, "create chip sink state directory") + } + logPath := filepath.Join(filepath.Dir(statePath), chipSinkLogFilename) + eventsPath := chipSinkEventsPath() + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) + if err != nil { + return errors.Wrap(err, "open chip sink log file") + } + defer logFile.Close() + if err := os.Remove(eventsPath); err != nil && !os.IsNotExist(err) { + return errors.Wrap(err, "reset local chip sink events file") + } + + args := []string{"env", "beholder", "sink", "run-local", "--grpc-listen", normalizedListen, "--events-file", eventsPath} + if strings.TrimSpace(upstream) != "" { + args = append(args, "--upstream-endpoint", strings.TrimSpace(upstream)) + } + cmd := exec.Command(executablePath, args...) 
+ cmd.Stdout = logFile + cmd.Stderr = logFile + cmd.Stdin = nil + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + if err := cmd.Start(); err != nil { + return errors.Wrap(err, "start local chip sink process") + } + pid := cmd.Process.Pid + _ = cmd.Process.Release() + if !waitForPIDAlive(pid, 1500*time.Millisecond) { + return fmt.Errorf("local chip sink process exited too quickly (pid=%d)", pid) + } + if err := waitForLocalSinkReady(pid, normalizedListen, 5*time.Second, logPath); err != nil { + _ = stopPID(pid) + return err + } + if err := storeChipSinkLocalState(&chipSinkLocalState{ + Version: 1, + PID: pid, + GRPCListen: normalizedListen, + UpstreamEndpoint: strings.TrimSpace(upstream), + EventLogPath: eventsPath, + StartedAt: time.Now().UTC().Format(time.RFC3339Nano), + }); err != nil { + return err + } + fmt.Printf("local chip sink started in background: pid=%d grpcListen=%s eventsFile=%s\n", pid, normalizedListen, eventsPath) + return nil +} + +func stopLocalChipSink() error { + state, err := loadChipSinkLocalState() + if err != nil { + if os.IsNotExist(err) { + framework.L.Info().Msg("local chip test sink is not running") + return nil + } + return err + } + if state.PID <= 0 || !processExists(state.PID) { + return removeChipSinkLocalState() + } + proc, err := os.FindProcess(state.PID) + if err != nil { + return err + } + _ = proc.Signal(syscall.SIGTERM) + deadline := time.Now().Add(2 * time.Second) + for processExists(state.PID) && time.Now().Before(deadline) { + time.Sleep(100 * time.Millisecond) + } + if processExists(state.PID) { + _ = proc.Signal(syscall.SIGKILL) + } + if processExists(state.PID) { + return fmt.Errorf("local chip sink pid=%d did not stop", state.PID) + } + fmt.Printf("local chip sink stopped: pid=%d eventsFile=%s\n", state.PID, state.EventLogPath) + return removeChipSinkLocalState() +} + +func statusLocalChipSink() error { + state, err := loadChipSinkLocalState() + if err != nil { + if os.IsNotExist(err) { + fmt.Println("chip sink 
status: local running=false") + return nil + } + return err + } + running := state.PID > 0 && processExists(state.PID) + if !running { + fmt.Printf("chip sink status: local running=false pid=%d grpcListen=%s eventsFile=%s (stale state)\n", state.PID, state.GRPCListen, state.EventLogPath) + return nil + } + fmt.Printf("chip sink status: local running=true pid=%d grpcListen=%s eventsFile=%s\n", state.PID, state.GRPCListen, state.EventLogPath) + return nil +} + +func startRemoteChipSink(ctx context.Context, grpcListen, upstream string) error { + runtime, err := remoteclient.ResolveRuntime(framework.L) + if err != nil { + return err + } + resp, err := remoteclient.StartRemoteChipTestSink(ctx, runtime, agent.ChipTestSinkStartRequest{ + Name: "default", + GRPCListen: grpcListen, + UpstreamEndpoint: strings.TrimSpace(upstream), + }) + if err != nil { + return err + } + if err := storeRemoteAgentStateSnapshot(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to persist remote agent state snapshot") + } + fmt.Printf("chip sink status: remote running=true grpcListen=%s\n", resp.GRPCListen) + return nil +} + +func stopRemoteChipSink(ctx context.Context) error { + return withResolvedRemoteRuntime(ctx, func(ctx context.Context, runtime *remoteclient.Runtime) error { + resp, err := remoteclient.StopRemoteChipTestSink(ctx, runtime) + if err != nil { + return err + } + fmt.Printf("chip sink stop: remote found=%t stopped=%t\n", resp.Found, resp.Stopped) + return nil + }) +} + +func statusRemoteChipSink(ctx context.Context) error { + return withResolvedRemoteRuntime(ctx, func(ctx context.Context, runtime *remoteclient.Runtime) error { + resp, err := remoteclient.GetRemoteChipTestSinkStatus(ctx, runtime) + if err != nil { + return err + } + fmt.Printf("chip sink status: remote running=%t grpcListen=%s\n", resp.Running, resp.GRPCListen) + return nil + }) +} + +func normalizePlacement(v string) string { + switch strings.ToLower(strings.TrimSpace(v)) { + 
case "", "local": + return "local" + case "remote": + return "remote" + default: + return strings.ToLower(strings.TrimSpace(v)) + } +} + +func chipSinkStatePath() string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, envconfig.StateDirname, chipSinkStateFilename)) + if err != nil { + panic(fmt.Errorf("failed to get absolute path for chip sink state file: %w", err)) + } + return absPath +} + +func chipSinkEventsPath() string { + absPath, err := filepath.Abs(filepath.Join(relativePathToRepoRoot, envconfig.StateDirname, chipSinkEventsFilename)) + if err != nil { + panic(fmt.Errorf("failed to get absolute path for chip sink events file: %w", err)) + } + return absPath +} + +func loadChipSinkLocalState() (*chipSinkLocalState, error) { + data, err := os.ReadFile(chipSinkStatePath()) + if err != nil { + return nil, err + } + state := &chipSinkLocalState{} + if err := toml.Unmarshal(data, state); err != nil { + return nil, err + } + return state, nil +} + +func storeChipSinkLocalState(state *chipSinkLocalState) error { + data, err := toml.Marshal(state) + if err != nil { + return err + } + path := chipSinkStatePath() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + return os.WriteFile(path, data, 0o600) +} + +func removeChipSinkLocalState() error { + if err := os.Remove(chipSinkStatePath()); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +func appendLocalChipSinkEvent(path string, mu *sync.Mutex, event *pb.CloudEvent) error { + if event == nil { + return nil + } + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + eventData := localChipSinkEventData(event) + entry := map[string]any{ + "timestamp": time.Now().UTC().Format(time.RFC3339Nano), + "type": strings.TrimSpace(event.Type), + "data": eventData, + } + line, err := json.Marshal(entry) + if err != nil { + return err + } + mu.Lock() + defer mu.Unlock() + file, err := os.OpenFile(path, 
os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600) + if err != nil { + return err + } + defer file.Close() + if _, err := file.Write(append(line, '\n')); err != nil { + return err + } + return nil +} + +func readLocalChipSinkEvents(since time.Time, limit int) error { + eventsPath := chipSinkEventsPath() + file, err := os.Open(eventsPath) + if err != nil { + if os.IsNotExist(err) { + return printDebugJSON(map[string]any{"events": []any{}}) + } + return err + } + defer file.Close() + + events := make([]map[string]any, 0) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + var entry map[string]any + if err := json.Unmarshal([]byte(line), &entry); err != nil { + continue + } + if !since.IsZero() { + tsRaw, _ := entry["timestamp"].(string) + ts, err := time.Parse(time.RFC3339Nano, strings.TrimSpace(tsRaw)) + if err != nil || !ts.After(since) { + continue + } + } + events = append(events, entry) + } + if err := scanner.Err(); err != nil { + return err + } + if limit > 0 && len(events) > limit { + events = events[len(events)-limit:] + } + return printDebugJSON(map[string]any{"events": events}) +} + +func readRemoteChipSinkEvents(ctx context.Context, since time.Time, limit int) error { + return withResolvedRemoteRuntime(ctx, func(ctx context.Context, runtime *remoteclient.Runtime) error { + resp, err := remoteclient.GetRemoteChipTestSinkEvents(ctx, runtime, since, limit) + if err != nil { + return err + } + return printDebugJSON(resp) + }) +} + +func localChipSinkEventData(event *pb.CloudEvent) any { + return chipsink.EventData(event) +} + +func normalizeLocalSinkListenAddress(raw string) (string, error) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return defaultLocalSinkListen, nil + } + // Accept bare port for convenience, e.g. "50052". 
+ if _, err := strconv.Atoi(trimmed); err == nil { + return net.JoinHostPort("127.0.0.1", trimmed), nil + } + // Accept ":50052" and normalize to explicit host. + if strings.HasPrefix(trimmed, ":") { + return net.JoinHostPort("127.0.0.1", strings.TrimPrefix(trimmed, ":")), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil || strings.TrimSpace(port) == "" { + return "", fmt.Errorf("invalid --grpc-listen %q: expected host:port or port", raw) + } + return trimmed, nil +} + +func waitForLocalSinkReady(pid int, listenAddr string, timeout time.Duration, logPath string) error { + probeAddr, err := probeAddressForListen(listenAddr) + if err != nil { + return err + } + deadline := time.Now().Add(timeout) + var lastDialErr error + for time.Now().Before(deadline) { + if !processExists(pid) { + return fmt.Errorf("local chip sink process exited before becoming ready (pid=%d); check log: %s", pid, logPath) + } + conn, dialErr := net.DialTimeout("tcp", probeAddr, 200*time.Millisecond) + if dialErr == nil { + _ = conn.Close() + return nil + } + lastDialErr = dialErr + time.Sleep(100 * time.Millisecond) + } + return fmt.Errorf("local chip sink failed readiness probe on %s within %s (pid=%d, last error: %v); check log: %s", probeAddr, timeout, pid, lastDialErr, logPath) +} + +func probeAddressForListen(listenAddr string) (string, error) { + host, port, err := net.SplitHostPort(strings.TrimSpace(listenAddr)) + if err != nil { + return "", fmt.Errorf("invalid normalized listen address %q: %w", listenAddr, err) + } + host = strings.TrimSpace(host) + switch host { + case "", "0.0.0.0", "::": + host = "127.0.0.1" + } + return net.JoinHostPort(host, port), nil +} + +func stopPID(pid int) error { + if pid <= 0 { + return nil + } + proc, err := os.FindProcess(pid) + if err != nil { + return err + } + _ = proc.Signal(syscall.SIGTERM) + deadline := time.Now().Add(2 * time.Second) + for processExists(pid) && time.Now().Before(deadline) { + time.Sleep(100 * time.Millisecond) 
+ } + if processExists(pid) { + _ = proc.Signal(syscall.SIGKILL) + } + return nil +} diff --git a/core/scripts/cre/environment/environment/remote_state.go b/core/scripts/cre/environment/environment/remote_state.go index 7709449b837..a8a2621439b 100644 --- a/core/scripts/cre/environment/environment/remote_state.go +++ b/core/scripts/cre/environment/environment/remote_state.go @@ -78,15 +78,11 @@ func storeRemoteStopState(relativePathToRepoRoot string, cfg *envconfig.Config) return err } agentEnvelope := &remoteAgentStateEnvelope{Agent: captureRemoteAgentState()} - data, err := toml.Marshal(agentEnvelope) - if err != nil { - return err - } - path := remoteAgentFileAbsPath(relativePathToRepoRoot) - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return err - } - return os.WriteFile(path, data, 0o600) + return storeRemoteAgentState(relativePathToRepoRoot, agentEnvelope) +} + +func storeRemoteAgentStateSnapshot(relativePathToRepoRoot string) error { + return storeRemoteAgentState(relativePathToRepoRoot, &remoteAgentStateEnvelope{Agent: captureRemoteAgentState()}) } func filteredRemoteStopConfig(cfg *envconfig.Config) *envconfig.Config { @@ -119,6 +115,18 @@ func captureRemoteAgentState() remoteAgentState { } } +func storeRemoteAgentState(relativePathToRepoRoot string, envelope *remoteAgentStateEnvelope) error { + data, err := toml.Marshal(envelope) + if err != nil { + return err + } + path := remoteAgentFileAbsPath(relativePathToRepoRoot) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + return os.WriteFile(path, data, 0o600) +} + func firstNonEmpty(values ...string) string { for _, value := range values { if trimmed := strings.TrimSpace(value); trimmed != "" { diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server.go b/system-tests/lib/cre/environment/remoteexec/agent/server.go index f50534f88d7..786c0f69f55 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server.go +++ 
b/system-tests/lib/cre/environment/remoteexec/agent/server.go @@ -32,18 +32,20 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/chipsink" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/internal/dockerops" ) const ( - SchemaVersionV1 = "v1" - OperationStartComponent = "StartComponent" - OperationStopComponent = "StopComponent" - OperationDeployArtifacts = "DeployArtifacts" - OperationHealth = "Health" - ComponentTypeBlockchain = "blockchain" - ComponentTypeJD = "jd" - ComponentTypeNodeSet = "nodeset" + SchemaVersionV1 = "v1" + OperationStartComponent = "StartComponent" + OperationStopComponent = "StopComponent" + OperationDeployArtifacts = "DeployArtifacts" + OperationHealth = "Health" + ComponentTypeBlockchain = "blockchain" + ComponentTypeJD = "jd" + ComponentTypeNodeSet = "nodeset" + ComponentTypeChipTestSink = "chip-testsink" ErrCodeMethodNotAllowed = "method_not_allowed" ErrCodeInvalidRequestBody = "invalid_request_body" @@ -60,19 +62,19 @@ const ( EnvKeepFailedContainers = "CRE_AGENT_KEEP_FAILED_CONTAINERS" - defaultComponentLogsLimit = 200 - maxComponentLogsLimit = 1000 - componentLogsRingSize = 2000 + defaultComponentLogsLimit = 200 + maxComponentLogsLimit = 1000 + componentLogsRingSize = 2000 inFlightOperationScopeLifecycle = "lifecycle" inFlightOperationScopeGeneral = "general" - protocolVersion = "1.0.0" - capabilityComponentLogs = "componentLogs" - capabilityLocks = "locks" - capabilityDeployArtifacts = "deployArtifacts" - capabilityStartComponent = "startComponent" - capabilityRelay = "relay" - capabilityListCTFResources = "listCTFResources" - agentVersion = "dev" + protocolVersion = "1.0.0" + capabilityComponentLogs = 
"componentLogs" + capabilityLocks = "locks" + capabilityDeployArtifacts = "deployArtifacts" + capabilityStartComponent = "startComponent" + capabilityRelay = "relay" + capabilityListCTFResources = "listCTFResources" + agentVersion = "dev" ) var frameworkLogCaptureMu sync.Mutex @@ -119,16 +121,17 @@ type CTFResourcesResponse struct { } type AgentStatusResponse struct { - AgentVersion string `json:"agentVersion,omitempty"` - ProtocolVersion string `json:"protocolVersion,omitempty"` - SupportedSchemas []string `json:"supportedSchemas,omitempty"` - Capabilities []string `json:"capabilities,omitempty"` - UptimeSeconds int64 `json:"uptimeSeconds"` - RuntimeComponents []string `json:"runtimeComponents,omitempty"` - CachedComponents []string `json:"cachedComponents,omitempty"` - Relays []RelayInfo `json:"relays,omitempty"` - ComponentLogKeys []string `json:"componentLogKeys,omitempty"` - InFlight []InFlightOperation `json:"inFlight,omitempty"` + AgentVersion string `json:"agentVersion,omitempty"` + ProtocolVersion string `json:"protocolVersion,omitempty"` + SupportedSchemas []string `json:"supportedSchemas,omitempty"` + Capabilities []string `json:"capabilities,omitempty"` + UptimeSeconds int64 `json:"uptimeSeconds"` + RuntimeComponents []string `json:"runtimeComponents,omitempty"` + CachedComponents []string `json:"cachedComponents,omitempty"` + Relays []RelayInfo `json:"relays,omitempty"` + ComponentLogKeys []string `json:"componentLogKeys,omitempty"` + InFlight []InFlightOperation `json:"inFlight,omitempty"` + ChipSink *ChipTestSinkStatusResponse `json:"chipSink,omitempty"` } type RelayInfo struct { @@ -139,12 +142,12 @@ type RelayInfo struct { } type AgentLocksResponse struct { - LifecycleBusy bool `json:"lifecycleBusy"` - CacheEntries int `json:"cacheEntries"` - RuntimeEntries int `json:"runtimeEntries"` - RelayCount int `json:"relayCount"` - ComponentLogKeys int `json:"componentLogKeys"` - InFlight []InFlightOperation `json:"inFlight,omitempty"` + LifecycleBusy bool 
`json:"lifecycleBusy"` + CacheEntries int `json:"cacheEntries"` + RuntimeEntries int `json:"runtimeEntries"` + RelayCount int `json:"relayCount"` + ComponentLogKeys int `json:"componentLogKeys"` + InFlight []InFlightOperation `json:"inFlight,omitempty"` } type InFlightOperation struct { @@ -160,6 +163,46 @@ type ComponentLogsResponse struct { Lines []string `json:"lines,omitempty"` } +type ChipTestSinkStartRequest struct { + Name string `json:"name,omitempty"` + GRPCListen string `json:"grpcListen,omitempty"` + UpstreamEndpoint string `json:"upstreamEndpoint,omitempty"` +} + +type ChipTestSinkStartResponse struct { + Profile string `json:"profile"` + Mode string `json:"mode"` + Name string `json:"name"` + GRPCListen string `json:"grpcListen"` + UpstreamEndpoint string `json:"upstreamEndpoint,omitempty"` + EventLogPath string `json:"eventLogPath,omitempty"` +} + +type ChipTestSinkStatusResponse struct { + Profile string `json:"profile"` + Mode string `json:"mode"` + Running bool `json:"running"` + Name string `json:"name,omitempty"` + GRPCListen string `json:"grpcListen,omitempty"` + UpstreamEndpoint string `json:"upstreamEndpoint,omitempty"` + EventLogPath string `json:"eventLogPath,omitempty"` +} + +type ChipTestSinkStopResponse struct { + Found bool `json:"found"` + Stopped bool `json:"stopped"` +} + +type ChipTestSinkEventLogEntry struct { + Timestamp string `json:"timestamp"` + Type string `json:"type,omitempty"` + Event map[string]any `json:"event,omitempty"` +} + +type ChipTestSinkEventsResponse struct { + Events []ChipTestSinkEventLogEntry `json:"events"` +} + type inFlightOperation struct { ID string Scope string @@ -167,19 +210,21 @@ type inFlightOperation struct { } type Server struct { - lggr zerolog.Logger - deployers map[blockchain.ChainFamily]blockchains.Deployer - startedAt time.Time - lifecycleMu sync.Mutex - cacheMu sync.Mutex - cache map[string]cachedStart - runtime map[string]runtimeState - relayMu sync.Mutex - relays 
map[string]*relayRegistration - logsMu sync.Mutex + lggr zerolog.Logger + deployers map[blockchain.ChainFamily]blockchains.Deployer + startedAt time.Time + lifecycleMu sync.Mutex + cacheMu sync.Mutex + cache map[string]cachedStart + runtime map[string]runtimeState + relayMu sync.Mutex + relays map[string]*relayRegistration + logsMu sync.Mutex componentLogs map[string][]string - opsMu sync.Mutex - inFlight map[string]inFlightOperation + opsMu sync.Mutex + inFlight map[string]inFlightOperation + chipSinkMu sync.Mutex + chipSink *chipTestSinkRuntime } type cachedStart struct { @@ -190,18 +235,29 @@ type cachedStart struct { type runtimeState struct { ComponentType string ContainerIDs []string + StopFn func(context.Context) error +} + +type chipTestSinkRuntime struct { + name string + grpcListen string + upstreamEndpoint string + eventLogPath string + server *chipsink.Server + cancel context.CancelFunc + runErrCh chan error } func NewServer(lggr zerolog.Logger, deployers map[blockchain.ChainFamily]blockchains.Deployer) *Server { return &Server{ - lggr: lggr, - deployers: deployers, - startedAt: time.Now(), - cache: make(map[string]cachedStart), - runtime: make(map[string]runtimeState), - relays: make(map[string]*relayRegistration), + lggr: lggr, + deployers: deployers, + startedAt: time.Now(), + cache: make(map[string]cachedStart), + runtime: make(map[string]runtimeState), + relays: make(map[string]*relayRegistration), componentLogs: make(map[string][]string), - inFlight: make(map[string]inFlightOperation), + inFlight: make(map[string]inFlightOperation), } } @@ -216,6 +272,10 @@ func (s *Server) Handler() http.Handler { mux.HandleFunc("/v1/status", s.status) mux.HandleFunc("/v1/locks", s.locks) mux.HandleFunc("/v1/components/logs", s.componentLogsHandler) + mux.HandleFunc("/v1/chip/sink/start", s.startChipTestSink) + mux.HandleFunc("/v1/chip/sink/stop", s.stopChipTestSink) + mux.HandleFunc("/v1/chip/sink/status", s.chipTestSinkStatus) + 
mux.HandleFunc("/v1/chip/sink/events", s.chipTestSinkEvents) return mux } @@ -617,6 +677,12 @@ func (s *Server) stopTrackedComponentLocked(ctx context.Context, componentKey st if !ok { return false, nil } + if state.StopFn != nil { + if err := state.StopFn(ctx); err != nil { + return false, err + } + return true, nil + } if err := stopContainers(ctx, state.ContainerIDs); err != nil { return false, err } diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go new file mode 100644 index 00000000000..f2122ef5c31 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go @@ -0,0 +1,366 @@ +package agent + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/chipsink" +) + +const ( + defaultChipSinkName = "default" + defaultChipSinkGRPCListen = "0.0.0.0:50051" + defaultChipSinkEventsLimit = 200 + maxChipSinkEventsLimit = 1000 +) + +func (s *Server) startChipTestSink(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + var req ChipTestSinkStartRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidRequestBody, fmt.Sprintf("invalid request body: %v", err), nil) + return + } + + name := strings.TrimSpace(req.Name) + if name == "" { + name = defaultChipSinkName + } + grpcListen := strings.TrimSpace(req.GRPCListen) + if grpcListen == "" { + grpcListen = defaultChipSinkGRPCListen + } + normalizedListen, err := 
normalizeChipSinkListenAddress(grpcListen) + if err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, err.Error(), nil) + return + } + + s.chipSinkMu.Lock() + defer s.chipSinkMu.Unlock() + + if s.chipSink != nil { + s.lggr.Info(). + Str("name", s.chipSink.name). + Str("grpcListen", s.chipSink.grpcListen). + Str("upstreamEndpoint", s.chipSink.upstreamEndpoint). + Msg("chip test sink already running; returning existing status") + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStartResponse{ + Profile: "sink", + Mode: "remote", + Name: s.chipSink.name, + GRPCListen: s.chipSink.grpcListen, + UpstreamEndpoint: s.chipSink.upstreamEndpoint, + EventLogPath: s.chipSink.eventLogPath, + }) + return + } + + eventLogPath := defaultChipSinkEventLogPath() + if err := os.MkdirAll(filepath.Dir(eventLogPath), 0o755); err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to prepare chip sink log directory: %v", err), nil) + return + } + // Start with a clean event stream per launch. 
+ if err := os.Remove(eventLogPath); err != nil && !os.IsNotExist(err) { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to reset chip sink event log: %v", err), nil) + return + } + var eventLogMu sync.Mutex + + started := make(chan string, 1) + sinkServer, err := chipsink.NewServer(chipsink.Config{ + GRPCListen: normalizedListen, + UpstreamEndpoint: strings.TrimSpace(req.UpstreamEndpoint), + Started: started, + PublishFn: func(_ context.Context, event *pb.CloudEvent) (*chippb.PublishResponse, error) { + _ = appendChipSinkEvent(eventLogPath, &eventLogMu, event) + return &chippb.PublishResponse{}, nil + }, + }) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to create chip test sink server: %v", err), nil) + return + } + + runCtx, cancel := context.WithCancel(context.Background()) + runErrCh := make(chan error, 1) + go func() { + runErrCh <- sinkServer.Run() + }() + + select { + case boundAddr := <-started: + s.lggr.Info(). + Str("name", name). + Str("grpcListen", boundAddr). + Str("upstreamEndpoint", strings.TrimSpace(req.UpstreamEndpoint)). + Str("eventLogPath", eventLogPath). 
+ Msg("chip test sink started") + s.chipSink = &chipTestSinkRuntime{ + name: name, + grpcListen: boundAddr, + upstreamEndpoint: strings.TrimSpace(req.UpstreamEndpoint), + eventLogPath: eventLogPath, + server: sinkServer, + cancel: cancel, + runErrCh: runErrCh, + } + s.storeRuntime(fmt.Sprintf("%s:%s", ComponentTypeChipTestSink, name), runtimeState{ + ComponentType: ComponentTypeChipTestSink, + StopFn: func(ctx context.Context) error { + sinkServer.Shutdown(ctx) + cancel() + return nil + }, + }) + case err := <-runErrCh: + cancel() + s.lggr.Error().Err(err).Str("name", name).Str("grpcListen", normalizedListen).Msg("chip test sink failed to start") + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("chip test sink failed to start: %v", err), nil) + return + case <-time.After(10 * time.Second): + cancel() + sinkServer.Shutdown(context.Background()) + s.lggr.Error().Str("name", name).Str("grpcListen", normalizedListen).Msg("chip test sink startup timed out") + s.respondError(w, http.StatusGatewayTimeout, ErrCodeDeployFailed, "timed out waiting for chip test sink to start", nil) + return + case <-runCtx.Done(): + cancel() + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, "chip test sink startup canceled", nil) + return + } + + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStartResponse{ + Profile: "sink", + Mode: "remote", + Name: name, + GRPCListen: s.chipSink.grpcListen, + UpstreamEndpoint: s.chipSink.upstreamEndpoint, + EventLogPath: s.chipSink.eventLogPath, + }) +} + +func (s *Server) stopChipTestSink(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + s.chipSinkMu.Lock() + defer s.chipSinkMu.Unlock() + + if s.chipSink == nil { + s.lggr.Info().Msg("chip test sink stop requested; nothing running") + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStopResponse{Found: 
false, Stopped: false}) + return + } + + runtime := s.chipSink + s.lggr.Info(). + Str("name", runtime.name). + Str("grpcListen", runtime.grpcListen). + Msg("stopping chip test sink") + runtime.server.Shutdown(r.Context()) + runtime.cancel() + _, _ = s.takeRuntime(fmt.Sprintf("%s:%s", ComponentTypeChipTestSink, runtime.name)) + s.chipSink = nil + s.lggr.Info().Str("name", runtime.name).Msg("chip test sink stopped") + + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStopResponse{Found: true, Stopped: true}) +} + +func (s *Server) chipTestSinkStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + s.chipSinkMu.Lock() + defer s.chipSinkMu.Unlock() + if s.chipSink == nil { + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStatusResponse{ + Profile: "sink", + Mode: "remote", + Running: false, + }) + return + } + + s.respondJSONAny(w, http.StatusOK, ChipTestSinkStatusResponse{ + Profile: "sink", + Mode: "remote", + Running: true, + Name: s.chipSink.name, + GRPCListen: s.chipSink.grpcListen, + UpstreamEndpoint: s.chipSink.upstreamEndpoint, + EventLogPath: s.chipSink.eventLogPath, + }) +} + +func (s *Server) currentChipSinkStatus() *ChipTestSinkStatusResponse { + s.chipSinkMu.Lock() + defer s.chipSinkMu.Unlock() + if s.chipSink == nil { + return nil + } + return &ChipTestSinkStatusResponse{ + Profile: "sink", + Mode: "remote", + Running: true, + Name: s.chipSink.name, + GRPCListen: s.chipSink.grpcListen, + UpstreamEndpoint: s.chipSink.upstreamEndpoint, + EventLogPath: s.chipSink.eventLogPath, + } +} + +func (s *Server) chipTestSinkEvents(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + s.respondError(w, http.StatusMethodNotAllowed, ErrCodeMethodNotAllowed, "method not allowed", nil) + return + } + + s.chipSinkMu.Lock() + runtime := s.chipSink + s.chipSinkMu.Unlock() + if runtime == nil { + 
s.respondJSONAny(w, http.StatusOK, ChipTestSinkEventsResponse{Events: []ChipTestSinkEventLogEntry{}}) + return + } + + limit := defaultChipSinkEventsLimit + if rawLimit := strings.TrimSpace(r.URL.Query().Get("limit")); rawLimit != "" { + parsed, err := strconv.Atoi(rawLimit) + if err != nil || parsed <= 0 { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, "limit query parameter must be a positive integer", nil) + return + } + if parsed > maxChipSinkEventsLimit { + parsed = maxChipSinkEventsLimit + } + limit = parsed + } + + var since time.Time + if rawSince := strings.TrimSpace(r.URL.Query().Get("since")); rawSince != "" { + parsed, err := time.Parse(time.RFC3339Nano, rawSince) + if err != nil { + s.respondError(w, http.StatusBadRequest, ErrCodeInvalidPayload, "since query parameter must be RFC3339Nano timestamp", nil) + return + } + since = parsed + } + + events, err := readChipSinkEvents(runtime.eventLogPath, since, limit) + if err != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to read chip sink events: %v", err), nil) + return + } + s.respondJSONAny(w, http.StatusOK, ChipTestSinkEventsResponse{Events: events}) +} + +func defaultChipSinkEventLogPath() string { + return filepath.Join(os.TempDir(), "cre-agent-chip-sink-events.ndjson") +} + +func normalizeChipSinkListenAddress(raw string) (string, error) { + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + return defaultChipSinkGRPCListen, nil + } + // Accept bare port for convenience, e.g. "50052". + if _, err := strconv.Atoi(trimmed); err == nil { + return net.JoinHostPort("0.0.0.0", trimmed), nil + } + // Accept ":50052" and normalize to explicit host. 
+ if strings.HasPrefix(trimmed, ":") { + return net.JoinHostPort("0.0.0.0", strings.TrimPrefix(trimmed, ":")), nil + } + _, port, err := net.SplitHostPort(trimmed) + if err != nil || strings.TrimSpace(port) == "" { + return "", fmt.Errorf("invalid grpcListen %q: expected host:port or port", raw) + } + return trimmed, nil +} + +func appendChipSinkEvent(path string, mu *sync.Mutex, event *pb.CloudEvent) error { + if event == nil { + return nil + } + entry := ChipTestSinkEventLogEntry{ + Timestamp: time.Now().UTC().Format(time.RFC3339Nano), + Type: strings.TrimSpace(event.Type), + Event: chipsink.EventData(event), + } + line, err := json.Marshal(entry) + if err != nil { + return err + } + + mu.Lock() + defer mu.Unlock() + file, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o600) + if err != nil { + return err + } + defer file.Close() + if _, err := file.Write(append(line, '\n')); err != nil { + return err + } + return nil +} + +func readChipSinkEvents(path string, since time.Time, limit int) ([]ChipTestSinkEventLogEntry, error) { + file, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return []ChipTestSinkEventLogEntry{}, nil + } + return nil, err + } + defer file.Close() + + events := make([]ChipTestSinkEventLogEntry, 0, limit) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + var entry ChipTestSinkEventLogEntry + if err := json.Unmarshal([]byte(line), &entry); err != nil { + continue + } + if !since.IsZero() { + ts, err := time.Parse(time.RFC3339Nano, strings.TrimSpace(entry.Timestamp)) + if err != nil || !ts.After(since) { + continue + } + } + events = append(events, entry) + } + if err := scanner.Err(); err != nil { + return nil, err + } + if len(events) <= limit { + return events, nil + } + return events[len(events)-limit:], nil +} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink_test.go 
b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink_test.go new file mode 100644 index 00000000000..b23425e2d78 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink_test.go @@ -0,0 +1,75 @@ +package agent + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" + + "github.com/rs/zerolog" + "github.com/stretchr/testify/require" +) + +func TestChipSinkEventsEndpointReturnsEntriesFromLogFile(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + + startReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/start", bytes.NewReader([]byte(`{"name":"sink-a","grpcListen":"127.0.0.1:0"}`))) + startReq.Header.Set("Content-Type", "application/json") + startRR := httptest.NewRecorder() + server.Handler().ServeHTTP(startRR, startReq) + require.Equal(t, http.StatusOK, startRR.Code) + + var startResp ChipTestSinkStartResponse + require.NoError(t, json.Unmarshal(startRR.Body.Bytes(), &startResp)) + require.NotEmpty(t, startResp.EventLogPath) + t.Cleanup(func() { + stopReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/stop", bytes.NewReader([]byte(`{}`))) + stopReq.Header.Set("Content-Type", "application/json") + stopRR := httptest.NewRecorder() + server.Handler().ServeHTTP(stopRR, stopReq) + }) + + entry := ChipTestSinkEventLogEntry{ + Timestamp: time.Now().UTC().Add(1 * time.Second).Format(time.RFC3339Nano), + Type: "workflows.v1.UserLogs", + Event: map[string]any{"id": "abc"}, + } + line, err := json.Marshal(entry) + require.NoError(t, err) + err = os.WriteFile(startResp.EventLogPath, append(line, '\n'), 0o600) + require.NoError(t, err) + + eventsReq := httptest.NewRequest(http.MethodGet, "/v1/chip/sink/events?limit=10", nil) + eventsRR := httptest.NewRecorder() + server.Handler().ServeHTTP(eventsRR, eventsReq) + require.Equal(t, http.StatusOK, eventsRR.Code) + + var eventsResp ChipTestSinkEventsResponse + require.NoError(t, 
json.Unmarshal(eventsRR.Body.Bytes(), &eventsResp)) + require.Len(t, eventsResp.Events, 1) + require.Equal(t, "workflows.v1.UserLogs", eventsResp.Events[0].Type) +} + +func TestStartChipSinkNormalizesBarePortListenAddress(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + + startReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/start", bytes.NewReader([]byte(`{"name":"sink-a","grpcListen":"50052"}`))) + startReq.Header.Set("Content-Type", "application/json") + startRR := httptest.NewRecorder() + server.Handler().ServeHTTP(startRR, startReq) + require.Equal(t, http.StatusOK, startRR.Code) + + var startResp ChipTestSinkStartResponse + require.NoError(t, json.Unmarshal(startRR.Body.Bytes(), &startResp)) + require.True(t, strings.HasSuffix(startResp.GRPCListen, ":50052"), "expected normalized listen addr to bind port 50052, got %s", startResp.GRPCListen) + + stopReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/stop", bytes.NewReader([]byte(`{}`))) + stopReq.Header.Set("Content-Type", "application/json") + stopRR := httptest.NewRecorder() + server.Handler().ServeHTTP(stopRR, stopReq) + require.Equal(t, http.StatusOK, stopRR.Code) +} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go index 4737890294a..e6cf240dd22 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go @@ -197,3 +197,41 @@ func TestComponentLogsEndpointValidationAndLimit(t *testing.T) { require.Equal(t, 3, resp.TotalLines) require.Equal(t, []string{"line-b", "line-c"}, resp.Lines) } + +func TestChipTestSinkLifecycleEndpoints(t *testing.T) { + server := NewServer(zerolog.Nop(), nil) + + startReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/start", bytes.NewReader([]byte(`{"name":"sink-a","grpcListen":"127.0.0.1:0"}`))) + 
startReq.Header.Set("Content-Type", "application/json") + startRR := httptest.NewRecorder() + server.Handler().ServeHTTP(startRR, startReq) + require.Equal(t, http.StatusOK, startRR.Code) + + var startResp ChipTestSinkStartResponse + require.NoError(t, json.Unmarshal(startRR.Body.Bytes(), &startResp)) + require.Equal(t, "sink", startResp.Profile) + require.Equal(t, "remote", startResp.Mode) + require.Equal(t, "sink-a", startResp.Name) + require.NotEmpty(t, startResp.GRPCListen) + + statusReq := httptest.NewRequest(http.MethodGet, "/v1/chip/sink/status", nil) + statusRR := httptest.NewRecorder() + server.Handler().ServeHTTP(statusRR, statusReq) + require.Equal(t, http.StatusOK, statusRR.Code) + + var statusResp ChipTestSinkStatusResponse + require.NoError(t, json.Unmarshal(statusRR.Body.Bytes(), &statusResp)) + require.True(t, statusResp.Running) + require.Equal(t, "sink-a", statusResp.Name) + + stopReq := httptest.NewRequest(http.MethodPost, "/v1/chip/sink/stop", bytes.NewReader([]byte(`{}`))) + stopReq.Header.Set("Content-Type", "application/json") + stopRR := httptest.NewRecorder() + server.Handler().ServeHTTP(stopRR, stopReq) + require.Equal(t, http.StatusOK, stopRR.Code) + + var stopResp ChipTestSinkStopResponse + require.NoError(t, json.Unmarshal(stopRR.Body.Bytes(), &stopResp)) + require.True(t, stopResp.Found) + require.True(t, stopResp.Stopped) +} diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go b/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go index 1d434931e69..b5cd8e79e88 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_status_handlers.go @@ -18,18 +18,20 @@ func (s *Server) status(w http.ResponseWriter, r *http.Request) { relayInfos := s.relayInfos() componentLogKeys := s.componentLogKeys() inFlight, _ := s.inFlightSnapshot() + chipSinkStatus := s.currentChipSinkStatus() 
s.respondJSONAny(w, http.StatusOK, AgentStatusResponse{ AgentVersion: agentVersion, ProtocolVersion: protocolVersion, SupportedSchemas: []string{SchemaVersionV1}, - Capabilities: []string{capabilityStartComponent, capabilityDeployArtifacts, capabilityRelay, capabilityListCTFResources, capabilityLocks, capabilityComponentLogs}, + Capabilities: []string{capabilityStartComponent, capabilityDeployArtifacts, capabilityRelay, capabilityListCTFResources, capabilityLocks, capabilityComponentLogs, "chipSinkLifecycle"}, UptimeSeconds: int64(time.Since(s.startedAt).Seconds()), RuntimeComponents: runtimeKeys, CachedComponents: cacheKeys, Relays: relayInfos, ComponentLogKeys: componentLogKeys, InFlight: inFlight, + ChipSink: chipSinkStatus, }) } diff --git a/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go b/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go new file mode 100644 index 00000000000..25f048625b0 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go @@ -0,0 +1,112 @@ +package chipsink + +import ( + "encoding/base64" + "encoding/json" + "strings" + + "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" + workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" + workfloweventsv2 "github.com/smartcontractkit/chainlink-protos/workflows/go/v2" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" +) + +// EventData decodes known CHiP workflow event types to human-readable JSON maps. +// Unknown or undecodable events fall back to a minimal metadata+base64 representation. 
+func EventData(event *pb.CloudEvent) map[string]any { + if event == nil { + return map[string]any{} + } + + msg := typedMessageForEventType(strings.TrimSpace(event.GetType())) + if msg != nil { + if protoData := event.GetProtoData(); protoData != nil && len(protoData.GetValue()) > 0 { + if err := proto.Unmarshal(protoData.GetValue(), msg); err == nil { + if asMap, ok := protoMessageAsMap(msg); ok { + return asMap + } + } + } + } + + fallback := map[string]any{ + "id": strings.TrimSpace(event.GetId()), + "type": strings.TrimSpace(event.GetType()), + "source": strings.TrimSpace(event.GetSource()), + "specVersion": strings.TrimSpace(event.GetSpecVersion()), + } + if protoData := event.GetProtoData(); protoData != nil && len(protoData.GetValue()) > 0 { + fallback["protoDataBase64"] = base64.StdEncoding.EncodeToString(protoData.GetValue()) + } + if textData := strings.TrimSpace(event.GetTextData()); textData != "" { + fallback["textData"] = textData + } + return fallback +} + +func protoMessageAsMap(msg proto.Message) (map[string]any, bool) { + dataBytes, err := (protojson.MarshalOptions{Multiline: false}).Marshal(msg) + if err != nil { + return nil, false + } + var out map[string]any + if err := json.Unmarshal(dataBytes, &out); err != nil { + return nil, false + } + return out, true +} + +func typedMessageForEventType(eventType string) proto.Message { + switch eventType { + // workflows.v1 events + case "workflows.v1.CapabilityExecutionFinished": + return &workflowevents.CapabilityExecutionFinished{} + case "workflows.v1.CapabilityExecutionStarted": + return &workflowevents.CapabilityExecutionStarted{} + case "workflows.v1.MeteringReport": + return &workflowevents.MeteringReport{} + case "workflows.v1.TransmissionsScheduledEvent": + return &workflowevents.TransmissionsScheduledEvent{} + case "workflows.v1.TransmitScheduleEvent": + return &workflowevents.TransmitScheduleEvent{} + case "workflows.v1.WorkflowExecutionFinished": + return 
&workflowevents.WorkflowExecutionFinished{} + case "workflows.v1.WorkflowExecutionStarted": + return &workflowevents.WorkflowExecutionStarted{} + case "workflows.v1.WorkflowStatusChanged": + return &workflowevents.WorkflowStatusChanged{} + case "workflows.v1.UserLogs": + return &workflowevents.UserLogs{} + + // workflows.v2 events + case "workflows.v2.CapabilityExecutionFinished": + return &workfloweventsv2.CapabilityExecutionFinished{} + case "workflows.v2.CapabilityExecutionStarted": + return &workfloweventsv2.CapabilityExecutionStarted{} + case "workflows.v2.TriggerExecutionStarted": + return &workfloweventsv2.TriggerExecutionStarted{} + case "workflows.v2.WorkflowActivated": + return &workfloweventsv2.WorkflowActivated{} + case "workflows.v2.WorkflowDeleted": + return &workfloweventsv2.WorkflowDeleted{} + case "workflows.v2.WorkflowDeployed": + return &workfloweventsv2.WorkflowDeployed{} + case "workflows.v2.WorkflowExecutionFinished": + return &workfloweventsv2.WorkflowExecutionFinished{} + case "workflows.v2.WorkflowExecutionStarted": + return &workfloweventsv2.WorkflowExecutionStarted{} + case "workflows.v2.WorkflowPaused": + return &workfloweventsv2.WorkflowPaused{} + case "workflows.v2.WorkflowUpdated": + return &workfloweventsv2.WorkflowUpdated{} + case "workflows.v2.WorkflowUserLog": + return &workfloweventsv2.WorkflowUserLog{} + + case "BaseMessage": + return &commonevents.BaseMessage{} + default: + return nil + } +} diff --git a/system-tests/lib/cre/environment/remoteexec/chipsink/server.go b/system-tests/lib/cre/environment/remoteexec/chipsink/server.go new file mode 100644 index 00000000000..50669b634d3 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/chipsink/server.go @@ -0,0 +1,129 @@ +package chipsink + +// NOTE: This implementation intentionally mirrors the test helper sink from +// `system-tests/tests/test-helpers/chip-testsink`. 
+// We keep this copy under `system-tests/lib` so runtime code (agent/CLI) can +// depend on it without importing from test-only packages. +// If we later move the sink to a shared package, both callers should use that +// single canonical location. + +import ( + "context" + "fmt" + "net" + "sync" + "time" + + "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +const listenerReadyTimeout = 5 * time.Second + +type PublishFn func(ctx context.Context, event *pb.CloudEvent) (*chippb.PublishResponse, error) + +type Config struct { + GRPCListen string + UpstreamEndpoint string + PublishFn PublishFn + Started chan<- string +} + +type Server struct { + cfg Config + + grpcServer *grpc.Server + upstream chippb.ChipIngressClient + onceStop sync.Once + + chippb.UnimplementedChipIngressServer +} + +func NewServer(cfg Config) (*Server, error) { + s := &Server{cfg: cfg} + s.grpcServer = grpc.NewServer() + chippb.RegisterChipIngressServer(s.grpcServer, s) + + if cfg.UpstreamEndpoint != "" { + conn, err := grpc.NewClient(cfg.UpstreamEndpoint, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + return nil, fmt.Errorf("dial upstream chip ingress: %w", err) + } + s.upstream = chippb.NewChipIngressClient(conn) + } + + return s, nil +} + +func (s *Server) Run() error { + lc := net.ListenConfig{} + lis, err := lc.Listen(context.Background(), "tcp", s.cfg.GRPCListen) + if err != nil { + return fmt.Errorf("gRPC listen: %w", err) + } + addr := lis.Addr().String() + + errCh := make(chan error, 1) + go func() { + errCh <- s.grpcServer.Serve(lis) + }() + if err := waitForListenerReady(addr, listenerReadyTimeout); err != nil { + s.grpcServer.Stop() + return err + } + notifyStarted(s.cfg.Started, addr) + + return <-errCh +} + +func (s *Server) Shutdown(context.Context) { + s.onceStop.Do(func() { + 
s.grpcServer.GracefulStop() + }) +} + +func (s *Server) Publish(ctx context.Context, event *pb.CloudEvent) (*chippb.PublishResponse, error) { + if s.cfg.UpstreamEndpoint != "" && s.upstream != nil { + go func() { + upstreamCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _, _ = s.upstream.Publish(upstreamCtx, event) + }() + } + + if s.cfg.PublishFn != nil { + return s.cfg.PublishFn(ctx, event) + } + return &chippb.PublishResponse{}, nil +} + +func waitForListenerReady(addr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var lastErr error + for time.Now().Before(deadline) { + dialer := &net.Dialer{Timeout: 250 * time.Millisecond} + conn, err := dialer.Dial("tcp", addr) + if err == nil { + _ = conn.Close() + return nil + } + lastErr = err + time.Sleep(50 * time.Millisecond) + } + if lastErr == nil { + lastErr = fmt.Errorf("listener on %s not ready", addr) + } + return fmt.Errorf("timeout waiting for listener readiness: %w", lastErr) +} + +func notifyStarted(ch chan<- string, addr string) { + if ch == nil { + return + } + select { + case ch <- addr: + default: + } +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote.go b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote.go new file mode 100644 index 00000000000..708749dd798 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote.go @@ -0,0 +1,112 @@ +package client + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" +) + +const chipSinkLifecycleTimeout = 30 * time.Second + +func StartRemoteChipTestSink(ctx context.Context, runtime *Runtime, req agent.ChipTestSinkStartRequest) (*agent.ChipTestSinkStartResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + var out 
agent.ChipTestSinkStartResponse + if err := postAgentJSON(ctx, baseURL+"/v1/chip/sink/start", req, &out); err != nil { + return nil, err + } + return &out, nil +} + +func StopRemoteChipTestSink(ctx context.Context, runtime *Runtime) (*agent.ChipTestSinkStopResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + var out agent.ChipTestSinkStopResponse + if err := postAgentJSON(ctx, baseURL+"/v1/chip/sink/stop", map[string]any{}, &out); err != nil { + return nil, err + } + return &out, nil +} + +func GetRemoteChipTestSinkStatus(ctx context.Context, runtime *Runtime) (*agent.ChipTestSinkStatusResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + var out agent.ChipTestSinkStatusResponse + if err := getAgentJSON(ctx, baseURL+"/v1/chip/sink/status", &out); err != nil { + return nil, err + } + return &out, nil +} + +func GetRemoteChipTestSinkEvents(ctx context.Context, runtime *Runtime, since time.Time, limit int) (*agent.ChipTestSinkEventsResponse, error) { + baseURL, err := runtimeBaseURL(runtime) + if err != nil { + return nil, err + } + endpoint := baseURL + "/v1/chip/sink/events" + query := make([]string, 0, 2) + if limit > 0 { + query = append(query, fmt.Sprintf("limit=%d", limit)) + } + if !since.IsZero() { + query = append(query, "since="+since.UTC().Format(time.RFC3339Nano)) + } + if len(query) > 0 { + endpoint += "?" 
+ strings.Join(query, "&") + } + var out agent.ChipTestSinkEventsResponse + if err := getAgentJSON(ctx, endpoint, &out); err != nil { + return nil, err + } + return &out, nil +} + +func postAgentJSON(ctx context.Context, endpoint string, payload any, target any) error { + httpClient := &http.Client{Timeout: chipSinkLifecycleTimeout} + body, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("failed to marshal agent request body for %s: %w", endpoint, err) + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("failed to build agent request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to call agent endpoint %s: %w", endpoint, err) + } + defer resp.Body.Close() + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read agent response from %s: %w", endpoint, err) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + var agentErr agent.StartComponentResponse + if len(respBody) > 0 && json.Unmarshal(respBody, &agentErr) == nil && strings.TrimSpace(agentErr.Error) != "" { + if agentErr.ErrorCode != "" { + return RemoteAgentError(agentErr.ErrorCode, agentErr.Error) + } + return RemoteAgentError("remote_agent_error", agentErr.Error) + } + return fmt.Errorf("agent endpoint %s returned %s: %s", endpoint, resp.Status, strings.TrimSpace(string(respBody))) + } + if err := json.Unmarshal(respBody, target); err != nil { + return fmt.Errorf("failed to decode agent response from %s: %w", endpoint, err) + } + return nil +} diff --git a/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go new file mode 100644 index 00000000000..0acbdb1fec0 --- /dev/null +++ b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go @@ -0,0 
+1,83 @@ +package client + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/stretchr/testify/require" +) + +func TestStartRemoteChipTestSinkSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/v1/chip/sink/start", r.URL.Path) + require.Equal(t, http.MethodPost, r.Method) + _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStartResponse{ + Profile: "sink", + Mode: "remote", + Name: "default", + GRPCListen: "0.0.0.0:50051", + }) + })) + defer server.Close() + + resp, err := StartRemoteChipTestSink(context.Background(), &Runtime{AgentBaseURL: server.URL}, agent.ChipTestSinkStartRequest{}) + require.NoError(t, err) + require.Equal(t, "sink", resp.Profile) + require.Equal(t, "remote", resp.Mode) +} + +func TestStopRemoteChipTestSinkSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/v1/chip/sink/stop", r.URL.Path) + require.Equal(t, http.MethodPost, r.Method) + _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStopResponse{Found: true, Stopped: true}) + })) + defer server.Close() + + resp, err := StopRemoteChipTestSink(context.Background(), &Runtime{AgentBaseURL: server.URL}) + require.NoError(t, err) + require.True(t, resp.Found) + require.True(t, resp.Stopped) +} + +func TestGetRemoteChipTestSinkStatusSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/v1/chip/sink/status", r.URL.Path) + require.Equal(t, http.MethodGet, r.Method) + _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStatusResponse{ + Profile: "sink", + Mode: "remote", + Running: true, + Name: "default", + GRPCListen: "0.0.0.0:50051", + }) + })) + defer server.Close() + + resp, err := 
GetRemoteChipTestSinkStatus(context.Background(), &Runtime{AgentBaseURL: server.URL}) + require.NoError(t, err) + require.True(t, resp.Running) + require.Equal(t, "sink", resp.Profile) +} + +func TestGetRemoteChipTestSinkEventsSuccess(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/v1/chip/sink/events", r.URL.Path) + require.Equal(t, http.MethodGet, r.Method) + require.Equal(t, "5", r.URL.Query().Get("limit")) + _ = json.NewEncoder(w).Encode(agent.ChipTestSinkEventsResponse{ + Events: []agent.ChipTestSinkEventLogEntry{{Type: "workflows.v1.UserLogs"}}, + }) + })) + defer server.Close() + + resp, err := GetRemoteChipTestSinkEvents(context.Background(), &Runtime{AgentBaseURL: server.URL}, time.Time{}, 5) + require.NoError(t, err) + require.Len(t, resp.Events, 1) + require.Equal(t, "workflows.v1.UserLogs", resp.Events[0].Type) +} diff --git a/system-tests/tests/test-helpers/chip_testsink_helpers.go b/system-tests/tests/test-helpers/chip_testsink_helpers.go index b1e19078437..fbbba6d86e3 100644 --- a/system-tests/tests/test-helpers/chip_testsink_helpers.go +++ b/system-tests/tests/test-helpers/chip_testsink_helpers.go @@ -29,7 +29,10 @@ import ( chiptestsink "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers/chip-testsink" ) -const testSinkStartupTimeout = 10 * time.Second +const ( + testSinkStartupTimeout = 10 * time.Second + envChipTestSinkUpstreamEndpoint = "CRE_CHIP_TESTSINK_UPSTREAM_ENDPOINT" +) // WaitForUserLog monitors workflow user logs until one contains needle or the context ends. 
func WaitForUserLog( @@ -281,13 +284,14 @@ and make sure that the sink is pointing to correct upstream endpoint ('localhost grpcPort, convErr := strconv.Atoi(chipingressset.DEFAULT_CHIP_INGRESS_GRPC_PORT) require.NoError(t, convErr, "invalid default chip ingress grpc port") EnsureFixtureRelayForPort(t, nil, "chip-testsink", grpcPort) + upstreamEndpoint := strings.TrimSpace(os.Getenv(envChipTestSinkUpstreamEndpoint)) startCh := make(chan struct{}, 1) server, err := chiptestsink.NewServer(chiptestsink.Config{ - PublishFunc: publishFn, - GRPCListen: grpcListenAddr, - Started: startCh, // signals that server is indeed listening on the GRPC port - // UpstreamEndpoint: "localhost:50052", // uncomment to forward events to ChIP, remember to start ChIP on a different port config.DefaultChipIngressPort (=50051) + PublishFunc: publishFn, + GRPCListen: grpcListenAddr, + Started: startCh, // signals that server is indeed listening on the GRPC port + UpstreamEndpoint: upstreamEndpoint, }) require.NoError(t, err, "failed to create new test sink server") From bf97fcbfa6f50e23652e3e4ad6269a008bdb555b Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Fri, 27 Feb 2026 15:59:07 +0100 Subject: [PATCH 30/34] more clean up and a small refactor --- core/scripts/cre/environment/README.md | 8 +- ...workflow-gateway-don-mixed-placement.toml} | 22 +- .../configs/workflow-gateway-don-remote.toml | 90 ------ .../configs/workflow-gateway-don.toml | 16 +- .../docs/ARCHITECTURE_REMOTEEXEC.md | 2 +- .../cre/environment/environment/debug.go | 35 +-- .../environment/environment/environment.go | 286 +----------------- .../environment/relay_supervisor.go | 33 ++ .../cre/environment/environment/remote.go | 39 +++ .../cre/environment/environment/stop.go | 279 +++++++++++++++++ .../cre/environment/environment/swap.go | 10 +- .../cre/environment/environment/workflow.go | 61 ++-- system-tests/lib/cre/don/config/config.go | 104 +------ .../lib/cre/environment/environment.go | 8 +- 
.../environment/environment_placement_test.go | 4 +- .../{execution_plan.go => placement_plan.go} | 8 +- system-tests/lib/cre/topology.go | 50 +++ system-tests/lib/cre/types.go | 23 ++ 18 files changed, 512 insertions(+), 566 deletions(-) rename core/scripts/cre/environment/configs/{workflow-gateway-don-mixed.toml => workflow-gateway-don-mixed-placement.toml} (81%) delete mode 100644 core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml create mode 100644 core/scripts/cre/environment/environment/remote.go create mode 100644 core/scripts/cre/environment/environment/stop.go rename system-tests/lib/cre/environment/{execution_plan.go => placement_plan.go} (96%) diff --git a/core/scripts/cre/environment/README.md b/core/scripts/cre/environment/README.md index fde2e234589..63e2edaf890 100644 --- a/core/scripts/cre/environment/README.md +++ b/core/scripts/cre/environment/README.md @@ -282,7 +282,7 @@ For more details on the URL resolution process and how workflow artifacts are ha go run . env stop # stop remote components only -go run . env stop-remote +go run . env remote stop # stop remote first, then local resources and local services go run . env stop-all @@ -306,10 +306,10 @@ Environment variable precedence for agent resolution: Stop command semantics: - `env stop`: local resources only; does not stop remote components. -- `env stop-remote`: remote resources only through the remote agent. +- `env remote stop`: remote resources only through the remote agent. - `env stop-all`: remote stop followed by local stop. -If `env stop` warns about remote components still running, run `env stop-remote`. +If `env stop` warns about remote components still running, run `env remote stop`. Architecture ownership and boundaries are documented in: - [`docs/ARCHITECTURE_REMOTEEXEC.md`](./docs/ARCHITECTURE_REMOTEEXEC.md) @@ -319,7 +319,7 @@ Mixed-mode verification checklist: 1. Start with a mixed config (`local` + `remote` placements). 2. 
Confirm startup output includes `Runtime Placement Matrix`. 3. Deploy a workflow/artifact and verify remote delivery path succeeds. -4. Run `env stop-remote` and verify remote stop summary reports requested/stopped counts. +4. Run `env remote stop` and verify remote stop summary reports requested/stopped counts. 5. Run `env stop-all` and verify no local containers/state remain. ## Restarting the environment diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed-placement.toml similarity index 81% rename from core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml rename to core/scripts/cre/environment/configs/workflow-gateway-don-mixed-placement.toml index 2146731fdc8..365283ae4aa 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-mixed.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don-mixed-placement.toml @@ -1,4 +1,7 @@ +# This topology starts the workflow DON on the local machine and everything else on a remote +# It requires that JD image (job-distributor:0.22.1) and CL node image (chainlink-amd:latest) is either present on the remote machine or can be pulled by it + [[blockchains]] type = "anvil" chain_id = "1337" @@ -56,11 +59,9 @@ [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] - #docker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-tmp:latest" - user_config_overrides = "" + docker_ctx = "../../../.." + docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } [[nodesets]] nodes = 4 @@ -87,11 +88,7 @@ [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] - #docker_ctx = "../../../.." 
- #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } image = "chainlink-amd:latest" - user_config_overrides = "" [[nodesets]] nodes = 1 @@ -113,13 +110,8 @@ [[nodesets.node_specs]] roles = ["bootstrap", "gateway"] [nodesets.node_specs.node] - #ocker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } image = "chainlink-amd:latest" # 5002 is the web API capabilities port for incoming requests # 5003 is the gateway port for outgoing connections # 15002 is the vault port for incoming requests - custom_ports = ["5002:5002","5003:5003","15002:15002"] - # image = "chainlink-tmp:latest" - user_config_overrides = "" \ No newline at end of file + custom_ports = ["5002:5002","5003:5003","15002:15002"] \ No newline at end of file diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml b/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml deleted file mode 100644 index cbddfc6b7ab..00000000000 --- a/core/scripts/cre/environment/configs/workflow-gateway-don-remote.toml +++ /dev/null @@ -1,90 +0,0 @@ - -[[blockchains]] - type = "anvil" - chain_id = "1337" - docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - placement = "remote" - -[[blockchains]] - type = "anvil" - chain_id = "2337" - port = "8546" - docker_cmd_params = ["-b", "0.5", "--mixed-mining"] - placement = "remote" - container_name = "anvil-2337" - remote_start_policy = "always" - -[jd] - csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string - # change to your version - image = "job-distributor:0.22.1" - placement = "remote" - # we need fresh DB on each run to avoid DB-level job name uniquness violations - remote_start_policy = "always" - -[fake] - port = 8171 - -[fake_http] - port = 8666 - -#[s3provider] -# # use all defaults -# port = 9000 -# console_port = 9001 - -[infra] - # either 
"docker" or "kubernetes" - type = "docker" - -[[nodesets]] - nodes = 4 - name = "workflow" - don_types = ["workflow"] - override_mode = "all" - http_port_range_start = 10100 - placement = "remote" - - env_vars = { CL_EVM_CMD = "" } - capabilities = ["ocr3", "custom-compute", "web-api-target", "web-api-trigger", "vault", "cron", "http-action", "http-trigger", "consensus", "don-time", "write-evm-1337", "write-evm-2337", "evm-1337", "evm-2337", "read-contract-1337", "read-contract-2337"] - - [nodesets.db] - image = "postgres:12.0" - port = 13000 - -[[nodesets.node_specs]] - roles = ["plugin"] - [nodesets.node_specs.node] - #docker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-amd:latest" - user_config_overrides = "" - -[[nodesets]] - nodes = 1 - name = "bootstrap-gateway" - don_types = ["bootstrap", "gateway"] - override_mode = "each" - http_port_range_start = 10300 - placement = "remote" - - env_vars = { CL_EVM_CMD = "" } - supported_evm_chains = [1337, 2337] - - [nodesets.db] - image = "postgres:12.0" - port = 13200 - - [[nodesets.node_specs]] - roles = ["bootstrap", "gateway"] - [nodesets.node_specs.node] - #ocker_ctx = "../../../.." 
- #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-amd:latest" - # 5002 is the web API capabilities port for incoming requests - # 15002 is the vault port for incoming requests - custom_ports = ["5002:5002","15002:15002"] - # image = "chainlink-tmp:latest" - user_config_overrides = "" diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don.toml b/core/scripts/cre/environment/configs/workflow-gateway-don.toml index 5784dbb5ec3..4147aff7e41 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don.toml @@ -49,10 +49,10 @@ [[nodesets.node_specs]] roles = ["plugin"] [nodesets.node_specs.node] - #docker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } - image = "chainlink-tmp:latest" + docker_ctx = "../../../.." + docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + #image = "chainlink-tmp:latest" user_config_overrides = "" [[nodesets]] @@ -72,11 +72,11 @@ [[nodesets.node_specs]] roles = ["bootstrap", "gateway"] [nodesets.node_specs.node] - #docker_ctx = "../../../.." - #docker_file = "core/chainlink.Dockerfile" - #docker_build_args = { "CL_IS_PROD_BUILD" = "false" } + docker_ctx = "../../../.." 
+ docker_file = "core/chainlink.Dockerfile" + docker_build_args = { "CL_IS_PROD_BUILD" = "false" } # 5002 is the web API capabilities port for incoming requests # 15002 is the vault port for incoming requests custom_ports = ["5002:5002","15002:15002"] - image = "chainlink-tmp:latest" + #image = "chainlink-tmp:latest" user_config_overrides = "" diff --git a/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md b/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md index 56eacdb536c..76310acc509 100644 --- a/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md +++ b/core/scripts/cre/environment/docs/ARCHITECTURE_REMOTEEXEC.md @@ -27,7 +27,7 @@ Keep responsibilities co-located so contributors can reason about hybrid local/r 4. Local components are started directly by `environment` + CTF components. 5. Stop commands route: - `env stop`: local only. - - `env stop-remote`: remote only via `remoteexec/client`. + - `env remote stop`: remote only via `remoteexec/client`. - `env stop-all`: remote then local. 
## Invariants diff --git a/core/scripts/cre/environment/environment/debug.go b/core/scripts/cre/environment/environment/debug.go index f6df1128ee2..843ef24104e 100644 --- a/core/scripts/cre/environment/environment/debug.go +++ b/core/scripts/cre/environment/environment/debug.go @@ -14,37 +14,18 @@ import ( remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" ) -func debugCmds() *cobra.Command { +func remoteDebugCmds() *cobra.Command { cmd := &cobra.Command{ - Use: "debug", - Short: "Debug helpers for remote execution", - Long: "Debug helpers for querying remote agent state and logs.", - PersistentPreRun: globalPreRunFunc, + Use: "debug", + Short: "Debug helpers for remote execution", + Long: "Debug helpers for querying remote agent state and logs.", } - - cmd.AddCommand(debugStatusCmd()) - cmd.AddCommand(debugLocksCmd()) - cmd.AddCommand(debugLogsCmd()) + cmd.AddCommand(remoteDebugLocksCmd()) + cmd.AddCommand(remoteDebugLogsCmd()) return cmd } -func debugStatusCmd() *cobra.Command { - return &cobra.Command{ - Use: "status", - Short: "Get remote agent status snapshot", - RunE: func(cmd *cobra.Command, _ []string) error { - return withResolvedRemoteRuntime(cmd.Context(), func(ctx context.Context, runtime *remoteclient.Runtime) error { - status, err := remoteclient.GetAgentStatus(ctx, runtime) - if err != nil { - return err - } - return printDebugJSON(status) - }) - }, - } -} - -func debugLocksCmd() *cobra.Command { +func remoteDebugLocksCmd() *cobra.Command { return &cobra.Command{ Use: "locks", Short: "Get remote agent lock/in-flight snapshot", @@ -60,7 +41,7 @@ func debugLocksCmd() *cobra.Command { } } -func debugLogsCmd() *cobra.Command { +func remoteDebugLogsCmd() *cobra.Command { var ( componentKey string limit int diff --git a/core/scripts/cre/environment/environment/environment.go b/core/scripts/cre/environment/environment/environment.go index 3c6444b0dcf..7f2d71d2007 100644 --- 
a/core/scripts/cre/environment/environment/environment.go +++ b/core/scripts/cre/environment/environment/environment.go @@ -44,7 +44,6 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" blockchains_sets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" - remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/stagegen" feature_set "github.com/smartcontractkit/chainlink/system-tests/lib/cre/features/sets" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" @@ -86,8 +85,7 @@ func init() { EnvironmentCmd.AddCommand(startCmd()) EnvironmentCmd.AddCommand(stopCmd()) EnvironmentCmd.AddCommand(stopAllCmd()) - EnvironmentCmd.AddCommand(stopRemoteCmd()) - EnvironmentCmd.AddCommand(debugCmds()) + EnvironmentCmd.AddCommand(remoteCmds()) EnvironmentCmd.AddCommand(relaySupervisorCmd()) EnvironmentCmd.AddCommand(workflowCmds()) EnvironmentCmd.AddCommand(beholderCmds()) @@ -264,6 +262,8 @@ func startCmd() *cobra.Command { framework.L.Warn().Err(err).Msg("failed to stop tracked relay supervisor before start") } + // Clear only local state before startup. Remote stop state is intentionally kept + // until startup completes so failed starts do not drop metadata needed by `env remote stop`. cleanUpErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) if cleanUpErr != nil { return errors.Wrap(cleanUpErr, "failed to clean up environment state files") @@ -500,7 +500,7 @@ func startCmd() *cobra.Command { fmt.Print(libformat.PurpleText("\nEnvironment setup completed successfully in %.2f seconds\n\n", time.Since(provisioningStartTime).Seconds())) fmt.Print("To terminate execute:`go run . 
env stop`\n\n") if remoteSummary := summarizeRemoteComponents(in); remoteSummary.Total > 0 { - fmt.Printf("Remote components started (%d). Use `go run . env stop-remote` to stop them.\n\n", remoteSummary.Total) + fmt.Printf("Remote components started (%d). Use `go run . env remote stop` to stop them.\n\n", remoteSummary.Total) } addresses, aErr := output.CreEnvironment.CldfEnvironment.DataStore.Addresses().Fetch() @@ -643,269 +643,6 @@ func trackStartup(success, hasBuiltDockerImage bool, infraType string, errorMess return nil } -func stopCmd() *cobra.Command { - cmd := &cobra.Command{ - Use: "stop", - Short: "Stops local environment", - Long: `Stops local CRE resources only (containers, tracked local tunnels, and local state file).`, - Example: "go run . env stop", - PersistentPreRun: globalPreRunFunc, - RunE: func(cmd *cobra.Command, args []string) error { - if err := stopLocalResources(relativePathToRepoRoot, false, false); err != nil { - return err - } - remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) - if remoteConfiguredSummary.Total > 0 { - framework.L.Warn(). - Int("count", remoteConfiguredSummary.Total). - Msgf("Remote components are still running. Use `env stop-remote` to stop them. Remote stop state: %s", remoteStateFileAbsPath(relativePathToRepoRoot)) - } - fmt.Println("Local environment stopped successfully") - return nil - }, - } - return cmd -} - -func stopAllCmd() *cobra.Command { - cmd := &cobra.Command{ - Use: "stop-all", - Short: "Stops local and remote resources", - Long: `Stops remote CRE components (when configured), then stops local CRE resources and extra local services (beholder, billing, observability), and removes local state directory.`, - Example: "go run . 
env stop-all", - PersistentPreRun: globalPreRunFunc, - RunE: func(cmd *cobra.Command, args []string) error { - remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) - if remoteConfiguredSummary.Total > 0 { - if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { - return err - } - } - if err := stopLocalResources(relativePathToRepoRoot, true, false); err != nil { - return err - } - fmt.Println("All resources stopped successfully") - return nil - }, - } - return cmd -} - -func stopRemoteCmd() *cobra.Command { - var dryRunFlag bool - cmd := &cobra.Command{ - Use: "stop-remote", - Short: "Stops remote components only", - Long: `Stops remote CRE components through the agent without performing any local cleanup.`, - Example: strings.TrimSpace(` -go run . env stop-remote -go run . env stop-remote --dry-run -`), - PersistentPreRun: globalPreRunFunc, - RunE: func(cmd *cobra.Command, args []string) error { - remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) - if dryRunFlag { - framework.L.Info(). - Int("total", remoteConfiguredSummary.Total). - Int("blockchains", remoteConfiguredSummary.Blockchains). - Int("nodesets", remoteConfiguredSummary.NodeSets). - Int("jd", remoteConfiguredSummary.JD). 
- Msg("Dry-run: remote components that would be stopped") - return nil - } - if remoteConfiguredSummary.Total == 0 { - framework.L.Info().Msg("No remote components recorded; nothing to stop.") - return nil - } - - if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { - return err - } - fmt.Println("Remote components stopped successfully") - return nil - }, - } - cmd.Flags().BoolVar(&dryRunFlag, "dry-run", false, "Preview what remote components would be stopped") - return cmd -} - -func loadRemoteStopTargets(relativePathToRepoRoot string) (remoteComponentSummary, *envconfig.Config) { - var ( - targets *envconfig.Config - summary remoteComponentSummary - ) - if envconfig.LocalCREStateFileExists(relativePathToRepoRoot) { - cached := &envconfig.Config{} - statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) - if loadErr := cached.Load(statePath); loadErr != nil { - framework.L.Warn().Err(loadErr).Msgf("failed to load local CRE state from %s", statePath) - } else { - targets = cached - summary = summarizeRemoteComponents(targets) - } - } - - if summary.Total == 0 && remoteStateFileExists(relativePathToRepoRoot) { - remoteCfg, loadErr := loadRemoteStopConfig(relativePathToRepoRoot) - if loadErr != nil { - framework.L.Warn().Err(loadErr).Msgf("failed to load remote component stop state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) - } else { - targets = remoteCfg - summary = summarizeRemoteComponents(targets) - } - } - return summary, targets -} - -func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targets *envconfig.Config) error { - agentState, agentLoadErr := loadRemoteAgentState(relativePathToRepoRoot) - if agentLoadErr != nil { - framework.L.Warn().Err(agentLoadErr).Msgf("failed to load remote agent state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) - } else if agentState != nil { - applyRemoteAgentEnvFallback(framework.L, agentState) - } - - summary, 
stopRemoteErr := remoteclient.StopRemoteComponents(ctx, framework.L, targets) - framework.L.Info(). - Int("requested", summary.Requested). - Int("stopped", summary.Stopped). - Int("missing", summary.Missing). - Int("failed", summary.Failed). - Msg("Remote component stop summary") - if summary.ResidualQueryError != "" { - framework.L.Warn().Msgf("failed to query remote residual CTF resources: %s", summary.ResidualQueryError) - } else { - framework.L.Info(). - Int("containers", len(summary.ResidualContainers)). - Int("volumes", len(summary.ResidualVolumes)). - Msg("Remote residual CTF resources after stop") - if len(summary.ResidualContainers) > 0 { - framework.L.Warn().Msgf("residual remote CTF containers: %s", strings.Join(summary.ResidualContainers, ", ")) - } - if len(summary.ResidualVolumes) > 0 { - framework.L.Warn().Msgf("residual remote CTF volumes: %s", strings.Join(summary.ResidualVolumes, ", ")) - } - } - if stopRemoteErr != nil { - return errors.Wrap(stopRemoteErr, "failed to stop one or more remote components") - } - if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to stop relay supervisor after remote stop") - } else { - framework.L.Info().Msg("stopped local relay supervisor after remote stop") - } - if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") - } else { - framework.L.Info().Msgf("removed remote state directory: %s", filepath.Join(relativePathToRepoRoot, remoteStateDirname)) - } - if !hasLocalComponents(targets) { - statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) - if err := os.Remove(statePath); err == nil { - framework.L.Info().Msgf("removed local CRE state file after remote-only stop: %s", statePath) - } else if !os.IsNotExist(err) { - framework.L.Warn().Err(err).Msgf("failed to remove local CRE state file after remote-only stop: %s", statePath) - } - } - 
return nil -} - -func stopLocalResources(relativePathToRepoRoot string, removeAllState bool, stopRelay bool) error { - if stopRelay { - if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to stop relay supervisor") - } - } - - removeErr := framework.RemoveTestContainers() - if removeErr != nil { - return errors.Wrap(removeErr, "failed to remove environment containers. Please remove them manually") - } - - if removeAllState { - stopBeholderErr := stopBeholder() - if stopBeholderErr != nil { - framework.L.Warn().Msgf("failed to stop Beholder: %s", stopBeholderErr) - } - - stopBillingErr := stopBilling() - if stopBillingErr != nil { - framework.L.Warn().Msgf("failed to stop Billing: %s", stopBillingErr) - } - - stopObsStack := framework.ObservabilityDown() - if stopObsStack != nil { - framework.L.Warn().Msgf("failed to stop observability stack: %s", stopObsStack) - } - - removeCacheErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) - if removeCacheErr != nil { - framework.L.Warn().Msgf("failed to remove local CRE state files: %s", removeCacheErr) - } - return nil - } - - creStateFile := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) - cErr := os.Remove(creStateFile) - if cErr != nil && !os.IsNotExist(cErr) { - framework.L.Warn().Msgf("failed to remove local CRE state file: %s", cErr) - } else if cErr != nil && os.IsNotExist(cErr) { - framework.L.Info().Msgf("local CRE state file already absent: %s", creStateFile) - } else { - framework.L.Info().Msgf("removed local CRE state file: %s", creStateFile) - } - return nil -} - -type remoteComponentSummary struct { - Total int - Blockchains int - NodeSets int - JD int -} - -func summarizeRemoteComponents(cfg *envconfig.Config) remoteComponentSummary { - summary := remoteComponentSummary{} - if cfg == nil { - return summary - } - for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && 
configuredBlockchain.Placement == envconfig.PlacementRemote { - summary.Blockchains++ - } - } - for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { - summary.NodeSets++ - } - } - if cfg.JD != nil && cfg.JD.Placement == envconfig.PlacementRemote { - summary.JD = 1 - } - summary.Total = summary.Blockchains + summary.NodeSets + summary.JD - return summary -} - -func hasLocalComponents(cfg *envconfig.Config) bool { - if cfg == nil { - return false - } - for _, configuredBlockchain := range cfg.Blockchains { - if configuredBlockchain != nil && configuredBlockchain.Placement != envconfig.PlacementRemote { - return true - } - } - for _, nodeSet := range cfg.NodeSets { - if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) != string(envconfig.PlacementRemote) { - return true - } - } - if cfg.JD != nil && cfg.JD.Placement != envconfig.PlacementRemote { - return true - } - return false -} - func applyRemoteAgentEnvFallback(logger zerolog.Logger, agentState *remoteAgentState) { if agentState == nil { return @@ -940,7 +677,6 @@ func StartCLIEnvironment( gatewayWhitelistConfig gateway.WhitelistConfig, ) (*creenv.SetupOutput, error) { testLogger := framework.L - relaySupervisorStarted := false // unset DockerFilePath and DockerContext as we cannot use them with existing images if withPluginsDockerImageFlag != "" { @@ -982,16 +718,10 @@ func StartCLIEnvironment( GatewayWhitelistConfig: gatewayWhitelistConfig, BlockchainDeployers: blockchains_sets.NewDeployerSet(testLogger, in.Infra), PreDONsStartHook: func(context.Context) error { - if relaySupervisorStarted { - return nil - } - started, err := maybeStartRelaySupervisor(relativePathToRepoRoot, in) + _, err := maybeStartRelaySupervisor(relativePathToRepoRoot, in) if err != nil { return errors.Wrap(err, "failed to start persistent relay supervisor") } - if started { - relaySupervisorStarted = true - } return nil }, } @@ -1000,10 +730,8 @@ 
func StartCLIEnvironment( defer cancel() universalSetupOutput, setupErr := creenv.SetupTestEnvironment(ctx, testLogger, singleFileLogger, universalSetupInput, relativePathToRepoRoot) if setupErr != nil { - if relaySupervisorStarted { - if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { - framework.L.Warn().Err(err).Msg("failed to stop relay supervisor during startup rollback") - } + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor during startup rollback") } return nil, fmt.Errorf("failed to setup test environment: %w", setupErr) } diff --git a/core/scripts/cre/environment/environment/relay_supervisor.go b/core/scripts/cre/environment/environment/relay_supervisor.go index 41548fe89dc..4229e986d43 100644 --- a/core/scripts/cre/environment/environment/relay_supervisor.go +++ b/core/scripts/cre/environment/environment/relay_supervisor.go @@ -89,6 +89,9 @@ type localBridgeStats struct { LocalDialFails uint64 } +// relaySupervisorCmd runs the detached local process that keeps mixed-mode relays alive. +// It opens relays on the remote agent and maintains workers that bridge remote WebSockets +// to local localhost targets. func relaySupervisorCmd() *cobra.Command { var portsRaw string var relaySpecsRaw string @@ -152,6 +155,8 @@ func relaySupervisorCmd() *cobra.Command { return cmd } +// maybeStartRelaySupervisor starts (or stops stale) supervisor state based on current config. +// It returns whether a supervisor should be considered active for this run. func maybeStartRelaySupervisor(relativePathToRepoRoot string, cfg *envconfig.Config) (bool, error) { specs := relaySpecsFromConfig(cfg) if len(specs) == 0 { @@ -164,6 +169,8 @@ func maybeStartRelaySupervisor(relativePathToRepoRoot string, cfg *envconfig.Con return true, startRelaySupervisor(relativePathToRepoRoot, specs) } +// relaySpecsFromConfig derives local ports that must be reachable from remote components. 
+// Each resulting spec maps to one remote listener and a local bridge target. func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { if cfg == nil { return nil @@ -257,6 +264,8 @@ func relaySpecsFromConfig(cfg *envconfig.Config) []relaySpec { return specs } +// inferLocalBlockchainPortsFromInput derives expected local blockchain ports when runtime +// output is not yet available. func inferLocalBlockchainPortsFromInput(in blockchain.Input) []int { portSet := map[int]struct{}{} add := func(raw string) { @@ -288,6 +297,7 @@ func inferLocalBlockchainPortsFromInput(in blockchain.Input) []int { return out } +// inferLocalJDPortsFromInput derives JD gRPC/WSRPC ports when runtime output is not available. func inferLocalJDPortsFromInput(in jd.Input) []int { const ( defaultJDGRPC = "14231" @@ -323,6 +333,8 @@ func hasBootstrapRole(roles []string) bool { return false } +// inferLocalNodeSetOCR2Ports derives OCR2 P2P ports for local node sets that remote peers +// must reach in mixed mode. func inferLocalNodeSetOCR2Ports(nodeSet *cre.NodeSet) []int { if nodeSet == nil { return nil @@ -353,6 +365,7 @@ func inferLocalNodeSetOCR2Ports(nodeSet *cre.NodeSet) []int { return out } +// endpointPort extracts a valid TCP port from either URL or host:port endpoint strings. func endpointPort(raw string) (int, bool) { trimmed := strings.TrimSpace(raw) if trimmed == "" { @@ -380,6 +393,8 @@ func endpointPort(raw string) (int, bool) { return port, true } +// startRelaySupervisor starts the detached supervisor subprocess and stores PID/state. +// Existing supervisor state is best-effort stopped first. func startRelaySupervisor(relativePathToRepoRoot string, specs []relaySpec) error { if len(specs) == 0 { return nil @@ -436,6 +451,8 @@ func startRelaySupervisor(relativePathToRepoRoot string, specs []relaySpec) erro return storeRelaySupervisorState(relativePathToRepoRoot, &state) } +// stopRelaySupervisor terminates the tracked supervisor process (if present) and clears state. 
+// It is intentionally idempotent for absent/already-dead processes. func stopRelaySupervisor(relativePathToRepoRoot string) error { state, err := loadRelaySupervisorState(relativePathToRepoRoot) if err != nil { @@ -638,6 +655,7 @@ func parsePortsCSV(raw string) ([]int, error) { return uniqueSortedPorts(out), nil } +// parseRelaySpecsCSV parses "name:port" entries from CLI input and validates port range. func parseRelaySpecsCSV(raw string) ([]relaySpec, error) { raw = strings.TrimSpace(raw) if raw == "" { @@ -717,6 +735,8 @@ func uniqueSortedPorts(in []int) []int { return out } +// newLocalComponentRelayManager builds the local control-plane manager used by the +// supervisor process to open, track, and close remote relays. func newLocalComponentRelayManager(lggr zerolog.Logger) (*localComponentRelayManager, error) { baseURL, err := resolveAgentBaseURLForRelay() if err != nil { @@ -729,6 +749,8 @@ func newLocalComponentRelayManager(lggr zerolog.Logger) (*localComponentRelayMan }, nil } +// EnsurePort makes sure one persistent relay exists for a local port and starts the worker +// pool that bridges traffic for that relay. func (m *localComponentRelayManager) EnsurePort(ctx context.Context, relayName string, localPort int) error { if m == nil || localPort <= 0 { return nil @@ -767,6 +789,7 @@ func (m *localComponentRelayManager) EnsurePort(ctx context.Context, relayName s return nil } +// Close stops all workers and closes every relay tracked by this manager. func (m *localComponentRelayManager) Close(ctx context.Context) error { if m == nil { return nil @@ -807,6 +830,8 @@ func (h *relayHandle) setRelayID(relayID string) { h.mu.Unlock() } +// resolveAgentBaseURLForRelay resolves the remote agent base URL from explicit URL or +// host/port discovery envs used by direct remote mode. 
func resolveAgentBaseURLForRelay() (string, error) { if v := strings.TrimSpace(os.Getenv("CRE_REMOTE_AGENT_URL")); v != "" { return v, nil @@ -834,6 +859,7 @@ func resolveRemoteAgentPortForRelay() (int, error) { return port, nil } +// openRelay requests a new relay listener from the remote agent and returns relay ID. func openRelay(ctx context.Context, baseURL, name string, requestedPort int) (string, error) { body, _ := json.Marshal(map[string]any{"name": name, "requestedPort": requestedPort}) req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/open", bytes.NewReader(body)) @@ -862,6 +888,7 @@ func openRelay(ctx context.Context, baseURL, name string, requestedPort int) (st return out.RelayID, nil } +// closeRelay requests relay teardown for a previously opened relay ID. func closeRelay(ctx context.Context, baseURL, relayID string) error { body, _ := json.Marshal(map[string]any{"relayId": relayID}) req, err := http.NewRequestWithContext(ctx, http.MethodPost, strings.TrimRight(baseURL, "/")+"/v1/relay/close", bytes.NewReader(body)) @@ -882,6 +909,9 @@ func closeRelay(ctx context.Context, baseURL, relayID string) error { return nil } +// relayWorker continuously connects to the remote relay WebSocket and bridges traffic +// to the local target address. It retries with backoff and refreshes relay ID on +// handshake invalidation paths. func relayWorker(ctx context.Context, lggr zerolog.Logger, baseURL string, handle *relayHandle, localAddr string, workerIndex int) { backoff := 250 * time.Millisecond for { @@ -1007,6 +1037,8 @@ func relayConnectWSURL(baseURL, relayID string) (string, error) { return u.String(), nil } +// bridgeRelayStream performs full-duplex bridging between one relay WebSocket stream +// and one local TCP connection. 
func bridgeRelayStream( ctx context.Context, lggr zerolog.Logger, @@ -1143,6 +1175,7 @@ func bridgeRelayStream( } } +// relayKeepAlive sends periodic WebSocket ping frames to keep idle relay streams active. func relayKeepAlive(ctx context.Context, ws *websocket.Conn, writeMu *sync.Mutex, errCh chan<- error) { ticker := time.NewTicker(20 * time.Second) defer ticker.Stop() diff --git a/core/scripts/cre/environment/environment/remote.go b/core/scripts/cre/environment/environment/remote.go new file mode 100644 index 00000000000..1caecba9ff6 --- /dev/null +++ b/core/scripts/cre/environment/environment/remote.go @@ -0,0 +1,39 @@ +package environment + +import ( + "context" + + "github.com/spf13/cobra" + + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" +) + +func remoteCmds() *cobra.Command { + cmd := &cobra.Command{ + Use: "remote", + Short: "Remote execution helpers", + Long: "Helpers for controlling and inspecting the remote execution agent.", + PersistentPreRun: globalPreRunFunc, + } + + cmd.AddCommand(stopRemoteCmd()) + cmd.AddCommand(remoteStatusCmd()) + cmd.AddCommand(remoteDebugCmds()) + return cmd +} + +func remoteStatusCmd() *cobra.Command { + return &cobra.Command{ + Use: "status", + Short: "Get remote agent status snapshot", + RunE: func(cmd *cobra.Command, _ []string) error { + return withResolvedRemoteRuntime(cmd.Context(), func(ctx context.Context, runtime *remoteclient.Runtime) error { + status, err := remoteclient.GetAgentStatus(ctx, runtime) + if err != nil { + return err + } + return printDebugJSON(status) + }) + }, + } +} diff --git a/core/scripts/cre/environment/environment/stop.go b/core/scripts/cre/environment/environment/stop.go new file mode 100644 index 00000000000..8dec2537035 --- /dev/null +++ b/core/scripts/cre/environment/environment/stop.go @@ -0,0 +1,279 @@ +package environment + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/pkg/errors" + 
"github.com/spf13/cobra" + + "github.com/smartcontractkit/chainlink-testing-framework/framework" + envconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" + remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" +) + +func stopCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "stop", + Short: "Stops local environment", + Long: `Stops local CRE resources only (containers, tracked local tunnels, and local state file).`, + Example: "go run . env stop", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, args []string) error { + if err := stopLocalResources(relativePathToRepoRoot, false, false); err != nil { + return err + } + remoteConfiguredSummary, _ := loadRemoteStopTargets(relativePathToRepoRoot) + if remoteConfiguredSummary.Total > 0 { + framework.L.Warn(). + Int("count", remoteConfiguredSummary.Total). + Msgf("Remote components are still running. Use `env remote stop` to stop them. Remote stop state: %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } + fmt.Println("Local environment stopped successfully") + return nil + }, + } + return cmd +} + +func stopAllCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "stop-all", + Short: "Stops local and remote resources", + Long: `Stops remote CRE components (when configured), then stops local CRE resources and extra local services (beholder, billing, observability), and removes local state directory.`, + Example: "go run . 
env stop-all", + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, args []string) error { + remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) + if remoteConfiguredSummary.Total > 0 { + if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { + return err + } + } + if err := stopLocalResources(relativePathToRepoRoot, true, false); err != nil { + return err + } + fmt.Println("All resources stopped successfully") + return nil + }, + } + return cmd +} + +func stopRemoteCmd() *cobra.Command { + var dryRunFlag bool + cmd := &cobra.Command{ + Use: "stop", + Short: "Stops remote components only", + Long: `Stops remote CRE components through the agent without performing any local cleanup.`, + Example: strings.TrimSpace(` +go run . env remote stop +go run . env remote stop --dry-run +`), + PersistentPreRun: globalPreRunFunc, + RunE: func(cmd *cobra.Command, args []string) error { + remoteConfiguredSummary, targets := loadRemoteStopTargets(relativePathToRepoRoot) + if dryRunFlag { + framework.L.Info(). + Int("total", remoteConfiguredSummary.Total). + Int("blockchains", remoteConfiguredSummary.Blockchains). + Int("nodesets", remoteConfiguredSummary.NodeSets). + Int("jd", remoteConfiguredSummary.JD). 
+ Msg("Dry-run: remote components that would be stopped") + return nil + } + if remoteConfiguredSummary.Total == 0 { + framework.L.Info().Msg("No remote components recorded; nothing to stop.") + return nil + } + + if err := stopRemoteTargets(cmd.Context(), relativePathToRepoRoot, targets); err != nil { + return err + } + fmt.Println("Remote components stopped successfully") + return nil + }, + } + cmd.Flags().BoolVar(&dryRunFlag, "dry-run", false, "Preview what remote components would be stopped") + return cmd +} + +func loadRemoteStopTargets(relativePathToRepoRoot string) (remoteComponentSummary, *envconfig.Config) { + var ( + targets *envconfig.Config + summary remoteComponentSummary + ) + if envconfig.LocalCREStateFileExists(relativePathToRepoRoot) { + cached := &envconfig.Config{} + statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + if loadErr := cached.Load(statePath); loadErr != nil { + framework.L.Warn().Err(loadErr).Msgf("failed to load local CRE state from %s", statePath) + } else { + targets = cached + summary = summarizeRemoteComponents(targets) + } + } + + if summary.Total == 0 && remoteStateFileExists(relativePathToRepoRoot) { + remoteCfg, loadErr := loadRemoteStopConfig(relativePathToRepoRoot) + if loadErr != nil { + framework.L.Warn().Err(loadErr).Msgf("failed to load remote component stop state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } else { + targets = remoteCfg + summary = summarizeRemoteComponents(targets) + } + } + return summary, targets +} + +func stopRemoteTargets(ctx context.Context, relativePathToRepoRoot string, targets *envconfig.Config) error { + agentState, agentLoadErr := loadRemoteAgentState(relativePathToRepoRoot) + if agentLoadErr != nil { + framework.L.Warn().Err(agentLoadErr).Msgf("failed to load remote agent state from %s", remoteStateFileAbsPath(relativePathToRepoRoot)) + } else if agentState != nil { + applyRemoteAgentEnvFallback(framework.L, agentState) + } + + summary, 
stopRemoteErr := remoteclient.StopRemoteComponents(ctx, framework.L, targets) + framework.L.Info(). + Int("requested", summary.Requested). + Int("stopped", summary.Stopped). + Int("missing", summary.Missing). + Int("failed", summary.Failed). + Msg("Remote component stop summary") + if summary.ResidualQueryError != "" { + framework.L.Warn().Msgf("failed to query remote residual CTF resources: %s", summary.ResidualQueryError) + } else { + framework.L.Info(). + Int("containers", len(summary.ResidualContainers)). + Int("volumes", len(summary.ResidualVolumes)). + Msg("Remote residual CTF resources after stop") + if len(summary.ResidualContainers) > 0 { + framework.L.Warn().Msgf("residual remote CTF containers: %s", strings.Join(summary.ResidualContainers, ", ")) + } + if len(summary.ResidualVolumes) > 0 { + framework.L.Warn().Msgf("residual remote CTF volumes: %s", strings.Join(summary.ResidualVolumes, ", ")) + } + } + if stopRemoteErr != nil { + return errors.Wrap(stopRemoteErr, "failed to stop one or more remote components") + } + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor after remote stop") + } else { + framework.L.Info().Msg("stopped local relay supervisor after remote stop") + } + if err := removeRemoteStopConfig(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to remove remote component stop state") + } else { + framework.L.Info().Msgf("removed remote state directory: %s", filepath.Join(relativePathToRepoRoot, remoteStateDirname)) + } + if !hasLocalComponents(targets) { + statePath := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + if err := os.Remove(statePath); err == nil { + framework.L.Info().Msgf("removed local CRE state file after remote-only stop: %s", statePath) + } else if !os.IsNotExist(err) { + framework.L.Warn().Err(err).Msgf("failed to remove local CRE state file after remote-only stop: %s", statePath) + } + } + 
return nil +} + +func stopLocalResources(relativePathToRepoRoot string, removeAllState bool, stopRelay bool) error { + if stopRelay { + if err := stopRelaySupervisor(relativePathToRepoRoot); err != nil { + framework.L.Warn().Err(err).Msg("failed to stop relay supervisor") + } + } + + removeErr := framework.RemoveTestContainers() + if removeErr != nil { + return errors.Wrap(removeErr, "failed to remove environment containers. Please remove them manually") + } + + if removeAllState { + stopBeholderErr := stopBeholder() + if stopBeholderErr != nil { + framework.L.Warn().Msgf("failed to stop Beholder: %s", stopBeholderErr) + } + + stopBillingErr := stopBilling() + if stopBillingErr != nil { + framework.L.Warn().Msgf("failed to stop Billing: %s", stopBillingErr) + } + + stopObsStack := framework.ObservabilityDown() + if stopObsStack != nil { + framework.L.Warn().Msgf("failed to stop observability stack: %s", stopObsStack) + } + + removeCacheErr := envconfig.RemoveAllEnvironmentStateDir(relativePathToRepoRoot) + if removeCacheErr != nil { + framework.L.Warn().Msgf("failed to remove local CRE state files: %s", removeCacheErr) + } + return nil + } + + creStateFile := envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot) + cErr := os.Remove(creStateFile) + if cErr != nil && !os.IsNotExist(cErr) { + framework.L.Warn().Msgf("failed to remove local CRE state file: %s", cErr) + } else if cErr != nil && os.IsNotExist(cErr) { + framework.L.Info().Msgf("local CRE state file already absent: %s", creStateFile) + } else { + framework.L.Info().Msgf("removed local CRE state file: %s", creStateFile) + } + return nil +} + +type remoteComponentSummary struct { + Total int + Blockchains int + NodeSets int + JD int +} + +func summarizeRemoteComponents(cfg *envconfig.Config) remoteComponentSummary { + summary := remoteComponentSummary{} + if cfg == nil { + return summary + } + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && 
configuredBlockchain.Placement == envconfig.PlacementRemote { + summary.Blockchains++ + } + } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) == string(envconfig.PlacementRemote) { + summary.NodeSets++ + } + } + if cfg.JD != nil && cfg.JD.Placement == envconfig.PlacementRemote { + summary.JD = 1 + } + summary.Total = summary.Blockchains + summary.NodeSets + summary.JD + return summary +} + +func hasLocalComponents(cfg *envconfig.Config) bool { + if cfg == nil { + return false + } + for _, configuredBlockchain := range cfg.Blockchains { + if configuredBlockchain != nil && configuredBlockchain.Placement != envconfig.PlacementRemote { + return true + } + } + for _, nodeSet := range cfg.NodeSets { + if nodeSet != nil && strings.TrimSpace(nodeSet.Placement) != string(envconfig.PlacementRemote) { + return true + } + } + if cfg.JD != nil && cfg.JD.Placement != envconfig.PlacementRemote { + return true + } + return false +} diff --git a/core/scripts/cre/environment/environment/swap.go b/core/scripts/cre/environment/environment/swap.go index 6cf9b9c012d..d83623fa94d 100644 --- a/core/scripts/cre/environment/environment/swap.go +++ b/core/scripts/cre/environment/environment/swap.go @@ -253,14 +253,6 @@ func swapNodes(ctx context.Context, forceFlag bool, waitTime time.Duration) erro return fmt.Errorf("failed to set TESTCONTAINERS_RYUK_DISABLED environment variable: %w", setErr) } - effectiveBlockchains, effectiveErr := config.EffectiveBlockchains() - if effectiveErr != nil { - return errors.Wrap(effectiveErr, "failed to resolve blockchain inputs") - } - if len(effectiveBlockchains) == 0 || effectiveBlockchains[0] == nil || effectiveBlockchains[0].Out == nil { - return errors.New("at least one blockchain output is required to restart node sets") - } - nerrg := errgroup.Group{} for _, nodeSet := range config.NodeSets { nerrg.Go(func() error { @@ -298,7 +290,7 @@ func swapNodes(ctx context.Context, forceFlag bool, waitTime 
time.Duration) erro nodeSet.Out = nil var nodesetErr error nodeSet.Input.NodeSpecs = nodeSet.ExtractCTFInputs() - nodeSet.Out, nodesetErr = ns.NewSharedDBNodeSet(nodeSet.Input, effectiveBlockchains[0].Out) + nodeSet.Out, nodesetErr = ns.NewSharedDBNodeSet(nodeSet.Input, nil) if nodesetErr != nil { framework.L.Error().Msgf("Failed to create node set named %s: %s", nodeSet.Name, nodesetErr) framework.L.Info().Msgf("Waiting %s for the containers to be removed", waitTime.String()) diff --git a/core/scripts/cre/environment/environment/workflow.go b/core/scripts/cre/environment/environment/workflow.go index fe1044b5c10..1c2a140e0bc 100644 --- a/core/scripts/cre/environment/environment/workflow.go +++ b/core/scripts/cre/environment/environment/workflow.go @@ -116,7 +116,6 @@ func deployWorkflowCmd() *cobra.Command { compileWorkflowFlag bool containerTargetDirFlag string containerNamePatternFlag string - nodeSetNameFlag string workflowNameFlag string workflowOwnerAddressFlag string workflowRegistryAddressFlag string @@ -192,7 +191,7 @@ func deployWorkflowCmd() *cobra.Command { capabilitiesRegistryVersion = addrRef.Version } - regErr = deployWorkflow(cmd.Context(), workflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, nodeSetNameFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, deleteWorkflowFileFlag) + regErr = deployWorkflow(cmd.Context(), workflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, deleteWorkflowFileFlag) return regErr }, @@ -204,7 +203,6 @@ func deployWorkflowCmd() *cobra.Command { 
cmd.Flags().StringVarP(&secretsOutputFilePathFlag, "secrets-output-file-path", "o", "", "Path to encrypted secrets output file (default \"./encrypted.secrets.json\")") cmd.Flags().StringVarP(&containerTargetDirFlag, "container-target-dir", "t", creworkflow.DefaultWorkflowTargetDir, "Path to the target directory in the Docker container") cmd.Flags().StringVarP(&containerNamePatternFlag, "container-name-pattern", "p", creworkflow.DefaultWorkflowNodePattern, "Pattern to match Docker containers workkflow DON containers (e.g. 'workflow-node')") - cmd.Flags().StringVar(&nodeSetNameFlag, "nodeset-name", "", "NodeSet name for remote artifact deployment (optional; auto-detected if omitted)") cmd.Flags().StringVarP(&rpcURLFlag, "rpc-url", "r", "http://localhost:8545", "RPC URL") cmd.Flags().StringVarP(&workflowOwnerAddressFlag, "workflow-owner-address", "d", DefaultWorkflowOwnerAddress, "Workflow owner address") cmd.Flags().StringVarP(&workflowRegistryAddressFlag, "workflow-registry-address", "a", "", "Workflow registry address (if not provided, address from the state file will be used)") @@ -388,27 +386,44 @@ func compileWorkflow(ctx context.Context, workflowFilePathFlag, workflowNameFlag func deployWorkflow( ctx context.Context, - wasmWorkflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, nodeSetNameFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag string, + wasmWorkflowFilePathFlag, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag string, workflowRegistryVersion, capabilitiesRegistryVersion *semver.Version, donIDFlag uint32, deleteWorkflowFile bool, ) error { - mode, resolvedNodeSetName, modeErr := 
resolveWorkflowArtifactDeployModeFromState(containerNamePatternFlag, nodeSetNameFlag) + mode, resolvedNodeSetNames, modeErr := resolveWorkflowArtifactDeployModeFromState(containerNamePatternFlag) if modeErr != nil { return modeErr } deployArtifacts := func(files ...string) error { + if mode == creworkflow.ArtifactDeployModeRemote { + for _, nodeSetName := range resolvedNodeSetNames { + if err := creworkflow.DeployArtifacts( + ctx, + creworkflow.DeployArtifactsOptions{ + Mode: mode, + NodeSetName: nodeSetName, + ContainerNamePattern: containerNamePatternFlag, + ContainerTargetDir: containerTargetDirFlag, + Files: files, + RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { + return remoteclient.DeployArtifactsToRemoteNodeSet(ctx, framework.L, nodeSetName, containerTargetDir, files) + }, + }, + ); err != nil { + return err + } + } + return nil + } + return creworkflow.DeployArtifacts( ctx, creworkflow.DeployArtifactsOptions{ Mode: mode, - NodeSetName: resolvedNodeSetName, ContainerNamePattern: containerNamePatternFlag, ContainerTargetDir: containerTargetDirFlag, Files: files, - RemoteDeployer: func(ctx context.Context, nodeSetName, containerTargetDir string, files []string) error { - return remoteclient.DeployArtifactsToRemoteNodeSet(ctx, framework.L, nodeSetName, containerTargetDir, files) - }, }, ) } @@ -522,7 +537,7 @@ func compileCopyAndRegisterWorkflow(ctx context.Context, workflowFilePathFlag, w return errors.Wrap(compileErr, "❌ failed to compile workflow") } - return deployWorkflow(ctx, compressedWorkflowWasmPath, workflowNameFlag, workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, "", containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, true) + return deployWorkflow(ctx, compressedWorkflowWasmPath, workflowNameFlag, 
workflowOwnerAddressFlag, workflowRegistryAddress, capabilitiesRegistryAddress, containerNamePatternFlag, containerTargetDirFlag, configFilePathFlag, secretsFilePathFlag, secretsOutputFilePathFlag, rpcURLFlag, workflowRegistryVersion, capabilitiesRegistryVersion, donIDFlag, true) } func isBase64File(filename string) error { @@ -567,26 +582,10 @@ func isBase64Content(content string) bool { return err == nil } -func resolveWorkflowArtifactDeployModeFromState(containerNamePattern, nodeSetName string) (creworkflow.ArtifactDeployMode, string, error) { +func resolveWorkflowArtifactDeployModeFromState(containerNamePattern string) (creworkflow.ArtifactDeployMode, []string, error) { cfg := &envconfig.Config{} if err := cfg.Load(envconfig.MustLocalCREStateFileAbsPath(relativePathToRepoRoot)); err != nil { - if nodeSetName != "" { - return "", "", errors.Wrap(err, "failed to load local CRE state for remote artifact deployment") - } - return creworkflow.ArtifactDeployModeLocal, "", nil - } - - if nodeSetName != "" { - for _, cfgNodeSet := range cfg.NodeSets { - if cfgNodeSet == nil || cfgNodeSet.Name != nodeSetName { - continue - } - if cfgNodeSet.Placement == string(envconfig.PlacementRemote) { - return creworkflow.ArtifactDeployModeRemote, nodeSetName, nil - } - return creworkflow.ArtifactDeployModeLocal, nodeSetName, nil - } - return "", "", fmt.Errorf("nodeset %q not found in local CRE state", nodeSetName) + return creworkflow.ArtifactDeployModeLocal, nil, nil } matches := make([]string, 0) @@ -602,11 +601,9 @@ func resolveWorkflowArtifactDeployModeFromState(containerNamePattern, nodeSetNam switch len(matches) { case 0: - return creworkflow.ArtifactDeployModeLocal, "", nil - case 1: - return creworkflow.ArtifactDeployModeRemote, matches[0], nil + return creworkflow.ArtifactDeployModeLocal, nil, nil default: - return "", "", fmt.Errorf("container pattern %q matches multiple remote nodesets %v; specify --nodeset-name", containerNamePattern, matches) + return 
creworkflow.ArtifactDeployModeRemote, matches, nil } } diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index f7f3f77c525..82ffbfd9150 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -26,7 +26,6 @@ import ( solcfg "github.com/smartcontractkit/chainlink-solana/pkg/solana/config" "github.com/smartcontractkit/chainlink-testing-framework/framework" chipingressset "github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose/chip_ingress_set" - ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink-testing-framework/lib/utils/ptr" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" @@ -46,13 +45,17 @@ import ( const TronEVMChainID = 3360022319 +type PrepareNodeTOMLsOptions struct { + RemoteHostIP string +} + func PrepareNodeTOMLs( ctx context.Context, topology *cre.Topology, creEnv *cre.Environment, nodeSets []*cre.NodeSet, configuredBlockchains []*envconfig.Blockchain, - remoteHostIP string, + options PrepareNodeTOMLsOptions, capabilities []cre.InstallableCapability, // Deprecated, use Features instead and modify node configs inside a Feature nodeConfigTransformerFns []cre.NodeConfigTransformerFn, ) ([]*cre.NodeSet, error) { @@ -65,11 +68,11 @@ func PrepareNodeTOMLs( if peeringErr != nil { return nil, errors.Wrap(peeringErr, "failed to find peering data") } - ocrBootstrapPlacement, placementErr := resolveBootstrapPlacement(topology, bt.UUID) + ocrBootstrapPlacement, placementErr := topology.BootstrapPlacement() if placementErr != nil { return nil, placementErr } - ocrBootstrapAnnouncePort, announcePortErr := resolveBootstrapAnnouncePort(topology, bt.UUID) + ocrBootstrapAnnouncePort, announcePortErr := topology.BootstrapAnnouncePort() if announcePortErr != nil { return nil, announcePortErr } @@ -125,7 +128,7 @@ func 
PrepareNodeTOMLs( DonMetadata: donMetadata, Blockchains: chainPerSelector, BlockchainPlacementBySelector: blockchainPlacementBySelector, - RemoteHostIP: remoteHostIP, + RemoteHostIP: strings.TrimSpace(options.RemoteHostIP), OCRBootstrapPlacement: ocrBootstrapPlacement, OCRBootstrapAnnouncePort: ocrBootstrapAnnouncePort, Flags: donMetadata.Flags, @@ -351,10 +354,10 @@ func addBootstrapNodeConfig( EnableExperimentalRageP2P: ptr.Ptr(true), } if donMetadata != nil && nodeMetadata != nil { - announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), nodeMetadata.Index) + announcePort := donMetadata.ResolveNodeOCR2AnnouncePort(nodeMetadata.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, - hasRemoteNodeSets(topology), + topology.HasRemoteNodeSets(), announcePort, ) if announceErr != nil { @@ -454,10 +457,10 @@ func addWorkerNodeConfig( }, EnableExperimentalRageP2P: ptr.Ptr(true), } - announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), m.Index) + announcePort := donMetadata.ResolveNodeOCR2AnnouncePort(m.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, - hasRemoteNodeSets(topology), + topology.HasRemoteNodeSets(), announcePort, ) if announceErr != nil { @@ -906,89 +909,6 @@ func appendSolanaChain(existingConfig *solcfg.TOMLConfigs, solChain *solanaChain }) } -func hasRemoteNodeSets(topology *cre.Topology) bool { - if topology == nil { - return false - } - for _, nodeSet := range topology.NodeSets() { - if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Placement), "remote") { - return true - } - } - return false -} - -func resolveNodeOCR2AnnouncePort(nodeSet *cre.NodeSet, nodeIndex int) int { - base := 0 - if nodeSet != nil { - base = nodeSet.OCR2P2PRangeStart - if base == 0 { - httpStart := nodeSet.HTTPPortRangeStart - if httpStart == 0 { - httpStart = ns.DefaultHTTPPortStaticRangeStart - } - base = 
httpStart + (ns.DefaultOCR2P2PStaticRangeStart - ns.DefaultHTTPPortStaticRangeStart) - } - } - if base == 0 { - base = ns.DefaultOCR2P2PStaticRangeStart - } - if nodeIndex < 0 { - nodeIndex = 0 - } - return base + nodeIndex -} - -func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { - if topology == nil { - return "", fmt.Errorf("topology is nil") - } - bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) - if bootstrapNodeUUID == "" { - return "", fmt.Errorf("bootstrap node UUID is empty") - } - for _, don := range topology.DonsMetadata.List() { - if don == nil { - continue - } - for _, node := range don.NodesMetadata { - if node == nil || strings.TrimSpace(node.UUID) == "" { - continue - } - if node.UUID != bootstrapNodeUUID { - continue - } - return strings.TrimSpace(don.MustNodeSet().Placement), nil - } - } - return "", fmt.Errorf("failed to resolve bootstrap placement for node UUID %s", bootstrapNodeUUID) -} - -func resolveBootstrapAnnouncePort(topology *cre.Topology, bootstrapNodeUUID string) (int, error) { - if topology == nil { - return 0, fmt.Errorf("topology is nil") - } - bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) - if bootstrapNodeUUID == "" { - return 0, fmt.Errorf("bootstrap node UUID is empty") - } - for _, don := range topology.DonsMetadata.List() { - if don == nil { - continue - } - for _, node := range don.NodesMetadata { - if node == nil || strings.TrimSpace(node.UUID) == "" { - continue - } - if node.UUID != bootstrapNodeUUID { - continue - } - return resolveNodeOCR2AnnouncePort(don.MustNodeSet(), node.Index), nil - } - } - return 0, fmt.Errorf("failed to resolve bootstrap announce port for node UUID %s", bootstrapNodeUUID) -} - func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost string, internalPort, externalPort int, remoteHostIP string) (string, error) { caller, err := connectivity.PlacementFromTarget(callerPlacement) if err != nil { diff 
--git a/system-tests/lib/cre/environment/environment.go b/system-tests/lib/cre/environment/environment.go index ac33422baa9..6d1ba8e58b7 100644 --- a/system-tests/lib/cre/environment/environment.go +++ b/system-tests/lib/cre/environment/environment.go @@ -133,7 +133,7 @@ func SetupTestEnvironment( if err := input.Validate(); err != nil { return nil, pkgerrors.Wrap(err, "input validation failed") } - execPlan, err := buildExecutionPlan(input.Blockchains, input.JdInput, input.NodeSets) + execPlan, err := buildPlacementPlan(input.Blockchains, input.JdInput, input.NodeSets) if err != nil { return nil, pkgerrors.Wrap(err, "invalid component placement") } @@ -208,7 +208,9 @@ func SetupTestEnvironment( creEnvironment, input.NodeSets, input.Blockchains, - remoteHostIP, + donconfig.PrepareNodeTOMLsOptions{ + RemoteHostIP: remoteHostIP, + }, input.Capabilities, input.ConfigFactoryFunctions, ) @@ -474,7 +476,7 @@ func appendOutputsToInput(input *SetupInput, nodeSetOutput []*cre.NodeSetOutput, func resolveRemoteRuntimeForSetup( testLogger zerolog.Logger, - execPlan *executionPlan, + execPlan *placementPlan, ) (*remoteclient.Runtime, error) { if execPlan == nil || !execPlan.HasRemoteComponents { return nil, nil diff --git a/system-tests/lib/cre/environment/environment_placement_test.go b/system-tests/lib/cre/environment/environment_placement_test.go index fd7fa901a51..d9672ec616f 100644 --- a/system-tests/lib/cre/environment/environment_placement_test.go +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -67,7 +67,7 @@ func TestHasRemoteComponents(t *testing.T) { } func TestResolveRemoteRuntimeForSetupSkipsResolutionWhenNoRemoteComponents(t *testing.T) { - execPlan, planErr := buildExecutionPlan( + execPlan, planErr := buildPlacementPlan( []*config.Blockchain{{Placement: config.PlacementLocal}}, &config.JobDistributor{Placement: config.PlacementLocal}, []*cre.NodeSet{{Placement: "local"}}, @@ -83,7 +83,7 @@ func 
TestResolveRemoteRuntimeForSetupSkipsResolutionWhenNoRemoteComponents(t *te } func TestBuildExecutionPlanIncludesPlacementAndRemoteFlags(t *testing.T) { - execPlan, err := buildExecutionPlan( + execPlan, err := buildPlacementPlan( []*config.Blockchain{{Placement: config.PlacementRemote}}, &config.JobDistributor{Placement: config.PlacementLocal}, []*cre.NodeSet{{Placement: "local"}, {Placement: "remote"}}, diff --git a/system-tests/lib/cre/environment/execution_plan.go b/system-tests/lib/cre/environment/placement_plan.go similarity index 96% rename from system-tests/lib/cre/environment/execution_plan.go rename to system-tests/lib/cre/environment/placement_plan.go index 5d242e14763..5b1ea67c963 100644 --- a/system-tests/lib/cre/environment/execution_plan.go +++ b/system-tests/lib/cre/environment/placement_plan.go @@ -8,7 +8,7 @@ import ( "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) -type executionPlan struct { +type placementPlan struct { NodeSetPlacement *nodeSetPlacementSummary HasRemoteComponents bool } @@ -18,11 +18,11 @@ type nodeSetPlacementSummary struct { HasRemoteTargets bool } -func buildExecutionPlan( +func buildPlacementPlan( configuredBlockchains []*config.Blockchain, jdInput *config.JobDistributor, nodeSets []*cre.NodeSet, -) (*executionPlan, error) { +) (*placementPlan, error) { nodeSetPlacement, err := summarizeNodeSetPlacement(nodeSets) if err != nil { return nil, err @@ -31,7 +31,7 @@ func buildExecutionPlan( return nil, err } - return &executionPlan{ + return &placementPlan{ NodeSetPlacement: nodeSetPlacement, HasRemoteComponents: hasRemoteComponents(configuredBlockchains, jdInput, nodeSets), }, nil diff --git a/system-tests/lib/cre/topology.go b/system-tests/lib/cre/topology.go index 1440d26f9d3..76ae63196d7 100644 --- a/system-tests/lib/cre/topology.go +++ b/system-tests/lib/cre/topology.go @@ -119,6 +119,56 @@ func (t *Topology) Bootstrap() (*NodeMetadata, bool) { return t.DonsMetadata.Bootstrap() } +// 
HasRemoteNodeSets reports whether any DON in the topology is placed remotely. +func (t *Topology) HasRemoteNodeSets() bool { + if t == nil || t.DonsMetadata == nil { + return false + } + for _, don := range t.DonsMetadata.List() { + if don == nil || don.ns == nil { + continue + } + if strings.EqualFold(strings.TrimSpace(don.ns.Placement), "remote") { + return true + } + } + return false +} + +// BootstrapPlacement returns placement of the configured bootstrap DON. +func (t *Topology) BootstrapPlacement() (string, error) { + if t == nil || t.DonsMetadata == nil { + return "", fmt.Errorf("topology is nil") + } + for _, don := range t.DonsMetadata.List() { + if don == nil || don.ns == nil { + continue + } + if _, ok := don.Bootstrap(); ok { + return strings.TrimSpace(don.ns.Placement), nil + } + } + return "", fmt.Errorf("failed to resolve bootstrap placement") +} + +// BootstrapAnnouncePort returns OCR2 announce port for the bootstrap node. +func (t *Topology) BootstrapAnnouncePort() (int, error) { + if t == nil || t.DonsMetadata == nil { + return 0, fmt.Errorf("topology is nil") + } + for _, don := range t.DonsMetadata.List() { + if don == nil { + continue + } + node, ok := don.Bootstrap() + if !ok || node == nil { + continue + } + return don.ResolveNodeOCR2AnnouncePort(node.Index), nil + } + return 0, fmt.Errorf("failed to resolve bootstrap announce port") +} + // AddGatewayHandlers adds the given handler names to the gateway config of the given DON. It only adds handlers, if they are not already present. // Actual configuration for each handler is generated later during deployment. 
func (t *Topology) AddGatewayHandlers(donMetadata DonMetadata, handlers []string) error { diff --git a/system-tests/lib/cre/types.go b/system-tests/lib/cre/types.go index 53c30737c93..0b942c5b1ab 100644 --- a/system-tests/lib/cre/types.go +++ b/system-tests/lib/cre/types.go @@ -725,6 +725,29 @@ func (m *DonMetadata) RequiresOCR() bool { slices.Contains(m.Flags, VaultCapability) || slices.Contains(m.Flags, EVMCapability) || slices.Contains(m.Flags, SolanaCapability) } +// ResolveNodeOCR2AnnouncePort resolves a node's OCR2 P2P announce port based on DON +// static range configuration and node index. +func (m *DonMetadata) ResolveNodeOCR2AnnouncePort(nodeIndex int) int { + base := 0 + if m != nil && m.ns != nil { + base = m.ns.OCR2P2PRangeStart + if base == 0 { + httpStart := m.ns.HTTPPortRangeStart + if httpStart == 0 { + httpStart = ns.DefaultHTTPPortStaticRangeStart + } + base = httpStart + (ns.DefaultOCR2P2PStaticRangeStart - ns.DefaultHTTPPortStaticRangeStart) + } + } + if base == 0 { + base = ns.DefaultOCR2P2PStaticRangeStart + } + if nodeIndex < 0 { + nodeIndex = 0 + } + return base + nodeIndex +} + func (m *DonMetadata) RequiresGateway() bool { return HasFlag(m.Flags, CustomComputeCapability) || HasFlag(m.Flags, WebAPITriggerCapability) || From c214a65f464b38d3c147851e90681f4ff64aa752 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 2 Mar 2026 13:10:32 +0100 Subject: [PATCH 31/34] add .cursor/ to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 957b9ef0b3c..19b0662d581 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ debug.env operator_ui/install .devenv event_dump.ndjson +.cursor/ # neovim .nvim.lua From fb03be6a446702b656db1723fe58dc96e832baa2 Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Mon, 2 Mar 2026 14:29:17 +0100 Subject: [PATCH 32/34] use different JD port that doesn't clash with chip ingress stack --- core/scripts/cre/environment/configs/workflow-gateway-don.toml | 1 + 1 
file changed, 1 insertion(+) diff --git a/core/scripts/cre/environment/configs/workflow-gateway-don.toml b/core/scripts/cre/environment/configs/workflow-gateway-don.toml index 4147aff7e41..213ee0b57b3 100644 --- a/core/scripts/cre/environment/configs/workflow-gateway-don.toml +++ b/core/scripts/cre/environment/configs/workflow-gateway-don.toml @@ -16,6 +16,7 @@ csa_encryption_key = "d1093c0060d50a3c89c189b2e485da5a3ce57f3dcb38ab7e2c0d5f0bb2314a44" # any random 32 byte hex string # change to your version image = "job-distributor:0.22.1" + wsrpc_port = "7812" [fake] port = 8171 From 56793e6a6e0e4e1b52bf85708a1dc69f04d412da Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 3 Mar 2026 10:20:12 +0100 Subject: [PATCH 33/34] fix lints --- core/scripts/go.mod | 4 +- system-tests/lib/cre/bootstrap_peer.go | 10 +++-- system-tests/lib/cre/bootstrap_peer_test.go | 3 +- system-tests/lib/cre/connectivity/chooser.go | 7 +-- system-tests/lib/cre/don.go | 2 +- system-tests/lib/cre/don/config/config.go | 10 ++--- .../lib/cre/don/config/config_test.go | 1 + .../lib/cre/environment/blockchain_start.go | 11 ++--- .../cre/environment/blockchain_start_test.go | 3 +- .../environment/blockchains/blockchains.go | 1 + .../lib/cre/environment/config/config.go | 4 +- .../lib/cre/environment/config/config_test.go | 3 +- system-tests/lib/cre/environment/dons_test.go | 3 +- .../environment/environment_placement_test.go | 10 ++--- system-tests/lib/cre/environment/jobs_test.go | 5 ++- .../lib/cre/environment/placement_plan.go | 6 +-- .../remoteexec/agent/cmd/local-agent/main.go | 9 +++- .../remoteexec/agent/deploy_test.go | 3 +- .../cre/environment/remoteexec/agent/relay.go | 18 ++++---- .../remoteexec/agent/relay_test.go | 19 +++++--- .../environment/remoteexec/agent/server.go | 44 ++++++++++--------- .../remoteexec/agent/server_chip_sink.go | 9 ++-- .../remoteexec/agent/server_component_logs.go | 7 ++- .../remoteexec/agent/server_handlers_test.go | 5 ++- .../remoteexec/agent/server_test.go | 
1 + .../remoteexec/chipsink/event_decode.go | 5 ++- .../environment/remoteexec/chipsink/server.go | 3 +- .../remoteexec/client/agent_introspection.go | 7 +-- .../client/agent_introspection_test.go | 8 ++-- .../remoteexec/client/artifacts_remote.go | 8 ++-- .../client/artifacts_remote_test.go | 22 +++++----- .../client/chip_sink_remote_test.go | 22 +++++----- .../remoteexec/client/compatibility.go | 3 +- .../remoteexec/client/compatibility_test.go | 3 +- .../client/remote_component_client.go | 16 +++++-- .../client/remote_component_client_test.go | 6 ++- .../remoteexec/client/remote_stop_test.go | 19 ++++---- system-tests/lib/cre/features/evm/v2/evm.go | 8 ++-- system-tests/lib/cre/features/vault/vault.go | 19 -------- .../lib/cre/internal/dockerops/files.go | 1 + .../lib/cre/runtimecfg/access_mode.go | 11 ++--- .../lib/cre/runtimecfg/access_mode_test.go | 4 +- system-tests/lib/cre/topology.go | 8 ++-- system-tests/lib/cre/workflow/docker.go | 3 +- system-tests/lib/go.mod | 4 +- .../tests/load/cre/writer_don_load_test.go | 2 +- .../tests/smoke/cre/v2_vault_don_test.go | 2 +- .../test-helpers/fixture_relay_helpers.go | 7 +-- system-tests/tests/test-helpers/t_helpers.go | 2 +- 49 files changed, 213 insertions(+), 178 deletions(-) diff --git a/core/scripts/go.mod b/core/scripts/go.mod index 14cbddfe475..2a64631bd58 100644 --- a/core/scripts/go.mod +++ b/core/scripts/go.mod @@ -25,6 +25,7 @@ require ( github.com/andybalholm/brotli v1.2.0 github.com/avast/retry-go/v4 v4.6.1 github.com/c-bata/go-prompt v0.2.6 + github.com/cloudevents/sdk-go/binding/format/protobuf/v2 v2.16.2 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/docker/docker v28.5.1+incompatible github.com/docker/go-connections v0.6.0 @@ -49,6 +50,7 @@ require ( github.com/smartcontractkit/chainlink-ccip v0.1.1-solana.0.20260220192608-af6bd538e0ca github.com/smartcontractkit/chainlink-common v0.10.1-0.20260227202051-0f1cea05d443 github.com/smartcontractkit/chainlink-common/keystore 
v1.0.2 + github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.11-0.20251211140724-319861e514c4 github.com/smartcontractkit/chainlink-data-streams v0.1.12-0.20260227110503-42b236799872 github.com/smartcontractkit/chainlink-deployments-framework v0.80.1-0.20260209182815-b296b7df28a6 github.com/smartcontractkit/chainlink-evm v0.3.4-0.20260227175232-0de99d1959de @@ -168,7 +170,6 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect github.com/chaos-mesh/chaos-mesh/api v0.0.0-20240821051457-da69c6d9617a // indirect - github.com/cloudevents/sdk-go/binding/format/protobuf/v2 v2.16.2 // indirect github.com/cloudevents/sdk-go/v2 v2.16.2 // indirect github.com/cloudwego/base64x v0.1.4 // indirect github.com/cloudwego/iasm v0.2.0 // indirect @@ -482,7 +483,6 @@ require ( github.com/smartcontractkit/chainlink-ccip/chains/solana/gobindings v0.0.0-20250912190424-fd2e35d7deb5 // indirect github.com/smartcontractkit/chainlink-ccip/deployment v0.0.0-20260129103204-4c8453dd8139 // indirect github.com/smartcontractkit/chainlink-ccv v0.0.0-20260225114453-965dabf4bcb0 // indirect - github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.11-0.20251211140724-319861e514c4 // indirect github.com/smartcontractkit/chainlink-evm/contracts/cre/gobindings v0.0.0-20260107191744-4b93f62cffe3 // indirect github.com/smartcontractkit/chainlink-feeds v0.1.2-0.20250227211209-7cd000095135 // indirect github.com/smartcontractkit/chainlink-framework/capabilities v0.0.0-20250818175541-3389ac08a563 // indirect diff --git a/system-tests/lib/cre/bootstrap_peer.go b/system-tests/lib/cre/bootstrap_peer.go index 48181e8e5bb..c1fe24eb0c5 100644 --- a/system-tests/lib/cre/bootstrap_peer.go +++ b/system-tests/lib/cre/bootstrap_peer.go @@ -7,6 +7,8 @@ import ( "strconv" "strings" + "github.com/pkg/errors" + "github.com/smartcontractkit/chainlink-testing-framework/framework" 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/connectivity" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" @@ -14,7 +16,7 @@ import ( func ResolveBootstrapAddress(callerTarget, bootstrapTarget, internalHost string, port int) (string, error) { if strings.TrimSpace(internalHost) == "" { - return "", fmt.Errorf("bootstrap internal host is empty") + return "", errors.New("bootstrap internal host is empty") } if port <= 0 || port > 65535 { return "", fmt.Errorf("invalid bootstrap port: %d", port) @@ -56,7 +58,7 @@ func ResolveBootstrapPeerURL(callerTarget, bootstrapTarget, peerID, internalHost } trimmedPeerID := strings.TrimSpace(strings.TrimPrefix(peerID, "p2p_")) if trimmedPeerID == "" { - return "", fmt.Errorf("bootstrap peerID is empty") + return "", errors.New("bootstrap peerID is empty") } return trimmedPeerID + "@" + address, nil } @@ -128,7 +130,7 @@ func resolveBootstrapExternalAddress(targetPlacement connectivity.Placement, por return net.JoinHostPort("127.0.0.1", strconv.Itoa(port)), nil } if !runtimecfg.IsDirectMode() { - return "", fmt.Errorf("mixed DON bootstrap resolution requires direct access mode for remote bootstrap targets") + return "", errors.New("mixed DON bootstrap resolution requires direct access mode for remote bootstrap targets") } hostIP, err := runtimecfg.DirectHostIP() if err != nil { @@ -141,7 +143,7 @@ func rewriteEndpointForRemoteCaller(raw string) (string, error) { dockerHost := strings.TrimPrefix(framework.HostDockerInternal(), "http://") trimmed := strings.TrimSpace(raw) if trimmed == "" { - return "", fmt.Errorf("endpoint is empty") + return "", errors.New("endpoint is empty") } if strings.Contains(trimmed, "://") { parsed, err := url.Parse(trimmed) diff --git a/system-tests/lib/cre/bootstrap_peer_test.go b/system-tests/lib/cre/bootstrap_peer_test.go index 6b0a6d29fab..5112c0533b5 100644 --- a/system-tests/lib/cre/bootstrap_peer_test.go +++ b/system-tests/lib/cre/bootstrap_peer_test.go @@ -5,13 
+5,14 @@ import ( "testing" "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" ) func TestResolveP2PAnnounceAddresses_LocalOnly_UsesInternalHost(t *testing.T) { addresses, err := ResolveP2PAnnounceAddresses("local", false, 15001) require.NoError(t, err, "ResolveP2PAnnounceAddresses should not fail") - require.Len(t, addresses, 0, "expected local-only setup to leave announce addresses unset") + require.Empty(t, addresses, "expected local-only setup to leave announce addresses unset") } func TestResolveP2PAnnounceAddresses_LocalMixed_AddsBridgedHost(t *testing.T) { diff --git a/system-tests/lib/cre/connectivity/chooser.go b/system-tests/lib/cre/connectivity/chooser.go index daedac0f525..c44f0ff09fa 100644 --- a/system-tests/lib/cre/connectivity/chooser.go +++ b/system-tests/lib/cre/connectivity/chooser.go @@ -2,6 +2,7 @@ package connectivity import ( "context" + "errors" "fmt" "net" "net/url" @@ -33,7 +34,7 @@ type BridgeEnsurer func(ctx context.Context, endpoint EndpointPair, port int) er func Resolve(caller, target Placement, endpoint EndpointPair) (*Resolution, error) { if caller == "" || target == "" { - return nil, fmt.Errorf("caller and target placement must be set") + return nil, errors.New("caller and target placement must be set") } selectedKind := "internal" @@ -94,7 +95,7 @@ func PlacementFromTarget(target string) (Placement, error) { func endpointPort(raw string) (int, error) { trimmed := strings.TrimSpace(raw) if trimmed == "" { - return 0, fmt.Errorf("endpoint is empty") + return 0, errors.New("endpoint is empty") } if strings.Contains(trimmed, "://") { parsed, err := url.Parse(trimmed) @@ -102,7 +103,7 @@ func endpointPort(raw string) (int, error) { return 0, fmt.Errorf("parse url: %w", err) } if parsed.Port() == "" { - return 0, fmt.Errorf("url has no explicit port") + return 0, errors.New("url has no explicit port") } port, err := strconv.Atoi(parsed.Port()) if err != nil || port <= 0 || 
port > 65535 { diff --git a/system-tests/lib/cre/don.go b/system-tests/lib/cre/don.go index 400feb86605..946268602f8 100644 --- a/system-tests/lib/cre/don.go +++ b/system-tests/lib/cre/don.go @@ -825,7 +825,7 @@ func LinkToJobDistributor(ctx context.Context, input *LinkDonsToJDInput) error { func resolveNodeFacingJDUriForDON(donMetadata *DonMetadata, jdPlacement, internalWSRPC, externalWSRPC string) (string, error) { if donMetadata == nil { - return "", fmt.Errorf("don metadata is nil") + return "", errors.New("don metadata is nil") } nodeSet := donMetadata.MustNodeSet() callerPlacement, err := connectivity.PlacementFromTarget(nodeSet.Placement) diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 6c259b282f2..172ab407bca 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -920,7 +920,7 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot // Local callers need EC2-host reachable port for remote bootstrap nodes. 
if caller == connectivity.PlacementLocal && target == connectivity.PlacementRemote { if !runtimecfg.IsDirectMode() { - return "", fmt.Errorf("mixed DON bootstrap resolution requires direct mode") + return "", errors.New("mixed DON bootstrap resolution requires direct mode") } hostIP := strings.TrimSpace(remoteHostIP) if hostIP == "" { @@ -937,7 +937,7 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topology, gateway *cre.DonGatewayConfiguration, remoteHostIP string) (string, error) { if gateway == nil || gateway.GatewayConfiguration == nil { - return "", fmt.Errorf("gateway configuration is nil") + return "", errors.New("gateway configuration is nil") } callerPlacement, err := connectivity.PlacementFromTarget(callerPlacementRaw) if err != nil { @@ -984,11 +984,11 @@ func blockchainPlacementsBySelector(configured []*envconfig.Blockchain, deployed func resolveNodePlacement(topology *cre.Topology, nodeUUID string) (connectivity.Placement, error) { if topology == nil { - return "", fmt.Errorf("topology is nil") + return "", errors.New("topology is nil") } trimmedUUID := strings.TrimSpace(nodeUUID) if trimmedUUID == "" { - return "", fmt.Errorf("node uuid is empty") + return "", errors.New("node uuid is empty") } for _, don := range topology.DonsMetadata.List() { if don == nil { @@ -1011,7 +1011,7 @@ func gatewayExternalHost(targetPlacement connectivity.Placement, remoteHostIP st switch targetPlacement { case connectivity.PlacementRemote: if !runtimecfg.IsDirectMode() { - return "", fmt.Errorf("gateway connector resolution for remote targets requires direct mode") + return "", errors.New("gateway connector resolution for remote targets requires direct mode") } if hostIP := strings.TrimSpace(remoteHostIP); hostIP != "" { return hostIP, nil diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go index 
f51e4f26a7f..e2a476a2187 100644 --- a/system-tests/lib/cre/don/config/config_test.go +++ b/system-tests/lib/cre/don/config/config_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" diff --git a/system-tests/lib/cre/environment/blockchain_start.go b/system-tests/lib/cre/environment/blockchain_start.go index a97aad182a1..43a536a0a62 100644 --- a/system-tests/lib/cre/environment/blockchain_start.go +++ b/system-tests/lib/cre/environment/blockchain_start.go @@ -72,8 +72,8 @@ func startBlockchains( remoteclient.StartDescriptor[blockchain.Output]{ ComponentType: remoteclient.ComponentTypeBlockchain, BuildPayload: func() (agent.StartComponentPayload, error) { - if err := validateRemoteBlockchainInput(input); err != nil { - return agent.StartComponentPayload{}, err + if valErr := validateRemoteBlockchainInput(input); valErr != nil { + return agent.StartComponentPayload{}, valErr } return agent.StartComponentPayload{ ComponentType: remoteclient.ComponentTypeBlockchain, @@ -81,12 +81,7 @@ func startBlockchains( ReusePolicy: string(configured.RemoteStartPolicy), }, nil }, - Rewrite: func(output *blockchain.Output, ec2HostIP string) error { - if rewriteInternalForLocalNodes { - // direct mode keeps internal URLs unchanged - } - return rewriteRemoteBlockchainOutputForDirectAccess(output, ec2HostIP) - }, + Rewrite: rewriteRemoteBlockchainOutputForDirectAccess, }, ) if err != nil { diff --git a/system-tests/lib/cre/environment/blockchain_start_test.go b/system-tests/lib/cre/environment/blockchain_start_test.go index 69e41d6e0f6..3ff75c1e7a7 100644 --- a/system-tests/lib/cre/environment/blockchain_start_test.go +++ b/system-tests/lib/cre/environment/blockchain_start_test.go @@ -4,10 +4,11 @@ import ( 
"testing" "github.com/rs/zerolog" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" remoteclient "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/client" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" - "github.com/stretchr/testify/require" ) func TestValidateRemoteBlockchainInput(t *testing.T) { diff --git a/system-tests/lib/cre/environment/blockchains/blockchains.go b/system-tests/lib/cre/environment/blockchains/blockchains.go index 069c66f5fa6..65e946567f2 100644 --- a/system-tests/lib/cre/environment/blockchains/blockchains.go +++ b/system-tests/lib/cre/environment/blockchains/blockchains.go @@ -5,6 +5,7 @@ import ( "fmt" pkgerrors "github.com/pkg/errors" + cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" diff --git a/system-tests/lib/cre/environment/config/config.go b/system-tests/lib/cre/environment/config/config.go index de92561e10a..b8eba37953c 100644 --- a/system-tests/lib/cre/environment/config/config.go +++ b/system-tests/lib/cre/environment/config/config.go @@ -90,7 +90,7 @@ const ( type Blockchain struct { blockchain.Input Placement ComponentPlacement `toml:"placement"` - RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` + RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } // JobDistributor wraps the existing CTF JD input and adds placement metadata. 
@@ -98,7 +98,7 @@ type Blockchain struct { type JobDistributor struct { jd.Input Placement ComponentPlacement `toml:"placement"` - RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` + RemoteStartPolicy RemoteStartPolicy `toml:"remote_start_policy"` } func (b *Blockchain) Normalize() { diff --git a/system-tests/lib/cre/environment/config/config_test.go b/system-tests/lib/cre/environment/config/config_test.go index e46abbbefed..189ef20d1dd 100644 --- a/system-tests/lib/cre/environment/config/config_test.go +++ b/system-tests/lib/cre/environment/config/config_test.go @@ -3,10 +3,11 @@ package config import ( "testing" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" - "github.com/stretchr/testify/require" ) func TestBlockchainNormalizeAndValidate(t *testing.T) { diff --git a/system-tests/lib/cre/environment/dons_test.go b/system-tests/lib/cre/environment/dons_test.go index 4b3daa42123..bd30d08f142 100644 --- a/system-tests/lib/cre/environment/dons_test.go +++ b/system-tests/lib/cre/environment/dons_test.go @@ -3,11 +3,12 @@ package environment import ( "testing" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" - "github.com/stretchr/testify/require" ) func TestBuildRemoteNodeSetInputRequiresImageOrBuildFields(t *testing.T) { diff --git a/system-tests/lib/cre/environment/environment_placement_test.go b/system-tests/lib/cre/environment/environment_placement_test.go index d9672ec616f..544e64fd253 100644 --- 
a/system-tests/lib/cre/environment/environment_placement_test.go +++ b/system-tests/lib/cre/environment/environment_placement_test.go @@ -6,8 +6,8 @@ import ( "github.com/rs/zerolog" "github.com/stretchr/testify/require" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" ) func TestSummarizeNodeSetPlacement_AllowsMixedPlacements(t *testing.T) { @@ -24,11 +24,11 @@ func TestSummarizeNodeSetPlacement_AllowsMixedPlacements(t *testing.T) { func TestHasRemoteComponents(t *testing.T) { tests := []struct { - name string + name string blockchains []*config.Blockchain - jd *config.JobDistributor - nodeSets []*cre.NodeSet - want bool + jd *config.JobDistributor + nodeSets []*cre.NodeSet + want bool }{ { name: "none remote", diff --git a/system-tests/lib/cre/environment/jobs_test.go b/system-tests/lib/cre/environment/jobs_test.go index 99c13aa3064..682d54c584f 100644 --- a/system-tests/lib/cre/environment/jobs_test.go +++ b/system-tests/lib/cre/environment/jobs_test.go @@ -3,8 +3,9 @@ package environment import ( "testing" - "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ) func TestRewriteJDForDirectAccess_NilOutputNoop(t *testing.T) { @@ -60,5 +61,5 @@ func TestRewriteAddressHost_UnsupportedURLWithoutPortFails(t *testing.T) { func TestRewriteAddressHost_EmptyInputNoop(t *testing.T) { rewritten, err := rewriteAddressHost(" ", "10.20.30.40") require.NoError(t, err, "expected empty input to be a no-op") - require.Equal(t, "", rewritten, "expected empty output for empty input") + require.Empty(t, rewritten, "expected empty output for empty input") } diff --git a/system-tests/lib/cre/environment/placement_plan.go 
b/system-tests/lib/cre/environment/placement_plan.go index 5b1ea67c963..59193d45cea 100644 --- a/system-tests/lib/cre/environment/placement_plan.go +++ b/system-tests/lib/cre/environment/placement_plan.go @@ -1,6 +1,7 @@ package environment import ( + "errors" "fmt" "strings" @@ -87,9 +88,8 @@ func validateUnsupportedPlacements( continue } if bc.Placement == config.PlacementLocal { - return fmt.Errorf( - "remote nodesets with local blockchains are not supported in this PoC. " + - "Set all blockchains to placement=remote, or run nodesets with placement=local so nodes stay colocated with local blockchains", + return errors.New("remote nodesets with local blockchains are not supported in this PoC. " + + "Set all blockchains to placement=remote, or run nodesets with placement=local so nodes stay colocated with local blockchains", ) } } diff --git a/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go b/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go index d9ccf114d9b..e748fc17ef0 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/cmd/local-agent/main.go @@ -10,13 +10,17 @@ import ( "github.com/rs/zerolog" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" blockchainsets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) func main() { + os.Exit(run()) +} + +func run() int { defaultAddr := "127.0.0.1:18080" if runtimecfg.IsDirectMode() { defaultAddr = "0.0.0.0:18080" @@ -34,6 +38,7 @@ func main() { lggr.Info().Msgf("starting local CRE agent on %s", *addr) if err := agent.Run(ctx, *addr, server); err != nil { _, _ = 
fmt.Fprintf(os.Stderr, "agent failed: %v\n", err) - os.Exit(1) + return 1 } + return 0 } diff --git a/system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go b/system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go index 6429bf954a3..57f2687f7d4 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/deploy_test.go @@ -5,11 +5,12 @@ import ( "errors" "testing" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" - "github.com/stretchr/testify/require" ) type fakeStarterDeployer struct { diff --git a/system-tests/lib/cre/environment/remoteexec/agent/relay.go b/system-tests/lib/cre/environment/remoteexec/agent/relay.go index caf2314de3f..4aa06f7d787 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/relay.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/relay.go @@ -34,9 +34,9 @@ type openRelayRequest struct { } type openRelayResponse struct { - RelayID string `json:"relayId"` - RequestedPort int `json:"requestedPort"` - BoundPort int `json:"boundPort"` + RelayID string `json:"relayId"` + RequestedPort int `json:"requestedPort"` + BoundPort int `json:"boundPort"` } type closeRelayRequest struct { @@ -112,7 +112,8 @@ func (s *Server) openRelay(w http.ResponseWriter, r *http.Request) { s.relayMu.Unlock() listenAddr := fmt.Sprintf("0.0.0.0:%d", req.RequestedPort) - ln, err := net.Listen("tcp", listenAddr) + var lc net.ListenConfig + ln, err := lc.Listen(r.Context(), "tcp", listenAddr) if err != nil { s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to open relay listener: 
%v", err), nil) return @@ -142,9 +143,9 @@ func (s *Server) openRelay(w http.ResponseWriter, r *http.Request) { Msg("opened relay listener") s.respondJSONAny(w, http.StatusOK, openRelayResponse{ - RelayID: relayID, + RelayID: relayID, RequestedPort: req.RequestedPort, - BoundPort: listenerPort(ln), + BoundPort: listenerPort(ln), }) } @@ -204,7 +205,7 @@ func (s *Server) connectRelay(w http.ResponseWriter, r *http.Request) { relay, ok := s.relays[relayID] s.relayMu.Unlock() if !ok { - s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, fmt.Sprintf("relay not found: %s", relayID), nil) + s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, "relay not found: "+relayID, nil) return } @@ -291,7 +292,8 @@ func (s *Server) acceptRelayConnections(relay *relayRegistration) { return default: } - if ne, ok := err.(net.Error); ok && ne.Temporary() { + var ne net.Error + if errors.As(err, &ne) { time.Sleep(50 * time.Millisecond) continue } diff --git a/system-tests/lib/cre/environment/remoteexec/agent/relay_test.go b/system-tests/lib/cre/environment/remoteexec/agent/relay_test.go index 766b993414f..b21e2308ca6 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/relay_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/relay_test.go @@ -2,6 +2,7 @@ package agent import ( "bytes" + "context" "encoding/json" "fmt" "io" @@ -27,12 +28,13 @@ func TestRelay_OpenConnectBridgeAndClose(t *testing.T) { RequestedPort: 0, }) require.NotEmpty(t, openResp.RelayID) - require.Greater(t, openResp.BoundPort, 0) + require.Positive(t, openResp.BoundPort) wsConn := mustConnectRelayWS(t, httpServer.URL, openResp.RelayID) defer wsConn.Close() - tcpConn, err := net.Dial("tcp", fmt.Sprintf("127.0.0.1:%d", openResp.BoundPort)) + dialer := net.Dialer{} + tcpConn, err := dialer.DialContext(context.Background(), "tcp", fmt.Sprintf("127.0.0.1:%d", openResp.BoundPort)) require.NoError(t, err, "tcp client should connect to opened relay port") defer tcpConn.Close() @@ 
-100,7 +102,10 @@ func mustOpenRelay(t *testing.T, baseURL string, req openRelayRequest) openRelay t.Helper() body, err := json.Marshal(req) require.NoError(t, err) - resp, err := http.Post(baseURL+"/v1/relay/open", "application/json", bytes.NewReader(body)) + httpReq, err := http.NewRequestWithContext(context.Background(), http.MethodPost, baseURL+"/v1/relay/open", bytes.NewReader(body)) + require.NoError(t, err) + httpReq.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(httpReq) require.NoError(t, err) defer resp.Body.Close() require.Equal(t, http.StatusOK, resp.StatusCode) @@ -114,7 +119,10 @@ func mustCloseRelay(t *testing.T, baseURL, relayID string) map[string]any { t.Helper() body, err := json.Marshal(closeRelayRequest{RelayID: relayID}) require.NoError(t, err) - resp, err := http.Post(baseURL+"/v1/relay/close", "application/json", bytes.NewReader(body)) + httpReq, err := http.NewRequestWithContext(context.Background(), http.MethodPost, baseURL+"/v1/relay/close", bytes.NewReader(body)) + require.NoError(t, err) + httpReq.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(httpReq) require.NoError(t, err) defer resp.Body.Close() require.Equal(t, http.StatusOK, resp.StatusCode) @@ -138,7 +146,8 @@ func mustConnectRelayWS(t *testing.T, baseURL, relayID string) *websocket.Conn { func reserveFreePort(t *testing.T) int { t.Helper() - ln, err := net.Listen("tcp", "127.0.0.1:0") + var lc net.ListenConfig + ln, err := lc.Listen(context.Background(), "tcp", "127.0.0.1:0") require.NoError(t, err) defer ln.Close() addr, ok := ln.Addr().(*net.TCPAddr) diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server.go b/system-tests/lib/cre/environment/remoteexec/agent/server.go index 786c0f69f55..becb4f3effb 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server.go @@ -120,6 +120,7 @@ type CTFResourcesResponse struct { 
Volumes []string `json:"volumes,omitempty"` } +//nolint:revive // AgentStatusResponse is the API contract; renaming would break external callers type AgentStatusResponse struct { AgentVersion string `json:"agentVersion,omitempty"` ProtocolVersion string `json:"protocolVersion,omitempty"` @@ -141,6 +142,7 @@ type RelayInfo struct { BoundPort int `json:"boundPort"` } +//nolint:revive // AgentLocksResponse is the API contract; renaming would break external callers type AgentLocksResponse struct { LifecycleBusy bool `json:"lifecycleBusy"` CacheEntries int `json:"cacheEntries"` @@ -351,7 +353,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { } if envelope.SchemaVersion != SchemaVersionV1 { - s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedSchema, fmt.Sprintf("unsupported schema version: %s", envelope.SchemaVersion), nil) + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedSchema, "unsupported schema version: "+envelope.SchemaVersion, nil) return } if envelope.Operation == OperationDeployArtifacts { @@ -364,7 +366,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { return } if payload.ComponentType != ComponentTypeBlockchain && payload.ComponentType != ComponentTypeJD && payload.ComponentType != ComponentTypeNodeSet { - s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedComponent, fmt.Sprintf("unsupported component type: %s", payload.ComponentType), nil) + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedComponent, "unsupported component type: "+payload.ComponentType, nil) return } @@ -378,7 +380,7 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { return } if envelope.Operation != OperationStartComponent { - s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedOperation, fmt.Sprintf("unsupported operation: %s", envelope.Operation), nil) + s.respondError(w, http.StatusBadRequest, ErrCodeUnsupportedOperation, "unsupported operation: 
"+envelope.Operation, nil) return } payloadHash := hashPayload(envelope.Payload) @@ -386,8 +388,8 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { // Keep this stderr write explicit so startup behavior is visible when agent runs as a subprocess. requestLog := fmt.Sprintf("[cre-agent] starting component type=%s key=%s", payload.ComponentType, componentKey) _, _ = fmt.Fprintln(os.Stderr, requestLog) - s.beginInFlight(fmt.Sprintf("start:%s", componentKey), inFlightOperationScopeLifecycle) - defer s.endInFlight(fmt.Sprintf("start:%s", componentKey)) + s.beginInFlight("start:"+componentKey, inFlightOperationScopeLifecycle) + defer s.endInFlight("start:" + componentKey) preStartLogs := make([]string, 0, 2) s.lifecycleMu.Lock() defer s.lifecycleMu.Unlock() @@ -473,11 +475,12 @@ func (s *Server) startComponent(w http.ResponseWriter, r *http.Request) { var output map[string]any var encErr error - if blockchainOutput != nil { + switch { + case blockchainOutput != nil: output, encErr = EncodeForTransport(blockchainOutput) - } else if jdOutput != nil { + case jdOutput != nil: output, encErr = EncodeForTransport(jdOutput) - } else if nodeSetOutput != nil { + case nodeSetOutput != nil: output, encErr = EncodeForTransport(nodeSetOutput) } if encErr != nil { @@ -529,7 +532,7 @@ func (s *Server) deployArtifacts(w http.ResponseWriter, r *http.Request, rawPayl return } if len(containerNames) == 0 { - s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, fmt.Sprintf("no nodeset containers found for pattern %s", containerPrefix), nil) + s.respondError(w, http.StatusNotFound, ErrCodeDeployFailed, "no nodeset containers found for pattern "+containerPrefix, nil) return } @@ -575,8 +578,8 @@ func (s *Server) deployArtifacts(w http.ResponseWriter, r *http.Request, rawPayl } func (s *Server) stopComponentByKey(w http.ResponseWriter, r *http.Request, componentType, componentKey string) { - s.beginInFlight(fmt.Sprintf("stop:%s", componentKey), 
inFlightOperationScopeLifecycle) - defer s.endInFlight(fmt.Sprintf("stop:%s", componentKey)) + s.beginInFlight("stop:"+componentKey, inFlightOperationScopeLifecycle) + defer s.endInFlight("stop:" + componentKey) s.lifecycleMu.Lock() defer s.lifecycleMu.Unlock() @@ -725,11 +728,11 @@ func (s *Server) discoverOwnedContainers(ctx context.Context, fn func() error) ( } if msg.Action == "create" || msg.Action == "start" { eventMu.Lock() - eventIDs = append(eventIDs, msg.ID) + eventIDs = append(eventIDs, msg.Actor.ID) eventMu.Unlock() } - case err, ok := <-errs: - if !ok || err == nil { + case evtErr, ok := <-errs: + if !ok || evtErr == nil { return } return @@ -873,17 +876,17 @@ func componentCacheKey(payload StartComponentPayload) (string, error) { switch payload.ComponentType { case ComponentTypeBlockchain: if payload.Blockchain == nil { - return "", fmt.Errorf("blockchain payload is required") + return "", errors.New("blockchain payload is required") } return fmt.Sprintf("%s:%s:%s", payload.ComponentType, payload.Blockchain.Type, payload.Blockchain.ChainID), nil case ComponentTypeJD: if payload.JD == nil { - return "", fmt.Errorf("jd payload is required") + return "", errors.New("jd payload is required") } return fmt.Sprintf("%s:%s", payload.ComponentType, payload.JD.Image), nil case ComponentTypeNodeSet: if payload.NodeSet == nil { - return "", fmt.Errorf("nodeset payload is required") + return "", errors.New("nodeset payload is required") } return fmt.Sprintf("%s:%s", payload.ComponentType, payload.NodeSet.Name), nil default: @@ -893,8 +896,9 @@ func componentCacheKey(payload StartComponentPayload) (string, error) { func Run(ctx context.Context, addr string, srv *Server) error { httpSrv := &http.Server{ - Addr: addr, - Handler: srv.Handler(), + Addr: addr, + Handler: srv.Handler(), + ReadHeaderTimeout: 10 * time.Second, } errCh := make(chan error, 1) @@ -906,7 +910,7 @@ func Run(ctx context.Context, addr string, srv *Server) error { case <-ctx.Done(): return 
httpSrv.Shutdown(context.Background()) case err := <-errCh: - if err == http.ErrServerClosed { + if errors.Is(err, http.ErrServerClosed) { return nil } return err diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go index f2122ef5c31..56fb92480d4 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_chip_sink.go @@ -15,6 +15,7 @@ import ( "time" "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/chipsink" ) @@ -73,13 +74,13 @@ func (s *Server) startChipTestSink(w http.ResponseWriter, r *http.Request) { } eventLogPath := defaultChipSinkEventLogPath() - if err := os.MkdirAll(filepath.Dir(eventLogPath), 0o755); err != nil { - s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to prepare chip sink log directory: %v", err), nil) + if mkdirErr := os.MkdirAll(filepath.Dir(eventLogPath), 0o755); mkdirErr != nil { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to prepare chip sink log directory: %v", mkdirErr), nil) return } // Start with a clean event stream per launch. 
- if err := os.Remove(eventLogPath); err != nil && !os.IsNotExist(err) { - s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to reset chip sink event log: %v", err), nil) + if removeErr := os.Remove(eventLogPath); removeErr != nil && !os.IsNotExist(removeErr) { + s.respondError(w, http.StatusInternalServerError, ErrCodeDeployFailed, fmt.Sprintf("failed to reset chip sink event log: %v", removeErr), nil) return } var eventLogMu sync.Mutex diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go b/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go index 8524b2d70e3..10a06f0ac38 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_component_logs.go @@ -23,11 +23,10 @@ func (s *Server) appendComponentLogs(componentKey string, lines []string) { s.logsMu.Lock() defer s.logsMu.Unlock() - existing := append(s.componentLogs[componentKey], filtered...) - if len(existing) > componentLogsRingSize { - existing = existing[len(existing)-componentLogsRingSize:] + s.componentLogs[componentKey] = append(s.componentLogs[componentKey], filtered...) 
+ if len(s.componentLogs[componentKey]) > componentLogsRingSize { + s.componentLogs[componentKey] = s.componentLogs[componentKey][len(s.componentLogs[componentKey])-componentLogsRingSize:] } - s.componentLogs[componentKey] = existing } func (s *Server) getComponentLogs(componentKey string, limit int) ([]string, int) { diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go index e6cf240dd22..c808116ae0d 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_handlers_test.go @@ -10,9 +10,10 @@ import ( "time" "github.com/rs/zerolog" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/jd" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" - "github.com/stretchr/testify/require" ) func TestHealthEndpointReturnsOK(t *testing.T) { @@ -143,7 +144,7 @@ func TestStatusEndpointReturnsAgentState(t *testing.T) { require.Contains(t, resp.ComponentLogKeys, "nodeset:workflow") require.Len(t, resp.Relays, 1) require.Equal(t, "workflow-ocr-0", resp.Relays[0].Name) - require.Greater(t, resp.Relays[0].BoundPort, 0) + require.Positive(t, resp.Relays[0].BoundPort) require.Len(t, resp.InFlight, 1) } diff --git a/system-tests/lib/cre/environment/remoteexec/agent/server_test.go b/system-tests/lib/cre/environment/remoteexec/agent/server_test.go index 3ec3ea73d47..7ccb80b5ddd 100644 --- a/system-tests/lib/cre/environment/remoteexec/agent/server_test.go +++ b/system-tests/lib/cre/environment/remoteexec/agent/server_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/rs/zerolog" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" ) diff --git 
a/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go b/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go index 25f048625b0..79d31e5697d 100644 --- a/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go +++ b/system-tests/lib/cre/environment/remoteexec/chipsink/event_decode.go @@ -6,11 +6,12 @@ import ( "strings" "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" + commonevents "github.com/smartcontractkit/chainlink-protos/workflows/go/common" workflowevents "github.com/smartcontractkit/chainlink-protos/workflows/go/events" workfloweventsv2 "github.com/smartcontractkit/chainlink-protos/workflows/go/v2" - "google.golang.org/protobuf/encoding/protojson" - "google.golang.org/protobuf/proto" ) // EventData decodes known CHiP workflow event types to human-readable JSON maps. diff --git a/system-tests/lib/cre/environment/remoteexec/chipsink/server.go b/system-tests/lib/cre/environment/remoteexec/chipsink/server.go index 50669b634d3..83aaab6c26b 100644 --- a/system-tests/lib/cre/environment/remoteexec/chipsink/server.go +++ b/system-tests/lib/cre/environment/remoteexec/chipsink/server.go @@ -15,9 +15,10 @@ import ( "time" "github.com/cloudevents/sdk-go/binding/format/protobuf/v2/pb" - chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + + chippb "github.com/smartcontractkit/chainlink-common/pkg/chipingress/pb" ) const listenerReadyTimeout = 5 * time.Second diff --git a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go index 31e7a4c2429..b9de26012c1 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go +++ b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection.go @@ -3,6 +3,7 @@ package 
client import ( "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -47,7 +48,7 @@ func GetComponentLogs(ctx context.Context, runtime *Runtime, componentKey string } componentKey = strings.TrimSpace(componentKey) if componentKey == "" { - return nil, fmt.Errorf("componentKey is required") + return nil, errors.New("componentKey is required") } q := url.Values{} @@ -66,11 +67,11 @@ func GetComponentLogs(ctx context.Context, runtime *Runtime, componentKey string func runtimeBaseURL(runtime *Runtime) (string, error) { if runtime == nil { - return "", fmt.Errorf("runtime is nil") + return "", errors.New("runtime is nil") } baseURL := strings.TrimSpace(runtime.AgentBaseURL) if baseURL == "" { - return "", fmt.Errorf("runtime is missing agent base url") + return "", errors.New("runtime is missing agent base url") } return baseURL, nil } diff --git a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go index 26a7f1cf314..be3536f8932 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/agent_introspection_test.go @@ -7,8 +7,10 @@ import ( "net/http/httptest" "testing" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) func TestGetAgentStatusSuccess(t *testing.T) { @@ -36,8 +38,8 @@ func TestGetAgentLocksSuccess(t *testing.T) { func TestGetComponentLogsSuccess(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, "nodeset:workflow", r.URL.Query().Get("componentKey")) - require.Equal(t, "5", r.URL.Query().Get("limit")) + assert.Equal(t, "nodeset:workflow", 
r.URL.Query().Get("componentKey")) + assert.Equal(t, "5", r.URL.Query().Get("limit")) _ = json.NewEncoder(w).Encode(agent.ComponentLogsResponse{ ComponentKey: "nodeset:workflow", TotalLines: 8, diff --git a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go index 8ca91eae018..d6b18a40cbc 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote.go @@ -4,7 +4,7 @@ import ( "context" "encoding/base64" "encoding/json" - "fmt" + "errors" "os" "path/filepath" @@ -22,10 +22,10 @@ func DeployArtifactsToRemoteNodeSet( files []string, ) error { if nodeSetName == "" { - return fmt.Errorf("nodeset name is required") + return errors.New("nodeset name is required") } if containerTargetDir == "" { - return fmt.Errorf("container target dir is required") + return errors.New("container target dir is required") } remoteRuntime, err := ResolveRuntime(lggr) @@ -48,7 +48,7 @@ func DeployArtifactsToRemoteNodeSet( }) } if len(payloadFiles) == 0 { - return fmt.Errorf("no artifact files to deploy") + return errors.New("no artifact files to deploy") } payloadBytes, err := json.Marshal(agent.DeployArtifactsPayload{ diff --git a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go index c9c052160d8..cd9ef484d95 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/artifacts_remote_test.go @@ -11,9 +11,11 @@ import ( "testing" "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" - 
"github.com/stretchr/testify/require" ) func TestDeployArtifactsToRemoteNodeSetValidation(t *testing.T) { @@ -61,18 +63,18 @@ func TestDeployArtifactsToRemoteNodeSetSuccess(t *testing.T) { _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"}) case "/v1/components/start": var envelope agent.StartComponentEnvelope - require.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) - require.Equal(t, agent.OperationDeployArtifacts, envelope.Operation) + assert.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) + assert.Equal(t, agent.OperationDeployArtifacts, envelope.Operation) var payload agent.DeployArtifactsPayload - require.NoError(t, json.Unmarshal(envelope.Payload, &payload)) - require.Equal(t, "workflow", payload.NodeSetName) - require.Equal(t, "/home/chainlink/workflows", payload.TargetDir) - require.Len(t, payload.Files, 1) - require.Equal(t, "artifact.wasm", payload.Files[0].Name) + assert.NoError(t, json.Unmarshal(envelope.Payload, &payload)) + assert.Equal(t, "workflow", payload.NodeSetName) + assert.Equal(t, "/home/chainlink/workflows", payload.TargetDir) + assert.Len(t, payload.Files, 1) + assert.Equal(t, "artifact.wasm", payload.Files[0].Name) raw, err := base64.StdEncoding.DecodeString(payload.Files[0].ContentBase64) - require.NoError(t, err) - require.Equal(t, "artifact-content", string(raw)) + assert.NoError(t, err) + assert.Equal(t, "artifact-content", string(raw)) _ = json.NewEncoder(w).Encode(agent.StartComponentResponse{ ComponentType: ComponentTypeNodeSet, diff --git a/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go index 0acbdb1fec0..ce1f72bc5b9 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/chip_sink_remote_test.go @@ -8,14 +8,16 @@ import ( "testing" "time" - 
"github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) func TestStartRemoteChipTestSinkSuccess(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, "/v1/chip/sink/start", r.URL.Path) - require.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v1/chip/sink/start", r.URL.Path) + assert.Equal(t, http.MethodPost, r.Method) _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStartResponse{ Profile: "sink", Mode: "remote", @@ -33,8 +35,8 @@ func TestStartRemoteChipTestSinkSuccess(t *testing.T) { func TestStopRemoteChipTestSinkSuccess(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, "/v1/chip/sink/stop", r.URL.Path) - require.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v1/chip/sink/stop", r.URL.Path) + assert.Equal(t, http.MethodPost, r.Method) _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStopResponse{Found: true, Stopped: true}) })) defer server.Close() @@ -47,8 +49,8 @@ func TestStopRemoteChipTestSinkSuccess(t *testing.T) { func TestGetRemoteChipTestSinkStatusSuccess(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, "/v1/chip/sink/status", r.URL.Path) - require.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/v1/chip/sink/status", r.URL.Path) + assert.Equal(t, http.MethodGet, r.Method) _ = json.NewEncoder(w).Encode(agent.ChipTestSinkStatusResponse{ Profile: "sink", Mode: "remote", @@ -67,9 +69,9 @@ func TestGetRemoteChipTestSinkStatusSuccess(t *testing.T) { func TestGetRemoteChipTestSinkEventsSuccess(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - 
require.Equal(t, "/v1/chip/sink/events", r.URL.Path) - require.Equal(t, http.MethodGet, r.Method) - require.Equal(t, "5", r.URL.Query().Get("limit")) + assert.Equal(t, "/v1/chip/sink/events", r.URL.Path) + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "5", r.URL.Query().Get("limit")) _ = json.NewEncoder(w).Encode(agent.ChipTestSinkEventsResponse{ Events: []agent.ChipTestSinkEventLogEntry{{Type: "workflows.v1.UserLogs"}}, }) diff --git a/system-tests/lib/cre/environment/remoteexec/client/compatibility.go b/system-tests/lib/cre/environment/remoteexec/client/compatibility.go index 652adbcd72f..b8ff2cda918 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/compatibility.go +++ b/system-tests/lib/cre/environment/remoteexec/client/compatibility.go @@ -2,6 +2,7 @@ package client import ( "context" + "errors" "fmt" "slices" "strconv" @@ -22,7 +23,7 @@ func CheckCompatibility(ctx context.Context, runtime *Runtime, requiredCapabilit func checkCompatibilityStatus(status *agent.AgentStatusResponse, requiredCapabilities []string) error { if status == nil { - return fmt.Errorf("agent status is nil") + return errors.New("agent status is nil") } if strings.TrimSpace(status.ProtocolVersion) != "" { diff --git a/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go b/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go index f1890414aba..caae6bbbd44 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/compatibility_test.go @@ -7,8 +7,9 @@ import ( "net/http/httptest" "testing" - "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" ) func TestCheckCompatibilityStatusAcceptsSameMajor(t *testing.T) { diff --git 
a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go index 8ddc462046f..9233cb6d057 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client.go @@ -125,13 +125,17 @@ func (c *httpComponentClient) StartComponent(ctx context.Context, envelope agent } var result *agent.StartComponentResponse + attempts := c.maxAttempts + if attempts < 1 { + attempts = 1 + } err := retry.Do( func() error { var err error result, err = c.startComponentOnce(ctx, envelope) return err }, - retry.Attempts(uint(c.maxAttempts)), + retry.Attempts(uint(attempts)), //nolint:gosec // G115: attempts is validated to be >= 1 retry.Delay(c.retryDelay), retry.Context(ctx), retry.LastErrorOnly(true), @@ -171,8 +175,8 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope a var startResp agent.StartComponentResponse if len(respBody) > 0 { - if err := json.Unmarshal(respBody, &startResp); err != nil { - return nil, retry.Unrecoverable(pkgerrors.Wrap(err, "failed to decode start component response")) + if unmarshalErr := json.Unmarshal(respBody, &startResp); unmarshalErr != nil { + return nil, retry.Unrecoverable(pkgerrors.Wrap(unmarshalErr, "failed to decode start component response")) } } @@ -204,6 +208,10 @@ func (c *httpComponentClient) startComponentOnce(ctx context.Context, envelope a func (c *httpComponentClient) waitForHealth(ctx context.Context) error { healthURL := c.baseURL + "/v1/health" + attempts := c.maxAttempts + if attempts < 1 { + attempts = 1 + } return retry.Do( func() error { req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) @@ -220,7 +228,7 @@ func (c *httpComponentClient) waitForHealth(ctx context.Context) error { } return fmt.Errorf("%s: status %s", describeRemoteAgentHealthFailure(c.baseURL), resp.Status) }, - 
retry.Attempts(uint(c.maxAttempts)), + retry.Attempts(uint(attempts)), //nolint:gosec // G115: attempts is validated to be >= 1 retry.Delay(c.retryDelay), retry.Context(ctx), retry.LastErrorOnly(true), diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go index a5716eeefa3..6cf7162f960 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_component_client_test.go @@ -10,9 +10,11 @@ import ( "testing" "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/remoteexec/agent" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" - "github.com/stretchr/testify/require" ) func TestResolveRemoteRuntimeWithExplicitEnv(t *testing.T) { @@ -33,7 +35,7 @@ func TestResolveRemoteRuntimeWithInputOverridesEnv(t *testing.T) { t.Setenv(EnvRemoteAgentPort, "19090") server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, "/v1/status", r.URL.Path) + assert.Equal(t, "/v1/status", r.URL.Path) _ = json.NewEncoder(w).Encode(agent.AgentStatusResponse{ ProtocolVersion: "1.0", Capabilities: []string{"component_logs", "locks", "deploy_artifacts", "start_component", "relay", "list_ctf_resources"}, diff --git a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go index db0b2603c38..428d5aac460 100644 --- a/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go +++ b/system-tests/lib/cre/environment/remoteexec/client/remote_stop_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" 
"github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" @@ -49,8 +50,8 @@ func TestCountRemoteStopTargets(t *testing.T) { func TestListRemoteCTFResources(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - require.Equal(t, http.MethodGet, r.Method) - require.Equal(t, "/v1/resources/ctf", r.URL.Path) + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/v1/resources/ctf", r.URL.Path) _, _ = w.Write([]byte(`{"containers":["c1","c2"],"volumes":["v1"]}`)) })) defer server.Close() @@ -109,14 +110,14 @@ func TestStopRemoteComponents_ResidualQueryFailureIsReportedInSummary(t *testing case "/v1/health": w.WriteHeader(http.StatusOK) case "/v1/status": - require.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) + assert.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) case "/v1/components/start": resp := agent.StartComponentResponse{ ComponentType: ComponentTypeBlockchain, Found: true, Stopped: true, } - require.NoError(t, json.NewEncoder(w).Encode(resp)) + assert.NoError(t, json.NewEncoder(w).Encode(resp)) case "/v1/resources/ctf": w.WriteHeader(http.StatusBadGateway) _, _ = w.Write([]byte("ctf listing down")) @@ -183,18 +184,18 @@ func newRemoteStopTestServer(t *testing.T) *httptest.Server { w.WriteHeader(http.StatusOK) return case "/v1/status": - require.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) + assert.NoError(t, json.NewEncoder(w).Encode(agent.AgentStatusResponse{ProtocolVersion: "1.0.0"})) return case "/v1/resources/ctf": _, _ = w.Write([]byte(`{"containers":["leftover-container"],"volumes":["leftover-volume"]}`)) return case "/v1/components/start": var envelope agent.StartComponentEnvelope - require.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) - require.Equal(t, agent.OperationStopComponent, envelope.Operation) + 
assert.NoError(t, json.NewDecoder(r.Body).Decode(&envelope)) + assert.Equal(t, agent.OperationStopComponent, envelope.Operation) var payload agent.StartComponentPayload - require.NoError(t, json.Unmarshal(envelope.Payload, &payload)) + assert.NoError(t, json.Unmarshal(envelope.Payload, &payload)) resp := agent.StartComponentResponse{ComponentType: payload.ComponentType} switch payload.ComponentType { @@ -210,7 +211,7 @@ func newRemoteStopTestServer(t *testing.T) *httptest.Server { default: t.Fatalf("unexpected component type %q", payload.ComponentType) } - require.NoError(t, json.NewEncoder(w).Encode(resp)) + assert.NoError(t, json.NewEncoder(w).Encode(resp)) return default: t.Fatalf("unexpected path %s", r.URL.Path) diff --git a/system-tests/lib/cre/features/evm/v2/evm.go b/system-tests/lib/cre/features/evm/v2/evm.go index a2a980d03f6..71611d00cdb 100644 --- a/system-tests/lib/cre/features/evm/v2/evm.go +++ b/system-tests/lib/cre/features/evm/v2/evm.go @@ -291,14 +291,14 @@ func createJobs( } var configBuffer bytes.Buffer - if err := tmpl.Execute(&configBuffer, templateData); err != nil { - return errors.Wrapf(err, "failed to execute %s config template", flag) + if execErr := tmpl.Execute(&configBuffer, templateData); execErr != nil { + return errors.Wrapf(execErr, "failed to execute %s config template", flag) } configStr := configBuffer.String() - if err := credon.ValidateTemplateSubstitution(configStr, flag); err != nil { - return fmt.Errorf("%s template validation failed: %w\nRendered template: %s", flag, err, configStr) + if valErr := credon.ValidateTemplateSubstitution(configStr, flag); valErr != nil { + return fmt.Errorf("%s template validation failed: %w\nRendered template: %s", flag, valErr, configStr) } evmKeyBundle, ok := workerNode.Keys.OCR2BundleIDs[chainselectors.FamilyEVM] // we can always expect evm bundle key id present since evm is the registry chain diff --git a/system-tests/lib/cre/features/vault/vault.go 
b/system-tests/lib/cre/features/vault/vault.go index 3473d2aeed3..3b8f500432c 100644 --- a/system-tests/lib/cre/features/vault/vault.go +++ b/system-tests/lib/cre/features/vault/vault.go @@ -138,25 +138,6 @@ func updateNodeConfig(workerNode *cre.NodeMetadata, currentConfig string, regist return ptr.Ptr(string(stringifiedConfig)), nil } -func pendingQueueEnabled(don *cre.Don) bool { - os, ok := don.GetCapabilityConfig(flag) - if !ok { - return false - } - setting, ok := os.Values["EnableDeterministicPendingQueue"] - - if !ok { - return false - } - - enabled, ok := setting.(bool) - if !ok { - return false - } - - return enabled -} - func (o *Vault) PostEnvStartup( ctx context.Context, testLogger zerolog.Logger, diff --git a/system-tests/lib/cre/internal/dockerops/files.go b/system-tests/lib/cre/internal/dockerops/files.go index a8567004b53..a84e737f67f 100644 --- a/system-tests/lib/cre/internal/dockerops/files.go +++ b/system-tests/lib/cre/internal/dockerops/files.go @@ -11,6 +11,7 @@ import ( ctypes "github.com/docker/docker/api/types/container" dc "github.com/docker/docker/client" "github.com/pkg/errors" + "github.com/smartcontractkit/chainlink-testing-framework/framework" ) diff --git a/system-tests/lib/cre/runtimecfg/access_mode.go b/system-tests/lib/cre/runtimecfg/access_mode.go index db147049d9f..69d0aa14824 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode.go +++ b/system-tests/lib/cre/runtimecfg/access_mode.go @@ -14,8 +14,8 @@ import ( ) const ( - EnvRemoteHostIP = "CRE_REMOTE_HOST_IP" - EnvLocalHostIP = "CRE_LOCAL_HOST_IP" + EnvRemoteHostIP = "CRE_REMOTE_HOST_IP" + EnvLocalHostIP = "CRE_LOCAL_HOST_IP" EnvRemoteAgentEC2InstanceID = "CRE_REMOTE_AGENT_EC2_INSTANCE_ID" defaultEC2Region = "us-west-2" @@ -50,12 +50,13 @@ func LocalHostIP() string { if gatewayIP := discoverDockerNetworkGatewayIP(framework.DefaultNetworkName); gatewayIP != "" { return gatewayIP } - ips, err := net.LookupIP("host.docker.internal") + resolver := net.Resolver{} + addrs, err := 
resolver.LookupIPAddr(context.Background(), "host.docker.internal") if err != nil { return "" } - for _, ip := range ips { - if ipv4 := ip.To4(); ipv4 != nil { + for _, addr := range addrs { + if ipv4 := addr.IP.To4(); ipv4 != nil { return ipv4.String() } } diff --git a/system-tests/lib/cre/runtimecfg/access_mode_test.go b/system-tests/lib/cre/runtimecfg/access_mode_test.go index da32ff4eb36..822be08be96 100644 --- a/system-tests/lib/cre/runtimecfg/access_mode_test.go +++ b/system-tests/lib/cre/runtimecfg/access_mode_test.go @@ -33,7 +33,7 @@ func TestResolveAWSCLIProfileSelectionOrder(t *testing.T) { t.Setenv("AWS_ACCESS_KEY_ID", "key") t.Setenv("AWS_SECRET_ACCESS_KEY", "secret") profile, mode := ResolveAWSCLIProfileSelection() - require.Equal(t, "", profile) + require.Empty(t, profile) require.Equal(t, "env-creds", mode) t.Setenv("AWS_ACCESS_KEY_ID", "") @@ -41,7 +41,7 @@ func TestResolveAWSCLIProfileSelectionOrder(t *testing.T) { t.Setenv("AWS_WEB_IDENTITY_TOKEN_FILE", "/tmp/token") t.Setenv("AWS_ROLE_ARN", "arn:aws:iam::123456789012:role/Role") profile, mode = ResolveAWSCLIProfileSelection() - require.Equal(t, "", profile) + require.Empty(t, profile) require.Equal(t, "web-identity", mode) t.Setenv("AWS_WEB_IDENTITY_TOKEN_FILE", "") diff --git a/system-tests/lib/cre/topology.go b/system-tests/lib/cre/topology.go index 76ae63196d7..f15433fff50 100644 --- a/system-tests/lib/cre/topology.go +++ b/system-tests/lib/cre/topology.go @@ -138,7 +138,7 @@ func (t *Topology) HasRemoteNodeSets() bool { // BootstrapPlacement returns placement of the configured bootstrap DON. 
func (t *Topology) BootstrapPlacement() (string, error) { if t == nil || t.DonsMetadata == nil { - return "", fmt.Errorf("topology is nil") + return "", errors.New("topology is nil") } for _, don := range t.DonsMetadata.List() { if don == nil || don.ns == nil { @@ -148,13 +148,13 @@ func (t *Topology) BootstrapPlacement() (string, error) { return strings.TrimSpace(don.ns.Placement), nil } } - return "", fmt.Errorf("failed to resolve bootstrap placement") + return "", errors.New("failed to resolve bootstrap placement") } // BootstrapAnnouncePort returns OCR2 announce port for the bootstrap node. func (t *Topology) BootstrapAnnouncePort() (int, error) { if t == nil || t.DonsMetadata == nil { - return 0, fmt.Errorf("topology is nil") + return 0, errors.New("topology is nil") } for _, don := range t.DonsMetadata.List() { if don == nil { @@ -166,7 +166,7 @@ func (t *Topology) BootstrapAnnouncePort() (int, error) { } return don.ResolveNodeOCR2AnnouncePort(node.Index), nil } - return 0, fmt.Errorf("failed to resolve bootstrap announce port") + return 0, errors.New("failed to resolve bootstrap announce port") } // AddGatewayHandlers adds the given handler names to the gateway config of the given DON. It only adds handlers, if they are not already present. 
diff --git a/system-tests/lib/cre/workflow/docker.go b/system-tests/lib/cre/workflow/docker.go index d1f03b8473b..655a2102479 100644 --- a/system-tests/lib/cre/workflow/docker.go +++ b/system-tests/lib/cre/workflow/docker.go @@ -3,9 +3,10 @@ package workflow import ( "context" "fmt" - "github.com/pkg/errors" "os" + "github.com/pkg/errors" + "github.com/smartcontractkit/chainlink/system-tests/lib/cre/internal/dockerops" ) diff --git a/system-tests/lib/go.mod b/system-tests/lib/go.mod index 76b4c153d89..88fbaec11d3 100644 --- a/system-tests/lib/go.mod +++ b/system-tests/lib/go.mod @@ -17,6 +17,7 @@ require ( github.com/alitto/pond/v2 v2.5.0 github.com/andybalholm/brotli v1.2.0 github.com/avast/retry-go/v4 v4.6.1 + github.com/cloudevents/sdk-go/binding/format/protobuf/v2 v2.16.2 github.com/cockroachdb/errors v1.11.3 github.com/containerd/errdefs v1.0.0 github.com/cosmos/gogoproto v1.7.0 @@ -37,6 +38,7 @@ require ( github.com/smartcontractkit/chainlink-ccip/chains/solana v0.0.0-20260121163256-85accaf3d28d github.com/smartcontractkit/chainlink-common v0.10.1-0.20260227202051-0f1cea05d443 github.com/smartcontractkit/chainlink-common/keystore v1.0.2 + github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 github.com/smartcontractkit/chainlink-deployments-framework v0.80.1-0.20260209182815-b296b7df28a6 github.com/smartcontractkit/chainlink-evm v0.3.4-0.20260227175232-0de99d1959de github.com/smartcontractkit/chainlink-evm/gethwrappers v0.0.0-20251222115927-36a18321243c @@ -151,7 +153,6 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect github.com/chaos-mesh/chaos-mesh/api v0.0.0-20240821051457-da69c6d9617a // indirect - github.com/cloudevents/sdk-go/binding/format/protobuf/v2 v2.16.2 // indirect github.com/cloudevents/sdk-go/v2 v2.16.2 // indirect github.com/cloudwego/base64x v0.1.4 // indirect github.com/cloudwego/iasm v0.2.0 // indirect @@ -446,7 +447,6 @@ require ( 
github.com/smartcontractkit/chainlink-ccip/chains/solana/gobindings v0.0.0-20250912190424-fd2e35d7deb5 // indirect github.com/smartcontractkit/chainlink-ccip/deployment v0.0.0-20260129103204-4c8453dd8139 // indirect github.com/smartcontractkit/chainlink-ccv v0.0.0-20260225114453-965dabf4bcb0 // indirect - github.com/smartcontractkit/chainlink-common/pkg/chipingress v0.0.10 // indirect github.com/smartcontractkit/chainlink-data-streams v0.1.12-0.20260227110503-42b236799872 // indirect github.com/smartcontractkit/chainlink-evm/contracts/cre/gobindings v0.0.0-20260107191744-4b93f62cffe3 // indirect github.com/smartcontractkit/chainlink-feeds v0.1.2-0.20250227211209-7cd000095135 // indirect diff --git a/system-tests/tests/load/cre/writer_don_load_test.go b/system-tests/tests/load/cre/writer_don_load_test.go index 5675769ba91..19ee4c96259 100644 --- a/system-tests/tests/load/cre/writer_don_load_test.go +++ b/system-tests/tests/load/cre/writer_don_load_test.go @@ -52,9 +52,9 @@ import ( cretypes "github.com/smartcontractkit/chainlink/system-tests/lib/cre" libcontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" creenv "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment" - creenvconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" creevm "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" blockchain_sets "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/sets" + creenvconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/flags" mock_capability "github.com/smartcontractkit/chainlink/system-tests/lib/cre/mock" pb2 "github.com/smartcontractkit/chainlink/system-tests/lib/cre/mock/pb" diff --git a/system-tests/tests/smoke/cre/v2_vault_don_test.go b/system-tests/tests/smoke/cre/v2_vault_don_test.go index d55d82b537f..742a64f2d7c 100644 
--- a/system-tests/tests/smoke/cre/v2_vault_don_test.go +++ b/system-tests/tests/smoke/cre/v2_vault_don_test.go @@ -24,8 +24,8 @@ import ( "github.com/smartcontractkit/chainlink-testing-framework/seth" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" crecontracts "github.com/smartcontractkit/chainlink/system-tests/lib/cre/contracts" - creconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains/evm" + creconfig "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/config" t_helpers "github.com/smartcontractkit/chainlink/system-tests/tests/test-helpers" "github.com/smartcontractkit/chainlink/v2/core/capabilities/vault/vaulttypes" diff --git a/system-tests/tests/test-helpers/fixture_relay_helpers.go b/system-tests/tests/test-helpers/fixture_relay_helpers.go index 54f504e414c..d19cedec44b 100644 --- a/system-tests/tests/test-helpers/fixture_relay_helpers.go +++ b/system-tests/tests/test-helpers/fixture_relay_helpers.go @@ -52,7 +52,7 @@ var ( // It is a no-op when no remote NodeSets are configured. 
func EnsureFixtureRelayForPort(t *testing.T, testEnv *ttypes.TestEnvironment, relayName string, localPort int) { t.Helper() - require.Greater(t, localPort, 0, "fixture relay local port must be > 0") + require.Positive(t, localPort, "fixture relay local port must be > 0") cfg := resolveEnvConfigForRelay(t, testEnv) if !hasRemoteNodeSets(cfg) { @@ -171,7 +171,7 @@ func openRelay(ctx context.Context, agentBaseURL, name string, requestedPort int return "", err } if strings.TrimSpace(out.RelayID) == "" { - return "", fmt.Errorf("open relay returned empty relayId") + return "", errors.New("open relay returned empty relayId") } return out.RelayID, nil } @@ -220,7 +220,8 @@ func relayWorker(ctx context.Context, agentBaseURL, relayID, localFixtureAddr st continue } - localConn, err := net.DialTimeout("tcp", localFixtureAddr, 2*time.Second) + dialer := net.Dialer{Timeout: 2 * time.Second} + localConn, err := dialer.DialContext(ctx, "tcp", localFixtureAddr) if err != nil { _ = ws.Close() time.Sleep(backoff) diff --git a/system-tests/tests/test-helpers/t_helpers.go b/system-tests/tests/test-helpers/t_helpers.go index 529e24020a4..775674bb33c 100644 --- a/system-tests/tests/test-helpers/t_helpers.go +++ b/system-tests/tests/test-helpers/t_helpers.go @@ -667,7 +667,7 @@ func CompileAndDeployWorkflow[T WorkflowConfig](t *testing.T, WorkflowRegistryAddr: common.HexToAddress(workflowRegistryAddress.Address), WorkflowRegistryVersion: workflowRegistryAddress.Version, ChainID: registryChainSelector, - DonID: workflowDONs[0].ID, //TODO think how to make this more robust, we are naively assuming that the first workflow DON is the one we want to register the workflow for + DonID: workflowDONs[0].ID, // TODO think how to make this more robust, we are naively assuming that the first workflow DON is the one we want to register the workflow for ContainerTargetDir: creworkflow.DefaultWorkflowTargetDir, Blockchains: testEnv.CreEnvironment.Blockchains, } From 
c7946b395f884ea78a61b8ccd9cc7bd19928b6dc Mon Sep 17 00:00:00 2001 From: Bartek Tofel Date: Tue, 3 Mar 2026 10:33:31 +0100 Subject: [PATCH 34/34] fix broken tests --- system-tests/lib/cre/don/config/config.go | 132 +++++++++++++++--- .../lib/cre/don/config/config_test.go | 64 +++++++++ .../environment/blockchains/solana/solana.go | 14 +- .../blockchains/solana/solana_test.go | 32 +++++ 4 files changed, 220 insertions(+), 22 deletions(-) create mode 100644 system-tests/lib/cre/environment/blockchains/solana/solana_test.go diff --git a/system-tests/lib/cre/don/config/config.go b/system-tests/lib/cre/don/config/config.go index 172ab407bca..add5685d7cd 100644 --- a/system-tests/lib/cre/don/config/config.go +++ b/system-tests/lib/cre/don/config/config.go @@ -26,6 +26,7 @@ import ( solcfg "github.com/smartcontractkit/chainlink-solana/pkg/solana/config" "github.com/smartcontractkit/chainlink-testing-framework/framework" chipingressset "github.com/smartcontractkit/chainlink-testing-framework/framework/components/dockercompose/chip_ingress_set" + ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink-testing-framework/lib/utils/ptr" keystone_changeset "github.com/smartcontractkit/chainlink/deployment/keystone/changeset" @@ -68,11 +69,11 @@ func PrepareNodeTOMLs( if peeringErr != nil { return nil, errors.Wrap(peeringErr, "failed to find peering data") } - ocrBootstrapPlacement, placementErr := topology.BootstrapPlacement() + ocrBootstrapPlacement, placementErr := resolveBootstrapPlacement(topology, bt.UUID) if placementErr != nil { return nil, placementErr } - ocrBootstrapAnnouncePort, announcePortErr := topology.BootstrapAnnouncePort() + ocrBootstrapAnnouncePort, announcePortErr := resolveBootstrapAnnouncePort(topology, bt.UUID) if announcePortErr != nil { return nil, announcePortErr } @@ -255,7 +256,7 @@ func generateNodeTomlConfig(input cre.GenerateConfigsInput, nodeConfigTransforme } case 
cre.GatewayNode: var cErr error - nodeConfig, cErr = addGatewayNodeConfig(nodeConfig, input.OCRPeeringData, commonInputs) + nodeConfig, cErr = addGatewayNodeConfig(nodeConfig, input.OCRPeeringData, commonInputs, nodeMetadata) if cErr != nil { return nil, errors.Wrapf(cErr, "failed to add gateway node config for node at index %d in DON %s", nodeIdx, input.DonMetadata.Name) } @@ -354,10 +355,10 @@ func addBootstrapNodeConfig( EnableExperimentalRageP2P: ptr.Ptr(true), } if donMetadata != nil && nodeMetadata != nil { - announcePort := donMetadata.ResolveNodeOCR2AnnouncePort(nodeMetadata.Index) + announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), nodeMetadata.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, - topology.HasRemoteNodeSets(), + hasRemoteNodeSets(topology), announcePort, ) if announceErr != nil { @@ -457,10 +458,10 @@ func addWorkerNodeConfig( }, EnableExperimentalRageP2P: ptr.Ptr(true), } - announcePort := donMetadata.ResolveNodeOCR2AnnouncePort(m.Index) + announcePort := resolveNodeOCR2AnnouncePort(donMetadata.MustNodeSet(), m.Index) announceAddresses, announceErr := cre.ResolveP2PAnnounceAddresses( donMetadata.MustNodeSet().Placement, - topology.HasRemoteNodeSets(), + hasRemoteNodeSets(topology), announcePort, ) if announceErr != nil { @@ -589,6 +590,7 @@ func addGatewayNodeConfig( existingConfig corechainlink.Config, ocrPeeringData cre.OCRPeeringData, commonInputs *commonInputs, + m *cre.NodeMetadata, ) (corechainlink.Config, error) { // TODO: remove this in the future? 
// Unless node has Peering enabled it won't create capabilities registry syncer and all requests to vault handler will fail, @@ -712,7 +714,7 @@ func gatherCommonInputs(input cre.GenerateConfigsInput) (*commonInputs, error) { version: input.ContractVersions[keystone_changeset.CapabilitiesRegistry.String()], }, remoteHostIP: input.RemoteHostIP, - provider: input.Provider, + provider: input.Provider, }, nil } @@ -755,19 +757,24 @@ func findEVMChains(input cre.GenerateConfigsInput) ([]*evmChain, error) { if err != nil { return nil, err } - resolvedWS, err := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ - Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), - Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, - External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, - }) - if err != nil { - return nil, err + wsRPC := "" + // Tron node config only needs HTTP; WS can legitimately be absent in topology outputs. + if bcOut.ChainID() != TronEVMChainID { + resolvedWS, wsErr := connectivity.Resolve(callerPlacement, targetPlacement, connectivity.EndpointPair{ + Name: fmt.Sprintf("evm-ws-%d", bcOut.ChainID()), + Internal: bcOut.CtfOutput().Nodes[0].InternalWSUrl, + External: bcOut.CtfOutput().Nodes[0].ExternalWSUrl, + }) + if wsErr != nil { + return nil, wsErr + } + wsRPC = resolvedWS.URL } evmChains = append(evmChains, &evmChain{ Name: fmt.Sprintf("node-%d", chainSelector), ChainID: bcOut.ChainID(), HTTPRPC: resolvedHTTP.URL, - WSRPC: resolvedWS.URL, + WSRPC: wsRPC, }) } return evmChains, nil @@ -908,6 +915,89 @@ func appendSolanaChain(existingConfig *solcfg.TOMLConfigs, solChain *solanaChain }) } +func hasRemoteNodeSets(topology *cre.Topology) bool { + if topology == nil { + return false + } + for _, nodeSet := range topology.NodeSets() { + if nodeSet != nil && strings.EqualFold(strings.TrimSpace(nodeSet.Placement), "remote") { + return true + } + } + return false +} + +func resolveNodeOCR2AnnouncePort(nodeSet *cre.NodeSet, nodeIndex int) int { + 
base := 0 + if nodeSet != nil { + base = nodeSet.OCR2P2PRangeStart + if base == 0 { + httpStart := nodeSet.HTTPPortRangeStart + if httpStart == 0 { + httpStart = ns.DefaultHTTPPortStaticRangeStart + } + base = httpStart + (ns.DefaultOCR2P2PStaticRangeStart - ns.DefaultHTTPPortStaticRangeStart) + } + } + if base == 0 { + base = ns.DefaultOCR2P2PStaticRangeStart + } + if nodeIndex < 0 { + nodeIndex = 0 + } + return base + nodeIndex +} + +func resolveBootstrapPlacement(topology *cre.Topology, bootstrapNodeUUID string) (string, error) { + if topology == nil { + return "", fmt.Errorf("topology is nil") + } + bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) + if bootstrapNodeUUID == "" { + return "", fmt.Errorf("bootstrap node UUID is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != bootstrapNodeUUID { + continue + } + return strings.TrimSpace(don.MustNodeSet().Placement), nil + } + } + return "", fmt.Errorf("failed to resolve bootstrap placement for node UUID %s", bootstrapNodeUUID) +} + +func resolveBootstrapAnnouncePort(topology *cre.Topology, bootstrapNodeUUID string) (int, error) { + if topology == nil { + return 0, fmt.Errorf("topology is nil") + } + bootstrapNodeUUID = strings.TrimSpace(bootstrapNodeUUID) + if bootstrapNodeUUID == "" { + return 0, fmt.Errorf("bootstrap node UUID is empty") + } + for _, don := range topology.DonsMetadata.List() { + if don == nil { + continue + } + for _, node := range don.NodesMetadata { + if node == nil || strings.TrimSpace(node.UUID) == "" { + continue + } + if node.UUID != bootstrapNodeUUID { + continue + } + return resolveNodeOCR2AnnouncePort(don.MustNodeSet(), node.Index), nil + } + } + return 0, fmt.Errorf("failed to resolve bootstrap announce port for node UUID %s", bootstrapNodeUUID) +} + func 
resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, bootstrapHost string, internalPort, externalPort int, remoteHostIP string) (string, error) { caller, err := connectivity.PlacementFromTarget(callerPlacement) if err != nil { @@ -920,7 +1010,7 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot // Local callers need EC2-host reachable port for remote bootstrap nodes. if caller == connectivity.PlacementLocal && target == connectivity.PlacementRemote { if !runtimecfg.IsDirectMode() { - return "", errors.New("mixed DON bootstrap resolution requires direct mode") + return "", fmt.Errorf("mixed DON bootstrap resolution requires direct mode") } hostIP := strings.TrimSpace(remoteHostIP) if hostIP == "" { @@ -937,7 +1027,7 @@ func resolveNodeFacingBootstrapAddress(callerPlacement, bootstrapPlacement, boot func resolveGatewayConnectorURL(callerPlacementRaw string, topology *cre.Topology, gateway *cre.DonGatewayConfiguration, remoteHostIP string) (string, error) { if gateway == nil || gateway.GatewayConfiguration == nil { - return "", errors.New("gateway configuration is nil") + return "", fmt.Errorf("gateway configuration is nil") } callerPlacement, err := connectivity.PlacementFromTarget(callerPlacementRaw) if err != nil { @@ -984,11 +1074,11 @@ func blockchainPlacementsBySelector(configured []*envconfig.Blockchain, deployed func resolveNodePlacement(topology *cre.Topology, nodeUUID string) (connectivity.Placement, error) { if topology == nil { - return "", errors.New("topology is nil") + return "", fmt.Errorf("topology is nil") } trimmedUUID := strings.TrimSpace(nodeUUID) if trimmedUUID == "" { - return "", errors.New("node uuid is empty") + return "", fmt.Errorf("node uuid is empty") } for _, don := range topology.DonsMetadata.List() { if don == nil { @@ -1011,7 +1101,7 @@ func gatewayExternalHost(targetPlacement connectivity.Placement, remoteHostIP st switch targetPlacement { case connectivity.PlacementRemote: if 
!runtimecfg.IsDirectMode() { - return "", errors.New("gateway connector resolution for remote targets requires direct mode") + return "", fmt.Errorf("gateway connector resolution for remote targets requires direct mode") } if hostIP := strings.TrimSpace(remoteHostIP); hostIP != "" { return hostIP, nil diff --git a/system-tests/lib/cre/don/config/config_test.go b/system-tests/lib/cre/don/config/config_test.go index e2a476a2187..390a3a9fee0 100644 --- a/system-tests/lib/cre/don/config/config_test.go +++ b/system-tests/lib/cre/don/config/config_test.go @@ -1,15 +1,19 @@ package config import ( + "context" "strings" "testing" "github.com/stretchr/testify/require" + cldf_chain "github.com/smartcontractkit/chainlink-deployments-framework/chain" "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" ns "github.com/smartcontractkit/chainlink-testing-framework/framework/components/simple_node_set" "github.com/smartcontractkit/chainlink/system-tests/lib/cre" + creblockchains "github.com/smartcontractkit/chainlink/system-tests/lib/cre/environment/blockchains" "github.com/smartcontractkit/chainlink/system-tests/lib/cre/runtimecfg" "github.com/smartcontractkit/chainlink/system-tests/lib/infra" ) @@ -138,6 +142,66 @@ func TestResolveNodeFacingBootstrapAddress_PlacementMatrix(t *testing.T) { } } +type fakeBlockchain struct { + selector uint64 + id uint64 + out *blockchain.Output +} + +func (f *fakeBlockchain) ChainSelector() uint64 { return f.selector } +func (f *fakeBlockchain) ChainID() uint64 { return f.id } +func (f *fakeBlockchain) ChainFamily() string { return f.out.Family } +func (f *fakeBlockchain) IsFamily(chainFamily string) bool { return strings.EqualFold(f.out.Family, chainFamily) } +func (f *fakeBlockchain) Fund(_ context.Context, _ string, _ uint64) error { return nil } +func (f 
*fakeBlockchain) CtfOutput() *blockchain.Output { return f.out } +func (f *fakeBlockchain) ToCldfChain() (cldf_chain.BlockChain, error) { return nil, nil } + +var _ creblockchains.Blockchain = (*fakeBlockchain)(nil) + +func TestFindEVMChains_AllowsMissingWSForTron(t *testing.T) { + nodeSet := &cre.NodeSet{ + Input: &ns.Input{ + Name: "workflow", + }, + Placement: "local", + SupportedEVMChains: []uint64{TronEVMChainID}, + } + donMetadata, err := cre.NewDonMetadata(nodeSet, 1, infra.Provider{Type: infra.Docker}, nil) + require.NoError(t, err) + + input := cre.GenerateConfigsInput{ + DonMetadata: donMetadata, + Blockchains: map[uint64]creblockchains.Blockchain{ + TronEVMChainID: &fakeBlockchain{ + selector: TronEVMChainID, + id: TronEVMChainID, + out: &blockchain.Output{ + Type: blockchain.TypeTron, + Family: blockchain.FamilyEVM, + Nodes: []*blockchain.Node{ + { + InternalHTTPUrl: "http://tron:9090/jsonrpc", + ExternalHTTPUrl: "http://localhost:9090/jsonrpc", + InternalWSUrl: "", + ExternalWSUrl: "", + }, + }, + }, + }, + }, + BlockchainPlacementBySelector: map[uint64]string{ + TronEVMChainID: "local", + }, + } + + evmChains, err := findEVMChains(input) + require.NoError(t, err, "tron should not require WS endpoint resolution") + require.Len(t, evmChains, 1) + require.Equal(t, TronEVMChainID, evmChains[0].ChainID) + require.NotEmpty(t, evmChains[0].HTTPRPC) + require.Empty(t, evmChains[0].WSRPC, "tron WSRPC should remain empty when source has no ws endpoint") +} + func mustBuildGatewayTopology(t *testing.T, targetPlacement string) (*cre.Topology, *cre.DonGatewayConfiguration) { t.Helper() diff --git a/system-tests/lib/cre/environment/blockchains/solana/solana.go b/system-tests/lib/cre/environment/blockchains/solana/solana.go index 46ea5a90e92..d1b2fdc91fe 100644 --- a/system-tests/lib/cre/environment/blockchains/solana/solana.go +++ b/system-tests/lib/cre/environment/blockchains/solana/solana.go @@ -149,6 +149,12 @@ func (s *Deployer) Start(ctx context.Context, input 
*blockchain.Input) (*blockch } } + // Some call paths reconstruct from output only and expect ChainID to be populated. + // Preserve configured chain ID when deployer output leaves it empty. + if bcOut != nil && strings.TrimSpace(bcOut.ChainID) == "" && input != nil { + bcOut.ChainID = strings.TrimSpace(input.ChainID) + } + return bcOut, nil } @@ -156,7 +162,13 @@ func From(input *blockchain.Input, out *blockchain.Output) (*Blockchain, error) if out == nil { return nil, pkgerrors.New("blockchain output is nil") } - chainID := out.ChainID + chainID := strings.TrimSpace(out.ChainID) + if chainID == "" && input != nil { + chainID = strings.TrimSpace(input.ChainID) + } + if chainID == "" { + return nil, errors.New("solana chain id is required for reconstruction") + } sel, ok := chainselectors.SolanaChainIdToChainSelector()[chainID] if !ok { return nil, fmt.Errorf("selector not found for solana chainID '%s'", chainID) diff --git a/system-tests/lib/cre/environment/blockchains/solana/solana_test.go b/system-tests/lib/cre/environment/blockchains/solana/solana_test.go new file mode 100644 index 00000000000..80364550613 --- /dev/null +++ b/system-tests/lib/cre/environment/blockchains/solana/solana_test.go @@ -0,0 +1,32 @@ +package solana + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/blockchain" +) + +func TestFromFallsBackToInputChainIDWhenOutputMissing(t *testing.T) { + t.Setenv("SOLANA_PRIVATE_KEY", DefaultSolanaPrivateKey.String()) + + contractsDir := t.TempDir() + input := &blockchain.Input{ + ChainID: "22222222222222222222222222222222222222222222", + ContractsDir: contractsDir, + } + out := &blockchain.Output{ + Type: blockchain.TypeSolana, + ChainID: "", + Family: blockchain.FamilySolana, + Nodes: []*blockchain.Node{ + {ExternalHTTPUrl: "http://localhost:8550"}, + }, + } + + got, err := From(input, out) + require.NoError(t, err, "expected reconstruction to use input 
chain id fallback") + require.Equal(t, input.ChainID, got.SolanaChainID, "expected fallback chain id to be retained") +} +