Skip to content

Commit b434ecf

Browse files
authored
[core] fix set/get env races caused by OtlpGrpcMetricExporterOptions (ray-project#61034)
Currently, there is a chance that a worker can crash on the `getenv` syscall from the otel lazy initialization. We found the race is between `setenv` on the user thread (`setenv(RBLN_DEVICES)`) and `getenv` on the worker internal thread. However, we can't forbid `setenv` on a user's thread; the only thing we can do is not call `getenv` once the user's thread starts. Here is the backtrace of the crash we found by intercepting the `getenv`: ``` [getenv_preload] setenv name=RBLN_DEVICES value= overwrite=1 [getenv_preload] setenv backtrace: #0 /home/ray/getenv_trace_preload.so(setenv+0x73) [0x748a77ea870b] #1 ray::IDLE(+0x224d5b) [0x59f10aeead5b] #2 ray::IDLE(+0x13dfc3) [0x59f10ae03fc3] #3 ray::IDLE(_PyEval_EvalFrameDefault+0x313) [0x59f10adf3703] #4 ray::IDLE(+0x184bfd) [0x59f10ae4abfd] #5 ray::IDLE(+0x19da04) [0x59f10ae63a04] #6 ray::IDLE(_PyEval_EvalFrameDefault+0x115a) [0x59f10adf454a] #7 ray::IDLE(_PyFunction_Vectorcall+0x6c) [0x59f10ae03dfc] #8 ray::IDLE(_PyEval_EvalFrameDefault+0x49ae) [0x59f10adf7d9e] #9 ray::IDLE(_PyFunction_Vectorcall+0x6c) [0x59f10ae03dfc] #10 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x9a9333) [0x748a76270333] #11 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZNSt17_Function_handlerIFN3ray6StatusERKNS0_3rpc7AddressENS2_8TaskTypeESsRKNS0_4core11RayFunctionERKSt13unordered_mapISsdSt4hashISsESt8equal_toISsESaISt4pairIKSsdEEERKSt6vectorISt10shared_ptrINS0_9RayObjectEESaISQ_EERKSN_INS2_15ObjectReferenceESaISV_EERSH_S10_PSN_ISG_INS0_8ObjectIDESQ_ESaIS12_EES15_PSN_ISG_IS11_bESaIS16_EERSO_INS0_17LocalMemoryBufferEEPbPSsS1E_RKSN_INS0_16ConcurrencyGroupESaIS1F_EESsbbblRKSt8optionalISsEEPFS1_S5_S6_SsSA_SM_SU_SZ_SsSsS15_S15_S19_S1C_S1D_S1E_S1E_S1J_SsbbblS1L_EE9_M_invokeERKSt9_Any_dataS5_OS6_OSsSA_SM_SU_SZ_S10_S10_OS15_S1X_OS19_S1C_OS1D_OS1E_S20_S1J_S1W_ObS21_S21_OlS1N_+0x1ab) [0x748a761786ab] #12 
/home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core10CoreWorker11ExecuteTaskERKNS_17TaskSpecificationESt8optionalISt13unordered_mapISsSt6vectorISt4pairIldESaIS9_EESt4hashISsESt8equal_toISsESaIS8_IKSsSB_EEEEPS7_IS8_INS_8ObjectIDESt10shared_ptrINS_9RayObjectEEESaISP_EESS_PS7_IS8_ISL_bESaIST_EEPN6google8protobuf16RepeatedPtrFieldINS_3rpc20ObjectReferenceCountEEEPbPSsS15_+0x1166) [0x748a76320a96] #13 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZNSt17_Function_handlerIFN3ray6StatusERKNS0_17TaskSpecificationESt8optionalISt13unordered_mapISsSt6vectorISt4pairIldESaIS9_EESt4hashISsESt8equal_toISsESaIS8_IKSsSB_EEEEPS7_IS8_INS0_8ObjectIDESt10shared_ptrINS0_9RayObjectEEESaISP_EESS_PS7_IS8_ISL_bESaIST_EEPN6google8protobuf16RepeatedPtrFieldINS0_3rpc20ObjectReferenceCountEEEPbPSsS15_ESt5_BindIFMNS0_4core10CoreWorkerEFS1_S4_SK_SS_SS_SW_S13_S14_S15_S15_EPS19_St12_PlaceholderILi1EES1D_ILi2EES1D_ILi3EES1D_ILi4EES1D_ILi5EES1D_ILi6EES1D_ILi7EES1D_ILi8EES1D_ILi9EEEEE9_M_invokeERKSt9_Any_dataS4_OSK_OSS_S1U_OSW_OS13_OS14_OS15_S1Y_+0x87) [0x748a762e8647] #14 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xb5186d) [0x748a7641886d] #15 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xb557c5) [0x748a7641c7c5] #16 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x103e3eb) [0x748a769053eb] #17 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x1034f0b) [0x748a768fbf0b] #18 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xb6f21b) [0x748a7643621b] #19 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x15893cb) [0x748a76e503cb] #20 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x158ad69) [0x748a76e51d69] #21 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x158b472) [0x748a76e52472] #22 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core10CoreWorker20RunTaskExecutionLoopEv+0x132) 
[0x748a762e4252] #23 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core21CoreWorkerProcessImpl26RunWorkerTaskExecutionLoopEv+0x41) [0x748a76336bd1] #24 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x8a45c1) [0x748a7616b5c1] #25 ray::IDLE(_PyEval_EvalFrameDefault+0x6fb) [0x59f10adf3aeb] #26 ray::IDLE(_PyFunction_Vectorcall+0x6c) [0x59f10ae03dfc] #27 ray::IDLE(_PyEval_EvalFrameDefault+0x6fb) [0x59f10adf3aeb] #28 ray::IDLE(+0x1d5cac) [0x59f10ae9bcac] #29 ray::IDLE(PyEval_EvalCode+0x85) [0x59f10ae9bbf5] #30 ray::IDLE(+0x20732a) [0x59f10aecd32a] #31 ray::IDLE(+0x201d13) [0x59f10aec7d13] #32 ray::IDLE(+0x976be) [0x59f10ad5d6be] #33 ray::IDLE(_PyRun_SimpleFileObject+0x1bb) [0x59f10aec23db] #34 ray::IDLE(_PyRun_AnyFileObject+0x44) [0x59f10aec1f74] #35 ray::IDLE(Py_RunMain+0x371) [0x59f10aebf3e1] #36 ray::IDLE(Py_BytesMain+0x37) [0x59f10ae8f447] #37 /lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x748a77baad90] #38 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80) [0x748a77baae40] #39 ray::IDLE(+0x1c930e) [0x59f10ae8f30e] [getenv_preload] getenv name=OTEL_CPP_EXPORTER_OTLP_METRICS_RETRY_BACKOFF_MULTIPLIER [getenv_preload] backtrace: #0 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x10a9d17) [0x7321ce3c9d17] #1 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x10abe2b) [0x7321ce3cbe2b] #2 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x1050ffc) [0x7321ce370ffc] #3 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x104f4d7) [0x7321ce36f4d7] #4 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x1045833) [0x7321ce365833] #5 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xa6c760) [0x7321cdd8c760] #6 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xe69d9a) [0x7321ce189d9a] #7 
/home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray3rpc14ClientCallImplINS0_16HealthCheckReplyEE15OnReplyReceivedEv+0x165) [0x7321ce18c005] #8 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(_ZNSt17_Function_handlerIFvvEZN3ray3rpc17ClientCallManager29PollEventsFromCompletionQueueEiEUlvE_E9_M_invokeERKSt9_Any_data+0x15) [0x7321cdd8e475] #9 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x103e3eb) [0x7321ce35e3eb] #10 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x1034f0b) [0x7321ce354f0b] #11 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xb6f21b) [0x7321cde8f21b] #12 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x15893cb) [0x7321ce8a93cb] #13 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x158ad69) [0x7321ce8aad69] #14 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0x158b472) [0x7321ce8ab472] #15 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xa6bb54) [0x7321cdd8bb54] #16 /home/ray/anaconda3/lib/python3.10/site-packages/ray/_raylet.so(+0xba2250) [0x7321cdec2250] #17 /lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7321cf66eac3] #18 /lib/x86_64-linux-gnu/libc.so.6(+0x1268d0) [0x7321cf7008d0] *** SIGSEGV received at time=1770862205 on cpu 1 *** PC: @ 0x748a77bc5c1d (unknown) getenv @ 0x748a77bc3520 (unknown) (unknown) {"asctime":"2026-02-11 18:10:05,910","levelname":"E","message":"*** SIGSEGV received at time=1770862205 on cpu 1 ***","filename":"logging.cc","lineno":474} {"asctime":"2026-02-11 18:10:05,910","levelname":"E","message":"PC: @ 0x748a77bc5c1d (unknown) getenv","filename":"logging.cc","lineno":474} {"asctime":"2026-02-11 18:10:05,910","levelname":"E","message":" @ 0x748a77bc3520 (unknown) (unknown)","filename":"logging.cc","lineno":474} Fatal Python error: Segmentation fault ``` According to the backtrace, we can identify that it is the `OtlpGrpcMetricExporterOptions`, [which called 
`getenv(OTEL_CPP_EXPORTER_OTLP_METRICS_RETRY_BACKOFF_MULTIPLIER)`](https://github.com/open-telemetry/opentelemetry-cpp/blob/13ad05a6f431efb76995cffb1225d26b45374749/exporters/otlp/src/otlp_grpc_metric_exporter_options.cc#L47), being lazily initialized via `InitOpenTelemetryExporter` in the `metrics_agent_client_->WaitForServerReady()` callback, that causes the issue. This PR makes `OtlpGrpcMetricExporterOptions` a member of `OpenTelemetryMetricRecorder` (so that we keep otel details encapsulated) and forces its initialization early, in `stats::Init()`, so that we don't call `getenv` afterward. --------- Signed-off-by: Rueian Huang <rueiancsie@gmail.com>
1 parent f1a1039 commit b434ecf

3 files changed

Lines changed: 25 additions & 10 deletions

File tree

src/ray/observability/open_telemetry_metric_recorder.cc

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -89,28 +89,28 @@ void OpenTelemetryMetricRecorder::Start(const std::string &endpoint,
8989
std::chrono::milliseconds interval,
9090
std::chrono::milliseconds timeout) {
9191
// Create an OTLP exporter
92-
opentelemetry::exporter::otlp::OtlpGrpcMetricExporterOptions exporter_options;
93-
exporter_options.endpoint = endpoint;
92+
exporter_options_.endpoint = endpoint;
9493
// This line ensures that only the delta values for count and sum are exported during
9594
// each collection interval. This is necessary because the dashboard agent already
9695
// accumulates these metrics—re-accumulating them during export would lead to double
9796
// counting.
98-
exporter_options.aggregation_temporality =
97+
exporter_options_.aggregation_temporality =
9998
opentelemetry::exporter::otlp::PreferredAggregationTemporality::kDelta;
10099
// Add authentication token to metadata if auth is enabled
101100
if (rpc::GetAuthenticationMode() == rpc::AuthenticationMode::TOKEN) {
102101
auto token = rpc::AuthenticationTokenLoader::instance().GetToken();
103102
if (token && !token->empty()) {
104-
exporter_options.metadata.insert(
105-
{std::string(kAuthTokenKey), token->ToAuthorizationHeaderValue()});
103+
const std::string auth_key(kAuthTokenKey);
104+
exporter_options_.metadata.erase(auth_key);
105+
exporter_options_.metadata.insert({auth_key, token->ToAuthorizationHeaderValue()});
106106
}
107107
}
108108
// Configure TLS/SSL credentials to match how Ray's gRPC servers are configured.
109109
// When USE_TLS is enabled, the dashboard agent's gRPC server uses SSL, so the
110110
// OpenTelemetry exporter must also use SSL to connect successfully.
111111
// See https://github.com/ray-project/ray/issues/59968
112112
if (RayConfig::instance().USE_TLS()) {
113-
exporter_options.use_ssl_credentials = true;
113+
exporter_options_.use_ssl_credentials = true;
114114

115115
// Load CA certificate for server verification.
116116
// Reuse ReadCert from ray/rpc/common.h for consistency with other TLS code paths.
@@ -119,7 +119,7 @@ void OpenTelemetryMetricRecorder::Start(const std::string &endpoint,
119119
std::string ca_cert = rpc::ReadCert(ca_cert_file);
120120
RAY_CHECK(!ca_cert.empty())
121121
<< "Failed to read CA certificate file: " << ca_cert_file;
122-
exporter_options.ssl_credentials_cacert_as_string = std::move(ca_cert);
122+
exporter_options_.ssl_credentials_cacert_as_string = std::move(ca_cert);
123123
}
124124

125125
#ifdef ENABLE_OTLP_GRPC_SSL_MTLS_PREVIEW
@@ -138,13 +138,13 @@ void OpenTelemetryMetricRecorder::Start(const std::string &endpoint,
138138
std::string client_cert = rpc::ReadCert(client_cert_file);
139139
RAY_CHECK(!client_cert.empty())
140140
<< "Failed to read client certificate file: " << client_cert_file;
141-
exporter_options.ssl_client_cert_string = std::move(client_cert);
141+
exporter_options_.ssl_client_cert_string = std::move(client_cert);
142142
}
143143
if (!client_key_file.empty()) {
144144
std::string client_key = rpc::ReadCert(client_key_file);
145145
RAY_CHECK(!client_key.empty())
146146
<< "Failed to read client key file: " << client_key_file;
147-
exporter_options.ssl_client_key_string = std::move(client_key);
147+
exporter_options_.ssl_client_key_string = std::move(client_key);
148148
}
149149
RAY_LOG(INFO) << "OpenTelemetry metric exporter configured with TLS and mTLS enabled";
150150
#else
@@ -156,9 +156,16 @@ void OpenTelemetryMetricRecorder::Start(const std::string &endpoint,
156156
<< "but mTLS support is not available (SDK built without "
157157
<< "ENABLE_OTLP_GRPC_SSL_MTLS_PREVIEW). Ray's gRPC servers require "
158158
<< "client certificates when TLS is enabled.";
159+
#endif
160+
} else {
161+
exporter_options_.use_ssl_credentials = false;
162+
exporter_options_.ssl_credentials_cacert_as_string.clear();
163+
#ifdef ENABLE_OTLP_GRPC_SSL_MTLS_PREVIEW
164+
exporter_options_.ssl_client_cert_string.clear();
165+
exporter_options_.ssl_client_key_string.clear();
159166
#endif
160167
}
161-
auto exporter = std::make_unique<OpenTelemetryMetricExporter>(exporter_options);
168+
auto exporter = std::make_unique<OpenTelemetryMetricExporter>(exporter_options_);
162169

163170
// Initialize the OpenTelemetry SDK and create a Meter
164171
opentelemetry::sdk::metrics::PeriodicExportingMetricReaderOptions reader_options;

src/ray/observability/open_telemetry_metric_recorder.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#pragma once
1616

17+
#include <opentelemetry/exporters/otlp/otlp_grpc_metric_exporter.h>
1718
#include <opentelemetry/metrics/meter.h>
1819
#include <opentelemetry/metrics/observer_result.h>
1920
#include <opentelemetry/metrics/sync_instruments.h>
@@ -111,6 +112,7 @@ class OpenTelemetryMetricRecorder {
111112
private:
112113
OpenTelemetryMetricRecorder();
113114
std::shared_ptr<opentelemetry::sdk::metrics::MeterProvider> meter_provider_;
115+
opentelemetry::exporter::otlp::OtlpGrpcMetricExporterOptions exporter_options_;
114116

115117
// Map of metric names to their observations (aka. set of tags and metric values).
116118
// This contains all data points for a given metric for a given interval. This map

src/ray/stats/stats.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ static inline void Init(
8686
}
8787
RAY_LOG(DEBUG) << "Initialized stats";
8888

89+
// Force-initialize OtlpGrpcMetricExporterOptions in the OpenTelemetryMetricRecorder
90+
// early to avoid setenv/getenv races from lazy GetInstance().
91+
if (RayConfig::instance().enable_open_telemetry()) {
92+
OpenTelemetryMetricRecorder::GetInstance();
93+
}
94+
8995
// Set interval.
9096
StatsConfig::instance().SetReportInterval(absl::Milliseconds(std::max(
9197
RayConfig::instance().metrics_report_interval_ms(), static_cast<uint64_t>(1000))));

0 commit comments

Comments
 (0)