From 61ffa899251d5ddc62e5e00d1cad3fe4912c0145 Mon Sep 17 00:00:00 2001
From: Tyler Griggs
Date: Sun, 25 Jan 2026 08:09:08 +0000
Subject: [PATCH] Separate infrastructure logs from training progress

- Add SKYRL_LOG_DIR and SKYRL_LOG_LEVEL environment variables
- Create ray_logging.py with redirect_actor_output_to_file() helper
- Redirect vLLM engine and worker output to log file
- Keep training progress (from skyrl_entrypoint) on stdout
- Add documentation in docs/LOGGING_IMPROVEMENTS.md

Infrastructure logs (vLLM model loading, KV cache, etc.) now go to
/tmp/skyrl-logs/{run_name}/infra.log while training progress remains on
stdout for visibility.

Co-Authored-By: Claude Opus 4.5
---
 skyrl-train/docs/LOGGING_IMPROVEMENTS.md      | 88 +++++++++++++++++++
 skyrl-train/skyrl_train/env_vars.py           | 20 +++++
 .../inference_engines/vllm/vllm_engine.py     | 10 +++
 skyrl-train/skyrl_train/utils/ray_logging.py  | 27 ++++++
 skyrl-train/skyrl_train/utils/utils.py        | 42 +++++++--
 skyrl-train/skyrl_train/workers/worker.py     |  5 ++
 6 files changed, 187 insertions(+), 5 deletions(-)
 create mode 100644 skyrl-train/docs/LOGGING_IMPROVEMENTS.md
 create mode 100644 skyrl-train/skyrl_train/utils/ray_logging.py

diff --git a/skyrl-train/docs/LOGGING_IMPROVEMENTS.md b/skyrl-train/docs/LOGGING_IMPROVEMENTS.md
new file mode 100644
index 000000000..ebfa2e3e2
--- /dev/null
+++ b/skyrl-train/docs/LOGGING_IMPROVEMENTS.md
@@ -0,0 +1,88 @@
+# Logging Improvements for SkyRL
+
+## Goal
+Separate infrastructure logs from training progress logs:
+- **Training progress** (config, steps, rewards, metrics, timings) → stdout
+- **Infrastructure logs** (vLLM engine, model loading, KV cache) → log file only
+- **SKYRL_LOG_LEVEL=DEBUG** → show all logs on stdout (for debugging)
+
+## Implementation
+
+### Approach: OS-level stdout/stderr redirection with `os.dup2`
+
+The solution redirects stdout/stderr at the OS level for specific Ray actors (vLLM engines, workers), while leaving the training entrypoint unredirected so training progress reaches stdout.
+
+### Files Modified
+
+1. **`skyrl_train/env_vars.py`** - Added environment variables:
+   - `SKYRL_LOG_DIR` - Base directory for logs (default: `/tmp/skyrl-logs`)
+   - `SKYRL_LOG_LEVEL` - Log level; DEBUG shows all logs on stdout
+
+2. **`skyrl_train/utils/ray_logging.py`** (new file) - Helper module:
+   ```python
+   def redirect_actor_output_to_file():
+       """Redirect stdout/stderr to log file."""
+       log_file = os.getenv("SKYRL_LOG_FILE")
+       if log_file:
+           log_fd = open(log_file, "a", buffering=1)
+           os.dup2(log_fd.fileno(), sys.stdout.fileno())
+           os.dup2(log_fd.fileno(), sys.stderr.fileno())
+   ```
+
+3. **`skyrl_train/utils/utils.py`**:
+   - `initialize_ray()` - Sets up the log file path and passes it to workers via `runtime_env`
+   - `configure_ray_worker_logging()` - Only configures loguru (no redirect here)
+
+4. **`skyrl_train/inference_engines/vllm/vllm_engine.py`**:
+   - `VLLMInferenceEngine.__init__()` - Calls `redirect_actor_output_to_file()`
+   - `AsyncVLLMInferenceEngine.__init__()` - Calls `redirect_actor_output_to_file()`
+
+5. **`skyrl_train/workers/worker.py`**:
+   - `BaseWorker.__init__()` - Calls `redirect_actor_output_to_file()`
+
+### How It Works
+
+1. `initialize_ray()` creates the log directory and sets the `SKYRL_LOG_FILE` env var
+2. `log_to_driver=True` allows Ray to forward actor output to driver stdout
+3. vLLM engines call `redirect_actor_output_to_file()` in `__init__` → logs go to file instead of being forwarded (see the sketch below)
+4. Workers call `redirect_actor_output_to_file()` in `__init__` → logs go to file
+5. `skyrl_entrypoint` does NOT redirect → training progress reaches stdout
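+
+For illustration, here is a minimal sketch of the same pattern applied to a
+custom actor. `NoisyActor` is hypothetical (not part of this change) and
+assumes `initialize_ray()` has already set `SKYRL_LOG_FILE` and propagated it
+to workers through `runtime_env`:
+
+```python
+import ray
+
+from skyrl_train.utils.ray_logging import redirect_actor_output_to_file
+
+
+@ray.remote
+class NoisyActor:
+    """Hypothetical actor following the same pattern as the vLLM engines."""
+
+    def __init__(self):
+        # Redirect first: everything printed from here on is written to the
+        # file named by SKYRL_LOG_FILE instead of being forwarded to the driver.
+        redirect_actor_output_to_file()
+        print("loading model ...")  # lands in infra.log, not on the terminal
+```
+
+Creating the actor from the driver (`NoisyActor.remote()`) then produces no
+output on stdout; the `print` lands in `{SKYRL_LOG_DIR}/{run_name}/infra.log`.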
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SKYRL_LOG_DIR` | `/tmp/skyrl-logs` | Base directory for log files |
+| `SKYRL_LOG_FILE` | (auto-set) | Full path to `infra.log` (set by `initialize_ray()`) |
+| `SKYRL_LOG_LEVEL` | `INFO` | Log level; `DEBUG` shows all logs on stdout |
+
+## Results
+
+### Before
+- 300+ lines on stdout, including vLLM model loading, KV cache allocation, tensor-parallel setup, etc.
+
+### After
+- **Stdout**: Config dump, dataset loading, training progress (steps, metrics, rewards)
+- **Log file** (`/tmp/skyrl-logs/{run_name}/infra.log`): vLLM engine logs, worker initialization, model loading
+
+### Known Limitations
+
+1. **Ray `(raylet)` logs still appear on stdout** - These are Ray system logs emitted before our actors start. They amount to a few lines, not the noisy vLLM output.
+
+2. **Ray dedup messages** - Ray may show "repeated Nx across cluster" messages, which are purely informational.
+
+## Usage
+
+```bash
+# Normal run - training progress on stdout, infra logs to file
+bash examples/gsm8k/run_gsm8k.sh
+
+# Debug mode - all logs on stdout
+SKYRL_LOG_LEVEL=DEBUG bash examples/gsm8k/run_gsm8k.sh
+
+# Custom log directory
+SKYRL_LOG_DIR=/path/to/logs bash examples/gsm8k/run_gsm8k.sh
+
+# View infrastructure logs
+tail -f /tmp/skyrl-logs/{run_name}/infra.log
+```
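+
+## Why `os.dup2`?
+
+A minimal, self-contained sketch of the technique (runnable outside Ray; the
+`/tmp/demo-infra.log` path is invented for the demo). Duplicating the log
+file's descriptor onto file descriptors 1 and 2 redirects everything written
+to them, including output from C/C++ extensions that bypass Python's
+`sys.stdout`, which is why the redirect happens at the OS level rather than
+by swapping `sys.stdout`:
+
+```python
+import os
+import sys
+
+# Line-buffered log file standing in for SKYRL_LOG_FILE.
+log = open("/tmp/demo-infra.log", "a", buffering=1)
+
+# Flush Python-level buffers so pending output is not misdirected.
+sys.stdout.flush()
+sys.stderr.flush()
+
+# Point fds 1 and 2 at the log file. The `log` object may be discarded
+# afterwards: dup2 creates independent descriptors for the same file.
+os.dup2(log.fileno(), sys.stdout.fileno())
+os.dup2(log.fileno(), sys.stderr.fileno())
+
+print("this line goes to /tmp/demo-infra.log, not the terminal")
+```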
diff --git a/skyrl-train/skyrl_train/env_vars.py b/skyrl-train/skyrl_train/env_vars.py
index 52e68896c..11894b344 100644
--- a/skyrl-train/skyrl_train/env_vars.py
+++ b/skyrl-train/skyrl_train/env_vars.py
@@ -49,3 +49,23 @@
 See https://github.com/ray-project/ray/issues/56697 for details on why this
 is needed.
 """
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Logging
+# ─────────────────────────────────────────────────────────────────────────────
+
+SKYRL_LOG_LEVEL = os.environ.get("SKYRL_LOG_LEVEL", "INFO").upper()
+"""
+Log level for SkyRL. Controls log filtering and stdout verbosity.
+
+- INFO (default): Training progress on stdout, infrastructure logs to file only
+- DEBUG: All logs (including vLLM, Ray, workers) shown on stdout
+- Also used by loguru for log level filtering (ERROR, WARNING, etc.)
+"""
+
+SKYRL_LOG_DIR = os.environ.get("SKYRL_LOG_DIR", "/tmp/skyrl-logs")
+"""
+Base directory for SkyRL log files (default: /tmp/skyrl-logs).
+
+Infrastructure logs are written to: {SKYRL_LOG_DIR}/{run_name}/infra.log
+"""
diff --git a/skyrl-train/skyrl_train/inference_engines/vllm/vllm_engine.py b/skyrl-train/skyrl_train/inference_engines/vllm/vllm_engine.py
index 0af009dfb..1e7e83cd4 100644
--- a/skyrl-train/skyrl_train/inference_engines/vllm/vllm_engine.py
+++ b/skyrl-train/skyrl_train/inference_engines/vllm/vllm_engine.py
@@ -236,6 +236,11 @@ class VLLMInferenceEngine(BaseVLLMInferenceEngine):
     """Synchronous VLLM engine."""
 
     def __init__(self, *args, **kwargs):
+        # Redirect vLLM output to log file (prevents polluting driver stdout)
+        from skyrl_train.utils.ray_logging import redirect_actor_output_to_file
+
+        redirect_actor_output_to_file()
+
         super().__init__(*args, **kwargs)
         self._weight_loader = VLLMWeightLoader(self.llm, is_async=False)
 
@@ -345,6 +350,11 @@ class AsyncVLLMInferenceEngine(BaseVLLMInferenceEngine):
     """Asynchronous VLLM engine."""
 
     def __init__(self, *args, **kwargs):
+        # Redirect vLLM output to log file (prevents polluting driver stdout)
+        from skyrl_train.utils.ray_logging import redirect_actor_output_to_file
+
+        redirect_actor_output_to_file()
+
         super().__init__(*args, **kwargs)
         self._weight_loader = VLLMWeightLoader(self.llm, is_async=True)
diff --git a/skyrl-train/skyrl_train/utils/ray_logging.py b/skyrl-train/skyrl_train/utils/ray_logging.py
new file mode 100644
index 000000000..3c469f3e8
--- /dev/null
+++ b/skyrl-train/skyrl_train/utils/ray_logging.py
@@ -0,0 +1,27 @@
+"""
+Helper to redirect Ray actor stdout/stderr to log file.
+
+This prevents infrastructure logs from polluting the driver's stdout,
+allowing only training progress to be displayed to the user.
+"""
+
+import os
+import sys
+
+
+def redirect_actor_output_to_file():
+    """
+    Redirect stdout and stderr to log file to prevent Ray from forwarding to driver.
+
+    Call this at the very start of any Ray actor/remote function where you want
+    to suppress output from appearing on the driver's stdout. The output will
+    instead be written to the log file specified by SKYRL_LOG_FILE.
+
+    Note: Do NOT call this in skyrl_entrypoint() - training progress should
+    go to stdout.
+    """
+    log_file = os.getenv("SKYRL_LOG_FILE")
+    if log_file:
+        log_fd = open(log_file, "a", buffering=1)  # noqa: SIM115
+        os.dup2(log_fd.fileno(), sys.stdout.fileno())
+        os.dup2(log_fd.fileno(), sys.stderr.fileno())
diff --git a/skyrl-train/skyrl_train/utils/utils.py b/skyrl-train/skyrl_train/utils/utils.py
index 2c4dc8f7e..d1ce29d77 100644
--- a/skyrl-train/skyrl_train/utils/utils.py
+++ b/skyrl-train/skyrl_train/utils/utils.py
@@ -17,7 +17,14 @@
     placement_group_table,
 )
 
-from skyrl_train.env_vars import SKYRL_LD_LIBRARY_PATH_EXPORT, SKYRL_RAY_PG_TIMEOUT_IN_S, SKYRL_PYTHONPATH_EXPORT
+from skyrl_train.env_vars import (
+    SKYRL_LD_LIBRARY_PATH_EXPORT,
+    SKYRL_LOG_DIR,
+    SKYRL_LOG_LEVEL,
+    SKYRL_PYTHONPATH_EXPORT,
+    SKYRL_RAY_PG_TIMEOUT_IN_S,
+)
+from pathlib import Path
 
 
 class Timer:
@@ -595,9 +602,14 @@ def prepare_runtime_environment(cfg: DictConfig) -> dict[str, str]:
 
 def configure_ray_worker_logging() -> None:
     """
-    In Ray workers, stderr/stdout are not TTYs, so Loguru disables color.
-    This method forces color and formatting (e.g., bold) and routes stdlib `logging`
-    through Loguru so third-party logs match formatting
+    Configure logging for Ray workers.
+
+    This method:
+    1. Forces color and formatting for Loguru (even without TTY)
+    2. Routes stdlib logging through Loguru
+
+    Note: This does NOT redirect stdout/stderr. For infra actors (vLLM, workers),
+    call redirect_actor_output_to_file() separately in their __init__.
     """
     level_name = os.getenv("LOG_LEVEL", "INFO").upper()
@@ -640,8 +652,28 @@ def initialize_ray(cfg: DictConfig):
     """
     from .ppo_utils import sync_registries
 
+    # Determine if we should show all logs on stdout (DEBUG mode)
+    verbose_logging = SKYRL_LOG_LEVEL == "DEBUG"
+
+    # Suppress Ray backend logs unless in verbose mode
+    if not verbose_logging:
+        os.environ["RAY_BACKEND_LOG_LEVEL"] = "fatal"
+
+    # Set up log file for infrastructure logs
+    log_dir = Path(SKYRL_LOG_DIR) / cfg.trainer.run_name
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = str(log_dir / "infra.log")
+    os.environ["SKYRL_LOG_FILE"] = log_file
+
     env_vars = prepare_runtime_environment(cfg)
-    ray.init(runtime_env={"env_vars": env_vars})
+
+    # Pass log file path to workers so they can redirect their output
+    env_vars["SKYRL_LOG_FILE"] = log_file
+
+    # log_to_driver=True allows training progress from skyrl_entrypoint to reach stdout.
+    # Infrastructure logs (vLLM, workers) are redirected to log file via os.dup2 in their init.
+    ray.init(runtime_env={"env_vars": env_vars}, log_to_driver=True)
+
+    logger.info(f"Infrastructure logs will be written to: {log_file}")
 
     # create the named ray actors for the registries to make available to all workers
     sync_registries()
diff --git a/skyrl-train/skyrl_train/workers/worker.py b/skyrl-train/skyrl_train/workers/worker.py
index 7fc253d6c..bfb022733 100644
--- a/skyrl-train/skyrl_train/workers/worker.py
+++ b/skyrl-train/skyrl_train/workers/worker.py
@@ -89,6 +89,11 @@ def __init__(
         self.record_memory = record_memory
         if record_memory:
             torch.cuda.memory._record_memory_history()
+
+        # Redirect worker output to log file (infra logs shouldn't pollute driver stdout)
+        from skyrl_train.utils.ray_logging import redirect_actor_output_to_file
+
+        redirect_actor_output_to_file()
         configure_ray_worker_logging()
 
     def get_node_local_rank(self):