`examples/tutorials/gpu_monitoring/README.md` (new file, 115 additions)
# GPU Monitoring Example

This example demonstrates how to monitor GPU usage in Metaflow flows, which is essential for ML training workloads.

## Features

- GPU availability detection
- GPU memory monitoring
- Resource allocation with `@resources` decorator
- Simulated training workflow

## Running the Example

```bash
python gpu_flow.py run
```

To run with custom epochs:
```bash
python gpu_flow.py run --epochs 5
```

## Use Cases

This pattern is useful for:
- ML model training pipelines
- GPU utilization monitoring
- Cost optimization for GPU workloads
- Multi-GPU training setup validation

## Requirements

- Metaflow
- PyTorch (optional, for actual GPU detection)

## Notes

The example gracefully handles environments without GPUs or PyTorch installed,
making it suitable for testing in various environments.
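The graceful-degradation logic can be distilled into a small standalone helper — a sketch (the `detect_gpu` name is illustrative, not part of the example itself):

```python
def detect_gpu():
    """Return (available, device_name) without requiring PyTorch or a GPU.

    Falls back to (False, None) when PyTorch is missing or no CUDA
    device is visible, mirroring the flow's behavior.
    """
    try:
        import torch
    except ImportError:
        return False, None
    if torch.cuda.is_available():
        return True, torch.cuda.get_device_name(0)
    return False, None
```

Because every failure mode collapses to `(False, None)`, the same code path works on laptops, CI runners, and GPU instances alike.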

## Example Output

When running in a GPU-enabled environment:
```
Starting GPU monitoring example flow
GPU Available: True
GPU Name: NVIDIA Tesla V100
GPU Memory: 16.00 GB
Training for 3 epochs...
Epoch 1/3 completed
GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB
Epoch 2/3 completed
GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB
Epoch 3/3 completed
GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB

Flow completed successfully!
Training completed: True
Used GPU: NVIDIA Tesla V100
```

When running in a CPU-only environment:
```
Starting GPU monitoring example flow
PyTorch not installed, skipping GPU check
Training for 3 epochs...
Epoch 1/3 completed
Epoch 2/3 completed
Epoch 3/3 completed

Flow completed successfully!
Training completed: True
```

## Advanced Usage

### Monitoring Multiple GPUs

To extend this example for multi-GPU monitoring, you can modify the `check_gpu` step:

```python
@resources(gpu=2, memory=32000)
@step
def check_gpu(self):
    """Check multiple GPUs."""
    import torch
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"Number of GPUs: {gpu_count}")
        for i in range(gpu_count):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
```
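The same loop can be factored into a helper that returns per-device memory totals instead of printing them — a sketch (the `per_gpu_memory_gb` name is ours; the guard makes it safe on machines without PyTorch or GPUs):

```python
def per_gpu_memory_gb():
    """Map each visible GPU index to its total memory in GB.

    Returns an empty dict when PyTorch is not installed or no
    CUDA device is available.
    """
    try:
        import torch
    except ImportError:
        return {}
    if not torch.cuda.is_available():
        return {}
    return {
        i: torch.cuda.get_device_properties(i).total_memory / 1e9
        for i in range(torch.cuda.device_count())
    }
```

Returning a dict rather than printing makes the result easy to store as a Metaflow artifact (e.g. `self.gpu_memory_map = per_gpu_memory_gb()`).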

### Real-time Monitoring

For production use cases, you might want to log GPU metrics to a monitoring system:

```python
# Log to your monitoring system
metrics = {
    'gpu_utilization': torch.cuda.utilization(),
    'gpu_memory_used': torch.cuda.memory_allocated(),
    'gpu_temperature': torch.cuda.temperature(),
}
# Send metrics to CloudWatch, Datadog, etc.
```
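An alternative is to query NVML directly via the `pynvml` package (an assumption — the example itself does not depend on it), which reports utilization and temperature even when no PyTorch tensors are resident. A minimal sketch with a graceful fallback:

```python
def collect_gpu_metrics():
    """Collect metrics for GPU 0 via NVML.

    Returns an empty dict if pynvml is not installed or no
    GPU is present, so it is safe to call unconditionally.
    """
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        temp = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        pynvml.nvmlShutdown()
        return {
            'gpu_utilization_pct': util.gpu,
            'gpu_memory_used_bytes': mem.used,
            'gpu_temperature_c': temp,
        }
    except Exception:
        return {}
```

Calling this once per epoch inside the `train` loop and shipping the dict to your metrics backend keeps the monitoring decoupled from the training code.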

## Related Examples

- See the `pytorch_tutorial` for more PyTorch-specific patterns
- Check `distributed_training` for multi-node GPU training examples

## Contributing

Found an issue or want to improve this example? Pull requests are welcome!
`examples/tutorials/gpu_monitoring/gpu_flow.py` (new file, 82 additions)
"""
GPU Monitoring Example for Metaflow

This example demonstrates how to monitor GPU usage in Metaflow flows,
which is useful for ML training workloads.
"""

from metaflow import FlowSpec, step, resources, Parameter
import time

class GPUMonitorFlow(FlowSpec):
    """
    A flow that demonstrates GPU monitoring capabilities in Metaflow.

    This is particularly useful for ML engineers working on training
    infrastructure who need to track GPU utilization and memory usage.
    """

    epochs = Parameter('epochs', default=3, help='Number of training epochs')

    @step
    def start(self):
        """Initialize the flow and check GPU availability."""
        print("Starting GPU monitoring example flow")
        self.next(self.check_gpu)

    @resources(gpu=1, memory=16000)
    @step
    def check_gpu(self):
        """Check GPU availability and print GPU information."""
        try:
            import torch
            self.gpu_available = torch.cuda.is_available()
            if self.gpu_available:
                self.gpu_name = torch.cuda.get_device_name(0)
                self.gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
                print(f"GPU Available: {self.gpu_available}")
                print(f"GPU Name: {self.gpu_name}")
                print(f"GPU Memory: {self.gpu_memory:.2f} GB")
            else:
                print("No GPU available, will simulate training")
        except ImportError:
            print("PyTorch not installed, skipping GPU check")
            self.gpu_available = False

        self.next(self.train)

    @resources(gpu=1, memory=16000, cpu=4)
    @step
    def train(self):
        """Simulate a training workload with GPU monitoring."""
        print(f"Training for {self.epochs} epochs...")

        for epoch in range(self.epochs):
            # Simulate training
            time.sleep(1)
            print(f"Epoch {epoch + 1}/{self.epochs} completed")

            # In a real scenario, you would monitor the GPU here
            if getattr(self, 'gpu_available', False):
                try:
                    import torch
                    # Check GPU memory usage
                    allocated = torch.cuda.memory_allocated() / 1e9
                    reserved = torch.cuda.memory_reserved() / 1e9
                    print(f"  GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
                except Exception:
                    pass

        self.training_completed = True
        self.next(self.end)

    @step
    def end(self):
        """Finalize the flow and print summary."""
        print("\nFlow completed successfully!")
        print(f"Training completed: {self.training_completed}")
        if hasattr(self, 'gpu_name'):
            print(f"Used GPU: {self.gpu_name}")


if __name__ == '__main__':
    GPUMonitorFlow()